Something went wrong on our end
-
marco authored
git-svn-id: https://dng.biozentrum.unibas.ch/svn/openstructure/trunk@2100 5a81b35b-ba03-0410-adc8-b2c5c5119f08
marco authoredgit-svn-id: https://dng.biozentrum.unibas.ch/svn/openstructure/trunk@2100 5a81b35b-ba03-0410-adc8-b2c5c5119f08
pdb_reader.cc 17.99 KiB
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2010 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/filesystem/convenience.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/format.hpp>
#include <boost/lexical_cast.hpp>
#include <ost/profile.hh>
#include <ost/log.hh>
#include <ost/message.hh>
#include <ost/conop/conop.hh>
#include <ost/geom/mat3.hh>
#include <ost/io/io_exception.hh>
#include "pdb_reader.hh"
namespace ost { namespace io {
using boost::format;
namespace {
bool IEquals(const StringRef& a, const StringRef& b)
{
if (a.size()!=b.size()) {
return false;
}
for (size_t i=0; i<a.size(); ++i) {
if (toupper(a[i])!=toupper(b[i])) {
return false;
}
}
return true;
}
mol::ResNum to_res_num(int num, char ins_code)
{
return mol::ResNum(num, ins_code==' ' ? '\0' : ins_code);
}
}
PDBReader::PDBReader(std::istream& instream):
infile_(), instream_(instream)
{
this->Init(boost::filesystem::path(""));
}
PDBReader::PDBReader(const String& filename)
: infile_(filename), instream_(infile_)
{
this->Init(boost::filesystem::path(filename));
}
PDBReader::PDBReader(const boost::filesystem::path& loc):
infile_(loc), instream_(infile_)
{
this->Init(loc);
}
void PDBReader::Init(const boost::filesystem::path& loc)
{
if (boost::iequals(".gz", boost::filesystem::extension(loc))) {
in_.push(boost::iostreams::gzip_decompressor());
}
in_.push(instream_);
if(!infile_) throw IOException("could not open "+loc.string());
line_num_=0;
if(boost::iequals(boost::filesystem::extension(loc), ".pqr")) {
is_pqr_=true;
} else {
is_pqr_=false;
}
hard_end_=false;
}
bool PDBReader::HasNext()
{
if (hard_end_) {
return false;
}
// since helix and sheet have to appear before any atom/hetatm records
// a HELIX/SHEET entry implies a next model.
while (std::getline(in_, curr_line_) && ++line_num_) {
StringRef curr_line(curr_line_.c_str(), curr_line_.length());
if (curr_line.size()>5 &&
(IEquals(curr_line.substr(0, 6), StringRef("ATOM ", 6)) ||
(!(PDB::Flags() & PDB::NO_HETATMS) &&
IEquals(curr_line.substr(0, 6),StringRef("HETATM ", 6))) ||
IEquals(curr_line.substr(0, 6),StringRef("ANISOU ", 6)) ||
IEquals(curr_line.substr(0, 6), StringRef("SHEET ", 6)) ||
IEquals(curr_line.substr(0, 6), StringRef("HELIX ", 6)))) {
return true;
} else if (IEquals(curr_line.rtrim(), StringRef("END", 3))) {
hard_end_=true;
return false;
}
}
return false;
}
void PDBReader::Import(mol::EntityHandle& ent,
const String& restrict_chains)
{
LOGN_DEBUG("PDBReader: current flags: " << PDB::Flags());
Profile profile_import("PDBReader::Import");
this->ClearState();
// first read curr_line and then read next...
restrict_chains_=restrict_chains;
bool go_on=true;
do {
if (curr_line_.empty()) {
continue;
}
StringRef curr_line(curr_line_.c_str(), curr_line_.size());
switch(curr_line[0]) {
case 'A':
case 'a':
if (curr_line.size()<6) {
continue;
}
if (IEquals(curr_line.substr(0, 6), StringRef("ATOM ", 6))) {
LOGN_TRACE("processing ATOM entry");
this->ParseAndAddAtom(curr_line, line_num_, ent,
StringRef("ATOM", 4));
} else if (IEquals(curr_line.substr(0, 6), StringRef("ANISOU", 6))) {
LOGN_TRACE("processing ANISOU entry");
this->ParseAnisou(curr_line, line_num_, ent);
}
break;
case 'E':
case 'e':
if (curr_line.size()<3) {
continue;
}
if (IEquals(curr_line.rtrim(), StringRef("END", 3))) {
hard_end_=true;
go_on=false;
break;
}
if (curr_line.size()<6) {
continue;
}
if (IEquals(curr_line.substr(0, 6), StringRef("ENDMDL", 6))) {
go_on=false;
break;
}
case 'H':
case 'h':
if (curr_line.size()<6) {
continue;
}
if (IEquals(curr_line.substr(0, 6), StringRef("HETATM", 6))) {
if (PDB::Flags() & PDB::NO_HETATMS)
continue;
LOGN_TRACE("processing HETATM entry");
this->ParseAndAddAtom(curr_line, line_num_, ent,
StringRef("HETATM", 6));
} else if (IEquals(curr_line.substr(0, 6), StringRef("HELIX ", 6))) {
this->ParseHelixEntry(curr_line);
}
case 'S':
case 's':
if (curr_line.size()<6) {
continue;
}
if (IEquals(curr_line.substr(0, 6), StringRef("SHEET ", 6))) {
this->ParseStrandEntry(curr_line);
}
default:
break;
}
} while (std::getline(in_, curr_line_) && ++line_num_ && go_on);
LOGN_MESSAGE("imported "
<< chain_count_ << " chains, "
<< residue_count_ << " residues, "
<< atom_count_ << " atoms; with "
<< helix_list_.size() << " helices and "
<< strand_list_.size() << " strands");
this->AssignSecStructure(ent);
}
void PDBReader::AssignSecStructure(mol::EntityHandle ent)
{
// and now for the fun part, assigning the strands/helices to residues
for (HSList::const_iterator i=helix_list_.begin(), e=helix_list_.end();
i!=e; ++i) {
mol::ChainHandle chain=ent.FindChain(i->chain);
if (!chain.IsValid()) {
LOGN_MESSAGE("ignoring helix record for unknown chain "+i->chain);
continue;
}
mol::SecStructure alpha(mol::SecStructure::ALPHA_HELIX);
// some PDB files contain helix/strand entries that are adjacent to each
// other. To avoid visual artifacts, we effectively shorten the first of the
// two secondary structure segments to insert one residue of coil
// conformation.
mol::ResNum start=i->start, end=i->end;
if (helix_list_.end()!=i+1 && (*(i+1)).start.GetNum()<=end.GetNum()+1) {
end=mol::ResNum((*(i+1)).start.GetNum()-2);
}
chain.AssignSecondaryStructure(alpha, start, end);
}
for (HSList::const_iterator i=strand_list_.begin(), e=strand_list_.end();
i!=e; ++i) {
mol::ChainHandle chain=ent.FindChain(i->chain);
if (!chain.IsValid()) {
LOGN_MESSAGE("ignoring strand record for unknown chain "+i->chain);
continue;
}
mol::SecStructure extended(mol::SecStructure::EXTENDED);
mol::ResNum start=i->start, end=i->end;
// see comment for helix assignment
if (strand_list_.end()!=i+1 && (*(i+1)).start.GetNum()<=end.GetNum()+1) {
end=mol::ResNum((*(i+1)).start.GetNum()-2);
}
chain.AssignSecondaryStructure(extended, start, end);
}
}
void PDBReader::ClearState()
{
curr_chain_=mol::ChainHandle();
curr_residue_=mol::ResidueHandle();
chain_count_=0;
residue_count_=0;
atom_count_=0;
hard_end_=false;
helix_list_.clear();
strand_list_.clear();
}
bool PDBReader::ParseAtomIdent(const StringRef& line, int line_num,
char& chain_name, StringRef& res_name,
mol::ResNum& resnum, StringRef& atom_name,
char& alt_loc, const StringRef& record_type)
{
chain_name=line[21];
if (restrict_chains_.size()>0 &&
restrict_chains_.find(chain_name)==String::npos) {
return false;
}
std::pair<bool, int> a_num=line.substr(6, 5).ltrim().to_int();
if (!a_num.first) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS) {
return false;
}
throw IOException(str(format("invalid atom number on line %d") %line_num));
}
atom_name=line.substr(12, 4).trim();
alt_loc=line[16];
res_name=line.substr(17, 3).trim();
std::pair<bool, int> res_num=line.substr(22, 4).ltrim().to_int();;
if (!res_num.first) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS) {
return false;
}
throw IOException(str(format("invalid res number on line %d") % line_num));
}
char ins_c=line[26];
resnum=to_res_num(res_num.second, ins_c);
if (PDB::Flags() & PDB::CALPHA_ONLY) {
if (record_type[0]=='H' || record_type[0]=='h') {
return false;
}
if (atom_name!=StringRef("CA", 2)) {
return false;
}
return true;
}
return true;
}
void PDBReader::ParseAnisou(const StringRef& line, int line_num,
mol::EntityHandle& ent)
{
char chain_name=0;
char alt_loc=0;
StringRef res_name, atom_name;
mol::ResNum res_num(0);
if (!this->ParseAtomIdent(line, line_num, chain_name, res_name, res_num,
atom_name, alt_loc, StringRef("ANISOU", 6))) {
return;
}
double anisou[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
for (int i=0;i<6; ++i) {
std::pair<bool, int> result=line.substr(29+i*7, 6).ltrim().to_int();
if (!result.first) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS) {
return;
}
throw IOException(str(format("invalid ANISOU record on line %d")%line_num));
}
anisou[i]=result.second;
}
String aname(atom_name.str());
if (!curr_residue_.IsValid()) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS ||
PDB::Flags() & PDB::CALPHA_ONLY) {
return;
}
const char* fmt_str="invalid ANISOU record for inexistent atom on line %d";
throw IOException(str(format(fmt_str) % line_num));
}
mol::AtomHandle atom=curr_residue_.FindAtom(aname);
if (!atom.IsValid()) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS ||
PDB::Flags() & PDB::CALPHA_ONLY) {
return;
}
const char* fmt_str="invalid ANISOU record for inexistent atom on line %d";
throw IOException(str(format(fmt_str) % line_num));
}
//get properties which are already set and extend them by adding the ANISOU info
mol::AtomProp aprop=atom.GetProp();
geom::Mat3 mat(anisou[0], anisou[3], anisou[4],
anisou[3], anisou[1], anisou[5],
anisou[4], anisou[5], anisou[2]);
aprop.anisou=mat;
//divide by 10**4 to actually reflect the real values
aprop.anisou/=10000;
aprop.has_anisou=true;
atom.SetProp(aprop);
return;
}
void PDBReader::ParseAndAddAtom(const StringRef& line, int line_num,
mol::EntityHandle& ent,
const StringRef& record_type)
{
mol::XCSEditor editor=ent.RequestXCSEditor(mol::BUFFERED_EDIT);
char alt_loc=0;
char chain_name=0;
StringRef res_name, atom_name;
mol::ResNum res_num(0);
if (!this->ParseAtomIdent(line, line_num, chain_name, res_name, res_num,
atom_name, alt_loc, record_type)) {
return;
}
std::pair<bool, Real> charge, radius;
std::pair<bool, Real> occ, temp;
geom::Vec3 apos;
for (int i=0;i<3; ++i) {
std::pair<bool, float> result=line.substr(30+i*8, 8).ltrim().to_float();
if (!result.first) {
if (PDB::Flags() & PDB::SKIP_FAULTY_RECORDS) {
return;
}
throw IOException(str(format("invalid coordinate on line %d")%line_num));
}
apos[i]=result.second;
}
// b-factors and occ are replaced by radius and charge if PQR file format
if(is_pqr_) {
occ=std::make_pair(true, 1.0);
temp=std::make_pair(true, 0.0);
if (line.length()>=60) {
charge=line.substr(54,6).ltrim().to_float();
}
if (line.length()>=66) {
radius=line.substr(60, 6).ltrim().to_float();
}
} else {
if (line.length()>=60) {
occ=line.substr(54,6).ltrim().to_float();
}
if (line.length()>=66) {
temp=line.substr(60, 6).ltrim().to_float();
}
}
LOGN_TRACE( "line: [" << line << "]" );
String s_ele;
String s_chain(1, chain_name);
String aname(atom_name.str());
// determine element from element column (77-78, right justified) if present
// otherwise set to empty String. It is up to the builder to determine the
// correct element in that case.
if (line.length()>=78) { // element column present
if(line[76]==' ' && line[77]==' ') { // both characters are empty
s_ele="";
} else if(line[76]!=' ' || line[77]!=' ') { // at least one character not
// empty
if(line[76]==' ' && line[77]!=' ') { // single character element,
// right justified
s_ele=line.substr(77,1).str();
} else if(line[76]!=' ' && line[77]==' ') { // single character element,
// left justified
s_ele=line.substr(76,1).str();
} else { // Real character element
s_ele=line.substr(76,2).str();
}
}
}
else {
s_ele=""; // no element column present
}
// some postprocessing
LOGN_TRACE( "s_chain: [" << chain_name << "]" );
// determine chain and residue update
bool update_chain=false;
bool update_residue=false;
if(!curr_chain_) {
update_chain=true;
update_residue=true;
} else if(curr_chain_.GetName()!=s_chain) {
update_chain=true;
update_residue=true;
}
if(!curr_residue_) {
update_residue=true;
} else if(curr_residue_.GetNumber()!=res_num) {
update_residue=true;
}
if(update_chain) {
curr_chain_=mol::ChainHandle();
#if 0
// TODO: should this depend on JOIN_SPREAD as well?
if (PDB::Flags() & PDB::JOIN_SPREAD_ATOM_RECORDS) {
curr_chain_=ent.FindChain(s_chain);
}
#else
curr_chain_=ent.FindChain(s_chain);
#endif
if(!curr_chain_.IsValid()) {
LOGN_DUMP("new chain " << s_chain);
curr_chain_=editor.InsertChain(s_chain);
++chain_count_;
}
assert(curr_chain_.IsValid());
}
if(update_residue) {
curr_residue_=mol::ResidueHandle();
if (PDB::Flags() & PDB::JOIN_SPREAD_ATOM_RECORDS) {
curr_residue_=curr_chain_.FindResidue(res_num);
}
if (!curr_residue_.IsValid()) {
LOGN_DUMP("new residue " << res_name << " " << res_num);
curr_residue_=editor.AppendResidue(curr_chain_, res_name.str(), res_num);
++residue_count_;
}
assert(curr_residue_.IsValid());
}
// finally add atom
LOGN_DUMP("adding atom " << aname << " (" << s_ele << ") @" << apos);
mol::AtomProp aprop;
aprop.element=s_ele;
if(is_pqr_) {
if (radius.first) {
aprop.radius=radius.second;
} else {
aprop.radius=0.0;
}
} else {
aprop.radius=0.0;
}
if (temp.first) {
aprop.b_factor=temp.second;
}
if (occ.first) {
aprop.occupancy=occ.second;
}
if (charge.first) {
aprop.charge=charge.second;
}
aprop.is_hetatm=record_type[0]=='H' ? true : false;
if (alt_loc!=' ') {
// Check if there is already a atom with the same name.
mol::AtomHandle me=curr_residue_.FindAtom(aname);
if (me.IsValid()) {
try {
editor.AddAltAtomPos(String(1, alt_loc), me, apos);
} catch (Error) {
LOGN_MESSAGE("Ignoring atom alt location since there is already an atom "
"with name " << aname << ", but without an alt loc");
return;
}
} else {
mol::AtomHandle ah=editor.InsertAltAtom(curr_residue_, aname,
String(1, alt_loc), apos, aprop);
/*
for now, this is not needed - the cg will use an global atom id sorted
list insteadx
if (PDB::Flags() & PDB::SEQUENTIAL_ATOM_IMPORT) {
sequential_atom_list_.push_back(ah);
}
*/
++atom_count_;
}
} else {
mol::AtomHandle ah = editor.InsertAtom(curr_residue_, aname, apos, aprop);
/*
if (PDB::Flags() & PDB::SEQUENTIAL_ATOM_IMPORT) {
sequential_atom_list_.push_back(ah);
}
*/
++atom_count_;
}
}
void PDBReader::ParseHelixEntry(const StringRef& line)
{
std::pair<bool, int> start_num=line.substr(21, 4).ltrim().to_int();
std::pair<bool, int> end_num=line.substr(33, 4).ltrim().to_int();
if (!start_num.first || !end_num.first) {
throw IOException(str(format("invalid helix entry on line %d") % line_num_));
}
LOGN_DEBUG("making helix entry: " << start_num.second << ", "
<< line[25] << " " << end_num.second << " " << line[37]);
HSEntry hse = {to_res_num(start_num.second, line[25]),
to_res_num(end_num.second, line[37]),
line.substr(19,1).str()};
if (restrict_chains_.size()==0 ||
restrict_chains_.find(hse.chain)!=String::npos) {
helix_list_.push_back(hse);
}
}
void PDBReader::ParseStrandEntry(const StringRef& line)
{
std::pair<bool, int> start_num=line.substr(22, 4).ltrim().to_int();
std::pair<bool, int> end_num=line.substr(33, 4).ltrim().to_int();
if (!start_num.first || !end_num.first) {
throw IOException(str(format("invalid strand entry on line %d") % line_num_));
}
LOGN_DEBUG("making strand entry: " << start_num.second << ", " << line[26] << " " << end_num.second << " " << line[37]);
HSEntry hse = {to_res_num(start_num.second, line[26]),
to_res_num(end_num.second, line[37]),
line.substr(21,1).str()};
if (restrict_chains_.size()==0 ||
restrict_chains_.find(hse.chain)!=String::npos) {
strand_list_.push_back(hse);
}
}
}}