diff --git a/modules/conop/src/chemdict_parser.cc b/modules/conop/src/chemdict_parser.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c7df33c0928978cb91371e82fa3ba5565d949f1 --- /dev/null +++ b/modules/conop/src/chemdict_parser.cc @@ -0,0 +1,153 @@ +#include <ost/conop/chemdict_parser.hh> + +namespace ost { namespace conop { + +bool ChemdictParser::OnBeginData(const StringRef& data_name) +{ + compound_.reset(new Compound(data_name.str())); + if (last_!=data_name[0]) { + last_=data_name[0]; + std::cout << last_ << std::flush; + } + atom_map_.clear(); + insert_=true; + return true; +} + +bool ChemdictParser::OnBeginLoop(const io::StarLoopDesc& header) +{ + if (header.GetCategory()=="chem_comp_atom") { + loop_type_=ATOM_SPEC; + indices_[ATOM_NAME]=header.GetIndex("atom_id"); + indices_[ALT_ATOM_NAME]=header.GetIndex("alt_atom_id"); + indices_[ELE]=header.GetIndex("type_symbol"); + indices_[IS_LEAVING]=header.GetIndex("pdbx_leaving_atom_flag"); + indices_[IS_AROMATIC]=header.GetIndex("pdbx_aromatic_flag"); + indices_[ORDINAL]=header.GetIndex("pdbx_ordinal"); + return true; + } else if (header.GetCategory()=="chem_comp_bond") { + loop_type_=BOND_SPEC; + indices_[ATOM_ID1]=header.GetIndex("atom_id_1"); + indices_[ATOM_ID2]=header.GetIndex("atom_id_2"); + indices_[BOND_ORDER]=header.GetIndex("value_order"); + return true; + } + loop_type_=DONT_KNOW; + return false; +} + +void ChemdictParser::OnDataRow(const io::StarLoopDesc& header, + const std::vector<StringRef>& columns) +{ + if (loop_type_==ATOM_SPEC) { + AtomSpec atom; + atom.is_leaving=columns[indices_[IS_LEAVING]][0]=='Y'; + atom.name=columns[indices_[ATOM_NAME]].str(); + atom.alt_name=columns[indices_[ALT_ATOM_NAME]].str(); + atom.ordinal=columns[indices_[ORDINAL]].to_int().second-1; + atom.element=columns[indices_[ELE]].str(); + atom.is_aromatic=columns[indices_[IS_AROMATIC]][0]=='Y'; + compound_->AddAtom(atom); + atom_map_[atom.name]=atom.ordinal; + } else if (loop_type_==BOND_SPEC) { + BondSpec bond; + String atom_one=columns[indices_[ATOM_ID1]].str(); + String atom_two=columns[indices_[ATOM_ID2]].str(); + bond.atom_one=atom_map_[atom_one]; + bond.atom_two=atom_map_[atom_two]; + char b=columns[indices_[BOND_ORDER]][0]; + bond.order=b=='D' ? 2 : (b=='S' ? 1 : 3); + if (bond.atom_one>bond.atom_two) { + std::swap(bond.atom_one, bond.atom_two); + } + compound_->AddBond(bond); + } +} + +void ChemdictParser::OnDataItem(const io::StarDataItem& item) +{ + if (item.GetCategory()==StringRef("chem_comp", 9)) { + if (item.GetName()==StringRef("type", 4)) { + // convert type to uppercase + String type=item.GetValue().str(); + for (String::iterator i=type.begin(), e=type.end(); i!=e; ++i) { + *i=toupper(*i); + } + // The type of water is set to "?". let's change it to water... + if (compound_->GetID()=="HOH") { + compound_->SetChemClass(mol::ChemClass(mol::ChemClass::Water)); + } else { + std::map<String, mol::ChemClass>::iterator i=tm_.find(type); + if (i!=tm_.end()) { + compound_->SetChemClass(i->second); + } else { + std::cout << "unknown type '" << type << "' for compound " + << compound_->GetID() << std::endl; + } + } + + } else if (item.GetName()==StringRef("formula", 7)) { + compound_->SetFormula(item.GetValue().str()); + } else if (item.GetName()==StringRef("one_letter_code", 15)) { + if (item.GetValue().length()==1) { + compound_->SetOneLetterCode(item.GetValue()[0]); + } + } else if (item.GetName()==StringRef("pdbx_initial_date", 17)) { + compound_->SetCreationDate(Date::FromString(item.GetValue())); + } else if (item.GetName()==StringRef("pdbx_modified_date", 18)) { + compound_->SetModificationDate(Date::FromString(item.GetValue())); + } + } else if (item.GetName()==StringRef("atom_id", 7)) { + atom_.name=item.GetValue().str(); + } else if (item.GetName()==StringRef("alt_atom_id", 11)) { + atom_.alt_name=item.GetValue().str(); + } else if (item.GetName()==StringRef("type_symbol", 11)) { + atom_.element=item.GetValue().str(); + } else if (item.GetName()==StringRef("pdbx_ordinal", 12)) { + atom_.ordinal=item.GetValue().to_int().second-1; + } +} + +void ChemdictParser::OnEndData() +{ + if (insert_) { + if (compound_->GetAtomSpecs().empty()) { + compound_->AddAtom(atom_); + } + lib_->AddCompound(compound_); + } +} + +std::map<String, mol::ChemClass> ChemdictParser::tm_=std::map<String, mol::ChemClass>(); + +void ChemdictParser::InitTypeMap() +{ + if (!tm_.empty()) + return; + tm_["L-PEPTIDE COOH CARBOXY TERMINUS"]=mol::ChemClass(mol::ChemClass::LPeptideLinking); + tm_["L-PEPTIDE NH3 AMINO TERMINUS"]=mol::ChemClass(mol::ChemClass::LPeptideLinking); + tm_["D-PEPTIDE NH3 AMINO TERMINUS"]=mol::ChemClass(mol::ChemClass::DPeptideLinking); + tm_["L-SACCHARIDE 1,4 AND 1,4 LINKING"]=mol::ChemClass(mol::ChemClass::LSaccharide); + tm_["D-SACCHARIDE 1,4 AND 1,4 LINKING"]=mol::ChemClass(mol::ChemClass::DSaccharide); + tm_["L-SACCHARIDE"]=mol::ChemClass(mol::ChemClass::LSaccharide); + tm_["D-SACCHARIDE"]=mol::ChemClass(mol::ChemClass::DSaccharide); + tm_["SACCHARIDE"]=mol::ChemClass(mol::ChemClass::Saccharide); + tm_["D-PEPTIDE LINKING"]=mol::ChemClass(mol::ChemClass::DPeptideLinking); + tm_["L-PEPTIDE LINKING"]=mol::ChemClass(mol::ChemClass::LPeptideLinking); + tm_["L-PEPTIDE-LINKING"]=mol::ChemClass(mol::ChemClass::LPeptideLinking); + tm_["DNA LINKING"]=mol::ChemClass(mol::ChemClass::DNALinking); + tm_["RNA LINKING"]=mol::ChemClass(mol::ChemClass::RNALinking); + tm_["L-DNA LINKING"]=mol::ChemClass(mol::ChemClass::DNALinking); + tm_["L-RNA LINKING"]=mol::ChemClass(mol::ChemClass::RNALinking); + tm_["R-DNA LINKING"]=mol::ChemClass(mol::ChemClass::DNALinking); + tm_["R-RNA LINKING"]=mol::ChemClass(mol::ChemClass::RNALinking); + tm_["DNA OH 3 PRIME TERMINUS"]=mol::ChemClass(mol::ChemClass::DNALinking); + tm_["PEPTIDE-LIKE"]=mol::ChemClass(mol::ChemClass::PeptideLinking); + tm_["PEPTIDE LINKING"]=mol::ChemClass(mol::ChemClass::PeptideLinking); + tm_["PEPTIDE-LINKING"]=mol::ChemClass(mol::ChemClass::PeptideLinking); + tm_["NON-POLYMER"]=mol::ChemClass(mol::ChemClass::NonPolymer); + tm_["RNA OH 3 PRIME TERMINUS"]=mol::ChemClass(mol::ChemClass::RNALinking); + tm_["?"]=mol::ChemClass(mol::ChemClass::Unknown); +} + +}} \ No newline at end of file diff --git a/modules/conop/src/chemdict_parser.hh b/modules/conop/src/chemdict_parser.hh new file mode 100644 index 0000000000000000000000000000000000000000..4753a389acdfa2fa7e71dbee74c2debbbfbf9a77 --- /dev/null +++ b/modules/conop/src/chemdict_parser.hh @@ -0,0 +1,89 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2010 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_CONOP_CHEMDICT_PARSER_HH +#define OST_CONOP_CHEMDICT_PARSER_HH + +/* + Author: Marco Biasini + */ + + +#include <ost/mol/mol.hh> +#include <ost/io/mol/star_parser.hh> +#include <ost/conop/compound_lib.hh> + +namespace ost { namespace conop { + +typedef enum { + ATOM_SPEC, + BOND_SPEC, + DONT_KNOW +} LoopType; + +class DLLEXPORT_OST_CONOP ChemdictParser : public io::StarParser { +public: + ChemdictParser(std::istream& stream): + io::StarParser(stream), + compound_(new Compound("UNK")), last_(0), loop_type_(DONT_KNOW) + { + this->InitTypeMap(); + } + + virtual bool OnBeginData(const StringRef& data_name); + + virtual bool OnBeginLoop(const io::StarLoopDesc& header); + + virtual void OnDataRow(const io::StarLoopDesc& header, + const std::vector<StringRef>& columns); + + virtual void OnDataItem(const io::StarDataItem& item); + + virtual void OnEndData(); + + void SetCompoundLib(const CompoundLibPtr& lib) + { + lib_=lib; + } +private: + void InitTypeMap(); + CompoundLibPtr lib_; + CompoundPtr compound_; + typedef enum { + ATOM_NAME=0, + ALT_ATOM_NAME=1, + IS_AROMATIC=2, + ORDINAL=3, + IS_LEAVING=4, + ELE=5, + STEREO_CONF=6, + ATOM_ID1=0, + ATOM_ID2=1, + BOND_ORDER=2 + } PropIndex; + char last_; + int indices_[10]; + bool insert_; + static std::map<String, mol::ChemClass> tm_; + std::map<String, int> atom_map_; + LoopType loop_type_; + AtomSpec atom_; +}; + + +}} diff --git a/modules/conop/src/conop.cc b/modules/conop/src/conop.cc index 7d8b35867ed24dc18721a6afaf8de8ca147a350a..9d34eba51a68d2eef8953f7a6365ab5fdf6d98e3 100644 --- a/modules/conop/src/conop.cc +++ b/modules/conop/src/conop.cc @@ -214,10 +214,13 @@ namespace { class PropAssigner: public mol::EntityVisitor { public: PropAssigner(const BuilderP& builder): builder_(builder) {} + virtual bool VisitResidue(const mol::ResidueHandle& res) { String key=builder_->IdentifyResidue(res); - builder_->CheckResidueCompleteness(res); + if (key=="UNK") { + unk_res_[res.GetKey()]+=1; + } builder_->FillResidueProps(res); return true; } @@ -227,8 +230,23 @@ public: builder_->FillAtomProps(atom); return false; } + + virtual void OnExit() + { + for (std::map<String, int>::iterator i=unk_res_.begin(), + e=unk_res_.end(); i!=e; ++i) { + if (i->second>1) { + LOG_WARNING("structure contains unknown residues with name " << i->first + << " ("<< i->second << "x)"); + } else { + LOG_WARNING("structure contains unknown residue with name " << i->first); + } + + } + } private: BuilderP builder_; + std::map<String, int> unk_res_; }; @@ -247,8 +265,8 @@ public: virtual bool VisitResidue(const mol::ResidueHandle& res) { builder_->ConnectAtomsOfResidue(res); - if(prev_) { - builder_->ConnectResidueToPrev(res,prev_); + if (prev_) { + builder_->ConnectResidueToPrev(res,prev_); } prev_=res; return false; diff --git a/modules/conop/src/rule_based_builder.cc b/modules/conop/src/rule_based_builder.cc index 3353dfb461eabe59773beb6fc8c764df9556f81c..d79dbf75b5838e2dd80c21c8b27bd7e59b83cb3a 100644 --- a/modules/conop/src/rule_based_builder.cc +++ b/modules/conop/src/rule_based_builder.cc @@ -33,8 +33,9 @@ namespace ost { namespace conop { void RuleBasedBuilder::CompleteAtoms(mol::ResidueHandle rh) {} -void RuleBasedBuilder::CheckResidueCompleteness(const mol::ResidueHandle& rh) { - LookupCompound(rh); +void RuleBasedBuilder::CheckResidueCompleteness(const mol::ResidueHandle& rh) +{ + this->LookupCompound(rh); if (!last_compound_) { mol::AtomHandleList atoms=rh.GetAtomList(); for (mol::AtomHandleList::const_iterator i=atoms.begin(), @@ -43,7 +44,7 @@ void RuleBasedBuilder::CheckResidueCompleteness(const mol::ResidueHandle& rh) { } return; } - ReorderAtoms(rh , last_compound_); + this->ReorderAtoms(rh , last_compound_); AtomSpecList::const_iterator j=last_compound_->GetAtomSpecs().begin(); mol::AtomHandleList atoms=rh.GetAtomList(); mol::AtomHandleList::iterator i=atoms.begin(); @@ -53,12 +54,30 @@ void RuleBasedBuilder::CheckResidueCompleteness(const mol::ResidueHandle& rh) { if ((*j).ordinal!=static_cast<int>((*i).Impl()->GetState())) { this->OnMissingAtom(rh, (*j).name); } else { - this->FillAtomProps(*i, *j); ++i; } } } +bool RuleBasedBuilder::HasUnknownAtoms(mol::ResidueHandle res) +{ + this->LookupCompound(res); + if (!last_compound_) { + return true; + } + this->ReorderAtoms(res, last_compound_); + AtomSpecList::const_iterator j=last_compound_->GetAtomSpecs().begin(); + mol::AtomHandleList atoms=res.GetAtomList(); + mol::AtomHandleList::iterator i=atoms.begin(); + for (mol::AtomHandleList::iterator + i=atoms.begin(), e=atoms.end(); i!=e; ++i) { + if ((*i).Impl()->GetState()==std::numeric_limits<unsigned int>::max()) { + return true; + } + } + return false; +} + void RuleBasedBuilder::FillAtomProps(mol::AtomHandle atom, const AtomSpec& spec) { mol::AtomProp props=atom.GetAtomProps(); @@ -75,8 +94,9 @@ void RuleBasedBuilder::FillAtomProps(mol::AtomHandle atom, const AtomSpec& spec) atom.SetAtomProps(props); } -void RuleBasedBuilder::FillResidueProps(mol::ResidueHandle residue) { - LookupCompound(residue); +void RuleBasedBuilder::FillResidueProps(mol::ResidueHandle residue) +{ + this->LookupCompound(residue); if (!last_compound_) return; residue.SetChemClass(last_compound_->GetChemClass()); @@ -103,6 +123,9 @@ void RuleBasedBuilder::LookupCompound(const mol::ResidueHandle& rh) void RuleBasedBuilder::ReorderAtoms(mol::ResidueHandle residue, CompoundPtr compound) { + if (last_residue_==residue) { + return; + } mol::impl::ResidueImplPtr impl=residue.Impl(); mol::impl::AtomImplList::iterator i=impl->GetAtomList().begin(); for (; i!=impl->GetAtomList().end(); ++i) { @@ -110,7 +133,7 @@ void RuleBasedBuilder::ReorderAtoms(mol::ResidueHandle residue, int index=compound->GetAtomSpecIndex(atom->GetName()); if (index==-1) { if (!this->OnUnknownAtom(mol::AtomHandle(atom))) { - atom->SetState(std::numeric_limits<int>::max()); + atom->SetState(std::numeric_limits<unsigned int>::max()); } continue; } @@ -118,6 +141,14 @@ void RuleBasedBuilder::ReorderAtoms(mol::ResidueHandle residue, } std::sort(impl->GetAtomList().begin(), impl->GetAtomList().end(), OrdinalComp()); + last_residue_=residue; + unknown_atoms_=this->HasUnknownAtoms(residue); + if (unknown_atoms_) { + LOG_WARNING("residue " << residue << " doesn't look like a standard " + << residue.GetKey()); + residue.SetChemClass(mol::ChemClass(mol::ChemClass::Unknown)); + residue.SetOneLetterCode('?'); + } } @@ -132,7 +163,9 @@ mol::ResidueKey RuleBasedBuilder::IdentifyResidue(const mol::ResidueHandle& rh) } } -mol::AtomHandle RuleBasedBuilder::LocateAtom(const mol::AtomHandleList& ahl, int ordinal) { +mol::AtomHandle RuleBasedBuilder::LocateAtom(const mol::AtomHandleList& ahl, + int ordinal) +{ if (ahl.empty()) return mol::AtomHandle(); const mol::AtomHandle* r_it=&ahl.back(); @@ -148,14 +181,26 @@ mol::AtomHandle RuleBasedBuilder::LocateAtom(const mol::AtomHandleList& ahl, int return not_found ? mol::AtomHandle() : *r_it; } -void RuleBasedBuilder::ConnectAtomsOfResidue(mol::ResidueHandle rh) { + +inline void dist_connect(RuleBasedBuilder* b, mol::AtomHandleList atoms) +{ + for (mol::AtomHandleList::const_iterator i=atoms.begin(), + e=atoms.end(); i!=e; ++i) { + b->DistanceBasedConnect(*i); + } +} +void RuleBasedBuilder::ConnectAtomsOfResidue(mol::ResidueHandle rh) +{ + LookupCompound(rh); + if (!last_compound_) { - mol::AtomHandleList atoms=rh.GetAtomList(); - for (mol::AtomHandleList::const_iterator i=atoms.begin(), - e=atoms.end(); i!=e; ++i) { - this->DistanceBasedConnect(*i); - } + dist_connect(this, rh.GetAtomList()); + return; + } + this->ReorderAtoms(rh, last_compound_); + if (unknown_atoms_) { + dist_connect(this, rh.GetAtomList()); return; } mol::XCSEditor e=rh.GetEntity().RequestXCSEditor(mol::BUFFERED_EDIT); @@ -166,19 +211,14 @@ void RuleBasedBuilder::ConnectAtomsOfResidue(mol::ResidueHandle rh) { mol::AtomHandle a1=this->LocateAtom(atoms, bond.atom_one); mol::AtomHandle a2=this->LocateAtom(atoms, bond.atom_two); if (a1.IsValid() && a2.IsValid() && this->IsBondFeasible(a1, a2)) { - e.Connect(a1, a2, bond.order); + e.Connect(a1, a2, bond.order); } } - for (mol::AtomHandleList::const_iterator i=atoms.begin(), - e=atoms.end(); i!=e; ++i) { - if (int((*i).Impl()->GetState())==std::numeric_limits<int>::max()) { - this->DistanceBasedConnect(*i); - } - } } void RuleBasedBuilder::ConnectResidueToNext(mol::ResidueHandle rh, - mol::ResidueHandle next) { + mol::ResidueHandle next) +{ if (!next.IsValid()) { return; } @@ -208,7 +248,8 @@ void RuleBasedBuilder::ConnectResidueToNext(mol::ResidueHandle rh, } -void RuleBasedBuilder::AssignTorsions(mol::ChainHandle chain) { +void RuleBasedBuilder::AssignTorsions(mol::ChainHandle chain) +{ if (chain.GetResidueCount()==0) return; std::vector<mol::ResidueHandle> rlist = chain.GetResidueList(); @@ -219,7 +260,8 @@ void RuleBasedBuilder::AssignTorsions(mol::ChainHandle chain) { } } -void RuleBasedBuilder::AssignTorsionsToResidue(mol::ResidueHandle residue) { +void RuleBasedBuilder::AssignTorsionsToResidue(mol::ResidueHandle residue) +{ /// The only components having named torsions are the standard set of amino /// acids, plus some of compounds derived from them such as selenium /// methionine. Things are simplified a lot by only storing the torsions diff --git a/modules/conop/src/rule_based_builder.hh b/modules/conop/src/rule_based_builder.hh index 0ab392b24af6b3a113cc05631c8094caf9674bc5..d04f422727d7edf80718a61f2195a933d83f4a29 100644 --- a/modules/conop/src/rule_based_builder.hh +++ b/modules/conop/src/rule_based_builder.hh @@ -108,14 +108,16 @@ public: /// \brief Set residue properties such as chemical class virtual void FillResidueProps(mol::ResidueHandle residue); - - + /// \brief whether the residue has unknown atoms + bool HasUnknownAtoms(mol::ResidueHandle res); /// \brief Check whether the residue has all required atoms. This does not /// include hydrogens and leaving atoms such as the terminal OXT. virtual bool IsResidueComplete(const mol::ResidueHandle& residue); private: CompoundLibPtr compound_lib_; CompoundPtr last_compound_; + mol::ResidueHandle last_residue_; + bool unknown_atoms_; void LookupCompound(const mol::ResidueHandle& rh); /// Change internal order of atoms in residue to the order given by compound void ReorderAtoms(mol::ResidueHandle residue, CompoundPtr compound);