From 454ba5a78afe8ef22709650d68e1007e085c5000 Mon Sep 17 00:00:00 2001 From: Xavier Robin <xavier.robin@unibas.ch> Date: Tue, 23 Jul 2024 10:36:24 +0200 Subject: [PATCH] fix: do not fill compound lib with invalid compounds and atoms Reading the BIRD data from prd-all.cif.gz which doesn't contain any compound would result in a compound lib filled with dummy compounds and atoms, containing no useful data. This commit skips compounds with no _chem_comp.id (key data item) and non-loop atoms without an _chem_comp_atom.atom_id, with a warning. The result of reading prd-all.cif.gz is now an empty compound lib. --- modules/conop/src/compound.hh | 6 ++++++ modules/io/src/mol/chemdict_parser.cc | 28 +++++++++++++++++++++++++-- modules/io/src/mol/chemdict_parser.hh | 2 ++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/modules/conop/src/compound.hh b/modules/conop/src/compound.hh index 69c11d69e..07916c231 100644 --- a/modules/conop/src/compound.hh +++ b/modules/conop/src/compound.hh @@ -179,6 +179,12 @@ public: const String& GetID() const { return tlc_; } + + /// \brief set three-letter code that is unique for every compound + void SetID(const String& id) { + tlc_ = id; + } + Dialect GetDialect() const { return dialect_; } String GetDialectAsString() const { diff --git a/modules/io/src/mol/chemdict_parser.cc b/modules/io/src/mol/chemdict_parser.cc index 2213225d6..382a983af 100644 --- a/modules/io/src/mol/chemdict_parser.cc +++ b/modules/io/src/mol/chemdict_parser.cc @@ -7,6 +7,8 @@ using namespace ost::conop; bool ChemdictParser::OnBeginData(const StringRef& data_name) { + // Create empty compound skeleton + valid_compound_ = false; compound_.reset(new Compound(data_name.str())); compound_->SetDialect(dialect_); if (last_!=data_name[0]) { @@ -14,6 +16,7 @@ bool ChemdictParser::OnBeginData(const StringRef& data_name) std::cout << last_ << std::flush; } atom_map_.clear(); + valid_atom_ = false; return true; } @@ -94,6 +97,14 @@ void ChemdictParser::OnDataRow(const StarLoopDesc& header, void ChemdictParser::OnDataItem(const StarDataItem& item) { if (item.GetCategory()==StringRef("chem_comp", 9)) { + if (item.GetName()==StringRef("id", 2)) { + if (compound_->GetID() != item.GetValue().str()) { + LOG_INFO("_chem_comp.id '" << item.GetValue() << "' doesn't match" + << "ID from data block '" << compound_->GetID() << "'"); + compound_->SetID(item.GetValue().str()); + } + valid_compound_ = true; + } if (item.GetName()==StringRef("type", 4)) { // convert type to uppercase String type=item.GetValue().str(); @@ -159,6 +170,7 @@ void ChemdictParser::OnDataItem(const StarDataItem& item) } } else if (item.GetName()==StringRef("atom_id", 7)) { atom_.name=item.GetValue().str(); + valid_atom_ = true; } else if (item.GetName()==StringRef("alt_atom_id", 11)) { if (compound_->GetID()=="ILE" && item.GetValue()==StringRef("CD1", 3)) { atom_.alt_name="CD"; @@ -177,12 +189,24 @@ void ChemdictParser::OnEndData() { if (compound_) { - if (compound_->GetID() != "UNL" && + if (! valid_compound_) + { + LOG_WARNING("Skipping compound without _chem_comp.id: " << compound_->GetID()); + } + else if (compound_->GetID() != "UNL" && ! (ignore_reserved_ && IsNameReserved(compound_->GetID())) && ! (ignore_obsolete_ && compound_->GetObsolete())) { if (compound_->GetAtomSpecs().empty()) { - compound_->AddAtom(atom_); + // This happens if we had a single atom + if (valid_atom_) + { + compound_->AddAtom(atom_); + } + else + { + LOG_WARNING("Adding compound with no atoms: " << compound_->GetID()); + } } lib_->AddCompound(compound_); } diff --git a/modules/io/src/mol/chemdict_parser.hh b/modules/io/src/mol/chemdict_parser.hh index 7a30cc8fc..46a83ec3b 100644 --- a/modules/io/src/mol/chemdict_parser.hh +++ b/modules/io/src/mol/chemdict_parser.hh @@ -71,6 +71,7 @@ private: bool IsNameReserved(const String& data_name); conop::CompoundLibPtr lib_; conop::CompoundPtr compound_; + bool valid_compound_; typedef enum { ATOM_NAME=0, ALT_ATOM_NAME=1, @@ -93,6 +94,7 @@ private: std::map<String, int> atom_map_; LoopType loop_type_; conop::AtomSpec atom_; + bool valid_atom_; conop::Compound::Dialect dialect_; bool ignore_reserved_; bool ignore_obsolete_; -- GitLab