From 454ba5a78afe8ef22709650d68e1007e085c5000 Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Tue, 23 Jul 2024 10:36:24 +0200
Subject: [PATCH] fix: do not fill compound lib with invalid compounds and
 atoms

Reading the BIRD data from prd-all.cif.gz which doesn't contain any
compound would result in a compound lib filled with dummy compounds and
atoms, containing no useful data. This commit skips compounds with no
_chem_comp.id (key data item) and non-loop atoms without an
_chem_comp_atom.atom_id, with a warning.
The result of reading prd-all.cif.gz is now an empty compound lib.
---
 modules/conop/src/compound.hh         |  6 ++++++
 modules/io/src/mol/chemdict_parser.cc | 28 +++++++++++++++++++++++++--
 modules/io/src/mol/chemdict_parser.hh |  2 ++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/modules/conop/src/compound.hh b/modules/conop/src/compound.hh
index 69c11d69e..07916c231 100644
--- a/modules/conop/src/compound.hh
+++ b/modules/conop/src/compound.hh
@@ -179,6 +179,12 @@ public:
   const String& GetID() const {
     return tlc_;
   }
+
+  /// \brief set three-letter code that is unique for every compound
+   void SetID(const String& id) {
+    tlc_ = id;
+  }
+
   Dialect GetDialect() const { return dialect_; }
   
   String GetDialectAsString() const { 
diff --git a/modules/io/src/mol/chemdict_parser.cc b/modules/io/src/mol/chemdict_parser.cc
index 2213225d6..382a983af 100644
--- a/modules/io/src/mol/chemdict_parser.cc
+++ b/modules/io/src/mol/chemdict_parser.cc
@@ -7,6 +7,8 @@ using namespace ost::conop;
   
 bool ChemdictParser::OnBeginData(const StringRef& data_name) 
 {
+  // Create empty compound skeleton
+  valid_compound_ = false;
   compound_.reset(new Compound(data_name.str()));
   compound_->SetDialect(dialect_);
   if (last_!=data_name[0]) {
@@ -14,6 +16,7 @@ bool ChemdictParser::OnBeginData(const StringRef& data_name)
     std::cout << last_ << std::flush;
   }
   atom_map_.clear();
+  valid_atom_ = false;
   return true;
 }
 
@@ -94,6 +97,14 @@ void ChemdictParser::OnDataRow(const StarLoopDesc& header,
 void ChemdictParser::OnDataItem(const StarDataItem& item)
 {
   if (item.GetCategory()==StringRef("chem_comp", 9)) {
+    if (item.GetName()==StringRef("id", 2)) {
+      if (compound_->GetID() != item.GetValue().str()) {
+          LOG_INFO("_chem_comp.id '" << item.GetValue() << "' doesn't match"
+                      << "ID from data block '" << compound_->GetID() << "'");
+          compound_->SetID(item.GetValue().str());
+      }
+      valid_compound_ = true;
+    }
     if (item.GetName()==StringRef("type", 4)) {
       // convert type to uppercase
       String type=item.GetValue().str();
@@ -159,6 +170,7 @@ void ChemdictParser::OnDataItem(const StarDataItem& item)
     }
   } else if (item.GetName()==StringRef("atom_id", 7)) {
     atom_.name=item.GetValue().str();
+    valid_atom_ = true;
   } else if (item.GetName()==StringRef("alt_atom_id", 11)) {
     if (compound_->GetID()=="ILE" && item.GetValue()==StringRef("CD1", 3)) {
       atom_.alt_name="CD";
@@ -177,12 +189,24 @@ void ChemdictParser::OnEndData()
 {
   if (compound_)
   {
-    if (compound_->GetID() != "UNL" &&
+    if (! valid_compound_)
+    {
+      LOG_WARNING("Skipping compound without _chem_comp.id: " << compound_->GetID());
+    }
+    else if (compound_->GetID() != "UNL" &&
         ! (ignore_reserved_ && IsNameReserved(compound_->GetID())) &&
         ! (ignore_obsolete_ && compound_->GetObsolete()))
     {
       if (compound_->GetAtomSpecs().empty()) {
-        compound_->AddAtom(atom_);
+        // This happens if we had a single atom
+        if (valid_atom_)
+        {
+          compound_->AddAtom(atom_);
+        }
+        else
+        {
+          LOG_WARNING("Adding compound with no atoms: " << compound_->GetID());
+        }
       }
       lib_->AddCompound(compound_);
     }
diff --git a/modules/io/src/mol/chemdict_parser.hh b/modules/io/src/mol/chemdict_parser.hh
index 7a30cc8fc..46a83ec3b 100644
--- a/modules/io/src/mol/chemdict_parser.hh
+++ b/modules/io/src/mol/chemdict_parser.hh
@@ -71,6 +71,7 @@ private:
   bool IsNameReserved(const String& data_name);
   conop::CompoundLibPtr                   lib_;
   conop::CompoundPtr                      compound_;
+  bool                                    valid_compound_;
   typedef enum {
     ATOM_NAME=0,
     ALT_ATOM_NAME=1,
@@ -93,6 +94,7 @@ private:
   std::map<String, int>                   atom_map_;
   LoopType                                loop_type_;  
   conop::AtomSpec                         atom_;
+  bool                                    valid_atom_;
   conop::Compound::Dialect                dialect_;
   bool                                    ignore_reserved_;
   bool                                    ignore_obsolete_;
-- 
GitLab