From 6d884cf1b2f98dba3d14d1580559b21a065894b5 Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Wed, 16 Aug 2023 10:56:21 +0200
Subject: [PATCH] feat: add obsolete information in compound lib

This allows preferentially picking a non-obsolete compound
in FindCompound by="smiles/inchi"
---
 CHANGELOG.txt                                 | 10 ++-
 modules/conop/doc/compoundlib.rst             | 19 +++++
 modules/conop/pymod/export_compound.cc        |  4 +
 modules/conop/src/compound.hh                 | 20 +++++
 modules/conop/src/compound_lib.cc             | 49 +++++++++++-
 modules/conop/src/compound_lib.hh             |  1 +
 modules/conop/tests/test_complib.py           | 12 +++
 modules/conop/tests/test_compound.py          | 23 ++++++
 .../conop/tests/testfiles/test_compounds.cif  | 77 +++++++++++++++++++
 modules/io/src/mol/chemdict_parser.cc         | 12 ++-
 10 files changed, 219 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 145e431ad..6154c0643 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -6,10 +6,14 @@ Changes in Release 2.6.0
  * Rotamer based compression in OMF file format.
  * Chain mapping updates. Default chain mapping strategy now uses increased
    sampling - only marginal runtime increase.
- * The compound library now knows about SMILES strings and atom charges.
-   Compound libraries created by OST 1.5.0 or later can still be read (SMILES
-   will be empty and charges set to 0, with a warning). Older files are no
+ * The compound library now knows about SMILES strings, atom charges, and
+   contain information that a compound is obsolete (including the replacement,
+   if available). Compound libraries created by OST 1.5.0 or later can still be
+   read (SMILES will be empty, charges set to 0, no information about the
+   obsolete status of compounds, with warnings). Older files are no
    longer supported.
+ * FindCompound now takes an optional 'by' argument to find compounds by SMILES
+   string, InChI code or InChI key.
  * The compound library now reads the "InChI=" part of InChI codes.
  * Several bug fixes and improvements.
 
diff --git a/modules/conop/doc/compoundlib.rst b/modules/conop/doc/compoundlib.rst
index 73514ba01..75c1065af 100644
--- a/modules/conop/doc/compoundlib.rst
+++ b/modules/conop/doc/compoundlib.rst
@@ -65,6 +65,11 @@ built with OST 1.5.0 or later can be loaded.
     The following keys are available: "tlc" (three-letter-code or compound ID),
     "inchi_code", "inchi_key" and "smiles".
 
+    .. note::
+      Multiple compounds may share the same SMILES string or InChI code or key.
+      An arbitrary compound will be returned, with a preference for a
+      non-obsolete one.
+
     If no compound with that name exists, the function returns None.
 
     Compounds are cached after they have been loaded with FindCompound.
@@ -185,6 +190,20 @@ built with OST 1.5.0 or later can be loaded.
     OpenEye OEToolkits.
 
     :type: :class:`str`
+
+  .. attribute:: obsolete
+
+    Whether the component has been obsoleted by the PDB.
+
+    :type: :class:`bool`
+
+  .. attribute:: replaced_by
+
+    If the component has been obsoleted by the PDB, this is the three-letter
+    code of the compound that replaces it. This is not set for all obsolete
+    compounds.
+
+    :type: :class:`str`
     
 
 .. class:: AtomSpec
diff --git a/modules/conop/pymod/export_compound.cc b/modules/conop/pymod/export_compound.cc
index edd30c258..465dc7c63 100644
--- a/modules/conop/pymod/export_compound.cc
+++ b/modules/conop/pymod/export_compound.cc
@@ -130,6 +130,10 @@ void export_Compound() {
     .add_property("smiles",
                   make_function(&Compound::GetSMILES,
                                 return_value_policy<copy_const_reference>()))
+    .add_property("obsolete", &Compound::GetObsolete)
+    .add_property("replaced_by",
+                  make_function(&Compound::GetReplacedBy,
+                                return_value_policy<copy_const_reference>()))
   ;
   
   class_<AtomSpec>("AtomSpec", no_init)
diff --git a/modules/conop/src/compound.hh b/modules/conop/src/compound.hh
index 0630c1caa..1b0eb1cd5 100644
--- a/modules/conop/src/compound.hh
+++ b/modules/conop/src/compound.hh
@@ -153,6 +153,8 @@ public:
     inchi_(),
     inchi_key_(),
     smiles_(),
+    replaced_by_(),
+    obsolete_(),
     atom_specs_(),
     bond_specs_(),
     chem_class_(),
@@ -206,6 +208,22 @@ public:
     return chem_class_;
   }
 
+  void SetObsolete(bool obsolete) {
+    obsolete_=obsolete;
+  }
+
+  bool GetObsolete() const {
+    return obsolete_;
+  }
+
+  void SetReplacedBy(const String& replaced_by) {
+    replaced_by_=replaced_by;
+  }
+
+  const String& GetReplacedBy() const {
+    return replaced_by_;
+  }
+
   void SetChemType(mol::ChemType chem_type) {
     chem_type_=chem_type;
   }
@@ -290,6 +308,8 @@ private:
   String                       inchi_;
   String                       inchi_key_;
   String                       smiles_;
+  String                       replaced_by_;
+  bool                         obsolete_;
   AtomSpecList                 atom_specs_;
   BondSpecList                 bond_specs_;
   mol::ChemClass               chem_class_;
diff --git a/modules/conop/src/compound_lib.cc b/modules/conop/src/compound_lib.cc
index 1611570bc..b875a3b65 100644
--- a/modules/conop/src/compound_lib.cc
+++ b/modules/conop/src/compound_lib.cc
@@ -68,7 +68,9 @@ const char* CREATE_CMD[]={
 "  name              VARCHAR(256),                                              "
 "  inchi_code        TEXT,                                                      "
 "  inchi_key         TEXT,                                                      "
-"  smiles            TEXT                                                       "
+"  smiles            TEXT,                                                      "
+"  obsolete          BOOL,                                                      "
+"  replaced_by       VARCHAR(5)                                                 "
 ");",
 " CREATE UNIQUE INDEX IF NOT EXISTS compound_tlc_index ON chem_compounds        "
 "                                  (tlc, dialect)",
@@ -110,8 +112,9 @@ const char* CREATE_CMD[]={
 
 const char* INSERT_COMPOUND_STATEMENT="INSERT INTO chem_compounds               "
 "        (tlc, olc, dialect, chem_class, chem_type, formula, pdb_initial,       "
-"         pdb_modified, name, inchi_code, inchi_key, smiles) "
-" VALUES (?, ?, ?, ?, ?, ?, DATE(?), DATE(?), ?, ?, ?, ?)";
+"         pdb_modified, name, inchi_code, inchi_key, smiles, obsolete,          "
+"         replaced_by) "
+" VALUES (?, ?, ?, ?, ?, ?, DATE(?), DATE(?), ?, ?, ?, ?, ?, ?)";
 
 const char* INSERT_ATOM_STATEMENT="INSERT INTO atoms                            "
 "        (compound_id, name, alt_name, element, is_aromatic,                    "
@@ -276,6 +279,11 @@ void CompoundLib::AddCompound(const CompoundPtr& compound)
                       compound->GetInchiKey().length(), NULL);
     sqlite3_bind_text(stmt, 12, compound->GetSMILES().c_str(),
                       compound->GetSMILES().length(), NULL);
+    sqlite3_bind_int(stmt, 13, compound->GetObsolete());
+    if (compound->GetReplacedBy() != "") {
+      sqlite3_bind_text(stmt, 14, compound->GetReplacedBy().c_str(),
+                        compound->GetReplacedBy().length(), NULL);
+    }
   } else {
     LOG_ERROR(sqlite3_errmsg(db_->ptr));
     sqlite3_finalize(stmt);
@@ -411,6 +419,14 @@ CompoundLibPtr CompoundLib::Load(const String& database, bool readonly)
   lib->smiles_available_ = retval==SQLITE_OK;
   sqlite3_finalize(stmt);
 
+  // check if obsolete info are available
+  aq="SELECT obsolete, replaced_by FROM chem_compounds LIMIT 1";
+  retval=sqlite3_prepare_v2(lib->db_->ptr, aq.c_str(),
+                            static_cast<int>(aq.length()),
+                            &stmt, NULL);
+  lib->obsolete_available_ = retval==SQLITE_OK;
+  sqlite3_finalize(stmt);
+
   // check if charges are available
   aq="SELECT charge FROM atoms LIMIT 1";
   retval=sqlite3_prepare_v2(lib->db_->ptr, aq.c_str(),
@@ -435,6 +451,11 @@ CompoundLibPtr CompoundLib::Load(const String& database, bool readonly)
                 << lib->ost_version_used_
                 << ". Only empty strings will be returned.");
   }
+  if (!lib->obsolete_available_) {
+    LOG_WARNING("Obsolete information not available in compound library v."
+                << lib->ost_version_used_
+                << ". No compound will be marked as obsolete.");
+  }
   if (!lib->charges_available_) {
     LOG_WARNING("Charges not available in compound library v."
                 << lib->ost_version_used_
@@ -531,8 +552,15 @@ CompoundPtr CompoundLib::FindCompound(const String& id,
   if(smiles_available_) {
     query+=", smiles";
   }
+  if(obsolete_available_) {
+    query+=", obsolete, replaced_by";
+  }
   query+=" FROM chem_compounds"
          " WHERE " + by + "=? AND dialect='"+String(1, char(dialect))+"'";
+  if(obsolete_available_) {
+    // Prefer active compounds, then the ones with a replacement
+    query+=" ORDER BY obsolete, replaced_by NULLS LAST";
+  }
 
   // Run the query
   sqlite3_stmt* stmt;
@@ -568,12 +596,24 @@ CompoundPtr CompoundLib::FindCompound(const String& id,
       if (inchi_key) {
         compound->SetInchiKey(inchi_key);
       }
+      int next_column = 10;
       if (smiles_available_) {
-        const char* smiles=reinterpret_cast<const char*>(sqlite3_column_text(stmt, 10));
+        const char* smiles=reinterpret_cast<const char*>(sqlite3_column_text(stmt, next_column));
+        next_column++;
         if (smiles) {
           compound->SetSMILES(smiles);
         }
       }
+      if (obsolete_available_) {
+        bool obsolete=sqlite3_column_int(stmt, next_column);
+        compound->SetObsolete(obsolete);
+        next_column++;
+        const char* replaced_by=reinterpret_cast<const char*>(sqlite3_column_text(stmt, next_column));
+        next_column++;
+        if (replaced_by) {
+          compound->SetReplacedBy(replaced_by);
+        }
+      }
 
       // Load atoms and bonds      
       this->LoadAtomsFromDB(compound, pk);
@@ -597,6 +637,7 @@ CompoundLib::CompoundLib():
   db_(new Database),
   compound_cache_(),
   smiles_available_(),
+  obsolete_available_(),
   charges_available_(),
   creation_date_(),
   ost_version_used_() { }
diff --git a/modules/conop/src/compound_lib.hh b/modules/conop/src/compound_lib.hh
index c2e75f6a6..8226ecdba 100644
--- a/modules/conop/src/compound_lib.hh
+++ b/modules/conop/src/compound_lib.hh
@@ -57,6 +57,7 @@ private:
   Database* db_;
   mutable CompoundMap       compound_cache_;
   bool                      smiles_available_; //whether smiles are available in db - introduced in 2.6.0
+  bool                      obsolete_available_; //whether obsolete info is available in db - introduced in 2.6.0
   bool                      charges_available_; //whether atom charges are available in db - introduced in 2.6.0
   Date                      creation_date_;
   String                    ost_version_used_;
diff --git a/modules/conop/tests/test_complib.py b/modules/conop/tests/test_complib.py
index 960d65e0a..275fd1459 100644
--- a/modules/conop/tests/test_complib.py
+++ b/modules/conop/tests/test_complib.py
@@ -44,6 +44,18 @@ class TestCompLib(unittest.TestCase):
         self.assertTrue(comp_nh4.atom_specs[0].charge == 1)
         self.assertTrue(comp_nh4.atom_specs[1].charge == 0)
 
+    def test_obsolete(self):
+        complib = self.complib
+        comp_ox = complib.FindCompound("OX")
+        # First test that we do get data
+        self.assertTrue(comp_ox.smiles == "O")
+        # Now the obsolete part
+        self.assertTrue(comp_ox.obsolete)
+        self.assertTrue(comp_ox.replaced_by == "O")
+        # Ensure not everything is obsolete
+        comp_001 = complib.FindCompound("001")
+        self.assertFalse(comp_001.obsolete)
+
     def test_default_lib_version(self):
         compound_lib = conop.GetDefaultLib()
         if compound_lib is None:
diff --git a/modules/conop/tests/test_compound.py b/modules/conop/tests/test_compound.py
index 39cfee070..18b88b861 100644
--- a/modules/conop/tests/test_compound.py
+++ b/modules/conop/tests/test_compound.py
@@ -1,4 +1,5 @@
 import unittest
+
 from ost import mol, conop
 
 
@@ -23,6 +24,28 @@ class TestCompound(unittest.TestCase):
         self.assertEqual(compound.inchi_key, "QNAYBMKLOCPYGJ-REOHCLBHSA-N")
         self.assertEqual(compound.smiles, "C[C@@H](C(=O)O)N"  )
 
+    def testFindCompoundBySMILES(self):
+        """ Test FindCompound by="smiles"."""
+        compound = self.compound_lib.FindCompound('O', by="smiles")
+        self.assertNotEqual(compound, None)
+        self.assertEqual(compound.smiles, 'O')
+
+        # Now we should prefer a non-obsolete compound
+        # Default ordering has DIS as first pick but FindCompound should sort
+        # active compounds first.
+        # This assumes there are non-obsolete O/HOH compounds in the compound
+        # lib, which should always be the case.
+        self.assertFalse(compound.obsolete)
+
+    def testFindCompoundByInChI(self):
+        """ Test FindCompound by="inchi_code|key"."""
+        inchi_code = "InChI=1/H2O/h1H2"
+        inchi_key = "XLYOFNOQVPJJNP-UHFFFAOYAF"
+        compound = self.compound_lib.FindCompound(inchi_code, by="inchi_code")
+        self.assertNotEqual(compound, None)
+        self.assertEqual(compound.inchi, inchi_code)
+        self.assertEqual(compound.inchi_key, inchi_key)
+
      
 if __name__=='__main__':
     from ost import testutils
diff --git a/modules/conop/tests/testfiles/test_compounds.cif b/modules/conop/tests/testfiles/test_compounds.cif
index c961f2cb8..d07ebaf97 100644
--- a/modules/conop/tests/testfiles/test_compounds.cif
+++ b/modules/conop/tests/testfiles/test_compounds.cif
@@ -699,3 +699,80 @@ _pdbx_chem_comp_audit.processing_site
 NH4 "Create component"  1999-07-08 RCSB
 NH4 "Modify descriptor" 2011-06-04 RCSB
 #
+
+data_OX
+#
+_chem_comp.id                                    OX
+_chem_comp.name                                  "BOUND OXYGEN"
+_chem_comp.type                                  NON-POLYMER
+_chem_comp.pdbx_type                             HETAIN
+_chem_comp.formula                               O
+_chem_comp.mon_nstd_parent_comp_id               ?
+_chem_comp.pdbx_synonyms                         ?
+_chem_comp.pdbx_formal_charge                    0
+_chem_comp.pdbx_initial_date                     1999-07-08
+_chem_comp.pdbx_modified_date                    2008-10-14
+_chem_comp.pdbx_ambiguous_flag                   N
+_chem_comp.pdbx_release_status                   OBS
+_chem_comp.pdbx_replaced_by                      O
+_chem_comp.pdbx_replaces                         ?
+_chem_comp.formula_weight                        15.999
+_chem_comp.one_letter_code                       ?
+_chem_comp.three_letter_code                     OX
+_chem_comp.pdbx_model_coordinates_details        ?
+_chem_comp.pdbx_model_coordinates_missing_flag   N
+_chem_comp.pdbx_ideal_coordinates_details        ?
+_chem_comp.pdbx_ideal_coordinates_missing_flag   N
+_chem_comp.pdbx_model_coordinates_db_code        ?
+_chem_comp.pdbx_subcomponent_list                ?
+_chem_comp.pdbx_processing_site                  RCSB
+#
+_chem_comp_atom.comp_id                    OX
+_chem_comp_atom.atom_id                    O
+_chem_comp_atom.alt_atom_id                O
+_chem_comp_atom.type_symbol                O
+_chem_comp_atom.charge                     0
+_chem_comp_atom.pdbx_align                 1
+_chem_comp_atom.pdbx_aromatic_flag         N
+_chem_comp_atom.pdbx_leaving_atom_flag     N
+_chem_comp_atom.pdbx_stereo_config         N
+_chem_comp_atom.model_Cartn_x              0.000
+_chem_comp_atom.model_Cartn_y              0.000
+_chem_comp_atom.model_Cartn_z              0.000
+_chem_comp_atom.pdbx_model_Cartn_x_ideal   -0.064
+_chem_comp_atom.pdbx_model_Cartn_y_ideal   0.000
+_chem_comp_atom.pdbx_model_Cartn_z_ideal   0.000
+_chem_comp_atom.pdbx_component_atom_id     O
+_chem_comp_atom.pdbx_component_comp_id     OX
+_chem_comp_atom.pdbx_ordinal               1
+#
+loop_
+_pdbx_chem_comp_descriptor.comp_id
+_pdbx_chem_comp_descriptor.type
+_pdbx_chem_comp_descriptor.program
+_pdbx_chem_comp_descriptor.program_version
+_pdbx_chem_comp_descriptor.descriptor
+OX SMILES           ACDLabs              10.04 O
+OX InChI            InChI                1.02b InChI=1/H2O/h1H2
+OX InChIKey         InChI                1.02b XLYOFNOQVPJJNP-UHFFFAOYAF
+OX SMILES_CANONICAL CACTVS               3.341 "[O]"
+OX SMILES           CACTVS               3.341 "[O]"
+OX SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 O
+OX SMILES           "OpenEye OEToolkits" 1.5.0 O
+#
+loop_
+_pdbx_chem_comp_identifier.comp_id
+_pdbx_chem_comp_identifier.type
+_pdbx_chem_comp_identifier.program
+_pdbx_chem_comp_identifier.program_version
+_pdbx_chem_comp_identifier.identifier
+OX "SYSTEMATIC NAME" ACDLabs              10.04 water
+OX "SYSTEMATIC NAME" "OpenEye OEToolkits" 1.5.0 oxidane
+#
+loop_
+_pdbx_chem_comp_audit.comp_id
+_pdbx_chem_comp_audit.action_type
+_pdbx_chem_comp_audit.date
+_pdbx_chem_comp_audit.processing_site
+OX "Create component" 1999-07-08 RCSB
+#
diff --git a/modules/io/src/mol/chemdict_parser.cc b/modules/io/src/mol/chemdict_parser.cc
index 5bdd156ed..a6e6b71db 100644
--- a/modules/io/src/mol/chemdict_parser.cc
+++ b/modules/io/src/mol/chemdict_parser.cc
@@ -141,7 +141,17 @@ void ChemdictParser::OnDataItem(const StarDataItem& item)
         compound_->SetChemClass(mol::ChemClass(mol::ChemClass::WATER));
         compound_->SetOneLetterCode('.');
       }
-    } else if (item.GetName()==StringRef("one_letter_code", 15)) {
+    } else if (item.GetName()==StringRef("pdbx_release_status", 19)) {
+      String release_status = item.GetValue().str();
+      if (release_status == "OBS") {
+        compound_->SetObsolete(true);
+      }
+    } else if (item.GetName()==StringRef("pdbx_replaced_by", 16)) {
+      String replaced_by = item.GetValue().str();
+      if (replaced_by != "?") {
+        compound_->SetReplacedBy(replaced_by);
+      }
+    }  else if (item.GetName()==StringRef("one_letter_code", 15)) {
       if (item.GetValue().length()==1) {
         compound_->SetOneLetterCode(item.GetValue()[0]);   
       }
-- 
GitLab