From 6d884cf1b2f98dba3d14d1580559b21a065894b5 Mon Sep 17 00:00:00 2001 From: Xavier Robin <xavier.robin@unibas.ch> Date: Wed, 16 Aug 2023 10:56:21 +0200 Subject: [PATCH] feat: add obsolete information in compound lib This allows preferentially picking a non-obsolete compound in FindCompound by="smiles/inchi" --- CHANGELOG.txt | 10 ++- modules/conop/doc/compoundlib.rst | 19 +++++ modules/conop/pymod/export_compound.cc | 4 + modules/conop/src/compound.hh | 20 +++++ modules/conop/src/compound_lib.cc | 49 +++++++++++- modules/conop/src/compound_lib.hh | 1 + modules/conop/tests/test_complib.py | 12 +++ modules/conop/tests/test_compound.py | 23 ++++++ .../conop/tests/testfiles/test_compounds.cif | 77 +++++++++++++++++++ modules/io/src/mol/chemdict_parser.cc | 12 ++- 10 files changed, 219 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 145e431ad..6154c0643 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -6,10 +6,14 @@ Changes in Release 2.6.0 * Rotamer based compression in OMF file format. * Chain mapping updates. Default chain mapping strategy now uses increased sampling - only marginal runtime increase. - * The compound library now knows about SMILES strings and atom charges. - Compound libraries created by OST 1.5.0 or later can still be read (SMILES - will be empty and charges set to 0, with a warning). Older files are no + * The compound library now knows about SMILES strings, atom charges, and + contain information that a compound is obsolete (including the replacement, + if available). Compound libraries created by OST 1.5.0 or later can still be + read (SMILES will be empty, charges set to 0, no information about the + obsolete status of compounds, with warnings). Older files are no longer supported. + * FindCompound now takes an optional 'by' argument to find compounds by SMILES + string, InChI code or InChI key. * The compound library now reads the "InChI=" part of InChI codes. * Several bug fixes and improvements. diff --git a/modules/conop/doc/compoundlib.rst b/modules/conop/doc/compoundlib.rst index 73514ba01..75c1065af 100644 --- a/modules/conop/doc/compoundlib.rst +++ b/modules/conop/doc/compoundlib.rst @@ -65,6 +65,11 @@ built with OST 1.5.0 or later can be loaded. The following keys are available: "tlc" (three-letter-code or compound ID), "inchi_code", "inchi_key" and "smiles". + .. note:: + Multiple compounds may share the same SMILES string or InChI code or key. + An arbitrary compound will be returned, with a preference for a + non-obsolete one. + If no compound with that name exists, the function returns None. Compounds are cached after they have been loaded with FindCompound. @@ -185,6 +190,20 @@ built with OST 1.5.0 or later can be loaded. OpenEye OEToolkits. :type: :class:`str` + + .. attribute:: obsolete + + Whether the component has been obsoleted by the PDB. + + :type: :class:`bool` + + .. attribute:: replaced_by + + If the component has been obsoleted by the PDB, this is the three-letter + code of the compound that replaces it. This is not set for all obsolete + compounds. + + :type: :class:`str` .. class:: AtomSpec diff --git a/modules/conop/pymod/export_compound.cc b/modules/conop/pymod/export_compound.cc index edd30c258..465dc7c63 100644 --- a/modules/conop/pymod/export_compound.cc +++ b/modules/conop/pymod/export_compound.cc @@ -130,6 +130,10 @@ void export_Compound() { .add_property("smiles", make_function(&Compound::GetSMILES, return_value_policy<copy_const_reference>())) + .add_property("obsolete", &Compound::GetObsolete) + .add_property("replaced_by", + make_function(&Compound::GetReplacedBy, + return_value_policy<copy_const_reference>())) ; class_<AtomSpec>("AtomSpec", no_init) diff --git a/modules/conop/src/compound.hh b/modules/conop/src/compound.hh index 0630c1caa..1b0eb1cd5 100644 --- a/modules/conop/src/compound.hh +++ b/modules/conop/src/compound.hh @@ -153,6 +153,8 @@ public: inchi_(), inchi_key_(), smiles_(), + replaced_by_(), + obsolete_(), atom_specs_(), bond_specs_(), chem_class_(), @@ -206,6 +208,22 @@ public: return chem_class_; } + void SetObsolete(bool obsolete) { + obsolete_=obsolete; + } + + bool GetObsolete() const { + return obsolete_; + } + + void SetReplacedBy(const String& replaced_by) { + replaced_by_=replaced_by; + } + + const String& GetReplacedBy() const { + return replaced_by_; + } + void SetChemType(mol::ChemType chem_type) { chem_type_=chem_type; } @@ -290,6 +308,8 @@ private: String inchi_; String inchi_key_; String smiles_; + String replaced_by_; + bool obsolete_; AtomSpecList atom_specs_; BondSpecList bond_specs_; mol::ChemClass chem_class_; diff --git a/modules/conop/src/compound_lib.cc b/modules/conop/src/compound_lib.cc index 1611570bc..b875a3b65 100644 --- a/modules/conop/src/compound_lib.cc +++ b/modules/conop/src/compound_lib.cc @@ -68,7 +68,9 @@ const char* CREATE_CMD[]={ " name VARCHAR(256), " " inchi_code TEXT, " " inchi_key TEXT, " -" smiles TEXT " +" smiles TEXT, " +" obsolete BOOL, " +" replaced_by VARCHAR(5) " ");", " CREATE UNIQUE INDEX IF NOT EXISTS compound_tlc_index ON chem_compounds " " (tlc, dialect)", @@ -110,8 +112,9 @@ const char* CREATE_CMD[]={ const char* INSERT_COMPOUND_STATEMENT="INSERT INTO chem_compounds " " (tlc, olc, dialect, chem_class, chem_type, formula, pdb_initial, " -" pdb_modified, name, inchi_code, inchi_key, smiles) " -" VALUES (?, ?, ?, ?, ?, ?, DATE(?), DATE(?), ?, ?, ?, ?)"; +" pdb_modified, name, inchi_code, inchi_key, smiles, obsolete, " +" replaced_by) " +" VALUES (?, ?, ?, ?, ?, ?, DATE(?), DATE(?), ?, ?, ?, ?, ?, ?)"; const char* INSERT_ATOM_STATEMENT="INSERT INTO atoms " " (compound_id, name, alt_name, element, is_aromatic, " @@ -276,6 +279,11 @@ void CompoundLib::AddCompound(const CompoundPtr& compound) compound->GetInchiKey().length(), NULL); sqlite3_bind_text(stmt, 12, compound->GetSMILES().c_str(), compound->GetSMILES().length(), NULL); + sqlite3_bind_int(stmt, 13, compound->GetObsolete()); + if (compound->GetReplacedBy() != "") { + sqlite3_bind_text(stmt, 14, compound->GetReplacedBy().c_str(), + compound->GetReplacedBy().length(), NULL); + } } else { LOG_ERROR(sqlite3_errmsg(db_->ptr)); sqlite3_finalize(stmt); @@ -411,6 +419,14 @@ CompoundLibPtr CompoundLib::Load(const String& database, bool readonly) lib->smiles_available_ = retval==SQLITE_OK; sqlite3_finalize(stmt); + // check if obsolete info are available + aq="SELECT obsolete, replaced_by FROM chem_compounds LIMIT 1"; + retval=sqlite3_prepare_v2(lib->db_->ptr, aq.c_str(), + static_cast<int>(aq.length()), + &stmt, NULL); + lib->obsolete_available_ = retval==SQLITE_OK; + sqlite3_finalize(stmt); + // check if charges are available aq="SELECT charge FROM atoms LIMIT 1"; retval=sqlite3_prepare_v2(lib->db_->ptr, aq.c_str(), @@ -435,6 +451,11 @@ CompoundLibPtr CompoundLib::Load(const String& database, bool readonly) << lib->ost_version_used_ << ". Only empty strings will be returned."); } + if (!lib->obsolete_available_) { + LOG_WARNING("Obsolete information not available in compound library v." + << lib->ost_version_used_ + << ". No compound will be marked as obsolete."); + } if (!lib->charges_available_) { LOG_WARNING("Charges not available in compound library v." << lib->ost_version_used_ @@ -531,8 +552,15 @@ CompoundPtr CompoundLib::FindCompound(const String& id, if(smiles_available_) { query+=", smiles"; } + if(obsolete_available_) { + query+=", obsolete, replaced_by"; + } query+=" FROM chem_compounds" " WHERE " + by + "=? AND dialect='"+String(1, char(dialect))+"'"; + if(obsolete_available_) { + // Prefer active compounds, then the ones with a replacement + query+=" ORDER BY obsolete, replaced_by NULLS LAST"; + } // Run the query sqlite3_stmt* stmt; @@ -568,12 +596,24 @@ CompoundPtr CompoundLib::FindCompound(const String& id, if (inchi_key) { compound->SetInchiKey(inchi_key); } + int next_column = 10; if (smiles_available_) { - const char* smiles=reinterpret_cast<const char*>(sqlite3_column_text(stmt, 10)); + const char* smiles=reinterpret_cast<const char*>(sqlite3_column_text(stmt, next_column)); + next_column++; if (smiles) { compound->SetSMILES(smiles); } } + if (obsolete_available_) { + bool obsolete=sqlite3_column_int(stmt, next_column); + compound->SetObsolete(obsolete); + next_column++; + const char* replaced_by=reinterpret_cast<const char*>(sqlite3_column_text(stmt, next_column)); + next_column++; + if (replaced_by) { + compound->SetReplacedBy(replaced_by); + } + } // Load atoms and bonds this->LoadAtomsFromDB(compound, pk); @@ -597,6 +637,7 @@ CompoundLib::CompoundLib(): db_(new Database), compound_cache_(), smiles_available_(), + obsolete_available_(), charges_available_(), creation_date_(), ost_version_used_() { } diff --git a/modules/conop/src/compound_lib.hh b/modules/conop/src/compound_lib.hh index c2e75f6a6..8226ecdba 100644 --- a/modules/conop/src/compound_lib.hh +++ b/modules/conop/src/compound_lib.hh @@ -57,6 +57,7 @@ private: Database* db_; mutable CompoundMap compound_cache_; bool smiles_available_; //whether smiles are available in db - introduced in 2.6.0 + bool obsolete_available_; //whether obsolete info is available in db - introduced in 2.6.0 bool charges_available_; //whether atom charges are available in db - introduced in 2.6.0 Date creation_date_; String ost_version_used_; diff --git a/modules/conop/tests/test_complib.py b/modules/conop/tests/test_complib.py index 960d65e0a..275fd1459 100644 --- a/modules/conop/tests/test_complib.py +++ b/modules/conop/tests/test_complib.py @@ -44,6 +44,18 @@ class TestCompLib(unittest.TestCase): self.assertTrue(comp_nh4.atom_specs[0].charge == 1) self.assertTrue(comp_nh4.atom_specs[1].charge == 0) + def test_obsolete(self): + complib = self.complib + comp_ox = complib.FindCompound("OX") + # First test that we do get data + self.assertTrue(comp_ox.smiles == "O") + # Now the obsolete part + self.assertTrue(comp_ox.obsolete) + self.assertTrue(comp_ox.replaced_by == "O") + # Ensure not everything is obsolete + comp_001 = complib.FindCompound("001") + self.assertFalse(comp_001.obsolete) + def test_default_lib_version(self): compound_lib = conop.GetDefaultLib() if compound_lib is None: diff --git a/modules/conop/tests/test_compound.py b/modules/conop/tests/test_compound.py index 39cfee070..18b88b861 100644 --- a/modules/conop/tests/test_compound.py +++ b/modules/conop/tests/test_compound.py @@ -1,4 +1,5 @@ import unittest + from ost import mol, conop @@ -23,6 +24,28 @@ class TestCompound(unittest.TestCase): self.assertEqual(compound.inchi_key, "QNAYBMKLOCPYGJ-REOHCLBHSA-N") self.assertEqual(compound.smiles, "C[C@@H](C(=O)O)N" ) + def testFindCompoundBySMILES(self): + """ Test FindCompound by="smiles".""" + compound = self.compound_lib.FindCompound('O', by="smiles") + self.assertNotEqual(compound, None) + self.assertEqual(compound.smiles, 'O') + + # Now we should prefer a non-obsolete compound + # Default ordering has DIS as first pick but FindCompound should sort + # active compounds first. + # This assumes there are non-obsolete O/HOH compounds in the compound + # lib, which should always be the case. + self.assertFalse(compound.obsolete) + + def testFindCompoundByInChI(self): + """ Test FindCompound by="inchi_code|key".""" + inchi_code = "InChI=1/H2O/h1H2" + inchi_key = "XLYOFNOQVPJJNP-UHFFFAOYAF" + compound = self.compound_lib.FindCompound(inchi_code, by="inchi_code") + self.assertNotEqual(compound, None) + self.assertEqual(compound.inchi, inchi_code) + self.assertEqual(compound.inchi_key, inchi_key) + if __name__=='__main__': from ost import testutils diff --git a/modules/conop/tests/testfiles/test_compounds.cif b/modules/conop/tests/testfiles/test_compounds.cif index c961f2cb8..d07ebaf97 100644 --- a/modules/conop/tests/testfiles/test_compounds.cif +++ b/modules/conop/tests/testfiles/test_compounds.cif @@ -699,3 +699,80 @@ _pdbx_chem_comp_audit.processing_site NH4 "Create component" 1999-07-08 RCSB NH4 "Modify descriptor" 2011-06-04 RCSB # + +data_OX +# +_chem_comp.id OX +_chem_comp.name "BOUND OXYGEN" +_chem_comp.type NON-POLYMER +_chem_comp.pdbx_type HETAIN +_chem_comp.formula O +_chem_comp.mon_nstd_parent_comp_id ? +_chem_comp.pdbx_synonyms ? +_chem_comp.pdbx_formal_charge 0 +_chem_comp.pdbx_initial_date 1999-07-08 +_chem_comp.pdbx_modified_date 2008-10-14 +_chem_comp.pdbx_ambiguous_flag N +_chem_comp.pdbx_release_status OBS +_chem_comp.pdbx_replaced_by O +_chem_comp.pdbx_replaces ? +_chem_comp.formula_weight 15.999 +_chem_comp.one_letter_code ? +_chem_comp.three_letter_code OX +_chem_comp.pdbx_model_coordinates_details ? +_chem_comp.pdbx_model_coordinates_missing_flag N +_chem_comp.pdbx_ideal_coordinates_details ? +_chem_comp.pdbx_ideal_coordinates_missing_flag N +_chem_comp.pdbx_model_coordinates_db_code ? +_chem_comp.pdbx_subcomponent_list ? +_chem_comp.pdbx_processing_site RCSB +# +_chem_comp_atom.comp_id OX +_chem_comp_atom.atom_id O +_chem_comp_atom.alt_atom_id O +_chem_comp_atom.type_symbol O +_chem_comp_atom.charge 0 +_chem_comp_atom.pdbx_align 1 +_chem_comp_atom.pdbx_aromatic_flag N +_chem_comp_atom.pdbx_leaving_atom_flag N +_chem_comp_atom.pdbx_stereo_config N +_chem_comp_atom.model_Cartn_x 0.000 +_chem_comp_atom.model_Cartn_y 0.000 +_chem_comp_atom.model_Cartn_z 0.000 +_chem_comp_atom.pdbx_model_Cartn_x_ideal -0.064 +_chem_comp_atom.pdbx_model_Cartn_y_ideal 0.000 +_chem_comp_atom.pdbx_model_Cartn_z_ideal 0.000 +_chem_comp_atom.pdbx_component_atom_id O +_chem_comp_atom.pdbx_component_comp_id OX +_chem_comp_atom.pdbx_ordinal 1 +# +loop_ +_pdbx_chem_comp_descriptor.comp_id +_pdbx_chem_comp_descriptor.type +_pdbx_chem_comp_descriptor.program +_pdbx_chem_comp_descriptor.program_version +_pdbx_chem_comp_descriptor.descriptor +OX SMILES ACDLabs 10.04 O +OX InChI InChI 1.02b InChI=1/H2O/h1H2 +OX InChIKey InChI 1.02b XLYOFNOQVPJJNP-UHFFFAOYAF +OX SMILES_CANONICAL CACTVS 3.341 "[O]" +OX SMILES CACTVS 3.341 "[O]" +OX SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 O +OX SMILES "OpenEye OEToolkits" 1.5.0 O +# +loop_ +_pdbx_chem_comp_identifier.comp_id +_pdbx_chem_comp_identifier.type +_pdbx_chem_comp_identifier.program +_pdbx_chem_comp_identifier.program_version +_pdbx_chem_comp_identifier.identifier +OX "SYSTEMATIC NAME" ACDLabs 10.04 water +OX "SYSTEMATIC NAME" "OpenEye OEToolkits" 1.5.0 oxidane +# +loop_ +_pdbx_chem_comp_audit.comp_id +_pdbx_chem_comp_audit.action_type +_pdbx_chem_comp_audit.date +_pdbx_chem_comp_audit.processing_site +OX "Create component" 1999-07-08 RCSB +# diff --git a/modules/io/src/mol/chemdict_parser.cc b/modules/io/src/mol/chemdict_parser.cc index 5bdd156ed..a6e6b71db 100644 --- a/modules/io/src/mol/chemdict_parser.cc +++ b/modules/io/src/mol/chemdict_parser.cc @@ -141,7 +141,17 @@ void ChemdictParser::OnDataItem(const StarDataItem& item) compound_->SetChemClass(mol::ChemClass(mol::ChemClass::WATER)); compound_->SetOneLetterCode('.'); } - } else if (item.GetName()==StringRef("one_letter_code", 15)) { + } else if (item.GetName()==StringRef("pdbx_release_status", 19)) { + String release_status = item.GetValue().str(); + if (release_status == "OBS") { + compound_->SetObsolete(true); + } + } else if (item.GetName()==StringRef("pdbx_replaced_by", 16)) { + String replaced_by = item.GetValue().str(); + if (replaced_by != "?") { + compound_->SetReplacedBy(replaced_by); + } + } else if (item.GetName()==StringRef("one_letter_code", 15)) { if (item.GetValue().length()==1) { compound_->SetOneLetterCode(item.GetValue()[0]); } -- GitLab