From f5acb7f9710b308f071bdbce0d50f877481e9013 Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Fri, 5 Jan 2024 15:49:11 +0100
Subject: [PATCH] feat: optionally ignore reserved compounds

---
 modules/conop/src/chemdict_tool.cc    | 24 +++++++++++++--------
 modules/io/src/mol/chemdict_parser.cc | 30 +++++++++++++++++++++++++--
 modules/io/src/mol/chemdict_parser.hh |  8 +++++--
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/modules/conop/src/chemdict_tool.cc b/modules/conop/src/chemdict_tool.cc
index 404f50b8a..eff12c73e 100644
--- a/modules/conop/src/chemdict_tool.cc
+++ b/modules/conop/src/chemdict_tool.cc
@@ -35,30 +35,36 @@ using namespace ost;
 
 void PrintUsage()
 {
-  std::cout << "usage: chemdict_tool action <compound-dict> <db> (pdb|charmm|amber|opls)" << std::endl;
+  std::cout << "usage: chemdict_tool ACTION <compound-dict> <db> [DIALECT] [OPTIONS]" << std::endl;
   std::cout << "supported actions are:" << std::endl;
   std::cout << "  create  - creates a new db " << std::endl;
   std::cout << "  update  - update existing db" << std::endl;
+  std::cout << "supported dialects are: pdb, charmm, amber, opls" << std::endl;
+  std::cout << "supported options are:" << std::endl;
+  std::cout << "  -i  - ignore compounds reserved by the PDB (01-99, DRG, INH, LIG)" << std::endl;
 }
 
 int main(int argc, char const *argv[])
 {
-  if (argc!=4 && argc!=5) {
+  if (argc!=4 && argc!=5 && argc!=6) {
     PrintUsage();
     return 0;
   }
   conop::Compound::Dialect dialect=conop::Compound::PDB;
-  if (argc==5) {
-    String format=argv[4];
+  bool ignore_reserved=false;
+  for (int i = 4; i < argc; i++) {
+    String param=argv[i];
 
-    if (format=="charmm") {
+    if (param=="charmm") {
       dialect=conop::Compound::CHARMM;
-    } else if (format=="pdb") {
+    } else if (param=="pdb") {
       dialect=conop::Compound::PDB;
-    } else if (format=="opls") {
+    } else if (param=="opls") {
       dialect=conop::Compound::OPLS;
-    } else if (format=="amber") {
+    } else if (param=="amber") {
       dialect=conop::Compound::AMBER;
+    } else if (param=="-i") {
+      ignore_reserved=true;
     } else {
       PrintUsage();
       return 0;
@@ -75,7 +81,7 @@ int main(int argc, char const *argv[])
     filtered_istream.push(boost::iostreams::gzip_decompressor());
   }
   filtered_istream.push(istream);  
-  io::ChemdictParser cdp(filtered_istream, dialect);
+  io::ChemdictParser cdp(filtered_istream, dialect, ignore_reserved);
   conop::CompoundLibPtr compound_lib;
   bool in_mem=false;
   if (!strncmp(argv[1], "create", 6)) {
diff --git a/modules/io/src/mol/chemdict_parser.cc b/modules/io/src/mol/chemdict_parser.cc
index a6e6b71db..d838d232a 100644
--- a/modules/io/src/mol/chemdict_parser.cc
+++ b/modules/io/src/mol/chemdict_parser.cc
@@ -17,8 +17,13 @@ bool ChemdictParser::OnBeginData(const StringRef& data_name)
     std::cout << last_ << std::flush;
   }
   atom_map_.clear();
-  insert_=true;
-  return true; 
+  if (ignore_reserved_ && IsNameReserved(data_name))  {
+    insert_=false;
+  }
+  else {
+    insert_=true;
+  }
+  return true;
 }
 
 bool ChemdictParser::OnBeginLoop(const StarLoopDesc& header)
@@ -252,4 +257,25 @@ void ChemdictParser::InitPDBXTypeMap()
   xtm_["?"]=mol::ChemType(mol::ChemType::UNKNOWN);
 }
 
+bool ChemdictParser::IsNameReserved(const StringRef& data_name)
+{
+  // This checks if the compound name is one of the reserved name that will
+  // never be used in the PDB. See
+  // https://www.rcsb.org/news/feature/630fee4cebdf34532a949c34
+  if ( // Check if in range 01-99
+       (data_name.length() == 2 &&
+         data_name[0] >= '0' && data_name[0] <= '9' &&
+         data_name[1] >= '0' && data_name[1] <= '9' &&
+         data_name != StringRef("00", 2)) ||
+       data_name == StringRef("DRG", 3) ||
+       data_name == StringRef("INH", 3) ||
+       data_name == StringRef("LIG", 3)
+     )
+  {
+    std::cout << "Ignoring reserved compound " << data_name << std::endl;
+    return true;
+  }
+  return false;
+}
+
 }}
diff --git a/modules/io/src/mol/chemdict_parser.hh b/modules/io/src/mol/chemdict_parser.hh
index f33ee72fb..00a395b98 100644
--- a/modules/io/src/mol/chemdict_parser.hh
+++ b/modules/io/src/mol/chemdict_parser.hh
@@ -40,9 +40,11 @@ typedef enum {
 
 class DLLEXPORT_OST_IO ChemdictParser : public StarParser {
 public:
-  ChemdictParser(std::istream& stream, conop::Compound::Dialect dialect): 
+  ChemdictParser(std::istream& stream, conop::Compound::Dialect dialect,
+    bool ignore_reserved=false):
     StarParser(stream), compound_(new conop::Compound("UNK")), 
-    last_(0), loop_type_(DONT_KNOW), dialect_(dialect)
+    last_(0), loop_type_(DONT_KNOW), dialect_(dialect),
+    ignore_reserved_(ignore_reserved)
   {
     this->InitTypeMap();
     this->InitPDBXTypeMap();
@@ -66,6 +68,7 @@ public:
 private:
   void InitTypeMap();  
   void InitPDBXTypeMap();
+  bool IsNameReserved(const StringRef& data_name);
   conop::CompoundLibPtr                   lib_;
   conop::CompoundPtr                      compound_;
   typedef enum {
@@ -92,6 +95,7 @@ private:
   LoopType                                loop_type_;  
   conop::AtomSpec                         atom_;
   conop::Compound::Dialect                dialect_;
+  bool                                    ignore_reserved_;
 };
 
 
-- 
GitLab