From cd8ba0804d3a987f780db9f989bac3ab5ed4930c Mon Sep 17 00:00:00 2001
From: B13nch3n <stefan.bienert@me.com>
Date: Tue, 21 Jul 2020 15:15:46 +0200
Subject: [PATCH] Allow free positioning of entity category in mmCIF files

---
 modules/io/src/mol/mmcif_reader.cc    | 69 ++++++++++++---------------
 modules/io/src/mol/mmcif_reader.hh    |  6 +++
 modules/io/tests/test_mmcif_reader.cc | 36 +++-----------
 3 files changed, 43 insertions(+), 68 deletions(-)

diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index 1eb122070..e2f67b119 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -660,47 +660,50 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns)
                 columns[indices_[GROUP_PDB]][0]=='H');
 }
 
+MMCifReader::MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator(
+  const String& entity_id)
+{
+  MMCifEntityDescMap::iterator edm_it = entity_desc_map_.find(entity_id);
+  // unknown entity ID: store a placeholder entry and hand that one back
+  if (edm_it == entity_desc_map_.end()) {
+    MMCifEntityDesc desc = MMCifEntityDesc();  // portable, no C++20 syntax
+    desc.type = mol::CHAINTYPE_N_CHAINTYPES;   // marks the type as "not set"
+    desc.details = "";
+    desc.seqres = "";
+    edm_it = entity_desc_map_.insert(
+      MMCifEntityDescMap::value_type(entity_id, desc)).first;
+  }
+  return edm_it;
+}
+
 void MMCifReader::ParseEntity(const std::vector<StringRef>& columns)
 {
-  bool store = false; // is it worth storing this record?
-  MMCifEntityDesc desc;
+  MMCifEntityDescMap::iterator edm_it =
+    GetEntityDescMapIterator(columns[indices_[E_ID]].str());
 
   // type
   if (indices_[E_TYPE] != -1) {
-    desc.type = mol::ChainTypeFromString(columns[indices_[E_TYPE]]);
-    store = true;
+    // only use the entity type if no other is set, entity_poly type is
+    // more precise, so if that was set before just leave it in
+    if (edm_it->second.type == mol::CHAINTYPE_N_CHAINTYPES) {
+      edm_it->second.type = mol::ChainTypeFromString(columns[indices_[E_TYPE]]);
+    }
+  } else {
+    // don't deal with entities without type
+    entity_desc_map_.erase(edm_it);
+    return;
   }
 
   // description
   if (indices_[PDBX_DESCRIPTION] != -1) {
-    desc.details = columns[indices_[PDBX_DESCRIPTION]].str();
-  } else {
-    desc.details = "";
-  }
-
-  if (store) {
-    desc.seqres = "";
-    entity_desc_map_.insert(
-                   MMCifEntityDescMap::value_type(columns[indices_[E_ID]].str(),
-                                                  desc)
-                            );
+    edm_it->second.details = columns[indices_[PDBX_DESCRIPTION]].str();
   }
 }
 
 void MMCifReader::ParseEntityPoly(const std::vector<StringRef>& columns)
 {
-  // we assume that the entity cat. ALWAYS comes before the entity_poly cat.
-  // search entity
   MMCifEntityDescMap::iterator edm_it =
-    entity_desc_map_.find(columns[indices_[ENTITY_ID]].str());
-
-  if (edm_it == entity_desc_map_.end()) {
-    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                     "'entity_poly' category defined before 'entity' for id '" +
-                                            columns[indices_[ENTITY_ID]].str() +
-                                             "' or missing.",
-                                             this->GetCurrentLinenum()));
-  }
+    GetEntityDescMapIterator(columns[indices_[ENTITY_ID]].str());
 
   // store type
   if (indices_[EP_TYPE] != -1) {
@@ -1713,19 +1716,9 @@ void MMCifReader::ParseStructRefSeqDif(const std::vector<StringRef>& columns)
 
 void MMCifReader::ParsePdbxEntityBranch(const std::vector<StringRef>& columns)
 {
-  // we assume that the entity cat. ALWAYS comes before the pdbx_entity_branch
-  // cat.
-  // search entity
+  // get the entity description entry (created if not stored yet)
   MMCifEntityDescMap::iterator edm_it =
-    entity_desc_map_.find(columns[indices_[BR_ENTITY_ID]].str());
-
-  if (edm_it == entity_desc_map_.end()) {
-    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-              "'pdbx_entity_branch' category defined before 'entity' for id '" +
-                                         columns[indices_[BR_ENTITY_ID]].str() +
-                                             "' or missing.",
-                                             this->GetCurrentLinenum()));
-  }
+    GetEntityDescMapIterator(columns[indices_[BR_ENTITY_ID]].str());
 
   // store type
   if (indices_[BR_ENTITY_TYPE] != -1) {
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index a9339c6ac..03ceb32c9 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -629,6 +629,10 @@ private:
   } MMCifEntityDesc;
   typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
 
+  /// \brief Get an iterator for MMCifEntityDescMap by finding an element or
+  ///        inserting a new one into the map.
+  MMCifEntityDescMap::iterator GetEntityDescMapIterator(const String&);
+
   /// \struct assembly information
   typedef struct {
     String biounit_id;                              ///< identifier for the bu
@@ -739,3 +743,5 @@ DLLEXPORT_OST_IO String OSTBondOrderToMMCifValueOrder(
 }}
 
 #endif
+
+//  LocalWords:  MMCifEntityDescMap
diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc
index 65673c858..b6fa90f1f 100644
--- a/modules/io/tests/test_mmcif_reader.cc
+++ b/modules/io/tests/test_mmcif_reader.cc
@@ -311,9 +311,13 @@ BOOST_AUTO_TEST_CASE(mmcif_unknown_entity_type)
   columns.push_back(StringRef("polymer", 7));
   BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
   columns.pop_back();
+  columns.pop_back();
+  columns.push_back(StringRef("2", 1));
   columns.push_back(StringRef("non-polymer", 11));
   BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
   columns.pop_back();
+  columns.pop_back();
+  columns.push_back(StringRef("3", 1));  
   columns.push_back(StringRef("water", 5));
   BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
   BOOST_TEST_MESSAGE("          done.");
@@ -321,6 +325,8 @@ BOOST_AUTO_TEST_CASE(mmcif_unknown_entity_type)
   // negative
   BOOST_TEST_MESSAGE("          unknown type...");
   columns.pop_back();
+  columns.pop_back();
+  columns.push_back(StringRef("4", 1));
   columns.push_back(StringRef("foo", 3));
   BOOST_CHECK_THROW(tmmcif_p.ParseEntity(columns), Error);
   BOOST_TEST_MESSAGE("          done.");
@@ -404,20 +410,6 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests)
   seq::SequenceHandle curr = seqres.FindSequence("A");
   BOOST_CHECK(curr.GetString() == "VTI");
 
-  BOOST_TEST_MESSAGE("          testing missing corresponding entity entry...");
-  {
-    mol::EntityHandle eh = mol::CreateEntity();
-    std::vector<StringRef> columns;
-    TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-
-    tmmcif_h.SetCategory(StringRef("entity_poly", 11));
-    tmmcif_h.Add(StringRef("entity_id", 9));
-    tmmcif_p.OnBeginLoop(tmmcif_h);
-
-    columns.push_back(StringRef("1", 1));
-    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
-  }
-  BOOST_TEST_MESSAGE("          done.");
   BOOST_TEST_MESSAGE("          testing type recognition...");
   {
     TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
@@ -1435,22 +1427,6 @@ BOOST_AUTO_TEST_CASE(mmcif_pdbx_entity_branch_tests)
 
   mmcif_p.Parse();
 
-  BOOST_TEST_MESSAGE("          testing missing corresponding entity entry...");
-  {
-    mol::EntityHandle eh = mol::CreateEntity();
-    std::vector<StringRef> columns;
-    TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-
-    tmmcif_h.SetCategory(StringRef("pdbx_entity_branch", 18));
-    tmmcif_h.Add(StringRef("entity_id", 9));
-    tmmcif_h.Add(StringRef("type", 4));
-    tmmcif_p.OnBeginLoop(tmmcif_h);
-    columns.push_back(StringRef("1", 1));
-    columns.push_back(StringRef("oligosaccharide", 15));
-
-    BOOST_CHECK_THROW(tmmcif_p.ParsePdbxEntityBranch(columns), IOException);
-  }
-  BOOST_TEST_MESSAGE("          done.");
   BOOST_TEST_MESSAGE("          testing chain type recognition...");
   {
     TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-- 
GitLab