From e61319899eea975db2adf8ec07cc32e72b1ae6f7 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Fri, 29 Jul 2011 18:00:30 +0200
Subject: [PATCH] First step towards filling ChainType with life from the
 MMCifParser. It's fully functional but only stores the type of chain at the
 moment.

---
 modules/io/src/mol/mmcif_reader.cc            |  84 ++++++++-
 modules/io/src/mol/mmcif_reader.hh            |  37 +++-
 modules/io/tests/test_mmcif_reader.cc         | 162 ++++++++++++++++--
 .../io/tests/testfiles/mmcif/atom_site.mmcif  |  14 ++
 .../tests/testfiles/mmcif/model_truepos.mmcif |   3 +
 5 files changed, 269 insertions(+), 31 deletions(-)

diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index 587cc670c..3bac82e55 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -58,11 +58,12 @@ void MMCifParser::Init()
   residue_count_        = 0;
   auth_chain_id_        = false;
   has_model_            = false;
-  //memset(indices_, -1, MAX_ITEMS_IN_ROW * sizeof(int));
   restrict_chains_      = "";
   subst_res_id_         = "";
-  //curr_chain_           = mol::ChainHandle();
-  //curr_residue_         = mol::ResidueHandle();
+  curr_chain_           = mol::ChainHandle();
+  curr_residue_         = mol::ResidueHandle();
+  //chain_id_pairs_       = 
+  //entity_desc_map_
 }
 
 void MMCifParser::ClearState()
@@ -117,7 +118,7 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header)
     category_counts_[category_]++;
     // mandatory items
     this->TryStoreIdx(AUTH_ASYM_ID,    "auth_asym_id",    header);
-    this->TryStoreIdx(ID,              "id",              header);
+    this->TryStoreIdx(AS_ID,           "id",              header);
     this->TryStoreIdx(LABEL_ALT_ID,    "label_alt_id",    header);
     this->TryStoreIdx(LABEL_ASYM_ID,   "label_asym_id",   header);
     this->TryStoreIdx(LABEL_ATOM_ID,   "label_atom_id",   header);
@@ -147,6 +148,14 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header)
       }
     }
     return true;
+  } else if (header.GetCategory()=="entity") {
+    category_ = ENTITY;
+    category_counts_[category_]++;
+    // mandatory items
+    this->TryStoreIdx(E_ID, "id",    header);
+    // optional
+    indices_[E_TYPE] = header.GetIndex("type");
+    return true;
   }
   /*else if (header.GetCategory()=="entity_poly") {
   } else if (header.GetCategory()=="pdbx_poly_seq_scheme") {
@@ -187,7 +196,7 @@ bool MMCifParser::ParseAtomIdent(const std::vector<StringRef>& columns,
     return false;
   } 
 
-  std::pair<bool, int> a_num = this->TryGetInt(columns[indices_[ID]],
+  std::pair<bool, int> a_num = this->TryGetInt(columns[indices_[AS_ID]],
                                                "atom_site.id",
                                           profile_.fault_tolerant); // unit test
 
@@ -275,8 +284,8 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns)
   LOG_TRACE( "s_chain: [" << chain_name << "]" );
 
   // determine chain and residue update
-  bool update_chain=false;
-  bool update_residue=false;
+  bool update_chain = false;
+  bool update_residue = false;
   if(!curr_chain_) { // unit test
       update_chain=true;
       update_residue=true;
@@ -315,8 +324,19 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns)
       LOG_DEBUG("new chain " << chain_name);
       curr_chain_=editor.InsertChain(chain_name);
       ++chain_count_;
+      // store entity id
+      chain_id_pairs_.push_back(std::pair<mol::ChainHandle,String>(curr_chain_,
+                                     columns[indices_[LABEL_ENTITY_ID]].str()));
     }
     assert(curr_chain_.IsValid());
+  } else if (chain_id_pairs_.back().second != // unit test
+             columns[indices_[LABEL_ENTITY_ID]].str()) {
+    // check that label_entity_id stays the same
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+        "Change of 'atom_site.label_entity_id' item for chain " +
+        curr_chain_.GetName() + "! Expected: " + chain_id_pairs_.back().second +
+        ", found: " + columns[indices_[LABEL_ENTITY_ID]].str() + ".",
+                                             this->GetCurrentLinenum()));
   }
 
   if(update_residue) { // unit test
@@ -408,6 +428,37 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns)
 
 }
 
+void MMCifParser::ParseEntity(const std::vector<StringRef>& columns)
+{
+  // Without the optional 'type' item a row offers nothing worth storing.
+  if (indices_[E_TYPE] == -1) {
+    return;
+  }
+
+  // translate entity.type into the chain type kept for this entity
+  const StringRef& type_item = columns[indices_[E_TYPE]];
+  MMCifEntityDesc entity_desc;
+  if (StringRef("polymer", 7) == type_item) {
+    entity_desc.type = CHAINTYPE_POLY;
+  } else if (StringRef("non-polymer", 11) == type_item) {
+    entity_desc.type = CHAINTYPE_NON_POLY;
+  } else if (StringRef("water", 5) == type_item) {
+    entity_desc.type = CHAINTYPE_WATER;
+  } else {
+    // anything else is reported as a parsing error
+    const String message = "Unrecognised chain type '" + type_item.str() +
+                           "' found.";
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, message,
+                                             this->GetCurrentLinenum()));
+  }
+
+  // Register the description under its entity.id; std::map::insert leaves
+  // an entry that already exists for this id untouched.
+  const String entity_id = columns[indices_[E_ID]].str();
+  entity_desc_map_.insert(MMCifEntityDescMap::value_type(entity_id,
+                                                         entity_desc));
+}
+
 void MMCifParser::OnDataRow(const StarLoopDesc& header, 
                             const std::vector<StringRef>& columns)
 {
@@ -416,6 +467,10 @@ void MMCifParser::OnDataRow(const StarLoopDesc& header,
     LOG_TRACE("processing atom_site entry");
     this->ParseAndAddAtom(columns);
     break;
+  case ENTITY: // rows of an entity loop
+    LOG_TRACE("processing entity entry");
+    this->ParseEntity(columns);
+    break; // explicit: do not fall through into the default label
   default:
     return;
   }
@@ -563,6 +617,22 @@ void PDBReader::Import(mol::EntityHandle& ent,
 
 void MMCifParser::OnEndData()
 {
+  mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT);
+
+  // process chain types
+  std::vector<std::pair<mol::ChainHandle, String> >::const_iterator css;
+  MMCifEntityDescMap::const_iterator edm_it;
+  for (css = chain_id_pairs_.begin(); css != chain_id_pairs_.end(); ++css) {
+    edm_it = entity_desc_map_.find(css->second);
+
+    if (edm_it != entity_desc_map_.end()) {
+      editor.SetChainType(css->first, edm_it->second.type);
+    } else {
+      LOG_WARNING("No entity description found for atom_site.label_entity_id '"
+                  << css->second << "'");
+    }
+  }
+
   LOG_INFO("imported "
            << chain_count_ << " chains, "
            << residue_count_ << " residues, "
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index 41e586a14..529702a74 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -19,10 +19,13 @@
 #ifndef OST_MMCIF_PARSER_HH
 #define OST_MMCIF_PARSER_HH
 
+#include <map>
+
 //#include <boost/iostreams/filtering_stream.hpp>
 //#include <boost/filesystem/fstream.hpp>
 
 #include <ost/mol/residue_handle.hh>
+#include <ost/mol/chain_type.hh>
 #include <ost/io/mol/io_profile.hh>
 #include <ost/io/io_exception.hh>
 #include <ost/io/mol/star_parser.hh>
@@ -127,11 +130,12 @@ public:
 
   /// \brief fetch values identifying atoms
   ///
-  /// \param[in] columns data row
+  /// \param[in]  columns data row
   /// \param[out] chain_name takes atom_site.label_asym_id or, if
   ///             auth_chain_id_ is set, atom_site.auth_asym_id as a chain name
+  /// \param[out] res_name fetches atom_site.label_comp_id
 
-  /// \param atom_name corresponds to label_atom_id
+  /// \param[out] atom_name corresponds to label_atom_id
   bool ParseAtomIdent(const std::vector<StringRef>& columns,
                       String& chain_name,
                       StringRef& res_name,
@@ -145,6 +149,11 @@ public:
   /// \param columns data row
   void ParseAndAddAtom(const std::vector<StringRef>& columns);
 
+  /// \brief Fetch MMCif entity information
+  ///
+  /// \param columns data row
+  void ParseEntity(const std::vector<StringRef>& columns);
+
 private:
   /// \enum magic numbers of this class
   typedef enum {
@@ -155,12 +164,12 @@ private:
   /// \enum items of the atom_site category
   typedef enum {
     AUTH_ASYM_ID,      ///< chain name by author as in PDB
-    ID,                ///< atom serial id
+    AS_ID,             ///< atom serial id
     LABEL_ALT_ID,      ///< AltLoc
     LABEL_ASYM_ID,     ///< chain name by PDB
     LABEL_ATOM_ID,
     LABEL_COMP_ID,
-    LABEL_ENTITY_ID,
+    LABEL_ENTITY_ID,   ///< link to category entity
     LABEL_SEQ_ID,      ///< residue no.
     AUTH_SEQ_ID,       ///< residue no. by author
     TYPE_SYMBOL,       ///< chemical element
@@ -174,12 +183,26 @@ private:
     PDBX_PDB_MODEL_NUM ///< model no. (especially NMR structures)
   } AtomSiteItems;
 
+  /// \enum items of the entity category; values index into indices_
+  typedef enum {
+    E_ID,              ///< entity.id (mandatory), unique identifier
+    E_TYPE             ///< entity.type (optional): polymer, non-polymer, water
+  } EntityItems;
+
   /// \enum categories of the mmcif format
   typedef enum {
     ATOM_SITE,
+    ENTITY,
     DONT_KNOW
   } MMCifCategory;
 
+  /// \struct per-entity information gathered from the entity category
+  typedef struct {
+    ChainType type; ///< chain type translated from entity.type
+  } MMCifEntityDesc;
+
+  typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
+
   // members
   MMCifCategory category_;
   int category_counts_[DONT_KNOW]; ///< overall no. of atom_site loops
@@ -197,9 +220,9 @@ private:
   String subst_res_id_; ///< work around for missing label_seq_id's
   bool has_model_;      ///< keep track of models through different atom_sites
   int curr_model_;      ///< if we have pdbx_PDB_model_num, store no.
-  //from pdbdreader
-  //entity als member, fill in ondatarow
-  //import function
+  std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
+  ///< chain and label_entity_id
+  MMCifEntityDescMap entity_desc_map_; ///< stores entity items
 };
 
 }}
diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc
index 524ad6bc6..ad669bd7a 100644
--- a/modules/io/tests/test_mmcif_reader.cc
+++ b/modules/io/tests/test_mmcif_reader.cc
@@ -39,9 +39,16 @@ public:
     MMCifParser(stream, ent_handle, profile)
   { }
 
+  TestMMCifParserProtected(const String& filename,
+                           mol::EntityHandle& ent_handle):
+    MMCifParser(filename, ent_handle, IOProfile())
+  { }
+
+  using MMCifParser::OnBeginLoop;
   using MMCifParser::IsValidPDBIdent;
   using MMCifParser::ParseAtomIdent;
   using MMCifParser::ParseAndAddAtom;
+  using MMCifParser::ParseEntity;
   using MMCifParser::TryStoreIdx;
 };
 
@@ -94,43 +101,54 @@ BOOST_AUTO_TEST_CASE(mmcif_onbeginloop)
   std::ifstream s("testfiles/mmcif/atom_site.mmcif");
   MMCifParser mmcif_p(s, eh, IOProfile());
   StarLoopDesc mmcif_h;
+  BOOST_MESSAGE("          testing atom_site items...");
   mmcif_h.SetCategory(StringRef("atom_site", 9));
-  BOOST_MESSAGE("    auth_asym_id");
+  BOOST_MESSAGE("             auth_asym_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("auth_asym_id", 12));
-  BOOST_MESSAGE("    id");
+  BOOST_MESSAGE("             id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("id", 2));
-  BOOST_MESSAGE("    label_alt_id");
+  BOOST_MESSAGE("             label_alt_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_alt_id", 12));
-  BOOST_MESSAGE("    label_asym_id");
+  BOOST_MESSAGE("             label_asym_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_asym_id", 13));
-  BOOST_MESSAGE("    label_atom_id");
+  BOOST_MESSAGE("             label_atom_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_atom_id", 13));
-  BOOST_MESSAGE("    label_comp_id");
+  BOOST_MESSAGE("             label_comp_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_comp_id", 13));
-  BOOST_MESSAGE("    label_entity_id");
+  BOOST_MESSAGE("             label_entity_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_entity_id", 15));
-  BOOST_MESSAGE("    label_seq_id");
+  BOOST_MESSAGE("             label_seq_id");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("label_seq_id", 12));
-  BOOST_MESSAGE("    type_symbol");
+  BOOST_MESSAGE("             type_symbol");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("type_symbol", 11));
-  BOOST_MESSAGE("    Cartn_x");
+  BOOST_MESSAGE("             Cartn_x");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("Cartn_x", 7));
-  BOOST_MESSAGE("    Cartn_y");
+  BOOST_MESSAGE("             Cartn_y");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("Cartn_y", 7));
-  BOOST_MESSAGE("    Cartn_z");
+  BOOST_MESSAGE("             Cartn_z");
   BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
   mmcif_h.Add(StringRef("Cartn_z", 7));
+  BOOST_CHECK_NO_THROW(mmcif_p.OnBeginLoop(mmcif_h));
+  BOOST_MESSAGE("          done.");
+  mmcif_h.Clear();
+  BOOST_MESSAGE("          testing entity items...");
+  mmcif_h.SetCategory(StringRef("entity", 6));
+  BOOST_MESSAGE("             id");
+  BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException);
+  mmcif_h.Add(StringRef("id", 2));
+  BOOST_CHECK_NO_THROW(mmcif_p.OnBeginLoop(mmcif_h));
+  BOOST_MESSAGE("          done.");
   BOOST_MESSAGE("  done.");
 }
 
@@ -178,23 +196,133 @@ BOOST_AUTO_TEST_CASE(mmcif_parse_models)
   BOOST_MESSAGE("  done.");
 }
 
-BOOST_AUTO_TEST_CASE(mmcif_parseatomident)
+BOOST_AUTO_TEST_CASE(mmcif_changing_label_entity_id)
+{
+  BOOST_MESSAGE("  Running mmcif_changing_label_entity_id tests...");
+  IOProfile profile;
+  // Parsing must reject an atom_site.label_entity_id that changes in a chain.
+  // positive: the regular test file is expected to parse without exception
+  BOOST_MESSAGE("          true positive test...");
+  {
+    mol::EntityHandle eh = mol::CreateEntity();
+    MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile);
+    BOOST_CHECK_NO_THROW(mmcif_p.Parse());
+  }
+  BOOST_MESSAGE("          done.");
+  // negative: Parse() is expected to throw IOException for this file
+  // NOTE(review): fixture is not listed in this patch's diffstat - confirm
+  BOOST_MESSAGE("          true negative test...");
+  {
+    mol::EntityHandle eh = mol::CreateEntity();
+    MMCifParser mmcif_p("testfiles/mmcif/changing_label_entity_id.mmcif", eh,
+                        profile);
+    BOOST_CHECK_THROW(mmcif_p.Parse(), IOException);
+  }
+  BOOST_MESSAGE("          done.");
+
+  BOOST_MESSAGE("  done.");
+}
+
+BOOST_AUTO_TEST_CASE(mmcif_unknown_entity_type)
 {
+  BOOST_MESSAGE("  Running mmcif_unknown_entity_type tests...");
+  // Exercises ParseEntity() directly with hand-built entity rows.
   mol::EntityHandle eh = mol::CreateEntity();
+  std::vector<StringRef> columns;
+  TestMMCifParserProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
+  StarLoopDesc tmmcif_h;
 
+  // build dummy header: an entity loop with the items id and type
+  tmmcif_h.SetCategory(StringRef("entity", 6));
+  tmmcif_h.Add(StringRef("id", 2));
+  tmmcif_h.Add(StringRef("type", 4));
+  tmmcif_p.OnBeginLoop(tmmcif_h);
+
+  // positive: polymer, non-polymer and water are accepted
+  BOOST_MESSAGE("          known type...");
+  // build datarow: id first, type second; only the type item is swapped
+  columns.push_back(StringRef("1", 1));
+  columns.push_back(StringRef("polymer", 7));
+  BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
+  columns.pop_back();
+  columns.push_back(StringRef("non-polymer", 11));
+  BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
+  columns.pop_back();
+  columns.push_back(StringRef("water", 5));
+  BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns));
+  BOOST_MESSAGE("          done.");
+
+  // negative: any other type value has to raise an IOException
+  BOOST_MESSAGE("          unknown type...");
+  columns.pop_back();
+  columns.push_back(StringRef("foo", 3));
+  BOOST_CHECK_THROW(tmmcif_p.ParseEntity(columns), IOException);
+  BOOST_MESSAGE("          done.");
+
+  BOOST_MESSAGE("  done.");
+}
+
+BOOST_AUTO_TEST_CASE(mmcif_chaintype_setting)
+{
+  BOOST_MESSAGE("  Running mmcif_chaintype_setting tests...");
+  mol::ChainHandle ch;
+  IOProfile profile;
+  // Checks the chain types assigned to chains after a complete Parse().
+  // positive: atom_site.mmcif carries an entity category (polymer, water)
+  BOOST_MESSAGE("          check correct settings...");
+  {
+    mol::EntityHandle eh = mol::CreateEntity();
+    MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile);
+    mmcif_p.Parse();
+    ch = eh.FindChain("A");
+    BOOST_CHECK(ch.IsValid());
+    BOOST_CHECK(ch.GetChainType() == CHAINTYPE_POLY);
+    ch = eh.FindChain("C");
+    BOOST_CHECK(ch.IsValid());
+    BOOST_CHECK(ch.GetChainType() == CHAINTYPE_POLY);
+    ch = eh.FindChain("O");
+    BOOST_CHECK(ch.IsValid());
+    BOOST_CHECK(ch.GetChainType() == CHAINTYPE_WATER);
+  }
+  BOOST_MESSAGE("          done.");
+  // negative: without an entity category chains should stay CHAINTYPE_UNKNOWN
+  BOOST_MESSAGE("          check missing entity description...");
+  {
+    mol::EntityHandle eh = mol::CreateEntity();
+    MMCifParser mmcif_p("testfiles/mmcif/model_truepos.mmcif",
+                        eh,
+                        profile);
+    mmcif_p.Parse();
+    ch = eh.FindChain("A");
+    BOOST_CHECK(ch.IsValid());
+    BOOST_CHECK(ch.GetChainType() == CHAINTYPE_UNKNOWN);
+    ch = eh.FindChain("B");
+    BOOST_CHECK(ch.IsValid());
+    BOOST_CHECK(ch.GetChainType() == CHAINTYPE_UNKNOWN);
+  }
+  BOOST_MESSAGE("          done.");
+
+  BOOST_MESSAGE("  done.");
+}
+
+BOOST_AUTO_TEST_CASE(mmcif_parseatomident)
+{
   BOOST_MESSAGE("  Running mmcif_parseatomident tests...");
+
+  mol::EntityHandle eh = mol::CreateEntity();
+
   std::ifstream s("testfiles/mmcif/atom_site.mmcif");
   IOProfile profile;
   TestMMCifParserProtected tmmcif_p(s, eh, profile);
-  //std::vector<StringRef> cols;
-  //String chain_name;
-  //StringRef res_name;
+  std::vector<StringRef> columns;
+  String chain_name;
+  StringRef res_name;
   //mol::ResNum resnum(0);
   //StringRef atom_name;
   //char alt_loc;
 
   BOOST_MESSAGE("          testing valid line");
-  //tmmcif_p.ParseAtomIdent();
+  //tmmcif_p.ParseAtomIdent(columns, chain_name, res_name);
   BOOST_MESSAGE("          done.");
   // negative
   //cols.push_back(StringRef("ATOM", 4));
diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
index f2568fe3b..c1a43746b 100644
--- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif
+++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
@@ -3,6 +3,20 @@ data_1BAR
 # this file is also used in the mmcif_parse_models tests for a true negative
 # test, hence it is not allowed to carry atom_site.pdbx_PDB_model_num entries
 
+# this file is also used in the mmcif_chaintype_setting test for a true
+# positive test, hence the entity category is not to be changed
+
+loop_
+_entity.id
+_entity.type
+_entity.details
+1 polymer 
+;              The enzymatically competent form of HIV
+               protease is a dimer. This entity
+               corresponds to one monomer of an active dimer.
+;
+5 water        .
+
 loop_
 _atom_site.group_PDB
 _atom_site.type_symbol
diff --git a/modules/io/tests/testfiles/mmcif/model_truepos.mmcif b/modules/io/tests/testfiles/mmcif/model_truepos.mmcif
index 613854466..9f1e39f16 100644
--- a/modules/io/tests/testfiles/mmcif/model_truepos.mmcif
+++ b/modules/io/tests/testfiles/mmcif/model_truepos.mmcif
@@ -1,6 +1,9 @@
 data_1TPM
 # derived from 2JSP
 
+# this file is also used in the mmcif_chaintype_setting test for a true
+# negative test, hence no entity category may be added
+
 loop_
 _atom_site.group_PDB 
 _atom_site.id 
-- 
GitLab