From 84b8cdfa2fbe79837a581e84c1bc1ca78ac57394 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Wed, 3 Aug 2011 10:59:49 +0200
Subject: [PATCH] MMCifParser now reading seqres

---
 modules/io/pymod/export_mmcif_io.cc           |  3 +
 modules/io/src/mol/mmcif_reader.cc            | 98 ++++++++++++-------
 modules/io/src/mol/mmcif_reader.hh            | 26 ++++-
 modules/io/tests/test_mmcif_reader.cc         | 36 ++++++-
 .../io/tests/testfiles/mmcif/atom_site.mmcif  |  1 +
 5 files changed, 124 insertions(+), 40 deletions(-)

diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc
index 4e0a47240..e50e128c4 100644
--- a/modules/io/pymod/export_mmcif_io.cc
+++ b/modules/io/pymod/export_mmcif_io.cc
@@ -30,9 +30,12 @@ void export_mmcif_io()
   class_<MMCifParser, boost::noncopyable>("MMCifParser", init<const String&, EntityHandle&, const IOProfile&>())
     .def("Parse", &MMCifParser::Parse)
     .def("SetRestrictChains", &MMCifParser::SetRestrictChains)
+    .def("SetReadCanonicalSeqRes", &MMCifParser::SetReadCanonicalSeqRes)
+    .def("GetSeqRes", &MMCifParser::GetSeqRes)
     .add_property("restrict_chains",
                   make_function(&MMCifParser::GetRestrictChains,
                                 return_value_policy<copy_const_reference>()),
                   &MMCifParser::SetRestrictChains)
+    .add_property("seqres", &MMCifParser::GetSeqRes)
   ;
 }
diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index dd43aa093..59e40c3ae 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -57,13 +57,13 @@ void MMCifParser::Init()
   atom_count_           = 0;
   residue_count_        = 0;
   auth_chain_id_        = false;
+  seqres_can_           = false;
   has_model_            = false;
   restrict_chains_      = "";
   subst_res_id_         = "";
   curr_chain_           = mol::ChainHandle();
   curr_residue_         = mol::ResidueHandle();
-  //chain_id_pairs_       = 
-  //entity_desc_map_
+  seqres_               = seq::CreateSequenceList();
 }
 
 void MMCifParser::ClearState()
@@ -75,6 +75,7 @@ void MMCifParser::ClearState()
   atom_count_           = 0;
   category_             = DONT_KNOW;
   warned_name_mismatch_ = false;
+  seqres_               = seq::CreateSequenceList();
 }
 
 void MMCifParser::SetRestrictChains(const String& restrict_chains)
@@ -164,6 +165,10 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header)
     this->TryStoreIdx(ENTITY_ID, "entity_id",    header);
     // optional
     indices_[EP_TYPE]  = header.GetIndex("type");
+    indices_[PDBX_SEQ_ONE_LETTER_CODE] =
+      header.GetIndex("pdbx_seq_one_letter_code");
+    indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] =
+      header.GetIndex("pdbx_seq_one_letter_code_can");
     return true;
   } /*else if (header.GetCategory()=="pdbx_poly_seq_scheme") {
   } else if (header.GetCategory()=="pdbx_struct_assembly") {
@@ -466,6 +471,7 @@ void MMCifParser::ParseEntity(const std::vector<StringRef>& columns)
   }
 
   if (store) {
+    desc.seqres = "";
     entity_desc_map_.insert(
                    MMCifEntityDescMap::value_type(columns[indices_[E_ID]].str(),
                                                   desc)
@@ -480,46 +486,62 @@ void MMCifParser::ParseEntityPoly(const std::vector<StringRef>& columns)
   MMCifEntityDescMap::iterator edm_it =
     entity_desc_map_.find(columns[indices_[ENTITY_ID]].str());
 
-  // store values in description map
-  if (edm_it != entity_desc_map_.end()) {
-    if (indices_[EP_TYPE] != -1) {
-      if(StringRef("polypeptide(D)", 14) == columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_D;
-      } else if(StringRef("polypeptide(L)", 14) == columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_L;
-      } else if(StringRef("polydeoxyribonucleotide", 23) ==
-                columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_DN;
-      } else if(StringRef("polyribonucleotide", 18) ==
-                columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_RN;
-      } else if(StringRef("polysaccharide(D)", 17) ==
-                columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_SAC_D;
-      } else if(StringRef("polysaccharide(L)", 17) ==
-                columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_SAC_L;
-      } else if(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid",
-                          49) == columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_POLY_DN_RN;
-      } else if(StringRef("other",
-                          5) == columns[indices_[EP_TYPE]]) {
-        edm_it->second.type = CHAINTYPE_UNKNOWN;
-      } else {
-        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                                                "Unrecognised polymner type '" +
-                                              columns[indices_[EP_TYPE]].str() +
-                                                 "' found.",
-                                                 this->GetCurrentLinenum()));
-      }
-    }
-  } else {
+  if (edm_it == entity_desc_map_.end()) {
     throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                      "'entity_poly' category defined before 'entity' for id '" +
                                             columns[indices_[ENTITY_ID]].str() +
                                              "' or missing.",
                                              this->GetCurrentLinenum()));
   }
+
+  // store type
+  if (indices_[EP_TYPE] != -1) {
+    if(StringRef("polypeptide(D)", 14) == columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_D;
+    } else if(StringRef("polypeptide(L)", 14) == columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_L;
+    } else if(StringRef("polydeoxyribonucleotide", 23) ==
+              columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_DN;
+    } else if(StringRef("polyribonucleotide", 18) ==
+              columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_RN;
+    } else if(StringRef("polysaccharide(D)", 17) ==
+              columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_SAC_D;
+    } else if(StringRef("polysaccharide(L)", 17) ==
+              columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_SAC_L;
+    } else if(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid",
+                        49) == columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_POLY_DN_RN;
+    } else if(StringRef("other",
+                        5) == columns[indices_[EP_TYPE]]) {
+      edm_it->second.type = CHAINTYPE_UNKNOWN;
+    } else {
+      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+                                               "Unrecognised polymner type '" +
+                                              columns[indices_[EP_TYPE]].str() +
+                                               "' found.",
+                                               this->GetCurrentLinenum()));
+    }
+  }
+
+  // store seqres
+  if (edm_it->second.seqres.length() > 0) {
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+     "entity_poly.pdbx_seq_one_letter_code[_can] clash: sequence for entry '" +
+                                            columns[indices_[ENTITY_ID]].str() +
+                                             "' is already set to '" +
+                                             edm_it->second.seqres + "'.",
+                                             this->GetCurrentLinenum()));
+  }
+  if ((seqres_can_) && (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1)) {
+    edm_it->second.seqres =
+      columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]].str();
+  } else  if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) {
+    edm_it->second.seqres = columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]].str();
+  }
 }
 
 void MMCifParser::OnDataRow(const StarLoopDesc& header, 
@@ -696,6 +718,10 @@ void MMCifParser::OnEndData()
     if (edm_it != entity_desc_map_.end()) {
       editor.SetChainType(css->first, edm_it->second.type);
       editor.SetChainDescription(css->first, edm_it->second.details);
+      if (edm_it->second.seqres.length() > 0) {
+        seqres_.AddSequence(seq::CreateSequence(css->first.GetName(),
+                                                edm_it->second.seqres));
+      }
     } else {
       LOG_WARNING("No entity description found for atom_site.label_entity_id '"
                   << css->second << "'");
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index b63e5c51d..b12d8fae6 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -24,6 +24,7 @@
 //#include <boost/iostreams/filtering_stream.hpp>
 //#include <boost/filesystem/fstream.hpp>
 
+#include <ost/seq/sequence_list.hh>
 #include <ost/mol/residue_handle.hh>
 #include <ost/mol/chain_type.hh>
 #include <ost/io/mol/io_profile.hh>
@@ -68,6 +69,15 @@ public:
   /// \param restrict_chains chain name
   void SetRestrictChains(const String& restrict_chains);
 
+  /// \brief Enable reading of canonical sequence residues
+  ///        (entity_poly.pdbx_seq_one_letter_code_can instead of
+  ///        entity_poly.pdbx_seq_one_letter_code). This flag is exclusive.
+  ///
+  void SetReadCanonicalSeqRes()
+  {
+    seqres_can_ = true;
+  }
+
   const String& GetRestrictChains() const
   {
     return restrict_chains_;
@@ -98,6 +108,13 @@ public:
   /// \brief Finalise parsing.
   virtual void OnEndData();
 
+  /// \brief Return sequences
+  ///
+  /// \return List of sequences
+  seq::SequenceList GetSeqRes() const {
+    return seqres_;
+  }
+
  protected:
   /// \brief Store an item index from loop header in preparation for reading a 
   ///        row. Throws an exception if the item does not exist.
@@ -198,8 +215,10 @@ private:
 
   /// \enum items of the entity_poly category
   typedef enum {
-    ENTITY_ID,         ///< pointer to entity.id
-    EP_TYPE            ///< type of polymer
+    ENTITY_ID,                    ///< pointer to entity.id
+    EP_TYPE,                      ///< type of polymer
+    PDBX_SEQ_ONE_LETTER_CODE,     ///< sequence, 1-letter code
+    PDBX_SEQ_ONE_LETTER_CODE_CAN  ///< canonical sequence, 1-letter code
   } EntityPolyItems;
 
   /// \enum categories of the mmcif format
@@ -214,6 +233,7 @@ private:
   typedef struct {
     ChainType type; ///< characterise entity
     String details; ///< description of this entity
+    String seqres;  ///< chain of monomers
   } MMCifEntityDesc;
 
   typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
@@ -226,6 +246,7 @@ private:
   mol::EntityHandle& ent_handle_;
   String restrict_chains_;
   bool auth_chain_id_;       ///< use chain IDs given by authors rather than pdb
+  bool seqres_can_;          ///< read canonical 1-letter residues?
   mol::ChainHandle curr_chain_;
   mol::ResidueHandle curr_residue_;
   int chain_count_;
@@ -238,6 +259,7 @@ private:
   std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
   ///< chain and label_entity_id
   MMCifEntityDescMap entity_desc_map_; ///< stores entity items
+  seq::SequenceList seqres_;
 };
 
 }}
diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc
index 7134e2f9e..f4f50b33f 100644
--- a/modules/io/tests/test_mmcif_reader.cc
+++ b/modules/io/tests/test_mmcif_reader.cc
@@ -322,13 +322,15 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests)
   mol::ChainHandle ch;
   IOProfile profile;
   StarLoopDesc tmmcif_h;
-  // positive
-  // negative: unknown polymer type
+
   mol::EntityHandle eh = mol::CreateEntity();
   MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile);
 
   mmcif_p.Parse();
 
+  seq::SequenceList seqres = mmcif_p.GetSeqRes();
+  seq::SequenceHandle curr = seqres.FindSequence("A");
+  BOOST_CHECK(curr.GetString() == "VTI");
 
   BOOST_MESSAGE("          testing missing corresponding entity entry...");
   {
@@ -400,6 +402,36 @@ columns.push_back(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid",
     columns.pop_back();
   }
   BOOST_MESSAGE("          done.");
+  BOOST_MESSAGE("          testing pdbx_seq_one_letter_code reading...");
+  {
+    TestMMCifParserProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
+    std::vector<StringRef> columns;
+
+    tmmcif_h.Clear();
+    tmmcif_h.SetCategory(StringRef("entity", 6));
+    tmmcif_h.Add(StringRef("id", 2));
+    tmmcif_h.Add(StringRef("type", 4));
+    tmmcif_p.OnBeginLoop(tmmcif_h);
+    columns.push_back(StringRef("1", 1));
+    columns.push_back(StringRef("polymer", 7));
+    tmmcif_p.ParseEntity(columns);
+    columns.pop_back();
+    columns.pop_back();
+
+    tmmcif_h.Clear();
+    tmmcif_h.SetCategory(StringRef("entity_poly", 11));
+    tmmcif_h.Add(StringRef("entity_id", 9));
+    tmmcif_h.Add(StringRef("type", 4));
+    tmmcif_h.Add(StringRef("pdbx_seq_one_letter_code", 24));
+    tmmcif_p.OnBeginLoop(tmmcif_h);
+
+    columns.push_back(StringRef("1", 1));
+    columns.push_back(StringRef("other", 5));
+    columns.push_back(StringRef("ABRND", 5));
+    BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntityPoly(columns));
+    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
+  }
+  BOOST_MESSAGE("          done.");
 
   BOOST_MESSAGE("  done.");
 }
diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
index 5987ec461..2dd1c2c52 100644
--- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif
+++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
@@ -19,6 +19,7 @@ _entity_poly.entity_id                      1
 _entity_poly.type                           'polypeptide(L)'
 _entity_poly.nstd_linkage                   no
 _entity_poly.nstd_monomer                   no
+_entity_poly.pdbx_seq_one_letter_code      'VTI'
 
 loop_
 _atom_site.group_PDB
-- 
GitLab