From 1357717fbe71839c6a6e1f18fc2d02f3e9d947fe Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Tue, 23 Jan 2024 15:20:21 +0100
Subject: [PATCH] mmcif reader: provide info on mmCIF entities

Available as MMCifEntityDesc objects in MMCifInfo
---
 modules/io/doc/mmcif.rst            | 64 +++++++++++++++++++++++
 modules/io/pymod/export_mmcif_io.cc | 12 +++++
 modules/io/src/mol/mmcif_info.cc    | 17 +++++++
 modules/io/src/mol/mmcif_info.hh    | 20 ++++++++
 modules/io/src/mol/mmcif_reader.cc  | 78 ++++++++++++++++++++++++++++-
 modules/io/src/mol/mmcif_reader.hh  | 23 ++++++---
 6 files changed, 204 insertions(+), 10 deletions(-)

diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst
index 340d16c98..c744d54ee 100644
--- a/modules/io/doc/mmcif.rst
+++ b/modules/io/doc/mmcif.rst
@@ -397,6 +397,14 @@ of the annotation available.
 
     Establish all bonds stored for branched entities.
 
+  .. method:: GetEntityDesc(entity_id)
+
+    Get info of type :class:`MMCifEntityDesc` for specified *entity_id*.
+    The entity id for a chain can be fetched with :func:`GetMMCifEntityIdTr`.
+
+    :param entity_id: ID of entity
+    :type entity_id: :class:`str`
+
 .. class:: MMCifInfoCitation
 
   This stores citation information from an input file.
@@ -1349,6 +1357,62 @@ of the annotation available.
     See :attr:`bond_order`
 
 
+.. class:: MMCifEntityDesc
+
+  Data collected for certain mmCIF entity
+
+  .. attribute:: type
+
+    The ost chain type which can be assigned to :class:`ost.mol.ChainHandle`
+
+    :type: :class:`ost.mol.ChainType`
+
+  .. attribute:: entity_type
+
+    value of _entity.type token
+
+    :class:`str`
+
+  .. attribute:: entity_poly_type
+
+    value of _entity_poly.type token - empty string if entity is not of type
+    "polymer"
+
+    :class:`str`
+
+  .. attribute:: branched_type
+
+    value of _pdbx_entity_branch.type token - empty string if entity is not of
+    type "branched"
+
+    :type: :class:`str`
+
+  .. attribute:: details
+
+    value of _entity.pdbx_description token
+
+    :class:`str`
+
+  .. attribute:: seqres
+
+    SEQRES with gentle preprocessing - empty string if entity is not of type
+    "polymer". By default, the :class:`ost.io.MMCifReader` reads the value of the
+    _entity_poly.pdbx_seq_one_letter_code token. Copies all letters but
+    searches a :class:`ost.conop.CompoundLib` for compound names in brackets.
+    *seqres* gets an 'X' if no compound is found or the respective compound has
+    one letter code '?'. Uses the one letter code of the found compound
+    otherwise. So it's basically a canonical SEQRES with exactly one character
+    per residue.
+
+    :type: :class:`str`
+
+  .. attribute:: mon_ids
+
+    Monomer ids of all residues in a polymer - empty if entity is not of
+    type "polymer". Read from _entity_poly_seq category.
+
+    :type: :class:`ost.base.StringList`
+
 Writing mmCIF files
 --------------------------------------------------------------------------------
 
diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc
index 411750b91..200e42556 100644
--- a/modules/io/pymod/export_mmcif_io.cc
+++ b/modules/io/pymod/export_mmcif_io.cc
@@ -485,6 +485,16 @@ void export_mmcif_io()
     .def(self_ns::str(self))
   ;
 
+  class_<MMCifEntityDesc>("MMCifEntityDesc", init<>())
+   .add_property("type", &MMCifEntityDesc::type)
+   .add_property("entity_type", &MMCifEntityDesc::entity_type)
+   .add_property("entity_poly_type", &MMCifEntityDesc::entity_poly_type)
+   .add_property("branched_type", &MMCifEntityDesc::branched_type)
+   .add_property("details", &MMCifEntityDesc::details)
+   .add_property("seqres", &MMCifEntityDesc::seqres)
+   .add_property("mon_ids", &MMCifEntityDesc::mon_ids)
+  ;
+
   class_<MMCifInfo>("MMCifInfo", init<>())
     .def("AddCitation", &MMCifInfo::AddCitation)
     .def("GetCitations", make_function(&MMCifInfo::GetCitations,
@@ -525,6 +535,8 @@ void export_mmcif_io()
     .def("ConnectBranchLinks", &MMCifInfo::ConnectBranchLinks)
     .def("GetEntityBranchChainNames", &WrapGetNames)
     .def("GetEntityBranchChains", &MMCifInfo::GetEntityBranchChains)
+    .def("SetEntityDesc", &MMCifInfo::SetEntityDesc)
+    .def("GetEntityDesc", &MMCifInfo::GetEntityDesc, return_value_policy<copy_const_reference>())
     .add_property("citations", make_function(&MMCifInfo::GetCitations,
                                    return_value_policy<copy_const_reference>()))
     .add_property("biounits", make_function(&MMCifInfo::GetBioUnits,
diff --git a/modules/io/src/mol/mmcif_info.cc b/modules/io/src/mol/mmcif_info.cc
index ef44d4813..96f256ca9 100644
--- a/modules/io/src/mol/mmcif_info.cc
+++ b/modules/io/src/mol/mmcif_info.cc
@@ -293,6 +293,23 @@ void MMCifInfo::ConnectBranchLinks()
   }
 }
 
+const MMCifEntityDesc& MMCifInfo::GetEntityDesc(const String& entity_id) const {
+  MMCifEntityDescMap::const_iterator it = entity_desc_.find(entity_id);
+  if(it == entity_desc_.end()) {
+    throw IOException("No EntityDesc for entity id \""+entity_id+"\"");
+  }
+  return it->second;
+}
+
+void MMCifInfo::SetEntityDesc(const String& entity_id,
+                              const MMCifEntityDesc& entity_desc)
+{
+  if(entity_desc_.find(entity_id) != entity_desc_.end()) {
+    throw IOException("EntityDesc for entity_id \""+entity_id+"\" already set");
+  }
+  entity_desc_[entity_id] = entity_desc;
+}
+
 std::ostream& operator<<(std::ostream& os, const MMCifInfoEntityBranchLink& eb)
 {
   os << "<MMCifInfoEntityBranchLink atom1:" << eb.GetAtom1() << " atom2:"
diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh
index 389d7d1cd..5114b4d9e 100644
--- a/modules/io/src/mol/mmcif_info.hh
+++ b/modules/io/src/mol/mmcif_info.hh
@@ -959,6 +959,20 @@ private:
 };
 typedef std::map<String, std::vector<MMCifInfoEntityBranchLink> > MMCifInfoEntityBranchLinkMap;
 
+
+/// \struct keeping track of entity information
+typedef struct {
+  mol::ChainType type;         ///< characterise entity
+  String entity_type;          ///< value of _entity.type
+  String entity_poly_type;     ///< value of _entity_poly.type
+  String branched_type;        ///< value of _pdbx_entity_branch.type
+  String details;              ///< description of this entity
+  String seqres;               ///< chain of monomers
+  std::vector<String> mon_ids; ///< list of full monomer names
+} MMCifEntityDesc;
+typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
+
+
 /// \brief container class for additional information from MMCif files
 /// 
 /// \section mmcif annotation information
@@ -1201,6 +1215,11 @@ public:
   ///
   void ConnectBranchLinks();
 
+  const MMCifEntityDesc& GetEntityDesc(const String& entity_id) const;
+
+  void SetEntityDesc(const String& entity_id,
+                     const MMCifEntityDesc& entity_desc);
+
 //protected:
 
 private:
@@ -1216,6 +1235,7 @@ private:
   std::vector<MMCifInfoBioUnit>  biounits_;   ///< list of biounits
   std::vector<MMCifInfoTransOpPtr> transops_;
   MMCifInfoStructRefs            struct_refs_;
+  MMCifEntityDescMap entity_desc_;
   std::map<String, String> cif_2_pdb_chain_id_;
   std::map<String, String> pdb_2_cif_chain_id_;
   std::map<String, String> cif_2_entity_id_;
diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index b6c2167cb..9ad17290f 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -92,6 +92,7 @@ void MMCifReader::ClearState()
   revision_types_.clear();
   database_PDB_rev_added_ = false;
   entity_branch_link_map_.clear();
+  entity_poly_seq_map_.clear();
 }
 
 void MMCifReader::SetRestrictChains(const String& restrict_chains)
@@ -389,6 +390,13 @@ bool MMCifReader::OnBeginLoop(const StarLoopDesc& header)
     indices_[BL_ATOM_STEREO_CONFIG_2] = header.GetIndex("atom_stereo_config_2");
     indices_[BL_VALUE_ORDER] = header.GetIndex("value_order");
     cat_available = true;
+ } else if(header.GetCategory() == "entity_poly_seq") {
+  category_ = ENTITY_POLY_SEQ;
+  // mandatory
+  this->TryStoreIdx(EPS_ENTITY_ID, "entity_id", header);
+  this->TryStoreIdx(EPS_MON_ID, "mon_id", header);
+  this->TryStoreIdx(EPS_NUM, "num", header);
+  cat_available = true;
  }
   category_counts_[category_]++;
   return cat_available;
@@ -710,15 +718,19 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns)
                 columns[indices_[GROUP_PDB]][0]=='H');
 }
 
-MMCifReader::MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator(
+MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator(
   const String& entity_id)
 {
   MMCifEntityDescMap::iterator edm_it = entity_desc_map_.find(entity_id);
   // if the entity ID is not already stored, insert it with empty values
   if (edm_it == entity_desc_map_.end()) {
     MMCifEntityDesc desc = {.type=mol::CHAINTYPE_N_CHAINTYPES,
+                            .entity_type = "",
+                            .entity_poly_type = "",
+                            .branched_type = "",
                             .details="",
-                            .seqres=""};
+                            .seqres="",
+                            .mon_ids=std::vector<String>()};
     edm_it = entity_desc_map_.insert(entity_desc_map_.begin(),
                                      MMCifEntityDescMap::value_type(entity_id,
                                                                     desc));
@@ -738,6 +750,8 @@ void MMCifReader::ParseEntity(const std::vector<StringRef>& columns)
     if (edm_it->second.type == mol::CHAINTYPE_N_CHAINTYPES) {
       edm_it->second.type = mol::ChainTypeFromString(columns[indices_[E_TYPE]]);
     }
+    // but set entity_type anyways
+    edm_it->second.entity_type = columns[indices_[E_TYPE]].str();
   } else {
     // don't deal with entities without type
     entity_desc_map_.erase(edm_it);
@@ -758,6 +772,7 @@ void MMCifReader::ParseEntityPoly(const std::vector<StringRef>& columns)
   // store type
   if (indices_[EP_TYPE] != -1) {
     edm_it->second.type = mol::ChainTypeFromString(columns[indices_[EP_TYPE]]);
+    edm_it->second.entity_poly_type = columns[indices_[EP_TYPE]].str();
   }
 
   // store seqres
@@ -1633,6 +1648,10 @@ void MMCifReader::OnDataRow(const StarLoopDesc& header,
     LOG_TRACE("processing pdbx_entity_branch_link entry");
     this->ParsePdbxEntityBranchLink(columns);
     break;
+  case ENTITY_POLY_SEQ:
+    LOG_TRACE("processing entity_poly_seq entry");
+    this->ParseEntityPolySeq(columns);
+    break;
   default:
     throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                        "Uncatched category '"+ header.GetCategory() +"' found.",
@@ -1780,6 +1799,7 @@ void MMCifReader::ParsePdbxEntityBranch(const std::vector<StringRef>& columns)
   // store type
   if (indices_[BR_ENTITY_TYPE] != -1) {
     edm_it->second.type = mol::ChainTypeFromString(columns[indices_[EP_TYPE]]);
+    edm_it->second.branched_type = columns[indices_[EP_TYPE]].str();
   }
 }
 
@@ -1833,6 +1853,45 @@ void MMCifReader::ParsePdbxEntityBranchLink(const std::vector<StringRef>& column
   blm_it->second.push_back(link_pair);
 }
 
+void MMCifReader::ParseEntityPolySeq(const std::vector<StringRef>& columns)
+{
+  std::pair<bool, int> tmp = columns[indices_[EPS_NUM]].to_int();
+  if(!tmp.first) {
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+                                             "Could not cast "
+                                             "_entity_poly_seq.num to integer: "
+                                             + columns[indices_[EPS_NUM]].str(),
+                                             this->GetCurrentLinenum())); 
+  }
+  int num = tmp.second;
+
+  if(num < 1) {
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+                                             "_entity_poly_seq.num are "
+                                             "expected to be ints >= 1. Got:"
+                                             + columns[indices_[EPS_NUM]].str(),
+                                             this->GetCurrentLinenum())); 
+  }
+
+  // [] inserts new value if not present in container
+  std::map<int, String>& entity_map =
+  entity_poly_seq_map_[columns[indices_[EPS_ENTITY_ID]].str()];
+
+  if(entity_map.find(num) != entity_map.end()) {
+    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+                                             "_entity_poly_seq.num must be "
+                                             "unique in same "
+                                             "_entity_poly_seq.entity_id. "
+                                             "entity_id: " +
+                                             columns[indices_[EPS_ENTITY_ID]].str() +
+                                             ", num: " +
+                                             columns[indices_[EPS_ENTITY_ID]].str(),
+                                             this->GetCurrentLinenum()));    
+  }
+
+  entity_map[num] = columns[indices_[EPS_MON_ID]].str();
+}
+
 void MMCifReader::OnEndData()
 {
   mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT);
@@ -1972,6 +2031,21 @@ void MMCifReader::OnEndData()
     }
   }
 
+  // conclude EntityDesc (add entity_poly_seq if present) and add to MMCifInfo
+  for(auto entity_it: entity_desc_map_) {
+    if(entity_poly_seq_map_.find(entity_it.first) != entity_poly_seq_map_.end()) {
+      int max_num = 1;
+      for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) {
+        max_num = std::max(max_num, seqres_it.first);
+      }
+      entity_it.second.mon_ids.assign(max_num, "?");
+      for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) {
+        entity_it.second.mon_ids[seqres_it.first-1] = seqres_it.second;
+      }
+    }
+    info_.SetEntityDesc(entity_it.first, entity_it.second);
+  }
+
   LOG_INFO("imported "
            << chain_count_ << " chains, "
            << residue_count_ << " residues, "
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index f2f2d2dbd..43f6cc220 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -338,6 +338,11 @@ protected:
   /// \param columns data row
   void ParsePdbxEntityBranchLink(const std::vector<StringRef>& columns);
 
+  /// \brief Fetch mmCIF entity_poly_seq information
+  ///
+  /// \param columns data row
+  void ParseEntityPolySeq(const std::vector<StringRef>& columns);
+
   /// \struct types of secondary structure
   typedef enum {
     MMCIF_HELIX,
@@ -595,6 +600,13 @@ private:
     BL_VALUE_ORDER               /// < bond order
   } EntityBranchLinkItems;
 
+  /// \enum items of the entity_poly_seq category
+  typedef enum {
+    EPS_ENTITY_ID,
+    EPS_MON_ID,
+    EPS_NUM
+  } EntityPolySeqItems;
+
   /// \enum categories of the mmcif format
   typedef enum {
     ATOM_SITE,
@@ -620,17 +632,10 @@ private:
     PDBX_DATABASE_STATUS,
     PDBX_ENTITY_BRANCH,
     PDBX_ENTITY_BRANCH_LINK,
+    ENTITY_POLY_SEQ,
     DONT_KNOW
   } MMCifCategory;
 
-  /// \struct keeping track of entity information
-  typedef struct {
-    mol::ChainType type; ///< characterise entity
-    String details;      ///< description of this entity
-    String seqres;       ///< chain of monomers
-  } MMCifEntityDesc;
-  typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
-
   /// \brief Get an iterator for MMCifEntityDescMap by finding an element or
   ///        inserting a new one into the map.
   /// \param entity_id ID of the entity to talk to
@@ -729,6 +734,8 @@ private:
   bool database_PDB_rev_added_;
   // for entity_branch connections
   MMCifPdbxEntityBranchLinkMap entity_branch_link_map_;
+  // for storing entity_poly_seq
+  std::map<String, std::map<int, String> > entity_poly_seq_map_;
 };
 
 /// \brief Translate mmCIF info on bond type (e.g.
-- 
GitLab