From 391def0820474a881746772673057b1c39864a86 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Thu, 25 Jan 2024 15:29:05 +0100
Subject: [PATCH] mmcif reader: enable heterogeneities in entity_poly_seq
 parsing

---
 modules/io/doc/mmcif.rst            | 32 ++++++++++++++++++---
 modules/io/pymod/export_mmcif_io.cc |  2 ++
 modules/io/src/mol/mmcif_info.hh    | 16 ++++++-----
 modules/io/src/mol/mmcif_reader.cc  | 44 ++++++++++++++++++++++-------
 modules/io/src/mol/mmcif_reader.hh  |  4 ++-
 5 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst
index 285e51cc8..8bb3b6fc6 100644
--- a/modules/io/doc/mmcif.rst
+++ b/modules/io/doc/mmcif.rst
@@ -1421,10 +1421,32 @@ of the annotation available.
   .. attribute:: mon_ids
 
     Monomer ids of all residues in a polymer - empty if entity is not of
-    type "polymer". Read from ``_entity_poly_seq`` category.
+    type "polymer". Read from ``_entity_poly_seq`` category. If a residue is
+    heterogeneous, this list contains the monomer id that comes first in
+    the CIF file. The other variants end up in
+    :attr:`hetero_num` / :attr:`hetero_ids`.
 
     :type: :class:`ost.base.StringList`
 
+  .. attribute:: hetero_num
+
+    Residue numbers of heterogeneous compounds - empty if entity is not
+    of type "polymer". Read from _entity_poly_seq category. If a residue is
+    heterogeneous, the monomer id that comes first in the CIF file ends up
+    in :attr:`mon_ids`. The remnant is listed here.
+    This list specifies the residue numbers for the respective monomer ids
+    in :attr:`hetero_ids`.
+
+  .. attribute:: hetero_ids
+
+    Monomer ids of heterogeneous compounds - empty if entity is not
+    of type "polymer". Read from _entity_poly_seq category. If a residue is
+    heterogeneous, the monomer id that comes first in the CIF file ends up
+    in :attr:`mon_ids`. The remnant is listed here.
+    This list specifies the monomer ids for the respective locations in
+    :attr:`hetero_num`.
+
+
 Writing mmCIF files
 --------------------------------------------------------------------------------
 
@@ -1805,15 +1827,17 @@ SEQRES is set to what we observe in the chain residues given their residue
 numbers (i.e. the ATOMSEQ). If the first residue has residue number 10, the
 SEQRES gets prefixed by 9 elements using a default value (e.g. UNK for a 
 chain of type CHAINTYPE_POLY_PEPTIDE_D). The same is done for gaps.
-Matching requires an exact match for ALL residues given their residue number
-with that SEQRES. However, there might be the case that one chain resolves
+A chain is considered matching an mmCIF entity, if all of its residue names
+are an exact match at the respective location in the SEQRES. Location is 
+determined with residue numbers which follow a 1-based indexing scheme.
+However, there might be the case that one chain resolves
 more residues than another. So you may have residues at locations that are
 undefined in the current SEQRES. If the fraction of matches with undefined
 locations does not exceed 5%, we still assume an overall match and fill
 in the previsouly undefined locations in the SEQRES with the newly gained
 information. This is a heuristic that works in most cases but potentially
 introduces errors in entity assignment. If you want to avoid that, you
-must set your entities manually and pass a list of :class:`MMCifWriterEntity`
+must set your entities manually and pass a :class:`MMCifWriterEntityList`
 when calling :func:`MMCifWriter.SetStructure`.
 
 if *mmcif_conform* is enabled, there is pretty much everything in place
diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc
index bd892405f..a6ff22c0d 100644
--- a/modules/io/pymod/export_mmcif_io.cc
+++ b/modules/io/pymod/export_mmcif_io.cc
@@ -494,6 +494,8 @@ void export_mmcif_io()
    .add_property("details", &MMCifEntityDesc::details)
    .add_property("seqres", &MMCifEntityDesc::seqres)
    .add_property("mon_ids", &MMCifEntityDesc::mon_ids)
+   .add_property("hetero_num", &MMCifEntityDesc::hetero_num)
+   .add_property("hetero_ids", &MMCifEntityDesc::hetero_ids)
   ;
 
   class_<MMCifInfo>("MMCifInfo", init<>())
diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh
index 054ecb3a6..d4ca56194 100644
--- a/modules/io/src/mol/mmcif_info.hh
+++ b/modules/io/src/mol/mmcif_info.hh
@@ -962,13 +962,15 @@ typedef std::map<String, std::vector<MMCifInfoEntityBranchLink> > MMCifInfoEntit
 
 /// \struct keeping track of entity information
 typedef struct {
-  mol::ChainType type;         ///< characterise entity
-  String entity_type;          ///< value of _entity.type
-  String entity_poly_type;     ///< value of _entity_poly.type
-  String branched_type;        ///< value of _pdbx_entity_branch.type
-  String details;              ///< description of this entity
-  String seqres;               ///< chain of monomers
-  std::vector<String> mon_ids; ///< list of full monomer names
+  mol::ChainType type;           ///< characterise entity
+  String entity_type;            ///< value of _entity.type
+  String entity_poly_type;       ///< value of _entity_poly.type
+  String branched_type;          ///< value of _pdbx_entity_branch.type
+  String details;                ///< description of this entity
+  String seqres;                 ///< chain of monomers
+  std::vector<String> mon_ids;   ///< list of monomer names from _entity_poly_seq
+  std::vector<int> hetero_num;   ///< res num of heterogeneous compounds
+  std::vector<String> hetero_ids;///< names of heterogeneous compounds
 } MMCifEntityDesc;
 typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
 
diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index 90970daa3..6f5553150 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -396,6 +396,9 @@ bool MMCifReader::OnBeginLoop(const StarLoopDesc& header)
   this->TryStoreIdx(EPS_ENTITY_ID, "entity_id", header);
   this->TryStoreIdx(EPS_MON_ID, "mon_id", header);
   this->TryStoreIdx(EPS_NUM, "num", header);
+
+  // optional items
+  indices_[EPS_HETERO] = header.GetIndex("hetero");
   cat_available = true;
  } else if (header.GetCategory() == "em_3d_reconstruction") {
     category_ = EM_3D_RECONSTRUCTION;
@@ -735,7 +738,9 @@ MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator(
                             .branched_type = "",
                             .details="",
                             .seqres="",
-                            .mon_ids=std::vector<String>()};
+                            .mon_ids=std::vector<String>(),
+                            .hetero_num=std::vector<int>(),
+                            .hetero_ids=std::vector<String>()};
     edm_it = entity_desc_map_.insert(entity_desc_map_.begin(),
                                      MMCifEntityDescMap::value_type(entity_id,
                                                                     desc));
@@ -1895,15 +1900,28 @@ void MMCifReader::ParseEntityPolySeq(const std::vector<StringRef>& columns)
   entity_poly_seq_map_[columns[indices_[EPS_ENTITY_ID]].str()];
 
   if(entity_map.find(num) != entity_map.end()) {
-    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                                             "_entity_poly_seq.num must be "
-                                             "unique in same "
-                                             "_entity_poly_seq.entity_id. "
-                                             "entity_id: " +
-                                             columns[indices_[EPS_ENTITY_ID]].str() +
-                                             ", num: " +
-                                             columns[indices_[EPS_ENTITY_ID]].str(),
-                                             this->GetCurrentLinenum()));    
+
+    if(indices_[EPS_HETERO] != -1 && columns[indices_[EPS_HETERO]][0] == 'y') {
+      // controlled vocabulary: "y", "yes", "n", "no"
+      // the first two mark hetereogeneous compounds
+
+      // [] inserts new value if not present
+      std::vector<std::pair<int, String> >& hetero_list =
+      entity_poly_seq_h_map_[columns[indices_[EPS_ENTITY_ID]].str()];
+
+      hetero_list.push_back(std::make_pair(num, columns[indices_[EPS_MON_ID]].str()));
+    } else {
+
+      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
+                                               "_entity_poly_seq.num must be "
+                                               "unique in same "
+                                               "_entity_poly_seq.entity_id. "
+                                               "entity_id: " +
+                                               columns[indices_[EPS_ENTITY_ID]].str() +
+                                               ", num: " +
+                                               columns[indices_[EPS_ENTITY_ID]].str(),
+                                               this->GetCurrentLinenum()));
+    }
   }
 
   entity_map[num] = columns[indices_[EPS_MON_ID]].str();
@@ -2060,6 +2078,12 @@ void MMCifReader::OnEndData()
         entity_it.second.mon_ids[seqres_it.first-1] = seqres_it.second;
       }
     }
+    if(entity_poly_seq_h_map_.find(entity_it.first) != entity_poly_seq_h_map_.end()) {
+      for(auto hetero_it: entity_poly_seq_h_map_[entity_it.first]) {
+        entity_it.second.hetero_num.push_back(hetero_it.first);
+        entity_it.second.hetero_ids.push_back(hetero_it.second);
+      }
+    }
     info_.SetEntityDesc(entity_it.first, entity_it.second);
   }
 
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index e0b4c4c0b..28798a49f 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -609,7 +609,8 @@ private:
   typedef enum {
     EPS_ENTITY_ID,
     EPS_MON_ID,
-    EPS_NUM
+    EPS_NUM,
+    EPS_HETERO
   } EntityPolySeqItems;
 
   /// \enum items of the entity_poly_seq category
@@ -747,6 +748,7 @@ private:
   MMCifPdbxEntityBranchLinkMap entity_branch_link_map_;
   // for storing entity_poly_seq
   std::map<String, std::map<int, String> > entity_poly_seq_map_;
+  std::map<String, std::vector<std::pair<int, String> > > entity_poly_seq_h_map_;
 };
 
 /// \brief Translate mmCIF info on bond type (e.g.
-- 
GitLab