From b6b442044c9f17246fdbb0ae5fd4037b8ecf43fc Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Fri, 21 Apr 2023 15:22:28 +0200
Subject: [PATCH] feat: SCHWED-3293 read mapping data into string properties

---
 modules/io/doc/mmcif.rst              | 25 +++++++++++++++++++++++--
 modules/io/pymod/__init__.py          |  8 ++++----
 modules/io/src/mol/mmcif_reader.cc    |  9 ++++++++-
 modules/io/tests/test_mmcif_reader.cc | 11 +++++++++++
 4 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst
index c4c361bda..79281e318 100644
--- a/modules/io/doc/mmcif.rst
+++ b/modules/io/doc/mmcif.rst
@@ -67,8 +67,29 @@ Notes:
   :meth:`~MMCifInfo.GetPDBMMCifChainTr` if SEQRES records are read in
   :func:`~ost.io.LoadMMCIF` and a non-empty SEQRES record exists for that chain
   (this should exclude ligands and water).
-* Molecular entities in mmCIF are identified by an ``entity.id``. Each chain is
-  mapped to an ID in :class:`MMCifInfo` as :meth:`~MMCifInfo.GetMMCifEntityIdTr`.
+* Molecular entities in mmCIF are identified by an ``entity.id``, which is
+  extracted from ``atom_site.label_entity_id`` for the first atom of the chain.
+  It is added as a string property named "entity_id" to the
+  :class:`~ost.mol.ChainHandle`. Each chain is mapped to an ID in
+  :class:`MMCifInfo` as :meth:`~MMCifInfo.GetMMCifEntityIdTr`.
+* For more complex mappings, such as ligands which may be in the same "old" chain
+  as the protein chain but are represented in a separate "new" chain in mmCIF,
+  we also store string properties on a per-residue level.
+  For mmCIF files from the PDB, there is a unique mapping between
+  ("label_asym_id", "label_seq_id") and ("auth_asym_id", "auth_seq_id",
+  "pdbx_PDB_ins_code"). The following data items are available:
+
+    * ``atom_site.label_asym_id``: ``residue.chain.name``
+    * ``atom_site.label_seq_id``: ``residue.GetStringProp("resnum")``
+      (this is the same as ``residue.number`` for residues in polymer chains.
+      However, for ligands the residue number is unset in mmCIF, but
+      ``residue.number`` is set to 1 by OpenStructure.)
+    * ``atom_site.label_entity_id``: ``residue.GetStringProp("entity_id")``
+    * ``atom_site.auth_asym_id``: ``residue.GetStringProp("pdb_auth_chain_name")``
+    * ``atom_site.auth_seq_id``: ``residue.GetStringProp("pdb_auth_resnum")``
+    * ``atom_site.pdbx_PDB_ins_code``: ``residue.GetStringProp("pdb_auth_ins_code")``
+* Missing values in the aforementioned data items will be denoted as ``.`` or
+  ``?``.
 
 Info Classes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/modules/io/pymod/__init__.py b/modules/io/pymod/__init__.py
index 1e9eb0b07..4be3ea2ca 100644
--- a/modules/io/pymod/__init__.py
+++ b/modules/io/pymod/__init__.py
@@ -356,10 +356,10 @@ def LoadMMCIF(filename, fault_tolerant=None, calpha_only=None,
   customize the exact behaviour of the mmCIF import. For more information on
   these options, see :doc:`profile`.
   
-  Residues are flagged as ligand if they are not covered by an ``entity_poly``
-  record (ie. they are non-polymer entities in ``pdbx_entity_nonpoly``). Note
-  that all residues will be flagged as ligands if ``seqres=False`` (the
-  default).
+  Residues are flagged as ligand if they are neither waters nor covered by an
+  ``entity_poly`` record (i.e. they are non-polymer entities in
+  ``pdbx_entity_nonpoly``). Note that all residues except waters will be
+  flagged as ligands if ``seqres=False`` (the default).
 
   :param filename: File to be loaded
   :type filename: :class:`str`
diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index 527f68fbe..07069ddc3 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -559,6 +559,7 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns)
       ++chain_count_;
       // store entity id
       String ent_id = columns[indices_[LABEL_ENTITY_ID]].str();
+      curr_chain_.SetStringProp("entity_id", ent_id);
       chain_id_pairs_.push_back(std::pair<mol::ChainHandle,String>(curr_chain_,
                                                                    ent_id));
       info_.AddMMCifEntityIdTr(cif_chain_name, ent_id);
@@ -580,14 +581,20 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns)
       curr_residue_=curr_chain_.FindResidue(res_num);
     }
     if (!curr_residue_.IsValid()) { // unit test
-      LOG_DEBUG("new residue " << res_name << " " << res_num);
+      LOG_TRACE("new residue " << res_name << " " << res_num);
       if (valid_res_num) {
         curr_residue_ = editor.AppendResidue(curr_chain_,
                                              res_name.str(),
                                              res_num);
+
       } else {
         curr_residue_ = editor.AppendResidue(curr_chain_, res_name.str());
       }
+      curr_residue_.SetStringProp("pdb_auth_chain_name", auth_chain_name);
+      curr_residue_.SetStringProp("pdb_auth_resnum", columns[indices_[AUTH_SEQ_ID]].str());
+      curr_residue_.SetStringProp("pdb_auth_ins_code", columns[indices_[PDBX_PDB_INS_CODE]].str());
+      curr_residue_.SetStringProp("entity_id", columns[indices_[LABEL_ENTITY_ID]].str());
+      curr_residue_.SetStringProp("resnum", columns[indices_[LABEL_SEQ_ID]].str());
       warned_name_mismatch_=false;
       ++residue_count_; 
     }
diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc
index 57fb3d3e0..bee89e0e2 100644
--- a/modules/io/tests/test_mmcif_reader.cc
+++ b/modules/io/tests/test_mmcif_reader.cc
@@ -1225,6 +1225,17 @@ BOOST_AUTO_TEST_CASE(mmcif_testreader)
   BOOST_CHECK(ch.IsValid());
   BOOST_TEST_MESSAGE("          done.");
 
+  BOOST_TEST_MESSAGE("          testing chain/residue mapping properties...");
+  BOOST_CHECK_EQUAL(ch.GetStringProp("pdb_auth_chain_name"), "A");
+  BOOST_CHECK_EQUAL(ch.GetStringProp("entity_id"), "1");
+  mol::ResidueHandle res = ch.FindResidue(12);
+  BOOST_CHECK(res.IsValid());
+  BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_chain_name"), "A");
+  BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_resnum"), "12");
+  BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_ins_code"), "?");
+  BOOST_CHECK_EQUAL(res.GetStringProp("entity_id"), "1");
+  BOOST_TEST_MESSAGE("          done.");
+
   BOOST_TEST_MESSAGE("          testing numbering water...");
   ch = eh.FindChain("O");
   BOOST_CHECK(ch.IsValid());
-- 
GitLab