From 935bc608440c2dfdfbc0ac54dfcbce44fb3bb16f Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Thu, 9 Mar 2023 16:07:49 +0100
Subject: [PATCH] improve doc and output of compare-structures

- Tell the user that biounit indices are 0-based
- More info in output on what chains are actually processed
---
 actions/ost-compare-structures         | 27 ++++++++++++++------
 modules/doc/actions.rst                | 28 +++++++++++++++------
 modules/mol/alg/pymod/chain_mapping.py | 35 ++++++++++++++++----------
 3 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures
index dd513e0ea..e2cb51438 100644
--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -12,18 +12,28 @@ Loads the structures and performs basic cleanup:
  * Remove OXT atoms
  * Remove unknown atoms, i.e. atoms that are not expected according to the PDB
    compound dictionary
+ * Select for peptide/nucleotide residues
 
 The cleaned structures are optionally dumped using -d/--dump-structures
 
 Output is written in JSON format (default: out.json). In case of no additional
-options, this is a dictionary with five keys:
-
+options, this is a dictionary with 8 keys:
+
+ * "reference_chains": Chain names of reference
+ * "model_chains": Chain names of model
+ * "chem_groups": Groups of polypeptides/polynucleotides from reference that
+   are considered chemically equivalent. You can derive stoichiometry from this.
+   Contains only chains that are considered in chain mapping, i.e. pass a
+   size threshold (defaults: 10 for peptides, 4 for nucleotides).
+ * "chem_mapping": List of same length as "chem_groups". Assigns model chains to
+   the respective chem group. Again, only contains chains that are considered
+   in chain mapping.
  * "chain_mapping": A dictionary with reference chain names as keys and the
-   mapped model chain names as values.
+   mapped model chain names as values. Missing chains are either not mapped
+   (but present in "chem_groups", "chem_mapping") or were not considered in
+   chain mapping (short peptides etc.)
  * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
    format.
- * "chem_groups": Groups of polypeptides/polynucleotides that are considered
-   chemically equivalent. You can derive stoichiometry from this.
  * "inconsistent_residues": List of strings that represent name mismatches of
    aligned residues in form
    <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
@@ -380,7 +390,7 @@ def _LoadStructure(structure_path, sformat=None, fault_tolerant=False,
                          fault_tolerant=fault_tolerant)
             if bu_idx >= len(cif_info.biounits):
                 raise RuntimeError(f"Invalid biounit index - requested {bu_idx} "
-                                  f"cif file has {len(cif_info.biounits)}")
+                                   f"must be < {len(cif_info.biounits)}.")
             biounit = cif_info.biounits[bu_idx]
             entity = biounit.PDBize(cif_entity, min_polymer_size=0)
             if not entity.IsValid():
@@ -458,10 +468,13 @@ def _Process(model, reference, args):
         raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}")
 
     out = dict()
+    out["reference_chains"] = [ch.GetName() for ch in scorer.target.chains]
+    out["model_chains"] = [ch.GetName() for ch in scorer.model.chains]
+    out["chem_groups"] = scorer.chain_mapper.chem_groups
+    out["chem_mapping"] = scorer.mapping.chem_mapping
     out["chain_mapping"] = scorer.mapping.GetFlatMapping()
     out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln]
     out["inconsistent_residues"] = ir
-    out["chem_groups"] = scorer.chain_mapper.chem_groups
 
     if args.lddt:
         out["lddt"] = scorer.lddt
diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst
index c10f942e8..df447090b 100644
--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -55,18 +55,28 @@ Details on the usage (output of ``ost compare-structures --help``):
    * Remove OXT atoms
    * Remove unknown atoms, i.e. atoms that are not expected according to the PDB
      compound dictionary
+   * Select for peptide/nucleotide residues
   
   The cleaned structures are optionally dumped using -d/--dump-structures
   
   Output is written in JSON format (default: out.json). In case of no additional
-  options, this is a dictionary with five keys:
+  options, this is a dictionary with 8 keys:
   
+   * "reference_chains": Chain names of reference
+   * "model_chains": Chain names of model
+   * "chem_groups": Groups of polypeptides/polynucleotides from reference that
+     are considered chemically equivalent. You can derive stoichiometry from this.
+     Contains only chains that are considered in chain mapping, i.e. pass a
+     size threshold (defaults: 10 for peptides, 4 for nucleotides).
+   * "chem_mapping": List of same length as "chem_groups". Assigns model chains to
+     the respective chem group. Again, only contains chains that are considered
+     in chain mapping.
    * "chain_mapping": A dictionary with reference chain names as keys and the
-     mapped model chain names as values.
+     mapped model chain names as values. Missing chains are either not mapped
+     (but present in "chem_groups", "chem_mapping") or were not considered in
+     chain mapping (short peptides etc.)
    * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
      format.
-   * "chem_groups": Groups of polypeptides/polynucleotides that are considered
-     chemically equivalent. You can derive stoichiometry from this.
    * "inconsistent_residues": List of strings that represent name mismatches of
      aligned residues in form
      <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
@@ -76,7 +86,7 @@ Details on the usage (output of ``ost compare-structures --help``):
    * "status": SUCCESS if everything ran through. In case of failure, the only
      content of the JSON output will be "status" set to FAILURE and an
      additional key: "traceback".
-  
+
   The pairwise sequence alignments are computed with Needleman-Wunsch using
   BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
   structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
@@ -91,7 +101,7 @@ Details on the usage (output of ``ost compare-structures --help``):
   Example to inject custom chain mapping
   
   ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
-
+  
   optional arguments:
     -h, --help            show this help message and exit
     -m MODEL, --model MODEL
@@ -113,12 +123,14 @@ Details on the usage (output of ``ost compare-structures --help``):
                           Only has an effect if model is in mmcif format. By
                           default, the assymetric unit (AU) is used for scoring.
                           If there are biounits defined in the mmcif file, you
-                          can specify the index of the one which should be used.
+                          can specify the (0-based) index of the one which
+                          should be used.
     -rb REFERENCE_BIOUNIT, --reference-biounit REFERENCE_BIOUNIT
                           Only has an effect if reference is in mmcif format. By
                           default, the assymetric unit (AU) is used for scoring.
                           If there are biounits defined in the mmcif file, you
-                          can specify the index of the one which should be used.
+                          can specify the (0-based) index of the one which
+                          should be used.
     -rna, --residue-number-alignment
                           Make alignment based on residue number instead of
                           using a global BLOSUM62-based alignment (NUC44 for
diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py
index 932e00332..37c36a132 100644
--- a/modules/mol/alg/pymod/chain_mapping.py
+++ b/modules/mol/alg/pymod/chain_mapping.py
@@ -34,11 +34,12 @@ class MappingResult:
     Constructor is directly called within the functions, no need to construct
     such objects yourself.
     """
-    def __init__(self, target, model, chem_groups, mapping, alns,
+    def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns,
                  opt_score=None):
         self._target = target
         self._model = model
         self._chem_groups = chem_groups
+        self._chem_mapping = chem_mapping
         self._mapping = mapping
         self._alns = alns
         self._opt_score = opt_score
@@ -72,6 +73,14 @@ class MappingResult:
         """
         return self._chem_groups
 
+    @property
+    def chem_mapping(self):
+        """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
+
+        :class:`list` of :class:`list` of :class:`str` (chain names)
+        """
+        return self._chem_mapping
+
     @property
     def mapping(self):
         """ Mapping of :attr:`~model` chains onto :attr:`~target`
@@ -853,8 +862,8 @@ class ChainMapper:
                         aln.AttachView(0, _CSel(self.target, [ref_ch]))
                         aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                         alns[(ref_ch, mdl_ch)] = aln
-            return MappingResult(self.target, mdl, self.chem_groups, one_to_one,
-                                 alns)
+            return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                                 one_to_one, alns)
 
         mapping = None
         opt_lddt = None
@@ -890,8 +899,8 @@ class ChainMapper:
                     aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                     alns[(ref_ch, mdl_ch)] = aln
 
-        return MappingResult(self.target, mdl, self.chem_groups, mapping,
-                             alns, opt_score = opt_lddt)
+        return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                             mapping, alns, opt_score = opt_lddt)
 
 
     def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "naive",
@@ -970,8 +979,8 @@ class ChainMapper:
                         aln.AttachView(0, _CSel(self.target, [ref_ch]))
                         aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                         alns[(ref_ch, mdl_ch)] = aln
-            return MappingResult(self.target, mdl, self.chem_groups, one_to_one,
-                                 alns)
+            return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                                 one_to_one, alns)
         mapping = None
         opt_qsscore = None
 
@@ -1007,8 +1016,8 @@ class ChainMapper:
                     aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                     alns[(ref_ch, mdl_ch)] = aln
 
-        return MappingResult(self.target, mdl, self.chem_groups, mapping,
-                             alns, opt_score = opt_qsscore)
+        return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                             mapping, alns, opt_score = opt_qsscore)
 
     def GetRigidMapping(self, model, strategy = "greedy_single_gdtts",
                         single_chain_gdtts_thresh=0.4, subsampling=None,
@@ -1108,8 +1117,8 @@ class ChainMapper:
                         aln.AttachView(0, _CSel(self.target, [ref_ch]))
                         aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                         alns[(ref_ch, mdl_ch)] = aln
-            return MappingResult(self.target, mdl, self.chem_groups, one_to_one,
-                                 alns)
+            return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                                 one_to_one, alns)
 
         trg_group_pos, mdl_group_pos = _GetRefPos(self.target, mdl,
                                                   self.chem_group_alignments,
@@ -1188,8 +1197,8 @@ class ChainMapper:
                     aln.AttachView(1, _CSel(mdl, [mdl_ch]))
                     alns[(ref_ch, mdl_ch)] = aln
 
-        return MappingResult(self.target, mdl, self.chem_groups, final_mapping,
-                             alns)
+        return MappingResult(self.target, mdl, self.chem_groups, chem_mapping,
+                             final_mapping, alns)
 
 
     def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
-- 
GitLab