introduce uniquely identifiable residue/atom names

Mainly for reporting in the compare-structures action residues are reported as strings in format: <cname>.<rnum>.<ins_code>, atoms are reported as <cname>.<rnum>.<ins_code>.<aname>. This changes behaviour of compare-structures and ost.mol.alg.stereochemistry (ToJSON functions of violation info objects for the latter).

introduce uniquely identifiable residue/atom names
a4e9d9a7 · Studer Gabriel · 01791c49 · a4e9d9a7 · a4e9d9a7 · a4e9d9a7
Commit a4e9d9a7 authored 2 years ago by Studer Gabriel
--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -17,7 +17,7 @@ Loads the structures and performs basic cleanup:
 The cleaned structures are optionally dumped using -d/--dump-structures

 Output is written in JSON format (default: out.json). In case of no additional
-options, this is a dictionary with 8 keys:
+options, this is a dictionary with 8 keys describing model/reference comparison:

 * "reference_chains": Chain names of reference
 * "model_chains": Chain names of model
@@ -36,7 +36,7 @@ options, this is a dictionary with 8 keys:
   format.
 * "inconsistent_residues": List of strings that represent name mismatches of
   aligned residues in form
-   <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
+   <trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>.
   Inconsistencies may lead to corrupt results but do not abort the program.
   Program abortion in these cases can be enforced with
   -ec/--enforce-consistency.
@@ -44,6 +44,20 @@ options, this is a dictionary with 8 keys:
   content of the JSON output will be \"status\" set to FAILURE and an
   additional key: "traceback".

+The following additional keys store relevant input parameters to reproduce
+results:
+
+ * "model"
+ * "reference"
+ * "fault_tolerant"
+ * "model_biounit"
+ * "reference_biounit"
+ * "residue_number_alignment"
+ * "enforce_consistency"
+ * "cad_exec"
+ * "usalign_exec"
+ * "lddt_no_stereochecks"
+
 The pairwise sequence alignments are computed with Needleman-Wunsch using
 BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
 structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
@@ -216,13 +230,16 @@ def _ParseArgs():
        default=False,
        action="store_true",
        help=("Compute per-residue lDDT scores with default parameterization "
-              "and store as key \"local_lddt\". Score for model residue with "
-              "number 42 in chain X can be extracted with: "
-              "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion "
-              "code, lets say A, the last key becomes \"42A\" Stereochemical "
-              "irregularities affecting lDDT are reported as keys "
-              "\"model_clashes\", \"model_bad_bonds\", \"model_bad_angles\" "
-              "and the respective reference counterparts."))
+              "and store as key \"local_lddt\". Score for each residue is "
+              "accessible by key <chain_name>.<resnum>.<resnum_inscode>. "
+              "Residue with number 42 in chain X can be extracted with: "
+              "data[\"local_lddt\"][\"X.42.\"]. If there is an insertion "
+              "code, lets say A, the residue key becomes \"X.42.A\". "
+              "Stereochemical irregularities affecting lDDT are reported as "
+              "keys \"model_clashes\", \"model_bad_bonds\", "
+              "\"model_bad_angles\" and the respective reference "
+              "counterparts. Atoms specified in there follow the following "
+              "format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>"))

    parser.add_argument(
        "--bb-lddt",
@@ -241,10 +258,8 @@ def _ParseArgs():
        help=("Compute per-residue lDDT scores with default parameterization "
              "and store as key \"bb_local_lddt\". lDDT in this case is only "
              "computed on backbone atoms: CA for peptides and C3' for "
-              "nucleotides. Score for model residue with "
-              "number 42 in chain X can be extracted with: "
-              "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion "
-              "code, lets say A, the last key becomes \"42A\""))
+              "nucleotides. Per-residue scores are accessible as described for "
+              "local_lddt."))

    parser.add_argument(
        "--cad-score",
@@ -262,12 +277,10 @@ def _ParseArgs():
        default=False,
        action="store_true",
        help=("Compute local CAD's atom-atom (AA) scores and store as key "
-              "\"local_cad_score\". Score for model residue with number 42 in "
-              "chain X can be extracted with: "
-              "data[\"local_cad_score\"][\"X\"][\"42\"]. "
-              "--residue-number-alignments must be enabled to compute this "
-              "score. Requires voronota_cadscore executable in PATH. "
-              "Alternatively you can set cad-exec."))
+              "\"local_cad_score\". Per-residue scores are accessible as "
+              "described for local_lddt. --residue-number-alignments must be "
+              "enabled to compute this score. Requires voronota_cadscore "
+              "executable in PATH. Alternatively you can set cad-exec."))

    parser.add_argument(
        "--cad-exec",
@@ -350,8 +363,9 @@ def _ParseArgs():
              "None. Model interface residues are available as key "
              "\"model_interface_residues\", reference interface residues as "
              "key \"reference_interface_residues\". Residues are represented "
-              "as string in form <num><inscode>. The respective scores are "
-              "available as keys \"patch_qs\" and \"patch_dockq\""))
+              "as string in form <chain_name>.<resnum>.<resnum_inscode>. "
+              "The respective scores are available as keys \"patch_qs\" and "
+              "\"patch_dockq\""))

    parser.add_argument(
        "--tm-score",
@@ -474,29 +488,47 @@ def _GetInconsistentResidues(alns):
            r1 = col.GetResidue(0)
            r2 = col.GetResidue(1)
            if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName():
-                lst.append((r1, r2))
-    lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst]
+                ch_1 = r1.GetChain().name
+                num_1 = r1.number.num
+                ins_code_1 = r1.number.ins_code.strip("\u0000")
+                id_1 = f"{ch_1}.{num_1}.{ins_code_1}"
+                ch_2 = r2.GetChain().name
+                num_2 = r2.number.num
+                ins_code_2 = r2.number.ins_code.strip("\u0000")
+                id_2 = f"{ch_2}.{num_2}.{ins_code_2}"
+                lst.append(f"{id_1}-{id_2}")
    return lst

 def _LocalScoresToJSONDict(score_dict):
-    """ Score for model residue with number rnum in chain X can be extracted
-    with: data["local_cad_score"]["X"][rnum]. Convert ResNum object to str for
-    JSON serialization
+    """ Convert ResNums to str for JSON serialization
    """
    json_dict = dict()
    for ch, ch_scores in score_dict.items():
-        json_dict[ch] = {str(rnum): s for rnum, s in ch_scores.items()}
+        for num, s in ch_scores.items():
+            ins_code = num.ins_code.strip("\u0000")
+            json_dict[f"{ch}.{num.num}.{ins_code}"] = s
    return json_dict

-def _InterfaceResiduesToJSONDict(interface_dict):
-    """ Interface residues are stored as
-    data["model_interface_residues"]["A"][rnum1, rnum2,...]. Convert ResNum
-    object to str for JSON serialization.
+def _InterfaceResiduesToJSONList(interface_dict):
+    """ Convert ResNums to str for JSON serialization.
+
+    Changes in this function will affect _PatchScoresToJSONList
    """
-    json_dict = dict()
+    json_list = list()
    for ch, ch_nums in interface_dict.items():
-        json_dict[ch] = [str(rnum) for rnum in ch_nums]
-    return json_dict
+        for num in ch_nums:
+            ins_code = num.ins_code.strip("\u0000")
+            json_list.append(f"{ch}.{num.num}.{ins_code}")
+    return json_list
+
+def _PatchScoresToJSONList(interface_dict, score_dict):
+    """ Creates List of patch scores that are consistent with interface residue
+    lists
+    """
+    json_list = list()
+    for ch, ch_nums in interface_dict.items():
+        json_list += score_dict[ch]
+    return json_list

 def _Process(model, reference, args):

@@ -579,11 +611,13 @@ def _Process(model, reference, args):

    if args.patch_scores:
        out["model_interface_residues"] = \
-        _InterfaceResiduesToJSONDict(scorer.model_interface_residues)
+        _InterfaceResiduesToJSONList(scorer.model_interface_residues)
        out["reference_interface_residues"] = \
-        _InterfaceResiduesToJSONDict(scorer.target_interface_residues)
-        out["patch_qs"] = scorer.patch_qs
-        out["patch_dockq"] = scorer.patch_dockq
+        _InterfaceResiduesToJSONList(scorer.target_interface_residues)
+        out["patch_qs"] = _PatchScoresToJSONList(scorer.model_interface_residues,
+                                                 scorer.patch_qs)
+        out["patch_dockq"] = _PatchScoresToJSONList(scorer.model_interface_residues,
+                                                    scorer.patch_dockq)

    if args.tm_score:
        out["tm_score"] = scorer.tm_score
@@ -614,10 +648,6 @@ def _Process(model, reference, args):
                                   "fails in this case.")
            else:
                raise
-
-
-
-
    return out

 def _Main():

--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -40,9 +40,11 @@ Details on the usage (output of ``ost compare-structures --help``):
                                [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] [--lddt]
                                [--local-lddt] [--bb-lddt] [--bb-local-lddt]
                                [--cad-score] [--local-cad-score]
-                                [--cad-exec CAD_EXEC] [--qs-score]
+                                [--cad-exec CAD_EXEC]
+                                [--usalign-exec USALIGN_EXEC] [--qs-score]
                                [--rigid-scores] [--interface-scores]
-                                [--patch-scores]
+                                [--patch-scores] [--tm-score]
+                                [--lddt-no-stereochecks]
  
  Evaluate model against reference 
  
@@ -62,8 +64,8 @@ Details on the usage (output of ``ost compare-structures --help``):
  The cleaned structures are optionally dumped using -d/--dump-structures
  
  Output is written in JSON format (default: out.json). In case of no additional
-  options, this is a dictionary with 8 keys:
-
+  options, this is a dictionary with 8 keys describing model/reference comparison:
+  
   * "reference_chains": Chain names of reference
   * "model_chains": Chain names of model
   * "chem_groups": Groups of polypeptides/polynucleotides from reference that
@@ -81,29 +83,43 @@ Details on the usage (output of ``ost compare-structures --help``):
     format.
   * "inconsistent_residues": List of strings that represent name mismatches of
     aligned residues in form
-     <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
+     <trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>.
     Inconsistencies may lead to corrupt results but do not abort the program.
     Program abortion in these cases can be enforced with
     -ec/--enforce-consistency.
   * "status": SUCCESS if everything ran through. In case of failure, the only
     content of the JSON output will be "status" set to FAILURE and an
     additional key: "traceback".
-
+  
+  The following additional keys store relevant input parameters to reproduce
+  results:
+  
+   * "model"
+   * "reference"
+   * "fault_tolerant"
+   * "model_biounit"
+   * "reference_biounit"
+   * "residue_number_alignment"
+   * "enforce_consistency"
+   * "cad_exec"
+   * "usalign_exec"
+   * "lddt_no_stereochecks"
+  
  The pairwise sequence alignments are computed with Needleman-Wunsch using
  BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
  structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
  enabling -rna/--residue-number-alignment is recommended.
-
+  
  Each score is opt-in and can be enabled with optional arguments.
-
+  
  Example to compute global and per-residue lDDT values as well as QS-score:
-
+  
  ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score
-
+  
  Example to inject custom chain mapping
-
+  
  ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
-
+  
  optional arguments:
    -h, --help            show this help message and exit
    -m MODEL, --model MODEL
@@ -164,13 +180,17 @@ Details on the usage (output of ``ost compare-structures --help``):
                          counterparts.
    --local-lddt          Compute per-residue lDDT scores with default
                          parameterization and store as key "local_lddt". Score
-                          for model residue with number 42 in chain X can be
-                          extracted with: data["local_lddt"]["X"]["42"]. If
-                          there is an insertion code, lets say A, the last key
-                          becomes "42A" Stereochemical irregularities affecting
-                          lDDT are reported as keys "model_clashes",
-                          "model_bad_bonds", "model_bad_angles" and the
-                          respective reference counterparts.
+                          for each residue is accessible by key
+                          <chain_name>.<resnum>.<resnum_inscode>. Residue with
+                          number 42 in chain X can be extracted with:
+                          data["local_lddt"]["X.42."]. If there is an insertion
+                          code, lets say A, the residue key becomes "X.42.A".
+                          Stereochemical irregularities affecting lDDT are
+                          reported as keys "model_clashes", "model_bad_bonds",
+                          "model_bad_angles" and the respective reference
+                          counterparts. Atoms specified in there follow the
+                          following format:
+                          <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
    --bb-lddt             Compute global lDDT score with default
                          parameterization and store as key "bb_lddt". lDDT in
                          this case is only computed on backbone atoms: CA for
@@ -178,26 +198,26 @@ Details on the usage (output of ``ost compare-structures --help``):
    --bb-local-lddt       Compute per-residue lDDT scores with default
                          parameterization and store as key "bb_local_lddt".
                          lDDT in this case is only computed on backbone atoms:
-                          CA for peptides and C3' for nucleotides. Score for
-                          model residue with number 42 in chain X can be
-                          extracted with: data["local_lddt"]["X"]["42"]. If
-                          there is an insertion code, lets say A, the last key
-                          becomes "42A"
+                          CA for peptides and C3' for nucleotides. Per-residue
+                          scores are accessible as described for local_lddt.
    --cad-score           Compute global CAD's atom-atom (AA) score and store as
                          key "cad_score". --residue-number-alignment must be
                          enabled to compute this score. Requires
                          voronota_cadscore executable in PATH. Alternatively
                          you can set cad-exec.
    --local-cad-score     Compute local CAD's atom-atom (AA) scores and store as
-                          key "local_cad_score". Score for model residue with
-                          number 42 in chain X can be extracted with:
-                          data["local_cad_score"]["X"]["42"]. --residue-number-
-                          alignments must be enabled to compute this score.
-                          Requires voronota_cadscore executable in PATH.
+                          key "local_cad_score". Per-residue scores are
+                          accessible as described for local_lddt. --residue-
+                          number-alignments must be enabled to compute this
+                          score. Requires voronota_cadscore executable in PATH.
                          Alternatively you can set cad-exec.
    --cad-exec CAD_EXEC   Path to voronota-cadscore executable (installed from
                          https://github.com/kliment-olechnovic/voronota).
                          Searches PATH if not set.
+    --usalign-exec USALIGN_EXEC
+                          Path to USalign executable to compute TM-score. If not
+                          given, an OpenStructure internal copy of USalign code
+                          is used.
    --qs-score            Compute QS-score, stored as key "qs_global", and the
                          QS-best variant, stored as key "qs_best".
    --rigid-scores        Computes rigid superposition based scores. They're
@@ -248,12 +268,13 @@ Details on the usage (output of ``ost compare-structures --help``):
                          "model_interface_residues", reference interface
                          residues as key "reference_interface_residues".
                          Residues are represented as string in form
-                          <num><inscode>. The respective scores are available as
-                          keys "patch_qs" and "patch_dockq"
+                          <chain_name>.<resnum>.<resnum_inscode>. The respective
+                          scores are available as keys "patch_qs" and
+                          "patch_dockq"
    --tm-score            Computes TM-score with the USalign tool. Also computes
-                          a chain mapping in case of complexes that is stored
-                          in the same format as the default mapping. TM-score
-                          and the mapping are available as keys "tm_score" and
+                          a chain mapping in case of complexes that is stored in
+                          the same format as the default mapping. TM-score and
+                          the mapping are available as keys "tm_score" and
                          "usalign_mapping"
    --lddt-no-stereochecks
                          Disable stereochecks for lDDT computation

--- a/modules/mol/alg/pymod/stereochemistry.py
+++ b/modules/mol/alg/pymod/stereochemistry.py
@@ -9,6 +9,18 @@ from ost import geom
 from ost import mol


+def _AtomToQualifiedName(a):
+    """ Returns string to uniquely identify atom
+
+    format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
+    """
+    r = a.GetResidue()
+    ch = r.GetChain()
+    num = r.number.num
+    ins_code = r.number.ins_code.strip("\u0000")
+    return f"{ch.name}.{r.number.num}.{ins_code}.{a.name}"
+
+
 def _PotentialDisulfid(a_one, a_two):
    """ Returns whether two atoms can potentially build a disulfid bond

@@ -355,9 +367,12 @@ class ClashInfo:

    def ToJSON(self):
        """ Return JSON serializable dict
+
+        Atoms are represented by a string in format:
+        <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
        """
-        return {"a1": self.a1.GetQualifiedName(),
-                "a2": self.a2.GetQualifiedName(),
+        return {"a1": _AtomToQualifiedName(self.a1),
+                "a2": _AtomToQualifiedName(self.a2),
                "dist": self.dist,
                "tolerated_dist": self.tolerated_dist}

@@ -382,9 +397,12 @@ class BondViolationInfo:

    def ToJSON(self):
        """ Return JSON serializable dict
+
+        Atoms are represented by a string in format:
+        <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
        """
-        return {"a1": self.a1.GetQualifiedName(),
-                "a2": self.a2.GetQualifiedName(),
+        return {"a1": _AtomToQualifiedName(self.a1),
+                "a2": _AtomToQualifiedName(self.a2),
                "length": self.length,
                "exp_length": self.exp_length,
                "std": self.std}
@@ -412,10 +430,13 @@ class AngleViolationInfo:

    def ToJSON(self):
        """ Return JSON serializable dict
+
+        Atoms are represented by a string in format:
+        <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
        """
-        return {"a1": self.a1.GetQualifiedName(),
-                "a2": self.a2.GetQualifiedName(),
-                "a3": self.a3.GetQualifiedName(),
+        return {"a1": _AtomToQualifiedName(self.a1),
+                "a2": _AtomToQualifiedName(self.a2),
+                "a3": _AtomToQualifiedName(self.a3),
                "angle": self.angle,
                "exp_angle": self.exp_angle,
                "std": self.std}