refactor: mol.alg.scoring / compare-structures

enable contact scores and separate dockq and per-interface qs scores

refactor: mol.alg.scoring / compare-structures
38989614 · Studer Gabriel · 4175597e · 38989614 · 38989614 · 38989614
Commit 38989614 authored 1 year ago by Studer Gabriel
--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -303,7 +303,62 @@ def _ParseArgs():
        default=False,
        action="store_true",
        help=("Compute QS-score, stored as key \"qs_global\", and the QS-best "
-              "variant, stored as key \"qs_best\"."))
+              "variant, stored as key \"qs_best\". Interfaces in the reference "
+              "with non-zero contribution to QS-score are available as key "
+              "\"qs_reference_interfaces\", the ones from the model as key "
+              "\"qs_model_interfaces\". \"qs_interfaces\" is a subset of "
+              "\"qs_reference_interfaces\" that contains interfaces that "
+              "can be mapped to the model. They are stored as lists in format "
+              "[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
+              "per-interface scores for \"qs_interfaces\" are available as "
+              "keys \"per_interface_qs_global\" and \"per_interface_qs_best\""))
+
+    parser.add_argument(
+        "--dockq",
+        dest="dockq",
+        default=False,
+        action="store_true",
+        help=("Compute DockQ scores and its components. Relevant interfaces "
+              "with at least one contact (any atom within 5A) of the reference "
+              "structure are available as key \"dockq_reference_interfaces\". "
+              "Only interfaces between peptide chains are considered here! "
+              "Key \"dockq_interfaces\" is a subset of "
+              "\"dockq_reference_interfaces\" that contains interfaces that "
+              "can be mapped to the model. They are stored as lists in format "
+              "[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
+              "DockQ scores for \"dockq_interfaces\" are available as key "
+              "\"dockq\". It's components are available as keys: "
+              "\"fnat\" (fraction of reference contacts which are also there "
+              "in model) \"irmsd\" (interface RMSD), \"lrmsd\" (ligand RMSD). "
+              "The DockQ score is strictly designed to score each interface "
+              "individually. We also provide two averaged versions to get one "
+              "full model score: \"dockq_ave\", \"dockq_wave\". The first is "
+              "simply the average of \"dockq_scores\", the latter is a "
+              "weighted average with weights derived from number of contacts "
+              "in the reference interfaces. These two scores only consider "
+              "interfaces that are present in both, the model and the "
+              "reference. \"dockq_ave_full\" and \"dockq_wave_full\" add zeros "
+              "in the average computation for each interface that is only "
+              "present in the reference but not in the model."))
+
+    parser.add_argument(
+        "--contact-scores",
+        dest="contact_scores",
+        default=False,
+        action="store_true",
+        help=("Computes interface contact based scores. A contact between two "
+              "residues of different chains is defined as having at least one "
+              "heavy atom within 5A. Contacts in reference structure are "
+              "available as key \"reference_contacts\". Each contact specifies "
+              "the interacting residues in format "
+              "\"<cname>.<rnum>.<ins_code>\". Model contacts are available as "
+              "key \"model_contacts\". The precision which is available as key "
+              "\"contact_precision\" reports the fraction of model contacts "
+              "that are also present in the reference. The recall which is "
+              "available as key \"contact_recall\" reports the fraction of "
+              "reference contacts that are correctly reproduced in the model. "
+              "The ICS score (Interface Contact Similarity) available as key "
+              "\"ics\" combines precision and recall using the F1-measure."))

    parser.add_argument(
        "--rigid-scores",
@@ -319,36 +374,6 @@ def _ParseArgs():
              "these positions and transformation, \"transform\": the used 4x4 "
              "transformation matrix that superposes model onto reference."))

-    parser.add_argument(
-        "--interface-scores",
-        dest="interface_scores",
-        default=False,
-        action="store_true",
-        help=("Per interface scores for each interface that has at least one "
-              "contact in the reference, i.e. at least one pair of heavy atoms "
-              "within 5A. The respective interfaces are available from key "
-              "\"interfaces\" which is a list of tuples in form (ref_ch1, "
-              "ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available "
-              "as lists referring to these interfaces and have the following "
-              "keys: \"nnat\" (number of contacts in reference), \"nmdl\" "
-              "(number of contacts in model), \"fnat\" (fraction of reference "
-              "contacts which are also there in model), \"fnonnat\" (fraction "
-              "of model contacts which are not there in target), \"irmsd\" "
-              "(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" "
-              "(per-interface score computed from \"fnat\", \"irmsd\" and "
-              "\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" "
-              "(per-interface versions of the two QS-score variants). The "
-              "DockQ score is strictly designed to score each interface "
-              "individually. We also provide two averaged versions to get one "
-              "full model score: \"dockq_ave\", \"dockq_wave\". The first is "
-              "simply the average of \"dockq_scores\", the latter is a "
-              "weighted average with weights derived from \"nnat\". These two "
-              "scores only consider interfaces that are present in both, the "
-              "model and the reference. \"dockq_ave_full\" and "
-              "\"dockq_wave_full\" add zeros in the average computation for "
-              "each interface that is only present in the reference but not in "
-              "the model."))
-
    parser.add_argument(
        "--patch-scores",
        dest="patch_scores",
@@ -596,30 +621,42 @@ def _Process(model, reference, args):
    if args.qs_score:
        out["qs_global"] = scorer.qs_global
        out["qs_best"] = scorer.qs_best
-
-    if args.rigid_scores:
-        out["oligo_gdtts"] = scorer.gdtts
-        out["oligo_gdtha"] = scorer.gdtha
-        out["rmsd"] = scorer.rmsd
-        data = scorer.transform.data
-        out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
-
-    if args.interface_scores:
-        out["interfaces"] = scorer.interfaces
-        out["nnat"] = scorer.native_contacts
-        out["nmdl"] = scorer.model_contacts
+        out["qs_reference_interfaces"] = scorer.qs_target_interfaces
+        out["qs_model_interfaces"] = scorer.qs_model_interfaces
+        out["qs_interfaces"] = scorer.qs_interfaces
+        out["per_interface_qs_global"] = scorer.per_interface_qs_global
+        out["per_interface_qs_best"] = scorer.per_interface_qs_best
+
+    if args.contact_scores:
+        out["reference_contacts"] = scorer.native_contacts
+        out["model_contacts"] = scorer.model_contacts
+        out["contact_precision"] = scorer.contact_precision
+        out["contact_recall"] = scorer.contact_recall
+        out["ics"] = scorer.ics
+
+    if args.dockq:
+        out["dockq_reference_interfaces"] = scorer.dockq_target_interfaces
+        out["dockq_interfaces"] = scorer.dockq_interfaces 
+        out["dockq"] = scorer.dockq_scores
        out["fnat"] = scorer.fnat
-        out["fnonnat"] = scorer.fnonnat
        out["irmsd"] = scorer.irmsd
        out["lrmsd"] = scorer.lrmsd
-        out["dockq_scores"] = scorer.dockq_scores
-        out["interface_qs_global"] = scorer.interface_qs_global
-        out["interface_qs_best"] = scorer.interface_qs_best
        out["dockq_ave"] = scorer.dockq_ave
        out["dockq_wave"] = scorer.dockq_wave
        out["dockq_ave_full"] = scorer.dockq_ave_full
        out["dockq_wave_full"] = scorer.dockq_wave_full

+    if args.contact_scores:
+        out["reference_contacts"] = scorer.native_contacts
+        out["model_contacts"] = scorer.model_contacts
+
+    if args.rigid_scores:
+        out["oligo_gdtts"] = scorer.gdtts
+        out["oligo_gdtha"] = scorer.gdtha
+        out["rmsd"] = scorer.rmsd
+        data = scorer.transform.data
+        out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
+
    if args.patch_scores:
        out["model_interface_residues"] = \
        _InterfaceResiduesToJSONList(scorer.model_interface_residues)

--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -41,16 +41,17 @@ Details on the usage (output of ``ost compare-structures --help``):
                                [--cad-score] [--local-cad-score]
                                [--cad-exec CAD_EXEC]
                                [--usalign-exec USALIGN_EXEC] [--qs-score]
-                                [--rigid-scores] [--interface-scores]
+                                [--dockq] [--contact-scores] [--rigid-scores]
                                [--patch-scores] [--tm-score]
                                [--lddt-no-stereochecks]
-  
+                                [--n-max-naive N_MAX_NAIVE]
+
  Evaluate model against reference 
-  
+
  Example: ost compare-structures -m model.pdb -r reference.cif
-  
+
  Loads the structures and performs basic cleanup:
-  
+
   * Assign elements according to the PDB Chemical Component Dictionary
   * Map nonstandard residues to their parent residues as defined by the PDB
     Chemical Component Dictionary, e.g. phospho-serine => serine
@@ -59,12 +60,12 @@ Details on the usage (output of ``ost compare-structures --help``):
   * Remove unknown atoms, i.e. atoms that are not expected according to the PDB
     Chemical Component Dictionary
   * Select for peptide/nucleotide residues
-  
+
  The cleaned structures are optionally dumped using -d/--dump-structures
-  
+
  Output is written in JSON format (default: out.json). In case of no additional
  options, this is a dictionary with 8 keys describing model/reference comparison:
-  
+
   * "reference_chains": Chain names of reference
   * "model_chains": Chain names of model
   * "chem_groups": Groups of polypeptides/polynucleotides from reference that
@@ -89,10 +90,10 @@ Details on the usage (output of ``ost compare-structures --help``):
   * "status": SUCCESS if everything ran through. In case of failure, the only
     content of the JSON output will be "status" set to FAILURE and an
     additional key: "traceback".
-  
+
  The following additional keys store relevant input parameters to reproduce
  results:
-  
+
   * "model"
   * "reference"
   * "fault_tolerant"
@@ -103,22 +104,22 @@ Details on the usage (output of ``ost compare-structures --help``):
   * "cad_exec"
   * "usalign_exec"
   * "lddt_no_stereochecks"
-  
+
  The pairwise sequence alignments are computed with Needleman-Wunsch using
  BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
  structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
  enabling -rna/--residue-number-alignment is recommended.
-  
+
  Each score is opt-in and can be enabled with optional arguments.
-  
+
  Example to compute global and per-residue lDDT values as well as QS-score:
-  
+
  ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score
-  
+
  Example to inject custom chain mapping
-  
+
  ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
-  
+
  optional arguments:
    -h, --help            show this help message and exit
    -m MODEL, --model MODEL
@@ -218,7 +219,58 @@ Details on the usage (output of ``ost compare-structures --help``):
                          given, an OpenStructure internal copy of USalign code
                          is used.
    --qs-score            Compute QS-score, stored as key "qs_global", and the
-                          QS-best variant, stored as key "qs_best".
+                          QS-best variant, stored as key "qs_best". Interfaces
+                          in the reference with non-zero contribution to QS-
+                          score are available as key "qs_reference_interfaces",
+                          the ones from the model as key "qs_model_interfaces".
+                          "qs_interfaces" is a subset of
+                          "qs_reference_interfaces" that contains interfaces
+                          that can be mapped to the model. They are stored as
+                          lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
+                          The respective per-interface scores for
+                          "qs_interfaces" are available as keys
+                          "per_interface_qs_global" and "per_interface_qs_best"
+    --dockq               Compute DockQ scores and its components. Relevant
+                          interfaces with at least one contact (any atom within
+                          5A) of the reference structure are available as key
+                          "dockq_reference_interfaces". Only interfaces between
+                          peptide chains are considered here! Key
+                          "dockq_interfaces" is a subset of
+                          "dockq_reference_interfaces" that contains interfaces
+                          that can be mapped to the model. They are stored as
+                          lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
+                          The respective DockQ scores for "dockq_interfaces" are
+                          available as key "dockq". It's components are
+                          available as keys: "fnat" (fraction of reference
+                          contacts which are also there in model) "irmsd"
+                          (interface RMSD), "lrmsd" (ligand RMSD). The DockQ
+                          score is strictly designed to score each interface
+                          individually. We also provide two averaged versions to
+                          get one full model score: "dockq_ave", "dockq_wave".
+                          The first is simply the average of "dockq_scores", the
+                          latter is a weighted average with weights derived from
+                          number of contacts in the reference interfaces. These
+                          two scores only consider interfaces that are present
+                          in both, the model and the reference. "dockq_ave_full"
+                          and "dockq_wave_full" add zeros in the average
+                          computation for each interface that is only present in
+                          the reference but not in the model.
+    --contact-scores      Computes interface contact based scores. A contact
+                          between two residues of different chains is defined as
+                          having at least one heavy atom within 5A. Contacts in
+                          reference structure are available as key
+                          "reference_contacts". Each contact specifies the
+                          interacting residues in format
+                          "<cname>.<rnum>.<ins_code>". Model contacts are
+                          available as key "model_contacts". The precision which
+                          is available as key "contact_precision" reports the
+                          fraction of model contacts that are also present in
+                          the reference. The recall which is available as key
+                          "contact_recall" reports the fraction of reference
+                          contacts that are correctly reproduced in the model.
+                          The ICS score (Interface Contact Similarity) available
+                          as key "ics" combines precision and recall using the
+                          F1-measure.
    --rigid-scores        Computes rigid superposition based scores. They're
                          based on a Kabsch superposition of all mapped CA
                          positions (C3' for nucleotides). Makes the following
@@ -229,33 +281,6 @@ Details on the usage (output of ``ost compare-structures --help``):
                          these positions and transformation, "transform": the
                          used 4x4 transformation matrix that superposes model
                          onto reference.
-    --interface-scores    Per interface scores for each interface that has at
-                          least one contact in the reference, i.e. at least one
-                          pair of heavy atoms within 5A. The respective
-                          interfaces are available from key "interfaces" which
-                          is a list of tuples in form (ref_ch1, ref_ch2,
-                          mdl_ch1, mdl_ch2). Per-interface scores are available
-                          as lists referring to these interfaces and have the
-                          following keys: "nnat" (number of contacts in
-                          reference), "nmdl" (number of contacts in model),
-                          "fnat" (fraction of reference contacts which are also
-                          there in model), "fnonnat" (fraction of model contacts
-                          which are not there in target), "irmsd" (interface
-                          RMSD), "lrmsd" (ligand RMSD), "dockq_scores" (per-
-                          interface score computed from "fnat", "irmsd" and
-                          "lrmsd"), "interface_qs_global" and
-                          "interface_qs_best" (per-interface versions of the two
-                          QS-score variants). The DockQ score is strictly
-                          designed to score each interface individually. We also
-                          provide two averaged versions to get one full model
-                          score: "dockq_ave", "dockq_wave". The first is simply
-                          the average of "dockq_scores", the latter is a
-                          weighted average with weights derived from "nnat".
-                          These two scores only consider interfaces that are
-                          present in both, the model and the reference.
-                          "dockq_ave_full" and "dockq_wave_full" add zeros in
-                          the average computation for each interface that is
-                          only present in the reference but not in the model.
    --patch-scores        Local interface quality score used in CASP15. Scores
                          each model residue that is considered in the interface
                          (CB pos within 8A of any CB pos from another chain (CA
@@ -277,6 +302,12 @@ Details on the usage (output of ``ost compare-structures --help``):
                          "usalign_mapping"
    --lddt-no-stereochecks
                          Disable stereochecks for lDDT computation
+    --n-max-naive N_MAX_NAIVE
+                          If number of chains in model and reference are below
+                          or equal that number, the chain mapping will naively
+                          enumerate all possible mappings. A heuristic is used
+                          otherwise.
+




--- a/modules/mol/alg/pymod/contact_score.py
+++ b/modules/mol/alg/pymod/contact_score.py
@@ -54,6 +54,7 @@ class ContactEntity:
        self._interacting_chains = None
        self._sequence = dict()
        self._contacts = None
+        self._hr_contacts = None

    @property
    def view(self):
@@ -122,6 +123,18 @@ class ContactEntity:
        if self._contacts is None:
            self._SetupContacts()
        return self._contacts
+
+    @property
+    def hr_contacts(self):
+        """ Human readable interchain contacts
+
+        Human readable version of :attr:`~contacts`. Simple list with tuples
+        containing two strings specifying the residues in contact. Format:
+        <cname>.<rnum>.<ins_code>
+        """
+        if self._hr_contacts is None:
+            self._SetupContacts()
+        return self._hr_contacts
    
    def GetChain(self, chain_name):
        """ Get chain by name
@@ -152,6 +165,7 @@ class ContactEntity:
        # this function is incredibly inefficient... if performance is an issue,
        # go ahead and optimize
        self._contacts = dict()
+        self._hr_contacts = list()

        # set indices relative to full view 
        for ch in self.view.chains:
@@ -178,6 +192,12 @@ class ContactEntity:
                                    self._contacts[cname_key] = set()
                                self._contacts[cname_key].add((r1.GetIntProp("contact_idx"),
                                                               r2.GetIntProp("contact_idx")))
+                                rnum1 = r1.GetNumber()
+                                hr1 = f"{cname}.{rnum1.num}.{rnum1.ins_code}"
+                                rnum2 = r2.GetNumber()
+                                hr2 = f"{cname2}.{rnum2.num}.{rnum2.ins_code}"
+                                self._hr_contacts.append((hr1.strip("\u0000"),
+                                                          hr2.strip("\u0000")))

 class ContactScorerResult:
    """

--- a/modules/mol/alg/pymod/scoring.py
+++ b/modules/mol/alg/pymod/scoring.py