Skip to content
Snippets Groups Projects
Commit 38989614 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

refactor: mol.alg.scoring / compare-structures

enable contact scores and separate dockq and per-interface qs scores
parent 4175597e
Branches
Tags
No related merge requests found
......@@ -303,7 +303,62 @@ def _ParseArgs():
default=False,
action="store_true",
help=("Compute QS-score, stored as key \"qs_global\", and the QS-best "
"variant, stored as key \"qs_best\"."))
"variant, stored as key \"qs_best\". Interfaces in the reference "
"with non-zero contribution to QS-score are available as key "
"\"qs_reference_interfaces\", the ones from the model as key "
"\"qs_model_interfaces\". \"qs_interfaces\" is a subset of "
"\"qs_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"per-interface scores for \"qs_interfaces\" are available as "
"keys \"per_interface_qs_global\" and \"per_interface_qs_best\""))
parser.add_argument(
"--dockq",
dest="dockq",
default=False,
action="store_true",
help=("Compute DockQ scores and its components. Relevant interfaces "
"with at least one contact (any atom within 5A) of the reference "
"structure are available as key \"dockq_reference_interfaces\". "
"Only interfaces between peptide chains are considered here! "
"Key \"dockq_interfaces\" is a subset of "
"\"dockq_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"DockQ scores for \"dockq_interfaces\" are available as key "
"\"dockq\". It's components are available as keys: "
"\"fnat\" (fraction of reference contacts which are also there "
"in model) \"irmsd\" (interface RMSD), \"lrmsd\" (ligand RMSD). "
"The DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from number of contacts "
"in the reference interfaces. These two scores only consider "
"interfaces that are present in both, the model and the "
"reference. \"dockq_ave_full\" and \"dockq_wave_full\" add zeros "
"in the average computation for each interface that is only "
"present in the reference but not in the model."))
parser.add_argument(
"--contact-scores",
dest="contact_scores",
default=False,
action="store_true",
help=("Computes interface contact based scores. A contact between two "
"residues of different chains is defined as having at least one "
"heavy atom within 5A. Contacts in reference structure are "
"available as key \"reference_contacts\". Each contact specifies "
"the interacting residues in format "
"\"<cname>.<rnum>.<ins_code>\". Model contacts are available as "
"key \"model_contacts\". The precision which is available as key "
"\"contact_precision\" reports the fraction of model contacts "
"that are also present in the reference. The recall which is "
"available as key \"contact_recall\" reports the fraction of "
"reference contacts that are correctly reproduced in the model. "
"The ICS score (Interface Contact Similarity) available as key "
"\"ics\" combines precision and recall using the F1-measure."))
parser.add_argument(
"--rigid-scores",
......@@ -319,36 +374,6 @@ def _ParseArgs():
"these positions and transformation, \"transform\": the used 4x4 "
"transformation matrix that superposes model onto reference."))
parser.add_argument(
"--interface-scores",
dest="interface_scores",
default=False,
action="store_true",
help=("Per interface scores for each interface that has at least one "
"contact in the reference, i.e. at least one pair of heavy atoms "
"within 5A. The respective interfaces are available from key "
"\"interfaces\" which is a list of tuples in form (ref_ch1, "
"ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available "
"as lists referring to these interfaces and have the following "
"keys: \"nnat\" (number of contacts in reference), \"nmdl\" "
"(number of contacts in model), \"fnat\" (fraction of reference "
"contacts which are also there in model), \"fnonnat\" (fraction "
"of model contacts which are not there in target), \"irmsd\" "
"(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" "
"(per-interface score computed from \"fnat\", \"irmsd\" and "
"\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" "
"(per-interface versions of the two QS-score variants). The "
"DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from \"nnat\". These two "
"scores only consider interfaces that are present in both, the "
"model and the reference. \"dockq_ave_full\" and "
"\"dockq_wave_full\" add zeros in the average computation for "
"each interface that is only present in the reference but not in "
"the model."))
parser.add_argument(
"--patch-scores",
dest="patch_scores",
......@@ -596,30 +621,42 @@ def _Process(model, reference, args):
if args.qs_score:
out["qs_global"] = scorer.qs_global
out["qs_best"] = scorer.qs_best
if args.rigid_scores:
out["oligo_gdtts"] = scorer.gdtts
out["oligo_gdtha"] = scorer.gdtha
out["rmsd"] = scorer.rmsd
data = scorer.transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
if args.interface_scores:
out["interfaces"] = scorer.interfaces
out["nnat"] = scorer.native_contacts
out["nmdl"] = scorer.model_contacts
out["qs_reference_interfaces"] = scorer.qs_target_interfaces
out["qs_model_interfaces"] = scorer.qs_model_interfaces
out["qs_interfaces"] = scorer.qs_interfaces
out["per_interface_qs_global"] = scorer.per_interface_qs_global
out["per_interface_qs_best"] = scorer.per_interface_qs_best
if args.contact_scores:
out["reference_contacts"] = scorer.native_contacts
out["model_contacts"] = scorer.model_contacts
out["contact_precision"] = scorer.contact_precision
out["contact_recall"] = scorer.contact_recall
out["ics"] = scorer.ics
if args.dockq:
out["dockq_reference_interfaces"] = scorer.dockq_target_interfaces
out["dockq_interfaces"] = scorer.dockq_interfaces
out["dockq"] = scorer.dockq_scores
out["fnat"] = scorer.fnat
out["fnonnat"] = scorer.fnonnat
out["irmsd"] = scorer.irmsd
out["lrmsd"] = scorer.lrmsd
out["dockq_scores"] = scorer.dockq_scores
out["interface_qs_global"] = scorer.interface_qs_global
out["interface_qs_best"] = scorer.interface_qs_best
out["dockq_ave"] = scorer.dockq_ave
out["dockq_wave"] = scorer.dockq_wave
out["dockq_ave_full"] = scorer.dockq_ave_full
out["dockq_wave_full"] = scorer.dockq_wave_full
if args.contact_scores:
out["reference_contacts"] = scorer.native_contacts
out["model_contacts"] = scorer.model_contacts
if args.rigid_scores:
out["oligo_gdtts"] = scorer.gdtts
out["oligo_gdtha"] = scorer.gdtha
out["rmsd"] = scorer.rmsd
data = scorer.transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
if args.patch_scores:
out["model_interface_residues"] = \
_InterfaceResiduesToJSONList(scorer.model_interface_residues)
......
......@@ -41,16 +41,17 @@ Details on the usage (output of ``ost compare-structures --help``):
[--cad-score] [--local-cad-score]
[--cad-exec CAD_EXEC]
[--usalign-exec USALIGN_EXEC] [--qs-score]
[--rigid-scores] [--interface-scores]
[--dockq] [--contact-scores] [--rigid-scores]
[--patch-scores] [--tm-score]
[--lddt-no-stereochecks]
[--n-max-naive N_MAX_NAIVE]
Evaluate model against reference
Example: ost compare-structures -m model.pdb -r reference.cif
Loads the structures and performs basic cleanup:
* Assign elements according to the PDB Chemical Component Dictionary
* Map nonstandard residues to their parent residues as defined by the PDB
Chemical Component Dictionary, e.g. phospho-serine => serine
......@@ -59,12 +60,12 @@ Details on the usage (output of ``ost compare-structures --help``):
* Remove unknown atoms, i.e. atoms that are not expected according to the PDB
Chemical Component Dictionary
* Select for peptide/nucleotide residues
The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with 8 keys describing model/reference comparison:
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
......@@ -89,10 +90,10 @@ Details on the usage (output of ``ost compare-structures --help``):
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
......@@ -103,22 +104,22 @@ Details on the usage (output of ``ost compare-structures --help``):
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
enabling -rna/--residue-number-alignment is recommended.
Each score is opt-in and can be enabled with optional arguments.
Example to compute global and per-residue lDDT values as well as QS-score:
ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score
Example to inject custom chain mapping
ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
optional arguments:
-h, --help show this help message and exit
-m MODEL, --model MODEL
......@@ -218,7 +219,58 @@ Details on the usage (output of ``ost compare-structures --help``):
given, an OpenStructure internal copy of USalign code
is used.
--qs-score Compute QS-score, stored as key "qs_global", and the
QS-best variant, stored as key "qs_best".
QS-best variant, stored as key "qs_best". Interfaces
in the reference with non-zero contribution to QS-
score are available as key "qs_reference_interfaces",
the ones from the model as key "qs_model_interfaces".
"qs_interfaces" is a subset of
"qs_reference_interfaces" that contains interfaces
that can be mapped to the model. They are stored as
lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
The respective per-interface scores for
"qs_interfaces" are available as keys
"per_interface_qs_global" and "per_interface_qs_best"
--dockq Compute DockQ scores and its components. Relevant
interfaces with at least one contact (any atom within
5A) of the reference structure are available as key
"dockq_reference_interfaces". Only interfaces between
peptide chains are considered here! Key
"dockq_interfaces" is a subset of
"dockq_reference_interfaces" that contains interfaces
that can be mapped to the model. They are stored as
lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
The respective DockQ scores for "dockq_interfaces" are
available as key "dockq". It's components are
available as keys: "fnat" (fraction of reference
contacts which are also there in model) "irmsd"
(interface RMSD), "lrmsd" (ligand RMSD). The DockQ
score is strictly designed to score each interface
individually. We also provide two averaged versions to
get one full model score: "dockq_ave", "dockq_wave".
The first is simply the average of "dockq_scores", the
latter is a weighted average with weights derived from
number of contacts in the reference interfaces. These
two scores only consider interfaces that are present
in both, the model and the reference. "dockq_ave_full"
and "dockq_wave_full" add zeros in the average
computation for each interface that is only present in
the reference but not in the model.
--contact-scores Computes interface contact based scores. A contact
between two residues of different chains is defined as
having at least one heavy atom within 5A. Contacts in
reference structure are available as key
"reference_contacts". Each contact specifies the
interacting residues in format
"<cname>.<rnum>.<ins_code>". Model contacts are
available as key "model_contacts". The precision which
is available as key "contact_precision" reports the
fraction of model contacts that are also present in
the reference. The recall which is available as key
"contact_recall" reports the fraction of reference
contacts that are correctly reproduced in the model.
The ICS score (Interface Contact Similarity) available
as key "ics" combines precision and recall using the
F1-measure.
--rigid-scores Computes rigid superposition based scores. They're
based on a Kabsch superposition of all mapped CA
positions (C3' for nucleotides). Makes the following
......@@ -229,33 +281,6 @@ Details on the usage (output of ``ost compare-structures --help``):
these positions and transformation, "transform": the
used 4x4 transformation matrix that superposes model
onto reference.
--interface-scores Per interface scores for each interface that has at
least one contact in the reference, i.e. at least one
pair of heavy atoms within 5A. The respective
interfaces are available from key "interfaces" which
is a list of tuples in form (ref_ch1, ref_ch2,
mdl_ch1, mdl_ch2). Per-interface scores are available
as lists referring to these interfaces and have the
following keys: "nnat" (number of contacts in
reference), "nmdl" (number of contacts in model),
"fnat" (fraction of reference contacts which are also
there in model), "fnonnat" (fraction of model contacts
which are not there in target), "irmsd" (interface
RMSD), "lrmsd" (ligand RMSD), "dockq_scores" (per-
interface score computed from "fnat", "irmsd" and
"lrmsd"), "interface_qs_global" and
"interface_qs_best" (per-interface versions of the two
QS-score variants). The DockQ score is strictly
designed to score each interface individually. We also
provide two averaged versions to get one full model
score: "dockq_ave", "dockq_wave". The first is simply
the average of "dockq_scores", the latter is a
weighted average with weights derived from "nnat".
These two scores only consider interfaces that are
present in both, the model and the reference.
"dockq_ave_full" and "dockq_wave_full" add zeros in
the average computation for each interface that is
only present in the reference but not in the model.
--patch-scores Local interface quality score used in CASP15. Scores
each model residue that is considered in the interface
(CB pos within 8A of any CB pos from another chain (CA
......@@ -277,6 +302,12 @@ Details on the usage (output of ``ost compare-structures --help``):
"usalign_mapping"
--lddt-no-stereochecks
Disable stereochecks for lDDT computation
--n-max-naive N_MAX_NAIVE
If number of chains in model and reference are below
or equal that number, the chain mapping will naively
enumerate all possible mappings. A heuristic is used
otherwise.
......
......@@ -54,6 +54,7 @@ class ContactEntity:
self._interacting_chains = None
self._sequence = dict()
self._contacts = None
self._hr_contacts = None
@property
def view(self):
......@@ -122,6 +123,18 @@ class ContactEntity:
if self._contacts is None:
self._SetupContacts()
return self._contacts
@property
def hr_contacts(self):
""" Human readable interchain contacts
Human readable version of :attr:`~contacts`. Simple list with tuples
containing two strings specifying the residues in contact. Format:
<cname>.<rnum>.<ins_code>
"""
if self._hr_contacts is None:
self._SetupContacts()
return self._hr_contacts
def GetChain(self, chain_name):
""" Get chain by name
......@@ -152,6 +165,7 @@ class ContactEntity:
# this function is incredibly inefficient... if performance is an issue,
# go ahead and optimize
self._contacts = dict()
self._hr_contacts = list()
# set indices relative to full view
for ch in self.view.chains:
......@@ -178,6 +192,12 @@ class ContactEntity:
self._contacts[cname_key] = set()
self._contacts[cname_key].add((r1.GetIntProp("contact_idx"),
r2.GetIntProp("contact_idx")))
rnum1 = r1.GetNumber()
hr1 = f"{cname}.{rnum1.num}.{rnum1.ins_code}"
rnum2 = r2.GetNumber()
hr2 = f"{cname2}.{rnum2.num}.{rnum2.ins_code}"
self._hr_contacts.append((hr1.strip("\u0000"),
hr2.strip("\u0000")))
class ContactScorerResult:
"""
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment