Skip to content
Snippets Groups Projects
Commit 38989614 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

refactor: mol.alg.scoring / compare-structures

enable contact scores and separate dockq and per-interface qs scores
parent 4175597e
Branches
No related tags found
No related merge requests found
...@@ -303,7 +303,62 @@ def _ParseArgs(): ...@@ -303,7 +303,62 @@ def _ParseArgs():
default=False, default=False,
action="store_true", action="store_true",
help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " help=("Compute QS-score, stored as key \"qs_global\", and the QS-best "
"variant, stored as key \"qs_best\".")) "variant, stored as key \"qs_best\". Interfaces in the reference "
"with non-zero contribution to QS-score are available as key "
"\"qs_reference_interfaces\", the ones from the model as key "
"\"qs_model_interfaces\". \"qs_interfaces\" is a subset of "
"\"qs_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"per-interface scores for \"qs_interfaces\" are available as "
"keys \"per_interface_qs_global\" and \"per_interface_qs_best\""))
parser.add_argument(
"--dockq",
dest="dockq",
default=False,
action="store_true",
help=("Compute DockQ scores and its components. Relevant interfaces "
"with at least one contact (any atom within 5A) of the reference "
"structure are available as key \"dockq_reference_interfaces\". "
"Only interfaces between peptide chains are considered here! "
"Key \"dockq_interfaces\" is a subset of "
"\"dockq_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"DockQ scores for \"dockq_interfaces\" are available as key "
"\"dockq\". It's components are available as keys: "
"\"fnat\" (fraction of reference contacts which are also there "
"in model) \"irmsd\" (interface RMSD), \"lrmsd\" (ligand RMSD). "
"The DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from number of contacts "
"in the reference interfaces. These two scores only consider "
"interfaces that are present in both, the model and the "
"reference. \"dockq_ave_full\" and \"dockq_wave_full\" add zeros "
"in the average computation for each interface that is only "
"present in the reference but not in the model."))
parser.add_argument(
"--contact-scores",
dest="contact_scores",
default=False,
action="store_true",
help=("Computes interface contact based scores. A contact between two "
"residues of different chains is defined as having at least one "
"heavy atom within 5A. Contacts in reference structure are "
"available as key \"reference_contacts\". Each contact specifies "
"the interacting residues in format "
"\"<cname>.<rnum>.<ins_code>\". Model contacts are available as "
"key \"model_contacts\". The precision which is available as key "
"\"contact_precision\" reports the fraction of model contacts "
"that are also present in the reference. The recall which is "
"available as key \"contact_recall\" reports the fraction of "
"reference contacts that are correctly reproduced in the model. "
"The ICS score (Interface Contact Similarity) available as key "
"\"ics\" combines precision and recall using the F1-measure."))
parser.add_argument( parser.add_argument(
"--rigid-scores", "--rigid-scores",
...@@ -319,36 +374,6 @@ def _ParseArgs(): ...@@ -319,36 +374,6 @@ def _ParseArgs():
"these positions and transformation, \"transform\": the used 4x4 " "these positions and transformation, \"transform\": the used 4x4 "
"transformation matrix that superposes model onto reference.")) "transformation matrix that superposes model onto reference."))
parser.add_argument(
"--interface-scores",
dest="interface_scores",
default=False,
action="store_true",
help=("Per interface scores for each interface that has at least one "
"contact in the reference, i.e. at least one pair of heavy atoms "
"within 5A. The respective interfaces are available from key "
"\"interfaces\" which is a list of tuples in form (ref_ch1, "
"ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available "
"as lists referring to these interfaces and have the following "
"keys: \"nnat\" (number of contacts in reference), \"nmdl\" "
"(number of contacts in model), \"fnat\" (fraction of reference "
"contacts which are also there in model), \"fnonnat\" (fraction "
"of model contacts which are not there in target), \"irmsd\" "
"(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" "
"(per-interface score computed from \"fnat\", \"irmsd\" and "
"\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" "
"(per-interface versions of the two QS-score variants). The "
"DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from \"nnat\". These two "
"scores only consider interfaces that are present in both, the "
"model and the reference. \"dockq_ave_full\" and "
"\"dockq_wave_full\" add zeros in the average computation for "
"each interface that is only present in the reference but not in "
"the model."))
parser.add_argument( parser.add_argument(
"--patch-scores", "--patch-scores",
dest="patch_scores", dest="patch_scores",
...@@ -596,30 +621,42 @@ def _Process(model, reference, args): ...@@ -596,30 +621,42 @@ def _Process(model, reference, args):
if args.qs_score: if args.qs_score:
out["qs_global"] = scorer.qs_global out["qs_global"] = scorer.qs_global
out["qs_best"] = scorer.qs_best out["qs_best"] = scorer.qs_best
out["qs_reference_interfaces"] = scorer.qs_target_interfaces
if args.rigid_scores: out["qs_model_interfaces"] = scorer.qs_model_interfaces
out["oligo_gdtts"] = scorer.gdtts out["qs_interfaces"] = scorer.qs_interfaces
out["oligo_gdtha"] = scorer.gdtha out["per_interface_qs_global"] = scorer.per_interface_qs_global
out["rmsd"] = scorer.rmsd out["per_interface_qs_best"] = scorer.per_interface_qs_best
data = scorer.transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] if args.contact_scores:
out["reference_contacts"] = scorer.native_contacts
if args.interface_scores: out["model_contacts"] = scorer.model_contacts
out["interfaces"] = scorer.interfaces out["contact_precision"] = scorer.contact_precision
out["nnat"] = scorer.native_contacts out["contact_recall"] = scorer.contact_recall
out["nmdl"] = scorer.model_contacts out["ics"] = scorer.ics
if args.dockq:
out["dockq_reference_interfaces"] = scorer.dockq_target_interfaces
out["dockq_interfaces"] = scorer.dockq_interfaces
out["dockq"] = scorer.dockq_scores
out["fnat"] = scorer.fnat out["fnat"] = scorer.fnat
out["fnonnat"] = scorer.fnonnat
out["irmsd"] = scorer.irmsd out["irmsd"] = scorer.irmsd
out["lrmsd"] = scorer.lrmsd out["lrmsd"] = scorer.lrmsd
out["dockq_scores"] = scorer.dockq_scores
out["interface_qs_global"] = scorer.interface_qs_global
out["interface_qs_best"] = scorer.interface_qs_best
out["dockq_ave"] = scorer.dockq_ave out["dockq_ave"] = scorer.dockq_ave
out["dockq_wave"] = scorer.dockq_wave out["dockq_wave"] = scorer.dockq_wave
out["dockq_ave_full"] = scorer.dockq_ave_full out["dockq_ave_full"] = scorer.dockq_ave_full
out["dockq_wave_full"] = scorer.dockq_wave_full out["dockq_wave_full"] = scorer.dockq_wave_full
if args.contact_scores:
out["reference_contacts"] = scorer.native_contacts
out["model_contacts"] = scorer.model_contacts
if args.rigid_scores:
out["oligo_gdtts"] = scorer.gdtts
out["oligo_gdtha"] = scorer.gdtha
out["rmsd"] = scorer.rmsd
data = scorer.transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
if args.patch_scores: if args.patch_scores:
out["model_interface_residues"] = \ out["model_interface_residues"] = \
_InterfaceResiduesToJSONList(scorer.model_interface_residues) _InterfaceResiduesToJSONList(scorer.model_interface_residues)
......
...@@ -41,16 +41,17 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -41,16 +41,17 @@ Details on the usage (output of ``ost compare-structures --help``):
[--cad-score] [--local-cad-score] [--cad-score] [--local-cad-score]
[--cad-exec CAD_EXEC] [--cad-exec CAD_EXEC]
[--usalign-exec USALIGN_EXEC] [--qs-score] [--usalign-exec USALIGN_EXEC] [--qs-score]
[--rigid-scores] [--interface-scores] [--dockq] [--contact-scores] [--rigid-scores]
[--patch-scores] [--tm-score] [--patch-scores] [--tm-score]
[--lddt-no-stereochecks] [--lddt-no-stereochecks]
[--n-max-naive N_MAX_NAIVE]
Evaluate model against reference Evaluate model against reference
Example: ost compare-structures -m model.pdb -r reference.cif Example: ost compare-structures -m model.pdb -r reference.cif
Loads the structures and performs basic cleanup: Loads the structures and performs basic cleanup:
* Assign elements according to the PDB Chemical Component Dictionary * Assign elements according to the PDB Chemical Component Dictionary
* Map nonstandard residues to their parent residues as defined by the PDB * Map nonstandard residues to their parent residues as defined by the PDB
Chemical Component Dictionary, e.g. phospho-serine => serine Chemical Component Dictionary, e.g. phospho-serine => serine
...@@ -59,12 +60,12 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -59,12 +60,12 @@ Details on the usage (output of ``ost compare-structures --help``):
* Remove unknown atoms, i.e. atoms that are not expected according to the PDB * Remove unknown atoms, i.e. atoms that are not expected according to the PDB
Chemical Component Dictionary Chemical Component Dictionary
* Select for peptide/nucleotide residues * Select for peptide/nucleotide residues
The cleaned structures are optionally dumped using -d/--dump-structures The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with 8 keys describing model/reference comparison: options, this is a dictionary with 8 keys describing model/reference comparison:
* "reference_chains": Chain names of reference * "reference_chains": Chain names of reference
* "model_chains": Chain names of model * "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that * "chem_groups": Groups of polypeptides/polynucleotides from reference that
...@@ -89,10 +90,10 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -89,10 +90,10 @@ Details on the usage (output of ``ost compare-structures --help``):
* "status": SUCCESS if everything ran through. In case of failure, the only * "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback". additional key: "traceback".
The following additional keys store relevant input parameters to reproduce The following additional keys store relevant input parameters to reproduce
results: results:
* "model" * "model"
* "reference" * "reference"
* "fault_tolerant" * "fault_tolerant"
...@@ -103,22 +104,22 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -103,22 +104,22 @@ Details on the usage (output of ``ost compare-structures --help``):
* "cad_exec" * "cad_exec"
* "usalign_exec" * "usalign_exec"
* "lddt_no_stereochecks" * "lddt_no_stereochecks"
The pairwise sequence alignments are computed with Needleman-Wunsch using The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases, structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
enabling -rna/--residue-number-alignment is recommended. enabling -rna/--residue-number-alignment is recommended.
Each score is opt-in and can be enabled with optional arguments. Each score is opt-in and can be enabled with optional arguments.
Example to compute global and per-residue lDDT values as well as QS-score: Example to compute global and per-residue lDDT values as well as QS-score:
ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score
Example to inject custom chain mapping Example to inject custom chain mapping
ost compare-structures -m model.pdb -r reference.cif -c A:B B:A ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-m MODEL, --model MODEL -m MODEL, --model MODEL
...@@ -218,7 +219,58 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -218,7 +219,58 @@ Details on the usage (output of ``ost compare-structures --help``):
given, an OpenStructure internal copy of USalign code given, an OpenStructure internal copy of USalign code
is used. is used.
--qs-score Compute QS-score, stored as key "qs_global", and the --qs-score Compute QS-score, stored as key "qs_global", and the
QS-best variant, stored as key "qs_best". QS-best variant, stored as key "qs_best". Interfaces
in the reference with non-zero contribution to QS-
score are available as key "qs_reference_interfaces",
the ones from the model as key "qs_model_interfaces".
"qs_interfaces" is a subset of
"qs_reference_interfaces" that contains interfaces
that can be mapped to the model. They are stored as
lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
The respective per-interface scores for
"qs_interfaces" are available as keys
"per_interface_qs_global" and "per_interface_qs_best"
--dockq Compute DockQ scores and its components. Relevant
interfaces with at least one contact (any atom within
5A) of the reference structure are available as key
"dockq_reference_interfaces". Only interfaces between
peptide chains are considered here! Key
"dockq_interfaces" is a subset of
"dockq_reference_interfaces" that contains interfaces
that can be mapped to the model. They are stored as
lists in format [ref_ch1, ref_ch2, mdl_ch1, mdl_ch2].
The respective DockQ scores for "dockq_interfaces" are
available as key "dockq". It's components are
available as keys: "fnat" (fraction of reference
contacts which are also there in model) "irmsd"
(interface RMSD), "lrmsd" (ligand RMSD). The DockQ
score is strictly designed to score each interface
individually. We also provide two averaged versions to
get one full model score: "dockq_ave", "dockq_wave".
The first is simply the average of "dockq_scores", the
latter is a weighted average with weights derived from
number of contacts in the reference interfaces. These
two scores only consider interfaces that are present
in both, the model and the reference. "dockq_ave_full"
and "dockq_wave_full" add zeros in the average
computation for each interface that is only present in
the reference but not in the model.
--contact-scores Computes interface contact based scores. A contact
between two residues of different chains is defined as
having at least one heavy atom within 5A. Contacts in
reference structure are available as key
"reference_contacts". Each contact specifies the
interacting residues in format
"<cname>.<rnum>.<ins_code>". Model contacts are
available as key "model_contacts". The precision which
is available as key "contact_precision" reports the
fraction of model contacts that are also present in
the reference. The recall which is available as key
"contact_recall" reports the fraction of reference
contacts that are correctly reproduced in the model.
The ICS score (Interface Contact Similarity) available
as key "ics" combines precision and recall using the
F1-measure.
--rigid-scores Computes rigid superposition based scores. They're --rigid-scores Computes rigid superposition based scores. They're
based on a Kabsch superposition of all mapped CA based on a Kabsch superposition of all mapped CA
positions (C3' for nucleotides). Makes the following positions (C3' for nucleotides). Makes the following
...@@ -229,33 +281,6 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -229,33 +281,6 @@ Details on the usage (output of ``ost compare-structures --help``):
these positions and transformation, "transform": the these positions and transformation, "transform": the
used 4x4 transformation matrix that superposes model used 4x4 transformation matrix that superposes model
onto reference. onto reference.
--interface-scores Per interface scores for each interface that has at
least one contact in the reference, i.e. at least one
pair of heavy atoms within 5A. The respective
interfaces are available from key "interfaces" which
is a list of tuples in form (ref_ch1, ref_ch2,
mdl_ch1, mdl_ch2). Per-interface scores are available
as lists referring to these interfaces and have the
following keys: "nnat" (number of contacts in
reference), "nmdl" (number of contacts in model),
"fnat" (fraction of reference contacts which are also
there in model), "fnonnat" (fraction of model contacts
which are not there in target), "irmsd" (interface
RMSD), "lrmsd" (ligand RMSD), "dockq_scores" (per-
interface score computed from "fnat", "irmsd" and
"lrmsd"), "interface_qs_global" and
"interface_qs_best" (per-interface versions of the two
QS-score variants). The DockQ score is strictly
designed to score each interface individually. We also
provide two averaged versions to get one full model
score: "dockq_ave", "dockq_wave". The first is simply
the average of "dockq_scores", the latter is a
weighted average with weights derived from "nnat".
These two scores only consider interfaces that are
present in both, the model and the reference.
"dockq_ave_full" and "dockq_wave_full" add zeros in
the average computation for each interface that is
only present in the reference but not in the model.
--patch-scores Local interface quality score used in CASP15. Scores --patch-scores Local interface quality score used in CASP15. Scores
each model residue that is considered in the interface each model residue that is considered in the interface
(CB pos within 8A of any CB pos from another chain (CA (CB pos within 8A of any CB pos from another chain (CA
...@@ -277,6 +302,12 @@ Details on the usage (output of ``ost compare-structures --help``): ...@@ -277,6 +302,12 @@ Details on the usage (output of ``ost compare-structures --help``):
"usalign_mapping" "usalign_mapping"
--lddt-no-stereochecks --lddt-no-stereochecks
Disable stereochecks for lDDT computation Disable stereochecks for lDDT computation
--n-max-naive N_MAX_NAIVE
If number of chains in model and reference are below
or equal that number, the chain mapping will naively
enumerate all possible mappings. A heuristic is used
otherwise.
......
...@@ -54,6 +54,7 @@ class ContactEntity: ...@@ -54,6 +54,7 @@ class ContactEntity:
self._interacting_chains = None self._interacting_chains = None
self._sequence = dict() self._sequence = dict()
self._contacts = None self._contacts = None
self._hr_contacts = None
@property @property
def view(self): def view(self):
...@@ -122,6 +123,18 @@ class ContactEntity: ...@@ -122,6 +123,18 @@ class ContactEntity:
if self._contacts is None: if self._contacts is None:
self._SetupContacts() self._SetupContacts()
return self._contacts return self._contacts
@property
def hr_contacts(self):
""" Human readable interchain contacts
Human readable version of :attr:`~contacts`. Simple list with tuples
containing two strings specifying the residues in contact. Format:
<cname>.<rnum>.<ins_code>
"""
if self._hr_contacts is None:
self._SetupContacts()
return self._hr_contacts
def GetChain(self, chain_name): def GetChain(self, chain_name):
""" Get chain by name """ Get chain by name
...@@ -152,6 +165,7 @@ class ContactEntity: ...@@ -152,6 +165,7 @@ class ContactEntity:
# this function is incredibly inefficient... if performance is an issue, # this function is incredibly inefficient... if performance is an issue,
# go ahead and optimize # go ahead and optimize
self._contacts = dict() self._contacts = dict()
self._hr_contacts = list()
# set indices relative to full view # set indices relative to full view
for ch in self.view.chains: for ch in self.view.chains:
...@@ -178,6 +192,12 @@ class ContactEntity: ...@@ -178,6 +192,12 @@ class ContactEntity:
self._contacts[cname_key] = set() self._contacts[cname_key] = set()
self._contacts[cname_key].add((r1.GetIntProp("contact_idx"), self._contacts[cname_key].add((r1.GetIntProp("contact_idx"),
r2.GetIntProp("contact_idx"))) r2.GetIntProp("contact_idx")))
rnum1 = r1.GetNumber()
hr1 = f"{cname}.{rnum1.num}.{rnum1.ins_code}"
rnum2 = r2.GetNumber()
hr2 = f"{cname2}.{rnum2.num}.{rnum2.ins_code}"
self._hr_contacts.append((hr1.strip("\u0000"),
hr2.strip("\u0000")))
class ContactScorerResult: class ContactScorerResult:
""" """
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment