Skip to content
Snippets Groups Projects
Commit a4e9d9a7 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

introduce uniquely identifiable residue/atom names

Mainly for reporting in the compare-structures action
residues are reported as strings in format:
<cname>.<rnum>.<ins_code>, atoms are reported as
<cname>.<rnum>.<ins_code>.<aname>. This changes behaviour of
compare-structures and ost.mol.alg.stereochemistry
(ToJSON functions of violation info objects for the latter).
parent 01791c49
No related branches found
No related tags found
No related merge requests found
......@@ -17,7 +17,7 @@ Loads the structures and performs basic cleanup:
The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with 8 keys:
options, this is a dictionary with 8 keys describing model/reference comparison:
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
......@@ -36,7 +36,7 @@ options, this is a dictionary with 8 keys:
format.
* "inconsistent_residues": List of strings that represent name mismatches of
aligned residues in form
<trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
<trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>.
Inconsistencies may lead to corrupt results but do not abort the program.
Program abortion in these cases can be enforced with
-ec/--enforce-consistency.
......@@ -44,6 +44,20 @@ options, this is a dictionary with 8 keys:
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
......@@ -216,13 +230,16 @@ def _ParseArgs():
default=False,
action="store_true",
help=("Compute per-residue lDDT scores with default parameterization "
"and store as key \"local_lddt\". Score for model residue with "
"number 42 in chain X can be extracted with: "
"data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion "
"code, lets say A, the last key becomes \"42A\" Stereochemical "
"irregularities affecting lDDT are reported as keys "
"\"model_clashes\", \"model_bad_bonds\", \"model_bad_angles\" "
"and the respective reference counterparts."))
"and store as key \"local_lddt\". Score for each residue is "
"accessible by key <chain_name>.<resnum>.<resnum_inscode>. "
"Residue with number 42 in chain X can be extracted with: "
"data[\"local_lddt\"][\"X.42.\"]. If there is an insertion "
"code, lets say A, the residue key becomes \"X.42.A\". "
"Stereochemical irregularities affecting lDDT are reported as "
"keys \"model_clashes\", \"model_bad_bonds\", "
"\"model_bad_angles\" and the respective reference "
"counterparts. Atoms specified in there follow the following "
"format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>"))
parser.add_argument(
"--bb-lddt",
......@@ -241,10 +258,8 @@ def _ParseArgs():
help=("Compute per-residue lDDT scores with default parameterization "
"and store as key \"bb_local_lddt\". lDDT in this case is only "
"computed on backbone atoms: CA for peptides and C3' for "
"nucleotides. Score for model residue with "
"number 42 in chain X can be extracted with: "
"data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion "
"code, lets say A, the last key becomes \"42A\""))
"nucleotides. Per-residue scores are accessible as described for "
"local_lddt."))
parser.add_argument(
"--cad-score",
......@@ -262,12 +277,10 @@ def _ParseArgs():
default=False,
action="store_true",
help=("Compute local CAD's atom-atom (AA) scores and store as key "
"\"local_cad_score\". Score for model residue with number 42 in "
"chain X can be extracted with: "
"data[\"local_cad_score\"][\"X\"][\"42\"]. "
"--residue-number-alignments must be enabled to compute this "
"score. Requires voronota_cadscore executable in PATH. "
"Alternatively you can set cad-exec."))
"\"local_cad_score\". Per-residue scores are accessible as "
"described for local_lddt. --residue-number-alignments must be "
"enabled to compute this score. Requires voronota_cadscore "
"executable in PATH. Alternatively you can set cad-exec."))
parser.add_argument(
"--cad-exec",
......@@ -350,8 +363,9 @@ def _ParseArgs():
"None. Model interface residues are available as key "
"\"model_interface_residues\", reference interface residues as "
"key \"reference_interface_residues\". Residues are represented "
"as string in form <num><inscode>. The respective scores are "
"available as keys \"patch_qs\" and \"patch_dockq\""))
"as string in form <chain_name>.<resnum>.<resnum_inscode>. "
"The respective scores are available as keys \"patch_qs\" and "
"\"patch_dockq\""))
parser.add_argument(
"--tm-score",
......@@ -474,29 +488,47 @@ def _GetInconsistentResidues(alns):
r1 = col.GetResidue(0)
r2 = col.GetResidue(1)
if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName():
lst.append((r1, r2))
lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst]
ch_1 = r1.GetChain().name
num_1 = r1.number.num
ins_code_1 = r1.number.ins_code.strip("\u0000")
id_1 = f"{ch_1}.{num_1}.{ins_code_1}"
ch_2 = r2.GetChain().name
num_2 = r2.number.num
ins_code_2 = r2.number.ins_code.strip("\u0000")
id_2 = f"{ch_2}.{num_2}.{ins_code_2}"
lst.append(f"{id_1}-{id_2}")
return lst
def _LocalScoresToJSONDict(score_dict):
""" Score for model residue with number rnum in chain X can be extracted
with: data["local_cad_score"]["X"][rnum]. Convert ResNum object to str for
JSON serialization
""" Convert ResNums to str for JSON serialization
"""
json_dict = dict()
for ch, ch_scores in score_dict.items():
json_dict[ch] = {str(rnum): s for rnum, s in ch_scores.items()}
for num, s in ch_scores.items():
ins_code = num.ins_code.strip("\u0000")
json_dict[f"{ch}.{num.num}.{ins_code}"] = s
return json_dict
def _InterfaceResiduesToJSONDict(interface_dict):
""" Interface residues are stored as
data["model_interface_residues"]["A"][rnum1, rnum2,...]. Convert ResNum
object to str for JSON serialization.
def _InterfaceResiduesToJSONList(interface_dict):
""" Convert ResNums to str for JSON serialization.
Changes in this function will affect _PatchScoresToJSONList
"""
json_dict = dict()
json_list = list()
for ch, ch_nums in interface_dict.items():
json_dict[ch] = [str(rnum) for rnum in ch_nums]
return json_dict
for num in ch_nums:
ins_code = num.ins_code.strip("\u0000")
json_list.append(f"{ch}.{num.num}.{ins_code}")
return json_list
def _PatchScoresToJSONList(interface_dict, score_dict):
""" Creates List of patch scores that are consistent with interface residue
lists
"""
json_list = list()
for ch, ch_nums in interface_dict.items():
json_list += score_dict[ch]
return json_list
def _Process(model, reference, args):
......@@ -579,11 +611,13 @@ def _Process(model, reference, args):
if args.patch_scores:
out["model_interface_residues"] = \
_InterfaceResiduesToJSONDict(scorer.model_interface_residues)
_InterfaceResiduesToJSONList(scorer.model_interface_residues)
out["reference_interface_residues"] = \
_InterfaceResiduesToJSONDict(scorer.target_interface_residues)
out["patch_qs"] = scorer.patch_qs
out["patch_dockq"] = scorer.patch_dockq
_InterfaceResiduesToJSONList(scorer.target_interface_residues)
out["patch_qs"] = _PatchScoresToJSONList(scorer.model_interface_residues,
scorer.patch_qs)
out["patch_dockq"] = _PatchScoresToJSONList(scorer.model_interface_residues,
scorer.patch_dockq)
if args.tm_score:
out["tm_score"] = scorer.tm_score
......@@ -614,10 +648,6 @@ def _Process(model, reference, args):
"fails in this case.")
else:
raise
return out
def _Main():
......
......@@ -40,9 +40,11 @@ Details on the usage (output of ``ost compare-structures --help``):
[-c CHAIN_MAPPING [CHAIN_MAPPING ...]] [--lddt]
[--local-lddt] [--bb-lddt] [--bb-local-lddt]
[--cad-score] [--local-cad-score]
[--cad-exec CAD_EXEC] [--qs-score]
[--cad-exec CAD_EXEC]
[--usalign-exec USALIGN_EXEC] [--qs-score]
[--rigid-scores] [--interface-scores]
[--patch-scores]
[--patch-scores] [--tm-score]
[--lddt-no-stereochecks]
Evaluate model against reference
......@@ -62,8 +64,8 @@ Details on the usage (output of ``ost compare-structures --help``):
The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with 8 keys:
options, this is a dictionary with 8 keys describing model/reference comparison:
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
......@@ -81,29 +83,43 @@ Details on the usage (output of ``ost compare-structures --help``):
format.
* "inconsistent_residues": List of strings that represent name mismatches of
aligned residues in form
<trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
<trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>.
Inconsistencies may lead to corrupt results but do not abort the program.
Program abortion in these cases can be enforced with
-ec/--enforce-consistency.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
enabling -rna/--residue-number-alignment is recommended.
Each score is opt-in and can be enabled with optional arguments.
Example to compute global and per-residue lDDT values as well as QS-score:
ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score
Example to inject custom chain mapping
ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
optional arguments:
-h, --help show this help message and exit
-m MODEL, --model MODEL
......@@ -164,13 +180,17 @@ Details on the usage (output of ``ost compare-structures --help``):
counterparts.
--local-lddt Compute per-residue lDDT scores with default
parameterization and store as key "local_lddt". Score
for model residue with number 42 in chain X can be
extracted with: data["local_lddt"]["X"]["42"]. If
there is an insertion code, lets say A, the last key
becomes "42A" Stereochemical irregularities affecting
lDDT are reported as keys "model_clashes",
"model_bad_bonds", "model_bad_angles" and the
respective reference counterparts.
for each residue is accessible by key
<chain_name>.<resnum>.<resnum_inscode>. Residue with
number 42 in chain X can be extracted with:
data["local_lddt"]["X.42."]. If there is an insertion
code, lets say A, the residue key becomes "X.42.A".
Stereochemical irregularities affecting lDDT are
reported as keys "model_clashes", "model_bad_bonds",
"model_bad_angles" and the respective reference
counterparts. Atoms specified in there follow the
following format:
<chain_name>.<resnum>.<resnum_inscode>.<atom_name>
--bb-lddt Compute global lDDT score with default
parameterization and store as key "bb_lddt". lDDT in
this case is only computed on backbone atoms: CA for
......@@ -178,26 +198,26 @@ Details on the usage (output of ``ost compare-structures --help``):
--bb-local-lddt Compute per-residue lDDT scores with default
parameterization and store as key "bb_local_lddt".
lDDT in this case is only computed on backbone atoms:
CA for peptides and C3' for nucleotides. Score for
model residue with number 42 in chain X can be
extracted with: data["local_lddt"]["X"]["42"]. If
there is an insertion code, lets say A, the last key
becomes "42A"
CA for peptides and C3' for nucleotides. Per-residue
scores are accessible as described for local_lddt.
--cad-score Compute global CAD's atom-atom (AA) score and store as
key "cad_score". --residue-number-alignment must be
enabled to compute this score. Requires
voronota_cadscore executable in PATH. Alternatively
you can set cad-exec.
--local-cad-score Compute local CAD's atom-atom (AA) scores and store as
key "local_cad_score". Score for model residue with
number 42 in chain X can be extracted with:
data["local_cad_score"]["X"]["42"]. --residue-number-
alignments must be enabled to compute this score.
Requires voronota_cadscore executable in PATH.
key "local_cad_score". Per-residue scores are
accessible as described for local_lddt. --residue-
number-alignments must be enabled to compute this
score. Requires voronota_cadscore executable in PATH.
Alternatively you can set cad-exec.
--cad-exec CAD_EXEC Path to voronota-cadscore executable (installed from
https://github.com/kliment-olechnovic/voronota).
Searches PATH if not set.
--usalign-exec USALIGN_EXEC
Path to USalign executable to compute TM-score. If not
given, an OpenStructure internal copy of USalign code
is used.
--qs-score Compute QS-score, stored as key "qs_global", and the
QS-best variant, stored as key "qs_best".
--rigid-scores Computes rigid superposition based scores. They're
......@@ -248,12 +268,13 @@ Details on the usage (output of ``ost compare-structures --help``):
"model_interface_residues", reference interface
residues as key "reference_interface_residues".
Residues are represented as string in form
<num><inscode>. The respective scores are available as
keys "patch_qs" and "patch_dockq"
<chain_name>.<resnum>.<resnum_inscode>. The respective
scores are available as keys "patch_qs" and
"patch_dockq"
--tm-score Computes TM-score with the USalign tool. Also computes
a chain mapping in case of complexes that is stored
in the same format as the default mapping. TM-score
and the mapping are available as keys "tm_score" and
a chain mapping in case of complexes that is stored in
the same format as the default mapping. TM-score and
the mapping are available as keys "tm_score" and
"usalign_mapping"
--lddt-no-stereochecks
Disable stereochecks for lDDT computation
......
......@@ -9,6 +9,18 @@ from ost import geom
from ost import mol
def _AtomToQualifiedName(a):
""" Returns string to uniquely identify atom
format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>
"""
r = a.GetResidue()
ch = r.GetChain()
num = r.number.num
ins_code = r.number.ins_code.strip("\u0000")
return f"{ch.name}.{r.number.num}.{ins_code}.{a.name}"
def _PotentialDisulfid(a_one, a_two):
""" Returns whether two atoms can potentially build a disulfid bond
......@@ -355,9 +367,12 @@ class ClashInfo:
def ToJSON(self):
""" Return JSON serializable dict
Atoms are represented by a string in format:
<chain_name>.<resnum>.<resnum_inscode>.<atom_name>
"""
return {"a1": self.a1.GetQualifiedName(),
"a2": self.a2.GetQualifiedName(),
return {"a1": _AtomToQualifiedName(self.a1),
"a2": _AtomToQualifiedName(self.a2),
"dist": self.dist,
"tolerated_dist": self.tolerated_dist}
......@@ -382,9 +397,12 @@ class BondViolationInfo:
def ToJSON(self):
""" Return JSON serializable dict
Atoms are represented by a string in format:
<chain_name>.<resnum>.<resnum_inscode>.<atom_name>
"""
return {"a1": self.a1.GetQualifiedName(),
"a2": self.a2.GetQualifiedName(),
return {"a1": _AtomToQualifiedName(self.a1),
"a2": _AtomToQualifiedName(self.a2),
"length": self.length,
"exp_length": self.exp_length,
"std": self.std}
......@@ -412,10 +430,13 @@ class AngleViolationInfo:
def ToJSON(self):
""" Return JSON serializable dict
Atoms are represented by a string in format:
<chain_name>.<resnum>.<resnum_inscode>.<atom_name>
"""
return {"a1": self.a1.GetQualifiedName(),
"a2": self.a2.GetQualifiedName(),
"a3": self.a3.GetQualifiedName(),
return {"a1": _AtomToQualifiedName(self.a1),
"a2": _AtomToQualifiedName(self.a2),
"a3": _AtomToQualifiedName(self.a3),
"angle": self.angle,
"exp_angle": self.exp_angle,
"std": self.std}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment