Something went wrong on our end
-
Studer Gabriel authoredStuder Gabriel authored
ost-compare-structures 41.23 KiB
"""
Evaluate model against reference
Example: ost compare-structures -m model.pdb -r reference.cif
Loads the structures and performs basic cleanup:
* Assign elements according to the PDB Chemical Component Dictionary
* Map nonstandard residues to their parent residues as defined by the PDB
Chemical Component Dictionary, e.g. phospho-serine => serine
* Remove hydrogens
* Remove OXT atoms
* Remove unknown atoms, i.e. atoms that are not expected according to the PDB
Chemical Component Dictionary
* Select for peptide/nucleotide residues
The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with 8 keys describing model/reference comparison:
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
are considered chemically equivalent. You can derive stoichiometry from this.
Contains only chains that are considered in chain mapping, i.e. pass a
size threshold (defaults: 6 for peptides, 4 for nucleotides).
* "chem_mapping": List of same length as "chem_groups". Assigns model chains to
the respective chem group. Again, only contains chains that are considered
in chain mapping.
* "chain_mapping": A dictionary with reference chain names as keys and the
mapped model chain names as values. Missing chains are either not mapped
(but present in "chem_groups", "chem_mapping") or were not considered in
chain mapping (short peptides etc.)
* "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
format.
* "inconsistent_residues": List of strings that represent name mismatches of
aligned residues in form
<trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>.
Inconsistencies may lead to corrupt results but do not abort the program.
Program abortion in these cases can be enforced with
-ec/--enforce-consistency.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
* "min_pep_length"
* "min_nuc_length"
* "lddt_add_mdl_contacts"
* "dockq_capri_peptide"
* "ost_version"
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
enabling -rna/--residue-number-alignment is recommended.
Each score is opt-in and can be enabled with optional arguments.
Example to compute global and per-residue lDDT values as well as QS-score:
ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt \
--qs-score
Example to inject custom chain mapping
ost compare-structures -m model.pdb -r reference.cif -c A:B B:A
"""
import argparse
import os
import json
import sys
import traceback
import math
import ost
from ost import io
from ost.mol.alg import scoring
def _ParseArgs():
parser = argparse.ArgumentParser(description = __doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
prog = "ost compare-structures")
parser.add_argument(
"-m",
"--model",
dest="model",
required=True,
help=("Path to model file."))
parser.add_argument(
"-r",
"--reference",
dest="reference",
required=True,
help=("Path to reference file."))
parser.add_argument(
"-o",
"--output",
dest="output",
required=False,
default="out.json",
help=("Output file name. The output will be saved as a JSON file. "
"default: out.json"))
parser.add_argument(
"-mf",
"--model-format",
dest="model_format",
required=False,
default=None,
choices=["pdb", "cif", "mmcif"],
help=("Format of model file. pdb reads pdb but also pdb.gz, same "
"applies to cif/mmcif. Inferred from filepath if not given."))
parser.add_argument(
"-rf",
"--reference-format",
dest="reference_format",
required=False,
default=None,
choices=["pdb", "cif", "mmcif"],
help=("Format of reference file. pdb reads pdb but also pdb.gz, same "
"applies to cif/mmcif. Inferred from filepath if not given."))
parser.add_argument(
"-mb",
"--model-biounit",
dest="model_biounit",
required=False,
default=None,
type=str,
help=("Only has an effect if model is in mmcif format. By default, "
"the asymmetric unit (AU) is used for scoring. If there are "
"biounits defined in the mmcif file, you can specify the "
"ID (as a string) of the one which should be used."))
parser.add_argument(
"-rb",
"--reference-biounit",
dest="reference_biounit",
required=False,
default=None,
type=str,
help=("Only has an effect if reference is in mmcif format. By default, "
"the asymmetric unit (AU) is used for scoring. If there are "
"biounits defined in the mmcif file, you can specify the "
"ID (as a string) of the one which should be used."))
parser.add_argument(
"-rna",
"--residue-number-alignment",
dest="residue_number_alignment",
default=False,
action="store_true",
help=("Make alignment based on residue number instead of using "
"a global BLOSUM62-based alignment (NUC44 for nucleotides)."))
parser.add_argument(
"-ec",
"--enforce-consistency",
dest="enforce_consistency",
default=False,
action="store_true",
help=("Enforce consistency. By default residue name discrepancies "
"between a model and reference are reported but the program "
"proceeds. If this flag is ON, the program fails for these "
"cases."))
parser.add_argument(
"-d",
"--dump-structures",
dest="dump_structures",
default=False,
action="store_true",
help=("Dump cleaned structures used to calculate all the scores as PDB"
" or mmCIF files using specified suffix. Files will be dumped to"
" the same location and in the same format as original files."))
parser.add_argument(
"-ds",
"--dump-suffix",
dest="dump_suffix",
default="_compare_structures",
help=("Use this suffix to dump structures.\n"
"Defaults to _compare_structures"))
parser.add_argument(
"-ft",
"--fault-tolerant",
dest="fault_tolerant",
default=False,
action="store_true",
help=("Fault tolerant parsing."))
parser.add_argument(
"-c",
"--chain-mapping",
nargs="+",
dest="chain_mapping",
help=("Custom mapping of chains between the reference and the model. "
"Each separate mapping consist of key:value pairs where key "
"is the chain name in reference and value is the chain name in "
"model."))
parser.add_argument(
"--lddt",
dest="lddt",
default=False,
action="store_true",
help=("Compute global lDDT score with default parameterization and "
"store as key \"lddt\". Stereochemical irregularities affecting "
"lDDT are reported as keys \"model_clashes\", "
"\"model_bad_bonds\", \"model_bad_angles\" and the respective "
"reference counterparts."))
parser.add_argument(
"--local-lddt",
dest="local_lddt",
default=False,
action="store_true",
help=("Compute per-residue lDDT scores with default parameterization "
"and store as key \"local_lddt\". Score for each residue is "
"accessible by key <chain_name>.<resnum>.<resnum_inscode>. "
"Residue with number 42 in chain X can be extracted with: "
"data[\"local_lddt\"][\"X.42.\"]. If there is an insertion "
"code, lets say A, the residue key becomes \"X.42.A\". "
"Stereochemical irregularities affecting lDDT are reported as "
"keys \"model_clashes\", \"model_bad_bonds\", "
"\"model_bad_angles\" and the respective reference "
"counterparts. Atoms specified in there follow the following "
"format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>"))
parser.add_argument(
"--bb-lddt",
dest="bb_lddt",
default=False,
action="store_true",
help=("Compute global lDDT score with default parameterization and "
"store as key \"bb_lddt\". lDDT in this case is only computed on "
"backbone atoms: CA for peptides and C3' for nucleotides"))
parser.add_argument(
"--bb-local-lddt",
dest="bb_local_lddt",
default=False,
action="store_true",
help=("Compute per-residue lDDT scores with default parameterization "
"and store as key \"bb_local_lddt\". lDDT in this case is only "
"computed on backbone atoms: CA for peptides and C3' for "
"nucleotides. Per-residue scores are accessible as described for "
"local_lddt."))
parser.add_argument(
"--ilddt",
dest="ilddt",
default=False,
action="store_true",
help=("Compute global lDDT score which is solely based on inter-chain "
"contacts and store as key \"ilddt\". Same stereochemical "
"irregularities as for lddt apply."))
parser.add_argument(
"--cad-score",
dest="cad_score",
default=False,
action="store_true",
help=("Compute global CAD's atom-atom (AA) score and store as key "
"\"cad_score\". --residue-number-alignment must be enabled "
"to compute this score. Requires voronota_cadscore executable "
"in PATH. Alternatively you can set cad-exec."))
parser.add_argument(
"--local-cad-score",
dest="local_cad_score",
default=False,
action="store_true",
help=("Compute local CAD's atom-atom (AA) scores and store as key "
"\"local_cad_score\". Per-residue scores are accessible as "
"described for local_lddt. --residue-number-alignments must be "
"enabled to compute this score. Requires voronota_cadscore "
"executable in PATH. Alternatively you can set cad-exec."))
parser.add_argument(
"--cad-exec",
dest="cad_exec",
default=None,
help=("Path to voronota-cadscore executable (installed from "
"https://github.com/kliment-olechnovic/voronota). Searches PATH "
"if not set."))
parser.add_argument(
"--usalign-exec",
dest="usalign_exec",
default=None,
help=("Path to USalign executable to compute TM-score. If not given, "
"an OpenStructure internal copy of USalign code is used."))
parser.add_argument(
"--override-usalign-mapping",
dest="oum",
default=False,
action="store_true",
help=("Override USalign mapping and inject our own rigid mapping. Only "
"works if external usalign executable is provided that is "
"reasonably new and contains that feature."))
parser.add_argument(
"--qs-score",
dest="qs_score",
default=False,
action="store_true",
help=("Compute QS-score, stored as key \"qs_global\", and the QS-best "
"variant, stored as key \"qs_best\". Interfaces in the reference "
"with non-zero contribution to QS-score are available as key "
"\"qs_reference_interfaces\", the ones from the model as key "
"\"qs_model_interfaces\". \"qs_interfaces\" is a subset of "
"\"qs_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"per-interface scores for \"qs_interfaces\" are available as "
"keys \"per_interface_qs_global\" and \"per_interface_qs_best\""))
parser.add_argument(
"--dockq",
dest="dockq",
default=False,
action="store_true",
help=("Compute DockQ scores and its components. Relevant interfaces "
"with at least one contact (any atom within 5A) of the reference "
"structure are available as key \"dockq_reference_interfaces\". "
"Protein-protein, protein-nucleotide and nucleotide-nucleotide "
"interfaces are considered. "
"Key \"dockq_interfaces\" is a subset of "
"\"dockq_reference_interfaces\" that contains interfaces that "
"can be mapped to the model. They are stored as lists in format "
"[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective "
"DockQ scores for \"dockq_interfaces\" are available as key "
"\"dockq\". It's components are available as keys: "
"\"fnat\" (fraction of reference contacts which are also there "
"in model) \"irmsd\" (interface RMSD), \"lrmsd\" (ligand RMSD). "
"The DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from number of contacts "
"in the reference interfaces. These two scores only consider "
"interfaces that are present in both, the model and the "
"reference. \"dockq_ave_full\" and \"dockq_wave_full\" add zeros "
"in the average computation for each interface that is only "
"present in the reference but not in the model."))
parser.add_argument(
"--dockq-capri-peptide",
dest="dockq_capri_peptide",
default=False,
action="store_true",
help=("Flag that changes two things in the way DockQ and its "
"underlying scores are computed which is proposed by the CAPRI "
"community when scoring peptides (PMID: 31886916). "
"ONE: Two residues are considered in contact if any of their "
"atoms is within 5A. This is relevant for fnat and fnonat "
"scores. CAPRI suggests to lower this threshold to 4A for "
"protein-peptide interactions. "
"TWO: irmsd is computed on interface residues. A residue is "
"defined as interface residue if any of its atoms is within 10A "
"of another chain. CAPRI suggests to lower the default of 10A to "
"8A in combination with only considering CB atoms for "
"protein-peptide interactions. "
"Note that the resulting DockQ is not evaluated for these "
"slightly updated fnat and irmsd (lrmsd stays the same). "
"Raises an error if reference contains nucleotide chains. "
"This flag has no influence on patch_dockq scores."))
parser.add_argument(
"--ics",
dest="ics",
default=False,
action="store_true",
help=("Computes interface contact similarity (ICS) related scores. "
"A contact between two residues of different chains is defined "
"as having at least one heavy atom within 5A. Contacts in "
"reference structure are available as key "
"\"reference_contacts\". Each contact specifies the interacting "
"residues in format \"<cname>.<rnum>.<ins_code>\". Model "
"contacts are available as key \"model_contacts\". The precision "
"which is available as key \"ics_precision\" reports the "
"fraction of model contacts that are also present in the "
"reference. The recall which is available as key \"ics_recall\" "
"reports the fraction of reference contacts that are correctly "
"reproduced in the model. "
"The ICS score (Interface Contact Similarity) available as key "
"\"ics\" combines precision and recall using the F1-measure. "
"All these measures are also available on a per-interface basis "
"for each interface in the reference structure that are defined "
"as chain pairs with at least one contact (available as key "
" \"contact_reference_interfaces\"). The respective metrics are "
"available as keys \"per_interface_ics_precision\", "
"\"per_interface_ics_recall\" and \"per_interface_ics\"."))
parser.add_argument(
"--ips",
dest="ips",
default=False,
action="store_true",
help=("Computes interface patch similarity (IPS) related scores. "
"They focus on interface residues. They are defined as having "
"at least one contact to a residue from any other chain. "
"In short: if they show up in the contact lists used to compute "
"ICS. If ips is enabled, these contacts get reported too and are "
"available as keys \"reference_contacts\" and \"model_contacts\"."
"The precision which is available as key \"ips_precision\" "
"reports the fraction of model interface residues, that are also "
"interface residues in the reference. "
"The recall which is available as key \"ips_recall\" "
"reports the fraction of reference interface residues that are "
"also interface residues in the model. "
"The IPS score (Interface Patch Similarity) available as key "
"\"ips\" is the Jaccard coefficient between interface residues "
"in reference and model. "
"All these measures are also available on a per-interface basis "
"for each interface in the reference structure that are defined "
"as chain pairs with at least one contact (available as key "
" \"contact_reference_interfaces\"). The respective metrics are "
"available as keys \"per_interface_ips_precision\", "
"\"per_interface_ips_recall\" and \"per_interface_ips\"."))
parser.add_argument(
"--rigid-scores",
dest="rigid_scores",
default=False,
action="store_true",
help=("Computes rigid superposition based scores. They're based on a "
"Kabsch superposition of all mapped CA positions (C3' for "
"nucleotides). Makes the following keys available: "
"\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, "
"8.0] given these positions and transformation, \"oligo_gdtha\": "
"same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given "
"these positions and transformation, \"transform\": the used 4x4 "
"transformation matrix that superposes model onto reference, "
"\"rigid_chain_mapping\": equivalent of \"chain_mapping\" which "
"is used for rigid scores (optimized for RMSD instead of "
"QS-score/lDDT)."))
parser.add_argument(
"--patch-scores",
dest="patch_scores",
default=False,
action="store_true",
help=("Local interface quality score used in CASP15. Scores each "
"model residue that is considered in the interface (CB pos "
"within 8A of any CB pos from another chain (CA for GLY)). The "
"local neighborhood gets represented by \"interface patches\" "
"which are scored with QS-score and DockQ. Scores where not "
"the full patches are represented by the reference are set to "
"None. Model interface residues are available as key "
"\"model_interface_residues\", reference interface residues as "
"key \"reference_interface_residues\". Residues are represented "
"as string in form <chain_name>.<resnum>.<resnum_inscode>. "
"The respective scores are available as keys \"patch_qs\" and "
"\"patch_dockq\""))
parser.add_argument(
"--tm-score",
dest="tm_score",
default=False,
action="store_true",
help=("Computes TM-score with the USalign tool. Also computes a "
"chain mapping in case of complexes that is stored in the "
"same format as the default mapping. TM-score and the mapping "
"are available as keys \"tm_score\" and \"usalign_mapping\""))
parser.add_argument(
"--lddt-no-stereochecks",
dest="lddt_no_stereochecks",
default=False,
action="store_true",
help=("Disable stereochecks for lDDT computation"))
parser.add_argument(
"--n-max-naive",
dest="n_max_naive",
required=False,
default=40320,
type=int,
help=("Parameter for chain mapping. If the number of possible "
"mappings is <= *n_max_naive*, the full "
"mapping solution space is enumerated to find the "
"the mapping with optimal QS-score. A heuristic is used "
"otherwise. The default of 40320 corresponds to an octamer "
"(8! = 40320). A structure with stoichiometry A6B2 would be "
"6!*2! = 1440 etc."))
parser.add_argument(
"--dump-aligned-residues",
dest="dump_aligned_residues",
default=False,
action="store_true",
help=("Dump additional info on aligned model and reference residues."))
parser.add_argument(
"--dump-pepnuc-alns",
dest="dump_pepnuc_alns",
default=False,
action="store_true",
help=("Dump alignments of mapped chains but with sequences that did "
"not undergo Molck preprocessing in the scorer. Sequences are "
"extracted from model/target after undergoing selection for "
"peptide and nucleotide residues."))
parser.add_argument(
"--dump-pepnuc-aligned-residues",
dest="dump_pepnuc_aligned_residues",
default=False,
action="store_true",
help=("Dump additional info on model and reference residues that occur "
"in pepnuc alignments."))
parser.add_argument(
"--min-pep-length",
dest="min_pep_length",
default = 6,
type=int,
help=("Default: 6 - "
"Relevant parameter if short peptides are involved in scoring. "
"Minimum peptide length for a chain in the target structure to "
"be considered in chain mapping. The chain mapping algorithm "
"first performs an all vs. all pairwise sequence alignment to "
"identify \"equal\" chains within the target structure. We go "
"for simple sequence identity there. Short sequences can be "
"problematic as they may produce high sequence identity "
"alignments by pure chance.")
)
parser.add_argument(
"--min-nuc-length",
dest="min_nuc_length",
default = 4,
type=int,
help=("Default: 4 - "
"Relevant parameter if short nucleotides are involved in scoring."
"Minimum nucleotide length for a chain in the target structure to "
"be considered in chain mapping. The chain mapping algorithm "
"first performs an all vs. all pairwise sequence alignment to "
"identify \"equal\" chains within the target structure. We go "
"for simple sequence identity there. Short sequences can be "
"problematic as they may produce high sequence identity "
"alignments by pure chance.")
)
parser.add_argument(
'-v',
'--verbosity',
dest="verbosity",
type=int,
default=2,
help="Set verbosity level. Defaults to 2 (Script).")
parser.add_argument(
"--lddt-add-mdl-contacts",
dest="lddt_add_mdl_contacts",
default=False,
action="store_true",
help=("Only using contacts in lDDT that"
"are within a certain distance threshold in the "
"reference does not penalize for added model "
"contacts. If set to True, this flag will also "
"consider reference contacts that are within the "
"specified distance threshold in the model but "
"not necessarily in the reference. No contact will "
"be added if the respective atom pair is not "
"resolved in the reference."))
return parser.parse_args()
def _CheckCompoundLib():
clib = ost.conop.GetDefaultLib()
if not clib:
ost.LogError("A compound library is required for this action. "
"Please refer to the OpenStructure website: "
"https://openstructure.org/docs/conop/compoundlib/.")
sys.tracebacklimit = 0
raise RuntimeError("No compound library found")
def _RoundOrNone(num, decimals = 3):
""" Helper to create valid JSON output
"""
if num is None or math.isnan(num) or math.isinf(num):
return None
return round(num, decimals)
def _AddSuffix(filename, dump_suffix):
"""Add dump_suffix to the file name.
"""
root, ext = os.path.splitext(filename)
if ext == ".gz":
root, ext2 = os.path.splitext(root)
ext = ext2 + ext
return root + dump_suffix + ext
def _GetStructureFormat(structure_path, sformat=None):
"""Get the structure format and return it as "pdb" or "mmcif".
"""
if sformat is None:
# Determine file format from suffix.
ext = structure_path.split(".")
if ext[-1] == "gz":
ext = ext[:-1]
if len(ext) <= 1:
raise Exception(f"Could not determine format of file "
f"{structure_path}.")
sformat = ext[-1].lower()
if sformat in ["mmcif", "cif"]:
return "mmcif"
elif sformat == "pdb":
return sformat
else:
raise Exception(f"Unknown/unsupported file format found for "
f"file {structure_path}.")
def _LoadStructure(structure_path, sformat, fault_tolerant, bu_id):
"""Read OST entity either from mmCIF or PDB.
The returned structure has structure_path attached as structure name
"""
# increase loglevel, as we would pollute the info log with weird stuff
ost.PushVerbosityLevel(ost.LogLevel.Error)
# Load the structure
if sformat == "mmcif":
if bu_id is not None:
cif_entity, cif_seqres, cif_info = \
io.LoadMMCIF(structure_path, info=True, seqres=True,
fault_tolerant=fault_tolerant)
for biounit in cif_info.biounits:
if biounit.id == bu_id:
entity = ost.mol.alg.CreateBU(cif_entity, biounit)
break
else:
raise RuntimeError(f"No biounit found with ID '{bu_id}'.")
else:
entity = io.LoadMMCIF(structure_path,
fault_tolerant = fault_tolerant)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
else:
entity = io.LoadPDB(structure_path, fault_tolerant = fault_tolerant)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
# restore old loglevel and return
ost.PopVerbosityLevel()
entity.SetName(structure_path)
return entity
def _DumpStructure(entity, structure_path, sformat):
if sformat == "mmcif":
io.SaveMMCIF(entity, structure_path)
else:
io.SavePDB(entity, structure_path)
def _AlnToFastaStr(aln):
""" Returns alignment as fasta formatted string
"""
s1 = aln.GetSequence(0)
s2 = aln.GetSequence(1)
return f">reference:{s1.name}\n{str(s1)}\n>model:{s2.name}\n{str(s2)}"
def _GetInconsistentResidues(alns):
lst = list()
for aln in alns:
for col in aln:
r1 = col.GetResidue(0)
r2 = col.GetResidue(1)
if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName():
ch_1 = r1.GetChain().name
num_1 = r1.number.num
ins_code_1 = r1.number.ins_code.strip("\u0000")
id_1 = f"{ch_1}.{num_1}.{ins_code_1}"
ch_2 = r2.GetChain().name
num_2 = r2.number.num
ins_code_2 = r2.number.ins_code.strip("\u0000")
id_2 = f"{ch_2}.{num_2}.{ins_code_2}"
lst.append(f"{id_1}-{id_2}")
return lst
def _LocalScoresToJSONDict(score_dict):
""" Convert ResNums to str for JSON serialization
"""
json_dict = dict()
for ch, ch_scores in score_dict.items():
for num, s in ch_scores.items():
ins_code = num.ins_code.strip("\u0000")
json_dict[f"{ch}.{num.num}.{ins_code}"] = _RoundOrNone(s)
return json_dict
def _InterfaceResiduesToJSONList(interface_dict):
""" Convert ResNums to str for JSON serialization.
Changes in this function will affect _PatchScoresToJSONList
"""
json_list = list()
for ch, ch_nums in interface_dict.items():
for num in ch_nums:
ins_code = num.ins_code.strip("\u0000")
json_list.append(f"{ch}.{num.num}.{ins_code}")
return json_list
def _PatchScoresToJSONList(interface_dict, score_dict):
""" Creates List of patch scores that are consistent with interface residue
lists
"""
json_list = list()
for ch, ch_nums in interface_dict.items():
for item in score_dict[ch]:
json_list.append(_RoundOrNone(item))
return json_list
def _GetAlignedResidues(aln):
aligned_residues = list()
for a in aln:
mdl_lst = list()
ref_lst = list()
for c in a:
mdl_r = c.GetResidue(1)
ref_r = c.GetResidue(0)
if mdl_r.IsValid():
olc = mdl_r.one_letter_code
num = mdl_r.GetNumber().num
ins_code = mdl_r.GetNumber().ins_code.strip("\u0000")
mdl_lst.append({"olc": olc,
"num": f"{num}.{ins_code}"})
else:
mdl_lst.append(None)
if ref_r.IsValid():
olc = ref_r.one_letter_code
num = ref_r.GetNumber().num
ins_code = ref_r.GetNumber().ins_code.strip("\u0000")
ref_lst.append({"olc": olc,
"num": f"{num}.{ins_code}"})
else:
ref_lst.append(None)
mdl_dct = {"chain": a.GetSequence(1).GetName(),
"residues": mdl_lst}
ref_dct = {"chain": a.GetSequence(0).GetName(),
"residues": ref_lst}
aligned_residues.append({"model": mdl_dct,
"reference": ref_dct})
return aligned_residues
def _Process(model, reference, args, model_format, reference_format):
mapping = None
if args.chain_mapping is not None:
mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping}
scorer = scoring.Scorer(model, reference,
resnum_alignments = args.residue_number_alignment,
cad_score_exec = args.cad_exec,
custom_mapping = mapping,
usalign_exec = args.usalign_exec,
lddt_no_stereochecks = args.lddt_no_stereochecks,
n_max_naive = args.n_max_naive,
oum = args.oum,
min_pep_length = args.min_pep_length,
min_nuc_length = args.min_nuc_length,
lddt_add_mdl_contacts = args.lddt_add_mdl_contacts,
dockq_capri_peptide = args.dockq_capri_peptide)
ir = _GetInconsistentResidues(scorer.aln)
if len(ir) > 0 and args.enforce_consistency:
raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}")
out = dict()
out["reference_chains"] = [ch.GetName() for ch in scorer.target.chains]
out["model_chains"] = [ch.GetName() for ch in scorer.model.chains]
out["chem_groups"] = scorer.chain_mapper.chem_groups
out["chem_mapping"] = scorer.mapping.chem_mapping
out["chain_mapping"] = scorer.mapping.GetFlatMapping()
out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln]
out["inconsistent_residues"] = ir
if args.dump_aligned_residues:
out["aligned_residues"] = _GetAlignedResidues(scorer.aln)
if args.dump_pepnuc_alns:
out["pepnuc_aln"] = [_AlnToFastaStr(aln) for aln in scorer.pepnuc_aln]
if args.dump_pepnuc_aligned_residues:
out["pepnuc_aligned_residues"] = _GetAlignedResidues(scorer.pepnuc_aln)
if args.lddt:
out["lddt"] = _RoundOrNone(scorer.lddt)
if args.local_lddt:
out["local_lddt"] = _LocalScoresToJSONDict(scorer.local_lddt)
if args.lddt or args.local_lddt:
out["model_clashes"] = [x.ToJSON() for x in scorer.model_clashes]
out["model_bad_bonds"] = [x.ToJSON() for x in scorer.model_bad_bonds]
out["model_bad_angles"] = [x.ToJSON() for x in scorer.model_bad_angles]
out["reference_clashes"] = [x.ToJSON() for x in scorer.target_clashes]
out["reference_bad_bonds"] = [x.ToJSON() for x in scorer.target_bad_bonds]
out["reference_bad_angles"] = [x.ToJSON() for x in scorer.target_bad_angles]
if args.bb_lddt:
out["bb_lddt"] = _RoundOrNone(scorer.bb_lddt)
if args.bb_local_lddt:
out["bb_local_lddt"] = _LocalScoresToJSONDict(scorer.bb_local_lddt)
if args.ilddt:
out["ilddt"] = _RoundOrNone(scorer.ilddt)
if args.cad_score:
out["cad_score"] = scorer.cad_score
if args.local_cad_score:
out["local_cad_score"] = _LocalScoresToJSONDict(scorer.local_cad_score)
if args.qs_score:
out["qs_global"] = _RoundOrNone(scorer.qs_global)
out["qs_best"] = _RoundOrNone(scorer.qs_best)
out["qs_reference_interfaces"] = scorer.qs_target_interfaces
out["qs_model_interfaces"] = scorer.qs_model_interfaces
out["qs_interfaces"] = scorer.qs_interfaces
out["per_interface_qs_global"] = \
[_RoundOrNone(x) for x in scorer.per_interface_qs_global]
out["per_interface_qs_best"] = \
[_RoundOrNone(x) for x in scorer.per_interface_qs_best]
if args.ics or args.ips:
out["reference_contacts"] = scorer.native_contacts
out["model_contacts"] = scorer.model_contacts
out["contact_reference_interfaces"] = scorer.contact_target_interfaces
if args.ics:
out["ics_precision"] = _RoundOrNone(scorer.ics_precision)
out["ics_recall"] = _RoundOrNone(scorer.ics_recall)
out["ics"] = _RoundOrNone(scorer.ics)
out["per_interface_ics_precision"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ics_precision]
out["per_interface_ics_recall"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ics_recall]
out["per_interface_ics"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ics]
if args.ips:
out["ips_precision"] = _RoundOrNone(scorer.ips_precision)
out["ips_recall"] = _RoundOrNone(scorer.ips_recall)
out["ips"] = _RoundOrNone(scorer.ips)
out["per_interface_ips_precision"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ips_precision]
out["per_interface_ips_recall"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ips_recall]
out["per_interface_ips"] = \
[_RoundOrNone(x) for x in scorer.per_interface_ips]
if args.dockq:
out["dockq_reference_interfaces"] = scorer.dockq_target_interfaces
out["dockq_interfaces"] = scorer.dockq_interfaces
out["dockq"] = [_RoundOrNone(x) for x in scorer.dockq_scores]
out["fnat"] = [_RoundOrNone(x) for x in scorer.fnat]
out["fnonnat"] = [_RoundOrNone(x) for x in scorer.fnonnat]
out["irmsd"] = [_RoundOrNone(x) for x in scorer.irmsd]
out["lrmsd"] = [_RoundOrNone(x) for x in scorer.lrmsd]
out["nnat"] = scorer.nnat
out["nmdl"] = scorer.nmdl
out["dockq_ave"] = _RoundOrNone(scorer.dockq_ave)
out["dockq_wave"] = _RoundOrNone(scorer.dockq_wave)
out["dockq_ave_full"] = _RoundOrNone(scorer.dockq_ave_full)
out["dockq_wave_full"] = _RoundOrNone(scorer.dockq_wave_full)
if args.rigid_scores:
out["oligo_gdtts"] = _RoundOrNone(scorer.gdtts)
out["oligo_gdtha"] = _RoundOrNone(scorer.gdtha)
out["rmsd"] = _RoundOrNone(scorer.rmsd)
data = scorer.rigid_transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
out["rigid_chain_mapping"] = scorer.rigid_mapping.GetFlatMapping()
if args.patch_scores:
out["model_interface_residues"] = \
_InterfaceResiduesToJSONList(scorer.model_interface_residues)
out["reference_interface_residues"] = \
_InterfaceResiduesToJSONList(scorer.target_interface_residues)
out["patch_qs"] = _PatchScoresToJSONList(scorer.model_interface_residues,
scorer.patch_qs)
out["patch_dockq"] = _PatchScoresToJSONList(scorer.model_interface_residues,
scorer.patch_dockq)
if args.tm_score:
out["tm_score"] = _RoundOrNone(scorer.tm_score)
out["usalign_mapping"] = scorer.usalign_mapping
if args.dump_structures:
# Dump model
model_dump_filename = _AddSuffix(model.GetName(), args.dump_suffix)
_DumpStructure(model, model_dump_filename, model_format)
# Dump reference
reference_dump_filename = _AddSuffix(reference.GetName(), args.dump_suffix)
_DumpStructure(reference, reference_dump_filename, reference_format)
return out
def _Main():
args = _ParseArgs()
ost.PushVerbosityLevel(args.verbosity)
if args.verbosity < 4:
# Hide tracebacks by default
# Run script with -v 4 (Verbose) or higher to display them
sys.tracebacklimit = 0
_CheckCompoundLib()
try:
compute_cad = args.cad_score or args.local_cad_score
if compute_cad and not args.residue_number_alignment:
raise RuntimeError("Only support CAD score when residue numbers in "
"model and reference match. Use -rna flag if "
"this is the case.")
reference_format = _GetStructureFormat(args.reference,
sformat=args.reference_format)
reference = _LoadStructure(args.reference,
sformat=reference_format,
bu_id=args.reference_biounit,
fault_tolerant = args.fault_tolerant)
model_format = _GetStructureFormat(args.model,
sformat=args.model_format)
model = _LoadStructure(args.model,
sformat=model_format,
bu_id=args.model_biounit,
fault_tolerant = args.fault_tolerant)
out = _Process(model, reference, args, model_format, reference_format)
# append input arguments
out["model"] = args.model
out["reference"] = args.reference
out["fault_tolerant"] = args.fault_tolerant
out["model_biounit"] = args.model_biounit
out["reference_biounit"] = args.reference_biounit
out["residue_number_alignment"] = args.residue_number_alignment
out["enforce_consistency"] = args.enforce_consistency
out["cad_exec"] = args.cad_exec
out["usalign_exec"] = args.usalign_exec
out["lddt_no_stereochecks"] = args.lddt_no_stereochecks
out["min_pep_length"] = args.min_pep_length
out["min_nuc_length"] = args.min_nuc_length
out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts
out["dockq_capri_peptide"] = args.dockq_capri_peptide
out["ost_version"] = ost.__version__
out["status"] = "SUCCESS"
with open(args.output, 'w') as fh:
json.dump(out, fh, indent=4, sort_keys=False)
except Exception as exc:
out = dict()
out["status"] = "FAILURE"
out["traceback"] = traceback.format_exc(limit=1000)
etype, evalue, tb = sys.exc_info()
out["exception"] = " ".join(traceback.format_exception_only(etype, evalue))
with open(args.output, 'w') as fh:
json.dump(out, fh, indent=4, sort_keys=False)
raise
if __name__ == '__main__':
_Main()