Skip to content
Snippets Groups Projects
Commit db8f2609 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

new compare structures action - compare-structures-new

commited for further review, currently co-exists with compare-structures
parent 97a859ad
No related branches found
No related tags found
No related merge requests found
......@@ -2,3 +2,4 @@ add_custom_target(actions ALL)
ost_action_init()
ost_action(ost-compare-structures actions)
ost_action(ost-compare-structures-new actions)
"""
Evaluate model against reference
Example: ost compare-structures-new model.pdb reference.cif
Loads the structures and performs basic cleanup:
* Assign elements according to the PDB compound dictionary
* Map nonstandard residues to their parent residues as defined by the PDB
compound dictionary, e.g. phospho-serine => serine
* Remove hydrogens
* Remove OXT atoms
* Remove unknown atoms, i.e. atoms that are not expected according to the PDB
compound dictionary
The cleaned structures are optionally dumped using -d/--dump-structures
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with four keys:
* "chain_mapping": A dictionary with reference chain names as keys and the
mapped model chain names as values.
* "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
format.
* "inconsistent_residues": List of strings that represent name mismatches of
aligned residues in form
<trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>.
Inconsistencies may lead to corrupt results but do not abort the program.
Program abortion in these cases can be enforced with
-ec/--enforce-consistency.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
structures to ensure matching residue numbers (CASP/CAMEO). In these cases,
enabling -rna/--residue-number-alignment is recommended.
Each score is opt-in and can be enabled with optional arguments.
Example to compute local and per-residue lDDT values as well as QS-score:
ost compare-structures-new model.pdb reference.cif --lddt --local-lddt --qs-score
"""
import argparse
import os
import json
import time
import sys
import traceback
from ost import io
from ost.mol.alg import scoring
def _ParseArgs():
parser = argparse.ArgumentParser(description = __doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
prog = "ost compare-structures-new")
parser.add_argument(
"model",
help=("Path to model file."))
parser.add_argument(
"reference",
help=("Path to reference file."))
parser.add_argument(
"-o",
"--output",
dest="output",
required=False,
default="out.json",
help=("Output file name. The output will be saved as a JSON file. "
"default: out.json"))
parser.add_argument(
"-mf",
"--model-format",
dest="model_format",
required=False,
default=None,
choices=["pdb", "cif", "mmcif"],
help=("Format of model file. pdb reads pdb but also pdb.gz, same "
"applies to cif/mmcif. Inferred from filepath if not given."))
parser.add_argument(
"-rf",
"--reference-format",
dest="reference_format",
required=False,
default=None,
choices=["pdb", "cif", "mmcif"],
help=("Format of reference file. pdb reads pdb but also pdb.gz, same "
"applies to cif/mmcif. Inferred from filepath if not given."))
parser.add_argument(
"-rna",
"--residue-number-alignment",
dest="residue_number_alignment",
default=False,
action="store_true",
help=("Make alignment based on residue number instead of using "
"a global BLOSUM62-based alignment (NUC44 for nucleotides)."))
parser.add_argument(
"-ec",
"--enforce-consistency",
dest="enforce_consistency",
default=False,
action="store_true",
help=("Enforce consistency. By default residue name discrepancies "
"between a model and reference are reported but the program "
"proceeds. If this flag is ON, the program fails for these "
"cases."))
parser.add_argument(
"-d",
"--dump-structures",
dest="dump_structures",
default=False,
action="store_true",
help=("Dump cleaned structures used to calculate all the scores as "
"PDB files using specified suffix. Files will be dumped to the "
"same location as original files."))
parser.add_argument(
"-ds",
"--dump-suffix",
dest="dump_suffix",
default=".compare.structures.pdb",
help=("Use this suffix to dump structures.\n"
"Defaults to .compare.structures.pdb."))
parser.add_argument(
"--lddt",
dest="lddt",
default=False,
action="store_true",
help=("Compute global lDDT score with default parameterization and "
"store as key \"lddt\". Stereochemical irregularities affecting "
"lDDT are reported as keys \"model_clashes\", "
"\"model_bad_bonds\", \"model_bad_angles\" and the respective "
"reference counterparts."))
parser.add_argument(
"--local-lddt",
dest="local_lddt",
default=False,
action="store_true",
help=("Compute per-residue lDDT scores with default parameterization "
"and store as key \"local_lddt\". Score for model residue with "
"number 42 in chain X can be extracted with: "
"data[\"local_lddt\"][\"X\"][\"42\"]. Stereochemical irregularities "
"affecting lDDT are reported as keys \"model_clashes\", "
"\"model_bad_bonds\", \"model_bad_angles\" and the respective "
"reference counterparts."))
parser.add_argument(
"--cad-score",
dest="cad_score",
default=False,
action="store_true",
help=("Compute global CAD's atom-atom (AA) score and store as key "
"\"cad_score\". --residue-number-alignment must be enabled "
"to compute this score. Requires voronota_cadscore executable "
"in PATH. Alternatively you can set cad-exec."))
parser.add_argument(
"--local-cad-score",
dest="local_cad_score",
default=False,
action="store_true",
help=("Compute local CAD's atom-atom (AA) scores and store as key "
"\"local_cad_score\". Score for model residue with number 42 in "
"chain X can be extracted with: "
"data[\"local_cad_score\"][\"X\"][\"42\"]. "
"--residue-number-alignments must be enabled to compute this "
"score. Requires voronota_cadscore executable in PATH. "
"Alternatively you can set cad-exec."))
parser.add_argument(
"--cad-exec",
dest="cad_exec",
default=None,
help=("Path to voronota-cadscore executable (installed from "
"https://github.com/kliment-olechnovic/voronota). Searches PATH "
"if not set."))
parser.add_argument(
"--qs-score",
dest="qs_score",
default=False,
action="store_true",
help=("Compute QS-score, stored as key \"qs_global\", and the QS-best "
"variant, stored as key \"qs_best\"."))
parser.add_argument(
"--rigid-scores",
dest="rigid_scores",
default=False,
action="store_true",
help=("Computes rigid superposition based scores. They're based on a "
"Kabsch superposition of all mapped CA positions (C3' for "
"nucleotides). Makes the following keys available: "
"\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, "
"8.0] given these positions and transformation, \"oligo_gdtha\": "
"same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given "
"these positions and transformation, \"transform\": the used 4x4 "
"transformation matrix that superposes model onto reference."))
parser.add_argument(
"--interface-scores",
dest="interface_scores",
default=False,
action="store_true",
help=("Per interface scores for each interface that has at least one "
"contact in the reference, i.e. at least one pair of heavy atoms "
"within 5A. The respective interfaces are available from key "
"\"interfaces\" which is a list of tuples in form (ref_ch1, "
"ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available "
"as lists referring to these interfaces and have the following "
"keys: \"nnat\" (number of contacts in reference), \"nmdl\" "
"(number of contacts in model), \"fnat\" (fraction of reference "
"contacts which are also there in model), \"fnonnat\" (fraction "
"of model contacts which are not there in target), \"irmsd\" "
"(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" "
"(per-interface score computed from \"fnat\", \"irmsd\" and "
"\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" "
"(per-interface versions of the two QS-score variants). The "
"DockQ score is strictly designed to score each interface "
"individually. We also provide two averaged versions to get one "
"full model score: \"dockq_ave\", \"dockq_wave\". The first is "
"simply the average of \"dockq_scores\", the latter is a "
"weighted average with weights derived from \"nnat\". These two "
"scores only consider interfaces that are present in both, the "
"model and the reference. \"dockq_ave_full\" and "
"\"dockq_wave_full\" add zeros in the average computation for "
"each interface that is only present in the reference but not in "
"the model."))
parser.add_argument(
"--patch-scores",
dest="patch_scores",
default=False,
action="store_true",
help=("Local interface quality score used in CASP15. Scores each "
"model residue that is considered in the interface (CB pos "
"within 8A of any CB pos from another chain (CA for GLY)). The "
"local neighborhood gets represented by \"interface patches\" "
"which are scored with QS-score and DockQ. Scores where not "
"the full patches are represented by the reference are set to "
"None. Model interface residues are available as key "
"\"model_interface_residues\", reference interface residues as "
"key \"reference_interface_residues\". The respective scores are "
"available as keys \"patch_qs\" and \"patch_dockq\""))
return parser.parse_args()
def _LoadStructure(structure_path, sformat=None):
"""Read OST entity either from mmCIF or PDB.
The returned structure has structure_path attached as structure name
"""
if not os.path.exists(structure_path):
raise Exception(f"file not found: {structure_path}")
if sformat is None:
# Determine file format from suffix.
ext = structure_path.split(".")
if ext[-1] == "gz":
ext = ext[:-1]
if len(ext) <= 1:
raise Exception(f"Could not determine format of file "
f"{structure_path}.")
sformat = ext[-1].lower()
# increase loglevel, as we would pollute the info log with weird stuff
ost.PushVerbosityLevel(ost.LogLevel.Error)
# Load the structure
if sformat in ["mmcif", "cif"]:
entity = io.LoadMMCIF(structure_path)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
elif sformat == "pdb":
entity = io.LoadPDB(structure_path)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
else:
raise Exception(f"Unknown/ unsupported file extension found for "
f"file {structure_path}.")
# restore old loglevel and return
ost.PopVerbosityLevel()
entity.SetName(structure_path)
return entity
def _AlnToFastaStr(aln):
""" Returns alignment as fasta formatted string
"""
s1 = aln.GetSequence(0)
s2 = aln.GetSequence(1)
return f">reference:{s1.name}\n{str(s1)}\nmodel:{s2.name}\n{str(s2)}"
def _GetInconsistentResidues(alns):
lst = list()
for aln in alns:
for col in aln:
r1 = col.GetResidue(0)
r2 = col.GetResidue(1)
if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName():
lst.append((r1, r2))
lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst]
return lst
def _Process(model, reference, args):
scorer = scoring.Scorer(model, reference,
resnum_alignments = args.residue_number_alignment,
cad_score_exec = args.cad_exec)
ir = _GetInconsistentResidues(scorer.aln)
if len(ir) > 0 and args.enforce_consistency:
raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}")
out = dict()
out["chain_mapping"] = scorer.mapping.GetFlatMapping()
out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln]
out["inconsistent_residues"] = ir
if args.lddt:
out["lddt"] = scorer.lddt
if args.local_lddt:
out["local_lddt"] = scorer.local_lddt
if args.lddt or args.local_lddt:
out["model_clashes"] = scorer.model_clashes
out["model_bad_bonds"] = scorer.model_bad_bonds
out["model_bad_angles"] = scorer.model_bad_angles
out["reference_clashes"] = scorer.target_clashes
out["reference_bad_bonds"] = scorer.target_bad_bonds
out["reference_bad_angles"] = scorer.target_bad_angles
if args.cad_score:
out["cad_score"] = scorer.cad_score
if args.local_cad_score:
out["local_cad_score"] = scorer.local_cad_score
if args.qs_score:
out["qs_global"] = scorer.qs_global
out["qs_best"] = scorer.qs_best
if args.rigid_scores:
out["oligo_gdtts"] = scorer.gdtts
out["oligo_gdtha"] = scorer.gdtha
out["rmsd"] = scorer.rmsd
data = scorer.transform.data
out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)]
if args.interface_scores:
out["interfaces"] = scorer.interfaces
out["nnat"] = scorer.native_contacts
out["nmdl"] = scorer.model_contacts
out["fnat"] = scorer.fnat
out["fnonnat"] = scorer.fnonnat
out["irmsd"] = scorer.irmsd
out["lrmsd"] = scorer.lrmsd
out["dockq_scores"] = scorer.dockq_scores
out["interface_qs_global"] = scorer.interface_qs_global
out["interface_qs_best"] = scorer.interface_qs_best
out["dockq_ave"] = scorer.dockq_ave
out["dockq_wave"] = scorer.dockq_wave
out["dockq_ave_full"] = scorer.dockq_ave_full
out["dockq_wave_full"] = scorer.dockq_wave_full
if args.patch_scores:
out["model_interface_residues"] = scorer.model_interface_residues
out["reference_interface_residues"] = scorer.target_interface_residues
out["patch_qs"] = scorer.patch_qs
out["patch_dockq"] = scorer.patch_dockq
if args.dump_structures:
io.SavePDB(scorer.model, model.GetName() + args.dump_suffix)
io.SavePDB(scorer.target, reference.GetName() + args.dump_suffix)
return out
def _Main():
args = _ParseArgs()
try:
reference = _LoadStructure(args.reference,
sformat=args.reference_format)
model = _LoadStructure(args.model, sformat=args.model_format)
out = _Process(model, reference, args)
out["status"] = "SUCCESS"
except:
out = dict()
out["status"] = "FAILURE"
out["traceback"] = traceback.format_exc()
with open(args.output, 'w') as fh:
json.dump(out, fh, indent=4, sort_keys=False)
if __name__ == '__main__':
_Main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment