""" Evaluate model against reference Example: ost compare-structures-new model.pdb reference.cif Loads the structures and performs basic cleanup: * Assign elements according to the PDB compound dictionary * Map nonstandard residues to their parent residues as defined by the PDB compound dictionary, e.g. phospho-serine => serine * Remove hydrogens * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB compound dictionary The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional options, this is a dictionary with four keys: * "chain_mapping": A dictionary with reference chain names as keys and the mapped model chain names as values. * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. Inconsistencies may lead to corrupt results but do not abort the program. Program abortion in these cases can be enforced with -ec/--enforce-consistency. * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be \"status\" set to FAILURE and an additional key: "traceback". The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, enabling -rna/--residue-number-alignment is recommended. Each score is opt-in and can be enabled with optional arguments. Example to compute local and per-residue lDDT values as well as QS-score: ost compare-structures-new model.pdb reference.cif --lddt --local-lddt --qs-score """ import argparse import os import json import time import sys import traceback from ost import io from ost.mol.alg import scoring def _ParseArgs(): parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.RawDescriptionHelpFormatter, prog = "ost compare-structures-new") parser.add_argument( "model", help=("Path to model file.")) parser.add_argument( "reference", help=("Path to reference file.")) parser.add_argument( "-o", "--output", dest="output", required=False, default="out.json", help=("Output file name. The output will be saved as a JSON file. " "default: out.json")) parser.add_argument( "-mf", "--model-format", dest="model_format", required=False, default=None, choices=["pdb", "cif", "mmcif"], help=("Format of model file. pdb reads pdb but also pdb.gz, same " "applies to cif/mmcif. Inferred from filepath if not given.")) parser.add_argument( "-rf", "--reference-format", dest="reference_format", required=False, default=None, choices=["pdb", "cif", "mmcif"], help=("Format of reference file. pdb reads pdb but also pdb.gz, same " "applies to cif/mmcif. Inferred from filepath if not given.")) parser.add_argument( "-rna", "--residue-number-alignment", dest="residue_number_alignment", default=False, action="store_true", help=("Make alignment based on residue number instead of using " "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) parser.add_argument( "-ec", "--enforce-consistency", dest="enforce_consistency", default=False, action="store_true", help=("Enforce consistency. By default residue name discrepancies " "between a model and reference are reported but the program " "proceeds. If this flag is ON, the program fails for these " "cases.")) parser.add_argument( "-d", "--dump-structures", dest="dump_structures", default=False, action="store_true", help=("Dump cleaned structures used to calculate all the scores as " "PDB files using specified suffix. Files will be dumped to the " "same location as original files.")) parser.add_argument( "-ds", "--dump-suffix", dest="dump_suffix", default=".compare.structures.pdb", help=("Use this suffix to dump structures.\n" "Defaults to .compare.structures.pdb.")) parser.add_argument( "--lddt", dest="lddt", default=False, action="store_true", help=("Compute global lDDT score with default parameterization and " "store as key \"lddt\". Stereochemical irregularities affecting " "lDDT are reported as keys \"model_clashes\", " "\"model_bad_bonds\", \"model_bad_angles\" and the respective " "reference counterparts.")) parser.add_argument( "--local-lddt", dest="local_lddt", default=False, action="store_true", help=("Compute per-residue lDDT scores with default parameterization " "and store as key \"local_lddt\". Score for model residue with " "number 42 in chain X can be extracted with: " "data[\"local_lddt\"][\"X\"][\"42\"]. Stereochemical irregularities " "affecting lDDT are reported as keys \"model_clashes\", " "\"model_bad_bonds\", \"model_bad_angles\" and the respective " "reference counterparts.")) parser.add_argument( "--cad-score", dest="cad_score", default=False, action="store_true", help=("Compute global CAD's atom-atom (AA) score and store as key " "\"cad_score\". --residue-number-alignment must be enabled " "to compute this score. Requires voronota_cadscore executable " "in PATH. Alternatively you can set cad-exec.")) parser.add_argument( "--local-cad-score", dest="local_cad_score", default=False, action="store_true", help=("Compute local CAD's atom-atom (AA) scores and store as key " "\"local_cad_score\". Score for model residue with number 42 in " "chain X can be extracted with: " "data[\"local_cad_score\"][\"X\"][\"42\"]. " "--residue-number-alignments must be enabled to compute this " "score. Requires voronota_cadscore executable in PATH. " "Alternatively you can set cad-exec.")) parser.add_argument( "--cad-exec", dest="cad_exec", default=None, help=("Path to voronota-cadscore executable (installed from " "https://github.com/kliment-olechnovic/voronota). Searches PATH " "if not set.")) parser.add_argument( "--qs-score", dest="qs_score", default=False, action="store_true", help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " "variant, stored as key \"qs_best\".")) parser.add_argument( "--rigid-scores", dest="rigid_scores", default=False, action="store_true", help=("Computes rigid superposition based scores. They're based on a " "Kabsch superposition of all mapped CA positions (C3' for " "nucleotides). Makes the following keys available: " "\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, " "8.0] given these positions and transformation, \"oligo_gdtha\": " "same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given " "these positions and transformation, \"transform\": the used 4x4 " "transformation matrix that superposes model onto reference.")) parser.add_argument( "--interface-scores", dest="interface_scores", default=False, action="store_true", help=("Per interface scores for each interface that has at least one " "contact in the reference, i.e. at least one pair of heavy atoms " "within 5A. The respective interfaces are available from key " "\"interfaces\" which is a list of tuples in form (ref_ch1, " "ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available " "as lists referring to these interfaces and have the following " "keys: \"nnat\" (number of contacts in reference), \"nmdl\" " "(number of contacts in model), \"fnat\" (fraction of reference " "contacts which are also there in model), \"fnonnat\" (fraction " "of model contacts which are not there in target), \"irmsd\" " "(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" " "(per-interface score computed from \"fnat\", \"irmsd\" and " "\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" " "(per-interface versions of the two QS-score variants). The " "DockQ score is strictly designed to score each interface " "individually. We also provide two averaged versions to get one " "full model score: \"dockq_ave\", \"dockq_wave\". The first is " "simply the average of \"dockq_scores\", the latter is a " "weighted average with weights derived from \"nnat\". These two " "scores only consider interfaces that are present in both, the " "model and the reference. \"dockq_ave_full\" and " "\"dockq_wave_full\" add zeros in the average computation for " "each interface that is only present in the reference but not in " "the model.")) parser.add_argument( "--patch-scores", dest="patch_scores", default=False, action="store_true", help=("Local interface quality score used in CASP15. Scores each " "model residue that is considered in the interface (CB pos " "within 8A of any CB pos from another chain (CA for GLY)). The " "local neighborhood gets represented by \"interface patches\" " "which are scored with QS-score and DockQ. Scores where not " "the full patches are represented by the reference are set to " "None. Model interface residues are available as key " "\"model_interface_residues\", reference interface residues as " "key \"reference_interface_residues\". The respective scores are " "available as keys \"patch_qs\" and \"patch_dockq\"")) return parser.parse_args() def _LoadStructure(structure_path, sformat=None): """Read OST entity either from mmCIF or PDB. The returned structure has structure_path attached as structure name """ if not os.path.exists(structure_path): raise Exception(f"file not found: {structure_path}") if sformat is None: # Determine file format from suffix. ext = structure_path.split(".") if ext[-1] == "gz": ext = ext[:-1] if len(ext) <= 1: raise Exception(f"Could not determine format of file " f"{structure_path}.") sformat = ext[-1].lower() # increase loglevel, as we would pollute the info log with weird stuff ost.PushVerbosityLevel(ost.LogLevel.Error) # Load the structure if sformat in ["mmcif", "cif"]: entity = io.LoadMMCIF(structure_path) if len(entity.residues) == 0: raise Exception(f"No residues found in file: {structure_path}") elif sformat == "pdb": entity = io.LoadPDB(structure_path) if len(entity.residues) == 0: raise Exception(f"No residues found in file: {structure_path}") else: raise Exception(f"Unknown/ unsupported file extension found for " f"file {structure_path}.") # restore old loglevel and return ost.PopVerbosityLevel() entity.SetName(structure_path) return entity def _AlnToFastaStr(aln): """ Returns alignment as fasta formatted string """ s1 = aln.GetSequence(0) s2 = aln.GetSequence(1) return f">reference:{s1.name}\n{str(s1)}\nmodel:{s2.name}\n{str(s2)}" def _GetInconsistentResidues(alns): lst = list() for aln in alns: for col in aln: r1 = col.GetResidue(0) r2 = col.GetResidue(1) if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): lst.append((r1, r2)) lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst] return lst def _Process(model, reference, args): scorer = scoring.Scorer(model, reference, resnum_alignments = args.residue_number_alignment, cad_score_exec = args.cad_exec) ir = _GetInconsistentResidues(scorer.aln) if len(ir) > 0 and args.enforce_consistency: raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") out = dict() out["chain_mapping"] = scorer.mapping.GetFlatMapping() out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] out["inconsistent_residues"] = ir if args.lddt: out["lddt"] = scorer.lddt if args.local_lddt: out["local_lddt"] = scorer.local_lddt if args.lddt or args.local_lddt: out["model_clashes"] = scorer.model_clashes out["model_bad_bonds"] = scorer.model_bad_bonds out["model_bad_angles"] = scorer.model_bad_angles out["reference_clashes"] = scorer.target_clashes out["reference_bad_bonds"] = scorer.target_bad_bonds out["reference_bad_angles"] = scorer.target_bad_angles if args.cad_score: out["cad_score"] = scorer.cad_score if args.local_cad_score: out["local_cad_score"] = scorer.local_cad_score if args.qs_score: out["qs_global"] = scorer.qs_global out["qs_best"] = scorer.qs_best if args.rigid_scores: out["oligo_gdtts"] = scorer.gdtts out["oligo_gdtha"] = scorer.gdtha out["rmsd"] = scorer.rmsd data = scorer.transform.data out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] if args.interface_scores: out["interfaces"] = scorer.interfaces out["nnat"] = scorer.native_contacts out["nmdl"] = scorer.model_contacts out["fnat"] = scorer.fnat out["fnonnat"] = scorer.fnonnat out["irmsd"] = scorer.irmsd out["lrmsd"] = scorer.lrmsd out["dockq_scores"] = scorer.dockq_scores out["interface_qs_global"] = scorer.interface_qs_global out["interface_qs_best"] = scorer.interface_qs_best out["dockq_ave"] = scorer.dockq_ave out["dockq_wave"] = scorer.dockq_wave out["dockq_ave_full"] = scorer.dockq_ave_full out["dockq_wave_full"] = scorer.dockq_wave_full if args.patch_scores: out["model_interface_residues"] = scorer.model_interface_residues out["reference_interface_residues"] = scorer.target_interface_residues out["patch_qs"] = scorer.patch_qs out["patch_dockq"] = scorer.patch_dockq if args.dump_structures: io.SavePDB(scorer.model, model.GetName() + args.dump_suffix) io.SavePDB(scorer.target, reference.GetName() + args.dump_suffix) return out def _Main(): args = _ParseArgs() try: reference = _LoadStructure(args.reference, sformat=args.reference_format) model = _LoadStructure(args.model, sformat=args.model_format) out = _Process(model, reference, args) out["status"] = "SUCCESS" with open(args.output, 'w') as fh: json.dump(out, fh, indent=4, sort_keys=False) except: out = dict() out["status"] = "FAILURE" out["traceback"] = traceback.format_exc() with open(args.output, 'w') as fh: json.dump(out, fh, indent=4, sort_keys=False) raise if __name__ == '__main__': _Main()