diff --git a/actions/CMakeLists.txt b/actions/CMakeLists.txt index ce6e23204e1a1b3b08332c428e78d29384c48808..6f37c8ec237b016701407484fd842f16622ded6a 100644 --- a/actions/CMakeLists.txt +++ b/actions/CMakeLists.txt @@ -2,3 +2,4 @@ add_custom_target(actions ALL) ost_action_init() ost_action(ost-compare-structures actions) +ost_action(ost-compare-structures-new actions) diff --git a/actions/ost-compare-structures-new b/actions/ost-compare-structures-new new file mode 100644 index 0000000000000000000000000000000000000000..a4d06461a432adb7ec333b4121a7a232986584a4 --- /dev/null +++ b/actions/ost-compare-structures-new @@ -0,0 +1,411 @@ +""" +Evaluate model against reference + +Example: ost compare-structures-new model.pdb reference.cif + +Loads the structures and performs basic cleanup: + + * Assign elements according to the PDB compound dictionary + * Map nonstandard residues to their parent residues as defined by the PDB + compound dictionary, e.g. phospho-serine => serine + * Remove hydrogens + * Remove OXT atoms + * Remove unknown atoms, i.e. atoms that are not expected according to the PDB + compound dictionary + +The cleaned structures are optionally dumped using -d/--dump-structures + +Output is written in JSON format (default: out.json). In case of no additional +options, this is a dictionary with four keys: + + * "chain_mapping": A dictionary with reference chain names as keys and the + mapped model chain names as values. + * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta + format. + * "inconsistent_residues": List of strings that represent name mismatches of + aligned residues in form + <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. + Inconsistencies may lead to corrupt results but do not abort the program. + Program abortion in these cases can be enforced with + -ec/--enforce-consistency. + * "status": SUCCESS if everything ran through. In case of failure, the only + content of the JSON output will be \"status\" set to FAILURE and an + additional key: "traceback". + +The pairwise sequence alignments are computed with Needleman-Wunsch using +BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the +structures to ensure matching residue numbers (CASP/CAMEO). In these cases, +enabling -rna/--residue-number-alignment is recommended. + +Each score is opt-in and can be enabled with optional arguments. + +Example to compute local and per-residue lDDT values as well as QS-score: + +ost compare-structures-new model.pdb reference.cif --lddt --local-lddt --qs-score +""" + +import argparse +import os +import json +import time +import sys +import traceback + +from ost import io +from ost.mol.alg import scoring + +def _ParseArgs(): + parser = argparse.ArgumentParser(description = __doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + prog = "ost compare-structures-new") + + parser.add_argument( + "model", + help=("Path to model file.")) + + parser.add_argument( + "reference", + help=("Path to reference file.")) + + parser.add_argument( + "-o", + "--output", + dest="output", + required=False, + default="out.json", + help=("Output file name. The output will be saved as a JSON file. " + "default: out.json")) + + parser.add_argument( + "-mf", + "--model-format", + dest="model_format", + required=False, + default=None, + choices=["pdb", "cif", "mmcif"], + help=("Format of model file. pdb reads pdb but also pdb.gz, same " + "applies to cif/mmcif. Inferred from filepath if not given.")) + + parser.add_argument( + "-rf", + "--reference-format", + dest="reference_format", + required=False, + default=None, + choices=["pdb", "cif", "mmcif"], + help=("Format of reference file. pdb reads pdb but also pdb.gz, same " + "applies to cif/mmcif. Inferred from filepath if not given.")) + + parser.add_argument( + "-rna", + "--residue-number-alignment", + dest="residue_number_alignment", + default=False, + action="store_true", + help=("Make alignment based on residue number instead of using " + "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) + + parser.add_argument( + "-ec", + "--enforce-consistency", + dest="enforce_consistency", + default=False, + action="store_true", + help=("Enforce consistency. By default residue name discrepancies " + "between a model and reference are reported but the program " + "proceeds. If this flag is ON, the program fails for these " + "cases.")) + + parser.add_argument( + "-d", + "--dump-structures", + dest="dump_structures", + default=False, + action="store_true", + help=("Dump cleaned structures used to calculate all the scores as " + "PDB files using specified suffix. Files will be dumped to the " + "same location as original files.")) + + parser.add_argument( + "-ds", + "--dump-suffix", + dest="dump_suffix", + default=".compare.structures.pdb", + help=("Use this suffix to dump structures.\n" + "Defaults to .compare.structures.pdb.")) + + parser.add_argument( + "--lddt", + dest="lddt", + default=False, + action="store_true", + help=("Compute global lDDT score with default parameterization and " + "store as key \"lddt\". Stereochemical irregularities affecting " + "lDDT are reported as keys \"model_clashes\", " + "\"model_bad_bonds\", \"model_bad_angles\" and the respective " + "reference counterparts.")) + + parser.add_argument( + "--local-lddt", + dest="local_lddt", + default=False, + action="store_true", + help=("Compute per-residue lDDT scores with default parameterization " + "and store as key \"local_lddt\". Score for model residue with " + "number 42 in chain X can be extracted with: " + "data[\"local_lddt\"][\"X\"][\"42\"]. Stereochemical irregularities " + "affecting lDDT are reported as keys \"model_clashes\", " + "\"model_bad_bonds\", \"model_bad_angles\" and the respective " + "reference counterparts.")) + + parser.add_argument( + "--cad-score", + dest="cad_score", + default=False, + action="store_true", + help=("Compute global CAD's atom-atom (AA) score and store as key " + "\"cad_score\". --residue-number-alignment must be enabled " + "to compute this score. Requires voronota_cadscore executable " + "in PATH. Alternatively you can set cad-exec.")) + + parser.add_argument( + "--local-cad-score", + dest="local_cad_score", + default=False, + action="store_true", + help=("Compute local CAD's atom-atom (AA) scores and store as key " + "\"local_cad_score\". Score for model residue with number 42 in " + "chain X can be extracted with: " + "data[\"local_cad_score\"][\"X\"][\"42\"]. " + "--residue-number-alignments must be enabled to compute this " + "score. Requires voronota_cadscore executable in PATH. " + "Alternatively you can set cad-exec.")) + + parser.add_argument( + "--cad-exec", + dest="cad_exec", + default=None, + help=("Path to voronota-cadscore executable (installed from " + "https://github.com/kliment-olechnovic/voronota). Searches PATH " + "if not set.")) + + parser.add_argument( + "--qs-score", + dest="qs_score", + default=False, + action="store_true", + help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " + "variant, stored as key \"qs_best\".")) + + parser.add_argument( + "--rigid-scores", + dest="rigid_scores", + default=False, + action="store_true", + help=("Computes rigid superposition based scores. They're based on a " + "Kabsch superposition of all mapped CA positions (C3' for " + "nucleotides). Makes the following keys available: " + "\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, " + "8.0] given these positions and transformation, \"oligo_gdtha\": " + "same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given " + "these positions and transformation, \"transform\": the used 4x4 " + "transformation matrix that superposes model onto reference.")) + + parser.add_argument( + "--interface-scores", + dest="interface_scores", + default=False, + action="store_true", + help=("Per interface scores for each interface that has at least one " + "contact in the reference, i.e. at least one pair of heavy atoms " + "within 5A. The respective interfaces are available from key " + "\"interfaces\" which is a list of tuples in form (ref_ch1, " + "ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available " + "as lists referring to these interfaces and have the following " + "keys: \"nnat\" (number of contacts in reference), \"nmdl\" " + "(number of contacts in model), \"fnat\" (fraction of reference " + "contacts which are also there in model), \"fnonnat\" (fraction " + "of model contacts which are not there in target), \"irmsd\" " + "(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" " + "(per-interface score computed from \"fnat\", \"irmsd\" and " + "\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" " + "(per-interface versions of the two QS-score variants). The " + "DockQ score is strictly designed to score each interface " + "individually. We also provide two averaged versions to get one " + "full model score: \"dockq_ave\", \"dockq_wave\". The first is " + "simply the average of \"dockq_scores\", the latter is a " + "weighted average with weights derived from \"nnat\". These two " + "scores only consider interfaces that are present in both, the " + "model and the reference. \"dockq_ave_full\" and " + "\"dockq_wave_full\" add zeros in the average computation for " + "each interface that is only present in the reference but not in " + "the model.")) + + parser.add_argument( + "--patch-scores", + dest="patch_scores", + default=False, + action="store_true", + help=("Local interface quality score used in CASP15. Scores each " + "model residue that is considered in the interface (CB pos " + "within 8A of any CB pos from another chain (CA for GLY)). The " + "local neighborhood gets represented by \"interface patches\" " + "which are scored with QS-score and DockQ. Scores where not " + "the full patches are represented by the reference are set to " + "None. Model interface residues are available as key " + "\"model_interface_residues\", reference interface residues as " + "key \"reference_interface_residues\". The respective scores are " + "available as keys \"patch_qs\" and \"patch_dockq\"")) + + return parser.parse_args() + +def _LoadStructure(structure_path, sformat=None): + """Read OST entity either from mmCIF or PDB. + + The returned structure has structure_path attached as structure name + """ + + if not os.path.exists(structure_path): + raise Exception(f"file not found: {structure_path}") + + if sformat is None: + # Determine file format from suffix. + ext = structure_path.split(".") + if ext[-1] == "gz": + ext = ext[:-1] + if len(ext) <= 1: + raise Exception(f"Could not determine format of file " + f"{structure_path}.") + sformat = ext[-1].lower() + + # increase loglevel, as we would pollute the info log with weird stuff + ost.PushVerbosityLevel(ost.LogLevel.Error) + # Load the structure + if sformat in ["mmcif", "cif"]: + entity = io.LoadMMCIF(structure_path) + if len(entity.residues) == 0: + raise Exception(f"No residues found in file: {structure_path}") + elif sformat == "pdb": + entity = io.LoadPDB(structure_path) + if len(entity.residues) == 0: + raise Exception(f"No residues found in file: {structure_path}") + else: + raise Exception(f"Unknown/ unsupported file extension found for " + f"file {structure_path}.") + # restore old loglevel and return + ost.PopVerbosityLevel() + entity.SetName(structure_path) + return entity + +def _AlnToFastaStr(aln): + """ Returns alignment as fasta formatted string + """ + s1 = aln.GetSequence(0) + s2 = aln.GetSequence(1) + return f">reference:{s1.name}\n{str(s1)}\nmodel:{s2.name}\n{str(s2)}" + +def _GetInconsistentResidues(alns): + lst = list() + for aln in alns: + for col in aln: + r1 = col.GetResidue(0) + r2 = col.GetResidue(1) + if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): + lst.append((r1, r2)) + lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst] + return lst + +def _Process(model, reference, args): + + + scorer = scoring.Scorer(model, reference, + resnum_alignments = args.residue_number_alignment, + cad_score_exec = args.cad_exec) + + ir = _GetInconsistentResidues(scorer.aln) + if len(ir) > 0 and args.enforce_consistency: + raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") + + out = dict() + out["chain_mapping"] = scorer.mapping.GetFlatMapping() + out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] + out["inconsistent_residues"] = ir + + if args.lddt: + out["lddt"] = scorer.lddt + + if args.local_lddt: + out["local_lddt"] = scorer.local_lddt + + if args.lddt or args.local_lddt: + out["model_clashes"] = scorer.model_clashes + out["model_bad_bonds"] = scorer.model_bad_bonds + out["model_bad_angles"] = scorer.model_bad_angles + out["reference_clashes"] = scorer.target_clashes + out["reference_bad_bonds"] = scorer.target_bad_bonds + out["reference_bad_angles"] = scorer.target_bad_angles + + if args.cad_score: + out["cad_score"] = scorer.cad_score + + if args.local_cad_score: + out["local_cad_score"] = scorer.local_cad_score + + if args.qs_score: + out["qs_global"] = scorer.qs_global + out["qs_best"] = scorer.qs_best + + if args.rigid_scores: + out["oligo_gdtts"] = scorer.gdtts + out["oligo_gdtha"] = scorer.gdtha + out["rmsd"] = scorer.rmsd + data = scorer.transform.data + out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] + + if args.interface_scores: + out["interfaces"] = scorer.interfaces + out["nnat"] = scorer.native_contacts + out["nmdl"] = scorer.model_contacts + out["fnat"] = scorer.fnat + out["fnonnat"] = scorer.fnonnat + out["irmsd"] = scorer.irmsd + out["lrmsd"] = scorer.lrmsd + out["dockq_scores"] = scorer.dockq_scores + out["interface_qs_global"] = scorer.interface_qs_global + out["interface_qs_best"] = scorer.interface_qs_best + out["dockq_ave"] = scorer.dockq_ave + out["dockq_wave"] = scorer.dockq_wave + out["dockq_ave_full"] = scorer.dockq_ave_full + out["dockq_wave_full"] = scorer.dockq_wave_full + + if args.patch_scores: + out["model_interface_residues"] = scorer.model_interface_residues + out["reference_interface_residues"] = scorer.target_interface_residues + out["patch_qs"] = scorer.patch_qs + out["patch_dockq"] = scorer.patch_dockq + + if args.dump_structures: + io.SavePDB(scorer.model, model.GetName() + args.dump_suffix) + io.SavePDB(scorer.target, reference.GetName() + args.dump_suffix) + + return out + +def _Main(): + + args = _ParseArgs() + try: + reference = _LoadStructure(args.reference, + sformat=args.reference_format) + model = _LoadStructure(args.model, sformat=args.model_format) + out = _Process(model, reference, args) + out["status"] = "SUCCESS" + except: + out = dict() + out["status"] = "FAILURE" + out["traceback"] = traceback.format_exc() + + with open(args.output, 'w') as fh: + json.dump(out, fh, indent=4, sort_keys=False) + +if __name__ == '__main__': + _Main()