From ba9fe7fcc3e0150eec57750722beb729eff99748 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Mon, 6 Mar 2023 10:22:02 +0100 Subject: [PATCH] deprecate old compare-structures --- CHANGELOG.txt | 2 + actions/CMakeLists.txt | 2 +- actions/ost-compare-structures | 1421 +++++++++---------------- actions/ost-compare-structures-legacy | 1006 +++++++++++++++++ actions/ost-compare-structures-new | 577 ---------- modules/doc/actions.rst | 750 ++++--------- modules/doc/deprecated_actions.rst | 599 +++++++++++ 7 files changed, 2291 insertions(+), 2066 deletions(-) create mode 100644 actions/ost-compare-structures-legacy delete mode 100644 actions/ost-compare-structures-new create mode 100644 modules/doc/deprecated_actions.rst diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 135c5a693..8c72c3cbb 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -22,6 +22,8 @@ Changes in Release 2.4.0 OpenStructure specific scoring capabilities including required pre-processing of model/reference (cleanup, stereochemistry checks, chain mapping etc.). + * Re-write of compare-structures action to include newly developed chain + mapping and scores. The old action is available as compare-structures-legacy. * Several minor bug fixes and improvements. Changes in Release 2.3.1 diff --git a/actions/CMakeLists.txt b/actions/CMakeLists.txt index abd002cc2..f4e599e28 100644 --- a/actions/CMakeLists.txt +++ b/actions/CMakeLists.txt @@ -3,4 +3,4 @@ add_custom_target(actions ALL) ost_action_init() ost_action(ost-compare-ligand-structures actions) ost_action(ost-compare-structures actions) -ost_action(ost-compare-structures-new actions) +ost_action(ost-compare-structures-legacy actions) diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index 7aaaf6914..c1a263206 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -1,1006 +1,577 @@ -"""Evaluate model structure against reference. - -eg. - - ost compare-structures \\ - --model <MODEL> \\ - --reference <REF> \\ - --output output.json \\ - --lddt \\ - --structural-checks \\ - --consistency-checks \\ - --molck \\ - --remove oxt hyd \\ - --map-nonstandard-residues - -Here we describe how the parameters can be set to mimick a CAMEO evaluation -(as of August 2018). - -CAMEO calls the lddt binary as follows: - - lddt \\ - -p <PARAMETER FILE> \\ - -f \\ - -a 15 \\ - -b 15 \\ - -r 15 \\ - <MODEL> \\ - <REF> - -Only model structures are "Molck-ed" in CAMEO. The call to molck is as follows: - - molck \\ - --complib=<COMPOUND LIB> \\ - --rm=hyd,oxt,unk,nonstd \\ - --fix-ele \\ - --map-nonstd \\ - --out=<OUTPUT> \\ - <FILEPATH> - -To be as much compatible with with CAMEO as possible one should call -compare-structures as follows: - - ost compare-structures \\ - --model <MODEL> \\ - --reference <REF> \\ - --output output.json \\ - --molck \\ - --remove oxt hyd unk nonstd \\ - --clean-element-column \\ - --map-nonstandard-residues \\ - --structural-checks \\ - --bond-tolerance 15.0 \\ - --angle-tolerance 15.0 \\ - --residue-number-alignment \\ - --consistency-checks \\ - --qs-score \\ - --lddt \\ - --inclusion-radius 15.0 """ +Evaluate model against reference -import os -import sys -import json -import argparse +Example: ost compare-structures -m model.pdb -r reference.cif -import ost -from ost.io import (LoadPDB, LoadMMCIF, SavePDB, MMCifInfoBioUnit, MMCifInfo, - MMCifInfoTransOp, ReadStereoChemicalPropsFile, profiles) -from ost import PushVerbosityLevel -from ost.mol.alg import (qsscoring, Molck, MolckSettings, lDDTSettings, - CheckStructure, ResidueNamesMatch) -from ost.conop import (CompoundLib, SetDefaultLib, GetDefaultLib, - RuleBasedProcessor) -from ost.seq.alg.renumber import Renumber +Loads the structures and performs basic cleanup: + * Assign elements according to the PDB compound dictionary + * Map nonstandard residues to their parent residues as defined by the PDB + compound dictionary, e.g. phospho-serine => serine + * Remove hydrogens + * Remove OXT atoms + * Remove unknown atoms, i.e. atoms that are not expected according to the PDB + compound dictionary -def _GetDefaultShareFilePath(filename): - """Look for filename in working directory and OST shared data path. - :return: Path to valid file or None if not found. - """ - # Try current directory - cwd = os.path.abspath(os.getcwd()) - file_path = os.path.join(cwd, filename) - if not os.path.isfile(file_path): - try: - file_path = os.path.join(ost.GetSharedDataPath(), filename) - except RuntimeError: - # Ignore errors here (caught later together with non-existing file) - pass - if not os.path.isfile(file_path): - file_path = None - # Either file_path is valid file path or None - return file_path - -def _GetDefaultParameterFilePath(): - # Try to get in default locations - parameter_file_path = _GetDefaultShareFilePath("stereo_chemical_props.txt") - if parameter_file_path is None: - msg = ( - "Could not set default stereochemical parameter file. In " - "order to use the default one please set $OST_ROOT " - "environmental variable, run the script with OST binary or" - " provide a local copy of 'stereo_chemical_props.txt' in " - "CWD. Alternatively provide the path to the local copy.") - else: - msg = "" - return parameter_file_path, msg - -def _GetDefaultCompoundLibraryPath(): - # Try to get in default locations - compound_library_path = _GetDefaultShareFilePath("compounds.chemlib") - if compound_library_path is None: - msg = ( - "Could not set default compounds library path. In " - "order to use the default one please set $OST_ROOT " - "environmental variable, run the script with OST binary or" - " provide a local copy of 'compounds.chemlib' in CWD" - ". Alternatively provide the path to the local copy.") - else: - msg = "" - return compound_library_path, msg +The cleaned structures are optionally dumped using -d/--dump-structures -def _ParseArgs(): - """Parse command-line arguments.""" +Output is written in JSON format (default: out.json). In case of no additional +options, this is a dictionary with five keys: + + * "chain_mapping": A dictionary with reference chain names as keys and the + mapped model chain names as values. + * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta + format. + * "chem_groups": Groups of polypeptides/polynucleotides that are considered + chemically equivalent. You can derive stoichiometry from this. + * "inconsistent_residues": List of strings that represent name mismatches of + aligned residues in form + <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. + Inconsistencies may lead to corrupt results but do not abort the program. + Program abortion in these cases can be enforced with + -ec/--enforce-consistency. + * "status": SUCCESS if everything ran through. In case of failure, the only + content of the JSON output will be \"status\" set to FAILURE and an + additional key: "traceback". + +The pairwise sequence alignments are computed with Needleman-Wunsch using +BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the +structures to ensure matching residue numbers (CASP/CAMEO). In these cases, +enabling -rna/--residue-number-alignment is recommended. + +Each score is opt-in and can be enabled with optional arguments. + +Example to compute local and per-residue lDDT values as well as QS-score: + +ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt \ +--qs-score - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - description=__doc__, - prog="ost compare-structures") +Example to inject custom chain mapping - # - # Required arguments - # +ost compare-structures -m model.pdb -r reference.cif -c A:B B:A +""" + +import argparse +import os +import json +import time +import sys +import traceback + +from ost import io +from ost.mol.alg import scoring - group_required = parser.add_argument_group('required arguments') +def _ParseArgs(): + parser = argparse.ArgumentParser(description = __doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + prog = "ost compare-structures") - group_required.add_argument( + parser.add_argument( "-m", "--model", dest="model", required=True, - help=("Path to the model file.")) - group_required.add_argument( + help=("Path to model file.")) + + parser.add_argument( "-r", "--reference", dest="reference", required=True, - help=("Path to the reference file.")) - - # - # General arguments - # - - group_general = parser.add_argument_group('general arguments') + help=("Path to reference file.")) - group_general.add_argument( - '-v', - '--verbosity', - type=int, - default=3, - help="Set verbosity level. Defaults to 3.") - group_general.add_argument( + parser.add_argument( "-o", "--output", dest="output", - help=("Output file name. The output will be saved as a JSON file.")) - group_general.add_argument( + required=False, + default="out.json", + help=("Output file name. The output will be saved as a JSON file. " + "default: out.json")) + + parser.add_argument( + "-mf", + "--model-format", + dest="model_format", + required=False, + default=None, + choices=["pdb", "cif", "mmcif"], + help=("Format of model file. pdb reads pdb but also pdb.gz, same " + "applies to cif/mmcif. Inferred from filepath if not given.")) + + parser.add_argument( + "-rf", + "--reference-format", + dest="reference_format", + required=False, + default=None, + choices=["pdb", "cif", "mmcif"], + help=("Format of reference file. pdb reads pdb but also pdb.gz, same " + "applies to cif/mmcif. Inferred from filepath if not given.")) + + parser.add_argument( + "-mb", + "--model-biounit", + dest="model_biounit", + required=False, + default=None, + type=int, + help=("Only has an effect if model is in mmcif format. By default, " + "the assymetric unit (AU) is used for scoring. If there are " + "biounits defined in the mmcif file, you can specify the index " + "of the one which should be used.")) + + parser.add_argument( + "-rb", + "--reference-biounit", + dest="reference_biounit", + required=False, + default=None, + type=int, + help=("Only has an effect if reference is in mmcif format. By default, " + "the assymetric unit (AU) is used for scoring. If there are " + "biounits defined in the mmcif file, you can specify the index " + "of the one which should be used.")) + + parser.add_argument( + "-rna", + "--residue-number-alignment", + dest="residue_number_alignment", + default=False, + action="store_true", + help=("Make alignment based on residue number instead of using " + "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) + + parser.add_argument( + "-ec", + "--enforce-consistency", + dest="enforce_consistency", + default=False, + action="store_true", + help=("Enforce consistency. By default residue name discrepancies " + "between a model and reference are reported but the program " + "proceeds. If this flag is ON, the program fails for these " + "cases.")) + + parser.add_argument( "-d", "--dump-structures", dest="dump_structures", default=False, action="store_true", - help=("Dump cleaned structures used to calculate all the scores as\n" - "PDB files using specified suffix. Files will be dumped to the\n" + help=("Dump cleaned structures used to calculate all the scores as " + "PDB files using specified suffix. Files will be dumped to the " "same location as original files.")) - group_general.add_argument( + + parser.add_argument( "-ds", "--dump-suffix", dest="dump_suffix", default=".compare.structures.pdb", help=("Use this suffix to dump structures.\n" "Defaults to .compare.structures.pdb.")) - group_general.add_argument( - "-rs", - "--reference-selection", - dest="reference_selection", - default="", - help=("Selection performed on reference structures.")) - group_general.add_argument( - "-ms", - "--model-selection", - dest="model_selection", - default="", - help=("Selection performed on model structures.")) - group_general.add_argument( - "-ca", - "--c-alpha-only", - dest="c_alpha_only", - default=False, - action="store_true", - help=("Use C-alpha atoms only. Equivalent of calling the action with\n" - "'--model-selection=\"aname=CA\" " - "--reference-selection=\"aname=CA\"'\noptions.")) - group_general.add_argument( + + parser.add_argument( "-ft", "--fault-tolerant", dest="fault_tolerant", default=False, action="store_true", help=("Fault tolerant parsing.")) - group_general.add_argument( - "-cl", - "--compound-library", - dest="compound_library", - default=None, - help=("Location of the compound library file (compounds.chemlib).\n" - "If not provided, the following locations are searched in this\n" - "order: 1. Working directory, 2. OpenStructure standard library" - "\nlocation.")) - - # - # Molecular check arguments - # - group_molck = parser.add_argument_group('molecular check arguments') + parser.add_argument( + "-c", + "--chain-mapping", + nargs="+", + dest="chain_mapping", + help=("Custom mapping of chains between the reference and the model. " + "Each separate mapping consist of key:value pairs where key " + "is the chain name in reference and value is the chain name in " + "model.")) - group_molck.add_argument( - "-ml", - "--molck", - dest="molck", + parser.add_argument( + "--lddt", + dest="lddt", default=False, action="store_true", - help=("Run molecular checker to clean up input.")) - group_molck.add_argument( - "-rm", - "--remove", - dest="remove", - nargs="+", # *, +, ?, N - required=False, - default=["hyd"], - help=("Remove atoms and residues matching some criteria:\n" - " * zeroocc - Remove atoms with zero occupancy\n" - " * hyd - remove hydrogen atoms\n" - " * oxt - remove terminal oxygens\n" - " * nonstd - remove all residues not one of the 20\n" - " standard amino acids\n" - " * unk - Remove unknown and atoms not following the\n" - " nomenclature\n" - "Defaults to hyd.")) - group_molck.add_argument( - "-ce", - "--clean-element-column", - dest="clean_element_column", + help=("Compute global lDDT score with default parameterization and " + "store as key \"lddt\". Stereochemical irregularities affecting " + "lDDT are reported as keys \"model_clashes\", " + "\"model_bad_bonds\", \"model_bad_angles\" and the respective " + "reference counterparts.")) + + parser.add_argument( + "--local-lddt", + dest="local_lddt", default=False, action="store_true", - help=("Clean up element column")) - group_molck.add_argument( - "-mn", - "--map-nonstandard-residues", - dest="map_nonstandard_residues", + help=("Compute per-residue lDDT scores with default parameterization " + "and store as key \"local_lddt\". Score for model residue with " + "number 42 in chain X can be extracted with: " + "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion " + "code, lets say A, the last key becomes \"42A\" Stereochemical " + "irregularities affecting lDDT are reported as keys " + "\"model_clashes\", \"model_bad_bonds\", \"model_bad_angles\" " + "and the respective reference counterparts.")) + + parser.add_argument( + "--cad-score", + dest="cad_score", default=False, action="store_true", - help=("Map modified residues back to the parent amino acid, for\n" - "example MSE -> MET, SEP -> SER.")) - - # - # Structural check arguments - # - - group_sc = parser.add_argument_group('structural check arguments') - - group_sc.add_argument( - "-sc", - "--structural-checks", - dest="structural_checks", + help=("Compute global CAD's atom-atom (AA) score and store as key " + "\"cad_score\". --residue-number-alignment must be enabled " + "to compute this score. Requires voronota_cadscore executable " + "in PATH. Alternatively you can set cad-exec.")) + + parser.add_argument( + "--local-cad-score", + dest="local_cad_score", default=False, action="store_true", - help=("Perform structural checks and filter input data.")) - group_sc.add_argument( - "-p", - "--parameter-file", - dest="parameter_file", + help=("Compute local CAD's atom-atom (AA) scores and store as key " + "\"local_cad_score\". Score for model residue with number 42 in " + "chain X can be extracted with: " + "data[\"local_cad_score\"][\"X\"][\"42\"]. " + "--residue-number-alignments must be enabled to compute this " + "score. Requires voronota_cadscore executable in PATH. " + "Alternatively you can set cad-exec.")) + + parser.add_argument( + "--cad-exec", + dest="cad_exec", default=None, - help=("Location of the stereochemical parameter file\n" - "(stereo_chemical_props.txt).\n" - "If not provided, the following locations are searched in this\n" - "order: 1. Working directory, 2. OpenStructure standard library" - "\nlocation.")) - group_sc.add_argument( - "-bt", - "--bond-tolerance", - dest="bond_tolerance", - type=float, - default=12.0, - help=("Tolerance in STD for bonds. Defaults to 12.")) - group_sc.add_argument( - "-at", - "--angle-tolerance", - dest="angle_tolerance", - type=float, - default=12.0, - help=("Tolerance in STD for angles. Defaults to 12.")) - - # - # Chain mapping arguments - # - - group_cm = parser.add_argument_group('chain mapping arguments') - - group_cm.add_argument( - "-c", - "--chain-mapping", - nargs="+", - type=lambda x: x.split(":"), - dest="chain_mapping", - help=("Mapping of chains between the reference and the model.\n" - "Each separate mapping consist of key:value pairs where key\n" - "is the chain name in reference and value is the chain name in\n" - "model.")) - group_cm.add_argument( - "--qs-max-mappings-extensive", - dest="qs_max_mappings_extensive", - type=int, - default=1000000, - help=("Maximal number of chain mappings to test for 'extensive'\n" - "chain mapping scheme which is used as a last resort if\n" - "other schemes failed. The extensive chain mapping search\n" - "must in the worst case check O(N!) possible mappings for\n" - "complexes with N chains. Two octamers without symmetry\n" - "would require 322560 mappings to be checked. To limit\n" - "computations, no scores are computed if we try more than\n" - "the maximal number of chain mappings. Defaults to 1000000.")) - - # - # Sequence alignment arguments - # - - group_aln = parser.add_argument_group('sequence alignment arguments') - - group_aln.add_argument( - "-cc", - "--consistency-checks", - dest="consistency_checks", - default=False, - action="store_true", - help=("Take consistency checks into account. By default residue name\n" - "consistency between a model-reference pair would be checked\n" - "but only a warning message will be displayed and the script\n" - "will continue to calculate scores. If this flag is ON, checks\n" - "will not be ignored and if the pair does not pass the test\n" - "all the scores for that pair will be marked as a FAILURE.")) - group_aln.add_argument( - "-rna", - "--residue-number-alignment", - dest="residue_number_alignment", - default=False, - action="store_true", - help=("Make alignment based on residue number instead of using\n" - "a global BLOSUM62-based alignment.")) + help=("Path to voronota-cadscore executable (installed from " + "https://github.com/kliment-olechnovic/voronota). Searches PATH " + "if not set.")) - # - # QS score arguments - # - - group_qs = parser.add_argument_group('QS score arguments') - - group_qs.add_argument( - "-qs", + parser.add_argument( "--qs-score", dest="qs_score", default=False, action="store_true", - help=("Calculate QS-score.")) - group_qs.add_argument( - "--qs-rmsd", - dest="qs_rmsd", + help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " + "variant, stored as key \"qs_best\".")) + + parser.add_argument( + "--rigid-scores", + dest="rigid_scores", default=False, action="store_true", - help=("Calculate CA RMSD between shared CA atoms of mapped chains.\n" - "This uses a superposition using all mapped chains which\n" - "minimizes the CA RMSD.")) - - # - # lDDT score arguments - # - - group_lddt = parser.add_argument_group('lDDT score arguments') - - group_lddt.add_argument( - "-l", - "--lddt", - dest="lddt", + help=("Computes rigid superposition based scores. They're based on a " + "Kabsch superposition of all mapped CA positions (C3' for " + "nucleotides). Makes the following keys available: " + "\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, " + "8.0] given these positions and transformation, \"oligo_gdtha\": " + "same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given " + "these positions and transformation, \"transform\": the used 4x4 " + "transformation matrix that superposes model onto reference.")) + + parser.add_argument( + "--interface-scores", + dest="interface_scores", default=False, action="store_true", - help=("Calculate lDDT.")) - group_lddt.add_argument( - "-ir", - "--inclusion-radius", - dest="inclusion_radius", - type=float, - default=15.0, - help=("Distance inclusion radius for lDDT. Defaults to 15 A.")) - group_lddt.add_argument( - "-ss", - "--sequence-separation", - dest="sequence_separation", - type=int, - default=0, - help=("Sequence separation. Only distances between residues whose\n" - "separation is higher than the provided parameter are\n" - "considered when computing the score. Defaults to 0.")) - group_lddt.add_argument( - "-spr", - "--save-per-residue-scores", - dest="save_per_residue_scores", + help=("Per interface scores for each interface that has at least one " + "contact in the reference, i.e. at least one pair of heavy atoms " + "within 5A. The respective interfaces are available from key " + "\"interfaces\" which is a list of tuples in form (ref_ch1, " + "ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available " + "as lists referring to these interfaces and have the following " + "keys: \"nnat\" (number of contacts in reference), \"nmdl\" " + "(number of contacts in model), \"fnat\" (fraction of reference " + "contacts which are also there in model), \"fnonnat\" (fraction " + "of model contacts which are not there in target), \"irmsd\" " + "(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" " + "(per-interface score computed from \"fnat\", \"irmsd\" and " + "\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" " + "(per-interface versions of the two QS-score variants). The " + "DockQ score is strictly designed to score each interface " + "individually. We also provide two averaged versions to get one " + "full model score: \"dockq_ave\", \"dockq_wave\". The first is " + "simply the average of \"dockq_scores\", the latter is a " + "weighted average with weights derived from \"nnat\". These two " + "scores only consider interfaces that are present in both, the " + "model and the reference. \"dockq_ave_full\" and " + "\"dockq_wave_full\" add zeros in the average computation for " + "each interface that is only present in the reference but not in " + "the model.")) + + parser.add_argument( + "--patch-scores", + dest="patch_scores", default=False, action="store_true", - help=("")) - - # Print full help is no arguments provided - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - opts = parser.parse_args() - # Set chain mapping - if opts.chain_mapping is not None: - try: - opts.chain_mapping = dict(opts.chain_mapping) - except ValueError: - parser.error( - "Cannot parse chain mapping into dictionary. The " - "correct format is: key:value [key2:value2 ...].") - - # Check parameter file if structural checks are on - if opts.structural_checks: - if opts.parameter_file is None: - # try to get default if none provided - opts.parameter_file, msg = _GetDefaultParameterFilePath() - if msg: - parser.error(msg) - else: - # if provided it must exist - if not os.path.isfile(opts.parameter_file): - parser.error("Parameter file %s does not exist." \ - % opts.parameter_file) - - # Check compound library path (always required!) - if opts.compound_library is None: - # try to get default if none provided - opts.compound_library, msg = _GetDefaultCompoundLibraryPath() - if msg: - parser.error(msg) - else: - # if provided it must exist - if not os.path.isfile(opts.compound_library): - parser.error("Compounds library file %s does not exist." \ - % opts.compound_library) - - # Check model and reference paths - if not os.path.isfile(opts.model): - parser.error("Model file %s does not exist." % opts.model) - if not os.path.isfile(opts.reference): - parser.error("Reference file %s does not exist." % opts.reference) - - return opts - - -def _SetCompoundsChemlib(path_to_chemlib): - """Set default compound library for OST.""" - # NOTE: This is adapted from ProMod3 code and should in the future be doable - # with some shared OST code! - compound_lib = CompoundLib.Load(path_to_chemlib) - SetDefaultLib(compound_lib) - processor = RuleBasedProcessor(compound_lib) - for profile_name in profiles: - profiles[profile_name].processor = processor.Copy() - - -def _RevertChainNames(ent): + help=("Local interface quality score used in CASP15. Scores each " + "model residue that is considered in the interface (CB pos " + "within 8A of any CB pos from another chain (CA for GLY)). The " + "local neighborhood gets represented by \"interface patches\" " + "which are scored with QS-score and DockQ. Scores where not " + "the full patches are represented by the reference are set to " + "None. Model interface residues are available as key " + "\"model_interface_residues\", reference interface residues as " + "key \"reference_interface_residues\". Residues are represented " + "as string in form <num><inscode>. The respective scores are " + "available as keys \"patch_qs\" and \"patch_dockq\"")) + + return parser.parse_args() + +def _Rename(ent): """Revert chain names to original names. - By default the first chain with given name will not have any number - attached to it ie. if there are two chains mapping to chain A the resulting + PDBize assigns chain name in order A,B,C,D... which does not allow to infer + the original chain name. We do a renaming here: + if there are two chains mapping to chain A the resulting chain names will be: A and A2. """ - editor = ent.EditXCS() - suffix = "_tmp" # just a suffix for temporary chain name - separator = "" # dot causes selection error - used_names = dict() - reverted_chains = dict() - for chain in ent.chains: - try: - original_name = chain.GetStringProp("original_name") - except Exception as ex: - ost.LogError("Cannot revert chain %s back to original: %s" % ( - chain.name, - str(ex))) - reverted_chains[chain.name] = chain.name - editor.RenameChain(chain, chain.name + suffix) + new_chain_names = list() + chain_indices = list() # the chains where we actually change the name + suffix_indices = dict() # keep track of whats the current suffix index + # for each original chain name + + for ch_idx, ch in enumerate(ent.chains): + if not ch.HasProp("original_name"): + # pdbize doesnt set this property for chain names in ['_', '-'] continue - new_name = original_name - if new_name not in used_names: - used_names[original_name] = 2 - reverted_chains[chain.name] = new_name - editor.RenameChain(chain, chain.name + suffix) + original_name = ch.GetStringProp("original_name") + if original_name in new_chain_names: + new_name = original_name + str(suffix_indices[original_name]) + new_chain_names.append(new_name) + suffix_indices[original_name] = suffix_indices[original_name] + 1 else: - new_name = "%s%s%i" % (original_name, - separator, - used_names[original_name]) - reverted_chains[chain.name] = new_name - editor.RenameChain(chain, chain.name + suffix) - used_names[original_name] += 1 - for chain in ent.chains: - editor.RenameChain(chain, reverted_chains[chain.name[:-len(suffix)]]) - rev_out = ["%s -> %s" % (on, nn) for on, nn in list(reverted_chains.items())] - ost.LogInfo("Reverted chains: %s" % ", ".join(rev_out)) - - -def _CheckConsistency(alignments, log_error): - is_cons = True - for alignment in alignments: - ref_chain = Renumber(alignment.GetSequence(0)).CreateFullView() - mdl_chain = Renumber(alignment.GetSequence(1)).CreateFullView() - new_is_cons = ResidueNamesMatch(mdl_chain, ref_chain, log_error) - is_cons = is_cons and new_is_cons - return is_cons - - -def _GetAlignmentsAsFasta(alignments): - """Get the alignments as FASTA formated string. - - :param alignments: Alignments - :type alignments: list of AlignmentHandle - :returns: list of alignments in FASTA format - :rtype: list of strings - """ - strings = list() - for alignment in alignments: - aln_str = ">reference:%s\n%s\n>model:%s\n%s" % ( - alignment.GetSequence(0).name, - alignment.GetSequence(0).GetString(), - alignment.GetSequence(1).name, - alignment.GetSequence(1).GetString()) - strings.append(aln_str) - return strings - - -def _ReadStructureFile(path, c_alpha_only=False, fault_tolerant=False, - selection=""): - """Safely read structure file into OST entities (split by biounit). - - The function can read both PDB and mmCIF files. - - :param path: Path to the file. - :type path: :class:`str` - :returns: list of entities - :rtype: :class:`list` of :class:`~ost.mol.EntityHandle` + new_chain_names.append(original_name) + suffix_indices[original_name] = 2 + chain_indices.append(ch_idx) + editor = ent.EditXCS() + # rename to nonsense to avoid clashing chain names + for ch_idx in chain_indices: + editor.RenameChain(ent.chains[ch_idx], ent.chains[ch_idx].name+"_yolo") + # and do final renaming + for new_name, ch_idx in zip(new_chain_names, chain_indices): + editor.RenameChain(ent.chains[ch_idx], new_name) + +def _LoadStructure(structure_path, sformat=None, fault_tolerant=False, + bu_idx=None): + """Read OST entity either from mmCIF or PDB. + + The returned structure has structure_path attached as structure name """ - def _Select(entity): - if selection: - ost.LogInfo("Selecting %s" % selection) - ent_view = entity.Select(selection) - entity = mol.CreateEntityFromView(ent_view, False) - return entity - - entities = list() - if not os.path.isfile(path): - raise IOError("%s is not a file" % path) - - # Determine file format from suffix. - ext = path.split(".") - if ext[-1] == "gz": - ext = ext[:-1] - if len(ext) <= 1: - raise RuntimeError(f"Could not determine format of file {path}.") - sformat = ext[-1].lower() - - if sformat in ["pdb"]: - entity = LoadPDB( - path, - fault_tolerant=fault_tolerant, - calpha_only=c_alpha_only) - if not entity.IsValid() or len(entity.residues) == 0: - raise IOError("Provided file does not contain valid entity.") - entity.SetName(os.path.basename(path)) - entity = _Select(entity) - entities.append(entity) - elif sformat in ["cif", "mmcif"]: + if not os.path.exists(structure_path): + raise Exception(f"file not found: {structure_path}") + + if sformat is None: + # Determine file format from suffix. + ext = structure_path.split(".") + if ext[-1] == "gz": + ext = ext[:-1] + if len(ext) <= 1: + raise Exception(f"Could not determine format of file " + f"{structure_path}.") + sformat = ext[-1].lower() + + # increase loglevel, as we would pollute the info log with weird stuff + ost.PushVerbosityLevel(ost.LogLevel.Error) + # Load the structure + if sformat in ["mmcif", "cif"]: + if bu_idx is not None: + cif_entity, cif_seqres, cif_info = \ + io.LoadMMCIF(structure_path, info=True, seqres=True, + fault_tolerant=fault_tolerant) + if bu_idx >= len(cif_info.biounits): + raise RuntimeError(f"Invalid biounit index - requested {bu_idx} " + f"cif file has {len(cif_info.biounits)}") + biounit = cif_info.biounits[bu_idx] + entity = biounit.PDBize(cif_entity, min_polymer_size=0) + if not entity.IsValid(): + raise IOError( + "Provided file does not contain valid entity.") + _Rename(entity) + else: + entity = io.LoadMMCIF(structure_path, + fault_tolerant = fault_tolerant) + if len(entity.residues) == 0: + raise Exception(f"No residues found in file: {structure_path}") + elif sformat == "pdb": + entity = io.LoadPDB(structure_path, fault_tolerant = fault_tolerant) + if len(entity.residues) == 0: + raise Exception(f"No residues found in file: {structure_path}") + else: + raise Exception(f"Unknown/ unsupported file extension found for " + f"file {structure_path}.") + # restore old loglevel and return + ost.PopVerbosityLevel() + entity.SetName(structure_path) + return entity + +def _AlnToFastaStr(aln): + """ Returns alignment as fasta formatted string + """ + s1 = aln.GetSequence(0) + s2 = aln.GetSequence(1) + return f">reference:{s1.name}\n{str(s1)}\nmodel:{s2.name}\n{str(s2)}" + +def _GetInconsistentResidues(alns): + lst = list() + for aln in alns: + for col in aln: + r1 = col.GetResidue(0) + r2 = col.GetResidue(1) + if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): + lst.append((r1, r2)) + lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst] + return lst + +def _LocalScoresToJSONDict(score_dict): + """ Score for model residue with number rnum in chain X can be extracted + with: data["local_cad_score"]["X"][rnum]. Convert ResNum object to str for + JSON serialization + """ + json_dict = dict() + for ch, ch_scores in score_dict.items(): + json_dict[ch] = {str(rnum): s for rnum, s in ch_scores.items()} + return json_dict + +def _InterfaceResiduesToJSONDict(interface_dict): + """ Interface residues are stored as + data["model_interface_residues"]["A"][rnum1, rnum2,...]. Convert ResNum + object to str for JSON serialization. + """ + json_dict = dict() + for ch, ch_nums in interface_dict.items(): + json_dict[ch] = [str(rnum) for rnum in ch_nums] + return json_dict + +def _Process(model, reference, args): + + mapping = None + if args.chain_mapping is not None: + mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} + + scorer = scoring.Scorer(model, reference, + resnum_alignments = args.residue_number_alignment, + cad_score_exec = args.cad_exec, + custom_mapping = mapping) + + ir = _GetInconsistentResidues(scorer.aln) + if len(ir) > 0 and args.enforce_consistency: + raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") + + out = dict() + out["chain_mapping"] = scorer.mapping.GetFlatMapping() + out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] + out["inconsistent_residues"] = ir + out["chem_groups"] = scorer.chain_mapper.chem_groups + + if args.lddt: + out["lddt"] = scorer.lddt + + if args.local_lddt: + out["local_lddt"] = _LocalScoresToJSONDict(scorer.local_lddt) + + if args.lddt or args.local_lddt: + out["model_clashes"] = [x.ToJSON() for x in scorer.model_clashes] + out["model_bad_bonds"] = [x.ToJSON() for x in scorer.model_bad_bonds] + out["model_bad_angles"] = [x.ToJSON() for x in scorer.model_bad_angles] + out["reference_clashes"] = [x.ToJSON() for x in scorer.target_clashes] + out["reference_bad_bonds"] = [x.ToJSON() for x in scorer.target_bad_bonds] + out["reference_bad_angles"] = [x.ToJSON() for x in scorer.target_bad_angles] + + if args.cad_score: + out["cad_score"] = scorer.cad_score + + if args.local_cad_score: + out["local_cad_score"] = _LocalScoresToJSONDict(scorer.local_cad_score) + + if args.qs_score: + out["qs_global"] = scorer.qs_global + out["qs_best"] = scorer.qs_best + + if args.rigid_scores: + out["oligo_gdtts"] = scorer.gdtts + out["oligo_gdtha"] = scorer.gdtha + out["rmsd"] = scorer.rmsd + data = scorer.transform.data + out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] + + if args.interface_scores: + out["interfaces"] = scorer.interfaces + out["nnat"] = scorer.native_contacts + out["nmdl"] = scorer.model_contacts + out["fnat"] = scorer.fnat + out["fnonnat"] = scorer.fnonnat + out["irmsd"] = scorer.irmsd + out["lrmsd"] = scorer.lrmsd + out["dockq_scores"] = scorer.dockq_scores + out["interface_qs_global"] = scorer.interface_qs_global + out["interface_qs_best"] = scorer.interface_qs_best + out["dockq_ave"] = scorer.dockq_ave + out["dockq_wave"] = scorer.dockq_wave + out["dockq_ave_full"] = scorer.dockq_ave_full + out["dockq_wave_full"] = scorer.dockq_wave_full + + if args.patch_scores: + out["model_interface_residues"] = \ + _InterfaceResiduesToJSONDict(scorer.model_interface_residues) + out["reference_interface_residues"] = \ + _InterfaceResiduesToJSONDict(scorer.target_interface_residues) + out["patch_qs"] = scorer.patch_qs + out["patch_dockq"] = scorer.patch_dockq + + if args.dump_structures: try: - tmp_entity, cif_info = LoadMMCIF( - path, - info=True, - fault_tolerant=fault_tolerant, - calpha_only=c_alpha_only) - if len(cif_info.biounits) == 0: - tbu = MMCifInfoBioUnit() - tbu.id = 'ASU' - tbu.details = 'asymmetric unit' - for chain in tmp_entity.chains: - tbu.AddChain(str(chain)) - tinfo = MMCifInfo() - tops = MMCifInfoTransOp() - tinfo.AddOperation(tops) - tbu.AddOperations(tinfo.GetOperations()) - entity = tbu.PDBize(tmp_entity, min_polymer_size=0) - entity.SetName(os.path.basename(path) + ".au") - _RevertChainNames(entity) - entity = _Select(entity) - entities.append(entity) - elif len(cif_info.biounits) > 1: - for i, biounit in enumerate(cif_info.biounits, 1): - entity = biounit.PDBize(tmp_entity, min_polymer_size=0) - if not entity.IsValid(): - raise IOError( - "Provided file does not contain valid entity.") - entity.SetName(os.path.basename(path) + "." + str(i)) - _RevertChainNames(entity) - entity = _Select(entity) - entities.append(entity) + io.SavePDB(scorer.model, model.GetName() + args.dump_suffix) + except Exception as e: + if "single-letter" in str(e) and args.model_biounit is not None: + raise RuntimeError("Failed to dump processed model. PDB " + "format only supports single character " + "chain names. This is likely the result of " + "chain renaming when constructing a user " + "specified biounit. Dumping structures " + "fails in this case.") else: - biounit = cif_info.biounits[0] - entity = biounit.PDBize(tmp_entity, min_polymer_size=0) - if not entity.IsValid(): - raise IOError( - "Provided file does not contain valid entity.") - entity.SetName(os.path.basename(path)) - _RevertChainNames(entity) - entity = _Select(entity) - entities.append(entity) - - except Exception: - raise - else: - raise RuntimeError(f"Unsupported file extension found for file {path}.") - - return entities + raise + try: + io.SavePDB(scorer.target, reference.GetName() + args.dump_suffix) + except Exception as e: + if "single-letter" in str(e) and args.reference_biounit is not None: + raise RuntimeError("Failed to dump processed reference. PDB " + "format only supports single character " + "chain names. This is likely the result of " + "chain renaming when constructing a user " + "specified biounit. Dumping structures " + "fails in this case.") + else: + raise -def _MolckEntity(entity, options): - """Molck the entity.""" - lib = GetDefaultLib() - to_remove = tuple(options.remove) - ms = MolckSettings(rm_unk_atoms="unk" in to_remove, - rm_non_std="nonstd" in to_remove, - rm_hyd_atoms="hyd" in to_remove, - rm_oxt_atoms="oxt" in to_remove, - rm_zero_occ_atoms="zeroocc" in to_remove, - colored=False, - map_nonstd_res=options.map_nonstandard_residues, - assign_elem=options.clean_element_column) - Molck(entity, lib, ms) + return out def _Main(): - """Do the magic.""" - # - # Setup - opts = _ParseArgs() - PushVerbosityLevel(opts.verbosity) - _SetCompoundsChemlib(opts.compound_library) - # - # Read the input files - ost.LogInfo("#" * 80) - ost.LogInfo("Reading input files (fault_tolerant=%s)" % - str(opts.fault_tolerant)) - ost.LogInfo(" --> reading model from %s" % opts.model) - models = _ReadStructureFile( - opts.model, - c_alpha_only=opts.c_alpha_only, - fault_tolerant=opts.fault_tolerant, - selection=opts.model_selection) - ost.LogInfo(" --> reading reference from %s" % opts.reference) - references = _ReadStructureFile( - opts.reference, - c_alpha_only=opts.c_alpha_only, - fault_tolerant=opts.fault_tolerant, - selection=opts.reference_selection) - # molcking - if opts.molck: - ost.LogInfo("#" * 80) - ost.LogInfo("Cleaning up input with Molck") - for reference in references: - _MolckEntity(reference, opts) - for model in models: - _MolckEntity(model, opts) - # restrict to peptides (needed for CheckStructure anyways) - for i in range(len(references)): - references[i] = references[i].Select("peptide=true") - for i in range(len(models)): - models[i] = models[i].Select("peptide=true") - # structure checking - if opts.structural_checks: - ost.LogInfo("#" * 80) - ost.LogInfo("Performing structural checks") - stereochemical_parameters = ReadStereoChemicalPropsFile( - opts.parameter_file) - ost.LogInfo(" --> for reference(s)") - for reference in references: - ost.LogInfo("Checking %s" % reference.GetName()) - CheckStructure(reference, - stereochemical_parameters.bond_table, - stereochemical_parameters.angle_table, - stereochemical_parameters.nonbonded_table, - opts.bond_tolerance, - opts.angle_tolerance) - ost.LogInfo(" --> for model(s)") - for model in models: - ost.LogInfo("Checking %s" % model.GetName()) - CheckStructure(model, - stereochemical_parameters.bond_table, - stereochemical_parameters.angle_table, - stereochemical_parameters.nonbonded_table, - opts.bond_tolerance, - opts.angle_tolerance) - if len(models) > 1 or len(references) > 1: - ost.LogInfo("#" * 80) - ost.LogInfo( - "Multiple complexes mode ON. All combinations will be tried.") - - result = { - "result": {}, - "options": vars(opts)} - result["options"]["cwd"] = os.path.abspath(os.getcwd()) - # - # Perform scoring - skipped = list() - for model in models: - model_name = model.GetName() - model_results = dict() - for reference in references: - reference_name = reference.GetName() - reference_results = { - "info": dict()} - ost.LogInfo("#" * 80) - ost.LogInfo("Comparing %s to %s" % ( - model_name, - reference_name)) - qs_scorer = qsscoring.QSscorer(reference, - model, - opts.residue_number_alignment) - qs_scorer.max_mappings_extensive = opts.qs_max_mappings_extensive - if opts.chain_mapping is not None: - ost.LogInfo( - "Using custom chain mapping: %s" % str( - opts.chain_mapping)) - qs_scorer.chain_mapping = opts.chain_mapping - else: - try: - qs_scorer.chain_mapping # just to initialize it - except qsscoring.QSscoreError as ex: - ost.LogError('Chain mapping failed:', str(ex)) - ost.LogError('Skipping comparison') - continue - ost.LogInfo("-" * 80) - ost.LogInfo("Checking consistency between %s and %s" % ( - model_name, reference_name)) - is_cons = _CheckConsistency( - qs_scorer.alignments, - opts.consistency_checks) - reference_results["info"]["residue_names_consistent"] = is_cons - reference_results["info"]["mapping"] = { - "chain_mapping": qs_scorer.chain_mapping, - "chain_mapping_scheme": qs_scorer.chain_mapping_scheme, - "alignments": _GetAlignmentsAsFasta(qs_scorer.alignments)} - skip_score = False - if opts.consistency_checks: - if not is_cons: - msg = (("Residue names in model %s and in reference " - "%s are inconsistent.") % ( - model_name, - reference_name)) - ost.LogError(msg) - skip_score = True - skipped.append(skip_score) - else: - ost.LogInfo("Consistency check: OK") - skipped.append(False) - else: - skipped.append(False) - if not is_cons: - msg = (("Residue names in model %s and in reference " - "%s are inconsistent.\nThis might lead to " - "corrupted results.") % ( - model_name, - reference_name)) - ost.LogWarning(msg) - else: - ost.LogInfo("Consistency check: OK") - if opts.qs_rmsd: - ost.LogInfo("-" * 80) - if skip_score: - ost.LogInfo( - "Skipping QS-RMSD because consistency check failed") - reference_results["qs_rmsd"] = { - "status": "FAILURE", - "error": "Consistency check failed."} - else: - ost.LogInfo("Computing QS-RMSD") - try: - reference_results["qs_rmsd"] = { - "status": "SUCCESS", - "error": "", - "ca_rmsd": qs_scorer.superposition.rmsd} - except qsscoring.QSscoreError as ex: - ost.LogError('QS-RMSD failed:', str(ex)) - reference_results["qs_rmsd"] = { - "status": "FAILURE", - "error": str(ex)} - if opts.qs_score: - ost.LogInfo("-" * 80) - if skip_score: - ost.LogInfo( - "Skipping QS-score because consistency check failed") - reference_results["qs_score"] = { - "status": "FAILURE", - "error": "Consistency check failed.", - "global_score": 0.0, - "best_score": 0.0} - else: - ost.LogInfo("Computing QS-score") - try: - reference_results["qs_score"] = { - "status": "SUCCESS", - "error": "", - "global_score": qs_scorer.global_score, - "best_score": qs_scorer.best_score} - except qsscoring.QSscoreError as ex: - # default handling: report failure and set score to 0 - ost.LogError('QSscore failed:', str(ex)) - reference_results["qs_score"] = { - "status": "FAILURE", - "error": str(ex), - "global_score": 0.0, - "best_score": 0.0} - # Calculate lDDT - if opts.lddt: - ost.LogInfo("-" * 80) - ost.LogInfo("Computing lDDT scores") - lddt_results = { - "single_chain_lddt": list() - } - lddt_settings = lDDTSettings( - radius=opts.inclusion_radius, - sequence_separation=opts.sequence_separation, - label="lddt") - ost.LogInfo("lDDT settings: ") - ost.LogInfo(str(lddt_settings).rstrip()) - ost.LogInfo("===") - oligo_lddt_scorer = qs_scorer.GetOligoLDDTScorer(lddt_settings) - for mapped_lddt_scorer in oligo_lddt_scorer.mapped_lddt_scorers: - # Get data - lddt_scorer = mapped_lddt_scorer.lddt_scorer - model_chain = mapped_lddt_scorer.model_chain_name - reference_chain = mapped_lddt_scorer.reference_chain_name - if skip_score: - ost.LogInfo( - " --> Skipping single chain lDDT because " - "consistency check failed") - lddt_results["single_chain_lddt"].append({ - "status": "FAILURE", - "error": "Consistency check failed.", - "model_chain": model_chain, - "reference_chain": reference_chain, - "global_score": 0.0, - "conserved_contacts": 0.0, - "total_contacts": 0.0}) - else: - try: - ost.LogInfo((" --> Computing lDDT between model " - "chain %s and reference chain %s") % ( - model_chain, - reference_chain)) - ost.LogInfo("Global LDDT score: %.4f" % - lddt_scorer.global_score) - ost.LogInfo( - "(%i conserved distances out of %i checked, over " - "%i thresholds)" % (lddt_scorer.conserved_contacts, - lddt_scorer.total_contacts, - len(lddt_settings.cutoffs))) - sc_lddt_scores = { - "status": "SUCCESS", - "error": "", - "model_chain": model_chain, - "reference_chain": reference_chain, - "global_score": lddt_scorer.global_score, - "conserved_contacts": - lddt_scorer.conserved_contacts, - "total_contacts": lddt_scorer.total_contacts} - if opts.save_per_residue_scores: - per_residue_sc = \ - mapped_lddt_scorer.GetPerResidueScores() - ost.LogInfo("Per residue local lDDT (reference):") - ost.LogInfo("Chain\tResidue Number\tResidue Name" - "\tlDDT\tConserved Contacts\tTotal " - "Contacts") - for prs_scores in per_residue_sc: - ost.LogInfo("%s\t%i\t%s\t%.4f\t%i\t%i" % ( - reference_chain, - prs_scores["residue_number"], - prs_scores["residue_name"], - prs_scores["lddt"], - prs_scores["conserved_contacts"], - prs_scores["total_contacts"])) - sc_lddt_scores["per_residue_scores"] = \ - per_residue_sc - lddt_results["single_chain_lddt"].append( - sc_lddt_scores) - except Exception as ex: - ost.LogError('Single chain lDDT failed:', str(ex)) - lddt_results["single_chain_lddt"].append({ - "status": "FAILURE", - "error": str(ex), - "model_chain": model_chain, - "reference_chain": reference_chain, - "global_score": 0.0, - "conserved_contacts": 0.0, - "total_contacts": 0.0}) - # perform oligo lddt scoring - if skip_score: - ost.LogInfo( - " --> Skipping oligomeric lDDT because consistency " - "check failed") - lddt_results["oligo_lddt"] = { - "status": "FAILURE", - "error": "Consistency check failed.", - "global_score": 0.0} - else: - try: - ost.LogInfo(' --> Computing oligomeric lDDT score') - lddt_results["oligo_lddt"] = { - "status": "SUCCESS", - "error": "", - "global_score": oligo_lddt_scorer.oligo_lddt} - ost.LogInfo( - "Oligo lDDT score: %.4f" % - oligo_lddt_scorer.oligo_lddt) - except Exception as ex: - ost.LogError('Oligo lDDT failed:', str(ex)) - lddt_results["oligo_lddt"] = { - "status": "FAILURE", - "error": str(ex), - "global_score": 0.0} - if skip_score: - ost.LogInfo( - " --> Skipping weighted lDDT because consistency " - "check failed") - lddt_results["weighted_lddt"] = { - "status": "FAILURE", - "error": "Consistency check failed.", - "global_score": 0.0} - else: - try: - ost.LogInfo(' --> Computing weighted lDDT score') - lddt_results["weighted_lddt"] = { - "status": "SUCCESS", - "error": "", - "global_score": oligo_lddt_scorer.weighted_lddt} - ost.LogInfo( - "Weighted lDDT score: %.4f" % - oligo_lddt_scorer.weighted_lddt) - except Exception as ex: - ost.LogError('Weighted lDDT failed:', str(ex)) - lddt_results["weighted_lddt"] = { - "status": "FAILURE", - "error": str(ex), - "global_score": 0.0} - reference_results["lddt"] = lddt_results - model_results[reference_name] = reference_results - if opts.dump_structures: - ost.LogInfo("-" * 80) - ref_output_path = os.path.join( - os.path.dirname(opts.reference), - reference_name + opts.dump_suffix) - ost.LogInfo("Saving cleaned up reference to %s" % - ref_output_path) - try: - SavePDB(qs_scorer.qs_ent_1.ent, - ref_output_path) - except Exception as ex: - ost.LogError("Cannot save reference: %s" % str(ex)) - mdl_output_path = os.path.join( - os.path.dirname(opts.model), - model_name + opts.dump_suffix) - ost.LogInfo("Saving cleaned up model to %s" % - mdl_output_path) - try: - SavePDB(qs_scorer.qs_ent_2.ent, - mdl_output_path) - except Exception as ex: - ost.LogError("Cannot save model: %s" % str(ex)) - result["result"][model_name] = model_results - - if all(skipped) and len(skipped) > 0: - ost.LogError("Consistency check failed for all model-reference pairs.") - if opts.output is not None: - ost.LogInfo("#" * 80) - ost.LogInfo("Saving output into %s" % opts.output) - with open(opts.output, "w") as outfile: - json.dump(result, outfile, indent=4, sort_keys=True) + + args = _ParseArgs() + try: + reference = _LoadStructure(args.reference, + sformat=args.reference_format, + bu_idx=args.reference_biounit, + fault_tolerant = args.fault_tolerant) + model = _LoadStructure(args.model, + sformat=args.model_format, + bu_idx=args.model_biounit, + fault_tolerant = args.fault_tolerant) + out = _Process(model, reference, args) + out["status"] = "SUCCESS" + with open(args.output, 'w') as fh: + json.dump(out, fh, indent=4, sort_keys=False) + except: + out = dict() + out["status"] = "FAILURE" + out["traceback"] = traceback.format_exc() + with open(args.output, 'w') as fh: + json.dump(out, fh, indent=4, sort_keys=False) + raise if __name__ == '__main__': _Main() - diff --git a/actions/ost-compare-structures-legacy b/actions/ost-compare-structures-legacy new file mode 100644 index 000000000..7aaaf6914 --- /dev/null +++ b/actions/ost-compare-structures-legacy @@ -0,0 +1,1006 @@ +"""Evaluate model structure against reference. + +eg. + + ost compare-structures \\ + --model <MODEL> \\ + --reference <REF> \\ + --output output.json \\ + --lddt \\ + --structural-checks \\ + --consistency-checks \\ + --molck \\ + --remove oxt hyd \\ + --map-nonstandard-residues + +Here we describe how the parameters can be set to mimick a CAMEO evaluation +(as of August 2018). + +CAMEO calls the lddt binary as follows: + + lddt \\ + -p <PARAMETER FILE> \\ + -f \\ + -a 15 \\ + -b 15 \\ + -r 15 \\ + <MODEL> \\ + <REF> + +Only model structures are "Molck-ed" in CAMEO. The call to molck is as follows: + + molck \\ + --complib=<COMPOUND LIB> \\ + --rm=hyd,oxt,unk,nonstd \\ + --fix-ele \\ + --map-nonstd \\ + --out=<OUTPUT> \\ + <FILEPATH> + +To be as much compatible with with CAMEO as possible one should call +compare-structures as follows: + + ost compare-structures \\ + --model <MODEL> \\ + --reference <REF> \\ + --output output.json \\ + --molck \\ + --remove oxt hyd unk nonstd \\ + --clean-element-column \\ + --map-nonstandard-residues \\ + --structural-checks \\ + --bond-tolerance 15.0 \\ + --angle-tolerance 15.0 \\ + --residue-number-alignment \\ + --consistency-checks \\ + --qs-score \\ + --lddt \\ + --inclusion-radius 15.0 +""" + +import os +import sys +import json +import argparse + +import ost +from ost.io import (LoadPDB, LoadMMCIF, SavePDB, MMCifInfoBioUnit, MMCifInfo, + MMCifInfoTransOp, ReadStereoChemicalPropsFile, profiles) +from ost import PushVerbosityLevel +from ost.mol.alg import (qsscoring, Molck, MolckSettings, lDDTSettings, + CheckStructure, ResidueNamesMatch) +from ost.conop import (CompoundLib, SetDefaultLib, GetDefaultLib, + RuleBasedProcessor) +from ost.seq.alg.renumber import Renumber + + +def _GetDefaultShareFilePath(filename): + """Look for filename in working directory and OST shared data path. + :return: Path to valid file or None if not found. + """ + # Try current directory + cwd = os.path.abspath(os.getcwd()) + file_path = os.path.join(cwd, filename) + if not os.path.isfile(file_path): + try: + file_path = os.path.join(ost.GetSharedDataPath(), filename) + except RuntimeError: + # Ignore errors here (caught later together with non-existing file) + pass + if not os.path.isfile(file_path): + file_path = None + # Either file_path is valid file path or None + return file_path + +def _GetDefaultParameterFilePath(): + # Try to get in default locations + parameter_file_path = _GetDefaultShareFilePath("stereo_chemical_props.txt") + if parameter_file_path is None: + msg = ( + "Could not set default stereochemical parameter file. In " + "order to use the default one please set $OST_ROOT " + "environmental variable, run the script with OST binary or" + " provide a local copy of 'stereo_chemical_props.txt' in " + "CWD. Alternatively provide the path to the local copy.") + else: + msg = "" + return parameter_file_path, msg + +def _GetDefaultCompoundLibraryPath(): + # Try to get in default locations + compound_library_path = _GetDefaultShareFilePath("compounds.chemlib") + if compound_library_path is None: + msg = ( + "Could not set default compounds library path. In " + "order to use the default one please set $OST_ROOT " + "environmental variable, run the script with OST binary or" + " provide a local copy of 'compounds.chemlib' in CWD" + ". Alternatively provide the path to the local copy.") + else: + msg = "" + return compound_library_path, msg + +def _ParseArgs(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + description=__doc__, + prog="ost compare-structures") + + # + # Required arguments + # + + group_required = parser.add_argument_group('required arguments') + + group_required.add_argument( + "-m", + "--model", + dest="model", + required=True, + help=("Path to the model file.")) + group_required.add_argument( + "-r", + "--reference", + dest="reference", + required=True, + help=("Path to the reference file.")) + + # + # General arguments + # + + group_general = parser.add_argument_group('general arguments') + + group_general.add_argument( + '-v', + '--verbosity', + type=int, + default=3, + help="Set verbosity level. Defaults to 3.") + group_general.add_argument( + "-o", + "--output", + dest="output", + help=("Output file name. The output will be saved as a JSON file.")) + group_general.add_argument( + "-d", + "--dump-structures", + dest="dump_structures", + default=False, + action="store_true", + help=("Dump cleaned structures used to calculate all the scores as\n" + "PDB files using specified suffix. Files will be dumped to the\n" + "same location as original files.")) + group_general.add_argument( + "-ds", + "--dump-suffix", + dest="dump_suffix", + default=".compare.structures.pdb", + help=("Use this suffix to dump structures.\n" + "Defaults to .compare.structures.pdb.")) + group_general.add_argument( + "-rs", + "--reference-selection", + dest="reference_selection", + default="", + help=("Selection performed on reference structures.")) + group_general.add_argument( + "-ms", + "--model-selection", + dest="model_selection", + default="", + help=("Selection performed on model structures.")) + group_general.add_argument( + "-ca", + "--c-alpha-only", + dest="c_alpha_only", + default=False, + action="store_true", + help=("Use C-alpha atoms only. Equivalent of calling the action with\n" + "'--model-selection=\"aname=CA\" " + "--reference-selection=\"aname=CA\"'\noptions.")) + group_general.add_argument( + "-ft", + "--fault-tolerant", + dest="fault_tolerant", + default=False, + action="store_true", + help=("Fault tolerant parsing.")) + group_general.add_argument( + "-cl", + "--compound-library", + dest="compound_library", + default=None, + help=("Location of the compound library file (compounds.chemlib).\n" + "If not provided, the following locations are searched in this\n" + "order: 1. Working directory, 2. OpenStructure standard library" + "\nlocation.")) + + # + # Molecular check arguments + # + + group_molck = parser.add_argument_group('molecular check arguments') + + group_molck.add_argument( + "-ml", + "--molck", + dest="molck", + default=False, + action="store_true", + help=("Run molecular checker to clean up input.")) + group_molck.add_argument( + "-rm", + "--remove", + dest="remove", + nargs="+", # *, +, ?, N + required=False, + default=["hyd"], + help=("Remove atoms and residues matching some criteria:\n" + " * zeroocc - Remove atoms with zero occupancy\n" + " * hyd - remove hydrogen atoms\n" + " * oxt - remove terminal oxygens\n" + " * nonstd - remove all residues not one of the 20\n" + " standard amino acids\n" + " * unk - Remove unknown and atoms not following the\n" + " nomenclature\n" + "Defaults to hyd.")) + group_molck.add_argument( + "-ce", + "--clean-element-column", + dest="clean_element_column", + default=False, + action="store_true", + help=("Clean up element column")) + group_molck.add_argument( + "-mn", + "--map-nonstandard-residues", + dest="map_nonstandard_residues", + default=False, + action="store_true", + help=("Map modified residues back to the parent amino acid, for\n" + "example MSE -> MET, SEP -> SER.")) + + # + # Structural check arguments + # + + group_sc = parser.add_argument_group('structural check arguments') + + group_sc.add_argument( + "-sc", + "--structural-checks", + dest="structural_checks", + default=False, + action="store_true", + help=("Perform structural checks and filter input data.")) + group_sc.add_argument( + "-p", + "--parameter-file", + dest="parameter_file", + default=None, + help=("Location of the stereochemical parameter file\n" + "(stereo_chemical_props.txt).\n" + "If not provided, the following locations are searched in this\n" + "order: 1. Working directory, 2. OpenStructure standard library" + "\nlocation.")) + group_sc.add_argument( + "-bt", + "--bond-tolerance", + dest="bond_tolerance", + type=float, + default=12.0, + help=("Tolerance in STD for bonds. Defaults to 12.")) + group_sc.add_argument( + "-at", + "--angle-tolerance", + dest="angle_tolerance", + type=float, + default=12.0, + help=("Tolerance in STD for angles. Defaults to 12.")) + + # + # Chain mapping arguments + # + + group_cm = parser.add_argument_group('chain mapping arguments') + + group_cm.add_argument( + "-c", + "--chain-mapping", + nargs="+", + type=lambda x: x.split(":"), + dest="chain_mapping", + help=("Mapping of chains between the reference and the model.\n" + "Each separate mapping consist of key:value pairs where key\n" + "is the chain name in reference and value is the chain name in\n" + "model.")) + group_cm.add_argument( + "--qs-max-mappings-extensive", + dest="qs_max_mappings_extensive", + type=int, + default=1000000, + help=("Maximal number of chain mappings to test for 'extensive'\n" + "chain mapping scheme which is used as a last resort if\n" + "other schemes failed. The extensive chain mapping search\n" + "must in the worst case check O(N!) possible mappings for\n" + "complexes with N chains. Two octamers without symmetry\n" + "would require 322560 mappings to be checked. To limit\n" + "computations, no scores are computed if we try more than\n" + "the maximal number of chain mappings. Defaults to 1000000.")) + + # + # Sequence alignment arguments + # + + group_aln = parser.add_argument_group('sequence alignment arguments') + + group_aln.add_argument( + "-cc", + "--consistency-checks", + dest="consistency_checks", + default=False, + action="store_true", + help=("Take consistency checks into account. By default residue name\n" + "consistency between a model-reference pair would be checked\n" + "but only a warning message will be displayed and the script\n" + "will continue to calculate scores. If this flag is ON, checks\n" + "will not be ignored and if the pair does not pass the test\n" + "all the scores for that pair will be marked as a FAILURE.")) + group_aln.add_argument( + "-rna", + "--residue-number-alignment", + dest="residue_number_alignment", + default=False, + action="store_true", + help=("Make alignment based on residue number instead of using\n" + "a global BLOSUM62-based alignment.")) + + # + # QS score arguments + # + + group_qs = parser.add_argument_group('QS score arguments') + + group_qs.add_argument( + "-qs", + "--qs-score", + dest="qs_score", + default=False, + action="store_true", + help=("Calculate QS-score.")) + group_qs.add_argument( + "--qs-rmsd", + dest="qs_rmsd", + default=False, + action="store_true", + help=("Calculate CA RMSD between shared CA atoms of mapped chains.\n" + "This uses a superposition using all mapped chains which\n" + "minimizes the CA RMSD.")) + + # + # lDDT score arguments + # + + group_lddt = parser.add_argument_group('lDDT score arguments') + + group_lddt.add_argument( + "-l", + "--lddt", + dest="lddt", + default=False, + action="store_true", + help=("Calculate lDDT.")) + group_lddt.add_argument( + "-ir", + "--inclusion-radius", + dest="inclusion_radius", + type=float, + default=15.0, + help=("Distance inclusion radius for lDDT. Defaults to 15 A.")) + group_lddt.add_argument( + "-ss", + "--sequence-separation", + dest="sequence_separation", + type=int, + default=0, + help=("Sequence separation. Only distances between residues whose\n" + "separation is higher than the provided parameter are\n" + "considered when computing the score. Defaults to 0.")) + group_lddt.add_argument( + "-spr", + "--save-per-residue-scores", + dest="save_per_residue_scores", + default=False, + action="store_true", + help=("")) + + # Print full help is no arguments provided + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + + opts = parser.parse_args() + # Set chain mapping + if opts.chain_mapping is not None: + try: + opts.chain_mapping = dict(opts.chain_mapping) + except ValueError: + parser.error( + "Cannot parse chain mapping into dictionary. The " + "correct format is: key:value [key2:value2 ...].") + + # Check parameter file if structural checks are on + if opts.structural_checks: + if opts.parameter_file is None: + # try to get default if none provided + opts.parameter_file, msg = _GetDefaultParameterFilePath() + if msg: + parser.error(msg) + else: + # if provided it must exist + if not os.path.isfile(opts.parameter_file): + parser.error("Parameter file %s does not exist." \ + % opts.parameter_file) + + # Check compound library path (always required!) + if opts.compound_library is None: + # try to get default if none provided + opts.compound_library, msg = _GetDefaultCompoundLibraryPath() + if msg: + parser.error(msg) + else: + # if provided it must exist + if not os.path.isfile(opts.compound_library): + parser.error("Compounds library file %s does not exist." \ + % opts.compound_library) + + # Check model and reference paths + if not os.path.isfile(opts.model): + parser.error("Model file %s does not exist." % opts.model) + if not os.path.isfile(opts.reference): + parser.error("Reference file %s does not exist." % opts.reference) + + return opts + + +def _SetCompoundsChemlib(path_to_chemlib): + """Set default compound library for OST.""" + # NOTE: This is adapted from ProMod3 code and should in the future be doable + # with some shared OST code! + compound_lib = CompoundLib.Load(path_to_chemlib) + SetDefaultLib(compound_lib) + processor = RuleBasedProcessor(compound_lib) + for profile_name in profiles: + profiles[profile_name].processor = processor.Copy() + + +def _RevertChainNames(ent): + """Revert chain names to original names. + + By default the first chain with given name will not have any number + attached to it ie. if there are two chains mapping to chain A the resulting + chain names will be: A and A2. + """ + editor = ent.EditXCS() + suffix = "_tmp" # just a suffix for temporary chain name + separator = "" # dot causes selection error + used_names = dict() + reverted_chains = dict() + for chain in ent.chains: + try: + original_name = chain.GetStringProp("original_name") + except Exception as ex: + ost.LogError("Cannot revert chain %s back to original: %s" % ( + chain.name, + str(ex))) + reverted_chains[chain.name] = chain.name + editor.RenameChain(chain, chain.name + suffix) + continue + new_name = original_name + if new_name not in used_names: + used_names[original_name] = 2 + reverted_chains[chain.name] = new_name + editor.RenameChain(chain, chain.name + suffix) + else: + new_name = "%s%s%i" % (original_name, + separator, + used_names[original_name]) + reverted_chains[chain.name] = new_name + editor.RenameChain(chain, chain.name + suffix) + used_names[original_name] += 1 + for chain in ent.chains: + editor.RenameChain(chain, reverted_chains[chain.name[:-len(suffix)]]) + rev_out = ["%s -> %s" % (on, nn) for on, nn in list(reverted_chains.items())] + ost.LogInfo("Reverted chains: %s" % ", ".join(rev_out)) + + +def _CheckConsistency(alignments, log_error): + is_cons = True + for alignment in alignments: + ref_chain = Renumber(alignment.GetSequence(0)).CreateFullView() + mdl_chain = Renumber(alignment.GetSequence(1)).CreateFullView() + new_is_cons = ResidueNamesMatch(mdl_chain, ref_chain, log_error) + is_cons = is_cons and new_is_cons + return is_cons + + +def _GetAlignmentsAsFasta(alignments): + """Get the alignments as FASTA formated string. + + :param alignments: Alignments + :type alignments: list of AlignmentHandle + :returns: list of alignments in FASTA format + :rtype: list of strings + """ + strings = list() + for alignment in alignments: + aln_str = ">reference:%s\n%s\n>model:%s\n%s" % ( + alignment.GetSequence(0).name, + alignment.GetSequence(0).GetString(), + alignment.GetSequence(1).name, + alignment.GetSequence(1).GetString()) + strings.append(aln_str) + return strings + + +def _ReadStructureFile(path, c_alpha_only=False, fault_tolerant=False, + selection=""): + """Safely read structure file into OST entities (split by biounit). + + The function can read both PDB and mmCIF files. + + :param path: Path to the file. + :type path: :class:`str` + :returns: list of entities + :rtype: :class:`list` of :class:`~ost.mol.EntityHandle` + """ + + def _Select(entity): + if selection: + ost.LogInfo("Selecting %s" % selection) + ent_view = entity.Select(selection) + entity = mol.CreateEntityFromView(ent_view, False) + return entity + + entities = list() + if not os.path.isfile(path): + raise IOError("%s is not a file" % path) + + # Determine file format from suffix. + ext = path.split(".") + if ext[-1] == "gz": + ext = ext[:-1] + if len(ext) <= 1: + raise RuntimeError(f"Could not determine format of file {path}.") + sformat = ext[-1].lower() + + if sformat in ["pdb"]: + entity = LoadPDB( + path, + fault_tolerant=fault_tolerant, + calpha_only=c_alpha_only) + if not entity.IsValid() or len(entity.residues) == 0: + raise IOError("Provided file does not contain valid entity.") + entity.SetName(os.path.basename(path)) + entity = _Select(entity) + entities.append(entity) + elif sformat in ["cif", "mmcif"]: + try: + tmp_entity, cif_info = LoadMMCIF( + path, + info=True, + fault_tolerant=fault_tolerant, + calpha_only=c_alpha_only) + if len(cif_info.biounits) == 0: + tbu = MMCifInfoBioUnit() + tbu.id = 'ASU' + tbu.details = 'asymmetric unit' + for chain in tmp_entity.chains: + tbu.AddChain(str(chain)) + tinfo = MMCifInfo() + tops = MMCifInfoTransOp() + tinfo.AddOperation(tops) + tbu.AddOperations(tinfo.GetOperations()) + entity = tbu.PDBize(tmp_entity, min_polymer_size=0) + entity.SetName(os.path.basename(path) + ".au") + _RevertChainNames(entity) + entity = _Select(entity) + entities.append(entity) + elif len(cif_info.biounits) > 1: + for i, biounit in enumerate(cif_info.biounits, 1): + entity = biounit.PDBize(tmp_entity, min_polymer_size=0) + if not entity.IsValid(): + raise IOError( + "Provided file does not contain valid entity.") + entity.SetName(os.path.basename(path) + "." + str(i)) + _RevertChainNames(entity) + entity = _Select(entity) + entities.append(entity) + else: + biounit = cif_info.biounits[0] + entity = biounit.PDBize(tmp_entity, min_polymer_size=0) + if not entity.IsValid(): + raise IOError( + "Provided file does not contain valid entity.") + entity.SetName(os.path.basename(path)) + _RevertChainNames(entity) + entity = _Select(entity) + entities.append(entity) + + except Exception: + raise + else: + raise RuntimeError(f"Unsupported file extension found for file {path}.") + + return entities + + +def _MolckEntity(entity, options): + """Molck the entity.""" + lib = GetDefaultLib() + to_remove = tuple(options.remove) + + ms = MolckSettings(rm_unk_atoms="unk" in to_remove, + rm_non_std="nonstd" in to_remove, + rm_hyd_atoms="hyd" in to_remove, + rm_oxt_atoms="oxt" in to_remove, + rm_zero_occ_atoms="zeroocc" in to_remove, + colored=False, + map_nonstd_res=options.map_nonstandard_residues, + assign_elem=options.clean_element_column) + Molck(entity, lib, ms) + + +def _Main(): + """Do the magic.""" + # + # Setup + opts = _ParseArgs() + PushVerbosityLevel(opts.verbosity) + _SetCompoundsChemlib(opts.compound_library) + # + # Read the input files + ost.LogInfo("#" * 80) + ost.LogInfo("Reading input files (fault_tolerant=%s)" % + str(opts.fault_tolerant)) + ost.LogInfo(" --> reading model from %s" % opts.model) + models = _ReadStructureFile( + opts.model, + c_alpha_only=opts.c_alpha_only, + fault_tolerant=opts.fault_tolerant, + selection=opts.model_selection) + ost.LogInfo(" --> reading reference from %s" % opts.reference) + references = _ReadStructureFile( + opts.reference, + c_alpha_only=opts.c_alpha_only, + fault_tolerant=opts.fault_tolerant, + selection=opts.reference_selection) + # molcking + if opts.molck: + ost.LogInfo("#" * 80) + ost.LogInfo("Cleaning up input with Molck") + for reference in references: + _MolckEntity(reference, opts) + for model in models: + _MolckEntity(model, opts) + # restrict to peptides (needed for CheckStructure anyways) + for i in range(len(references)): + references[i] = references[i].Select("peptide=true") + for i in range(len(models)): + models[i] = models[i].Select("peptide=true") + # structure checking + if opts.structural_checks: + ost.LogInfo("#" * 80) + ost.LogInfo("Performing structural checks") + stereochemical_parameters = ReadStereoChemicalPropsFile( + opts.parameter_file) + ost.LogInfo(" --> for reference(s)") + for reference in references: + ost.LogInfo("Checking %s" % reference.GetName()) + CheckStructure(reference, + stereochemical_parameters.bond_table, + stereochemical_parameters.angle_table, + stereochemical_parameters.nonbonded_table, + opts.bond_tolerance, + opts.angle_tolerance) + ost.LogInfo(" --> for model(s)") + for model in models: + ost.LogInfo("Checking %s" % model.GetName()) + CheckStructure(model, + stereochemical_parameters.bond_table, + stereochemical_parameters.angle_table, + stereochemical_parameters.nonbonded_table, + opts.bond_tolerance, + opts.angle_tolerance) + if len(models) > 1 or len(references) > 1: + ost.LogInfo("#" * 80) + ost.LogInfo( + "Multiple complexes mode ON. All combinations will be tried.") + + result = { + "result": {}, + "options": vars(opts)} + result["options"]["cwd"] = os.path.abspath(os.getcwd()) + # + # Perform scoring + skipped = list() + for model in models: + model_name = model.GetName() + model_results = dict() + for reference in references: + reference_name = reference.GetName() + reference_results = { + "info": dict()} + ost.LogInfo("#" * 80) + ost.LogInfo("Comparing %s to %s" % ( + model_name, + reference_name)) + qs_scorer = qsscoring.QSscorer(reference, + model, + opts.residue_number_alignment) + qs_scorer.max_mappings_extensive = opts.qs_max_mappings_extensive + if opts.chain_mapping is not None: + ost.LogInfo( + "Using custom chain mapping: %s" % str( + opts.chain_mapping)) + qs_scorer.chain_mapping = opts.chain_mapping + else: + try: + qs_scorer.chain_mapping # just to initialize it + except qsscoring.QSscoreError as ex: + ost.LogError('Chain mapping failed:', str(ex)) + ost.LogError('Skipping comparison') + continue + ost.LogInfo("-" * 80) + ost.LogInfo("Checking consistency between %s and %s" % ( + model_name, reference_name)) + is_cons = _CheckConsistency( + qs_scorer.alignments, + opts.consistency_checks) + reference_results["info"]["residue_names_consistent"] = is_cons + reference_results["info"]["mapping"] = { + "chain_mapping": qs_scorer.chain_mapping, + "chain_mapping_scheme": qs_scorer.chain_mapping_scheme, + "alignments": _GetAlignmentsAsFasta(qs_scorer.alignments)} + skip_score = False + if opts.consistency_checks: + if not is_cons: + msg = (("Residue names in model %s and in reference " + "%s are inconsistent.") % ( + model_name, + reference_name)) + ost.LogError(msg) + skip_score = True + skipped.append(skip_score) + else: + ost.LogInfo("Consistency check: OK") + skipped.append(False) + else: + skipped.append(False) + if not is_cons: + msg = (("Residue names in model %s and in reference " + "%s are inconsistent.\nThis might lead to " + "corrupted results.") % ( + model_name, + reference_name)) + ost.LogWarning(msg) + else: + ost.LogInfo("Consistency check: OK") + if opts.qs_rmsd: + ost.LogInfo("-" * 80) + if skip_score: + ost.LogInfo( + "Skipping QS-RMSD because consistency check failed") + reference_results["qs_rmsd"] = { + "status": "FAILURE", + "error": "Consistency check failed."} + else: + ost.LogInfo("Computing QS-RMSD") + try: + reference_results["qs_rmsd"] = { + "status": "SUCCESS", + "error": "", + "ca_rmsd": qs_scorer.superposition.rmsd} + except qsscoring.QSscoreError as ex: + ost.LogError('QS-RMSD failed:', str(ex)) + reference_results["qs_rmsd"] = { + "status": "FAILURE", + "error": str(ex)} + if opts.qs_score: + ost.LogInfo("-" * 80) + if skip_score: + ost.LogInfo( + "Skipping QS-score because consistency check failed") + reference_results["qs_score"] = { + "status": "FAILURE", + "error": "Consistency check failed.", + "global_score": 0.0, + "best_score": 0.0} + else: + ost.LogInfo("Computing QS-score") + try: + reference_results["qs_score"] = { + "status": "SUCCESS", + "error": "", + "global_score": qs_scorer.global_score, + "best_score": qs_scorer.best_score} + except qsscoring.QSscoreError as ex: + # default handling: report failure and set score to 0 + ost.LogError('QSscore failed:', str(ex)) + reference_results["qs_score"] = { + "status": "FAILURE", + "error": str(ex), + "global_score": 0.0, + "best_score": 0.0} + # Calculate lDDT + if opts.lddt: + ost.LogInfo("-" * 80) + ost.LogInfo("Computing lDDT scores") + lddt_results = { + "single_chain_lddt": list() + } + lddt_settings = lDDTSettings( + radius=opts.inclusion_radius, + sequence_separation=opts.sequence_separation, + label="lddt") + ost.LogInfo("lDDT settings: ") + ost.LogInfo(str(lddt_settings).rstrip()) + ost.LogInfo("===") + oligo_lddt_scorer = qs_scorer.GetOligoLDDTScorer(lddt_settings) + for mapped_lddt_scorer in oligo_lddt_scorer.mapped_lddt_scorers: + # Get data + lddt_scorer = mapped_lddt_scorer.lddt_scorer + model_chain = mapped_lddt_scorer.model_chain_name + reference_chain = mapped_lddt_scorer.reference_chain_name + if skip_score: + ost.LogInfo( + " --> Skipping single chain lDDT because " + "consistency check failed") + lddt_results["single_chain_lddt"].append({ + "status": "FAILURE", + "error": "Consistency check failed.", + "model_chain": model_chain, + "reference_chain": reference_chain, + "global_score": 0.0, + "conserved_contacts": 0.0, + "total_contacts": 0.0}) + else: + try: + ost.LogInfo((" --> Computing lDDT between model " + "chain %s and reference chain %s") % ( + model_chain, + reference_chain)) + ost.LogInfo("Global LDDT score: %.4f" % + lddt_scorer.global_score) + ost.LogInfo( + "(%i conserved distances out of %i checked, over " + "%i thresholds)" % (lddt_scorer.conserved_contacts, + lddt_scorer.total_contacts, + len(lddt_settings.cutoffs))) + sc_lddt_scores = { + "status": "SUCCESS", + "error": "", + "model_chain": model_chain, + "reference_chain": reference_chain, + "global_score": lddt_scorer.global_score, + "conserved_contacts": + lddt_scorer.conserved_contacts, + "total_contacts": lddt_scorer.total_contacts} + if opts.save_per_residue_scores: + per_residue_sc = \ + mapped_lddt_scorer.GetPerResidueScores() + ost.LogInfo("Per residue local lDDT (reference):") + ost.LogInfo("Chain\tResidue Number\tResidue Name" + "\tlDDT\tConserved Contacts\tTotal " + "Contacts") + for prs_scores in per_residue_sc: + ost.LogInfo("%s\t%i\t%s\t%.4f\t%i\t%i" % ( + reference_chain, + prs_scores["residue_number"], + prs_scores["residue_name"], + prs_scores["lddt"], + prs_scores["conserved_contacts"], + prs_scores["total_contacts"])) + sc_lddt_scores["per_residue_scores"] = \ + per_residue_sc + lddt_results["single_chain_lddt"].append( + sc_lddt_scores) + except Exception as ex: + ost.LogError('Single chain lDDT failed:', str(ex)) + lddt_results["single_chain_lddt"].append({ + "status": "FAILURE", + "error": str(ex), + "model_chain": model_chain, + "reference_chain": reference_chain, + "global_score": 0.0, + "conserved_contacts": 0.0, + "total_contacts": 0.0}) + # perform oligo lddt scoring + if skip_score: + ost.LogInfo( + " --> Skipping oligomeric lDDT because consistency " + "check failed") + lddt_results["oligo_lddt"] = { + "status": "FAILURE", + "error": "Consistency check failed.", + "global_score": 0.0} + else: + try: + ost.LogInfo(' --> Computing oligomeric lDDT score') + lddt_results["oligo_lddt"] = { + "status": "SUCCESS", + "error": "", + "global_score": oligo_lddt_scorer.oligo_lddt} + ost.LogInfo( + "Oligo lDDT score: %.4f" % + oligo_lddt_scorer.oligo_lddt) + except Exception as ex: + ost.LogError('Oligo lDDT failed:', str(ex)) + lddt_results["oligo_lddt"] = { + "status": "FAILURE", + "error": str(ex), + "global_score": 0.0} + if skip_score: + ost.LogInfo( + " --> Skipping weighted lDDT because consistency " + "check failed") + lddt_results["weighted_lddt"] = { + "status": "FAILURE", + "error": "Consistency check failed.", + "global_score": 0.0} + else: + try: + ost.LogInfo(' --> Computing weighted lDDT score') + lddt_results["weighted_lddt"] = { + "status": "SUCCESS", + "error": "", + "global_score": oligo_lddt_scorer.weighted_lddt} + ost.LogInfo( + "Weighted lDDT score: %.4f" % + oligo_lddt_scorer.weighted_lddt) + except Exception as ex: + ost.LogError('Weighted lDDT failed:', str(ex)) + lddt_results["weighted_lddt"] = { + "status": "FAILURE", + "error": str(ex), + "global_score": 0.0} + reference_results["lddt"] = lddt_results + model_results[reference_name] = reference_results + if opts.dump_structures: + ost.LogInfo("-" * 80) + ref_output_path = os.path.join( + os.path.dirname(opts.reference), + reference_name + opts.dump_suffix) + ost.LogInfo("Saving cleaned up reference to %s" % + ref_output_path) + try: + SavePDB(qs_scorer.qs_ent_1.ent, + ref_output_path) + except Exception as ex: + ost.LogError("Cannot save reference: %s" % str(ex)) + mdl_output_path = os.path.join( + os.path.dirname(opts.model), + model_name + opts.dump_suffix) + ost.LogInfo("Saving cleaned up model to %s" % + mdl_output_path) + try: + SavePDB(qs_scorer.qs_ent_2.ent, + mdl_output_path) + except Exception as ex: + ost.LogError("Cannot save model: %s" % str(ex)) + result["result"][model_name] = model_results + + if all(skipped) and len(skipped) > 0: + ost.LogError("Consistency check failed for all model-reference pairs.") + if opts.output is not None: + ost.LogInfo("#" * 80) + ost.LogInfo("Saving output into %s" % opts.output) + with open(opts.output, "w") as outfile: + json.dump(result, outfile, indent=4, sort_keys=True) + +if __name__ == '__main__': + _Main() + diff --git a/actions/ost-compare-structures-new b/actions/ost-compare-structures-new deleted file mode 100644 index 51160472a..000000000 --- a/actions/ost-compare-structures-new +++ /dev/null @@ -1,577 +0,0 @@ -""" -Evaluate model against reference - -Example: ost compare-structures-new -m model.pdb -r reference.cif - -Loads the structures and performs basic cleanup: - - * Assign elements according to the PDB compound dictionary - * Map nonstandard residues to their parent residues as defined by the PDB - compound dictionary, e.g. phospho-serine => serine - * Remove hydrogens - * Remove OXT atoms - * Remove unknown atoms, i.e. atoms that are not expected according to the PDB - compound dictionary - -The cleaned structures are optionally dumped using -d/--dump-structures - -Output is written in JSON format (default: out.json). In case of no additional -options, this is a dictionary with five keys: - - * "chain_mapping": A dictionary with reference chain names as keys and the - mapped model chain names as values. - * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta - format. - * "chem_groups": Groups of polypeptides/polynucleotides that are considered - chemically equivalent. You can derive stoichiometry from this. - * "inconsistent_residues": List of strings that represent name mismatches of - aligned residues in form - <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. - Inconsistencies may lead to corrupt results but do not abort the program. - Program abortion in these cases can be enforced with - -ec/--enforce-consistency. - * "status": SUCCESS if everything ran through. In case of failure, the only - content of the JSON output will be \"status\" set to FAILURE and an - additional key: "traceback". - -The pairwise sequence alignments are computed with Needleman-Wunsch using -BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the -structures to ensure matching residue numbers (CASP/CAMEO). In these cases, -enabling -rna/--residue-number-alignment is recommended. - -Each score is opt-in and can be enabled with optional arguments. - -Example to compute local and per-residue lDDT values as well as QS-score: - -ost compare-structures-new -m model.pdb -r reference.cif --lddt --local-lddt \ ---qs-score - -Example to inject custom chain mapping - -ost compare-structures-new -m model.pdb -r reference.cif -c A:B B:A -""" - -import argparse -import os -import json -import time -import sys -import traceback - -from ost import io -from ost.mol.alg import scoring - -def _ParseArgs(): - parser = argparse.ArgumentParser(description = __doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - prog = "ost compare-structures-new") - - parser.add_argument( - "-m", - "--model", - dest="model", - required=True, - help=("Path to model file.")) - - parser.add_argument( - "-r", - "--reference", - dest="reference", - required=True, - help=("Path to reference file.")) - - parser.add_argument( - "-o", - "--output", - dest="output", - required=False, - default="out.json", - help=("Output file name. The output will be saved as a JSON file. " - "default: out.json")) - - parser.add_argument( - "-mf", - "--model-format", - dest="model_format", - required=False, - default=None, - choices=["pdb", "cif", "mmcif"], - help=("Format of model file. pdb reads pdb but also pdb.gz, same " - "applies to cif/mmcif. Inferred from filepath if not given.")) - - parser.add_argument( - "-rf", - "--reference-format", - dest="reference_format", - required=False, - default=None, - choices=["pdb", "cif", "mmcif"], - help=("Format of reference file. pdb reads pdb but also pdb.gz, same " - "applies to cif/mmcif. Inferred from filepath if not given.")) - - parser.add_argument( - "-mb", - "--model-biounit", - dest="model_biounit", - required=False, - default=None, - type=int, - help=("Only has an effect if model is in mmcif format. By default, " - "the assymetric unit (AU) is used for scoring. If there are " - "biounits defined in the mmcif file, you can specify the index " - "of the one which should be used.")) - - parser.add_argument( - "-rb", - "--reference-biounit", - dest="reference_biounit", - required=False, - default=None, - type=int, - help=("Only has an effect if reference is in mmcif format. By default, " - "the assymetric unit (AU) is used for scoring. If there are " - "biounits defined in the mmcif file, you can specify the index " - "of the one which should be used.")) - - parser.add_argument( - "-rna", - "--residue-number-alignment", - dest="residue_number_alignment", - default=False, - action="store_true", - help=("Make alignment based on residue number instead of using " - "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) - - parser.add_argument( - "-ec", - "--enforce-consistency", - dest="enforce_consistency", - default=False, - action="store_true", - help=("Enforce consistency. By default residue name discrepancies " - "between a model and reference are reported but the program " - "proceeds. If this flag is ON, the program fails for these " - "cases.")) - - parser.add_argument( - "-d", - "--dump-structures", - dest="dump_structures", - default=False, - action="store_true", - help=("Dump cleaned structures used to calculate all the scores as " - "PDB files using specified suffix. Files will be dumped to the " - "same location as original files.")) - - parser.add_argument( - "-ds", - "--dump-suffix", - dest="dump_suffix", - default=".compare.structures.pdb", - help=("Use this suffix to dump structures.\n" - "Defaults to .compare.structures.pdb.")) - - parser.add_argument( - "-ft", - "--fault-tolerant", - dest="fault_tolerant", - default=False, - action="store_true", - help=("Fault tolerant parsing.")) - - parser.add_argument( - "-c", - "--chain-mapping", - nargs="+", - dest="chain_mapping", - help=("Custom mapping of chains between the reference and the model. " - "Each separate mapping consist of key:value pairs where key " - "is the chain name in reference and value is the chain name in " - "model.")) - - parser.add_argument( - "--lddt", - dest="lddt", - default=False, - action="store_true", - help=("Compute global lDDT score with default parameterization and " - "store as key \"lddt\". Stereochemical irregularities affecting " - "lDDT are reported as keys \"model_clashes\", " - "\"model_bad_bonds\", \"model_bad_angles\" and the respective " - "reference counterparts.")) - - parser.add_argument( - "--local-lddt", - dest="local_lddt", - default=False, - action="store_true", - help=("Compute per-residue lDDT scores with default parameterization " - "and store as key \"local_lddt\". Score for model residue with " - "number 42 in chain X can be extracted with: " - "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion " - "code, lets say A, the last key becomes \"42A\" Stereochemical " - "irregularities affecting lDDT are reported as keys " - "\"model_clashes\", \"model_bad_bonds\", \"model_bad_angles\" " - "and the respective reference counterparts.")) - - parser.add_argument( - "--cad-score", - dest="cad_score", - default=False, - action="store_true", - help=("Compute global CAD's atom-atom (AA) score and store as key " - "\"cad_score\". --residue-number-alignment must be enabled " - "to compute this score. Requires voronota_cadscore executable " - "in PATH. Alternatively you can set cad-exec.")) - - parser.add_argument( - "--local-cad-score", - dest="local_cad_score", - default=False, - action="store_true", - help=("Compute local CAD's atom-atom (AA) scores and store as key " - "\"local_cad_score\". Score for model residue with number 42 in " - "chain X can be extracted with: " - "data[\"local_cad_score\"][\"X\"][\"42\"]. " - "--residue-number-alignments must be enabled to compute this " - "score. Requires voronota_cadscore executable in PATH. " - "Alternatively you can set cad-exec.")) - - parser.add_argument( - "--cad-exec", - dest="cad_exec", - default=None, - help=("Path to voronota-cadscore executable (installed from " - "https://github.com/kliment-olechnovic/voronota). Searches PATH " - "if not set.")) - - parser.add_argument( - "--qs-score", - dest="qs_score", - default=False, - action="store_true", - help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " - "variant, stored as key \"qs_best\".")) - - parser.add_argument( - "--rigid-scores", - dest="rigid_scores", - default=False, - action="store_true", - help=("Computes rigid superposition based scores. They're based on a " - "Kabsch superposition of all mapped CA positions (C3' for " - "nucleotides). Makes the following keys available: " - "\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, " - "8.0] given these positions and transformation, \"oligo_gdtha\": " - "same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given " - "these positions and transformation, \"transform\": the used 4x4 " - "transformation matrix that superposes model onto reference.")) - - parser.add_argument( - "--interface-scores", - dest="interface_scores", - default=False, - action="store_true", - help=("Per interface scores for each interface that has at least one " - "contact in the reference, i.e. at least one pair of heavy atoms " - "within 5A. The respective interfaces are available from key " - "\"interfaces\" which is a list of tuples in form (ref_ch1, " - "ref_ch2, mdl_ch1, mdl_ch2). Per-interface scores are available " - "as lists referring to these interfaces and have the following " - "keys: \"nnat\" (number of contacts in reference), \"nmdl\" " - "(number of contacts in model), \"fnat\" (fraction of reference " - "contacts which are also there in model), \"fnonnat\" (fraction " - "of model contacts which are not there in target), \"irmsd\" " - "(interface RMSD), \"lrmsd\" (ligand RMSD), \"dockq_scores\" " - "(per-interface score computed from \"fnat\", \"irmsd\" and " - "\"lrmsd\"), \"interface_qs_global\" and \"interface_qs_best\" " - "(per-interface versions of the two QS-score variants). The " - "DockQ score is strictly designed to score each interface " - "individually. We also provide two averaged versions to get one " - "full model score: \"dockq_ave\", \"dockq_wave\". The first is " - "simply the average of \"dockq_scores\", the latter is a " - "weighted average with weights derived from \"nnat\". These two " - "scores only consider interfaces that are present in both, the " - "model and the reference. \"dockq_ave_full\" and " - "\"dockq_wave_full\" add zeros in the average computation for " - "each interface that is only present in the reference but not in " - "the model.")) - - parser.add_argument( - "--patch-scores", - dest="patch_scores", - default=False, - action="store_true", - help=("Local interface quality score used in CASP15. Scores each " - "model residue that is considered in the interface (CB pos " - "within 8A of any CB pos from another chain (CA for GLY)). The " - "local neighborhood gets represented by \"interface patches\" " - "which are scored with QS-score and DockQ. Scores where not " - "the full patches are represented by the reference are set to " - "None. Model interface residues are available as key " - "\"model_interface_residues\", reference interface residues as " - "key \"reference_interface_residues\". Residues are represented " - "as string in form <num><inscode>. The respective scores are " - "available as keys \"patch_qs\" and \"patch_dockq\"")) - - return parser.parse_args() - -def _Rename(ent): - """Revert chain names to original names. - - PDBize assigns chain name in order A,B,C,D... which does not allow to infer - the original chain name. We do a renaming here: - if there are two chains mapping to chain A the resulting - chain names will be: A and A2. - """ - new_chain_names = list() - chain_indices = list() # the chains where we actually change the name - suffix_indices = dict() # keep track of whats the current suffix index - # for each original chain name - - for ch_idx, ch in enumerate(ent.chains): - if not ch.HasProp("original_name"): - # pdbize doesnt set this property for chain names in ['_', '-'] - continue - original_name = ch.GetStringProp("original_name") - if original_name in new_chain_names: - new_name = original_name + str(suffix_indices[original_name]) - new_chain_names.append(new_name) - suffix_indices[original_name] = suffix_indices[original_name] + 1 - else: - new_chain_names.append(original_name) - suffix_indices[original_name] = 2 - chain_indices.append(ch_idx) - editor = ent.EditXCS() - # rename to nonsense to avoid clashing chain names - for ch_idx in chain_indices: - editor.RenameChain(ent.chains[ch_idx], ent.chains[ch_idx].name+"_yolo") - # and do final renaming - for new_name, ch_idx in zip(new_chain_names, chain_indices): - editor.RenameChain(ent.chains[ch_idx], new_name) - -def _LoadStructure(structure_path, sformat=None, fault_tolerant=False, - bu_idx=None): - """Read OST entity either from mmCIF or PDB. - - The returned structure has structure_path attached as structure name - """ - - if not os.path.exists(structure_path): - raise Exception(f"file not found: {structure_path}") - - if sformat is None: - # Determine file format from suffix. - ext = structure_path.split(".") - if ext[-1] == "gz": - ext = ext[:-1] - if len(ext) <= 1: - raise Exception(f"Could not determine format of file " - f"{structure_path}.") - sformat = ext[-1].lower() - - # increase loglevel, as we would pollute the info log with weird stuff - ost.PushVerbosityLevel(ost.LogLevel.Error) - # Load the structure - if sformat in ["mmcif", "cif"]: - if bu_idx is not None: - cif_entity, cif_seqres, cif_info = \ - io.LoadMMCIF(structure_path, info=True, seqres=True, - fault_tolerant=fault_tolerant) - if bu_idx >= len(cif_info.biounits): - raise RuntimeError(f"Invalid biounit index - requested {bu_idx} " - f"cif file has {len(cif_info.biounits)}") - biounit = cif_info.biounits[bu_idx] - entity = biounit.PDBize(cif_entity, min_polymer_size=0) - if not entity.IsValid(): - raise IOError( - "Provided file does not contain valid entity.") - _Rename(entity) - else: - entity = io.LoadMMCIF(structure_path, - fault_tolerant = fault_tolerant) - if len(entity.residues) == 0: - raise Exception(f"No residues found in file: {structure_path}") - elif sformat == "pdb": - entity = io.LoadPDB(structure_path, fault_tolerant = fault_tolerant) - if len(entity.residues) == 0: - raise Exception(f"No residues found in file: {structure_path}") - else: - raise Exception(f"Unknown/ unsupported file extension found for " - f"file {structure_path}.") - # restore old loglevel and return - ost.PopVerbosityLevel() - entity.SetName(structure_path) - return entity - -def _AlnToFastaStr(aln): - """ Returns alignment as fasta formatted string - """ - s1 = aln.GetSequence(0) - s2 = aln.GetSequence(1) - return f">reference:{s1.name}\n{str(s1)}\nmodel:{s2.name}\n{str(s2)}" - -def _GetInconsistentResidues(alns): - lst = list() - for aln in alns: - for col in aln: - r1 = col.GetResidue(0) - r2 = col.GetResidue(1) - if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): - lst.append((r1, r2)) - lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst] - return lst - -def _LocalScoresToJSONDict(score_dict): - """ Score for model residue with number rnum in chain X can be extracted - with: data["local_cad_score"]["X"][rnum]. Convert ResNum object to str for - JSON serialization - """ - json_dict = dict() - for ch, ch_scores in score_dict.items(): - json_dict[ch] = {str(rnum): s for rnum, s in ch_scores.items()} - return json_dict - -def _InterfaceResiduesToJSONDict(interface_dict): - """ Interface residues are stored as - data["model_interface_residues"]["A"][rnum1, rnum2,...]. Convert ResNum - object to str for JSON serialization. - """ - json_dict = dict() - for ch, ch_nums in interface_dict.items(): - json_dict[ch] = [str(rnum) for rnum in ch_nums] - return json_dict - -def _Process(model, reference, args): - - mapping = None - if args.chain_mapping is not None: - mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} - - scorer = scoring.Scorer(model, reference, - resnum_alignments = args.residue_number_alignment, - cad_score_exec = args.cad_exec, - custom_mapping = mapping) - - ir = _GetInconsistentResidues(scorer.aln) - if len(ir) > 0 and args.enforce_consistency: - raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") - - out = dict() - out["chain_mapping"] = scorer.mapping.GetFlatMapping() - out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] - out["inconsistent_residues"] = ir - out["chem_groups"] = scorer.chain_mapper.chem_groups - - if args.lddt: - out["lddt"] = scorer.lddt - - if args.local_lddt: - out["local_lddt"] = _LocalScoresToJSONDict(scorer.local_lddt) - - if args.lddt or args.local_lddt: - out["model_clashes"] = [x.ToJSON() for x in scorer.model_clashes] - out["model_bad_bonds"] = [x.ToJSON() for x in scorer.model_bad_bonds] - out["model_bad_angles"] = [x.ToJSON() for x in scorer.model_bad_angles] - out["reference_clashes"] = [x.ToJSON() for x in scorer.target_clashes] - out["reference_bad_bonds"] = [x.ToJSON() for x in scorer.target_bad_bonds] - out["reference_bad_angles"] = [x.ToJSON() for x in scorer.target_bad_angles] - - if args.cad_score: - out["cad_score"] = scorer.cad_score - - if args.local_cad_score: - out["local_cad_score"] = _LocalScoresToJSONDict(scorer.local_cad_score) - - if args.qs_score: - out["qs_global"] = scorer.qs_global - out["qs_best"] = scorer.qs_best - - if args.rigid_scores: - out["oligo_gdtts"] = scorer.gdtts - out["oligo_gdtha"] = scorer.gdtha - out["rmsd"] = scorer.rmsd - data = scorer.transform.data - out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] - - if args.interface_scores: - out["interfaces"] = scorer.interfaces - out["nnat"] = scorer.native_contacts - out["nmdl"] = scorer.model_contacts - out["fnat"] = scorer.fnat - out["fnonnat"] = scorer.fnonnat - out["irmsd"] = scorer.irmsd - out["lrmsd"] = scorer.lrmsd - out["dockq_scores"] = scorer.dockq_scores - out["interface_qs_global"] = scorer.interface_qs_global - out["interface_qs_best"] = scorer.interface_qs_best - out["dockq_ave"] = scorer.dockq_ave - out["dockq_wave"] = scorer.dockq_wave - out["dockq_ave_full"] = scorer.dockq_ave_full - out["dockq_wave_full"] = scorer.dockq_wave_full - - if args.patch_scores: - out["model_interface_residues"] = \ - _InterfaceResiduesToJSONDict(scorer.model_interface_residues) - out["reference_interface_residues"] = \ - _InterfaceResiduesToJSONDict(scorer.target_interface_residues) - out["patch_qs"] = scorer.patch_qs - out["patch_dockq"] = scorer.patch_dockq - - if args.dump_structures: - try: - io.SavePDB(scorer.model, model.GetName() + args.dump_suffix) - except Exception as e: - if "single-letter" in str(e) and args.model_biounit is not None: - raise RuntimeError("Failed to dump processed model. PDB " - "format only supports single character " - "chain names. This is likely the result of " - "chain renaming when constructing a user " - "specified biounit. Dumping structures " - "fails in this case.") - else: - raise - try: - io.SavePDB(scorer.target, reference.GetName() + args.dump_suffix) - except Exception as e: - if "single-letter" in str(e) and args.reference_biounit is not None: - raise RuntimeError("Failed to dump processed reference. PDB " - "format only supports single character " - "chain names. This is likely the result of " - "chain renaming when constructing a user " - "specified biounit. Dumping structures " - "fails in this case.") - else: - raise - - - - - return out - -def _Main(): - - args = _ParseArgs() - try: - reference = _LoadStructure(args.reference, - sformat=args.reference_format, - bu_idx=args.reference_biounit, - fault_tolerant = args.fault_tolerant) - model = _LoadStructure(args.model, - sformat=args.model_format, - bu_idx=args.model_biounit, - fault_tolerant = args.fault_tolerant) - out = _Process(model, reference, args) - out["status"] = "SUCCESS" - with open(args.output, 'w') as fh: - json.dump(out, fh, indent=4, sort_keys=False) - except: - out = dict() - out["status"] = "FAILURE" - out["traceback"] = traceback.format_exc() - with open(args.output, 'w') as fh: - json.dump(out, fh, indent=4, sort_keys=False) - raise - -if __name__ == '__main__': - _Main() diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index 9444c519a..6afdbe6ce 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -18,585 +18,209 @@ Here we list the most prominent actions with simple examples. Comparing two structures -------------------------------------------------------------------------------- -You can compare two structures in terms of quaternary structure score and -lDDT scores between two complexes from the command line with the +You can compare two structures from the command line with the ``ost compare-structures`` action. -In summary it performs the following steps: +.. warning:: -- Read structures (PDB or mmCIF format, can be gzipped) and split into - biological assemblies (all possible pairs are scored). -- Mandatory cleanup of hydrogen atoms, ligand and water chains, small - (< 20 residues) peptides or chains with no amino acids as described in - :attr:`QSscoreEntity.ent <ost.mol.alg.qsscoring.QSscoreEntity.ent>` and - :attr:`QSscoreEntity.removed_chains <ost.mol.alg.qsscoring.QSscoreEntity.removed_chains>`. -- Optional cleanup of structures with :func:`~ost.mol.alg.Molck`. -- Optional structural checks with :func:`~ost.mol.alg.CheckStructure`. -- Unless user-provided, find chain mapping between complexes (see - :attr:`here <ost.mol.alg.qsscoring.QSscorer.chain_mapping>` for details) -- Perform sequence alignment of chain pairs (unless user asks for alignment - based on residue numbers). Alignment can optionally checked for consistency - if 100% sequence identity is expected. -- Compute scores requested by user (CA-RMSD of oligomer, - :mod:`QS scores of oligomer <ost.mol.alg.qsscoring>`, - :class:`single chain lDDT scores <ost.mol.alg.qsscoring.OligoLDDTScorer.sc_lddt>`, - :attr:`lDDT score of oligomer <ost.mol.alg.qsscoring.OligoLDDTScorer.oligo_lddt>`). - Note that while the QS score is symmetric (same result when swapping reference - and model), the lDDT scores are not. Extra atoms in the model for mapped - chains have no effect on the score, while extra atoms in the reference reduce - the score. For oligo_lddt, we do - :attr:`penalize for extra chains <ost.mol.alg.qsscoring.OligoLDDTScorer.penalize_extra_chains>` - in both reference and model. - -.. note :: - - The action relies on OST's :mod:`~ost.mol.alg.qsscoring` module and has the - same requirements on your OST installation (needs compound library, ClustalW, - numpy and scipy). + ``compare-structures`` underwent a complete rewrite in OpenStructure + release 2.4.0. The old version is still available as + ``compare-structures-legacy`` with documentation available + :doc:`here <deprecated_actions>`. Details on the usage (output of ``ost compare-structures --help``): .. code-block:: console - usage: ost compare-structures [-h] -m MODEL -r REFERENCE [-v VERBOSITY] - [-o OUTPUT] [-d] [-ds DUMP_SUFFIX] - [-rs REFERENCE_SELECTION] [-ms MODEL_SELECTION] - [-ca] [-ft] [-cl COMPOUND_LIBRARY] [-ml] - [-rm REMOVE [REMOVE ...]] [-ce] [-mn] [-sc] - [-p PARAMETER_FILE] [-bt BOND_TOLERANCE] - [-at ANGLE_TOLERANCE] - [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] - [--qs-max-mappings-extensive QS_MAX_MAPPINGS_EXTENSIVE] - [-cc] [-rna] [-qs] [--qs-rmsd] [-l] - [-ir INCLUSION_RADIUS] [-ss SEQUENCE_SEPARATION] - [-spr] - - Evaluate model structure against reference. - - eg. - - ost compare-structures \ - --model <MODEL> \ - --reference <REF> \ - --output output.json \ - --lddt \ - --structural-checks \ - --consistency-checks \ - --molck \ - --remove oxt hyd \ - --map-nonstandard-residues - - Here we describe how the parameters can be set to mimick a CAMEO evaluation - (as of August 2018). - - CAMEO calls the lddt binary as follows: - - lddt \ - -p <PARAMETER FILE> \ - -f \ - -a 15 \ - -b 15 \ - -r 15 \ - <MODEL> \ - <REF> - - Only model structures are "Molck-ed" in CAMEO. The call to molck is as follows: - - molck \ - --complib=<COMPOUND LIB> \ - --rm=hyd,oxt,unk,nonstd \ - --fix-ele \ - --map-nonstd \ - --out=<OUTPUT> \ - <FILEPATH> - - To be as much compatible with with CAMEO as possible one should call - compare-structures as follows: - - ost compare-structures \ - --model <MODEL> \ - --reference <REF> \ - --output output.json \ - --molck \ - --remove oxt hyd unk nonstd \ - --clean-element-column \ - --map-nonstandard-residues \ - --structural-checks \ - --bond-tolerance 15.0 \ - --angle-tolerance 15.0 \ - --residue-number-alignment \ - --consistency-checks \ - --qs-score \ - --lddt \ - --inclusion-radius 15.0 + usage: ost compare-structures [-h] -m MODEL -r REFERENCE [-o OUTPUT] + [-mf {pdb,cif,mmcif}] [-rf {pdb,cif,mmcif}] + [-mb MODEL_BIOUNIT] [-rb REFERENCE_BIOUNIT] + [-rna] [-ec] [-d] [-ds DUMP_SUFFIX] [-ft] + [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] [--lddt] + [--local-lddt] [--cad-score] [--local-cad-score] + [--cad-exec CAD_EXEC] [--qs-score] + [--rigid-scores] [--interface-scores] + [--patch-scores] + + Evaluate model against reference + + Example: ost compare-structures -m model.pdb -r reference.cif + + Loads the structures and performs basic cleanup: + + * Assign elements according to the PDB compound dictionary + * Map nonstandard residues to their parent residues as defined by the PDB + compound dictionary, e.g. phospho-serine => serine + * Remove hydrogens + * Remove OXT atoms + * Remove unknown atoms, i.e. atoms that are not expected according to the PDB + compound dictionary + + The cleaned structures are optionally dumped using -d/--dump-structures + + Output is written in JSON format (default: out.json). In case of no additional + options, this is a dictionary with five keys: + + * "chain_mapping": A dictionary with reference chain names as keys and the + mapped model chain names as values. + * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta + format. + * "chem_groups": Groups of polypeptides/polynucleotides that are considered + chemically equivalent. You can derive stoichiometry from this. + * "inconsistent_residues": List of strings that represent name mismatches of + aligned residues in form + <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. + Inconsistencies may lead to corrupt results but do not abort the program. + Program abortion in these cases can be enforced with + -ec/--enforce-consistency. + * "status": SUCCESS if everything ran through. In case of failure, the only + content of the JSON output will be "status" set to FAILURE and an + additional key: "traceback". + + The pairwise sequence alignments are computed with Needleman-Wunsch using + BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the + structures to ensure matching residue numbers (CASP/CAMEO). In these cases, + enabling -rna/--residue-number-alignment is recommended. + + Each score is opt-in and can be enabled with optional arguments. + + Example to compute local and per-residue lDDT values as well as QS-score: + + ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score + + Example to inject custom chain mapping + + ost compare-structures -m model.pdb -r reference.cif -c A:B B:A optional arguments: -h, --help show this help message and exit - - required arguments: -m MODEL, --model MODEL - Path to the model file. + Path to model file. -r REFERENCE, --reference REFERENCE - Path to the reference file. - - general arguments: - -v VERBOSITY, --verbosity VERBOSITY - Set verbosity level. Defaults to 3. + Path to reference file. -o OUTPUT, --output OUTPUT - Output file name. The output will be saved as a JSON file. + Output file name. The output will be saved as a JSON + file. default: out.json + -mf {pdb,cif,mmcif}, --model-format {pdb,cif,mmcif} + Format of model file. pdb reads pdb but also pdb.gz, + same applies to cif/mmcif. Inferred from filepath if + not given. + -rf {pdb,cif,mmcif}, --reference-format {pdb,cif,mmcif} + Format of reference file. pdb reads pdb but also + pdb.gz, same applies to cif/mmcif. Inferred from + filepath if not given. + -mb MODEL_BIOUNIT, --model-biounit MODEL_BIOUNIT + Only has an effect if model is in mmcif format. By + default, the assymetric unit (AU) is used for scoring. + If there are biounits defined in the mmcif file, you + can specify the index of the one which should be used. + -rb REFERENCE_BIOUNIT, --reference-biounit REFERENCE_BIOUNIT + Only has an effect if reference is in mmcif format. By + default, the assymetric unit (AU) is used for scoring. + If there are biounits defined in the mmcif file, you + can specify the index of the one which should be used. + -rna, --residue-number-alignment + Make alignment based on residue number instead of + using a global BLOSUM62-based alignment (NUC44 for + nucleotides). + -ec, --enforce-consistency + Enforce consistency. By default residue name + discrepancies between a model and reference are + reported but the program proceeds. If this flag is ON, + the program fails for these cases. -d, --dump-structures - Dump cleaned structures used to calculate all the scores as - PDB files using specified suffix. Files will be dumped to the - same location as original files. + Dump cleaned structures used to calculate all the + scores as PDB files using specified suffix. Files will + be dumped to the same location as original files. -ds DUMP_SUFFIX, --dump-suffix DUMP_SUFFIX - Use this suffix to dump structures. - Defaults to .compare.structures.pdb. - -rs REFERENCE_SELECTION, --reference-selection REFERENCE_SELECTION - Selection performed on reference structures. - -ms MODEL_SELECTION, --model-selection MODEL_SELECTION - Selection performed on model structures. - -ca, --c-alpha-only Use C-alpha atoms only. Equivalent of calling the action with - '--model-selection="aname=CA" --reference-selection="aname=CA"' - options. + Use this suffix to dump structures. Defaults to + .compare.structures.pdb. -ft, --fault-tolerant Fault tolerant parsing. - -cl COMPOUND_LIBRARY, --compound-library COMPOUND_LIBRARY - Location of the compound library file (compounds.chemlib). - If not provided, the following locations are searched in this - order: 1. Working directory, 2. OpenStructure standard library - location. - - molecular check arguments: - -ml, --molck Run molecular checker to clean up input. - -rm REMOVE [REMOVE ...], --remove REMOVE [REMOVE ...] - Remove atoms and residues matching some criteria: - * zeroocc - Remove atoms with zero occupancy - * hyd - remove hydrogen atoms - * oxt - remove terminal oxygens - * nonstd - remove all residues not one of the 20 - standard amino acids - * unk - Remove unknown and atoms not following the - nomenclature - Defaults to hyd. - -ce, --clean-element-column - Clean up element column - -mn, --map-nonstandard-residues - Map modified residues back to the parent amino acid, for - example MSE -> MET, SEP -> SER. - - structural check arguments: - -sc, --structural-checks - Perform structural checks and filter input data. - -p PARAMETER_FILE, --parameter-file PARAMETER_FILE - Location of the stereochemical parameter file - (stereo_chemical_props.txt). - If not provided, the following locations are searched in this - order: 1. Working directory, 2. OpenStructure standard library - location. - -bt BOND_TOLERANCE, --bond-tolerance BOND_TOLERANCE - Tolerance in STD for bonds. Defaults to 12. - -at ANGLE_TOLERANCE, --angle-tolerance ANGLE_TOLERANCE - Tolerance in STD for angles. Defaults to 12. - - chain mapping arguments: -c CHAIN_MAPPING [CHAIN_MAPPING ...], --chain-mapping CHAIN_MAPPING [CHAIN_MAPPING ...] - Mapping of chains between the reference and the model. - Each separate mapping consist of key:value pairs where key - is the chain name in reference and value is the chain name in - model. - --qs-max-mappings-extensive QS_MAX_MAPPINGS_EXTENSIVE - Maximal number of chain mappings to test for 'extensive' - chain mapping scheme which is used as a last resort if - other schemes failed. The extensive chain mapping search - must in the worst case check O(N!) possible mappings for - complexes with N chains. Two octamers without symmetry - would require 322560 mappings to be checked. To limit - computations, no scores are computed if we try more than - the maximal number of chain mappings. Defaults to 1000000. - - sequence alignment arguments: - -cc, --consistency-checks - Take consistency checks into account. By default residue name - consistency between a model-reference pair would be checked - but only a warning message will be displayed and the script - will continue to calculate scores. If this flag is ON, checks - will not be ignored and if the pair does not pass the test - all the scores for that pair will be marked as a FAILURE. - -rna, --residue-number-alignment - Make alignment based on residue number instead of using - a global BLOSUM62-based alignment. - - QS score arguments: - -qs, --qs-score Calculate QS-score. - --qs-rmsd Calculate CA RMSD between shared CA atoms of mapped chains. - This uses a superposition using all mapped chains which - minimizes the CA RMSD. - - lDDT score arguments: - -l, --lddt Calculate lDDT. - -ir INCLUSION_RADIUS, --inclusion-radius INCLUSION_RADIUS - Distance inclusion radius for lDDT. Defaults to 15 A. - -ss SEQUENCE_SEPARATION, --sequence-separation SEQUENCE_SEPARATION - Sequence separation. Only distances between residues whose - separation is higher than the provided parameter are - considered when computing the score. Defaults to 0. - -spr, --save-per-residue-scores - - -By default the verbosity is set to 3 which will result in the informations -being shown in the console. The result can be (optionally) saved as JSON file -which is the preferred way of parsing it as the log output might change in the -future. Optionally, the local scores for lDDT can also be dumped to the output -file. Additionally, cleaned up structures can be saved to the disk. -The output file has following format: - -.. code-block:: none - - { - "options": { ... }, # Options used to run the script - "result": { - "<MODEL NAME>": { # Model name extracted from the file name - "<REFERENCE NAME>": { # Reference name extracted from the file name - "info": { - "mapping": { - "alignments": <list of chain-chain alignments in FASTA format>, - "chain_mapping": <Mapping of chains eg. {"A": "B", "B": "A"}>, - "chain_mapping_scheme": <Scheme used to get mapping, check mapping manually - if "permissive" or "extensive"> - }, - "residue_names_consistent": <Are the residue numbers consistent? true or false> - }, - "lddt": { - # calculated when --lddt (-l) option is selected - "oligo_lddt": { - "error": <ERROR message if any>, - "global_score": <calculated oligomeric lDDT score>, - "status": <SUCCESS or FAILURE> - }, - "single_chain_lddt": [ - # a list of chain-chain lDDTs - { - "conserved_contacts": <number of conserved contacts between model & reference>, - "error": <ERROR message if any>, - "global_score": <calculated single-chain lDDT score>, - "model_chain": <name of the chain in model>, - "reference_chain": <name of the chain in reference>, - "status": <SUCCESS or FAILURE>, - "total_contacts": <total number of contacts in reference>, - "per_residue_scores": [ - # per-residue lDDT scores - # only calculated when --save-per-residue-scores (-spr) option is selected - { - "residue_name": <three letter code of the residue in reference chain>, - "residue_number": <residue number in reference chain>, - "lddt": <residue lDDT score>, - "conserved_contacts": <conserved_contacts for given residue>, - "total_contacts": <total_contacts for given residue> - }, - . - . - . - ] - } - ], - "weighted_lddt": { - "error": <ERROR message if any>, - "global_score": <calculated weighted lDDT score>, - "status": <SUCCESS or FAILURE> - } - }, - "qs_score": { - # calculated when --qs-score (-q) option is selected - "best_score": <Best QS-score>, - "error": <ERROR message if any>, - "global_score": <Global QS-score>, - "status": <SUCCESS or FAILURE> - } - } - } - } - } - -The "result" filed is a dictionary mapping from model to reference as eg. in -mmCIF file there can be many entities and the script will compare all -combinations. - -Example usage: - -.. code-block:: console - - $ CAMEO_TARGET_URL=https://www.cameo3d.org/static/data/modeling/2019.07.13/6PO4_F - $ curl $CAMEO_TARGET_URL/bu_target_01.pdb > reference.pdb - $ curl $CAMEO_TARGET_URL/servers/server20/oligomodel-1/oligomodel-1.pdb > model.pdb - $ $OST_ROOT/bin/ost compare-structures \ - --model model.pdb --reference reference.pdb --output output.json \ - --qs-score --residue-number-alignment --lddt --structural-checks \ - --consistency-checks --inclusion-radius 15.0 --bond-tolerance 15.0 \ - --angle-tolerance 15.0 --molck --remove oxt hyd unk nonstd \ - --clean-element-column --map-nonstandard-residues - - ################################################################################ - Reading input files (fault_tolerant=False) - --> reading model from model.pdb - imported 2 chains, 462 residues, 3400 atoms; with 0 helices and 0 strands - --> reading reference from reference.pdb - imported 3 chains, 471 residues, 3465 atoms; with 0 helices and 0 strands - ################################################################################ - Cleaning up input with Molck - removing hydrogen atoms - --> removed 0 hydrogen atoms - removing OXT atoms - --> removed 3 OXT atoms - _.HCS1 is not a standard amino acid --> removed - _.ADE2 is not a standard amino acid --> removed - _.BO33 is not a standard amino acid --> removed - _.ADE4 is not a standard amino acid --> removed - _.HCS5 is not a standard amino acid --> removed - _.BO36 is not a standard amino acid --> removed - removing hydrogen atoms - --> removed 0 hydrogen atoms - removing OXT atoms - --> removed 0 OXT atoms - ################################################################################ - Performing structural checks - --> for reference(s) - Checking reference.pdb - Checking stereo-chemistry - Average Z-Score for bond lengths: 0.33163 - Bonds outside of tolerance range: 0 out of 2993 - Bond Avg Length Avg zscore Num Bonds - C-C 1.51236 0.03971 1682 - C-N 1.46198 0.96819 603 - C-O 1.25794 0.49967 674 - C-S 1.80242 0.15292 34 - Average Z-Score angle widths: -0.12077 - Angles outside of tolerance range: 0 out of 3260 - Filtering non-bonded clashes - 0 non-bonded short-range distances shorter than tolerance distance - Distances shorter than tolerance are on average shorter by: 0.00000 - --> for model(s) - Checking model.pdb - Checking stereo-chemistry - Average Z-Score for bond lengths: 0.23693 - Bonds outside of tolerance range: 0 out of 2976 - Bond Avg Length Avg zscore Num Bonds - C-C 1.52020 0.40359 1674 - C-N 1.43936 -0.19949 598 - C-O 1.25221 0.20230 670 - C-S 1.81182 0.38936 34 - Average Z-Score angle widths: 0.04946 - Angles outside of tolerance range: 0 out of 3241 - Filtering non-bonded clashes - 0 non-bonded short-range distances shorter than tolerance distance - Distances shorter than tolerance are on average shorter by: 0.00000 - ################################################################################ - Comparing model.pdb to reference.pdb - Chains in reference.pdb: AB - Chains in model.pdb: AB - Chemically equivalent chain-groups in reference.pdb: [['A', 'B']] - Chemically equivalent chain-groups in model.pdb: [['A', 'B']] - Chemical chain-groups mapping: {('A', 'B'): ('A', 'B')} - Identifying Symmetry Groups... - Symmetry threshold 0.1 used for angles of reference.pdb - Symmetry threshold 0.1 used for axis of reference.pdb - Symmetry threshold 0.1 used for angles of model.pdb - Symmetry threshold 0.1 used for axis of model.pdb - Selecting Symmetry Groups... - Symmetry-groups used in reference.pdb: [('A',), ('B',)] - Symmetry-groups used in model.pdb: [('A',), ('B',)] - Closed Symmetry with strict parameters - Mapping found: {'A': 'A', 'B': 'B'} - -------------------------------------------------------------------------------- - Checking consistency between model.pdb and reference.pdb - Consistency check: OK - -------------------------------------------------------------------------------- - Computing QS-score - QSscore reference.pdb, model.pdb: best: 0.96, global: 0.96 - -------------------------------------------------------------------------------- - Computing lDDT scores - lDDT settings: - Inclusion Radius: 15 - Sequence separation: 0 - Cutoffs: 0.5, 1, 2, 4 - Residue properties label: lddt - === - --> Computing lDDT between model chain A and reference chain A - Coverage: 0.991416 (231 out of 233 residues) - Global LDDT score: 0.8955 - (1194245 conserved distances out of 1333644 checked, over 4 thresholds) - --> Computing lDDT between model chain B and reference chain B - Coverage: 0.991379 (230 out of 232 residues) - Global LDDT score: 0.8998 - (1200391 conserved distances out of 1334056 checked, over 4 thresholds) - --> Computing oligomeric lDDT score - Reference reference.pdb has: 2 chains - Model model.pdb has: 2 chains - Coverage: 0.991398 (461 out of 465 residues) - Oligo lDDT score: 0.8977 - --> Computing weighted lDDT score - Weighted lDDT score: 0.8976 - ################################################################################ - Saving output into output.json - -This reads the model and reference file and calculates QS- and lDDT-scores -between them. In the example above the output file looks as follows (FASTA -alignments were cut in display here for readability): - -.. code snippet to fix output.json generated above - import json - json_data = json.load(open("output.json")) - mapping = json_data["result"]["model.pdb"]["reference.pdb"]["info"]["mapping"] - new_alns = list() - for aln in mapping["alignments"]: - aln_lines = aln.splitlines() - aln_lines[1] = aln_lines[1][:15] + "..." - aln_lines[3] = aln_lines[3][:15] + "..." - new_alns.append("\n".join(aln_lines)) - mapping["alignments"] = new_alns - json_data["options"]["parameter_file"] = "Path to stage/share/openstructure/stereo_chemical_props.txt" - json_data["options"]["compound_library"] = "Path to stage/share/openstructure/compounds.chemlib" - json_data["options"]["cwd"] = "Path to current working directory" - with open("output_fixed.json", "w") as outfile: - json.dump(json_data, outfile, indent=2, sort_keys=True) - -.. code-block:: json - - { - "options": { - "angle_tolerance": 15.0, - "bond_tolerance": 15.0, - "c_alpha_only": false, - "chain_mapping": null, - "clean_element_column": true, - "compound_library": "Path to stage/share/openstructure/compounds.chemlib", - "consistency_checks": true, - "cwd": "Path to current working directory", - "dump_structures": false, - "dump_suffix": ".compare.structures.pdb", - "fault_tolerant": false, - "inclusion_radius": 15.0, - "lddt": true, - "map_nonstandard_residues": true, - "model": "model.pdb", - "model_selection": "", - "molck": true, - "output": "output.json", - "parameter_file": "Path to stage/share/openstructure/stereo_chemical_props.txt", - "qs_max_mappings_extensive": 1000000, - "qs_rmsd": false, - "qs_score": true, - "reference": "reference.pdb", - "reference_selection": "", - "remove": [ - "oxt", - "hyd", - "unk", - "nonstd" - ], - "residue_number_alignment": true, - "save_per_residue_scores": false, - "sequence_separation": 0, - "structural_checks": true, - "verbosity": 3 - }, - "result": { - "model.pdb": { - "reference.pdb": { - "info": { - "mapping": { - "alignments": [ - ">reference:A\n-NAMKIGIVGAMAQE...\n>model:A\n---MKIGIVGAMAQE...", - ">reference:B\n-NAMKIGIVGAMAQE...\n>model:B\n---MKIGIVGAMAQE..." - ], - "chain_mapping": { - "A": "A", - "B": "B" - }, - "chain_mapping_scheme": "strict" - }, - "residue_names_consistent": true - }, - "lddt": { - "oligo_lddt": { - "error": "", - "global_score": 0.8977285786061329, - "status": "SUCCESS" - }, - "single_chain_lddt": [ - { - "conserved_contacts": 1194245, - "error": "", - "global_score": 0.8954750895500183, - "model_chain": "A", - "reference_chain": "A", - "status": "SUCCESS", - "total_contacts": 1333644 - }, - { - "conserved_contacts": 1200391, - "error": "", - "global_score": 0.8998055458068848, - "model_chain": "B", - "reference_chain": "B", - "status": "SUCCESS", - "total_contacts": 1334056 - } - ], - "weighted_lddt": { - "error": "", - "global_score": 0.8976406520766181, - "status": "SUCCESS" - } - }, - "qs_score": { - "best_score": 0.9619749105661133, - "error": "", - "global_score": 0.9619749105661133, - "status": "SUCCESS" - } - } - } - } - } - -If all the structures are clean and have matching residue numbers, one can omit -all the checking steps and calculate scores directly as here: - -.. code:: console - - $ $OST_ROOT/bin/ost compare-structures \ - --model model.pdb --reference reference.pdb --output output_qs.json \ - --qs-score --residue-number-alignment - - ################################################################################ - Reading input files (fault_tolerant=False) - --> reading model from model.pdb - imported 2 chains, 462 residues, 3400 atoms; with 0 helices and 0 strands - --> reading reference from reference.pdb - imported 3 chains, 471 residues, 3465 atoms; with 0 helices and 0 strands - ################################################################################ - Comparing model.pdb to reference.pdb - Chains removed from reference.pdb: _ - Chains in reference.pdb: AB - Chains in model.pdb: AB - Chemically equivalent chain-groups in reference.pdb: [['A', 'B']] - Chemically equivalent chain-groups in model.pdb: [['A', 'B']] - Chemical chain-groups mapping: {('A', 'B'): ('A', 'B')} - Identifying Symmetry Groups... - Symmetry threshold 0.1 used for angles of reference.pdb - Symmetry threshold 0.1 used for axis of reference.pdb - Symmetry threshold 0.1 used for angles of model.pdb - Symmetry threshold 0.1 used for axis of model.pdb - Selecting Symmetry Groups... - Symmetry-groups used in reference.pdb: [('A',), ('B',)] - Symmetry-groups used in model.pdb: [('A',), ('B',)] - Closed Symmetry with strict parameters - Mapping found: {'A': 'A', 'B': 'B'} - -------------------------------------------------------------------------------- - Checking consistency between model.pdb and reference.pdb - Consistency check: OK - -------------------------------------------------------------------------------- - Computing QS-score - QSscore reference.pdb, model.pdb: best: 0.96, global: 0.96 - ################################################################################ - Saving output into output_qs.json + Custom mapping of chains between the reference and the + model. Each separate mapping consist of key:value + pairs where key is the chain name in reference and + value is the chain name in model. + --lddt Compute global lDDT score with default + parameterization and store as key "lddt". + Stereochemical irregularities affecting lDDT are + reported as keys "model_clashes", "model_bad_bonds", + "model_bad_angles" and the respective reference + counterparts. + --local-lddt Compute per-residue lDDT scores with default + parameterization and store as key "local_lddt". Score + for model residue with number 42 in chain X can be + extracted with: data["local_lddt"]["X"]["42"]. If + there is an insertion code, lets say A, the last key + becomes "42A" Stereochemical irregularities affecting + lDDT are reported as keys "model_clashes", + "model_bad_bonds", "model_bad_angles" and the + respective reference counterparts. + --cad-score Compute global CAD's atom-atom (AA) score and store as + key "cad_score". --residue-number-alignment must be + enabled to compute this score. Requires + voronota_cadscore executable in PATH. Alternatively + you can set cad-exec. + --local-cad-score Compute local CAD's atom-atom (AA) scores and store as + key "local_cad_score". Score for model residue with + number 42 in chain X can be extracted with: + data["local_cad_score"]["X"]["42"]. --residue-number- + alignments must be enabled to compute this score. + Requires voronota_cadscore executable in PATH. + Alternatively you can set cad-exec. + --cad-exec CAD_EXEC Path to voronota-cadscore executable (installed from + https://github.com/kliment-olechnovic/voronota). + Searches PATH if not set. + --qs-score Compute QS-score, stored as key "qs_global", and the + QS-best variant, stored as key "qs_best". + --rigid-scores Computes rigid superposition based scores. They're + based on a Kabsch superposition of all mapped CA + positions (C3' for nucleotides). Makes the following + keys available: "oligo_gdtts": GDT with distance + thresholds [1.0, 2.0, 4.0, 8.0] given these positions + and transformation, "oligo_gdtha": same with + thresholds [0.5, 1.0, 2.0, 4.0], "rmsd": RMSD given + these positions and transformation, "transform": the + used 4x4 transformation matrix that superposes model + onto reference. + --interface-scores Per interface scores for each interface that has at + least one contact in the reference, i.e. at least one + pair of heavy atoms within 5A. The respective + interfaces are available from key "interfaces" which + is a list of tuples in form (ref_ch1, ref_ch2, + mdl_ch1, mdl_ch2). Per-interface scores are available + as lists referring to these interfaces and have the + following keys: "nnat" (number of contacts in + reference), "nmdl" (number of contacts in model), + "fnat" (fraction of reference contacts which are also + there in model), "fnonnat" (fraction of model contacts + which are not there in target), "irmsd" (interface + RMSD), "lrmsd" (ligand RMSD), "dockq_scores" (per- + interface score computed from "fnat", "irmsd" and + "lrmsd"), "interface_qs_global" and + "interface_qs_best" (per-interface versions of the two + QS-score variants). The DockQ score is strictly + designed to score each interface individually. We also + provide two averaged versions to get one full model + score: "dockq_ave", "dockq_wave". The first is simply + the average of "dockq_scores", the latter is a + weighted average with weights derived from "nnat". + These two scores only consider interfaces that are + present in both, the model and the reference. + "dockq_ave_full" and "dockq_wave_full" add zeros in + the average computation for each interface that is + only present in the reference but not in the model. + --patch-scores Local interface quality score used in CASP15. Scores + each model residue that is considered in the interface + (CB pos within 8A of any CB pos from another chain (CA + for GLY)). The local neighborhood gets represented by + "interface patches" which are scored with QS-score and + DockQ. Scores where not the full patches are + represented by the reference are set to None. Model + interface residues are available as key + "model_interface_residues", reference interface + residues as key "reference_interface_residues". + Residues are represented as string in form + <num><inscode>. The respective scores are available as + keys "patch_qs" and "patch_dockq" diff --git a/modules/doc/deprecated_actions.rst b/modules/doc/deprecated_actions.rst new file mode 100644 index 000000000..44cb1aa12 --- /dev/null +++ b/modules/doc/deprecated_actions.rst @@ -0,0 +1,599 @@ +:orphan: + +.. ost-actions-deprecated: + + +.. _ost compare structures legacy: + +Comparing two structures - legacy implementation +-------------------------------------------------------------------------------- + +.. warning:: + + ``compare-structures`` underwent a complete rewrite in OpenStructure + release 2.4.0. Here we keep the original documentation. Call with + ``compare-structures-legacy`` instead of ``compare-structures`` as documented + below. + +You can compare two structures in terms of quaternary structure score and +lDDT scores between two complexes from the command line with the +``ost compare-structures`` action. + +In summary it performs the following steps: + +- Read structures (PDB or mmCIF format, can be gzipped) and split into + biological assemblies (all possible pairs are scored). +- Mandatory cleanup of hydrogen atoms, ligand and water chains, small + (< 20 residues) peptides or chains with no amino acids as described in + :attr:`QSscoreEntity.ent <ost.mol.alg.qsscoring.QSscoreEntity.ent>` and + :attr:`QSscoreEntity.removed_chains <ost.mol.alg.qsscoring.QSscoreEntity.removed_chains>`. +- Optional cleanup of structures with :func:`~ost.mol.alg.Molck`. +- Optional structural checks with :func:`~ost.mol.alg.CheckStructure`. +- Unless user-provided, find chain mapping between complexes (see + :attr:`here <ost.mol.alg.qsscoring.QSscorer.chain_mapping>` for details) +- Perform sequence alignment of chain pairs (unless user asks for alignment + based on residue numbers). Alignment can optionally checked for consistency + if 100% sequence identity is expected. +- Compute scores requested by user (CA-RMSD of oligomer, + :mod:`QS scores of oligomer <ost.mol.alg.qsscoring>`, + :class:`single chain lDDT scores <ost.mol.alg.qsscoring.OligoLDDTScorer.sc_lddt>`, + :attr:`lDDT score of oligomer <ost.mol.alg.qsscoring.OligoLDDTScorer.oligo_lddt>`). + Note that while the QS score is symmetric (same result when swapping reference + and model), the lDDT scores are not. Extra atoms in the model for mapped + chains have no effect on the score, while extra atoms in the reference reduce + the score. For oligo_lddt, we do + :attr:`penalize for extra chains <ost.mol.alg.qsscoring.OligoLDDTScorer.penalize_extra_chains>` + in both reference and model. + +.. note :: + + The action relies on OST's :mod:`~ost.mol.alg.qsscoring` module and has the + same requirements on your OST installation (needs compound library, ClustalW, + numpy and scipy). + +Details on the usage (output of ``ost compare-structures --help``): + +.. code-block:: console + + usage: ost compare-structures [-h] -m MODEL -r REFERENCE [-v VERBOSITY] + [-o OUTPUT] [-d] [-ds DUMP_SUFFIX] + [-rs REFERENCE_SELECTION] [-ms MODEL_SELECTION] + [-ca] [-ft] [-cl COMPOUND_LIBRARY] [-ml] + [-rm REMOVE [REMOVE ...]] [-ce] [-mn] [-sc] + [-p PARAMETER_FILE] [-bt BOND_TOLERANCE] + [-at ANGLE_TOLERANCE] + [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] + [--qs-max-mappings-extensive QS_MAX_MAPPINGS_EXTENSIVE] + [-cc] [-rna] [-qs] [--qs-rmsd] [-l] + [-ir INCLUSION_RADIUS] [-ss SEQUENCE_SEPARATION] + [-spr] + + Evaluate model structure against reference. + + eg. + + ost compare-structures \ + --model <MODEL> \ + --reference <REF> \ + --output output.json \ + --lddt \ + --structural-checks \ + --consistency-checks \ + --molck \ + --remove oxt hyd \ + --map-nonstandard-residues + + Here we describe how the parameters can be set to mimick a CAMEO evaluation + (as of August 2018). + + CAMEO calls the lddt binary as follows: + + lddt \ + -p <PARAMETER FILE> \ + -f \ + -a 15 \ + -b 15 \ + -r 15 \ + <MODEL> \ + <REF> + + Only model structures are "Molck-ed" in CAMEO. The call to molck is as follows: + + molck \ + --complib=<COMPOUND LIB> \ + --rm=hyd,oxt,unk,nonstd \ + --fix-ele \ + --map-nonstd \ + --out=<OUTPUT> \ + <FILEPATH> + + To be as much compatible with with CAMEO as possible one should call + compare-structures as follows: + + ost compare-structures \ + --model <MODEL> \ + --reference <REF> \ + --output output.json \ + --molck \ + --remove oxt hyd unk nonstd \ + --clean-element-column \ + --map-nonstandard-residues \ + --structural-checks \ + --bond-tolerance 15.0 \ + --angle-tolerance 15.0 \ + --residue-number-alignment \ + --consistency-checks \ + --qs-score \ + --lddt \ + --inclusion-radius 15.0 + + optional arguments: + -h, --help show this help message and exit + + required arguments: + -m MODEL, --model MODEL + Path to the model file. + -r REFERENCE, --reference REFERENCE + Path to the reference file. + + general arguments: + -v VERBOSITY, --verbosity VERBOSITY + Set verbosity level. Defaults to 3. + -o OUTPUT, --output OUTPUT + Output file name. The output will be saved as a JSON file. + -d, --dump-structures + Dump cleaned structures used to calculate all the scores as + PDB files using specified suffix. Files will be dumped to the + same location as original files. + -ds DUMP_SUFFIX, --dump-suffix DUMP_SUFFIX + Use this suffix to dump structures. + Defaults to .compare.structures.pdb. + -rs REFERENCE_SELECTION, --reference-selection REFERENCE_SELECTION + Selection performed on reference structures. + -ms MODEL_SELECTION, --model-selection MODEL_SELECTION + Selection performed on model structures. + -ca, --c-alpha-only Use C-alpha atoms only. Equivalent of calling the action with + '--model-selection="aname=CA" --reference-selection="aname=CA"' + options. + -ft, --fault-tolerant + Fault tolerant parsing. + -cl COMPOUND_LIBRARY, --compound-library COMPOUND_LIBRARY + Location of the compound library file (compounds.chemlib). + If not provided, the following locations are searched in this + order: 1. Working directory, 2. OpenStructure standard library + location. + + molecular check arguments: + -ml, --molck Run molecular checker to clean up input. + -rm REMOVE [REMOVE ...], --remove REMOVE [REMOVE ...] + Remove atoms and residues matching some criteria: + * zeroocc - Remove atoms with zero occupancy + * hyd - remove hydrogen atoms + * oxt - remove terminal oxygens + * nonstd - remove all residues not one of the 20 + standard amino acids + * unk - Remove unknown and atoms not following the + nomenclature + Defaults to hyd. + -ce, --clean-element-column + Clean up element column + -mn, --map-nonstandard-residues + Map modified residues back to the parent amino acid, for + example MSE -> MET, SEP -> SER. + + structural check arguments: + -sc, --structural-checks + Perform structural checks and filter input data. + -p PARAMETER_FILE, --parameter-file PARAMETER_FILE + Location of the stereochemical parameter file + (stereo_chemical_props.txt). + If not provided, the following locations are searched in this + order: 1. Working directory, 2. OpenStructure standard library + location. + -bt BOND_TOLERANCE, --bond-tolerance BOND_TOLERANCE + Tolerance in STD for bonds. Defaults to 12. + -at ANGLE_TOLERANCE, --angle-tolerance ANGLE_TOLERANCE + Tolerance in STD for angles. Defaults to 12. + + chain mapping arguments: + -c CHAIN_MAPPING [CHAIN_MAPPING ...], --chain-mapping CHAIN_MAPPING [CHAIN_MAPPING ...] + Mapping of chains between the reference and the model. + Each separate mapping consist of key:value pairs where key + is the chain name in reference and value is the chain name in + model. + --qs-max-mappings-extensive QS_MAX_MAPPINGS_EXTENSIVE + Maximal number of chain mappings to test for 'extensive' + chain mapping scheme which is used as a last resort if + other schemes failed. The extensive chain mapping search + must in the worst case check O(N!) possible mappings for + complexes with N chains. Two octamers without symmetry + would require 322560 mappings to be checked. To limit + computations, no scores are computed if we try more than + the maximal number of chain mappings. Defaults to 1000000. + + sequence alignment arguments: + -cc, --consistency-checks + Take consistency checks into account. By default residue name + consistency between a model-reference pair would be checked + but only a warning message will be displayed and the script + will continue to calculate scores. If this flag is ON, checks + will not be ignored and if the pair does not pass the test + all the scores for that pair will be marked as a FAILURE. + -rna, --residue-number-alignment + Make alignment based on residue number instead of using + a global BLOSUM62-based alignment. + + QS score arguments: + -qs, --qs-score Calculate QS-score. + --qs-rmsd Calculate CA RMSD between shared CA atoms of mapped chains. + This uses a superposition using all mapped chains which + minimizes the CA RMSD. + + lDDT score arguments: + -l, --lddt Calculate lDDT. + -ir INCLUSION_RADIUS, --inclusion-radius INCLUSION_RADIUS + Distance inclusion radius for lDDT. Defaults to 15 A. + -ss SEQUENCE_SEPARATION, --sequence-separation SEQUENCE_SEPARATION + Sequence separation. Only distances between residues whose + separation is higher than the provided parameter are + considered when computing the score. Defaults to 0. + -spr, --save-per-residue-scores + + +By default the verbosity is set to 3 which will result in the informations +being shown in the console. The result can be (optionally) saved as JSON file +which is the preferred way of parsing it as the log output might change in the +future. Optionally, the local scores for lDDT can also be dumped to the output +file. Additionally, cleaned up structures can be saved to the disk. +The output file has following format: + +.. code-block:: none + + { + "options": { ... }, # Options used to run the script + "result": { + "<MODEL NAME>": { # Model name extracted from the file name + "<REFERENCE NAME>": { # Reference name extracted from the file name + "info": { + "mapping": { + "alignments": <list of chain-chain alignments in FASTA format>, + "chain_mapping": <Mapping of chains eg. {"A": "B", "B": "A"}>, + "chain_mapping_scheme": <Scheme used to get mapping, check mapping manually + if "permissive" or "extensive"> + }, + "residue_names_consistent": <Are the residue numbers consistent? true or false> + }, + "lddt": { + # calculated when --lddt (-l) option is selected + "oligo_lddt": { + "error": <ERROR message if any>, + "global_score": <calculated oligomeric lDDT score>, + "status": <SUCCESS or FAILURE> + }, + "single_chain_lddt": [ + # a list of chain-chain lDDTs + { + "conserved_contacts": <number of conserved contacts between model & reference>, + "error": <ERROR message if any>, + "global_score": <calculated single-chain lDDT score>, + "model_chain": <name of the chain in model>, + "reference_chain": <name of the chain in reference>, + "status": <SUCCESS or FAILURE>, + "total_contacts": <total number of contacts in reference>, + "per_residue_scores": [ + # per-residue lDDT scores + # only calculated when --save-per-residue-scores (-spr) option is selected + { + "residue_name": <three letter code of the residue in reference chain>, + "residue_number": <residue number in reference chain>, + "lddt": <residue lDDT score>, + "conserved_contacts": <conserved_contacts for given residue>, + "total_contacts": <total_contacts for given residue> + }, + . + . + . + ] + } + ], + "weighted_lddt": { + "error": <ERROR message if any>, + "global_score": <calculated weighted lDDT score>, + "status": <SUCCESS or FAILURE> + } + }, + "qs_score": { + # calculated when --qs-score (-q) option is selected + "best_score": <Best QS-score>, + "error": <ERROR message if any>, + "global_score": <Global QS-score>, + "status": <SUCCESS or FAILURE> + } + } + } + } + } + +The "result" filed is a dictionary mapping from model to reference as eg. in +mmCIF file there can be many entities and the script will compare all +combinations. + +Example usage: + +.. code-block:: console + + $ CAMEO_TARGET_URL=https://www.cameo3d.org/static/data/modeling/2019.07.13/6PO4_F + $ curl $CAMEO_TARGET_URL/bu_target_01.pdb > reference.pdb + $ curl $CAMEO_TARGET_URL/servers/server20/oligomodel-1/oligomodel-1.pdb > model.pdb + $ $OST_ROOT/bin/ost compare-structures \ + --model model.pdb --reference reference.pdb --output output.json \ + --qs-score --residue-number-alignment --lddt --structural-checks \ + --consistency-checks --inclusion-radius 15.0 --bond-tolerance 15.0 \ + --angle-tolerance 15.0 --molck --remove oxt hyd unk nonstd \ + --clean-element-column --map-nonstandard-residues + + ################################################################################ + Reading input files (fault_tolerant=False) + --> reading model from model.pdb + imported 2 chains, 462 residues, 3400 atoms; with 0 helices and 0 strands + --> reading reference from reference.pdb + imported 3 chains, 471 residues, 3465 atoms; with 0 helices and 0 strands + ################################################################################ + Cleaning up input with Molck + removing hydrogen atoms + --> removed 0 hydrogen atoms + removing OXT atoms + --> removed 3 OXT atoms + _.HCS1 is not a standard amino acid --> removed + _.ADE2 is not a standard amino acid --> removed + _.BO33 is not a standard amino acid --> removed + _.ADE4 is not a standard amino acid --> removed + _.HCS5 is not a standard amino acid --> removed + _.BO36 is not a standard amino acid --> removed + removing hydrogen atoms + --> removed 0 hydrogen atoms + removing OXT atoms + --> removed 0 OXT atoms + ################################################################################ + Performing structural checks + --> for reference(s) + Checking reference.pdb + Checking stereo-chemistry + Average Z-Score for bond lengths: 0.33163 + Bonds outside of tolerance range: 0 out of 2993 + Bond Avg Length Avg zscore Num Bonds + C-C 1.51236 0.03971 1682 + C-N 1.46198 0.96819 603 + C-O 1.25794 0.49967 674 + C-S 1.80242 0.15292 34 + Average Z-Score angle widths: -0.12077 + Angles outside of tolerance range: 0 out of 3260 + Filtering non-bonded clashes + 0 non-bonded short-range distances shorter than tolerance distance + Distances shorter than tolerance are on average shorter by: 0.00000 + --> for model(s) + Checking model.pdb + Checking stereo-chemistry + Average Z-Score for bond lengths: 0.23693 + Bonds outside of tolerance range: 0 out of 2976 + Bond Avg Length Avg zscore Num Bonds + C-C 1.52020 0.40359 1674 + C-N 1.43936 -0.19949 598 + C-O 1.25221 0.20230 670 + C-S 1.81182 0.38936 34 + Average Z-Score angle widths: 0.04946 + Angles outside of tolerance range: 0 out of 3241 + Filtering non-bonded clashes + 0 non-bonded short-range distances shorter than tolerance distance + Distances shorter than tolerance are on average shorter by: 0.00000 + ################################################################################ + Comparing model.pdb to reference.pdb + Chains in reference.pdb: AB + Chains in model.pdb: AB + Chemically equivalent chain-groups in reference.pdb: [['A', 'B']] + Chemically equivalent chain-groups in model.pdb: [['A', 'B']] + Chemical chain-groups mapping: {('A', 'B'): ('A', 'B')} + Identifying Symmetry Groups... + Symmetry threshold 0.1 used for angles of reference.pdb + Symmetry threshold 0.1 used for axis of reference.pdb + Symmetry threshold 0.1 used for angles of model.pdb + Symmetry threshold 0.1 used for axis of model.pdb + Selecting Symmetry Groups... + Symmetry-groups used in reference.pdb: [('A',), ('B',)] + Symmetry-groups used in model.pdb: [('A',), ('B',)] + Closed Symmetry with strict parameters + Mapping found: {'A': 'A', 'B': 'B'} + -------------------------------------------------------------------------------- + Checking consistency between model.pdb and reference.pdb + Consistency check: OK + -------------------------------------------------------------------------------- + Computing QS-score + QSscore reference.pdb, model.pdb: best: 0.96, global: 0.96 + -------------------------------------------------------------------------------- + Computing lDDT scores + lDDT settings: + Inclusion Radius: 15 + Sequence separation: 0 + Cutoffs: 0.5, 1, 2, 4 + Residue properties label: lddt + === + --> Computing lDDT between model chain A and reference chain A + Coverage: 0.991416 (231 out of 233 residues) + Global LDDT score: 0.8955 + (1194245 conserved distances out of 1333644 checked, over 4 thresholds) + --> Computing lDDT between model chain B and reference chain B + Coverage: 0.991379 (230 out of 232 residues) + Global LDDT score: 0.8998 + (1200391 conserved distances out of 1334056 checked, over 4 thresholds) + --> Computing oligomeric lDDT score + Reference reference.pdb has: 2 chains + Model model.pdb has: 2 chains + Coverage: 0.991398 (461 out of 465 residues) + Oligo lDDT score: 0.8977 + --> Computing weighted lDDT score + Weighted lDDT score: 0.8976 + ################################################################################ + Saving output into output.json + +This reads the model and reference file and calculates QS- and lDDT-scores +between them. In the example above the output file looks as follows (FASTA +alignments were cut in display here for readability): + +.. code snippet to fix output.json generated above + import json + json_data = json.load(open("output.json")) + mapping = json_data["result"]["model.pdb"]["reference.pdb"]["info"]["mapping"] + new_alns = list() + for aln in mapping["alignments"]: + aln_lines = aln.splitlines() + aln_lines[1] = aln_lines[1][:15] + "..." + aln_lines[3] = aln_lines[3][:15] + "..." + new_alns.append("\n".join(aln_lines)) + mapping["alignments"] = new_alns + json_data["options"]["parameter_file"] = "Path to stage/share/openstructure/stereo_chemical_props.txt" + json_data["options"]["compound_library"] = "Path to stage/share/openstructure/compounds.chemlib" + json_data["options"]["cwd"] = "Path to current working directory" + with open("output_fixed.json", "w") as outfile: + json.dump(json_data, outfile, indent=2, sort_keys=True) + +.. code-block:: json + + { + "options": { + "angle_tolerance": 15.0, + "bond_tolerance": 15.0, + "c_alpha_only": false, + "chain_mapping": null, + "clean_element_column": true, + "compound_library": "Path to stage/share/openstructure/compounds.chemlib", + "consistency_checks": true, + "cwd": "Path to current working directory", + "dump_structures": false, + "dump_suffix": ".compare.structures.pdb", + "fault_tolerant": false, + "inclusion_radius": 15.0, + "lddt": true, + "map_nonstandard_residues": true, + "model": "model.pdb", + "model_selection": "", + "molck": true, + "output": "output.json", + "parameter_file": "Path to stage/share/openstructure/stereo_chemical_props.txt", + "qs_max_mappings_extensive": 1000000, + "qs_rmsd": false, + "qs_score": true, + "reference": "reference.pdb", + "reference_selection": "", + "remove": [ + "oxt", + "hyd", + "unk", + "nonstd" + ], + "residue_number_alignment": true, + "save_per_residue_scores": false, + "sequence_separation": 0, + "structural_checks": true, + "verbosity": 3 + }, + "result": { + "model.pdb": { + "reference.pdb": { + "info": { + "mapping": { + "alignments": [ + ">reference:A\n-NAMKIGIVGAMAQE...\n>model:A\n---MKIGIVGAMAQE...", + ">reference:B\n-NAMKIGIVGAMAQE...\n>model:B\n---MKIGIVGAMAQE..." + ], + "chain_mapping": { + "A": "A", + "B": "B" + }, + "chain_mapping_scheme": "strict" + }, + "residue_names_consistent": true + }, + "lddt": { + "oligo_lddt": { + "error": "", + "global_score": 0.8977285786061329, + "status": "SUCCESS" + }, + "single_chain_lddt": [ + { + "conserved_contacts": 1194245, + "error": "", + "global_score": 0.8954750895500183, + "model_chain": "A", + "reference_chain": "A", + "status": "SUCCESS", + "total_contacts": 1333644 + }, + { + "conserved_contacts": 1200391, + "error": "", + "global_score": 0.8998055458068848, + "model_chain": "B", + "reference_chain": "B", + "status": "SUCCESS", + "total_contacts": 1334056 + } + ], + "weighted_lddt": { + "error": "", + "global_score": 0.8976406520766181, + "status": "SUCCESS" + } + }, + "qs_score": { + "best_score": 0.9619749105661133, + "error": "", + "global_score": 0.9619749105661133, + "status": "SUCCESS" + } + } + } + } + } + +If all the structures are clean and have matching residue numbers, one can omit +all the checking steps and calculate scores directly as here: + +.. code:: console + + $ $OST_ROOT/bin/ost compare-structures \ + --model model.pdb --reference reference.pdb --output output_qs.json \ + --qs-score --residue-number-alignment + + ################################################################################ + Reading input files (fault_tolerant=False) + --> reading model from model.pdb + imported 2 chains, 462 residues, 3400 atoms; with 0 helices and 0 strands + --> reading reference from reference.pdb + imported 3 chains, 471 residues, 3465 atoms; with 0 helices and 0 strands + ################################################################################ + Comparing model.pdb to reference.pdb + Chains removed from reference.pdb: _ + Chains in reference.pdb: AB + Chains in model.pdb: AB + Chemically equivalent chain-groups in reference.pdb: [['A', 'B']] + Chemically equivalent chain-groups in model.pdb: [['A', 'B']] + Chemical chain-groups mapping: {('A', 'B'): ('A', 'B')} + Identifying Symmetry Groups... + Symmetry threshold 0.1 used for angles of reference.pdb + Symmetry threshold 0.1 used for axis of reference.pdb + Symmetry threshold 0.1 used for angles of model.pdb + Symmetry threshold 0.1 used for axis of model.pdb + Selecting Symmetry Groups... + Symmetry-groups used in reference.pdb: [('A',), ('B',)] + Symmetry-groups used in model.pdb: [('A',), ('B',)] + Closed Symmetry with strict parameters + Mapping found: {'A': 'A', 'B': 'B'} + -------------------------------------------------------------------------------- + Checking consistency between model.pdb and reference.pdb + Consistency check: OK + -------------------------------------------------------------------------------- + Computing QS-score + QSscore reference.pdb, model.pdb: best: 0.96, global: 0.96 + ################################################################################ + Saving output into output_qs.json -- GitLab