""" Evaluate model against reference Example: ost compare-structures -m model.pdb -r reference.cif Loads the structures and performs basic cleanup: * Assign elements according to the PDB Chemical Component Dictionary * Map nonstandard residues to their parent residues as defined by the PDB Chemical Component Dictionary, e.g. phospho-serine => serine * Remove hydrogens * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB Chemical Component Dictionary * Select for peptide/nucleotide residues The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional options, this is a dictionary with 8 keys describing model/reference comparison: * "reference_chains": Chain names of reference * "model_chains": Chain names of model * "chem_groups": Groups of polypeptides/polynucleotides from reference that are considered chemically equivalent. You can derive stoichiometry from this. Contains only chains that are considered in chain mapping, i.e. pass a size threshold (defaults: 6 for peptides, 4 for nucleotides). * "chem_mapping": List of same length as "chem_groups". Assigns model chains to the respective chem group. Again, only contains chains that are considered in chain mapping. * "chain_mapping": A dictionary with reference chain names as keys and the mapped model chain names as values. Missing chains are either not mapped (but present in "chem_groups", "chem_mapping") or were not considered in chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>. Inconsistencies may lead to corrupt results but do not abort the program. Program abortion in these cases can be enforced with -ec/--enforce-consistency. * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be \"status\" set to FAILURE and an additional key: "traceback". The following additional keys store relevant input parameters to reproduce results: * "model" * "reference" * "fault_tolerant" * "model_biounit" * "reference_biounit" * "residue_number_alignment" * "enforce_consistency" * "cad_exec" * "usalign_exec" * "lddt_no_stereochecks" * "min_pep_length" * "min_nuc_length" * "lddt_add_mdl_contacts" * "lddt_inclusion_radius" * "dockq_capri_peptide" * "ost_version" The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, enabling -rna/--residue-number-alignment is recommended. Each score is opt-in and can be enabled with optional arguments. Example to compute global and per-residue lDDT values as well as QS-score: ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt \ --qs-score Example to inject custom chain mapping ost compare-structures -m model.pdb -r reference.cif -c A:B B:A """ import argparse import os import json import sys import traceback import math import ost from ost import io from ost.mol.alg import scoring def _ParseArgs(): parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.RawDescriptionHelpFormatter, prog = "ost compare-structures") parser.add_argument( "-m", "--model", dest="model", required=True, help=("Path to model file.")) parser.add_argument( "-r", "--reference", dest="reference", required=True, help=("Path to reference file.")) parser.add_argument( "-o", "--output", dest="output", required=False, default="out.json", help=("Output file name. The output will be saved as a JSON file. " "default: out.json")) parser.add_argument( "-mf", "--model-format", dest="model_format", required=False, default=None, choices=["pdb", "cif", "mmcif"], help=("Format of model file. pdb reads pdb but also pdb.gz, same " "applies to cif/mmcif. Inferred from filepath if not given.")) parser.add_argument( "-rf", "--reference-format", dest="reference_format", required=False, default=None, choices=["pdb", "cif", "mmcif"], help=("Format of reference file. pdb reads pdb but also pdb.gz, same " "applies to cif/mmcif. Inferred from filepath if not given.")) parser.add_argument( "-mb", "--model-biounit", dest="model_biounit", required=False, default=None, type=str, help=("Only has an effect if model is in mmcif format. By default, " "the asymmetric unit (AU) is used for scoring. If there are " "biounits defined in the mmcif file, you can specify the " "ID (as a string) of the one which should be used.")) parser.add_argument( "-rb", "--reference-biounit", dest="reference_biounit", required=False, default=None, type=str, help=("Only has an effect if reference is in mmcif format. By default, " "the asymmetric unit (AU) is used for scoring. If there are " "biounits defined in the mmcif file, you can specify the " "ID (as a string) of the one which should be used.")) parser.add_argument( "-rna", "--residue-number-alignment", dest="residue_number_alignment", default=False, action="store_true", help=("Make alignment based on residue number instead of using " "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) parser.add_argument( "-ec", "--enforce-consistency", dest="enforce_consistency", default=False, action="store_true", help=("Enforce consistency. By default residue name discrepancies " "between a model and reference are reported but the program " "proceeds. If this flag is ON, the program fails for these " "cases.")) parser.add_argument( "-d", "--dump-structures", dest="dump_structures", default=False, action="store_true", help=("Dump cleaned structures used to calculate all the scores as PDB" " or mmCIF files using specified suffix. Files will be dumped to" " the same location and in the same format as original files.")) parser.add_argument( "-ds", "--dump-suffix", dest="dump_suffix", default="_compare_structures", help=("Use this suffix to dump structures.\n" "Defaults to _compare_structures")) parser.add_argument( "-ft", "--fault-tolerant", dest="fault_tolerant", default=False, action="store_true", help=("Fault tolerant parsing.")) parser.add_argument( "-c", "--chain-mapping", nargs="+", dest="chain_mapping", help=("Custom mapping of chains between the reference and the model. " "Each separate mapping consist of key:value pairs where key " "is the chain name in reference and value is the chain name in " "model.")) parser.add_argument( "--lddt", dest="lddt", default=False, action="store_true", help=("Compute global lDDT score with default parameterization and " "store as key \"lddt\". Stereochemical irregularities affecting " "lDDT are reported as keys \"model_clashes\", " "\"model_bad_bonds\", \"model_bad_angles\" and the respective " "reference counterparts.")) parser.add_argument( "--local-lddt", dest="local_lddt", default=False, action="store_true", help=("Compute per-residue lDDT scores with default parameterization " "and store as key \"local_lddt\". Score for each residue is " "accessible by key <chain_name>.<resnum>.<resnum_inscode>. " "Residue with number 42 in chain X can be extracted with: " "data[\"local_lddt\"][\"X.42.\"]. If there is an insertion " "code, lets say A, the residue key becomes \"X.42.A\". " "Stereochemical irregularities affecting lDDT are reported as " "keys \"model_clashes\", \"model_bad_bonds\", " "\"model_bad_angles\" and the respective reference " "counterparts. Atoms specified in there follow the following " "format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>")) parser.add_argument( "--aa-local-lddt", dest="aa_local_lddt", default=False, action="store_true", help=("Compute per-atom lDDT scores with default parameterization " "and store as key \"aa_local_lddt\". Score for each atom is " "accessible by key " "<chain_name>.<resnum>.<resnum_inscode>.<aname>. " "Alpha carbon from residue with number 42 in chain X can be " "extracted with: data[\"aa_local_lddt\"][\"X.42..CA\"]. " "If there is a residue insertion code, lets say A, the atom key " "becomes \"X.42.A.CA\". " "Stereochemical irregularities affecting lDDT are reported as " "keys \"model_clashes\", \"model_bad_bonds\", " "\"model_bad_angles\" and the respective reference " "counterparts. Atoms specified in there follow the following " "format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>")) parser.add_argument( "--bb-lddt", dest="bb_lddt", default=False, action="store_true", help=("Compute global lDDT score with default parameterization and " "store as key \"bb_lddt\". lDDT in this case is only computed on " "backbone atoms: CA for peptides and C3' for nucleotides")) parser.add_argument( "--bb-local-lddt", dest="bb_local_lddt", default=False, action="store_true", help=("Compute per-residue lDDT scores with default parameterization " "and store as key \"bb_local_lddt\". lDDT in this case is only " "computed on backbone atoms: CA for peptides and C3' for " "nucleotides. Per-residue scores are accessible as described for " "local_lddt.")) parser.add_argument( "--ilddt", dest="ilddt", default=False, action="store_true", help=("Compute global lDDT score which is solely based on inter-chain " "contacts and store as key \"ilddt\". Same stereochemical " "irregularities as for lddt apply.")) parser.add_argument( "--cad-score", dest="cad_score", default=False, action="store_true", help=("Compute global CAD's atom-atom (AA) score and store as key " "\"cad_score\". --residue-number-alignment must be enabled " "to compute this score. Requires voronota_cadscore executable " "in PATH. Alternatively you can set cad-exec.")) parser.add_argument( "--local-cad-score", dest="local_cad_score", default=False, action="store_true", help=("Compute local CAD's atom-atom (AA) scores and store as key " "\"local_cad_score\". Per-residue scores are accessible as " "described for local_lddt. --residue-number-alignments must be " "enabled to compute this score. Requires voronota_cadscore " "executable in PATH. Alternatively you can set cad-exec.")) parser.add_argument( "--cad-exec", dest="cad_exec", default=None, help=("Path to voronota-cadscore executable (installed from " "https://github.com/kliment-olechnovic/voronota). Searches PATH " "if not set.")) parser.add_argument( "--usalign-exec", dest="usalign_exec", default=None, help=("Path to USalign executable to compute TM-score. If not given, " "an OpenStructure internal copy of USalign code is used.")) parser.add_argument( "--override-usalign-mapping", dest="oum", default=False, action="store_true", help=("Override USalign mapping and inject our own rigid mapping. Only " "works if external usalign executable is provided that is " "reasonably new and contains that feature.")) parser.add_argument( "--qs-score", dest="qs_score", default=False, action="store_true", help=("Compute QS-score, stored as key \"qs_global\", and the QS-best " "variant, stored as key \"qs_best\". Interfaces in the reference " "with non-zero contribution to QS-score are available as key " "\"qs_reference_interfaces\", the ones from the model as key " "\"qs_model_interfaces\". \"qs_interfaces\" is a subset of " "\"qs_reference_interfaces\" that contains interfaces that " "can be mapped to the model. They are stored as lists in format " "[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective " "per-interface scores for \"qs_interfaces\" are available as " "keys \"per_interface_qs_global\" and \"per_interface_qs_best\"")) parser.add_argument( "--dockq", dest="dockq", default=False, action="store_true", help=("Compute DockQ scores and its components. Relevant interfaces " "with at least one contact (any atom within 5A) of the reference " "structure are available as key \"dockq_reference_interfaces\". " "Protein-protein, protein-nucleotide and nucleotide-nucleotide " "interfaces are considered. " "Key \"dockq_interfaces\" is a subset of " "\"dockq_reference_interfaces\" that contains interfaces that " "can be mapped to the model. They are stored as lists in format " "[ref_ch1, ref_ch2, mdl_ch1, mdl_ch2]. The respective " "DockQ scores for \"dockq_interfaces\" are available as key " "\"dockq\". It's components are available as keys: " "\"fnat\" (fraction of reference contacts which are also there " "in model) \"irmsd\" (interface RMSD), \"lrmsd\" (ligand RMSD). " "The DockQ score is strictly designed to score each interface " "individually. We also provide two averaged versions to get one " "full model score: \"dockq_ave\", \"dockq_wave\". The first is " "simply the average of \"dockq_scores\", the latter is a " "weighted average with weights derived from number of contacts " "in the reference interfaces. These two scores only consider " "interfaces that are present in both, the model and the " "reference. \"dockq_ave_full\" and \"dockq_wave_full\" add zeros " "in the average computation for each interface that is only " "present in the reference but not in the model.")) parser.add_argument( "--dockq-capri-peptide", dest="dockq_capri_peptide", default=False, action="store_true", help=("Flag that changes two things in the way DockQ and its " "underlying scores are computed which is proposed by the CAPRI " "community when scoring peptides (PMID: 31886916). " "ONE: Two residues are considered in contact if any of their " "atoms is within 5A. This is relevant for fnat and fnonat " "scores. CAPRI suggests to lower this threshold to 4A for " "protein-peptide interactions. " "TWO: irmsd is computed on interface residues. A residue is " "defined as interface residue if any of its atoms is within 10A " "of another chain. CAPRI suggests to lower the default of 10A to " "8A in combination with only considering CB atoms for " "protein-peptide interactions. " "Note that the resulting DockQ is not evaluated for these " "slightly updated fnat and irmsd (lrmsd stays the same). " "Raises an error if reference contains nucleotide chains. " "This flag has no influence on patch_dockq scores.")) parser.add_argument( "--ics", dest="ics", default=False, action="store_true", help=("Computes interface contact similarity (ICS) related scores. " "A contact between two residues of different chains is defined " "as having at least one heavy atom within 5A. Contacts in " "reference structure are available as key " "\"reference_contacts\". Each contact specifies the interacting " "residues in format \"<cname>.<rnum>.<ins_code>\". Model " "contacts are available as key \"model_contacts\". The precision " "which is available as key \"ics_precision\" reports the " "fraction of model contacts that are also present in the " "reference. The recall which is available as key \"ics_recall\" " "reports the fraction of reference contacts that are correctly " "reproduced in the model. " "The ICS score (Interface Contact Similarity) available as key " "\"ics\" combines precision and recall using the F1-measure. " "All these measures are also available on a per-interface basis " "for each interface in the reference structure that are defined " "as chain pairs with at least one contact (available as key " " \"contact_reference_interfaces\"). The respective metrics are " "available as keys \"per_interface_ics_precision\", " "\"per_interface_ics_recall\" and \"per_interface_ics\".")) parser.add_argument( "--ics-trimmed", dest="ics_trimmed", default=False, action="store_true", help=("Computes interface contact similarity (ICS) related scores but " "on a trimmed model. That means that a mapping between model and " "reference is performed and all model residues without reference " "counterpart are removed. As a consequence, model contacts for " "which we have no experimental evidence do not affect the score. " "The effect of these added model contacts without mapping to " "target would be decreased precision and thus lower ics. Recall is " "not affected. Enabling this flag adds the following keys: " "\"ics_trimmed\", \"ics_precision_trimmed\", " "\"ics_recall_trimmed\", \"model_contacts_trimmed\". " "The reference contacts and reference interfaces are the same " "as for ics and available as keys: \"reference_contacts\", " "\"contact_reference_interfaces\". " "All these measures are also available on a per-interface basis " "for each interface in the reference structure that are defined " "as chain pairs with at least one contact (available as key " " \"contact_reference_interfaces\"). The respective metrics are " "available as keys \"per_interface_ics_precision_trimmed\", " "\"per_interface_ics_recall_trimmed\" and " "\"per_interface_ics_trimmed\".")) parser.add_argument( "--ips", dest="ips", default=False, action="store_true", help=("Computes interface patch similarity (IPS) related scores. " "They focus on interface residues. They are defined as having " "at least one contact to a residue from any other chain. " "In short: if they show up in the contact lists used to compute " "ICS. If ips is enabled, these contacts get reported too and are " "available as keys \"reference_contacts\" and \"model_contacts\"." "The precision which is available as key \"ips_precision\" " "reports the fraction of model interface residues, that are also " "interface residues in the reference. " "The recall which is available as key \"ips_recall\" " "reports the fraction of reference interface residues that are " "also interface residues in the model. " "The IPS score (Interface Patch Similarity) available as key " "\"ips\" is the Jaccard coefficient between interface residues " "in reference and model. " "All these measures are also available on a per-interface basis " "for each interface in the reference structure that are defined " "as chain pairs with at least one contact (available as key " " \"contact_reference_interfaces\"). The respective metrics are " "available as keys \"per_interface_ips_precision\", " "\"per_interface_ips_recall\" and \"per_interface_ips\".")) parser.add_argument( "--ips-trimmed", dest="ips_trimmed", default=False, action="store_true", help=("The IPS equivalent of ICS on trimmed models.")) parser.add_argument( "--rigid-scores", dest="rigid_scores", default=False, action="store_true", help=("Computes rigid superposition based scores. They're based on a " "Kabsch superposition of all mapped CA positions (C3' for " "nucleotides). Makes the following keys available: " "\"oligo_gdtts\": GDT with distance thresholds [1.0, 2.0, 4.0, " "8.0] given these positions and transformation, \"oligo_gdtha\": " "same with thresholds [0.5, 1.0, 2.0, 4.0], \"rmsd\": RMSD given " "these positions and transformation, \"transform\": the used 4x4 " "transformation matrix that superposes model onto reference, " "\"rigid_chain_mapping\": equivalent of \"chain_mapping\" which " "is used for rigid scores (optimized for RMSD instead of " "QS-score/lDDT).")) parser.add_argument( "--patch-scores", dest="patch_scores", default=False, action="store_true", help=("Local interface quality score used in CASP15. Scores each " "model residue that is considered in the interface (CB pos " "within 8A of any CB pos from another chain (CA for GLY)). The " "local neighborhood gets represented by \"interface patches\" " "which are scored with QS-score and DockQ. Scores where not " "the full patches are represented by the reference are set to " "None. Model interface residues are available as key " "\"model_interface_residues\", reference interface residues as " "key \"reference_interface_residues\". Residues are represented " "as string in form <chain_name>.<resnum>.<resnum_inscode>. " "The respective scores are available as keys \"patch_qs\" and " "\"patch_dockq\"")) parser.add_argument( "--tm-score", dest="tm_score", default=False, action="store_true", help=("Computes TM-score with the USalign tool. Also computes a " "chain mapping in case of complexes that is stored in the " "same format as the default mapping. TM-score and the mapping " "are available as keys \"tm_score\" and \"usalign_mapping\"")) parser.add_argument( "--lddt-no-stereochecks", dest="lddt_no_stereochecks", default=False, action="store_true", help=("Disable stereochecks for lDDT computation")) parser.add_argument( "--n-max-naive", dest="n_max_naive", required=False, default=40320, type=int, help=("Parameter for chain mapping. If the number of possible " "mappings is <= *n_max_naive*, the full " "mapping solution space is enumerated to find the " "the mapping with optimal QS-score. A heuristic is used " "otherwise. The default of 40320 corresponds to an octamer " "(8! = 40320). A structure with stoichiometry A6B2 would be " "6!*2! = 1440 etc.")) parser.add_argument( "--dump-aligned-residues", dest="dump_aligned_residues", default=False, action="store_true", help=("Dump additional info on aligned model and reference residues.")) parser.add_argument( "--dump-pepnuc-alns", dest="dump_pepnuc_alns", default=False, action="store_true", help=("Dump alignments of mapped chains but with sequences that did " "not undergo Molck preprocessing in the scorer. Sequences are " "extracted from model/target after undergoing selection for " "peptide and nucleotide residues.")) parser.add_argument( "--dump-pepnuc-aligned-residues", dest="dump_pepnuc_aligned_residues", default=False, action="store_true", help=("Dump additional info on model and reference residues that occur " "in pepnuc alignments.")) parser.add_argument( "--min-pep-length", dest="min_pep_length", default = 6, type=int, help=("Default: 6 - " "Relevant parameter if short peptides are involved in scoring. " "Minimum peptide length for a chain in the target structure to " "be considered in chain mapping. The chain mapping algorithm " "first performs an all vs. all pairwise sequence alignment to " "identify \"equal\" chains within the target structure. We go " "for simple sequence identity there. Short sequences can be " "problematic as they may produce high sequence identity " "alignments by pure chance.") ) parser.add_argument( "--min-nuc-length", dest="min_nuc_length", default = 4, type=int, help=("Default: 4 - " "Relevant parameter if short nucleotides are involved in scoring." "Minimum nucleotide length for a chain in the target structure to " "be considered in chain mapping. The chain mapping algorithm " "first performs an all vs. all pairwise sequence alignment to " "identify \"equal\" chains within the target structure. We go " "for simple sequence identity there. Short sequences can be " "problematic as they may produce high sequence identity " "alignments by pure chance.") ) parser.add_argument( '-v', '--verbosity', dest="verbosity", type=int, default=2, help="Set verbosity level. Defaults to 2 (Script).") parser.add_argument( "--lddt-add-mdl-contacts", dest="lddt_add_mdl_contacts", default=False, action="store_true", help=("Only using contacts in lDDT that " "are within a certain distance threshold in the " "reference does not penalize for added model " "contacts. If set to True, this flag will also " "consider reference contacts that are within the " "specified distance threshold in the model but " "not necessarily in the reference. No contact will " "be added if the respective atom pair is not " "resolved in the reference.")) parser.add_argument( "--lddt-inclusion-radius", dest="lddt_inclusion_radius", type = float, default=15.0, help=("Passed to lDDT scorer. Affects all lDDT scores but not " "chain mapping.")) return parser.parse_args() def _CheckCompoundLib(): clib = ost.conop.GetDefaultLib() if not clib: ost.LogError("A compound library is required for this action. " "Please refer to the OpenStructure website: " "https://openstructure.org/docs/conop/compoundlib/.") sys.tracebacklimit = 0 raise RuntimeError("No compound library found") def _RoundOrNone(num, decimals = 3): """ Helper to create valid JSON output """ if num is None or math.isnan(num) or math.isinf(num): return None return round(num, decimals) def _AddSuffix(filename, dump_suffix): """Add dump_suffix to the file name. """ root, ext = os.path.splitext(filename) if ext == ".gz": root, ext2 = os.path.splitext(root) ext = ext2 + ext return root + dump_suffix + ext def _GetStructureFormat(structure_path, sformat=None): """Get the structure format and return it as "pdb" or "mmcif". """ if sformat is None: # Determine file format from suffix. ext = structure_path.split(".") if ext[-1] == "gz": ext = ext[:-1] if len(ext) <= 1: raise Exception(f"Could not determine format of file " f"{structure_path}.") sformat = ext[-1].lower() if sformat in ["mmcif", "cif"]: return "mmcif" elif sformat == "pdb": return sformat else: raise Exception(f"Unknown/unsupported file format found for " f"file {structure_path}.") def _LoadStructure(structure_path, sformat, fault_tolerant, bu_id): """Read OST entity either from mmCIF or PDB. The returned structure has structure_path attached as structure name """ # increase loglevel, as we would pollute the info log with weird stuff ost.PushVerbosityLevel(ost.LogLevel.Error) # Load the structure if sformat == "mmcif": if bu_id is not None: cif_entity, cif_seqres, cif_info = \ io.LoadMMCIF(structure_path, info=True, seqres=True, fault_tolerant=fault_tolerant) for biounit in cif_info.biounits: if biounit.id == bu_id: entity = ost.mol.alg.CreateBU(cif_entity, biounit) break else: raise RuntimeError(f"No biounit found with ID '{bu_id}'.") else: entity = io.LoadMMCIF(structure_path, fault_tolerant = fault_tolerant) if len(entity.residues) == 0: raise Exception(f"No residues found in file: {structure_path}") else: entity = io.LoadPDB(structure_path, fault_tolerant = fault_tolerant) if len(entity.residues) == 0: raise Exception(f"No residues found in file: {structure_path}") # restore old loglevel and return ost.PopVerbosityLevel() entity.SetName(structure_path) return entity def _DumpStructure(entity, structure_path, sformat): if sformat == "mmcif": io.SaveMMCIF(entity, structure_path) else: io.SavePDB(entity, structure_path) def _AlnToFastaStr(aln): """ Returns alignment as fasta formatted string """ s1 = aln.GetSequence(0) s2 = aln.GetSequence(1) return f">reference:{s1.name}\n{str(s1)}\n>model:{s2.name}\n{str(s2)}" def _GetInconsistentResidues(alns): lst = list() for aln in alns: for col in aln: r1 = col.GetResidue(0) r2 = col.GetResidue(1) if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): ch_1 = r1.GetChain().name num_1 = r1.number.num ins_code_1 = r1.number.ins_code.strip("\u0000") id_1 = f"{ch_1}.{num_1}.{ins_code_1}" ch_2 = r2.GetChain().name num_2 = r2.number.num ins_code_2 = r2.number.ins_code.strip("\u0000") id_2 = f"{ch_2}.{num_2}.{ins_code_2}" lst.append(f"{id_1}-{id_2}") return lst def _LocalScoresToJSONDict(score_dict): """ Convert ResNums to str for JSON serialization """ json_dict = dict() for ch, ch_scores in score_dict.items(): for num, s in ch_scores.items(): ins_code = num.ins_code.strip("\u0000") json_dict[f"{ch}.{num.num}.{ins_code}"] = _RoundOrNone(s) return json_dict def _LocalAAScoresToJSONDict(score_dict): """ Convert ResNums and atom names to str for JSON serialization """ json_dict = dict() for ch, ch_scores in score_dict.items(): for num, res_scores in ch_scores.items(): ins_code = num.ins_code.strip("\u0000") for a, s in res_scores.items(): json_dict[f"{ch}.{num.num}.{ins_code}.{a}"] = _RoundOrNone(s) return json_dict def _InterfaceResiduesToJSONList(interface_dict): """ Convert ResNums to str for JSON serialization. Changes in this function will affect _PatchScoresToJSONList """ json_list = list() for ch, ch_nums in interface_dict.items(): for num in ch_nums: ins_code = num.ins_code.strip("\u0000") json_list.append(f"{ch}.{num.num}.{ins_code}") return json_list def _PatchScoresToJSONList(interface_dict, score_dict): """ Creates List of patch scores that are consistent with interface residue lists """ json_list = list() for ch, ch_nums in interface_dict.items(): for item in score_dict[ch]: json_list.append(_RoundOrNone(item)) return json_list def _GetAlignedResidues(aln): aligned_residues = list() for a in aln: mdl_lst = list() ref_lst = list() for c in a: mdl_r = c.GetResidue(1) ref_r = c.GetResidue(0) if mdl_r.IsValid(): olc = mdl_r.one_letter_code num = mdl_r.GetNumber().num ins_code = mdl_r.GetNumber().ins_code.strip("\u0000") mdl_lst.append({"olc": olc, "num": f"{num}.{ins_code}"}) else: mdl_lst.append(None) if ref_r.IsValid(): olc = ref_r.one_letter_code num = ref_r.GetNumber().num ins_code = ref_r.GetNumber().ins_code.strip("\u0000") ref_lst.append({"olc": olc, "num": f"{num}.{ins_code}"}) else: ref_lst.append(None) mdl_dct = {"chain": a.GetSequence(1).GetName(), "residues": mdl_lst} ref_dct = {"chain": a.GetSequence(0).GetName(), "residues": ref_lst} aligned_residues.append({"model": mdl_dct, "reference": ref_dct}) return aligned_residues def _Process(model, reference, args, model_format, reference_format): mapping = None if args.chain_mapping is not None: mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} scorer = scoring.Scorer(model, reference, resnum_alignments = args.residue_number_alignment, cad_score_exec = args.cad_exec, custom_mapping = mapping, usalign_exec = args.usalign_exec, lddt_no_stereochecks = args.lddt_no_stereochecks, n_max_naive = args.n_max_naive, oum = args.oum, min_pep_length = args.min_pep_length, min_nuc_length = args.min_nuc_length, lddt_add_mdl_contacts = args.lddt_add_mdl_contacts, dockq_capri_peptide = args.dockq_capri_peptide, lddt_inclusion_radius = args.lddt_inclusion_radius) ir = _GetInconsistentResidues(scorer.aln) if len(ir) > 0 and args.enforce_consistency: raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") out = dict() out["reference_chains"] = [ch.GetName() for ch in scorer.target.chains] out["model_chains"] = [ch.GetName() for ch in scorer.model.chains] out["chem_groups"] = scorer.chain_mapper.chem_groups out["chem_mapping"] = scorer.mapping.chem_mapping out["chain_mapping"] = scorer.mapping.GetFlatMapping() out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] out["inconsistent_residues"] = ir if args.dump_aligned_residues: out["aligned_residues"] = _GetAlignedResidues(scorer.aln) if args.dump_pepnuc_alns: out["pepnuc_aln"] = [_AlnToFastaStr(aln) for aln in scorer.pepnuc_aln] if args.dump_pepnuc_aligned_residues: out["pepnuc_aligned_residues"] = _GetAlignedResidues(scorer.pepnuc_aln) if args.lddt: out["lddt"] = _RoundOrNone(scorer.lddt) if args.local_lddt: out["local_lddt"] = _LocalScoresToJSONDict(scorer.local_lddt) if args.aa_local_lddt: out["aa_local_lddt"] = _LocalAAScoresToJSONDict(scorer.aa_local_lddt) if args.lddt or args.local_lddt or args.aa_local_lddt: out["model_clashes"] = [x.ToJSON() for x in scorer.model_clashes] out["model_bad_bonds"] = [x.ToJSON() for x in scorer.model_bad_bonds] out["model_bad_angles"] = [x.ToJSON() for x in scorer.model_bad_angles] out["reference_clashes"] = [x.ToJSON() for x in scorer.target_clashes] out["reference_bad_bonds"] = [x.ToJSON() for x in scorer.target_bad_bonds] out["reference_bad_angles"] = [x.ToJSON() for x in scorer.target_bad_angles] if args.bb_lddt: out["bb_lddt"] = _RoundOrNone(scorer.bb_lddt) if args.bb_local_lddt: out["bb_local_lddt"] = _LocalScoresToJSONDict(scorer.bb_local_lddt) if args.ilddt: out["ilddt"] = _RoundOrNone(scorer.ilddt) if args.cad_score: out["cad_score"] = scorer.cad_score if args.local_cad_score: out["local_cad_score"] = _LocalScoresToJSONDict(scorer.local_cad_score) if args.qs_score: out["qs_global"] = _RoundOrNone(scorer.qs_global) out["qs_best"] = _RoundOrNone(scorer.qs_best) out["qs_reference_interfaces"] = scorer.qs_target_interfaces out["qs_model_interfaces"] = scorer.qs_model_interfaces out["qs_interfaces"] = scorer.qs_interfaces out["per_interface_qs_global"] = \ [_RoundOrNone(x) for x in scorer.per_interface_qs_global] out["per_interface_qs_best"] = \ [_RoundOrNone(x) for x in scorer.per_interface_qs_best] if args.ics or args.ips: out["model_contacts"] = scorer.model_contacts if args.ics_trimmed or args.ips_trimmed: out["model_contacts_trimmed"] = scorer.trimmed_model_contacts if args.ics or args.ips or args.ics_trimmed or args.ips_trimmed: out["reference_contacts"] = scorer.native_contacts out["contact_reference_interfaces"] = scorer.contact_target_interfaces if args.ics: out["ics_precision"] = _RoundOrNone(scorer.ics_precision) out["ics_recall"] = _RoundOrNone(scorer.ics_recall) out["ics"] = _RoundOrNone(scorer.ics) out["per_interface_ics_precision"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics_precision] out["per_interface_ics_recall"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics_recall] out["per_interface_ics"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics] if args.ips: out["ips_precision"] = _RoundOrNone(scorer.ips_precision) out["ips_recall"] = _RoundOrNone(scorer.ips_recall) out["ips"] = _RoundOrNone(scorer.ips) out["per_interface_ips_precision"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips_precision] out["per_interface_ips_recall"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips_recall] out["per_interface_ips"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips] if args.ics_trimmed: out["ics_trimmed"] = _RoundOrNone(scorer.ics_trimmed) out["ics_precision_trimmed"] = _RoundOrNone(scorer.ics_precision_trimmed) out["ics_recall_trimmed"] = _RoundOrNone(scorer.ics_recall_trimmed) out["per_interface_ics_precision_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics_precision_trimmed] out["per_interface_ics_recall_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics_recall_trimmed] out["per_interface_ics_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ics_trimmed] if args.ips_trimmed: out["ips_trimmed"] = _RoundOrNone(scorer.ips_trimmed) out["ips_precision_trimmed"] = _RoundOrNone(scorer.ips_precision_trimmed) out["ips_recall_trimmed"] = _RoundOrNone(scorer.ips_recall_trimmed) out["per_interface_ips_precision_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips_precision_trimmed] out["per_interface_ips_recall_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips_recall_trimmed] out["per_interface_ips_trimmed"] = \ [_RoundOrNone(x) for x in scorer.per_interface_ips_trimmed] if args.dockq: out["dockq_reference_interfaces"] = scorer.dockq_target_interfaces out["dockq_interfaces"] = scorer.dockq_interfaces out["dockq"] = [_RoundOrNone(x) for x in scorer.dockq_scores] out["fnat"] = [_RoundOrNone(x) for x in scorer.fnat] out["fnonnat"] = [_RoundOrNone(x) for x in scorer.fnonnat] out["irmsd"] = [_RoundOrNone(x) for x in scorer.irmsd] out["lrmsd"] = [_RoundOrNone(x) for x in scorer.lrmsd] out["nnat"] = scorer.nnat out["nmdl"] = scorer.nmdl out["dockq_ave"] = _RoundOrNone(scorer.dockq_ave) out["dockq_wave"] = _RoundOrNone(scorer.dockq_wave) out["dockq_ave_full"] = _RoundOrNone(scorer.dockq_ave_full) out["dockq_wave_full"] = _RoundOrNone(scorer.dockq_wave_full) if args.rigid_scores: out["oligo_gdtts"] = _RoundOrNone(scorer.gdtts) out["oligo_gdtha"] = _RoundOrNone(scorer.gdtha) out["rmsd"] = _RoundOrNone(scorer.rmsd) data = scorer.rigid_transform.data out["transform"] = [data[i:i + 4] for i in range(0, len(data), 4)] out["rigid_chain_mapping"] = scorer.rigid_mapping.GetFlatMapping() if args.patch_scores: out["model_interface_residues"] = \ _InterfaceResiduesToJSONList(scorer.model_interface_residues) out["reference_interface_residues"] = \ _InterfaceResiduesToJSONList(scorer.target_interface_residues) out["patch_qs"] = _PatchScoresToJSONList(scorer.model_interface_residues, scorer.patch_qs) out["patch_dockq"] = _PatchScoresToJSONList(scorer.model_interface_residues, scorer.patch_dockq) if args.tm_score: out["tm_score"] = _RoundOrNone(scorer.tm_score) out["usalign_mapping"] = scorer.usalign_mapping if args.dump_structures: # Dump model model_dump_filename = _AddSuffix(model.GetName(), args.dump_suffix) _DumpStructure(model, model_dump_filename, model_format) # Dump reference reference_dump_filename = _AddSuffix(reference.GetName(), args.dump_suffix) _DumpStructure(reference, reference_dump_filename, reference_format) return out def _Main(): args = _ParseArgs() ost.PushVerbosityLevel(args.verbosity) if args.verbosity < 4: # Hide tracebacks by default # Run script with -v 4 (Verbose) or higher to display them sys.tracebacklimit = 0 _CheckCompoundLib() try: compute_cad = args.cad_score or args.local_cad_score if compute_cad and not args.residue_number_alignment: raise RuntimeError("Only support CAD score when residue numbers in " "model and reference match. Use -rna flag if " "this is the case.") reference_format = _GetStructureFormat(args.reference, sformat=args.reference_format) reference = _LoadStructure(args.reference, sformat=reference_format, bu_id=args.reference_biounit, fault_tolerant = args.fault_tolerant) model_format = _GetStructureFormat(args.model, sformat=args.model_format) model = _LoadStructure(args.model, sformat=model_format, bu_id=args.model_biounit, fault_tolerant = args.fault_tolerant) out = _Process(model, reference, args, model_format, reference_format) # append input arguments out["model"] = args.model out["reference"] = args.reference out["fault_tolerant"] = args.fault_tolerant out["model_biounit"] = args.model_biounit out["reference_biounit"] = args.reference_biounit out["residue_number_alignment"] = args.residue_number_alignment out["enforce_consistency"] = args.enforce_consistency out["cad_exec"] = args.cad_exec out["usalign_exec"] = args.usalign_exec out["lddt_no_stereochecks"] = args.lddt_no_stereochecks out["min_pep_length"] = args.min_pep_length out["min_nuc_length"] = args.min_nuc_length out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts out["lddt_inclusion_radius"] = args.lddt_inclusion_radius out["dockq_capri_peptide"] = args.dockq_capri_peptide out["ost_version"] = ost.__version__ out["status"] = "SUCCESS" with open(args.output, 'w') as fh: json.dump(out, fh, indent=4, sort_keys=False) except Exception as exc: out = dict() out["status"] = "FAILURE" out["traceback"] = traceback.format_exc(limit=1000) etype, evalue, tb = sys.exc_info() out["exception"] = " ".join(traceback.format_exception_only(etype, evalue)) with open(args.output, 'w') as fh: json.dump(out, fh, indent=4, sort_keys=False) raise if __name__ == '__main__': _Main()