diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index 5a6d09b7fa3b7419ab2df9c97a1933d163a922fb..f2b9a58565f630d6a7757d4e4e0f7915aca723a5 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -17,7 +17,7 @@ Loads the structures and performs basic cleanup: The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional -options, this is a dictionary with 8 keys: +options, this is a dictionary with 8 keys describing model/reference comparison: * "reference_chains": Chain names of reference * "model_chains": Chain names of model @@ -36,7 +36,7 @@ options, this is a dictionary with 8 keys: format. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form - <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. + <trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>. Inconsistencies may lead to corrupt results but do not abort the program. Program abortion in these cases can be enforced with -ec/--enforce-consistency. @@ -44,6 +44,20 @@ options, this is a dictionary with 8 keys: content of the JSON output will be \"status\" set to FAILURE and an additional key: "traceback". +The following additional keys store relevant input parameters to reproduce +results: + + * "model" + * "reference" + * "fault_tolerant" + * "model_biounit" + * "reference_biounit" + * "residue_number_alignment" + * "enforce_consistency" + * "cad_exec" + * "usalign_exec" + * "lddt_no_stereochecks" + The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, @@ -216,13 +230,16 @@ def _ParseArgs(): default=False, action="store_true", help=("Compute per-residue lDDT scores with default parameterization " - "and store as key \"local_lddt\". Score for model residue with " - "number 42 in chain X can be extracted with: " - "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion " - "code, lets say A, the last key becomes \"42A\" Stereochemical " - "irregularities affecting lDDT are reported as keys " - "\"model_clashes\", \"model_bad_bonds\", \"model_bad_angles\" " - "and the respective reference counterparts.")) + "and store as key \"local_lddt\". Score for each residue is " + "accessible by key <chain_name>.<resnum>.<resnum_inscode>. " + "Residue with number 42 in chain X can be extracted with: " + "data[\"local_lddt\"][\"X.42.\"]. If there is an insertion " + "code, lets say A, the residue key becomes \"X.42.A\". " + "Stereochemical irregularities affecting lDDT are reported as " + "keys \"model_clashes\", \"model_bad_bonds\", " + "\"model_bad_angles\" and the respective reference " + "counterparts. Atoms specified in there follow the following " + "format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name>")) parser.add_argument( "--bb-lddt", @@ -241,10 +258,8 @@ def _ParseArgs(): help=("Compute per-residue lDDT scores with default parameterization " "and store as key \"bb_local_lddt\". lDDT in this case is only " "computed on backbone atoms: CA for peptides and C3' for " - "nucleotides. Score for model residue with " - "number 42 in chain X can be extracted with: " - "data[\"local_lddt\"][\"X\"][\"42\"]. If there is an insertion " - "code, lets say A, the last key becomes \"42A\"")) + "nucleotides. Per-residue scores are accessible as described for " + "local_lddt.")) parser.add_argument( "--cad-score", @@ -262,12 +277,10 @@ def _ParseArgs(): default=False, action="store_true", help=("Compute local CAD's atom-atom (AA) scores and store as key " - "\"local_cad_score\". Score for model residue with number 42 in " - "chain X can be extracted with: " - "data[\"local_cad_score\"][\"X\"][\"42\"]. " - "--residue-number-alignments must be enabled to compute this " - "score. Requires voronota_cadscore executable in PATH. " - "Alternatively you can set cad-exec.")) + "\"local_cad_score\". Per-residue scores are accessible as " + "described for local_lddt. --residue-number-alignments must be " + "enabled to compute this score. Requires voronota_cadscore " + "executable in PATH. Alternatively you can set cad-exec.")) parser.add_argument( "--cad-exec", @@ -350,8 +363,9 @@ def _ParseArgs(): "None. Model interface residues are available as key " "\"model_interface_residues\", reference interface residues as " "key \"reference_interface_residues\". Residues are represented " - "as string in form <num><inscode>. The respective scores are " - "available as keys \"patch_qs\" and \"patch_dockq\"")) + "as string in form <chain_name>.<resnum>.<resnum_inscode>. " + "The respective scores are available as keys \"patch_qs\" and " + "\"patch_dockq\"")) parser.add_argument( "--tm-score", @@ -474,29 +488,47 @@ def _GetInconsistentResidues(alns): r1 = col.GetResidue(0) r2 = col.GetResidue(1) if r1.IsValid() and r2.IsValid() and r1.GetName() != r2.GetName(): - lst.append((r1, r2)) - lst = [f"{x[0].GetQualifiedName()}-{x[1].GetQualifiedName()}" for x in lst] + ch_1 = r1.GetChain().name + num_1 = r1.number.num + ins_code_1 = r1.number.ins_code.strip("\u0000") + id_1 = f"{ch_1}.{num_1}.{ins_code_1}" + ch_2 = r2.GetChain().name + num_2 = r2.number.num + ins_code_2 = r2.number.ins_code.strip("\u0000") + id_2 = f"{ch_2}.{num_2}.{ins_code_2}" + lst.append(f"{id_1}-{id_2}") return lst def _LocalScoresToJSONDict(score_dict): - """ Score for model residue with number rnum in chain X can be extracted - with: data["local_cad_score"]["X"][rnum]. Convert ResNum object to str for - JSON serialization + """ Convert ResNums to str for JSON serialization """ json_dict = dict() for ch, ch_scores in score_dict.items(): - json_dict[ch] = {str(rnum): s for rnum, s in ch_scores.items()} + for num, s in ch_scores.items(): + ins_code = num.ins_code.strip("\u0000") + json_dict[f"{ch}.{num.num}.{ins_code}"] = s return json_dict -def _InterfaceResiduesToJSONDict(interface_dict): - """ Interface residues are stored as - data["model_interface_residues"]["A"][rnum1, rnum2,...]. Convert ResNum - object to str for JSON serialization. +def _InterfaceResiduesToJSONList(interface_dict): + """ Convert ResNums to str for JSON serialization. + + Changes in this function will affect _PatchScoresToJSONList """ - json_dict = dict() + json_list = list() for ch, ch_nums in interface_dict.items(): - json_dict[ch] = [str(rnum) for rnum in ch_nums] - return json_dict + for num in ch_nums: + ins_code = num.ins_code.strip("\u0000") + json_list.append(f"{ch}.{num.num}.{ins_code}") + return json_list + +def _PatchScoresToJSONList(interface_dict, score_dict): + """ Creates List of patch scores that are consistent with interface residue + lists + """ + json_list = list() + for ch, ch_nums in interface_dict.items(): + json_list += score_dict[ch] + return json_list def _Process(model, reference, args): @@ -579,11 +611,13 @@ def _Process(model, reference, args): if args.patch_scores: out["model_interface_residues"] = \ - _InterfaceResiduesToJSONDict(scorer.model_interface_residues) + _InterfaceResiduesToJSONList(scorer.model_interface_residues) out["reference_interface_residues"] = \ - _InterfaceResiduesToJSONDict(scorer.target_interface_residues) - out["patch_qs"] = scorer.patch_qs - out["patch_dockq"] = scorer.patch_dockq + _InterfaceResiduesToJSONList(scorer.target_interface_residues) + out["patch_qs"] = _PatchScoresToJSONList(scorer.model_interface_residues, + scorer.patch_qs) + out["patch_dockq"] = _PatchScoresToJSONList(scorer.model_interface_residues, + scorer.patch_dockq) if args.tm_score: out["tm_score"] = scorer.tm_score @@ -614,10 +648,6 @@ def _Process(model, reference, args): "fails in this case.") else: raise - - - - return out def _Main(): diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index 3178a2f5f4db18daf545281aaf3bcc1903f42ade..a71c8db2ec2cbc978da9de73001857e8718da23d 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -40,9 +40,11 @@ Details on the usage (output of ``ost compare-structures --help``): [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] [--lddt] [--local-lddt] [--bb-lddt] [--bb-local-lddt] [--cad-score] [--local-cad-score] - [--cad-exec CAD_EXEC] [--qs-score] + [--cad-exec CAD_EXEC] + [--usalign-exec USALIGN_EXEC] [--qs-score] [--rigid-scores] [--interface-scores] - [--patch-scores] + [--patch-scores] [--tm-score] + [--lddt-no-stereochecks] Evaluate model against reference @@ -62,8 +64,8 @@ Details on the usage (output of ``ost compare-structures --help``): The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional - options, this is a dictionary with 8 keys: - + options, this is a dictionary with 8 keys describing model/reference comparison: + * "reference_chains": Chain names of reference * "model_chains": Chain names of model * "chem_groups": Groups of polypeptides/polynucleotides from reference that @@ -81,29 +83,43 @@ Details on the usage (output of ``ost compare-structures --help``): format. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form - <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. + <trg_cname>.<trg_rnum>.<trg_ins_code>-<mdl_cname>.<mdl_rnum>.<mdl_ins_code>. Inconsistencies may lead to corrupt results but do not abort the program. Program abortion in these cases can be enforced with -ec/--enforce-consistency. * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". - + + The following additional keys store relevant input parameters to reproduce + results: + + * "model" + * "reference" + * "fault_tolerant" + * "model_biounit" + * "reference_biounit" + * "residue_number_alignment" + * "enforce_consistency" + * "cad_exec" + * "usalign_exec" + * "lddt_no_stereochecks" + The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, enabling -rna/--residue-number-alignment is recommended. - + Each score is opt-in and can be enabled with optional arguments. - + Example to compute global and per-residue lDDT values as well as QS-score: - + ost compare-structures -m model.pdb -r reference.cif --lddt --local-lddt --qs-score - + Example to inject custom chain mapping - + ost compare-structures -m model.pdb -r reference.cif -c A:B B:A - + optional arguments: -h, --help show this help message and exit -m MODEL, --model MODEL @@ -164,13 +180,17 @@ Details on the usage (output of ``ost compare-structures --help``): counterparts. --local-lddt Compute per-residue lDDT scores with default parameterization and store as key "local_lddt". Score - for model residue with number 42 in chain X can be - extracted with: data["local_lddt"]["X"]["42"]. If - there is an insertion code, lets say A, the last key - becomes "42A" Stereochemical irregularities affecting - lDDT are reported as keys "model_clashes", - "model_bad_bonds", "model_bad_angles" and the - respective reference counterparts. + for each residue is accessible by key + <chain_name>.<resnum>.<resnum_inscode>. Residue with + number 42 in chain X can be extracted with: + data["local_lddt"]["X.42."]. If there is an insertion + code, lets say A, the residue key becomes "X.42.A". + Stereochemical irregularities affecting lDDT are + reported as keys "model_clashes", "model_bad_bonds", + "model_bad_angles" and the respective reference + counterparts. Atoms specified in there follow the + following format: + <chain_name>.<resnum>.<resnum_inscode>.<atom_name> --bb-lddt Compute global lDDT score with default parameterization and store as key "bb_lddt". lDDT in this case is only computed on backbone atoms: CA for @@ -178,26 +198,26 @@ Details on the usage (output of ``ost compare-structures --help``): --bb-local-lddt Compute per-residue lDDT scores with default parameterization and store as key "bb_local_lddt". lDDT in this case is only computed on backbone atoms: - CA for peptides and C3' for nucleotides. Score for - model residue with number 42 in chain X can be - extracted with: data["local_lddt"]["X"]["42"]. If - there is an insertion code, lets say A, the last key - becomes "42A" + CA for peptides and C3' for nucleotides. Per-residue + scores are accessible as described for local_lddt. --cad-score Compute global CAD's atom-atom (AA) score and store as key "cad_score". --residue-number-alignment must be enabled to compute this score. Requires voronota_cadscore executable in PATH. Alternatively you can set cad-exec. --local-cad-score Compute local CAD's atom-atom (AA) scores and store as - key "local_cad_score". Score for model residue with - number 42 in chain X can be extracted with: - data["local_cad_score"]["X"]["42"]. --residue-number- - alignments must be enabled to compute this score. - Requires voronota_cadscore executable in PATH. + key "local_cad_score". Per-residue scores are + accessible as described for local_lddt. --residue- + number-alignments must be enabled to compute this + score. Requires voronota_cadscore executable in PATH. Alternatively you can set cad-exec. --cad-exec CAD_EXEC Path to voronota-cadscore executable (installed from https://github.com/kliment-olechnovic/voronota). Searches PATH if not set. + --usalign-exec USALIGN_EXEC + Path to USalign executable to compute TM-score. If not + given, an OpenStructure internal copy of USalign code + is used. --qs-score Compute QS-score, stored as key "qs_global", and the QS-best variant, stored as key "qs_best". --rigid-scores Computes rigid superposition based scores. They're @@ -248,12 +268,13 @@ Details on the usage (output of ``ost compare-structures --help``): "model_interface_residues", reference interface residues as key "reference_interface_residues". Residues are represented as string in form - <num><inscode>. The respective scores are available as - keys "patch_qs" and "patch_dockq" + <chain_name>.<resnum>.<resnum_inscode>. The respective + scores are available as keys "patch_qs" and + "patch_dockq" --tm-score Computes TM-score with the USalign tool. Also computes - a chain mapping in case of complexes that is stored - in the same format as the default mapping. TM-score - and the mapping are available as keys "tm_score" and + a chain mapping in case of complexes that is stored in + the same format as the default mapping. TM-score and + the mapping are available as keys "tm_score" and "usalign_mapping" --lddt-no-stereochecks Disable stereochecks for lDDT computation diff --git a/modules/mol/alg/pymod/stereochemistry.py b/modules/mol/alg/pymod/stereochemistry.py index 625ee4ea15be6e0654a1048efb42d0b64cf86726..b3708648bf26a4e331f683e9fe7309196450f72c 100644 --- a/modules/mol/alg/pymod/stereochemistry.py +++ b/modules/mol/alg/pymod/stereochemistry.py @@ -9,6 +9,18 @@ from ost import geom from ost import mol +def _AtomToQualifiedName(a): + """ Returns string to uniquely identify atom + + format: <chain_name>.<resnum>.<resnum_inscode>.<atom_name> + """ + r = a.GetResidue() + ch = r.GetChain() + num = r.number.num + ins_code = r.number.ins_code.strip("\u0000") + return f"{ch.name}.{r.number.num}.{ins_code}.{a.name}" + + def _PotentialDisulfid(a_one, a_two): """ Returns whether two atoms can potentially build a disulfid bond @@ -355,9 +367,12 @@ class ClashInfo: def ToJSON(self): """ Return JSON serializable dict + + Atoms are represented by a string in format: + <chain_name>.<resnum>.<resnum_inscode>.<atom_name> """ - return {"a1": self.a1.GetQualifiedName(), - "a2": self.a2.GetQualifiedName(), + return {"a1": _AtomToQualifiedName(self.a1), + "a2": _AtomToQualifiedName(self.a2), "dist": self.dist, "tolerated_dist": self.tolerated_dist} @@ -382,9 +397,12 @@ class BondViolationInfo: def ToJSON(self): """ Return JSON serializable dict + + Atoms are represented by a string in format: + <chain_name>.<resnum>.<resnum_inscode>.<atom_name> """ - return {"a1": self.a1.GetQualifiedName(), - "a2": self.a2.GetQualifiedName(), + return {"a1": _AtomToQualifiedName(self.a1), + "a2": _AtomToQualifiedName(self.a2), "length": self.length, "exp_length": self.exp_length, "std": self.std} @@ -412,10 +430,13 @@ class AngleViolationInfo: def ToJSON(self): """ Return JSON serializable dict + + Atoms are represented by a string in format: + <chain_name>.<resnum>.<resnum_inscode>.<atom_name> """ - return {"a1": self.a1.GetQualifiedName(), - "a2": self.a2.GetQualifiedName(), - "a3": self.a3.GetQualifiedName(), + return {"a1": _AtomToQualifiedName(self.a1), + "a2": _AtomToQualifiedName(self.a2), + "a3": _AtomToQualifiedName(self.a3), "angle": self.angle, "exp_angle": self.exp_angle, "std": self.std}