diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures index 58eeb6eb3da64daa581b0ce701ed58b454322a80..a22db143093a5c0c6a0e8c35aa0ae850ec49b117 100644 --- a/actions/ost-compare-ligand-structures +++ b/actions/ost-compare-ligand-structures @@ -47,10 +47,10 @@ options, this is a dictionary with three keys: content of the JSON output will be \"status\" set to FAILURE and an additional key: "traceback". -Each score is opt-in and, be enabled with optional arguments and is added -to the output. Keys correspond to the values in "model_ligands" above. -Unassigned ligands are reported with a message in "unassigned_model_ligands" -and "unassigned_reference_ligands". +Each score is opt-in and must be enabled with optional arguments. The scores +perform a model/reference ligand assignment and report a score for each assigned +model ligand. Optionally, unassigned model ligands are reported with a null +score and a reason why no assignment has been performed (--unassigned/-u). """ import argparse @@ -61,8 +61,9 @@ import traceback import ost from ost import io -from ost.mol.alg import ligand_scoring - +from ost.mol.alg import ligand_scoring_base +from ost.mol.alg import ligand_scoring_lddtpli +from ost.mol.alg import ligand_scoring_scrmsd def _ParseArgs(): parser = argparse.ArgumentParser(description = __doc__, @@ -175,21 +176,6 @@ def _ParseArgs(): help=("Make alignment based on residue number instead of using " "a global BLOSUM62-based alignment (NUC44 for nucleotides).")) - parser.add_argument( - "-ec", - "--enforce-consistency", - dest="enforce_consistency", - default=False, - action="store_true", - help=("Enforce consistency of residue names between the reference " - "binding site and the model. By default residue name " - "discrepancies are reported but the program proceeds. " - "If this is set to True, the program will fail with an error " - "message if the residues names differ. " - "Note: more binding site mappings may be explored during " - "scoring, but only inconsistencies in the selected mapping are " - "reported.")) - parser.add_argument( "-sm", "--substructure-match", @@ -205,51 +191,6 @@ def _ParseArgs(): default=0.2, help=("Coverage delta for partial ligand assignment.")) - parser.add_argument( - "-gcm", - "--global-chain-mapping", - dest="global_chain_mapping", - default=False, - action="store_true", - help=("Use a global chain mapping.")) - - parser.add_argument( - "-c", - "--chain-mapping", - nargs="+", - dest="chain_mapping", - help=("Custom mapping of chains between the reference and the model. " - "Each separate mapping consist of key:value pairs where key " - "is the chain name in reference and value is the chain name in " - "model. Only has an effect if the --global-chain-mapping flag " - "is set.")) - - parser.add_argument( - "-fbs", - "--full-bs-search", - dest="full_bs_search", - default=False, - action="store_true", - help=("Enumerate all potential binding sites in the model.")) - - parser.add_argument( - "-ra", - "--rmsd-assignment", - dest="rmsd_assignment", - default=True, - action="store_true", - help=("Use RMSD only for ligand assignment " - "(default since OpenStructure 2.8).")) - - parser.add_argument( - "-sa", - "--separate-assignment", - dest="rmsd_assignment", - default=True, - action="store_false", - help=("Use separate ligand assignments for RMSD and lDDT-PLI " - "(opposite of --rmsd-assignment).")) - parser.add_argument( "-u", "--unassigned", @@ -260,6 +201,16 @@ def _ParseArgs(): "assigned ligands, with a null score, and reason for not being " "assigned.")) + parser.add_argument( + '-v', + '--verbosity', + dest="verbosity", + type=int, + default=3, + help="Set verbosity level. Defaults to 3 (INFO).") + + # arguments relevant for lddt-pli + parser.add_argument( "--lddt-pli", dest="lddt_pli", @@ -267,6 +218,21 @@ def _ParseArgs(): action="store_true", help=("Compute lDDT-PLI score and store as key \"lddt-pli\".")) + parser.add_argument( + "--lddt-pli-radius", + dest="lddt_pli_radius", + default=6.0, + help=("lDDT inclusion radius for lDDT-PLI.")) + + parser.add_argument( + "--lddt-pli-amc", + dest="lddt_pli_amc", + default=False, + action="store_true", + help=("Add model contacts (amc) when computing lDDT-PLI.")) + + # arguments relevant for rmsd + parser.add_argument( "--rmsd", dest="rmsd", @@ -278,39 +244,24 @@ def _ParseArgs(): "--radius", dest="radius", default=4.0, - help=("Inclusion radius for the binding site. Any residue with atoms " - "within this distance of the ligand will be included in the " - "binding site.")) - - parser.add_argument( - "--lddt-pli-radius", - dest="lddt_pli_radius", - default=6.0, - help=("lDDT inclusion radius for lDDT-PLI.")) + help=("Inclusion radius to extract reference binding site that is used " + "for RMSD computation. Any residue with atoms within this " + "distance of the ligand will be included in the binding site.")) parser.add_argument( "--lddt-lp-radius", dest="lddt_lp_radius", - default=10.0, + default=15.0, help=("lDDT inclusion radius for lDDT-LP.")) parser.add_argument( - '-v', - '--verbosity', - dest="verbosity", - type=int, - default=3, - help="Set verbosity level. Defaults to 3 (INFO).") - - parser.add_argument( - "--n-max-naive", - dest="n_max_naive", - required=False, - default=12, - type=int, - help=("If number of chains in model and reference are below or equal " - "that number, the global chain mapping will naively enumerate " - "all possible mappings. A heuristic is used otherwise.")) + "-fbs", + "--full-bs-search", + dest="full_bs_search", + default=False, + action="store_true", + help=("Enumerate all potential binding sites in the model when " + "searching rigid superposition for RMSD computation")) return parser.parse_args() @@ -448,36 +399,67 @@ def _QualifiedResidueNotation(r): ins_code=resnum.ins_code.strip("\u0000"), ) +def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args): + return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference, + model_ligands = model_ligands, + target_ligands = reference_ligands, + resnum_alignments = args.residue_number_alignment, + rename_ligand_chain = True, + substructure_match = args.substructure_match, + coverage_delta = args.coverage_delta, + lddt_pli_radius = args.lddt_pli_radius, + add_mdl_contacts = args.lddt_pli_amc) + +def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args): + return ligand_scoring_scrmsd.SCRMSDScorer(model, reference, + model_ligands = model_ligands, + target_ligands = reference_ligands, + resnum_alignments = args.residue_number_alignment, + rename_ligand_chain = True, + substructure_match = args.substructure_match, + coverage_delta = args.coverage_delta, + bs_radius = args.radius, + lddt_lp_radius = args.lddt_lp_radius) def _Process(model, model_ligands, reference, reference_ligands, args): - mapping = None - if args.chain_mapping is not None: - mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} - - scorer = ligand_scoring.LigandScorer( - model=model, - target=reference, - model_ligands=model_ligands, - target_ligands=reference_ligands, - resnum_alignments=args.residue_number_alignment, - check_resnames=args.enforce_consistency, - rename_ligand_chain=True, - substructure_match=args.substructure_match, - coverage_delta=args.coverage_delta, - global_chain_mapping=args.global_chain_mapping, - full_bs_search=args.full_bs_search, - rmsd_assignment=args.rmsd_assignment, - unassigned=args.unassigned, - radius=args.radius, - lddt_pli_radius=args.lddt_pli_radius, - lddt_lp_radius=args.lddt_lp_radius, - n_max_naive=args.n_max_naive, - custom_mapping=mapping - ) - out = dict() + ########################## + # Setup required scorers # + ########################## + + lddtpli_scorer = None + scrmsd_scorer = None + + if args.lddt_pli: + lddtpli_scorer = _SetupLDDTPLIScorer(model, model_ligands, + reference, reference_ligands, + args) + + if args.rmsd: + scrmsd_scorer = _SetupSCRMSDScorer(model, model_ligands, + reference, reference_ligands, + args) + + # basic info on ligands only requires baseclass functionality + # doesn't matter which scorer we use + scorer = None + if lddtpli_scorer is not None: + scorer = lddtpli_scorer + elif scrmsd_scorer is not None: + scorer = scrmsd_scorer + else: + ost.LogWarning("No score selected, output will be empty.") + # just create SCRMSD scorer to fill basic ligand info + scorer = _SetupSCRMSDScorer(model, model_ligands, + reference, reference_ligands, + args) + + #################################### + # Extract / Map ligand information # + #################################### + if model_ligands is not None: # Replace model ligand by path if len(model_ligands) == len(scorer.model_ligands): @@ -486,7 +468,7 @@ def _Process(model, model_ligands, reference, reference_ligands, args): elif len(model_ligands) < len(scorer.model_ligands): # Multi-ligand SDF files were given # Map ligand => path:idx - out["model_ligands"] = [] + out["model_ligands"] = list() for ligand, filename in zip(model_ligands, args.model_ligands): assert isinstance(ligand, ost.mol.EntityHandle) for i, residue in enumerate(ligand.residues): @@ -500,9 +482,6 @@ def _Process(model, model_ligands, reference, reference_ligands, args): # Map ligand => qualified residue out["model_ligands"] = [_QualifiedResidueNotation(l) for l in scorer.model_ligands] - model_ligands_map = {k.hash_code: v for k, v in zip( - scorer.model_ligands, out["model_ligands"])} - if reference_ligands is not None: # Replace reference ligand by path if len(reference_ligands) == len(scorer.target_ligands): @@ -511,7 +490,7 @@ def _Process(model, model_ligands, reference, reference_ligands, args): elif len(reference_ligands) < len(scorer.target_ligands): # Multi-ligand SDF files were given # Map ligand => path:idx - out["reference_ligands"] = [] + out["reference_ligands"] = list() for ligand, filename in zip(reference_ligands, args.reference_ligands): assert isinstance(ligand, ost.mol.EntityHandle) for i, residue in enumerate(ligand.residues): @@ -521,86 +500,72 @@ def _Process(model, model_ligands, reference, reference_ligands, args): raise RuntimeError("Fewer ligands in the reference scorer " "(%d) than given (%d)" % ( len(scorer.target_ligands), len(reference_ligands))) - else: # Map ligand => qualified residue out["reference_ligands"] = [_QualifiedResidueNotation(l) for l in scorer.target_ligands] - reference_ligands_map = {k.hash_code: v for k, v in zip( - scorer.target_ligands, out["reference_ligands"])} - - - if not (args.lddt_pli or args.rmsd): - ost.LogWarning("No score selected, output will be empty.") - else: - out["unassigned_model_ligands"] = {} - for chain, unassigned_residues in scorer.unassigned_model_ligands.items(): - for resnum, unassigned in unassigned_residues.items(): - mdl_lig = scorer.model.FindResidue(chain, resnum) - out["unassigned_model_ligands"][model_ligands_map[ - mdl_lig.hash_code]] = unassigned - out["unassigned_reference_ligands"] = {} - for chain, unassigned_residues in scorer.unassigned_target_ligands.items(): - for resnum, unassigned in unassigned_residues.items(): - trg_lig = scorer.target.FindResidue(chain, resnum) - out["unassigned_reference_ligands"][reference_ligands_map[ - trg_lig.hash_code]] = unassigned - out["unassigned_model_ligand_descriptions"] = scorer.unassigned_model_ligand_descriptions - out["unassigned_reference_ligand_descriptions"] = scorer.unassigned_target_ligand_descriptions + ################## + # Compute scores # + ################## if args.lddt_pli: - out["lddt_pli"] = {} - for chain, lddt_pli_results in scorer.lddt_pli_details.items(): - for resnum, lddt_pli in lddt_pli_results.items(): - if args.unassigned and lddt_pli["unassigned"]: - mdl_lig = scorer.model.FindResidue(chain, resnum) - model_key = model_ligands_map[mdl_lig.hash_code] - else: - model_key = model_ligands_map[lddt_pli["model_ligand"].hash_code] - lddt_pli["reference_ligand"] = reference_ligands_map[ - lddt_pli.pop("target_ligand").hash_code] - lddt_pli["model_ligand"] = model_key - transform_data = lddt_pli["transform"].data - lddt_pli["transform"] = [transform_data[i:i + 4] - for i in range(0, len(transform_data), - 4)] - lddt_pli["bs_ref_res"] = [_QualifiedResidueNotation(r) for r in - lddt_pli["bs_ref_res"]] - lddt_pli["bs_ref_res_mapped"] = [_QualifiedResidueNotation(r) for r in - lddt_pli["bs_ref_res_mapped"]] - lddt_pli["bs_mdl_res_mapped"] = [_QualifiedResidueNotation(r) for r in - lddt_pli["bs_mdl_res_mapped"]] - lddt_pli["inconsistent_residues"] = ["%s-%s" %( - _QualifiedResidueNotation(x), _QualifiedResidueNotation(y)) for x,y in lddt_pli[ - "inconsistent_residues"]] - out["lddt_pli"][model_key] = lddt_pli + out["lddt_pli"] = dict() + for lig_pair in lddtpli_scorer.assignment: + score = float(lddtpli_scorer.score_matrix[lig_pair[0], lig_pair[1]]) + coverage = float(lddtpli_scorer.coverage_matrix[lig_pair[0], lig_pair[1]]) + aux_data = lddtpli_scorer.aux_matrix[lig_pair[0], lig_pair[1]] + target_key = out["reference_ligands"][lig_pair[0]] + model_key = out["model_ligands"][lig_pair[1]] + out["lddt_pli"][model_key] = {"lddt_pli": score, + "coverage": coverage, + "lddt_pli_n_contacts": aux_data["lddt_pli_n_contacts"], + "model_ligand": model_key, + "reference_ligand": target_key, + "bs_ref_res": [_QualifiedResidueNotation(r) for r in + aux_data["bs_ref_res"]], + "bs_mdl_res": [_QualifiedResidueNotation(r) for r in + aux_data["bs_mdl_res"]]} + if args.unassigned: + for i in lddtpli_scorer.unassigned_model_ligands: + model_key = out["model_ligands"][i] + reason = lddtpli_scorer.guess_model_ligand_unassigned_reason(i) + out["lddt_pli"][model_key] = {"lddt_pli": None, + "unassigned_reason": reason} + if args.rmsd: - out["rmsd"] = {} - for chain, rmsd_results in scorer.rmsd_details.items(): - for _, rmsd in rmsd_results.items(): - if args.unassigned and rmsd["unassigned"]: - mdl_lig = scorer.model.FindResidue(chain, resnum) - model_key = model_ligands_map[mdl_lig.hash_code] - else: - model_key = model_ligands_map[rmsd["model_ligand"].hash_code] - rmsd["reference_ligand"] = reference_ligands_map[ - rmsd.pop("target_ligand").hash_code] - rmsd["model_ligand"] = model_key - transform_data = rmsd["transform"].data - rmsd["transform"] = [transform_data[i:i + 4] - for i in range(0, len(transform_data), 4)] - rmsd["bs_ref_res"] = [_QualifiedResidueNotation(r) for r in - rmsd["bs_ref_res"]] - rmsd["bs_ref_res_mapped"] = [_QualifiedResidueNotation(r) for r in - rmsd["bs_ref_res_mapped"]] - rmsd["bs_mdl_res_mapped"] = [_QualifiedResidueNotation(r) for r in - rmsd["bs_mdl_res_mapped"]] - rmsd["inconsistent_residues"] = ["%s-%s" %( - _QualifiedResidueNotation(x), _QualifiedResidueNotation(y)) for x,y in rmsd[ - "inconsistent_residues"]] - out["rmsd"][model_key] = rmsd + out["rmsd"] = dict() + for lig_pair in scrmsd_scorer.assignment: + score = float(scrmsd_scorer.score_matrix[lig_pair[0], lig_pair[1]]) + coverage = float(lddtpli_scorer.coverage_matrix[lig_pair[0], lig_pair[1]]) + aux_data = scrmsd_scorer.aux_matrix[lig_pair[0], lig_pair[1]] + target_key = out["reference_ligands"][lig_pair[0]] + model_key = out["model_ligands"][lig_pair[1]] + transform_data = aux_data["transform"].data + out["rmsd"][model_key] = {"rmsd": score, + "coverage": coverage, + "lddt_lp": aux_data["lddt_lp"], + "bb_rmsd": aux_data["bb_rmsd"], + "model_ligand": model_key, + "reference_ligand": target_key, + "chain_mapping": aux_data["chain_mapping"], + "bs_ref_res": [_QualifiedResidueNotation(r) for r in + aux_data["bs_ref_res"]], + "bs_ref_res_mapped": [_QualifiedResidueNotation(r) for r in + aux_data["bs_ref_res_mapped"]], + "bs_mdl_res_mapped": [_QualifiedResidueNotation(r) for r in + aux_data["bs_mdl_res_mapped"]], + "inconsistent_residues": [_QualifiedResidueNotation(r) for r in + aux_data["inconsistent_residues"]], + "transform": [transform_data[i:i + 4] + for i in range(0, len(transform_data), 4)]} + if args.unassigned: + for i in scrmsd_scorer.unassigned_model_ligands: + model_key = out["model_ligands"][i] + reason = scrmsd_scorer.guess_model_ligand_unassigned_reason(i) + out["rmsd"][model_key] = {"rmsd": None, + "unassigned_reason": reason} return out @@ -610,7 +575,7 @@ def _Main(): args = _ParseArgs() ost.PushVerbosityLevel(args.verbosity) if args.verbosity < 4: - sys.tracebacklimit = 0 + sys.tracebacklimit = 100 _CheckCompoundLib() try: # Load structures diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index 7a0b812d314daf475496a0c9b46d77caf1735d6f..c7be780d281570749ac6701918d506e75fc4d3d0 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -395,141 +395,133 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): .. code-block:: console - usage: ost compare-ligand-structures [-h] -m MODEL [-ml [MODEL_LIGANDS ...]] - -r REFERENCE [-rl [REFERENCE_LIGANDS ...]] - [-o OUTPUT] [-mf {pdb,mmcif,cif}] - [-rf {pdb,mmcif,cif}] [-ft] [-rna] [-ec] [-sm] - [-gcm] [-c CHAIN_MAPPING [CHAIN_MAPPING ...]] - [-ra] [--lddt-pli] [--rmsd] [--radius RADIUS] - [--lddt-pli-radius LDDT_PLI_RADIUS] - [--lddt-lp-radius LDDT_LP_RADIUS] - [-v VERBOSITY] [--n-max-naive N_MAX_NAIVE] - - Evaluate model with non-polymer/small molecule ligands against reference. - - Example: ost compare-ligand-structures \ - -m model.pdb \ - -ml ligand.sdf \ - -r reference.cif \ - --lddt-pli --rmsd - - Structures of polymer entities (proteins and nucleotides) can be given in PDB - or mmCIF format. If the structure is given in mmCIF format, only the asymmetric - unit (AU) is used for scoring. - - Ligands can be given as path to SDF files containing the ligand for both model - (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted, - ligands will be detected in the model and reference structures. For structures - given in mmCIF format, this is based on the annotation as "non polymer entity" - (i.e. ligands in the _pdbx_entity_nonpoly mmCIF category) and works reliably. - For structures given in PDB format, this is based on the HET records and is - normally not what you want. You should always give ligands as SDF for - structures in PDB format. - - Polymer/oligomeric ligands (saccharides, peptides, nucleotides) are not - supported. - - Only minimal cleanup steps are performed (remove hydrogens, and for structures - of polymers only, remove unknown atoms and cleanup element column). - - Ligands in mmCIF and PDB files must comply with the PDB component dictionary - definition, and have properly named residues and atoms, in order for - ligand connectivity to be loaded correctly. Ligands loaded from SDF files - are exempt from this restriction, meaning any arbitrary ligand can be assessed. - - Output is written in JSON format (default: out.json). In case of no additional - options, this is a dictionary with three keys: - - * "model_ligands": A list of ligands in the model. If ligands were provided - explicitly with --model-ligands, elements of the list will be the paths to - the ligand SDF file(s). Otherwise, they will be the chain name, residue - number and insertion code of the ligand, separated by a dot. - * "reference_ligands": A list of ligands in the reference. If ligands were - provided explicitly with --reference-ligands, elements of the list will be - the paths to the ligand SDF file(s). Otherwise, they will be the chain name, - residue number and insertion code of the ligand, separated by a dot. - * "status": SUCCESS if everything ran through. In case of failure, the only - content of the JSON output will be "status" set to FAILURE and an - additional key: "traceback". - - Each score is opt-in and, be enabled with optional arguments and is added - to the output. Keys correspond to the values in "model_ligands" above. - Unassigned ligands are reported with a message in - "unassigned_model_ligands" and "unassigned_reference_ligands". - - options: - -h, --help show this help message and exit - -m MODEL, --mdl MODEL, --model MODEL - Path to model file. - -ml [MODEL_LIGANDS ...], --mdl-ligands [MODEL_LIGANDS ...], - --model-ligands [MODEL_LIGANDS ...] - Path to model ligand files. - -r REFERENCE, --ref REFERENCE, --reference REFERENCE - Path to reference file. - -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], - --reference-ligands [REFERENCE_LIGANDS ...] - Path to reference ligand files. - -o OUTPUT, --out OUTPUT, --output OUTPUT - Output file name. The output will be saved as a JSON - file. default: out.json - -mf {pdb,mmcif,cif}, --mdl-format {pdb,mmcif,cif}, - --model-format {pdb,mmcif,cif} - Format of model file. Inferred from path if not - given. - -rf {pdb,mmcif,cif}, --reference-format {pdb,mmcif,cif}, - --ref-format {pdb,mmcif,cif} - Format of reference file. Inferred from path if not - given. - -ft, --fault-tolerant - Fault tolerant parsing. - -rna, --residue-number-alignment - Make alignment based on residue number instead of - using a global BLOSUM62-based alignment (NUC44 for - nucleotides). - -ec, --enforce-consistency - Enforce consistency of residue names between the - reference binding site and the model. By default - residue name discrepancies are reported but the - program proceeds. If this is set to True, the program - will fail with an error message if the residues names - differ. Note: more binding site mappings may be - explored during scoring, but only inconsistencies in - the selected mapping are reported. - -sm, --substructure-match - Allow incomplete target ligands. - -gcm, --global-chain-mapping - Use a global chain mapping. - -c CHAIN_MAPPING [CHAIN_MAPPING ...], - --chain-mapping CHAIN_MAPPING [CHAIN_MAPPING ...] - Custom mapping of chains between the reference and - the model. Each separate mapping consist of key:value - pairs where key is the chain name in reference and - value is the chain name in model. Only has an effect - if global-chain-mapping flag is set. - -ra, --rmsd-assignment - Use RMSD for ligand assignment. - -u, --unassigned Report unassigned model ligands in the output - together with assigned ligands, with a null score, - and reason for not being assigned. - - --lddt-pli Compute lDDT-PLI score and store as key "lddt-pli". - --rmsd Compute RMSD score and store as key "rmsd". - --radius RADIUS Inclusion radius for the binding site. Any residue - with atoms within this distance of the ligand will - be included in the binding site. - --lddt-pli-radius LDDT_PLI_RADIUS - lDDT inclusion radius for lDDT-PLI. - --lddt-lp-radius LDDT_LP_RADIUS - lDDT inclusion radius for lDDT-LP. - -v VERBOSITY, --verbosity VERBOSITY - Set verbosity level. Defaults to 3 (INFO). - --n-max-naive N_MAX_NAIVE - If number of chains in model and reference are - below or equal that number, the global chain - mapping will naively enumerate all possible - mappings. A heuristic is used otherwise. - - -Additional information about the scores and output values is available in -:meth:`rmsd_details <ost.mol.alg.ligand_scoring.LigandScorer.rmsd_details>` and -:meth:`lddt_pli_details <ost.mol.alg.ligand_scoring.LigandScorer.lddt_pli_details>`. + usage: ost compare-ligand-structures [-h] -m MODEL [-ml [MODEL_LIGANDS ...]] + -r REFERENCE + [-rl [REFERENCE_LIGANDS ...]] [-o OUTPUT] + [-mf {pdb,cif,mmcif}] + [-rf {pdb,cif,mmcif}] [-mb MODEL_BIOUNIT] + [-rb REFERENCE_BIOUNIT] [-ft] [-rna] + [-sm] [-cd COVERAGE_DELTA] [-u] + [-v VERBOSITY] [--lddt-pli] + [--lddt-pli-radius LDDT_PLI_RADIUS] + [--lddt-pli-amc] [--rmsd] + [--radius RADIUS] + [--lddt-lp-radius LDDT_LP_RADIUS] [-fbs] + + Evaluate model with non-polymer/small molecule ligands against reference. + + Example: ost compare-ligand-structures \ + -m model.pdb \ + -ml ligand.sdf \ + -r reference.cif \ + --lddt-pli --rmsd + + Structures of polymer entities (proteins and nucleotides) can be given in PDB + or mmCIF format. + + Ligands can be given as path to SDF files containing the ligand for both model + (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted, + ligands will be detected in the model and reference structures. For structures + given in mmCIF format, this is based on the annotation as "non polymer entity" + (i.e. ligands in the _pdbx_entity_nonpoly mmCIF category) and works reliably. + For structures given in legacy PDB format, this is based on the HET records + which is usually only set properly on files downloaded from the PDB (and even + then, this is not always the case). This is normally not what you want. You + should always give ligands as SDF for structures in legacy PDB format. + + Polymer/oligomeric ligands (saccharides, peptides, nucleotides) are not + supported. + + Only minimal cleanup steps are performed (remove hydrogens and deuteriums, + and for structures of polymers only, remove unknown atoms and cleanup element + column). + + Ligands in mmCIF and PDB files must comply with the PDB component dictionary + definition, and have properly named residues and atoms, in order for + ligand connectivity to be loaded correctly. Ligands loaded from SDF files + are exempt from this restriction, meaning any arbitrary ligand can be assessed. + + Output is written in JSON format (default: out.json). In case of no additional + options, this is a dictionary with three keys: + + * "model_ligands": A list of ligands in the model. If ligands were provided + explicitly with --model-ligands, elements of the list will be the paths to + the ligand SDF file(s). Otherwise, they will be the chain name, residue + number and insertion code of the ligand, separated by a dot. + * "reference_ligands": A list of ligands in the reference. If ligands were + provided explicitly with --reference-ligands, elements of the list will be + the paths to the ligand SDF file(s). Otherwise, they will be the chain name, + residue number and insertion code of the ligand, separated by a dot. + * "status": SUCCESS if everything ran through. In case of failure, the only + content of the JSON output will be "status" set to FAILURE and an + additional key: "traceback". + + Each score is opt-in and must be enabled with optional arguments. The scores + perform a model/reference ligand assignment and report a score for each assigned + model ligand. Optionally, unassigned model ligands are reported with a null + score and a reason why no assignment has been performed (--unassigned/-u). + + options: + -h, --help show this help message and exit + -m MODEL, --mdl MODEL, --model MODEL + Path to model file. + -ml [MODEL_LIGANDS ...], --mdl-ligands [MODEL_LIGANDS ...], --model-ligands [MODEL_LIGANDS ...] + Path to model ligand files. + -r REFERENCE, --ref REFERENCE, --reference REFERENCE + Path to reference file. + -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...] + Path to reference ligand files. + -o OUTPUT, --out OUTPUT, --output OUTPUT + Output file name. The output will be saved as a JSON + file. default: out.json + -mf {pdb,cif,mmcif}, --mdl-format {pdb,cif,mmcif}, --model-format {pdb,cif,mmcif} + Format of model file. pdb reads pdb but also pdb.gz, + same applies to cif/mmcif. Inferred from filepath if + not given. + -rf {pdb,cif,mmcif}, --reference-format {pdb,cif,mmcif}, --ref-format {pdb,cif,mmcif} + Format of reference file. pdb reads pdb but also + pdb.gz, same applies to cif/mmcif. Inferred from + filepath if not given. + -mb MODEL_BIOUNIT, --model-biounit MODEL_BIOUNIT + Only has an effect if model is in mmcif format. By + default, the asymmetric unit (AU) is used for scoring. + If there are biounits defined in the mmcif file, you + can specify the ID (as a string) of the one which + should be used. + -rb REFERENCE_BIOUNIT, --reference-biounit REFERENCE_BIOUNIT + Only has an effect if reference is in mmcif format. By + default, the asymmetric unit (AU) is used for scoring. + If there are biounits defined in the mmcif file, you + can specify the ID (as a string) of the one which + should be used. + -ft, --fault-tolerant + Fault tolerant parsing. + -rna, --residue-number-alignment + Make alignment based on residue number instead of + using a global BLOSUM62-based alignment (NUC44 for + nucleotides). + -sm, --substructure-match + Allow incomplete (ie partially resolved) target + ligands. + -cd COVERAGE_DELTA, --coverage-delta COVERAGE_DELTA + Coverage delta for partial ligand assignment. + -u, --unassigned Report unassigned model ligands in the output together + with assigned ligands, with a null score, and reason + for not being assigned. + -v VERBOSITY, --verbosity VERBOSITY + Set verbosity level. Defaults to 3 (INFO). + --lddt-pli Compute lDDT-PLI score and store as key "lddt-pli". + --lddt-pli-radius LDDT_PLI_RADIUS + lDDT inclusion radius for lDDT-PLI. + --lddt-pli-amc Add model contacts (amc) when computing lDDT-PLI. + --rmsd Compute RMSD score and store as key "rmsd". + --radius RADIUS Inclusion radius to extract reference binding site + that is used for RMSD computation. Any residue with + atoms within this distance of the ligand will be + included in the binding site. + --lddt-lp-radius LDDT_LP_RADIUS + lDDT inclusion radius for lDDT-LP. + -fbs, --full-bs-search + Enumerate all potential binding sites in the model + when searching rigid superposition for RMSD + computation diff --git a/modules/mol/alg/doc/molalg.rst b/modules/mol/alg/doc/molalg.rst index 79efa947e6f9f275528331c1104068ec1f3b74b4..0210375dbe3bb43c8f394c5baeb4e03c7ab49836 100644 --- a/modules/mol/alg/doc/molalg.rst +++ b/modules/mol/alg/doc/molalg.rst @@ -115,10 +115,20 @@ Local Distance Test scores (lDDT, DRMSD) .. currentmodule:: ost.mol.alg -:mod:`ligand_scoring <ost.mol.alg.ligand_scoring>` -- Ligand scoring functions --------------------------------------------------------------------------------- +:mod:`ligand_scoring <ost.mol.alg.ligand_scoring_base>` -- Ligand scoring functions +----------------------------------------------------------------------------------- + +.. automodule:: ost.mol.alg.ligand_scoring_base + :members: + :member-order: bysource + :synopsis: Scoring of ligands + +.. automodule:: ost.mol.alg.ligand_scoring_lddtpli + :members: + :member-order: bysource + :synopsis: Scoring of ligands -.. automodule:: ost.mol.alg.ligand_scoring +.. automodule:: ost.mol.alg.ligand_scoring_scrmsd :members: :member-order: bysource :synopsis: Scoring of ligands diff --git a/modules/mol/alg/pymod/CMakeLists.txt b/modules/mol/alg/pymod/CMakeLists.txt index 8506d7be3ad9a95631ec61a5db2371e942aff879..ecf4ca0fa0a70ca1ed658b6d4b71bef12830731e 100644 --- a/modules/mol/alg/pymod/CMakeLists.txt +++ b/modules/mol/alg/pymod/CMakeLists.txt @@ -29,9 +29,11 @@ set(OST_MOL_ALG_PYMOD_MODULES scoring.py chain_mapping.py stereochemistry.py - ligand_scoring.py dockq.py contact_score.py + ligand_scoring_base.py + ligand_scoring_scrmsd.py + ligand_scoring_lddtpli.py ) if (NOT ENABLE_STATIC) diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index 41f7f69755b56ecc04c36f494c6e469176032223..15ba5b1535c11eb8ad60da457e02a9a8a20de4ea 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -3390,6 +3390,9 @@ def _RefEqualGenerator(ref_chains, mdl_chains): for p in itertools.permutations(mdl_chains): yield list(p) +def _RefEmptyGenerator(ref_chains, mdl_chains): + yield list() + def _ConcatIterators(iterators): for item in itertools.product(*iterators): yield list(item) @@ -3435,8 +3438,8 @@ def _ChainMappings(ref_chains, mdl_chains, n_max=None): iterators = list() for ref, mdl in zip(ref_chains, mdl_chains): if len(ref) == 0: - raise RuntimeError("Expext at least one chain in ref chem group") - if len(ref) == len(mdl): + iterators.append(_RefEmptyGenerator(ref, mdl)) + elif len(ref) == len(mdl): iterators.append(_RefEqualGenerator(ref, mdl)) elif len(ref) < len(mdl): iterators.append(_RefSmallerGenerator(ref, mdl)) diff --git a/modules/mol/alg/pymod/lddt.py b/modules/mol/alg/pymod/lddt.py index adc0a8901971657d39f8eb3f692f82cf615efaa8..ac2458a51e460542a9158bd58b98c71b0677fe40 100644 --- a/modules/mol/alg/pymod/lddt.py +++ b/modules/mol/alg/pymod/lddt.py @@ -430,7 +430,8 @@ class lDDTScorer: chain_mapping=None, no_interchain=False, no_intrachain=False, penalize_extra_chains=False, residue_mapping=None, return_dist_test=False, - check_resnames=True, add_mdl_contacts=False): + check_resnames=True, add_mdl_contacts=False, + interaction_data=None): """Computes lDDT of *model* - globally and per-residue :param model: Model to be scored - models are preferably scored upon @@ -520,6 +521,8 @@ class lDDTScorer: be added if the respective atom pair is not resolved in the target. :type add_mdl_contacts: :class:`bool` + :param interaction_data: Pro param - don't use + :type interaction_data: :class:`tuple` :returns: global and per-residue lDDT scores as a tuple - first element is global lDDT score (None if *target* has no @@ -551,37 +554,51 @@ class lDDTScorer: # data objects defining model data - see _ProcessModel for rough # description pos, res_ref_atom_indices, res_atom_indices, res_atom_hashes, \ - res_indices, symmetries = self._ProcessModel(model, chain_mapping, - residue_mapping = residue_mapping, - thresholds = thresholds, - check_resnames = check_resnames) + res_indices, ref_res_indices, symmetries = \ + self._ProcessModel(model, chain_mapping, + residue_mapping = residue_mapping, + thresholds = thresholds, + check_resnames = check_resnames) if no_interchain and no_intrachain: raise RuntimeError("no_interchain and no_intrachain flags are " "mutually exclusive") - if no_interchain: - sym_ref_indices = self.sym_ref_indices_sc - sym_ref_distances = self.sym_ref_distances_sc - ref_indices = self.ref_indices_sc - ref_distances = self.ref_distances_sc - elif no_intrachain: - sym_ref_indices = self.sym_ref_indices_ic - sym_ref_distances = self.sym_ref_distances_ic - ref_indices = self.ref_indices_ic - ref_distances = self.ref_distances_ic + + sym_ref_indices = None + sym_ref_distances = None + ref_indices = None + ref_distances = None + + if interaction_data is None: + if no_interchain: + sym_ref_indices = self.sym_ref_indices_sc + sym_ref_distances = self.sym_ref_distances_sc + ref_indices = self.ref_indices_sc + ref_distances = self.ref_distances_sc + elif no_intrachain: + sym_ref_indices = self.sym_ref_indices_ic + sym_ref_distances = self.sym_ref_distances_ic + ref_indices = self.ref_indices_ic + ref_distances = self.ref_distances_ic + else: + sym_ref_indices = self.sym_ref_indices + sym_ref_distances = self.sym_ref_distances + ref_indices = self.ref_indices + ref_distances = self.ref_distances + + if add_mdl_contacts: + ref_indices, ref_distances = \ + self._AddMdlContacts(model, res_atom_indices, res_atom_hashes, + ref_indices, ref_distances, + no_interchain, no_intrachain) + # recompute symmetry related indices/distances + sym_ref_indices, sym_ref_distances = \ + lDDTScorer._NonSymDistances(self.n_atoms, self.symmetric_atoms, + ref_indices, ref_distances) else: - sym_ref_indices = self.sym_ref_indices - sym_ref_distances = self.sym_ref_distances - ref_indices = self.ref_indices - ref_distances = self.ref_distances - - if add_mdl_contacts: - ref_indices, ref_distances, \ - sym_ref_indices, sym_ref_distances = \ - self._AddMdlContacts(model, res_atom_indices, res_atom_hashes, - ref_indices, ref_distances, - no_interchain, no_intrachain) + sym_ref_indices, sym_ref_distances, ref_indices, ref_distances = \ + interaction_data self._ResolveSymmetries(pos, thresholds, symmetries, sym_ref_indices, sym_ref_distances) @@ -689,6 +706,9 @@ class lDDTScorer: # indices of the scored residues res_indices = list() + # respective residue indices in reference + ref_res_indices = list() + # Will contain one element per symmetry group symmetries = list() @@ -724,6 +744,7 @@ class lDDTScorer: res_atom_indices.append(list()) res_atom_hashes.append(list()) res_indices.append(current_model_res_idx) + ref_res_indices.append(r_idx) for a_idx, a in enumerate(atoms): if a.IsValid(): p = a.GetPos() @@ -748,7 +769,7 @@ class lDDTScorer: symmetries.append(sym_indices) return (pos, res_ref_atom_indices, res_atom_indices, res_atom_hashes, - res_indices, symmetries) + res_indices, ref_res_indices, symmetries) def _GetExtraModelChainPenalty(self, model, chain_mapping): @@ -1006,12 +1027,7 @@ class lDDTScorer: np.sqrt(tmp, out=tmp) # distances against all relevant atoms ref_distances[i] = np.append(ref_distances[i], tmp) - # recompute symmetry related indices/distances - sym_ref_indices, sym_ref_distances = \ - lDDTScorer._NonSymDistances(self.n_atoms, self.symmetric_atoms, - ref_indices, ref_distances) - - return (ref_indices, ref_distances, sym_ref_indices, sym_ref_distances) + return (ref_indices, ref_distances) diff --git a/modules/mol/alg/pymod/ligand_scoring.py b/modules/mol/alg/pymod/ligand_scoring.py deleted file mode 100644 index ce98690ddbf60f5d7511bee9dfb0492e5e5b2232..0000000000000000000000000000000000000000 --- a/modules/mol/alg/pymod/ligand_scoring.py +++ /dev/null @@ -1,2071 +0,0 @@ -import warnings - -import numpy as np -import numpy.ma as np_ma -import networkx - -from ost import mol -from ost import geom -from ost import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug -from ost.mol.alg import chain_mapping - - -class LigandScorer: - """ Scorer class to compute various small molecule ligand (non polymer) scores. - - .. note :: - Extra requirements: - - - Python modules `numpy` and `networkx` must be available - (e.g. use ``pip install numpy networkx``) - - At the moment, two scores are available: - - * lDDT-PLI, that looks at the conservation of protein-ligand contacts - with :class:`lDDT <ost.mol.alg.lddt.lDDTScorer>`. - * Binding-site superposed, symmetry-corrected RMSD that assesses the - accuracy of the ligand pose (BiSyRMSD, hereinafter referred to as RMSD). - - Both scores involve local chain mapping of the reference binding site - onto the model, symmetry-correction, and finally assignment (mapping) - of model and target ligands, as described in (Manuscript in preparation). - - The binding site is defined based on a radius around the target ligand - in the reference structure only. It only contains protein and nucleic - acid chains that pass the criteria for the - :class:`chain mapping <ost.mol.alg.chain_mapping>`. This means ignoring - other ligands, waters, short polymers as well as any incorrectly connected - chains that may be in proximity. - - Results are available as matrices (`(lddt_pli|rmsd)_matrix`), where every - target-model score is reported in a matrix; as `(lddt_pli|rmsd)` where - a model-target assignment has been determined (see below) and reported in - a dictionary; and as (`(lddt_pli|rmsd)_details`) methods, which report - additional details about different aspects of the scoring such as chain - mapping. - - The behavior of chain mapping and ligand assignment can be controlled - with the `global_chain_mapping` and `rmsd_assignment` arguments. - - By default, chain mapping is performed locally, ie. only within the - binding site. As a result, different ligand scores can correspond to - different chain mappings. This tends to produce more favorable scores, - especially in large, partially regular oligomeric complexes. - Setting `global_chain_mapping=True` enforces a single global chain mapping, - as per :meth:`ost.mol.alg.chain_mapping.ChainMapper.GetMapping`. - Note that this global chain mapping currently ignores non polymer entities - such as small ligands, and may result in overly pessimistic scores. - - By default, target-model ligand assignments are computed identically - for the RMSD and lDDT-PLI scores. Each model ligand is uniquely assigned - to a target ligand, starting from the lowest RMSD and using each target and - model ligand in a single assignment. If `rmsd_assignment` is set to False, - RMSD and lDDT-PLI are assigned separately to optimize each score, and the - other score is used as a tiebreaker. - - By default, only exact matches between target and model ligands are - considered. This is a problem when the target only contains a subset - of the expected atoms (for instance if atoms are missing in an - experimental structure, which often happens in the PDB). With - `substructure_match=True`, complete model ligands can be scored against - partial target ligands. One problem with this approach is that it is - very easy to find good matches to small, irrelevant ligands like EDO, CO2 - or GOL. To counter that, the assignment algorithm considers the coverage, - expressed as the fraction of atoms of the model ligand atoms covered in the - target. Higher coverage matches are prioritized, but a match with a better - score will be preferred if it falls within a window of `coverage_delta` - (by default 0.2) of a worse-scoring match. As a result, for instance, - with a delta of 0.2, a low-score match with coverage 0.96 would be - preferred to a high-score match with coverage 0.90. - - Assumptions: - - The class generally assumes that the - :attr:`~ost.mol.ResidueHandle.is_ligand` property is properly set on all - the ligand atoms, and only ligand atoms. This is typically the case for - entities loaded from mmCIF (tested with mmCIF files from the PDB and - SWISS-MODEL). Legacy PDB files must contain `HET` headers (which is usually - the case for files downloaded from the PDB but not elsewhere). - - The class doesn't perform any cleanup of the provided structures. - It is up to the caller to ensure that the data is clean and suitable for - scoring. :ref:`Molck <molck>` should be used with extra - care, as many of the options (such as `rm_non_std` or `map_nonstd_res`) can - cause ligands to be removed from the structure. If cleanup with Molck is - needed, ligands should be kept aside and passed separately. Non-ligand residues - should be valid compounds with atom names following the naming conventions - of the component dictionary. Non-standard residues are acceptable, and if - the model contains a standard residue at that position, only atoms with - matching names will be considered. - - Unlike most of OpenStructure, this class does not assume that the ligands - (either in the model or the target) are part of the PDB component - dictionary. They may have arbitrary residue names. Residue names do not - have to match between the model and the target. Matching is based on - the calculation of isomorphisms which depend on the atom element name and - atom connectivity (bond order is ignored). - It is up to the caller to ensure that the connectivity of atoms is properly - set before passing any ligands to this class. Ligands with improper - connectivity will lead to bogus results. - - Note, however, that atom names should be unique within a residue (ie two - distinct atoms cannot have the same atom name). - - This only applies to the ligand. The rest of the model and target - structures (protein, nucleic acids) must still follow the usual rules and - contain only residues from the compound library. - - Although it isn't a requirement, hydrogen atoms should be removed from the - structures. Here is an example code snippet that will perform a reasonable - cleanup. Keep in mind that this is most likely not going to work as - expected with entities loaded from PDB files, as the `is_ligand` flag is - probably not set properly. - - Here is a snippet example of how to use this code:: - - from ost.mol.alg.ligand_scoring import LigandScorer - from ost.mol.alg import Molck, MolckSettings - - # Load data - # Structure model in PDB format, containing the receptor only - model = io.LoadPDB("path_to_model.pdb") - # Ligand model as SDF file - model_ligand = io.LoadEntity("path_to_ligand.sdf", format="sdf") - # Target loaded from mmCIF, containing the ligand - target = io.LoadMMCIF("path_to_target.cif") - - # Cleanup a copy of the structures - cleaned_model = model.Copy() - cleaned_target = target.Copy() - molck_settings = MolckSettings(rm_unk_atoms=True, - rm_non_std=False, - rm_hyd_atoms=True, - rm_oxt_atoms=False, - rm_zero_occ_atoms=False, - colored=False, - map_nonstd_res=False, - assign_elem=True) - Molck(cleaned_model, conop.GetDefaultLib(), molck_settings) - Molck(cleaned_target, conop.GetDefaultLib(), molck_settings) - - # Setup scorer object and compute lDDT-PLI - model_ligands = [model_ligand.Select("ele != H")] - ls = LigandScorer(model=cleaned_model, target=cleaned_target, model_ligands=model_ligands) - print("lDDT-PLI:", ls.lddt_pli) - print("RMSD:", ls.rmsd) - - :param model: Model structure - a deep copy is available as :attr:`model`. - No additional processing (ie. Molck), checks, - stereochemistry checks or sanitization is performed on the - input. Hydrogen atoms are kept. - :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` - :param target: Target structure - a deep copy is available as :attr:`target`. - No additional processing (ie. Molck), checks or sanitization - is performed on the input. Hydrogen atoms are kept. - :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` - :param model_ligands: Model ligands, as a list of - :class:`~ost.mol.ResidueHandle` belonging to the model - entity. Can be instantiated with either a :class:list of - :class:`~ost.mol.ResidueHandle`/:class:`ost.mol.ResidueView` - or of :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`. - If `None`, ligands will be extracted based on the - :attr:`~ost.mol.ResidueHandle.is_ligand` flag (this is - normally set properly in entities loaded from mmCIF). - :type model_ligands: :class:`list` - :param target_ligands: Target ligands, as a list of - :class:`~ost.mol.ResidueHandle` belonging to the target - entity. Can be instantiated either a :class:list of - :class:`~ost.mol.ResidueHandle`/:class:`ost.mol.ResidueView` - or of :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` - containing a single residue each. - If `None`, ligands will be extracted based on the - :attr:`~ost.mol.ResidueHandle.is_ligand` flag (this is - normally set properly in entities loaded from mmCIF). - :type target_ligands: :class:`list` - :param resnum_alignments: Whether alignments between chemically equivalent - chains in *model* and *target* can be computed - based on residue numbers. This can be assumed in - benchmarking setups such as CAMEO/CASP. - :type resnum_alignments: :class:`bool` - :param check_resnames: On by default. Enforces residue name matches - between mapped model and target residues. - :type check_resnames: :class:`bool` - :param rename_ligand_chain: If a residue with the same chain name and - residue number than an explicitly passed model - or target ligand exits in the structure, - and `rename_ligand_chain` is False, a - RuntimeError will be raised. If - `rename_ligand_chain` is True, the ligand will - be moved to a new chain instead, and the move - will be logged to the console with SCRIPT - level. - :type rename_ligand_chain: :class:`bool` - :param chain_mapper: a chain mapper initialized for the target structure. - If None (default), a chain mapper will be initialized - lazily as required. - :type chain_mapper: :class:`ost.mol.alg.chain_mapping.ChainMapper` - :param substructure_match: Set this to True to allow incomplete (ie - partially resolved) target ligands. - :type substructure_match: :class:`bool` - :param coverage_delta: the coverage delta for partial ligand assignment. - :type coverage_delta: :class:`float` - :param radius: Inclusion radius for the binding site. Residues with - atoms within this distance of the ligand will be considered - for inclusion in the binding site. - :type radius: :class:`float` - :param lddt_pli_radius: lDDT inclusion radius for lDDT-PLI. Should be - at least equal to or larger than `radius`. - :type lddt_pli_radius: :class:`float` - :param lddt_lp_radius: lDDT inclusion radius for lDDT-LP. - :type lddt_lp_radius: :class:`float` - :param model_bs_radius: inclusion radius for model binding sites. - Only used when full_bs_search=False, otherwise the - radius is effectively infinite. Only chains with - atoms within this distance of a model ligand will - be considered in the chain mapping. - :type model_bs_radius: :class:`float` - :param binding_sites_topn: maximum number of target binding site - representations to assess, per target ligand. - Ignored if `global_chain_mapping` is True. - :type binding_sites_topn: :class:`int` - :param global_chain_mapping: set to True to use a global chain mapping for - the polymer (protein, nucleotide) chains. - Defaults to False, in which case only local - chain mappings are allowed (where different - ligand may be scored against different chain - mappings). - :type global_chain_mapping: :class:`bool` - :param custom_mapping: Provide custom chain mapping between *model* and - *target* that is used as global chain mapping. - Dictionary with target chain names as key and model - chain names as value. Only has an effect if - *global_chain_mapping* is True. - :type custom_mapping: :class:`dict` - :param rmsd_assignment: set to False to assign lDDT-PLI and RMSD separately - using a combination of these two scores to - optimize the assignment. By default (True), only - RMSD is considered for the ligand assignment. - :type rmsd_assignment: :class:`bool` - :param n_max_naive: Parameter for global chain mapping. If *model* and - *target* have less or equal that number of chains, - the full - mapping solution space is enumerated to find the - the optimum. A heuristic is used otherwise. - :type n_max_naive: :class:`int` - :param max_symmetries: If more than that many isomorphisms exist for - a target-ligand pair, it will be ignored and reported - as unassigned. - :type max_symmetries: :class:`int` - :param unassigned: If True, unassigned model ligands are reported in - the output together with assigned ligands, with a score - of None, and reason for not being assigned in the - \\*_details matrix. Defaults to False. - :type unassigned: :class:`bool` - :param full_bs_search: If True, all potential binding sites in the model - are searched for each target binding site. If False, - the search space in the model is reduced to chains - around (`model_bs_radius` Å) model ligands. - This speeds up computations, but may result in - ligands not being scored if the predicted ligand - pose is too far from the actual binding site. - When that's the case, the value in the - `unassigned_*_ligands` property will be - `model_representation` and is indistinguishable from - cases where the binding site was not modeled at all. - Ignored when `global_chain_mapping` is True. - :type full_bs_search: :class:`bool` - """ - def __init__(self, model, target, model_ligands=None, target_ligands=None, - resnum_alignments=False, check_resnames=True, - rename_ligand_chain=False, chain_mapper=None, - substructure_match=False, coverage_delta=0.2, radius=4.0, - lddt_pli_radius=6.0, lddt_lp_radius=10.0, model_bs_radius=20, - binding_sites_topn=100000, global_chain_mapping=False, - rmsd_assignment=False, n_max_naive=12, max_symmetries=1e5, - custom_mapping=None, unassigned=False, full_bs_search=False): - - if isinstance(model, mol.EntityView): - self.model = mol.CreateEntityFromView(model, False) - elif isinstance(model, mol.EntityHandle): - self.model = model.Copy() - else: - raise RuntimeError("model must be of type EntityView/EntityHandle") - - if isinstance(target, mol.EntityView): - self.target = mol.CreateEntityFromView(target, False) - elif isinstance(target, mol.EntityHandle): - self.target = target.Copy() - else: - raise RuntimeError("target must be of type EntityView/EntityHandle") - - # Extract ligands from target - if target_ligands is None: - self.target_ligands = self._extract_ligands(self.target) - else: - self.target_ligands = self._prepare_ligands(self.target, target, - target_ligands, - rename_ligand_chain) - if len(self.target_ligands) == 0: - LogWarning("No ligands in the target") - - # Extract ligands from model - if model_ligands is None: - self.model_ligands = self._extract_ligands(self.model) - else: - self.model_ligands = self._prepare_ligands(self.model, model, - model_ligands, - rename_ligand_chain) - if len(self.model_ligands) == 0: - LogWarning("No ligands in the model") - if len(self.target_ligands) == 0: - raise ValueError("No ligand in the model and in the target") - - self._chain_mapper = chain_mapper - self.resnum_alignments = resnum_alignments - self.check_resnames = check_resnames - self.rename_ligand_chain = rename_ligand_chain - self.substructure_match = substructure_match - self.radius = radius - self.model_bs_radius = model_bs_radius - self.lddt_pli_radius = lddt_pli_radius - self.lddt_lp_radius = lddt_lp_radius - self.binding_sites_topn = binding_sites_topn - self.global_chain_mapping = global_chain_mapping - self.rmsd_assignment = rmsd_assignment - self.n_max_naive = n_max_naive - self.max_symmetries = max_symmetries - self.unassigned = unassigned - self.coverage_delta = coverage_delta - self.full_bs_search = full_bs_search - - # scoring matrices - self._rmsd_matrix = None - self._rmsd_full_matrix = None - self._lddt_pli_matrix = None - self._lddt_pli_full_matrix = None - - # lazily computed scores - self._rmsd = None - self._rmsd_details = None - self._lddt_pli = None - self._lddt_pli_details = None - - # lazily precomputed variables - self.__model_mapping = None - - # Residues that are in contact with a ligand => binding site - # defined as all residues with at least one atom within self.radius - # key: ligand.handle.hash_code, value: EntityView of whatever - # entity ligand belongs to - self._binding_sites = dict() - - # lazily precomputed variables to speedup GetRepr chain mapping calls - # for localized GetRepr searches - self._chem_mapping = None - self._chem_group_alns = None - self._chain_mapping_mdl = None - self._get_repr_input = dict() - - # the actual representations as returned by ChainMapper.GetRepr - # key: (target_ligand.handle.hash_code, model_ligand.handle.hash_code) - # value: list of repr results - self._repr = dict() - - # cache for rmsd values - # rmsd is used as tie breaker in lddt-pli, we therefore need access to - # the rmsd for each target_ligand/model_ligand/repr combination - # key: (target_ligand.handle.hash_code, model_ligand.handle.hash_code, - # repr_id) - # value: rmsd - self._rmsd_cache = dict() - - # Bookkeeping of unassigned ligands - self._unassigned_target_ligands = None - self._unassigned_model_ligands = None - self._unassigned_target_ligands_reason = {} - self._unassigned_target_ligand_short = None - self._unassigned_model_ligand_short = None - self._unassigned_target_ligand_descriptions = None - self._unassigned_model_ligand_descriptions = None - # Keep track of symmetries/isomorphisms (regardless of scoring) - # 0.0: no isomorphism - # 1.0: isomorphic - # np.nan: not assessed yet - that's why we can't use a boolean - self._assignment_isomorphisms = None - # Keep track of match coverage (only in case there was a score) - self._assignment_match_coverage = None - - if custom_mapping is not None: - self._set_custom_mapping(custom_mapping) - - @property - def chain_mapper(self): - """ Chain mapper object for the given :attr:`target`. - - :type: :class:`ost.mol.alg.chain_mapping.ChainMapper` - """ - if self._chain_mapper is None: - self._chain_mapper = chain_mapping.ChainMapper(self.target, - n_max_naive=1e9, - resnum_alignments=self.resnum_alignments) - return self._chain_mapper - - def get_target_binding_site(self, target_ligand): - - if target_ligand.handle.hash_code not in self._binding_sites: - - # create view of reference binding site - ref_residues_hashes = set() # helper to keep track of added residues - ignored_residue_hashes = {target_ligand.hash_code} - for ligand_at in target_ligand.atoms: - close_atoms = self.target.FindWithin(ligand_at.GetPos(), self.radius) - for close_at in close_atoms: - # Skip any residue not in the chain mapping target - ref_res = close_at.GetResidue() - h = ref_res.handle.GetHashCode() - if h not in ref_residues_hashes and \ - h not in ignored_residue_hashes: - if self.chain_mapper.target.ViewForHandle(ref_res).IsValid(): - h = ref_res.handle.GetHashCode() - ref_residues_hashes.add(h) - elif ref_res.is_ligand: - LogWarning("Ignoring ligand %s in binding site of %s" % ( - ref_res.qualified_name, target_ligand.qualified_name)) - ignored_residue_hashes.add(h) - elif ref_res.chem_type == mol.ChemType.WATERS: - pass # That's ok, no need to warn - else: - LogWarning("Ignoring residue %s in binding site of %s" % ( - ref_res.qualified_name, target_ligand.qualified_name)) - ignored_residue_hashes.add(h) - - ref_bs = self.target.CreateEmptyView() - if ref_residues_hashes: - # reason for doing that separately is to guarantee same ordering of - # residues as in underlying entity. (Reorder by ResNum seems only - # available on ChainHandles) - for ch in self.target.chains: - for r in ch.residues: - if r.handle.GetHashCode() in ref_residues_hashes: - ref_bs.AddResidue(r, mol.ViewAddFlag.INCLUDE_ALL) - if len(ref_bs.residues) == 0: - raise RuntimeError("Failed to add proximity residues to " - "the reference binding site entity") - else: - # Flag missing binding site - self._unassigned_target_ligands_reason[target_ligand] = ("binding_site", - "No residue in proximity of the target ligand") - - self._binding_sites[target_ligand.handle.hash_code] = ref_bs - - return self._binding_sites[target_ligand.handle.hash_code] - - @property - def _model_mapping(self): - """Get the global chain mapping for the model.""" - if self.__model_mapping is None: - self.__model_mapping = self.chain_mapper.GetMapping(self.model, - n_max_naive=self.n_max_naive) - return self.__model_mapping - - @staticmethod - def _extract_ligands(entity): - """Extract ligands from entity. Return a list of residues. - - Assumes that ligands have the :attr:`~ost.mol.ResidueHandle.is_ligand` - flag set. This is typically the case for entities loaded from mmCIF - (tested with mmCIF files from the PDB and SWISS-MODEL). - Legacy PDB files must contain `HET` headers (which is usually the - case for files downloaded from the PDB but not elsewhere). - - This function performs basic checks to ensure that the residues in this - chain are not forming polymer bonds (ie peptide/nucleotide ligands) and - will raise a RuntimeError if this assumption is broken. - - :param entity: the entity to extract ligands from - :type entity: :class:`~ost.mol.EntityHandle` - :rtype: :class:`list` of :class:`~ost.mol.ResidueHandle` - - """ - extracted_ligands = [] - for residue in entity.residues: - if residue.is_ligand: - if mol.InSequence(residue, residue.next): - raise RuntimeError("Residue %s connected in polymer sequen" - "ce %s" % (residue.qualified_name)) - extracted_ligands.append(residue) - LogVerbose("Detected residue %s as ligand" % residue) - return extracted_ligands - - @staticmethod - def _prepare_ligands(new_entity, old_entity, ligands, rename_chain): - """Prepare the ligands given into a list of ResidueHandles which are - part of the copied entity, suitable for the model_ligands and - target_ligands properties. - - This function takes a list of ligands as (Entity|Residue)(Handle|View). - Entities can contain multiple ligands, which will be considered as - separate ligands. - - Ligands which are part of the entity are simply fetched in the new - copied entity. Otherwise, they are copied over to the copied entity. - """ - extracted_ligands = [] - - next_chain_num = 1 - new_editor = None - - def _copy_residue(residue, rename_chain): - """ Copy the residue into the new chain. - Return the new residue handle.""" - nonlocal next_chain_num, new_editor - - # Instantiate the editor - if new_editor is None: - new_editor = new_entity.EditXCS() - - new_chain = new_entity.FindChain(residue.chain.name) - if not new_chain.IsValid(): - new_chain = new_editor.InsertChain(residue.chain.name) - else: - # Does a residue with the same name already exist? - already_exists = new_chain.FindResidue(residue.number).IsValid() - if already_exists: - if rename_chain: - chain_ext = 2 # Extend the chain name by this - while True: - new_chain_name = residue.chain.name + "_" + str(chain_ext) - new_chain = new_entity.FindChain(new_chain_name) - if new_chain.IsValid(): - chain_ext += 1 - continue - else: - new_chain = new_editor.InsertChain(new_chain_name) - break - LogScript("Moved ligand residue %s to new chain %s" % ( - residue.qualified_name, new_chain.name)) - else: - msg = "A residue number %s already exists in chain %s" % ( - residue.number, residue.chain.name) - raise RuntimeError(msg) - - # Add the residue with its original residue number - new_res = new_editor.AppendResidue(new_chain, residue.name, residue.number) - # Add atoms - for old_atom in residue.atoms: - new_editor.InsertAtom(new_res, old_atom.name, old_atom.pos, - element=old_atom.element, occupancy=old_atom.occupancy, - b_factor=old_atom.b_factor, is_hetatm=old_atom.is_hetatom) - # Add bonds - for old_atom in residue.atoms: - for old_bond in old_atom.bonds: - new_first = new_res.FindAtom(old_bond.first.name) - new_second = new_res.FindAtom(old_bond.second.name) - new_editor.Connect(new_first, new_second) - return new_res - - def _process_ligand_residue(res, rename_chain): - """Copy or fetch the residue. Return the residue handle.""" - new_res = None - if res.entity.handle == old_entity.handle: - # Residue is part of the old_entity handle. - # However, it may not be in the copied one, for instance it may have been a view - # We try to grab it first, otherwise we copy it - new_res = new_entity.FindResidue(res.chain.name, res.number) - if new_res and new_res.valid: - LogVerbose("Ligand residue %s already in entity" % res.handle.qualified_name) - else: - # Residue is not part of the entity, need to copy it first - new_res = _copy_residue(res, rename_chain) - LogVerbose("Copied ligand residue %s" % res.handle.qualified_name) - new_res.SetIsLigand(True) - return new_res - - for ligand in ligands: - if isinstance(ligand, mol.EntityHandle) or isinstance(ligand, mol.EntityView): - for residue in ligand.residues: - new_residue = _process_ligand_residue(residue, rename_chain) - extracted_ligands.append(new_residue) - elif isinstance(ligand, mol.ResidueHandle) or isinstance(ligand, mol.ResidueView): - new_residue = _process_ligand_residue(ligand, rename_chain) - extracted_ligands.append(new_residue) - else: - raise RuntimeError("Ligands should be given as Entity or Residue") - - if new_editor is not None: - new_editor.UpdateICS() - return extracted_ligands - - @staticmethod - def _build_binding_site_entity(ligand, residues, extra_residues=[]): - """ Build an entity with all the binding site residues in chain A - and the ligand in chain _. Residues are renumbered consecutively from - 1. The ligand is assigned residue number 1 and residue name LIG. - Residues in extra_residues not in `residues` in the model are added - at the end of chain A. - - :param ligand: the Residue Handle of the ligand - :type ligand: :class:`~ost.mol.ResidueHandle` - :param residues: a list of binding site residues - :type residues: :class:`list` of :class:`~ost.mol.ResidueHandle` - :param extra_residues: an optional list with addition binding site - residues. Residues in this list which are not - in `residues` will be added at the end of chain - A. This allows for instance adding unmapped - residues missing from the model into the - reference binding site. - :type extra_residues: :class:`list` of :class:`~ost.mol.ResidueHandle` - :rtype: :class:`~ost.mol.EntityHandle` - """ - bs_ent = mol.CreateEntity() - ed = bs_ent.EditXCS() - bs_chain = ed.InsertChain("A") - seen_res_qn = [] - for resnum, old_res in enumerate(residues, 1): - seen_res_qn.append(old_res.qualified_name) - new_res = ed.AppendResidue(bs_chain, old_res.handle, - deep=True) - ed.SetResidueNumber(new_res, mol.ResNum(resnum)) - - # Add extra residues at the end. - for extra_res in extra_residues: - if extra_res.qualified_name not in seen_res_qn: - resnum += 1 - seen_res_qn.append(extra_res.qualified_name) - new_res = ed.AppendResidue(bs_chain, - extra_res.handle, - deep=True) - ed.SetResidueNumber(new_res, mol.ResNum(resnum)) - # Add the ligand in chain _ - ligand_chain = ed.InsertChain("_") - ligand_res = ed.AppendResidue(ligand_chain, ligand.handle, - deep=True) - ed.RenameResidue(ligand_res, "LIG") - ed.SetResidueNumber(ligand_res, mol.ResNum(1)) - ed.UpdateICS() - - return bs_ent - - def _compute_scores(self): - """ - Compute the RMSD and lDDT-PLI scores for every possible target-model - ligand pair and store the result in internal matrices. - """ - ############################## - # Create the result matrices # - ############################## - self._rmsd_full_matrix = np.empty( - (len(self.target_ligands), len(self.model_ligands)), dtype=dict) - self._lddt_pli_full_matrix = np.empty( - (len(self.target_ligands), len(self.model_ligands)), dtype=dict) - self._assignment_isomorphisms = np.full( - (len(self.target_ligands), len(self.model_ligands)), fill_value=np.nan) - self._assignment_match_coverage = np.zeros( - (len(self.target_ligands), len(self.model_ligands))) - - for target_id, target_ligand in enumerate(self.target_ligands): - LogVerbose("Analyzing target ligand %s" % target_ligand) - for model_id, model_ligand in enumerate(self.model_ligands): - LogVerbose("Compare to model ligand %s" % model_ligand) - - ######################################################### - # Compute symmetries for given target/model ligand pair # - ######################################################### - try: - symmetries = _ComputeSymmetries( - model_ligand, target_ligand, - substructure_match=self.substructure_match, - by_atom_index=True, - max_symmetries=self.max_symmetries) - LogVerbose("Ligands %s and %s symmetry match" % ( - str(model_ligand), str(target_ligand))) - except NoSymmetryError: - # Ligands are different - skip - LogVerbose("No symmetry between %s and %s" % ( - str(model_ligand), str(target_ligand))) - self._assignment_isomorphisms[target_id, model_id] = 0. - continue - except TooManySymmetriesError: - # Ligands are too symmetrical - skip - LogVerbose("Too many symmetries between %s and %s" % ( - str(model_ligand), str(target_ligand))) - self._assignment_isomorphisms[target_id, model_id] = -1. - continue - except DisconnectedGraphError: - # Disconnected graph is handled elsewhere - continue - - ################################################################ - # Compute best rmsd/lddt-pli by naively enumerating symmetries # - ################################################################ - # rmsds MUST be computed first, as lDDT uses them as tiebreaker - # and expects the values to be in self._rmsd_cache - rmsd_result = self._compute_rmsd(symmetries, target_ligand, - model_ligand) - lddt_pli_result = self._compute_lddtpli(symmetries, target_ligand, - model_ligand) - - ########################################### - # Extend results by symmetry related info # - ########################################### - if (rmsd_result is None) != (lddt_pli_result is None): - # Ligand assignment makes assumptions here, and is likely - # to not work properly if this differs. There is no reason - # it would ever do, so let's just check it - raise Exception("Ligand scoring bug: discrepency between " - "RMSD and lDDT-PLI definition.") - if rmsd_result is not None: - # Now we assume both rmsd_result and lddt_pli_result are defined - # Add coverage - substructure_match = len(symmetries[0][0]) != len(model_ligand.atoms) - coverage = len(symmetries[0][0]) / len(model_ligand.atoms) - self._assignment_match_coverage[target_id, model_id] = coverage - self._assignment_isomorphisms[target_id, model_id] = 1. - # Add RMSD - rmsd_result["substructure_match"] = substructure_match - rmsd_result["coverage"] = coverage - if self.unassigned: - rmsd_result["unassigned"] = False - # Add lDDT-PLI - lddt_pli_result["substructure_match"] = substructure_match - lddt_pli_result["coverage"] = coverage - if self.unassigned: - lddt_pli_result["unassigned"] = False - - ############ - # Finalize # - ############ - self._lddt_pli_full_matrix[target_id, model_id] = lddt_pli_result - self._rmsd_full_matrix[target_id, model_id] = rmsd_result - - - def _compute_rmsd(self, symmetries, target_ligand, model_ligand): - best_rmsd_result = None - for r_i, r in enumerate(self.get_repr(target_ligand, model_ligand)): - rmsd = _SCRMSD_symmetries(symmetries, model_ligand, - target_ligand, transformation=r.transform) - - cache_key = (target_ligand.handle.hash_code, - model_ligand.handle.hash_code, r_i) - self._rmsd_cache[cache_key] = rmsd - - if best_rmsd_result is None or rmsd < best_rmsd_result["rmsd"]: - best_rmsd_result = {"rmsd": rmsd, - "lddt_lp": r.lDDT, - "bs_ref_res": r.substructure.residues, - "bs_ref_res_mapped": r.ref_residues, - "bs_mdl_res_mapped": r.mdl_residues, - "bb_rmsd": r.bb_rmsd, - "target_ligand": target_ligand, - "model_ligand": model_ligand, - "chain_mapping": r.GetFlatChainMapping(), - "transform": r.transform, - "inconsistent_residues": r.inconsistent_residues} - - return best_rmsd_result - - - def _compute_lddtpli(self, symmetries, target_ligand, model_ligand): - ref_bs = self.get_target_binding_site(target_ligand) - best_lddt_result = None - for r_i, r in enumerate(self.get_repr(target_ligand, model_ligand)): - ref_bs_ent = self._build_binding_site_entity( - target_ligand, r.ref_residues, - r.substructure.residues) - ref_bs_ent_ligand = ref_bs_ent.FindResidue("_", 1) # by definition - - custom_compounds = { - ref_bs_ent_ligand.name: - mol.alg.lddt.CustomCompound.FromResidue( - ref_bs_ent_ligand)} - lddt_scorer = mol.alg.lddt.lDDTScorer( - ref_bs_ent, - custom_compounds=custom_compounds, - inclusion_radius=self.lddt_pli_radius) - - lddt_tot = 4 * sum([len(x) for x in lddt_scorer.ref_indices_ic]) - if lddt_tot == 0: - # it's a space ship in the reference! - self._unassigned_target_ligands_reason[ - target_ligand] = ("no_contact", - "No lDDT-PLI contacts in the" - " reference structure") - #continue - mdl_bs_ent = self._build_binding_site_entity( - model_ligand, r.mdl_residues, []) - mdl_bs_ent_ligand = mdl_bs_ent.FindResidue("_", 1) # by definition - # Now for each symmetry, loop and rename atoms according - # to ref. - mdl_editor = mdl_bs_ent.EditXCS() - for i, (trg_sym, mdl_sym) in enumerate(symmetries): - for mdl_anum, trg_anum in zip(mdl_sym, trg_sym): - # Rename model atoms according to symmetry - trg_atom = ref_bs_ent_ligand.atoms[trg_anum] - mdl_atom = mdl_bs_ent_ligand.atoms[mdl_anum] - mdl_editor.RenameAtom(mdl_atom, trg_atom.name) - mdl_editor.UpdateICS() - - global_lddt, local_lddt = lddt_scorer.lDDT( - mdl_bs_ent, chain_mapping={"A": "A", "_": "_"}, - no_intrachain=True, - check_resnames=self.check_resnames) - - its_awesome = (best_lddt_result is None) or \ - (global_lddt > best_lddt_result["lddt_pli"]) - - # additionally consider rmsd as tiebreaker - if (not its_awesome) and (global_lddt == best_lddt_result["lddt_pli"]): - rmsd_cache_key = (target_ligand.handle.hash_code, - model_ligand.handle.hash_code, r_i) - rmsd = self._rmsd_cache[rmsd_cache_key] - if rmsd < best_lddt_result["rmsd"]: - its_awesome = True - - if its_awesome: - rmsd_cache_key = (target_ligand.handle.hash_code, - model_ligand.handle.hash_code, r_i) - best_lddt_result = {"lddt_pli": global_lddt, - "lddt_lp": r.lDDT, - "lddt_pli_n_contacts": lddt_tot, - "rmsd": self._rmsd_cache[rmsd_cache_key], - "bs_ref_res": r.substructure.residues, - "bs_ref_res_mapped": r.ref_residues, - "bs_mdl_res_mapped": r.mdl_residues, - "bb_rmsd": r.bb_rmsd, - "target_ligand": target_ligand, - "model_ligand": model_ligand, - "chain_mapping": r.GetFlatChainMapping(), - "transform": r.transform, - "inconsistent_residues": r.inconsistent_residues} - - return best_lddt_result - - - @staticmethod - def _find_ligand_assignment(mat1, mat2=None, coverage=None, coverage_delta=None): - """ Find the ligand assignment based on mat1. If mat2 is provided, it - will be used to break ties in mat1. If mat2 is not provided, ties will - be resolved by taking the first match arbitrarily. - - Both mat1 and mat2 should "look" like RMSD - ie be between inf (bad) - and 0 (good). - """ - # We will modify mat1 and mat2, so make copies of it first - mat1 = np.copy(mat1) - if mat2 is None: - mat2 = np.copy(mat1) - mat2[~np.isnan(mat2)] = np.inf - else: - mat2 = np.copy(mat2) - if coverage is None: - coverage = np.copy(mat1) - coverage[:] = 1 # Assume full coverage by default - else: - coverage = np.copy(coverage) - - assignments = [] - if 0 in mat1.shape: - # No model or target ligand - LogDebug("No model or target ligand, returning no assignment.") - return assignments - - def _get_best_match(mat1_val, coverage_val): - """ Extract the row/column indices of the prediction matching the - given values.""" - mat1_match_idx = np.argwhere((mat1 == mat1_val) & (coverage >= coverage_val)) - # Multiple "best" - use mat2 to disambiguate - if len(mat1_match_idx) > 1: - # Get the values of mat2 at these positions - best_mat2_match = [mat2[tuple(x)] for x in mat1_match_idx] - # Find the index of the best mat2 - # Note: argmin returns the first value which is min. - best_mat2_idx = np.array(best_mat2_match).argmin() - # Now get the original indices - return mat1_match_idx[best_mat2_idx] - else: - return mat1_match_idx[0] - - # First only consider top coverage matches. - min_coverage = np.max(coverage) - i = mat1.size + 1 - while min_coverage > 0 and not np.all(np.isnan(mat1)): - LogVerbose("Looking for matches with coverage >= %s" % min_coverage) - min_mat1 = LigandScorer._nanmin_nowarn(mat1, coverage < min_coverage) - while not np.isnan(min_mat1): - max_i_trg, max_i_mdl = _get_best_match(min_mat1, min_coverage) - - # Would we have a match for this model ligand with higher score - # but lower coverage? - alternative_matches = (mat1[:, max_i_mdl] < min_mat1) & ( - coverage[:, max_i_mdl] > (min_coverage - coverage_delta)) - if np.any(alternative_matches): - # Get the scores of these matches - LogVerbose("Found match with lower coverage but better score") - min_mat1 = np.nanmin(mat1[alternative_matches]) - max_i_trg, max_i_mdl = _get_best_match(min_mat1, min_coverage - coverage_delta) - - # Disable row and column - mat1[max_i_trg, :] = np.nan - mat1[:, max_i_mdl] = np.nan - mat2[max_i_trg, :] = np.nan - mat2[:, max_i_mdl] = np.nan - coverage[max_i_trg, :] = -np.inf - coverage[:, max_i_mdl] = -np.inf - - # Save - assignments.append((max_i_trg, max_i_mdl)) - - # Recompute min - min_mat1 = LigandScorer._nanmin_nowarn(mat1, coverage < min_coverage) - if i < 0: - raise Exception("Ligand scoring bug: hit appatent infinite loop!") - i -= 1 - # Recompute min_coverage - min_coverage = np.max(coverage) - if i < 0: - raise Exception("Ligand scoring bug: hit appatent infinite loop!") - i -= 1 - return assignments - - @staticmethod - def _nanmin_nowarn(array, mask): - """Compute np.nanmin but ignore the RuntimeWarning.""" - masked_array = np_ma.masked_array(array, mask=mask) - with warnings.catch_warnings(): # RuntimeWarning: All-NaN slice encountered - warnings.simplefilter("ignore") - min = np.nanmin(masked_array, ) - if np_ma.is_masked(min): - return np.nan # Everything was masked - else: - return min - - @staticmethod - def _reverse_lddt(lddt): - """Reverse lDDT means turning it from a number between 0 and 1 to a - number between infinity and 0 (0 being better). - - In practice, this is 1/lDDT. If lDDT is 0, the result is infinity. - """ - with warnings.catch_warnings(): # RuntimeWarning: divide by zero - warnings.simplefilter("ignore") - return np.float64(1) / lddt - - def _assign_ligands_rmsd(self): - """Assign (map) ligands between model and target. - - Sets self._rmsd and self._rmsd_details. - """ - mat2 = self._reverse_lddt(self.lddt_pli_matrix) - - mat_tuple = self._assign_matrices(self.rmsd_matrix, - mat2, - self._rmsd_full_matrix, - "rmsd") - self._rmsd = mat_tuple[0] - self._rmsd_details = mat_tuple[1] - # Ignore unassigned ligands - they are dealt with in lddt_pli. - # So the following lines should stay commented out: - # self._unassigned_target_ligands = mat_tuple[2] - # self._unassigned_model_ligands = mat_tuple[3] - - def _assign_matrices(self, mat1, mat2, data, main_key): - """ - Perform the ligand assignment, ie find the mapping between model and - target ligands. - - The algorithm starts by assigning the "best" mapping, and then discards - the target and model ligands (row, column) so that every model ligand - can be assigned to a single target ligand, and every target ligand - is only assigned to a single model ligand. Repeat until there is - nothing left to assign. - - In case of a tie in values in `mat1`, it uses `mat2` to break the tie. - - This algorithm doesn't guarantee a globally optimal assignment. - - Both `mat1` and `mat2` should contain values between 0 and infinity, - with lower values representing better scores. Use the - :meth:`_reverse_lddt` method to convert lDDT values to such a score. - - :param mat1: the main ligand assignment criteria (RMSD or lDDT-PLI) - :param mat2: the secondary ligand assignment criteria (lDDT-PLI or RMSD) - :param data: the data (either self._rmsd_full_matrix or self._lddt_pli_matrix) - :param main_key: the key of data (dictionnaries within `data`) to - assign into out_main. - :return: a tuple with 2 dictionaries of matrices containing the main - data, and details, respectively. - """ - assignments = self._find_ligand_assignment(mat1, mat2, - self._assignment_match_coverage, - self.coverage_delta) - out_main = {} - out_details = {} - assigned_trg = [False] * len(self.target_ligands) - assigned_mdl = [False] * len(self.model_ligands) - for assignment in assignments: - trg_idx, mdl_idx = assignment - assigned_mdl[mdl_idx] = True - assigned_trg[trg_idx] = True - mdl_lig = self.model_ligands[mdl_idx] - mdl_cname = mdl_lig.chain.name - mdl_resnum = mdl_lig.number - if mdl_cname not in out_main: - out_main[mdl_cname] = {} - out_details[mdl_cname] = {} - out_main[mdl_cname][mdl_resnum] = data[ - trg_idx, mdl_idx][main_key] - out_details[mdl_cname][mdl_resnum] = data[ - trg_idx, mdl_idx] - - unassigned_trg, unassigned_mdl = self._assign_unassigned( - assigned_trg, assigned_mdl, [out_main], [out_details], [main_key]) - return out_main, out_details, unassigned_trg, unassigned_mdl - - def _assign_unassigned(self, assigned_trg, assigned_mdl, - out_main, out_details, main_key): - unassigned_trg = {} - unassigned_mdl = {} - - unassigned_trg_idx = [i for i, x in enumerate(assigned_trg) if not x] - unassigned_mdl_idx = [i for i, x in enumerate(assigned_mdl) if not x] - - for mdl_idx in unassigned_mdl_idx: - mdl_lig = self.model_ligands[mdl_idx] - reason = self._find_unassigned_model_ligand_reason(mdl_lig, check=False) - mdl_cname = mdl_lig.chain.name - mdl_resnum = mdl_lig.number - if mdl_cname not in unassigned_mdl: - unassigned_mdl[mdl_cname] = {} - unassigned_mdl[mdl_cname][mdl_resnum] = reason - if self.unassigned: - for i, _ in enumerate(out_main): - if mdl_cname not in out_main[i]: - out_main[i][mdl_cname] = {} - out_details[i][mdl_cname] = {} - out_main[i][mdl_cname][mdl_resnum] = None - out_details[i][mdl_cname][mdl_resnum] = { - "unassigned": True, - "reason_short": reason[0], - "reason_long": reason[1], - main_key[i]: None, - } - LogInfo("Model ligand %s is unassigned: %s" % ( - mdl_lig.qualified_name, reason[1])) - - for trg_idx in unassigned_trg_idx: - trg_lig = self.target_ligands[trg_idx] - reason = self._find_unassigned_target_ligand_reason(trg_lig, check=False) - trg_cname = trg_lig.chain.name - trg_resnum = trg_lig.number - if trg_cname not in unassigned_trg: - unassigned_trg[trg_cname] = {} - unassigned_trg[trg_cname][trg_resnum] = reason - LogInfo("Target ligand %s is unassigned: %s" % ( - trg_lig.qualified_name, reason[1])) - - return unassigned_trg, unassigned_mdl - - - def _assign_matrix(self, mat, data1, main_key1, data2, main_key2): - """ - Perform the ligand assignment, ie find the mapping between model and - target ligands, based on a single matrix - - The algorithm starts by assigning the "best" mapping, and then discards - the target and model ligands (row, column) so that every model ligand - can be assigned to a single target ligand, and every target ligand - is only assigned to a single model ligand. Repeat until there is - nothing left to assign. - - This algorithm doesn't guarantee a globally optimal assignment. - - `mat` should contain values between 0 and infinity, - with lower values representing better scores. Use the - :meth:`_reverse_lddt` method to convert lDDT values to such a score. - - :param mat: the ligand assignment criteria (RMSD or lDDT-PLI) - :param data1: the first data (either self._rmsd_full_matrix or self._lddt_pli_matrix) - :param main_key1: the first key of data (dictionnaries within `data`) to - assign into out_main. - :param data2: the second data (either self._rmsd_full_matrix or self._lddt_pli_matrix) - :param main_key2: the second key of data (dictionnaries within `data`) to - assign into out_main. - :return: a tuple with 4 dictionaries of matrices containing the main - data1, details1, main data2 and details2, respectively. - """ - assignments = self._find_ligand_assignment(mat, - coverage=self._assignment_match_coverage, - coverage_delta=self.coverage_delta) - out_main1 = {} - out_details1 = {} - out_main2 = {} - out_details2 = {} - assigned_trg = [False] * len(self.target_ligands) - assigned_mdl = [False] * len(self.model_ligands) - for assignment in assignments: - trg_idx, mdl_idx = assignment - assigned_mdl[mdl_idx] = True - assigned_trg[trg_idx] = True - mdl_lig = self.model_ligands[mdl_idx] - mdl_cname = mdl_lig.chain.name - mdl_resnum = mdl_lig.number - # Data 1 - if mdl_cname not in out_main1: - out_main1[mdl_cname] = {} - out_details1[mdl_cname] = {} - out_main1[mdl_cname][mdl_resnum] = data1[ - trg_idx, mdl_idx][main_key1] - out_details1[mdl_cname][mdl_resnum] = data1[ - trg_idx, mdl_idx] - # Data2 - if mdl_cname not in out_main2: - out_main2[mdl_cname] = {} - out_details2[mdl_cname] = {} - out_main2[mdl_cname][mdl_resnum] = data2[ - trg_idx, mdl_idx][main_key2] - out_details2[mdl_cname][mdl_resnum] = data2[ - trg_idx, mdl_idx] - - unassigned_trg, unassigned_mdl = self._assign_unassigned( - assigned_trg, assigned_mdl, - [out_main1, out_main2], [out_details1, out_details2], - [main_key1, main_key2]) - - return out_main1, out_details1, out_main2, out_details2, \ - unassigned_trg, unassigned_mdl - - def _assign_ligands_lddt_pli(self): - """ Assign ligands based on lDDT-PLI. - - Sets self._lddt_pli and self._lddt_pli_details. - """ - mat1 = self._reverse_lddt(self.lddt_pli_matrix) - - mat_tuple = self._assign_matrices(mat1, - self.rmsd_matrix, - self._lddt_pli_full_matrix, - "lddt_pli") - self._lddt_pli = mat_tuple[0] - self._lddt_pli_details = mat_tuple[1] - self._unassigned_target_ligands = mat_tuple[2] - self._unassigned_model_ligands = mat_tuple[3] - - def _assign_ligands_rmsd_only(self): - """Assign (map) ligands between model and target based on RMSD only. - - Sets self._rmsd, self._rmsd_details, self._lddt_pli and - self._lddt_pli_details. - """ - mat_tuple = self._assign_matrix(self.rmsd_matrix, - self._rmsd_full_matrix, - "rmsd", - self._lddt_pli_full_matrix, - "lddt_pli") - self._rmsd = mat_tuple[0] - self._rmsd_details = mat_tuple[1] - self._lddt_pli = mat_tuple[2] - self._lddt_pli_details = mat_tuple[3] - self._unassigned_target_ligands = mat_tuple[4] - self._unassigned_model_ligands = mat_tuple[5] - - @property - def rmsd_matrix(self): - """ Get the matrix of RMSD values. - - Target ligands are in rows, model ligands in columns. - - NaN values indicate that no RMSD could be computed (i.e. different - ligands). - - :rtype: :class:`~numpy.ndarray` - """ - if self._rmsd_full_matrix is None: - self._compute_scores() - if self._rmsd_matrix is None: - # convert - shape = self._rmsd_full_matrix.shape - self._rmsd_matrix = np.full(shape, np.nan) - for i, j in np.ndindex(shape): - if self._rmsd_full_matrix[i, j] is not None: - self._rmsd_matrix[i, j] = self._rmsd_full_matrix[ - i, j]["rmsd"] - return self._rmsd_matrix - - @property - def lddt_pli_matrix(self): - """ Get the matrix of lDDT-PLI values. - - Target ligands are in rows, model ligands in columns. - - NaN values indicate that no lDDT-PLI could be computed (i.e. different - ligands). - - :rtype: :class:`~numpy.ndarray` - """ - if self._lddt_pli_full_matrix is None: - self._compute_scores() - if self._lddt_pli_matrix is None: - # convert - shape = self._lddt_pli_full_matrix.shape - self._lddt_pli_matrix = np.full(shape, np.nan) - for i, j in np.ndindex(shape): - if self._lddt_pli_full_matrix[i, j] is not None: - self._lddt_pli_matrix[i, j] = self._lddt_pli_full_matrix[ - i, j]["lddt_pli"] - return self._lddt_pli_matrix - - @property - def coverage_matrix(self): - """ Get the matrix of model ligand atom coverage in the target. - - Target ligands are in rows, model ligands in columns. - - A value of 0 indicates that there was no isomorphism between the model - and target ligands. If `substructure_match=False`, only full match - isomorphisms are considered, and therefore only values of 1.0 and 0.0 - are reported. - - :rtype: :class:`~numpy.ndarray` - """ - if self._assignment_match_coverage is None: - self._compute_scores() - return self._assignment_match_coverage - - @property - def rmsd(self): - """Get a dictionary of RMSD score values, keyed by model ligand - (chain name, :class:`~ost.mol.ResNum`). - - If the scoring object was instantiated with `unassigned=True`, some - scores may be `None`. - - :rtype: :class:`dict` - """ - if self._rmsd is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_rmsd() - return self._rmsd - - @property - def rmsd_details(self): - """Get a dictionary of RMSD score details (dictionaries), keyed by - model ligand (chain name, :class:`~ost.mol.ResNum`). - - The value is a dictionary. For ligands that were assigned (mapped) to - the target, the dictionary contain the following information: - - * `rmsd`: the RMSD score value. - * `lddt_lp`: the lDDT score of the ligand pocket (lDDT-LP). - * `bs_ref_res`: a list of residues (:class:`~ost.mol.ResidueHandle`) - that define the binding site in the reference. - * `bs_ref_res_mapped`: a list of residues - (:class:`~ost.mol.ResidueHandle`) in the reference binding site - that could be mapped to the model. - * `bs_mdl_res_mapped`: a list of residues - (:class:`~ost.mol.ResidueHandle`) in the model that were mapped to - the reference binding site. The residues are in the same order as - `bs_ref_res_mapped`. - * `bb_rmsd`: the RMSD of the binding site backbone after superposition - * `target_ligand`: residue handle of the target ligand. - * `model_ligand`: residue handle of the model ligand. - * `chain_mapping`: local chain mapping as a dictionary, with target - chain name as key and model chain name as value. - * `transform`: transformation to superpose the model onto the target. - * `substructure_match`: whether the score is the result of a partial - (substructure) match. A value of `True` indicates that the target - ligand covers only part of the model, while `False` indicates a - perfect match. - * `coverage`: the fraction of model atoms covered by the assigned - target ligand, in the interval (0, 1]. If `substructure_match` - is `False`, this will always be 1. - * `inconsistent_residues`: a list of tuples of mapped residues views - (:class:`~ost.mol.ResidueView`) with residue names that differ - between the reference and the model, respectively. - The list is empty if all residue names match, which is guaranteed - if `check_resnames=True`. - Note: more binding site mappings may be explored during scoring, - but only inconsistencies in the selected mapping are reported. - * `unassigned`: only if the scorer was instantiated with - `unassigned=True`: `False` - - If the scoring object was instantiated with `unassigned=True`, in - addition the unassigned ligands will be reported with a score of `None` - and the following information: - - * `unassigned`: `True`, - * `reason_short`: a short token of the reason, see - :attr:`unassigned_model_ligands` for details and meaning. - * `reason_long`: a human-readable text of the reason, see - :attr:`unassigned_model_ligands` for details and meaning. - * `rmsd`: `None` - - :rtype: :class:`dict` - """ - if self._rmsd_details is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_rmsd() - return self._rmsd_details - - @property - def lddt_pli(self): - """Get a dictionary of lDDT-PLI score values, keyed by model ligand - (chain name, :class:`~ost.mol.ResNum`). - - If the scoring object was instantiated with `unassigned=True`, some - scores may be `None`. - - :rtype: :class:`dict` - """ - if self._lddt_pli is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_lddt_pli() - return self._lddt_pli - - @property - def lddt_pli_details(self): - """Get a dictionary of lDDT-PLI score details (dictionaries), keyed by - model ligand (chain name, :class:`~ost.mol.ResNum`). - - Each sub-dictionary contains the following information: - - * `lddt_pli`: the lDDT-PLI score value. - * `rmsd`: the RMSD score value corresponding to the lDDT-PLI - chain mapping and assignment. This may differ from the RMSD-based - assignment. Note that a different isomorphism than `lddt_pli` may - be used. - * `lddt_lp`: the lDDT score of the ligand pocket (lDDT-LP). - * `lddt_pli_n_contacts`: number of total contacts used in lDDT-PLI, - summed over all thresholds. Can be divided by 8 to obtain the number - of atomic contacts. - * `bs_ref_res`: a list of residues (:class:`~ost.mol.ResidueHandle`) - that define the binding site in the reference. - * `bs_ref_res_mapped`: a list of residues - (:class:`~ost.mol.ResidueHandle`) in the reference binding site - that could be mapped to the model. - * `bs_mdl_res_mapped`: a list of residues - (:class:`~ost.mol.ResidueHandle`) in the model that were mapped to - the reference binding site. The residues are in the same order as - `bs_ref_res_mapped`. - * `bb_rmsd`: the RMSD of the binding site backbone after superposition. - Note: not used for lDDT-PLI computation. - * `target_ligand`: residue handle of the target ligand. - * `model_ligand`: residue handle of the model ligand. - * `chain_mapping`: local chain mapping as a dictionary, with target - chain name as key and model chain name as value. - * `transform`: transformation to superpose the model onto the target - (for RMSD only). - * `substructure_match`: whether the score is the result of a partial - (substructure) match. A value of `True` indicates that the target - ligand covers only part of the model, while `False` indicates a - perfect match. - * `inconsistent_residues`: a list of tuples of mapped residues views - (:class:`~ost.mol.ResidueView`) with residue names that differ - between the reference and the model, respectively. - The list is empty if all residue names match, which is guaranteed - if `check_resnames=True`. - Note: more binding site mappings may be explored during scoring, - but only inconsistencies in the selected mapping are reported. - * `unassigned`: only if the scorer was instantiated with - `unassigned=True`: `False` - - If the scoring object was instantiated with `unassigned=True`, in - addition the unmapped ligands will be reported with a score of `None` - and the following information: - - * `unassigned`: `True`, - * `reason_short`: a short token of the reason, see - :attr:`unassigned_model_ligands` for details and meaning. - * `reason_long`: a human-readable text of the reason, see - :attr:`unassigned_model_ligands` for details and meaning. - * `lddt_pli`: `None` - - :rtype: :class:`dict` - """ - if self._lddt_pli_details is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_lddt_pli() - return self._lddt_pli_details - - @property - def unassigned_target_ligands(self): - """Get a dictionary of target ligands not assigned to any model ligand, - keyed by target ligand (chain name, :class:`~ost.mol.ResNum`). - - The assignment for the lDDT-PLI score is used (and is controlled - by the `rmsd_assignment` argument). - - Each item contains a string from a controlled dictionary - about the reason for the absence of assignment. - A human-readable description can be obtained from the - :attr:`unassigned_target_ligand_descriptions` property. - - Currently, the following reasons are reported: - - * `no_ligand`: there was no ligand in the model. - * `disconnected`: the ligand graph was disconnected. - * `binding_site`: no residues were in proximity of the ligand. - * `model_representation`: no representation of the reference binding - site was found in the model. (I.e. the binding site was not modeled, - or the model ligand was positioned too far in combination with - `full_bs_search=False`) - * `identity`: the ligand was not found in the model (by graph - isomorphism). Check your ligand connectivity, and enable the - `substructure_match` option if the target ligand is incomplete. - * `stoichiometry`: there was a possible assignment in the model, but - the model ligand was already assigned to a different target ligand. - This indicates different stoichiometries. - * `symmetries`: too many symmetries were found (by graph isomorphisms). - Increase `max_symmetries`. - * `no_contact`: there were no lDDT contacts between the binding site - and the ligand, and lDDT-PLI is undefined. Increase the value of - `lddt_pli_radius` to at least the value of the binding site `radius`. - - Some of these reasons can be overlapping, but a single reason will be - reported. - - :rtype: :class:`dict` - """ - if self._unassigned_target_ligand_short is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_lddt_pli() - self._unassigned_target_ligand_short = {} - self._unassigned_target_ligand_descriptions = {} - for cname, res in self._unassigned_target_ligands.items(): - self._unassigned_target_ligand_short[cname] = {} - for resnum, val in res.items(): - self._unassigned_target_ligand_short[cname][resnum] = val[0] - self._unassigned_target_ligand_descriptions[val[0]] = val[1] - return self._unassigned_target_ligand_short - - @property - def unassigned_target_ligand_descriptions(self): - """Get a human-readable description of why target ligands were - unassigned, as a dictionary keyed by the controlled dictionary - from :attr:`unassigned_target_ligands`. - """ - if self._unassigned_target_ligand_descriptions is None: - _ = self.unassigned_target_ligands # assigned there - return self._unassigned_target_ligand_descriptions - - @property - def unassigned_model_ligands(self): - """Get a dictionary of model ligands not assigned to any target ligand, - keyed by model ligand (chain name, :class:`~ost.mol.ResNum`). - - The assignment for the lDDT-PLI score is used (and is controlled - by the `rmsd_assignment` argument). - - Each item contains a string from a controlled dictionary - about the reason for the absence of assignment. - A human-readable description can be obtained from the - :attr:`unassigned_model_ligand_descriptions` property. - Currently, the following reasons are reported: - - * `no_ligand`: there was no ligand in the target. - * `disconnected`: the ligand graph is disconnected. - * `binding_site`: a potential assignment was found in the target, but - there were no polymer residues in proximity of the ligand in the - target. - * `model_representation`: a potential assignment was found in the target, - but no representation of the binding site was found in the model. - (I.e. the binding site was not modeled, or the model ligand was - positioned too far in combination with `full_bs_search=False`) - * `identity`: the ligand was not found in the target (by graph - isomorphism). Check your ligand connectivity, and enable the - `substructure_match` option if the target ligand is incomplete. - * `stoichiometry`: there was a possible assignment in the target, but - the model target was already assigned to a different model ligand. - This indicates different stoichiometries. - * `symmetries`: too many symmetries were found (by graph isomorphisms). - Increase `max_symmetries`. - * `no_contact`: there were no lDDT contacts between the binding site - and the ligand in the target structure, and lDDT-PLI is undefined. - Increase the value of `lddt_pli_radius` to at least the value of the - binding site `radius`. - - Some of these reasons can be overlapping, but a single reason will be - reported. - - :rtype: :class:`dict` - """ - if self._unassigned_model_ligand_short is None: - if self.rmsd_assignment: - self._assign_ligands_rmsd_only() - else: - self._assign_ligands_lddt_pli() - self._unassigned_model_ligand_short = {} - self._unassigned_model_ligand_descriptions = {} - for cname, res in self._unassigned_model_ligands.items(): - self._unassigned_model_ligand_short[cname] = {} - for resnum, val in res.items(): - self._unassigned_model_ligand_short[cname][resnum] = val[0] - self._unassigned_model_ligand_descriptions[val[0]] = val[1] - return self._unassigned_model_ligand_short - - @property - def unassigned_model_ligand_descriptions(self): - """Get a human-readable description of why model ligands were - unassigned, as a dictionary keyed by the controlled dictionary - from :attr:`unassigned_model_ligands`. - """ - if self._unassigned_model_ligand_descriptions is None: - _ = self.unassigned_model_ligands # assigned there - return self._unassigned_model_ligand_descriptions - - - def _set_custom_mapping(self, mapping): - """ sets self.__model_mapping with a full blown MappingResult object - - :param mapping: mapping with trg chains as key and mdl ch as values - :type mapping: :class:`dict` - """ - chain_mapper = self.chain_mapper - chem_mapping, chem_group_alns, mdl = \ - chain_mapper.GetChemMapping(self.model) - - # now that we have a chem mapping, lets do consistency checks - # - check whether chain names are unique and available in structures - # - check whether the mapped chains actually map to the same chem groups - if len(mapping) != len(set(mapping.keys())): - raise RuntimeError(f"Expect unique trg chain names in mapping. Got " - f"{mapping.keys()}") - if len(mapping) != len(set(mapping.values())): - raise RuntimeError(f"Expect unique mdl chain names in mapping. Got " - f"{mapping.values()}") - - trg_chains = set([ch.GetName() for ch in chain_mapper.target.chains]) - mdl_chains = set([ch.GetName() for ch in mdl.chains]) - for k,v in mapping.items(): - if k not in trg_chains: - raise RuntimeError(f"Target chain \"{k}\" is not available " - f"in target processed for chain mapping " - f"({trg_chains})") - if v not in mdl_chains: - raise RuntimeError(f"Model chain \"{v}\" is not available " - f"in model processed for chain mapping " - f"({mdl_chains})") - - for trg_ch, mdl_ch in mapping.items(): - trg_group_idx = None - mdl_group_idx = None - for idx, group in enumerate(chain_mapper.chem_groups): - if trg_ch in group: - trg_group_idx = idx - break - for idx, group in enumerate(chem_mapping): - if mdl_ch in group: - mdl_group_idx = idx - break - if trg_group_idx is None or mdl_group_idx is None: - raise RuntimeError("Could not establish a valid chem grouping " - "of chain names provided in custom mapping.") - - if trg_group_idx != mdl_group_idx: - raise RuntimeError(f"Chem group mismatch in custom mapping: " - f"target chain \"{trg_ch}\" groups with the " - f"following chemically equivalent target " - f"chains: " - f"{chain_mapper.chem_groups[trg_group_idx]} " - f"but model chain \"{mdl_ch}\" maps to the " - f"following target chains: " - f"{chain_mapper.chem_groups[mdl_group_idx]}") - - pairs = set([(trg_ch, mdl_ch) for trg_ch, mdl_ch in mapping.items()]) - ref_mdl_alns = \ - chain_mapping._GetRefMdlAlns(chain_mapper.chem_groups, - chain_mapper.chem_group_alignments, - chem_mapping, - chem_group_alns, - pairs = pairs) - - # translate mapping format - final_mapping = list() - for ref_chains in chain_mapper.chem_groups: - mapped_mdl_chains = list() - for ref_ch in ref_chains: - if ref_ch in mapping: - mapped_mdl_chains.append(mapping[ref_ch]) - else: - mapped_mdl_chains.append(None) - final_mapping.append(mapped_mdl_chains) - - alns = dict() - for ref_group, mdl_group in zip(chain_mapper.chem_groups, - final_mapping): - for ref_ch, mdl_ch in zip(ref_group, mdl_group): - if ref_ch is not None and mdl_ch is not None: - aln = ref_mdl_alns[(ref_ch, mdl_ch)] - trg_view = chain_mapper.target.Select(f"cname={mol.QueryQuoteName(ref_ch)}") - mdl_view = mdl.Select(f"cname={mol.QueryQuoteName(mdl_ch)}") - aln.AttachView(0, trg_view) - aln.AttachView(1, mdl_view) - alns[(ref_ch, mdl_ch)] = aln - - self.__model_mapping = chain_mapping.MappingResult(chain_mapper.target, mdl, - chain_mapper.chem_groups, - chem_mapping, - final_mapping, alns) - - def _find_unassigned_model_ligand_reason(self, ligand, assignment="lddt_pli", check=True): - # Is this a model ligand? - try: - ligand_idx = self.model_ligands.index(ligand) - except ValueError: - # Raise with a better error message - raise ValueError("Ligand %s is not in self.model_ligands" % ligand) - - # Ensure we are unassigned - if check: - details = getattr(self, assignment + "_details") - if ligand.chain.name in details and ligand.number in details[ligand.chain.name]: - ligand_details = details[ligand.chain.name][ligand.number] - if not ("unassigned" in ligand_details and ligand_details["unassigned"]): - raise RuntimeError("Ligand %s is mapped to %s" % (ligand, ligand_details["target_ligand"])) - - # Were there any ligands in the target? - if len(self.target_ligands) == 0: - return ("no_ligand", "No ligand in the target") - - # Is the ligand disconnected? - graph = _ResidueToGraph(ligand) - if not networkx.is_connected(graph): - return ("disconnected", "Ligand graph is disconnected") - - # Do we have isomorphisms with the target? - for trg_lig_idx, assigned in enumerate(self._assignment_isomorphisms[:, ligand_idx]): - if np.isnan(assigned): - try: - _ComputeSymmetries( - self.model_ligands[ligand_idx], - self.target_ligands[trg_lig_idx], - substructure_match=self.substructure_match, - by_atom_index=True, - return_symmetries=False) - except (NoSymmetryError, DisconnectedGraphError): - assigned = 0. - except TooManySymmetriesError: - assigned = -1. - else: - assigned = 1. - self._assignment_isomorphisms[trg_lig_idx,ligand_idx] = assigned - if assigned == 1.: - # Could have been assigned - # So what's up with this target ligand? - assignment_matrix = getattr(self, assignment + "_matrix") - all_nan = np.all(np.isnan(assignment_matrix[:, ligand_idx])) - if all_nan: - # The assignment matrix is all nans so we have a problem - # with the binding site or the representation - trg_ligand = self.target_ligands[trg_lig_idx] - return self._unassigned_target_ligands_reason[trg_ligand] - else: - # Ligand was already assigned - return ("stoichiometry", - "Ligand was already assigned to an other " - "model ligand (different stoichiometry)") - elif assigned == -1: - # Symmetries / isomorphisms exceeded limit - return ("symmetries", - "Too many symmetries were found.") - - # Could not be assigned to any ligand - must be different - if self.substructure_match: - iso = "subgraph isomorphism" - else: - iso = "full graph isomorphism" - return ("identity", "Ligand was not found in the target (by %s)" % iso) - - def _find_unassigned_target_ligand_reason(self, ligand, assignment="lddt_pli", check=True): - # Is this a target ligand? - try: - ligand_idx = self.target_ligands.index(ligand) - except ValueError: - # Raise with a better error message - raise ValueError("Ligand %s is not in self.target_ligands" % ligand) - - # Ensure we are unassigned - if check: - details = getattr(self, assignment + "_details") - for cname, chain_ligands in details.items(): - for rnum, details in chain_ligands.items(): - if "unassigned" in details and details["unassigned"]: - continue - if details['target_ligand'] == ligand: - raise RuntimeError("Ligand %s is mapped to %s.%s" % ( - ligand, cname, rnum)) - - # Were there any ligands in the model? - if len(self.model_ligands) == 0: - return ("no_ligand", "No ligand in the model") - - # Is the ligand disconnected? - graph = _ResidueToGraph(ligand) - if not networkx.is_connected(graph): - return ("disconnected", "Ligand graph is disconnected") - - # Is it because there was no valid binding site or no representation? - if ligand in self._unassigned_target_ligands_reason: - return self._unassigned_target_ligands_reason[ligand] - # Or because no symmetry? - for model_lig_idx, assigned in enumerate( - self._assignment_isomorphisms[ligand_idx, :]): - if np.isnan(assigned): - try: - _ComputeSymmetries( - self.model_ligands[model_lig_idx], - self.target_ligands[ligand_idx], - substructure_match=self.substructure_match, - by_atom_index=True, - return_symmetries=False) - except (NoSymmetryError, DisconnectedGraphError): - assigned = 0. - except TooManySymmetriesError: - assigned = -1. - else: - assigned = 1. - self._assignment_isomorphisms[ligand_idx,model_lig_idx] = assigned - if assigned == 1: - # Could have been assigned but was assigned to a different ligand - return ("stoichiometry", - "Ligand was already assigned to an other " - "target ligand (different stoichiometry)") - elif assigned == -1: - # Symmetries / isomorphisms exceeded limit - return ("symmetries", - "Too many symmetries were found.") - - # Could not be assigned to any ligand - must be different - if self.substructure_match: - iso = "subgraph isomorphism" - else: - iso = "full graph isomorphism" - return ("identity", "Ligand was not found in the model (by %s)" % iso) - - @property - def chem_mapping(self): - if self._chem_mapping is None: - self._chem_mapping, self._chem_group_alns, \ - self._chain_mapping_mdl = \ - self.chain_mapper.GetChemMapping(self.model) - return self._chem_mapping - - @property - def chem_group_alns(self): - if self._chem_group_alns is None: - self._chem_mapping, self._chem_group_alns, \ - self._chain_mapping_mdl = \ - self.chain_mapper.GetChemMapping(self.model) - return self._chem_group_alns - - @property - def chain_mapping_mdl(self): - if self._chain_mapping_mdl is None: - self._chem_mapping, self._chem_group_alns, \ - self._chain_mapping_mdl = \ - self.chain_mapper.GetChemMapping(self.model) - return self._chain_mapping_mdl - - def get_get_repr_input(self, mdl_ligand): - if mdl_ligand.handle.hash_code not in self._get_repr_input: - - # figure out what chains in the model are in contact with the ligand - # that may give a non-zero contribution to lDDT in - # chain_mapper.GetRepr - radius = self.model_bs_radius - chains = set() - for at in mdl_ligand.atoms: - close_atoms = self.chain_mapping_mdl.FindWithin(at.GetPos(), - radius) - for close_at in close_atoms: - chains.add(close_at.GetChain().GetName()) - - if len(chains) > 0: - - # the chain mapping model which only contains close chains - query = "cname=" - query += ','.join([mol.QueryQuoteName(x) for x in chains]) - mdl = self.chain_mapping_mdl.Select(query) - - # chem mapping which is reduced to the respective chains - chem_mapping = list() - for m in self.chem_mapping: - chem_mapping.append([x for x in m if x in chains]) - - self._get_repr_input[mdl_ligand.handle.hash_code] = \ - (mdl, chem_mapping) - - else: - self._get_repr_input[mdl_ligand.handle.hash_code] = \ - (self.chain_mapping_mdl.CreateEmptyView(), - [list() for _ in self.chem_mapping]) - - return (self._get_repr_input[mdl_ligand.hash_code][1], - self.chem_group_alns, - self._get_repr_input[mdl_ligand.hash_code][0]) - - - def get_repr(self, target_ligand, model_ligand): - - key = None - if self.full_bs_search: - # all possible binding sites, independent from actual model ligand - key = (target_ligand.handle.hash_code, 0) - else: - key = (target_ligand.handle.hash_code, model_ligand.handle.hash_code) - - if key not in self._repr: - ref_bs = self.get_target_binding_site(target_ligand) - if self.global_chain_mapping: - reprs = self.chain_mapper.GetRepr( - ref_bs, self.model, inclusion_radius=self.lddt_lp_radius, - global_mapping = self._model_mapping) - elif self.full_bs_search: - reprs = self.chain_mapper.GetRepr( - ref_bs, self.model, inclusion_radius=self.lddt_lp_radius, - topn=self.binding_sites_topn) - else: - reprs = self.chain_mapper.GetRepr(ref_bs, self.model, - inclusion_radius=self.lddt_lp_radius, - topn=self.binding_sites_topn, - chem_mapping_result = self.get_get_repr_input(model_ligand)) - self._repr[key] = reprs - if len(reprs) == 0: - # whatever is in there already has precedence - if target_ligand not in self._unassigned_target_ligands_reason: - self._unassigned_target_ligands_reason[target_ligand] = ( - "model_representation", - "No representation of the reference binding site was " - "found in the model") - - return self._repr[key] - - -def _ResidueToGraph(residue, by_atom_index=False): - """Return a NetworkX graph representation of the residue. - - :param residue: the residue from which to derive the graph - :type residue: :class:`ost.mol.ResidueHandle` or - :class:`ost.mol.ResidueView` - :param by_atom_index: Set this parameter to True if you need the nodes to - be labeled by atom index (within the residue). - Otherwise, if False, the nodes will be labeled by - atom names. - :type by_atom_index: :class:`bool` - :rtype: :class:`~networkx.classes.graph.Graph` - - Nodes are labeled with the Atom's uppercase :attr:`~ost.mol.AtomHandle.element`. - """ - nxg = networkx.Graph() - - for atom in residue.atoms: - nxg.add_node(atom.name, element=atom.element.upper()) - - # This will list all edges twice - once for every atom of the pair. - # But as of NetworkX 3.0 adding the same edge twice has no effect, so we're good. - nxg.add_edges_from([( - b.first.name, - b.second.name) for a in residue.atoms for b in a.GetBondList()]) - - if by_atom_index: - nxg = networkx.relabel_nodes(nxg, - {a: b for a, b in zip( - [a.name for a in residue.atoms], - range(len(residue.atoms)))}, - True) - return nxg - - -def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(), - substructure_match=False, max_symmetries=1e6): - """Calculate symmetry-corrected RMSD. - - Binding site superposition must be computed separately and passed as - `transformation`. - - :param model_ligand: The model ligand - :type model_ligand: :class:`ost.mol.ResidueHandle` or - :class:`ost.mol.ResidueView` - :param target_ligand: The target ligand - :type target_ligand: :class:`ost.mol.ResidueHandle` or - :class:`ost.mol.ResidueView` - :param transformation: Optional transformation to apply on each atom - position of model_ligand. - :type transformation: :class:`ost.geom.Mat4` - :param substructure_match: Set this to True to allow partial target - ligand. - :type substructure_match: :class:`bool` - :param max_symmetries: If more than that many isomorphisms exist, raise - a :class:`TooManySymmetriesError`. This can only be assessed by - generating at least that many isomorphisms and can take some time. - :type max_symmetries: :class:`int` - :rtype: :class:`float` - :raises: :class:`NoSymmetryError` when no symmetry can be found, - :class:`DisconnectedGraphError` when ligand graph is disconnected, - :class:`TooManySymmetriesError` when more than `max_symmetries` - isomorphisms are found. - """ - - symmetries = _ComputeSymmetries(model_ligand, target_ligand, - substructure_match=substructure_match, - by_atom_index=True, - max_symmetries=max_symmetries) - return _SCRMSD_symmetries(symmetries, model_ligand, target_ligand, - transformation) - - -def _SCRMSD_symmetries(symmetries, model_ligand, target_ligand, - transformation): - """Compute SCRMSD with pre-computed symmetries. Internal. """ - - best_rmsd = np.inf - for i, (trg_sym, mdl_sym) in enumerate(symmetries): - # Prepare Entities for RMSD - trg_lig_rmsd_ent = mol.CreateEntity() - trg_lig_rmsd_editor = trg_lig_rmsd_ent.EditXCS() - trg_lig_rmsd_chain = trg_lig_rmsd_editor.InsertChain("_") - trg_lig_rmsd_res = trg_lig_rmsd_editor.AppendResidue(trg_lig_rmsd_chain, "LIG") - - mdl_lig_rmsd_ent = mol.CreateEntity() - mdl_lig_rmsd_editor = mdl_lig_rmsd_ent.EditXCS() - mdl_lig_rmsd_chain = mdl_lig_rmsd_editor.InsertChain("_") - mdl_lig_rmsd_res = mdl_lig_rmsd_editor.AppendResidue(mdl_lig_rmsd_chain, "LIG") - - for mdl_anum, trg_anum in zip(mdl_sym, trg_sym): - # Rename model atoms according to symmetry - trg_atom = target_ligand.atoms[trg_anum] - mdl_atom = model_ligand.atoms[mdl_anum] - # Add atoms in the correct order to the RMSD entities - trg_lig_rmsd_editor.InsertAtom(trg_lig_rmsd_res, trg_atom.name, trg_atom.pos) - mdl_lig_rmsd_editor.InsertAtom(mdl_lig_rmsd_res, mdl_atom.name, mdl_atom.pos) - - trg_lig_rmsd_editor.UpdateICS() - mdl_lig_rmsd_editor.UpdateICS() - - rmsd = mol.alg.CalculateRMSD(mdl_lig_rmsd_ent.CreateFullView(), - trg_lig_rmsd_ent.CreateFullView(), - transformation) - if rmsd < best_rmsd: - best_rmsd = rmsd - - return best_rmsd - - -def _ComputeSymmetries(model_ligand, target_ligand, substructure_match=False, - by_atom_index=False, return_symmetries=True, - max_symmetries=1e6): - """Return a list of symmetries (isomorphisms) of the model onto the target - residues. - - :param model_ligand: The model ligand - :type model_ligand: :class:`ost.mol.ResidueHandle` or - :class:`ost.mol.ResidueView` - :param target_ligand: The target ligand - :type target_ligand: :class:`ost.mol.ResidueHandle` or - :class:`ost.mol.ResidueView` - :param substructure_match: Set this to True to allow partial ligands - in the reference. - :type substructure_match: :class:`bool` - :param by_atom_index: Set this parameter to True if you need the symmetries - to refer to atom index (within the residue). - Otherwise, if False, the symmetries refer to atom - names. - :type by_atom_index: :class:`bool` - :type return_symmetries: If Truthy, return the mappings, otherwise simply - return True if a mapping is found (and raise if - no mapping is found). This is useful to quickly - find out if a mapping exist without the expensive - step to find all the mappings. - :type return_symmetries: :class:`bool` - :param max_symmetries: If more than that many isomorphisms exist, raise - a :class:`TooManySymmetriesError`. This can only be assessed by - generating at least that many isomorphisms and can take some time. - :type max_symmetries: :class:`int` - :raises: :class:`NoSymmetryError` when no symmetry can be found; - :class:`TooManySymmetriesError` when more than `max_symmetries` - isomorphisms are found. - - """ - - # Get the Graphs of the ligands - model_graph = _ResidueToGraph(model_ligand, by_atom_index=by_atom_index) - target_graph = _ResidueToGraph(target_ligand, by_atom_index=by_atom_index) - - if not networkx.is_connected(model_graph): - raise DisconnectedGraphError("Disconnected graph for model ligand %s" % model_ligand) - if not networkx.is_connected(target_graph): - raise DisconnectedGraphError("Disconnected graph for target ligand %s" % target_ligand) - - # Note the argument order (model, target) which differs from spyrmsd. - # This is because a subgraph of model is isomorphic to target - but not the opposite - # as we only consider partial ligands in the reference. - # Make sure to generate the symmetries correctly in the end - gm = networkx.algorithms.isomorphism.GraphMatcher( - model_graph, target_graph, node_match=lambda x, y: - x["element"] == y["element"]) - if gm.is_isomorphic(): - if not return_symmetries: - return True - symmetries = [] - for i, isomorphism in enumerate(gm.isomorphisms_iter()): - if i >= max_symmetries: - raise TooManySymmetriesError( - "Too many symmetries between %s and %s" % ( - str(model_ligand), str(target_ligand))) - symmetries.append((list(isomorphism.values()), list(isomorphism.keys()))) - assert len(symmetries) > 0 - LogDebug("Found %s isomorphic mappings (symmetries)" % len(symmetries)) - elif gm.subgraph_is_isomorphic() and substructure_match: - if not return_symmetries: - return True - symmetries = [] - for i, isomorphism in enumerate(gm.subgraph_isomorphisms_iter()): - if i >= max_symmetries: - raise TooManySymmetriesError( - "Too many symmetries between %s and %s" % ( - str(model_ligand), str(target_ligand))) - symmetries.append((list(isomorphism.values()), list(isomorphism.keys()))) - assert len(symmetries) > 0 - # Assert that all the atoms in the target are part of the substructure - assert len(symmetries[0][0]) == len(target_ligand.atoms) - LogDebug("Found %s subgraph isomorphisms (symmetries)" % len(symmetries)) - elif gm.subgraph_is_isomorphic(): - LogDebug("Found subgraph isomorphisms (symmetries), but" - " ignoring because substructure_match=False") - raise NoSymmetryError("No symmetry between %s and %s" % ( - str(model_ligand), str(target_ligand))) - else: - LogDebug("Found no isomorphic mappings (symmetries)") - raise NoSymmetryError("No symmetry between %s and %s" % ( - str(model_ligand), str(target_ligand))) - - return symmetries - -class NoSymmetryError(ValueError): - """Exception raised when no symmetry can be found. - """ - pass - - -class TooManySymmetriesError(ValueError): - """Exception raised when too many symmetries are found. - """ - pass - -class DisconnectedGraphError(Exception): - """Exception raised when the ligand graph is disconnected. - """ - pass - - -__all__ = ["LigandScorer", "SCRMSD", "NoSymmetryError", - "TooManySymmetriesError", "DisconnectedGraphError"] diff --git a/modules/mol/alg/pymod/ligand_scoring_base.py b/modules/mol/alg/pymod/ligand_scoring_base.py new file mode 100644 index 0000000000000000000000000000000000000000..4cd4bb588bbe44dd0eb8473cd80c200a49f13fa8 --- /dev/null +++ b/modules/mol/alg/pymod/ligand_scoring_base.py @@ -0,0 +1,1217 @@ +import numpy as np +import networkx + +from ost import mol +from ost import LogWarning, LogScript, LogVerbose, LogDebug +from ost.mol.alg import chain_mapping + +class LigandScorer: + """ Scorer to compute various small molecule ligand (non polymer) scores. + + .. note :: + Extra requirements: + + - Python modules `numpy` and `networkx` must be available + (e.g. use ``pip install numpy networkx``) + + :class:`LigandScorer` is an abstract base class dealing with all the setup, + data storage, enumerating ligand symmetries and target/model ligand + matching/assignment. But actual score computation is delegated to child + classes. + + At the moment, two such classes are available: + + * :class:`ost.mol.alg.ligand_scoring_lddtpli.LDDTPLIScorer` + that assesses the conservation of protein-ligand + contacts + * :class:`ost.mol.alg.ligand_scoring_scrmsd.SCRMSDScorer` + that computes a binding-site superposed, symmetry-corrected RMSD. + + All versus all scores are available through the lazily computed + :attr:`score_matrix`. However, many things can go wrong... be it even + something as simple as two ligands not matching. Error states therefore + encode scoring issues. An Issue for a particular ligand is indicated by a + non-zero state in :attr:`model_ligand_states`/:attr:`target_ligand_states`. + This invalidates pairwise scores of such a ligand with all other ligands. + This and other issues in pairwise score computation are reported in + :attr:`state_matrix` which has the same size as :attr:`score_matrix`. + Only if the respective location is 0, a valid pairwise score can be + expected. The states and their meaning can be explored with code:: + + for state_code, (short_desc, desc) in scorer_obj.state_decoding.items(): + print(state_code) + print(short_desc) + print(desc) + + A common use case is to derive a one-to-one mapping between ligands in + the model and the target for which :class:`LigandScorer` provides an + automated assignment procedure. + By default, only exact matches between target and model ligands are + considered. This is a problem when the target only contains a subset + of the expected atoms (for instance if atoms are missing in an + experimental structure, which often happens in the PDB). With + `substructure_match=True`, complete model ligands can be scored against + partial target ligands. One problem with this approach is that it is + very easy to find good matches to small, irrelevant ligands like EDO, CO2 + or GOL. The assignment algorithm therefore considers the coverage, + expressed as the fraction of atoms of the model ligand atoms covered in the + target. Higher coverage matches are prioritized, but a match with a better + score will be preferred if it falls within a window of `coverage_delta` + (by default 0.2) of a worse-scoring match. As a result, for instance, + with a delta of 0.2, a low-score match with coverage 0.96 would be + preferred over a high-score match with coverage 0.70. + + Assumptions: + + :class:`LigandScorer` generally assumes that the + :attr:`~ost.mol.ResidueHandle.is_ligand` property is properly set on all + the ligand atoms, and only ligand atoms. This is typically the case for + entities loaded from mmCIF (tested with mmCIF files from the PDB and + SWISS-MODEL). Legacy PDB files must contain `HET` headers (which is usually + the case for files downloaded from the PDB but not elsewhere). + + The class doesn't perform any cleanup of the provided structures. + It is up to the caller to ensure that the data is clean and suitable for + scoring. :ref:`Molck <molck>` should be used with extra + care, as many of the options (such as `rm_non_std` or `map_nonstd_res`) can + cause ligands to be removed from the structure. If cleanup with Molck is + needed, ligands should be kept aside and passed separately. Non-ligand + residues should be valid compounds with atom names following the naming + conventions of the component dictionary. Non-standard residues are + acceptable, and if the model contains a standard residue at that position, + only atoms with matching names will be considered. + + Unlike most of OpenStructure, this class does not assume that the ligands + (either in the model or the target) are part of the PDB component + dictionary. They may have arbitrary residue names. Residue names do not + have to match between the model and the target. Matching is based on + the calculation of isomorphisms which depend on the atom element name and + atom connectivity (bond order is ignored). + It is up to the caller to ensure that the connectivity of atoms is properly + set before passing any ligands to this class. Ligands with improper + connectivity will lead to bogus results. + + Note, however, that atom names should be unique within a residue (ie two + distinct atoms cannot have the same atom name). + + This only applies to the ligand. The rest of the model and target + structures (protein, nucleic acids) must still follow the usual rules and + contain only residues from the compound library. + + Although it isn't a requirement, hydrogen atoms should be removed from the + structures. Here is an example code snippet that will perform a reasonable + cleanup. Keep in mind that this is most likely not going to work as + expected with entities loaded from PDB files, as the `is_ligand` flag is + probably not set properly. + + Here is an example of how to use setup a scorer code:: + + from ost.mol.alg.ligand_scoring_scrmsd import SCRMSDScorer + from ost.mol.alg import Molck, MolckSettings + + # Load data + # Structure model in PDB format, containing the receptor only + model = io.LoadPDB("path_to_model.pdb") + # Ligand model as SDF file + model_ligand = io.LoadEntity("path_to_ligand.sdf", format="sdf") + # Target loaded from mmCIF, containing the ligand + target = io.LoadMMCIF("path_to_target.cif") + + # Cleanup a copy of the structures + cleaned_model = model.Copy() + cleaned_target = target.Copy() + molck_settings = MolckSettings(rm_unk_atoms=True, + rm_non_std=False, + rm_hyd_atoms=True, + rm_oxt_atoms=False, + rm_zero_occ_atoms=False, + colored=False, + map_nonstd_res=False, + assign_elem=True) + Molck(cleaned_model, conop.GetDefaultLib(), molck_settings) + Molck(cleaned_target, conop.GetDefaultLib(), molck_settings) + + # Setup scorer object and compute lDDT-PLI + model_ligands = [model_ligand.Select("ele != H")] + sc = SCRMSDScorer(cleaned_model, cleaned_target, model_ligands) + + # Perform assignment and read respective scores + for lig_pair in sc.assignment: + trg_lig = sc.target_ligands[lig_pair[0]] + mdl_lig = sc.model_ligands[lig_pair[1]] + score = sc.score_matrix[lig_pair[0], lig_pair[1]] + print(f"Score for {trg_lig} and {mdl_lig}: {score}") + + :param model: Model structure - a deep copy is available as :attr:`model`. + No additional processing (ie. Molck), checks, + stereochemistry checks or sanitization is performed on the + input. Hydrogen atoms are kept. + :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param target: Target structure - a deep copy is available as + :attr:`target`. No additional processing (ie. Molck), checks + or sanitization is performed on the input. Hydrogen atoms are + kept. + :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param model_ligands: Model ligands, as a list of + :class:`~ost.mol.ResidueHandle` belonging to the model + entity. Can be instantiated with either a :class:list + of :class:`~ost.mol.ResidueHandle`/ + :class:`ost.mol.ResidueView` or of + :class:`ost.mol.EntityHandle`/ + :class:`ost.mol.EntityView`. + If `None`, ligands will be extracted based on the + :attr:`~ost.mol.ResidueHandle.is_ligand` flag (this is + normally set properly in entities loaded from mmCIF). + :type model_ligands: :class:`list` + :param target_ligands: Target ligands, as a list of + :class:`~ost.mol.ResidueHandle` belonging to the + target entity. Can be instantiated either a + :class:list of :class:`~ost.mol.ResidueHandle`/ + :class:`ost.mol.ResidueView` or of + :class:`ost.mol.EntityHandle`/ + :class:`ost.mol.EntityView` containing a single + residue each. If `None`, ligands will be extracted + based on the :attr:`~ost.mol.ResidueHandle.is_ligand` + flag (this is normally set properly in entities + loaded from mmCIF). + :type target_ligands: :class:`list` + :param resnum_alignments: Whether alignments between chemically equivalent + chains in *model* and *target* can be computed + based on residue numbers. This can be assumed in + benchmarking setups such as CAMEO/CASP. + :type resnum_alignments: :class:`bool` + :param rename_ligand_chain: If a residue with the same chain name and + residue number than an explicitly passed model + or target ligand exits in the structure, + and `rename_ligand_chain` is False, a + RuntimeError will be raised. If + `rename_ligand_chain` is True, the ligand will + be moved to a new chain instead, and the move + will be logged to the console with SCRIPT + level. + :type rename_ligand_chain: :class:`bool` + :param substructure_match: Set this to True to allow incomplete (i.e. + partially resolved) target ligands. + :type substructure_match: :class:`bool` + :param coverage_delta: the coverage delta for partial ligand assignment. + :type coverage_delta: :class:`float` + :param max_symmetries: If more than that many isomorphisms exist for + a target-ligand pair, it will be ignored and reported + as unassigned. + :type max_symmetries: :class:`int` + """ + + def __init__(self, model, target, model_ligands=None, target_ligands=None, + resnum_alignments=False, rename_ligand_chain=False, + substructure_match=False, coverage_delta=0.2, + max_symmetries=1e5): + + if isinstance(model, mol.EntityView): + self.model = mol.CreateEntityFromView(model, False) + elif isinstance(model, mol.EntityHandle): + self.model = model.Copy() + else: + raise RuntimeError("model must be of type EntityView/EntityHandle") + + if isinstance(target, mol.EntityView): + self.target = mol.CreateEntityFromView(target, False) + elif isinstance(target, mol.EntityHandle): + self.target = target.Copy() + else: + raise RuntimeError("target must be of type EntityView/EntityHandle") + + # Extract ligands from target + if target_ligands is None: + self.target_ligands = self._extract_ligands(self.target) + else: + self.target_ligands = self._prepare_ligands(self.target, target, + target_ligands, + rename_ligand_chain) + if len(self.target_ligands) == 0: + LogWarning("No ligands in the target") + + # Extract ligands from model + if model_ligands is None: + self.model_ligands = self._extract_ligands(self.model) + else: + self.model_ligands = self._prepare_ligands(self.model, model, + model_ligands, + rename_ligand_chain) + if len(self.model_ligands) == 0: + LogWarning("No ligands in the model") + if len(self.target_ligands) == 0: + raise ValueError("No ligand in the model and in the target") + + self.resnum_alignments = resnum_alignments + self.rename_ligand_chain = rename_ligand_chain + self.substructure_match = substructure_match + self.coverage_delta = coverage_delta + self.max_symmetries = max_symmetries + + # lazily computed attributes + self.__chain_mapper = None + + # keep track of states + # simple integers instead of enums - encoding available in + # self.state_decoding + self._state_matrix = None + self._model_ligand_states = None + self._target_ligand_states = None + + # score matrices + self._score_matrix = None + self._coverage_matrix = None + self._aux_matrix = None + + # assignment and derived data + self._assignment = None + self._score_dict = None + self._aux_dict = None + + # human readable description of states - child class must extend with + # with child class specific states + # each state code comes with a tuple of two elements: + # 1) short description 2) human readable description + # The actual states are set in _compute_scores in :class:`LigandScorer` + # or _compute_score of the child class. + if self.substructure_match: + iso = "subgraph isomorphism" + else: + iso = "full graph isomorphism" + + self.state_decoding = \ + {0: ("OK", "OK"), + 1: ("identity", f"Ligands could not be matched (by {iso})"), + 2: ("symmetries", "Too many symmetries between ligand atoms were " + "found - increasing max_symmetries might help"), + 3: ("no_iso", "No fully isomorphic match could be found - enabling " + "substructure_match might allow a match"), + 4: ("disconnected", "Ligand graph is disconnected"), + 5: ("stoichiometry", "Ligand was already assigned to another ligand " + "(different stoichiometry)"), + 6: ("single_ligand_issue", "Cannot compute valid pairwise score as " + "either model or target ligand have non-zero state."), + 9: ("unknown", "An unknown error occured in LigandScorer")} + + @property + def state_matrix(self): + """ Encodes states of ligand pairs + + Ligand pairs can be matched and a valid score can be expected if + respective location in this matrix is 0. + Target ligands are in rows, model ligands in columns. States are encoded + as integers <= 9. Larger numbers encode errors for child classes. + Use something like ``self.state_decoding[3]`` to get a decscription. + + :rtype: :class:`~numpy.ndarray` + """ + if self._state_matrix is None: + self._compute_scores() + return self._state_matrix + + @property + def model_ligand_states(self): + """ Encodes states of model ligands + + Non-zero state in any of the model ligands invalidates the full + respective column in :attr:`~state_matrix`. + + :rtype: :class:`~numpy.ndarray` + """ + if self._model_ligand_states is None: + self._compute_scores() + return self._model_ligand_states + + @property + def target_ligand_states(self): + """ Encodes states of target ligands + + Non-zero state in any of the target ligands invalidates the full + respective row in :attr:`~state_matrix`. + + :rtype: :class:`~numpy.ndarray` + """ + if self._target_ligand_states is None: + self._compute_scores() + return self._target_ligand_states + + @property + def score_matrix(self): + """ Get the matrix of scores. + + Target ligands are in rows, model ligands in columns. + + NaN values indicate that no value could be computed (i.e. different + ligands). In other words: values are only valid if the respective + location in :attr:`~state_matrix` is 0. + + :rtype: :class:`~numpy.ndarray` + """ + if self._score_matrix is None: + self._compute_scores() + return self._score_matrix + + @property + def coverage_matrix(self): + """ Get the matrix of model ligand atom coverage in the target. + + Target ligands are in rows, model ligands in columns. + + NaN values indicate that no value could be computed (i.e. different + ligands). In other words: values are only valid if the respective + location in :attr:`~state_matrix` is 0. If `substructure_match=False`, + only full match isomorphisms are considered, and therefore only values + of 1.0 can be observed. + + :rtype: :class:`~numpy.ndarray` + """ + if self._coverage_matrix is None: + self._compute_scores() + return self._coverage_matrix + + @property + def aux_matrix(self): + """ Get the matrix of scorer specific auxiliary data. + + Target ligands are in rows, model ligands in columns. + + Auxiliary data consists of arbitrary data dicts which allow a child + class to provide additional information for a scored ligand pair. + empty dictionaries indicate that the child class simply didn't return + anything or that no value could be computed (e.g. different ligands). + In other words: values are only valid if respective location in the + :attr:`~state_matrix` is 0. + + :rtype: :class:`~numpy.ndarray` + """ + if self._aux_matrix is None: + self._compute_scores() + return self._aux_matrix + + @property + def assignment(self): + """ Ligand assignment based on computed scores + + Implements a greedy algorithm to assign target and model ligands + with each other. Starts from each valid ligand pair as indicated + by a state of 0 in :attr:`state_matrix`. Each iteration first selects + high coverage pairs. Given max_coverage defined as the highest + coverage observed in the available pairs, all pairs with coverage + in [max_coverage-*coverage_delta*, max_coverage] are selected. + The best scoring pair among those is added to the assignment + and the whole process is repeated until there are no ligands to + assign anymore. + + :rtype: :class:`list` of :class:`tuple` (trg_lig_idx, mdl_lig_idx) + """ + if self._assignment is None: + self._assignment = list() + # Build working array that contains tuples for all mdl/trg ligand + # pairs with valid score as indicated by a state of 0: + # (score, coverage, trg_ligand_idx, mdl_ligand_idx) + tmp = list() + for trg_idx in range(self.score_matrix.shape[0]): + for mdl_idx in range(self.score_matrix.shape[1]): + if self.state_matrix[trg_idx, mdl_idx] == 0: + tmp.append((self.score_matrix[trg_idx, mdl_idx], + self.coverage_matrix[trg_idx, mdl_idx], + trg_idx, mdl_idx)) + + # sort by score, such that best scoring item is in front + if self._score_dir() == '+': + tmp.sort(reverse=True) + elif self._score_dir() == '-': + tmp.sort() + else: + raise RuntimeError("LigandScorer._score_dir must return one in " + "['+', '-']") + + while len(tmp) > 0: + # select high coverage ligand pairs in working array + coverage_thresh = max([x[1] for x in tmp]) - self.coverage_delta + top_coverage = [x for x in tmp if x[1] >= coverage_thresh] + + # working array is sorted by score => just pick first one + a = top_coverage[0][2] # selected trg_ligand_idx + b = top_coverage[0][3] # selected mdl_ligand_idx + self._assignment.append((a, b)) + + # kick out remaining pairs involving these ligands + tmp = [x for x in tmp if (x[2] != a and x[3] != b)] + + return self._assignment + + @property + def score(self): + """ Get a dictionary of score values, keyed by model ligand + + Extract score with something like: + ``scorer.score[lig.GetChain().GetName()][lig.GetNumber()]``. + The returned scores are based on :attr:`~assignment`. + + :rtype: :class:`dict` + """ + if self._score_dict is None: + self._score_dict = dict() + for (trg_lig_idx, mdl_lig_idx) in self.assignment: + mdl_lig = self.model_ligands[mdl_lig_idx] + cname = mdl_lig.GetChain().GetName() + rnum = mdl_lig.GetNumber() + if cname not in self._score_dict: + self._score_dict[cname] = dict() + score = self.score_matrix[trg_lig_idx, mdl_lig_idx] + self._score_dict[cname][rnum] = score + return self._score_dict + + @property + def aux(self): + """ Get a dictionary of score details, keyed by model ligand + + Extract dict with something like: + ``scorer.score[lig.GetChain().GetName()][lig.GetNumber()]``. + The returned info dicts are based on :attr:`~assignment`. The content is + documented in the respective child class. + + :rtype: :class:`dict` + """ + if self._aux_dict is None: + self._aux_dict = dict() + for (trg_lig_idx, mdl_lig_idx) in self.assignment: + mdl_lig = self.model_ligands[mdl_lig_idx] + cname = mdl_lig.GetChain().GetName() + rnum = mdl_lig.GetNumber() + if cname not in self._aux_dict: + self._aux_dict[cname] = dict() + d = self.aux_matrix[trg_lig_idx, mdl_lig_idx] + self._aux_dict[cname][rnum] = d + return self._aux_dict + + @property + def unassigned_target_ligands(self): + """ Get indices of target ligands which are not assigned + + :rtype: :class:`list` of :class:`int` + """ + # compute on-the-fly, no need for caching + assigned = set([x[0] for x in self.assignment]) + return [x for x in range(len(self.target_ligands)) if x not in assigned] + + @property + def unassigned_model_ligands(self): + """ Get indices of model ligands which are not assigned + + :rtype: :class:`list` of :class:`int` + """ + # compute on-the-fly, no need for caching + assigned = set([x[1] for x in self.assignment]) + return [x for x in range(len(self.model_ligands)) if x not in assigned] + + def get_target_ligand_state_report(self, trg_lig_idx): + """ Get summary of states observed with respect to all model ligands + + Mainly for debug purposes + + :param trg_lig_idx: Index of target ligand for which report should be + generated + :type trg_lig_idx: :class:`int` + """ + return self._get_report(self.target_ligand_states[trg_lig_idx], + self.state_matrix[trg_lig_idx,:]) + + def get_model_ligand_state_report(self, mdl_lig_idx): + """ Get summary of states observed with respect to all target ligands + + Mainly for debug purposes + + :param mdl_lig_idx: Index of model ligand for which report should be + generated + :type mdl_lig_idx: :class:`int` + """ + return self._get_report(self.model_ligand_states[mdl_lig_idx], + self.state_matrix[:, mdl_lig_idx]) + + def _get_report(self, ligand_state, pair_states): + """ Helper + """ + pair_report = list() + for s in np.unique(pair_states): + desc = self.state_decoding[s] + indices = np.flatnonzero(pair_states == s).tolist() + pair_report.append({"state": s, + "short desc": desc[0], + "desc": desc[1], + "indices": indices}) + + desc = self.state_decoding[ligand_state] + ligand_report = {"state": ligand_state, + "short desc": desc[0], + "desc": desc[1]} + + return (ligand_report, pair_report) + + def guess_target_ligand_unassigned_reason(self, trg_lig_idx): + """ Makes an educated guess why target ligand is not assigned + + This either returns actual error states or custom states that are + derived from them. + + :param trg_lig_idx: Index of target ligand + :type trg_lig_idx: :class:`int` + :returns: :class:`tuple` with two elements: 1) keyword 2) human readable + sentence describing the issue, (\"unknown\",\"unknown\") if + nothing obvious can be found. + :raises: :class:`RuntimeError` if specified target ligand is assigned + """ + if trg_lig_idx not in self.unassigned_target_ligands: + raise RuntimeError("Specified target ligand is not unassigned") + + # hardcoded tuple if there is simply nothing we can assign it to + if len(self.model_ligands) == 0: + return ("no_ligand", "No ligand in the model") + + # if something with the ligand itself is wrong, we can be pretty sure + # thats why the ligand is unassigned + if self.target_ligand_states[trg_lig_idx] != 0: + return self.state_decoding[self.target_ligand_states[trg_lig_idx]] + + # The next best guess comes from looking at pair states + tmp = np.unique(self.state_matrix[trg_lig_idx,:]) + + # In case of any 0, it could have been assigned so it's probably + # just not selected due to different stoichiometry - this is no + # defined state, we just return a hardcoded tuple in this case + if 0 in tmp: + return ("stoichiometry", + "Ligand was already assigned to an other " + "model ligand (different stoichiometry)") + + # maybe its a symmetry issue? + if 2 in tmp: + return self.state_decoding[2] + + # if the state is 6 (single_ligand_issue), there is an issue with its + # target counterpart. + if 6 in tmp: + mdl_idx = np.where(self.state_matrix[trg_lig_idx,:]==6)[0] + # we're reporting everything except disconnected error... + # don't ask... + for i in mdl_idx: + if self.model_ligand_states[i] == 0: + raise RuntimeError("This should never happen") + if self.model_ligand_states[i] != 4: + return self.state_decoding[self.model_ligand_states[i]] + + # get rid of remaining single ligand issues (only disconnected error) + if 6 in tmp and len(tmp) > 1: + tmp = tmp[tmp!=6] + + # prefer everything over identity state + if 1 in tmp and len(tmp) > 1: + tmp = tmp[tmp!=1] + + # just return whatever is left + return self.state_decoding[tmp[0]] + + + def guess_model_ligand_unassigned_reason(self, mdl_lig_idx): + """ Makes an educated guess why model ligand is not assigned + + This either returns actual error states or custom states that are + derived from them. + + :param mdl_lig_idx: Index of model ligand + :type mdl_lig_idx: :class:`int` + :returns: :class:`tuple` with two elements: 1) keyword 2) human readable + sentence describing the issue, (\"unknown\",\"unknown\") if + nothing obvious can be found. + :raises: :class:`RuntimeError` if specified model ligand is assigned + """ + if mdl_lig_idx not in self.unassigned_model_ligands: + raise RuntimeError("Specified model ligand is not unassigned") + + # hardcoded tuple if there is simply nothing we can assign it to + if len(self.target_ligands) == 0: + return ("no_ligand", "No ligand in the target") + + # if something with the ligand itself is wrong, we can be pretty sure + # thats why the ligand is unassigned + if self.model_ligand_states[mdl_lig_idx] != 0: + return self.state_decoding[self.model_ligand_states[mdl_lig_idx]] + + # The next best guess comes from looking at pair states + tmp = np.unique(self.state_matrix[:,mdl_lig_idx]) + + # In case of any 0, it could have been assigned so it's probably + # just not selected due to different stoichiometry - this is no + # defined state, we just return a hardcoded tuple in this case + if 0 in tmp: + return ("stoichiometry", + "Ligand was already assigned to an other " + "model ligand (different stoichiometry)") + + # maybe its a symmetry issue? + if 2 in tmp: + return self.state_decoding[2] + + # if the state is 6 (single_ligand_issue), there is an issue with its + # target counterpart. + if 6 in tmp: + trg_idx = np.where(self.state_matrix[:,mdl_lig_idx]==6)[0] + # we're reporting everything except disconnected error... + # don't ask... + for i in trg_idx: + if self.target_ligand_states[i] == 0: + raise RuntimeError("This should never happen") + if self.target_ligand_states[i] != 4: + return self.state_decoding[self.target_ligand_states[i]] + + # get rid of remaining single ligand issues (only disconnected error) + if 6 in tmp and len(tmp) > 1: + tmp = tmp[tmp!=6] + + # prefer everything over identity state + if 1 in tmp and len(tmp) > 1: + tmp = tmp[tmp!=1] + + # just return whatever is left + return self.state_decoding[tmp[0]] + + @property + def unassigned_model_ligands_reasons(self): + return_dict = dict() + for i in self.unassigned_model_ligands: + lig = self.model_ligands[i] + cname = lig.GetChain().GetName() + rnum = lig.GetNumber() + if cname not in return_dict: + return_dict[cname] = dict() + return_dict[cname][rnum] = \ + self.guess_model_ligand_unassigned_reason(i)[0] + return return_dict + + @property + def unassigned_target_ligands_reasons(self): + return_dict = dict() + for i in self.unassigned_target_ligands: + lig = self.target_ligands[i] + cname = lig.GetChain().GetName() + rnum = lig.GetNumber() + if cname not in return_dict: + return_dict[cname] = dict() + return_dict[cname][rnum] = \ + self.guess_target_ligand_unassigned_reason(i)[0] + return return_dict + + @property + def _chain_mapper(self): + """ Chain mapper object for the given :attr:`target`. + + Can be used by child classes if needed, constructed with + *resnum_alignments* flag + + :type: :class:`ost.mol.alg.chain_mapping.ChainMapper` + """ + if self.__chain_mapper is None: + self.__chain_mapper = \ + chain_mapping.ChainMapper(self.target, + n_max_naive=1e9, + resnum_alignments=self.resnum_alignments) + return self.__chain_mapper + + @staticmethod + def _extract_ligands(entity): + """Extract ligands from entity. Return a list of residues. + + Assumes that ligands have the :attr:`~ost.mol.ResidueHandle.is_ligand` + flag set. This is typically the case for entities loaded from mmCIF + (tested with mmCIF files from the PDB and SWISS-MODEL). + Legacy PDB files must contain `HET` headers (which is usually the + case for files downloaded from the PDB but not elsewhere). + + This function performs basic checks to ensure that the residues in this + chain are not forming polymer bonds (ie peptide/nucleotide ligands) and + will raise a RuntimeError if this assumption is broken. + + :param entity: the entity to extract ligands from + :type entity: :class:`~ost.mol.EntityHandle` + :rtype: :class:`list` of :class:`~ost.mol.ResidueHandle` + + """ + extracted_ligands = [] + for residue in entity.residues: + if residue.is_ligand: + if mol.InSequence(residue, residue.next): + raise RuntimeError("Residue %s connected in polymer sequen" + "ce %s" % (residue.qualified_name)) + extracted_ligands.append(residue) + LogVerbose("Detected residue %s as ligand" % residue) + return extracted_ligands + + @staticmethod + def _prepare_ligands(new_entity, old_entity, ligands, rename_chain): + """Prepare the ligands given into a list of ResidueHandles which are + part of the copied entity, suitable for the model_ligands and + target_ligands properties. + + This function takes a list of ligands as (Entity|Residue)(Handle|View). + Entities can contain multiple ligands, which will be considered as + separate ligands. + + Ligands which are part of the entity are simply fetched in the new + copied entity. Otherwise, they are copied over to the copied entity. + """ + extracted_ligands = [] + + next_chain_num = 1 + new_editor = None + + def _copy_residue(residue, rename_chain): + """ Copy the residue into the new chain. + Return the new residue handle.""" + nonlocal next_chain_num, new_editor + + # Instantiate the editor + if new_editor is None: + new_editor = new_entity.EditXCS() + + new_chain = new_entity.FindChain(residue.chain.name) + if not new_chain.IsValid(): + new_chain = new_editor.InsertChain(residue.chain.name) + else: + # Does a residue with the same name already exist? + already_exists = new_chain.FindResidue(residue.number).IsValid() + if already_exists: + if rename_chain: + chain_ext = 2 # Extend the chain name by this + while True: + new_chain_name = \ + residue.chain.name + "_" + str(chain_ext) + new_chain = new_entity.FindChain(new_chain_name) + if new_chain.IsValid(): + chain_ext += 1 + continue + else: + new_chain = \ + new_editor.InsertChain(new_chain_name) + break + LogScript("Moved ligand residue %s to new chain %s" % ( + residue.qualified_name, new_chain.name)) + else: + msg = \ + "A residue number %s already exists in chain %s" % ( + residue.number, residue.chain.name) + raise RuntimeError(msg) + + # Add the residue with its original residue number + new_res = new_editor.AppendResidue(new_chain, residue.name, + residue.number) + # Add atoms + for old_atom in residue.atoms: + new_editor.InsertAtom(new_res, old_atom.name, old_atom.pos, + element=old_atom.element, occupancy=old_atom.occupancy, + b_factor=old_atom.b_factor, is_hetatm=old_atom.is_hetatom) + # Add bonds + for old_atom in residue.atoms: + for old_bond in old_atom.bonds: + new_first = new_res.FindAtom(old_bond.first.name) + new_second = new_res.FindAtom(old_bond.second.name) + new_editor.Connect(new_first, new_second) + return new_res + + def _process_ligand_residue(res, rename_chain): + """Copy or fetch the residue. Return the residue handle.""" + new_res = None + if res.entity.handle == old_entity.handle: + # Residue is part of the old_entity handle. + # However, it may not be in the copied one, for instance it may + # have been a view We try to grab it first, otherwise we copy it + new_res = new_entity.FindResidue(res.chain.name, res.number) + if new_res and new_res.valid: + qname = res.handle.qualified_name + LogVerbose("Ligand residue %s already in entity" % qname) + else: + # Residue is not part of the entity, need to copy it first + new_res = _copy_residue(res, rename_chain) + qname = res.handle.qualified_name + LogVerbose("Copied ligand residue %s" % qname) + new_res.SetIsLigand(True) + return new_res + + for ligand in ligands: + is_eh = isinstance(ligand, mol.EntityHandle) + is_ev = isinstance(ligand, mol.EntityView) + is_rh = isinstance(ligand, mol.ResidueHandle) + is_rv = isinstance(ligand, mol.ResidueView) + if is_eh or is_ev: + for residue in ligand.residues: + new_residue = _process_ligand_residue(residue, rename_chain) + extracted_ligands.append(new_residue) + elif is_rh or is_rv: + new_residue = _process_ligand_residue(ligand, rename_chain) + extracted_ligands.append(new_residue) + else: + raise RuntimeError("Ligands should be given as Entity or " + "Residue") + + if new_editor is not None: + new_editor.UpdateICS() + return extracted_ligands + + def _compute_scores(self): + """ + Compute score for every possible target-model ligand pair and store the + result in internal matrices. + """ + ############################## + # Create the result matrices # + ############################## + shape = (len(self.target_ligands), len(self.model_ligands)) + self._score_matrix = np.full(shape, np.nan, dtype=np.float32) + self._coverage_matrix = np.full(shape, np.nan, dtype=np.float32) + self._state_matrix = np.full(shape, 0, dtype=np.int32) + self._model_ligand_states = np.zeros(len(self.model_ligands)) + self._target_ligand_states = np.zeros(len(self.target_ligands)) + self._aux_matrix = np.empty(shape, dtype=dict) + + # precompute ligand graphs + target_graphs = \ + [_ResidueToGraph(l, by_atom_index=True) for l in self.target_ligands] + model_graphs = \ + [_ResidueToGraph(l, by_atom_index=True) for l in self.model_ligands] + + for g_idx, g in enumerate(target_graphs): + if not networkx.is_connected(g): + self._target_ligand_states[g_idx] = 4 + self._state_matrix[g_idx,:] = 6 + msg = "Disconnected graph observed for target ligand " + msg += str(self.target_ligands[g_idx]) + LogVerbose(msg) + + for g_idx, g in enumerate(model_graphs): + if not networkx.is_connected(g): + self._model_ligand_states[g_idx] = 4 + self._state_matrix[:,g_idx] = 6 + msg = "Disconnected graph observed for model ligand " + msg += str(self.model_ligands[g_idx]) + LogVerbose(msg) + + + for target_id, target_ligand in enumerate(self.target_ligands): + LogVerbose("Analyzing target ligand %s" % target_ligand) + + if self._target_ligand_states[target_id] == 4: + # Disconnected graph - already updated states and reported + # to LogVerbose + continue + + for model_id, model_ligand in enumerate(self.model_ligands): + LogVerbose("Compare to model ligand %s" % model_ligand) + + ######################################################### + # Compute symmetries for given target/model ligand pair # + ######################################################### + + if self._model_ligand_states[model_id] == 4: + # Disconnected graph - already updated states and reported + # to LogVerbose + continue + + try: + symmetries = ComputeSymmetries( + model_ligand, target_ligand, + substructure_match=self.substructure_match, + by_atom_index=True, + max_symmetries=self.max_symmetries, + model_graph = model_graphs[model_id], + target_graph = target_graphs[target_id]) + LogVerbose("Ligands %s and %s symmetry match" % ( + str(model_ligand), str(target_ligand))) + except NoSymmetryError: + # Ligands are different - skip + LogVerbose("No symmetry between %s and %s" % ( + str(model_ligand), str(target_ligand))) + self._state_matrix[target_id, model_id] = 1 + continue + except TooManySymmetriesError: + # Ligands are too symmetrical - skip + LogVerbose("Too many symmetries between %s and %s" % ( + str(model_ligand), str(target_ligand))) + self._state_matrix[target_id, model_id] = 2 + continue + except NoIsomorphicSymmetryError: + # Ligands are different - skip + LogVerbose("No isomorphic symmetry between %s and %s" % ( + str(model_ligand), str(target_ligand))) + self._state_matrix[target_id, model_id] = 3 + continue + except DisconnectedGraphError: + # this should never happen as we guard against + # DisconnectedGraphError when precomputing the graph + LogVerbose("Disconnected graph observed for %s and %s" % ( + str(model_ligand), str(target_ligand))) + # just set both ligand states to 4 + self._model_ligand_states[model_id] = 4 + self._model_ligand_states[target_id] = 4 + self._state_matrix[target_id, model_id] = 6 + continue + + ##################################################### + # Compute score by calling the child class _compute # + ##################################################### + score, pair_state, target_ligand_state, model_ligand_state, aux\ + = self._compute(symmetries, target_ligand, model_ligand) + + ############ + # Finalize # + ############ + + # Ensure that returned states are associated with a + # description. This is a requirement when subclassing + # LigandScorer => state_decoding dict from base class must + # be modified in subclass constructor + if pair_state not in self.state_decoding or \ + target_ligand_state not in self.state_decoding or \ + model_ligand_state not in self.state_decoding: + raise RuntimeError(f"Subclass returned state " + f"\"{state}\" for which no " + f"description is available. Point " + f"the developer of the used scorer " + f"to this error message.") + + # if any of the ligand states is non-zero, this must be marked + # by the scorer subclass by specifying a specific pair state + if target_ligand_state != 0 and pair_state != 6: + raise RuntimeError("Observed non-zero target-ligand " + "state which must trigger certain " + "pair state. Point the developer of " + "the used scorer to this error message") + + if model_ligand_state != 0 and pair_state != 6: + raise RuntimeError("Observed non-zero model-ligand " + "state which must trigger certain " + "pair state. Point the developer of " + "the used scorer to this error message") + + self._state_matrix[target_id, model_id] = pair_state + self._target_ligand_states[target_id] = target_ligand_state + self._model_ligand_states[model_id] = model_ligand_state + if pair_state == 0: + if score is None or np.isnan(score): + raise RuntimeError("LigandScorer returned invalid " + "score despite 0 error state") + # it's a valid score! + self._score_matrix[target_id, model_id] = score + cvg = len(symmetries[0][0]) / len(model_ligand.atoms) + self._coverage_matrix[target_id, model_id] = cvg + self._aux_matrix[target_id, model_id] = aux + + def _compute(self, symmetries, target_ligand, model_ligand): + """ Compute score for specified ligand pair - defined by child class + + Raises :class:`NotImplementedError` if not implemented by child class. + + :param symmetries: Defines symmetries between *target_ligand* and + *model_ligand*. Return value of + :func:`ComputeSymmetries` + :type symmetries: :class:`list` of :class:`tuple` with two elements + each: 1) :class:`list` of atom indices in + *target_ligand* 2) :class:`list` of respective atom + indices in *model_ligand* + :param target_ligand: The target ligand + :type target_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param model_ligand: The model ligand + :type model_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + + :returns: A :class:`tuple` with three elements: 1) a score + (:class:`float`) 2) state (:class:`int`). + 3) auxiliary data for this ligand pair (:class:`dict`). + If state is 0, the score and auxiliary data will be + added to :attr:`~score_matrix` and :attr:`~aux_matrix` as well + as the respective value in :attr:`~coverage_matrix`. + Returned score must be valid in this case (not None/NaN). + Child specific non-zero states must be >= 10. + """ + raise NotImplementedError("_compute must be implemented by child " + "class") + + def _score_dir(self): + """ Return direction of score - defined by child class + + Relevant for ligand assignment. Must return a string in ['+', '-']. + '+' for ascending scores, i.e. higher is better (lddt etc.) + '-' for descending scores, i.e. lower is better (rmsd etc.) + """ + raise NotImplementedError("_score_dir must be implemented by child " + "class") + + +def _ResidueToGraph(residue, by_atom_index=False): + """Return a NetworkX graph representation of the residue. + + :param residue: the residue from which to derive the graph + :type residue: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param by_atom_index: Set this parameter to True if you need the nodes to + be labeled by atom index (within the residue). + Otherwise, if False, the nodes will be labeled by + atom names. + :type by_atom_index: :class:`bool` + :rtype: :class:`~networkx.classes.graph.Graph` + + Nodes are labeled with the Atom's uppercase + :attr:`~ost.mol.AtomHandle.element`. + """ + nxg = networkx.Graph() + + for atom in residue.atoms: + nxg.add_node(atom.name, element=atom.element.upper()) + + # This will list all edges twice - once for every atom of the pair. + # But as of NetworkX 3.0 adding the same edge twice has no effect, + # so we're good. + nxg.add_edges_from([( + b.first.name, + b.second.name) for a in residue.atoms for b in a.GetBondList()]) + + if by_atom_index: + nxg = networkx.relabel_nodes(nxg, + {a: b for a, b in zip( + [a.name for a in residue.atoms], + range(len(residue.atoms)))}, + True) + return nxg + +def ComputeSymmetries(model_ligand, target_ligand, substructure_match=False, + by_atom_index=False, return_symmetries=True, + max_symmetries=1e6, model_graph = None, + target_graph = None): + """Return a list of symmetries (isomorphisms) of the model onto the target + residues. + + :param model_ligand: The model ligand + :type model_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param target_ligand: The target ligand + :type target_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param substructure_match: Set this to True to allow partial ligands + in the reference. + :type substructure_match: :class:`bool` + :param by_atom_index: Set this parameter to True if you need the symmetries + to refer to atom index (within the residue). + Otherwise, if False, the symmetries refer to atom + names. + :type by_atom_index: :class:`bool` + :type return_symmetries: If Truthy, return the mappings, otherwise simply + return True if a mapping is found (and raise if + no mapping is found). This is useful to quickly + find out if a mapping exist without the expensive + step to find all the mappings. + :type return_symmetries: :class:`bool` + :param max_symmetries: If more than that many isomorphisms exist, raise + a :class:`TooManySymmetriesError`. This can only be assessed by + generating at least that many isomorphisms and can take some time. + :type max_symmetries: :class:`int` + :raises: :class:`NoSymmetryError` when no symmetry can be found; + :class:`NoIsomorphicSymmetryError` in case of isomorphic + subgraph but *substructure_match* is False; + :class:`TooManySymmetriesError` when more than `max_symmetries` + isomorphisms are found; :class:`DisconnectedGraphError` if + graph for *model_ligand*/*target_ligand* is disconnected. + """ + + # Get the Graphs of the ligands + if model_graph is None: + model_graph = _ResidueToGraph(model_ligand, + by_atom_index=by_atom_index) + + if not networkx.is_connected(model_graph): + msg = "Disconnected graph for model ligand %s" % model_ligand + raise DisconnectedGraphError(msg) + + + if target_graph is None: + target_graph = _ResidueToGraph(target_ligand, + by_atom_index=by_atom_index) + + if not networkx.is_connected(target_graph): + msg = "Disconnected graph for target ligand %s" % target_ligand + raise DisconnectedGraphError(msg) + + # Note the argument order (model, target) which differs from spyrmsd. + # This is because a subgraph of model is isomorphic to target - but not the + # opposite as we only consider partial ligands in the reference. + # Make sure to generate the symmetries correctly in the end + gm = networkx.algorithms.isomorphism.GraphMatcher( + model_graph, target_graph, node_match=lambda x, y: + x["element"] == y["element"]) + if gm.is_isomorphic(): + if not return_symmetries: + return True + symmetries = [] + for i, isomorphism in enumerate(gm.isomorphisms_iter()): + if i >= max_symmetries: + raise TooManySymmetriesError( + "Too many symmetries between %s and %s" % ( + str(model_ligand), str(target_ligand))) + symmetries.append((list(isomorphism.values()), + list(isomorphism.keys()))) + assert len(symmetries) > 0 + LogDebug("Found %s isomorphic mappings (symmetries)" % len(symmetries)) + elif gm.subgraph_is_isomorphic() and substructure_match: + if not return_symmetries: + return True + symmetries = [] + for i, isomorphism in enumerate(gm.subgraph_isomorphisms_iter()): + if i >= max_symmetries: + raise TooManySymmetriesError( + "Too many symmetries between %s and %s" % ( + str(model_ligand), str(target_ligand))) + symmetries.append((list(isomorphism.values()), + list(isomorphism.keys()))) + assert len(symmetries) > 0 + # Assert that all the atoms in the target are part of the substructure + assert len(symmetries[0][0]) == len(target_ligand.atoms) + n_sym = len(symmetries) + LogDebug("Found %s subgraph isomorphisms (symmetries)" % n_sym) + elif gm.subgraph_is_isomorphic(): + LogDebug("Found subgraph isomorphisms (symmetries), but" + " ignoring because substructure_match=False") + raise NoIsomorphicSymmetryError("No symmetry between %s and %s" % ( + str(model_ligand), str(target_ligand))) + else: + LogDebug("Found no isomorphic mappings (symmetries)") + raise NoSymmetryError("No symmetry between %s and %s" % ( + str(model_ligand), str(target_ligand))) + + return symmetries + +class NoSymmetryError(ValueError): + """ Exception raised when no symmetry can be found. + """ + pass + +class NoIsomorphicSymmetryError(ValueError): + """ Exception raised when no isomorphic symmetry can be found + + There would be isomorphic subgraphs for which symmetries can + be found, but substructure_match is disabled + """ + pass + +class TooManySymmetriesError(ValueError): + """ Exception raised when too many symmetries are found. + """ + pass + +class DisconnectedGraphError(Exception): + """ Exception raised when the ligand graph is disconnected. + """ + pass + +# specify public interface +__all__ = ('LigandScorer', 'ComputeSymmetries', 'NoSymmetryError', + 'NoIsomorphicSymmetryError', 'TooManySymmetriesError', + 'DisconnectedGraphError') diff --git a/modules/mol/alg/pymod/ligand_scoring_lddtpli.py b/modules/mol/alg/pymod/ligand_scoring_lddtpli.py new file mode 100644 index 0000000000000000000000000000000000000000..f4ae9c4c6c27c4879119f2ade73ebe7f07e4fcbc --- /dev/null +++ b/modules/mol/alg/pymod/ligand_scoring_lddtpli.py @@ -0,0 +1,958 @@ +import numpy as np + +from ost import LogWarning +from ost import geom +from ost import mol +from ost import seq + +from ost.mol.alg import lddt +from ost.mol.alg import chain_mapping +from ost.mol.alg import ligand_scoring_base + +class LDDTPLIScorer(ligand_scoring_base.LigandScorer): + """ :class:`LigandScorer` implementing lDDT-PLI. + + lDDT-PLI is an lDDT score considering contacts between ligand and + receptor. Where receptor consists of protein and nucleic acid chains that + pass the criteria for :class:`chain mapping <ost.mol.alg.chain_mapping>`. + This means ignoring other ligands, waters, short polymers as well as any + incorrectly connected chains that may be in proximity. + + :class:`LDDTPLIScorer` computes a score for a specific pair of target/model + ligands. Given a target/model ligand pair, all possible mappings of + model chains onto their chemically equivalent target chains are enumerated. + For each of these enumerations, all possible symmetries, i.e. atom-atom + assignments of the ligand as given by :class:`LigandScorer`, are evaluated + and an lDDT-PLI score is computed. The best possible lDDT-PLI score is + returned. + + By default, classic lDDT is computed. That means, contacts within + *lddt_pli_radius* are identified in the target and checked if they're + conserved in the model. Added contacts are not penalized. That means if + the ligand is nicely placed in the correct pocket, but that pocket now + suddenly interacts with MORE residues in the model, you still get a high + score. You can penalize for these added contacts with the + *add_mdl_contacts* flag. This additionally considers contacts within + *lddt_pli_radius* in the model but only if the involved atoms can + be mapped to the target. This is a requirement to 1) extract the respective + reference distance from the target 2) avoid usage of contacts for which + we have no experimental evidence. One special case are + contacts from chains that are NOT mapped to the target binding site. It is + very well possible that we have experimental evidence for this chain though + its just too far away from the target binding site. + We therefore try to map these contacts to the chain in the target with + equivalent sequence that is closest to the target binding site. If the + respective atoms can be mapped there, the contact is considered not + fulfilled and added as penalty. + + Populates :attr:`LigandScorer.aux_data` with following :class:`dict` keys: + + * lddt_pli: The score + * lddt_pli_n_contacts: Number of contacts considered in lDDT computation + * target_ligand: The actual target ligand for which the score was computed + * model_ligand: The actual model ligand for which the score was computed + * bs_ref_res: :class:`set` of residues with potentially non-zero + contribution to score. That is every residue with at least one + atom within *lddt_pli_radius* + max(*lddt_pli_thresholds*) of + the ligand. + * bs_mdl_res: Same for model + + :param model: Passed to parent constructor - see :class:`LigandScorer`. + :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param target: Passed to parent constructor - see :class:`LigandScorer`. + :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param model_ligands: Passed to parent constructor - see + :class:`LigandScorer`. + :type model_ligands: :class:`list` + :param target_ligands: Passed to parent constructor - see + :class:`LigandScorer`. + :type target_ligands: :class:`list` + :param resnum_alignments: Passed to parent constructor - see + :class:`LigandScorer`. + :type resnum_alignments: :class:`bool` + :param rename_ligand_chain: Passed to parent constructor - see + :class:`LigandScorer`. + :type rename_ligand_chain: :class:`bool` + :param substructure_match: Passed to parent constructor - see + :class:`LigandScorer`. + :type substructure_match: :class:`bool` + :param coverage_delta: Passed to parent constructor - see + :class:`LigandScorer`. + :type coverage_delta: :class:`float` + :param max_symmetries: Passed to parent constructor - see + :class:`LigandScorer`. + :type max_symmetries: :class:`int` + :param check_resnames: On by default. Enforces residue name matches + between mapped model and target residues. + :type check_resnames: :class:`bool` + :param lddt_pli_radius: lDDT inclusion radius for lDDT-PLI. + :type lddt_pli_radius: :class:`float` + :param add_mdl_contacts: Whether to add mdl contacts. + :type add_mdl_contacts: :class:`bool` + :param lddt_pli_thresholds: Distance difference thresholds for lDDT. + :type lddt_pli_thresholds: :class:`list` of :class:`float` + :param lddt_pli_binding_site_radius: Pro param - dont use. Providing a value + Restores behaviour from previous + implementation that first extracted a + binding site with strict distance + threshold and computed lDDT-PLI only on + those target residues whereas the + current implementation includes every + atom within *lddt_pli_radius*. + :type lddt_pli_binding_site_radius: :class:`float` + """ + + def __init__(self, model, target, model_ligands=None, target_ligands=None, + resnum_alignments=False, rename_ligand_chain=False, + substructure_match=False, coverage_delta=0.2, + max_symmetries=1e5, check_resnames=True, lddt_pli_radius=6.0, + add_mdl_contacts=False, + lddt_pli_thresholds = [0.5, 1.0, 2.0, 4.0], + lddt_pli_binding_site_radius=None): + + super().__init__(model, target, model_ligands = model_ligands, + target_ligands = target_ligands, + resnum_alignments = resnum_alignments, + rename_ligand_chain = rename_ligand_chain, + substructure_match = substructure_match, + coverage_delta = coverage_delta, + max_symmetries = max_symmetries) + + self.check_resnames = check_resnames + self.lddt_pli_radius = lddt_pli_radius + self.add_mdl_contacts = add_mdl_contacts + self.lddt_pli_thresholds = lddt_pli_thresholds + self.lddt_pli_binding_site_radius = lddt_pli_binding_site_radius + + # lazily precomputed variables to speedup lddt-pli computation + self._lddt_pli_target_data = dict() + self._lddt_pli_model_data = dict() + self.__mappable_atoms = None + self.__chem_mapping = None + self.__chem_group_alns = None + self.__ref_mdl_alns = None + self.__chain_mapping_mdl = None + + # update state decoding from parent with subclass specific stuff + self.state_decoding[10] = ("no_contact", + "There were no lDDT contacts between the " + "binding site and the ligand, and lDDT-PLI " + "is undefined.") + self.state_decoding[20] = ("unknown", + "Unknown error occured in LDDTPLIScorer") + + def _compute(self, symmetries, target_ligand, model_ligand): + """ Implements interface from parent + """ + if self.add_mdl_contacts: + result = self._compute_lddt_pli_add_mdl_contacts(symmetries, + target_ligand, + model_ligand) + else: + result = self._compute_lddt_pli_classic(symmetries, + target_ligand, + model_ligand) + + pair_state = 0 + score = result["lddt_pli"] + + if score is None or np.isnan(score): + if result["lddt_pli_n_contacts"] == 0: + # it's a space ship! + pair_state = 10 + else: + # unknwon error state + pair_state = 20 + + # the ligands get a zero-state... + target_ligand_state = 0 + model_ligand_state = 0 + + return (score, pair_state, target_ligand_state, model_ligand_state, + result) + + def _score_dir(self): + """ Implements interface from parent + """ + return '+' + + def _compute_lddt_pli_add_mdl_contacts(self, symmetries, target_ligand, + model_ligand): + + ############################### + # Get stuff from model/target # + ############################### + + trg_residues, trg_bs, trg_chains, trg_ligand_chain, \ + trg_ligand_res, scorer, chem_groups = \ + self._lddt_pli_get_trg_data(target_ligand) + + trg_bs_center = trg_bs.geometric_center + + # Copy to make sure that we don't change anything on underlying + # references + # This is not strictly necessary in the current implementation but + # hey, maybe it avoids hard to debug errors when someone changes things + ref_indices = [a.copy() for a in scorer.ref_indices_ic] + ref_distances = [a.copy() for a in scorer.ref_distances_ic] + + # distance hacking... remove any interchain distance except the ones + # with the ligand + ligand_start_idx = scorer.chain_start_indices[-1] + for at_idx in range(ligand_start_idx): + mask = ref_indices[at_idx] >= ligand_start_idx + ref_indices[at_idx] = ref_indices[at_idx][mask] + ref_distances[at_idx] = ref_distances[at_idx][mask] + + mdl_residues, mdl_bs, mdl_chains, mdl_ligand_chain, mdl_ligand_res, \ + chem_mapping = self._lddt_pli_get_mdl_data(model_ligand) + + if len(mdl_chains) == 0 or len(trg_chains) == 0: + # It's a spaceship! + return {"lddt_pli": None, + "lddt_pli_n_contacts": 0, + "target_ligand": target_ligand, + "model_ligand": model_ligand, + "bs_ref_res": trg_residues, + "bs_mdl_res": mdl_residues} + + #################### + # Setup alignments # + #################### + + # ref_mdl_alns refers to full chain mapper trg and mdl structures + # => need to adapt mdl sequence that only contain residues in contact + # with ligand + cut_ref_mdl_alns = self._lddt_pli_cut_ref_mdl_alns(chem_groups, + chem_mapping, + mdl_bs, trg_bs) + + ######################################## + # Setup cache for added model contacts # + ######################################## + + # get each chain mapping that we ever observe in scoring + chain_mappings = list(chain_mapping._ChainMappings(chem_groups, + chem_mapping)) + + # for each mdl ligand atom, we collect all trg ligand atoms that are + # ever mapped onto it given *symmetries* + ligand_atom_mappings = [set() for a in mdl_ligand_res.atoms] + for (trg_sym, mdl_sym) in symmetries: + for trg_i, mdl_i in zip(trg_sym, mdl_sym): + ligand_atom_mappings[mdl_i].add(trg_i) + + mdl_ligand_pos = np.zeros((mdl_ligand_res.GetAtomCount(), 3)) + for a_idx, a in enumerate(mdl_ligand_res.atoms): + p = a.GetPos() + mdl_ligand_pos[a_idx, 0] = p[0] + mdl_ligand_pos[a_idx, 1] = p[1] + mdl_ligand_pos[a_idx, 2] = p[2] + + trg_ligand_pos = np.zeros((trg_ligand_res.GetAtomCount(), 3)) + for a_idx, a in enumerate(trg_ligand_res.atoms): + p = a.GetPos() + trg_ligand_pos[a_idx, 0] = p[0] + trg_ligand_pos[a_idx, 1] = p[1] + trg_ligand_pos[a_idx, 2] = p[2] + + mdl_lig_hashes = [a.hash_code for a in mdl_ligand_res.atoms] + + symmetric_atoms = np.asarray(sorted(list(scorer.symmetric_atoms)), + dtype=np.int64) + + # two caches to cache things for each chain mapping => lists + # of len(chain_mappings) + # + # In principle we're caching for each trg/mdl ligand atom pair all + # information to update ref_indices/ref_distances and resolving the + # symmetries of the binding site. + # in detail: each list entry in *scoring_cache* is a dict with + # key: (mdl_lig_at_idx, trg_lig_at_idx) + # value: tuple with 4 elements - 1: indices of atoms representing added + # contacts relative to overall inexing scheme in scorer 2: the + # respective distances 3: the same but only containing indices towards + # atoms of the binding site that are considered symmetric 4: the + # respective indices. + # each list entry in *penalty_cache* is a list of len N mdl lig atoms. + # For each mdl lig at it contains a penalty for this mdl lig at. That + # means the number of contacts in the mdl binding site that can + # directly be mapped to the target given the local chain mapping but + # are not present in the target binding site, i.e. interacting atoms are + # too far away. + scoring_cache = list() + penalty_cache = list() + + for mapping in chain_mappings: + + # flat mapping with mdl chain names as key + flat_mapping = dict() + for trg_chem_group, mdl_chem_group in zip(chem_groups, mapping): + for a,b in zip(trg_chem_group, mdl_chem_group): + if a is not None and b is not None: + flat_mapping[b] = a + + # for each mdl bs atom (as atom hash), the trg bs atoms (as index in + # scorer) + bs_atom_mapping = dict() + for mdl_cname, ref_cname in flat_mapping.items(): + aln = cut_ref_mdl_alns[(ref_cname, mdl_cname)] + ref_ch = trg_bs.Select(f"cname={mol.QueryQuoteName(ref_cname)}") + mdl_ch = mdl_bs.Select(f"cname={mol.QueryQuoteName(mdl_cname)}") + aln.AttachView(0, ref_ch) + aln.AttachView(1, mdl_ch) + for col in aln: + ref_r = col.GetResidue(0) + mdl_r = col.GetResidue(1) + if ref_r.IsValid() and mdl_r.IsValid(): + for mdl_a in mdl_r.atoms: + ref_a = ref_r.FindAtom(mdl_a.GetName()) + if ref_a.IsValid(): + ref_h = ref_a.handle.hash_code + if ref_h in scorer.atom_indices: + mdl_h = mdl_a.handle.hash_code + bs_atom_mapping[mdl_h] = \ + scorer.atom_indices[ref_h] + + cache = dict() + n_penalties = list() + + for mdl_a_idx, mdl_a in enumerate(mdl_ligand_res.atoms): + n_penalty = 0 + trg_bs_indices = list() + close_a = mdl_bs.FindWithin(mdl_a.GetPos(), + self.lddt_pli_radius) + for a in close_a: + mdl_a_hash_code = a.hash_code + if mdl_a_hash_code in bs_atom_mapping: + trg_bs_indices.append(bs_atom_mapping[mdl_a_hash_code]) + elif mdl_a_hash_code not in mdl_lig_hashes: + if a.GetChain().GetName() in flat_mapping: + # Its in a mapped chain + at_key = (a.GetResidue().GetNumber(), a.name) + cname = a.GetChain().name + cname_key = (flat_mapping[cname], cname) + if at_key in self._mappable_atoms[cname_key]: + # Its a contact in the model but not part of + # trg_bs. It can still be mapped using the + # global mdl_ch/ref_ch alignment + # d in ref > self.lddt_pli_radius + max_thresh + # => guaranteed to be non-fulfilled contact + n_penalty += 1 + + n_penalties.append(n_penalty) + + trg_bs_indices = np.asarray(sorted(trg_bs_indices)) + + for trg_a_idx in ligand_atom_mappings[mdl_a_idx]: + # mask selects entries in trg_bs_indices that are not yet + # part of classic lDDT ref_indices for atom at trg_a_idx + # => added mdl contacts + mask = np.isin(trg_bs_indices, + ref_indices[ligand_start_idx + trg_a_idx], + assume_unique=True, invert=True) + added_indices = np.asarray([], dtype=np.int64) + added_distances = np.asarray([], dtype=np.float64) + if np.sum(mask) > 0: + # compute ref distances on reference positions + added_indices = trg_bs_indices[mask] + tmp = scorer.positions.take(added_indices, axis=0) + np.subtract(tmp, trg_ligand_pos[trg_a_idx][None, :], + out=tmp) + np.square(tmp, out=tmp) + tmp = tmp.sum(axis=1) + np.sqrt(tmp, out=tmp) + added_distances = tmp + + # extract the distances towards bs atoms that are symmetric + sym_mask = np.isin(added_indices, symmetric_atoms, + assume_unique=True) + + cache[(mdl_a_idx, trg_a_idx)] = (added_indices, + added_distances, + added_indices[sym_mask], + added_distances[sym_mask]) + + scoring_cache.append(cache) + penalty_cache.append(n_penalties) + + # cache for model contacts towards non mapped trg chains - this is + # relevant for self._lddt_pli_unmapped_chain_penalty + # key: tuple in form (trg_ch, mdl_ch) + # value: yet another dict with + # key: ligand_atom_hash + # value: n contacts towards respective trg chain that can be mapped + non_mapped_cache = dict() + + ############################################################### + # compute lDDT for all possible chain mappings and symmetries # + ############################################################### + + best_score = -1.0 + best_result = {"lddt_pli": None, + "lddt_pli_n_contacts": 0} + + # dummy alignment for ligand chains which is needed as input later on + ligand_aln = seq.CreateAlignment() + trg_s = seq.CreateSequence(trg_ligand_chain.name, + trg_ligand_res.GetOneLetterCode()) + mdl_s = seq.CreateSequence(mdl_ligand_chain.name, + mdl_ligand_res.GetOneLetterCode()) + ligand_aln.AddSequence(trg_s) + ligand_aln.AddSequence(mdl_s) + ligand_at_indices = list(range(ligand_start_idx, scorer.n_atoms)) + + sym_idx_collector = [None] * scorer.n_atoms + sym_dist_collector = [None] * scorer.n_atoms + + for mapping, s_cache, p_cache in zip(chain_mappings, scoring_cache, + penalty_cache): + + lddt_chain_mapping = dict() + lddt_alns = dict() + for ref_chem_group, mdl_chem_group in zip(chem_groups, mapping): + for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group): + # some mdl chains can be None + if mdl_ch is not None: + lddt_chain_mapping[mdl_ch] = ref_ch + lddt_alns[mdl_ch] = cut_ref_mdl_alns[(ref_ch, mdl_ch)] + + # add ligand to lddt_chain_mapping/lddt_alns + lddt_chain_mapping[mdl_ligand_chain.name] = trg_ligand_chain.name + lddt_alns[mdl_ligand_chain.name] = ligand_aln + + # already process model, positions will be manually hacked for each + # symmetry - small overhead for variables that are thrown away here + pos, _, _, _, _, _, lddt_symmetries = \ + scorer._ProcessModel(mdl_bs, lddt_chain_mapping, + residue_mapping = lddt_alns, + thresholds = self.lddt_pli_thresholds, + check_resnames = self.check_resnames) + + # estimate a penalty for unsatisfied model contacts from chains + # that are not in the local trg binding site, but can be mapped in + # the target. + # We're using the trg chain with the closest geometric center to + # the trg binding site that can be mapped to the mdl chain + # according the chem mapping. An alternative would be to search for + # the target chain with the minimal number of additional contacts. + # There is not good solution for this problem... + unmapped_chains = list() + already_mapped = set() + for mdl_ch in mdl_chains: + if mdl_ch not in lddt_chain_mapping: + # check which chain in trg is closest + chem_grp_idx = None + for i, m in enumerate(self._chem_mapping): + if mdl_ch in m: + chem_grp_idx = i + break + if chem_grp_idx is None: + raise RuntimeError("This should never happen... " + "ask Gabriel...") + closest_ch = None + closest_dist = None + for trg_ch in self._chain_mapper.chem_groups[chem_grp_idx]: + if trg_ch not in lddt_chain_mapping.values(): + if trg_ch not in already_mapped: + ch = self._chain_mapper.target.FindChain(trg_ch) + c = ch.geometric_center + d = geom.Distance(trg_bs_center, c) + if closest_dist is None or d < closest_dist: + closest_dist = d + closest_ch = trg_ch + if closest_ch is not None: + unmapped_chains.append((closest_ch, mdl_ch)) + already_mapped.add(closest_ch) + + for (trg_sym, mdl_sym) in symmetries: + + # update positions + for mdl_i, trg_i in zip(mdl_sym, trg_sym): + pos[ligand_start_idx + trg_i, :] = mdl_ligand_pos[mdl_i, :] + + # start new ref_indices/ref_distances from original values + funky_ref_indices = [np.copy(a) for a in ref_indices] + funky_ref_distances = [np.copy(a) for a in ref_distances] + + # The only distances from the binding site towards the ligand + # we care about are the ones from the symmetric atoms to + # correctly compute scorer._ResolveSymmetries. + # We collect them while updating distances from added mdl + # contacts + for idx in symmetric_atoms: + sym_idx_collector[idx] = list() + sym_dist_collector[idx] = list() + + # add data from added mdl contacts cache + added_penalty = 0 + for mdl_i, trg_i in zip(mdl_sym, trg_sym): + added_penalty += p_cache[mdl_i] + cache = s_cache[mdl_i, trg_i] + full_trg_i = ligand_start_idx + trg_i + funky_ref_indices[full_trg_i] = \ + np.append(funky_ref_indices[full_trg_i], cache[0]) + funky_ref_distances[full_trg_i] = \ + np.append(funky_ref_distances[full_trg_i], cache[1]) + for idx, d in zip(cache[2], cache[3]): + sym_idx_collector[idx].append(full_trg_i) + sym_dist_collector[idx].append(d) + + for idx in symmetric_atoms: + funky_ref_indices[idx] = \ + np.append(funky_ref_indices[idx], + np.asarray(sym_idx_collector[idx], + dtype=np.int64)) + funky_ref_distances[idx] = \ + np.append(funky_ref_distances[idx], + np.asarray(sym_dist_collector[idx], + dtype=np.float64)) + + # we can pass funky_ref_indices/funky_ref_distances as + # sym_ref_indices/sym_ref_distances in + # scorer._ResolveSymmetries as we only have distances of the bs + # to the ligand and ligand atoms are "non-symmetric" + scorer._ResolveSymmetries(pos, self.lddt_pli_thresholds, + lddt_symmetries, + funky_ref_indices, + funky_ref_distances) + + N = sum([len(funky_ref_indices[i]) for i in ligand_at_indices]) + N += added_penalty + + # collect number of expected contacts which can be mapped + if len(unmapped_chains) > 0: + N += self._lddt_pli_unmapped_chain_penalty(unmapped_chains, + non_mapped_cache, + mdl_bs, + mdl_ligand_res, + mdl_sym) + + conserved = np.sum(scorer._EvalAtoms(pos, ligand_at_indices, + self.lddt_pli_thresholds, + funky_ref_indices, + funky_ref_distances), + axis=0) + score = None + if N > 0: + score = np.mean(conserved/N) + + if score is not None and score > best_score: + best_score = score + best_result = {"lddt_pli": score, + "lddt_pli_n_contacts": N} + + # fill misc info to result object + best_result["target_ligand"] = target_ligand + best_result["model_ligand"] = model_ligand + best_result["bs_ref_res"] = trg_residues + best_result["bs_mdl_res"] = mdl_residues + + return best_result + + + def _compute_lddt_pli_classic(self, symmetries, target_ligand, + model_ligand): + + ############################### + # Get stuff from model/target # + ############################### + + max_r = None + if self.lddt_pli_binding_site_radius: + max_r = self.lddt_pli_binding_site_radius + + trg_residues, trg_bs, trg_chains, trg_ligand_chain, \ + trg_ligand_res, scorer, chem_groups = \ + self._lddt_pli_get_trg_data(target_ligand, max_r = max_r) + + # Copy to make sure that we don't change anything on underlying + # references + # This is not strictly necessary in the current implementation but + # hey, maybe it avoids hard to debug errors when someone changes things + ref_indices = [a.copy() for a in scorer.ref_indices_ic] + ref_distances = [a.copy() for a in scorer.ref_distances_ic] + + # no matter what mapping/symmetries, the number of expected + # contacts stays the same + ligand_start_idx = scorer.chain_start_indices[-1] + ligand_at_indices = list(range(ligand_start_idx, scorer.n_atoms)) + n_exp = sum([len(ref_indices[i]) for i in ligand_at_indices]) + + mdl_residues, mdl_bs, mdl_chains, mdl_ligand_chain, mdl_ligand_res, \ + chem_mapping = self._lddt_pli_get_mdl_data(model_ligand) + + if n_exp == 0: + # no contacts... nothing to compute... + return {"lddt_pli": None, + "lddt_pli_n_contacts": 0, + "target_ligand": target_ligand, + "model_ligand": model_ligand, + "bs_ref_res": trg_residues, + "bs_mdl_res": mdl_residues} + + # Distance hacking... remove any interchain distance except the ones + # with the ligand + for at_idx in range(ligand_start_idx): + mask = ref_indices[at_idx] >= ligand_start_idx + ref_indices[at_idx] = ref_indices[at_idx][mask] + ref_distances[at_idx] = ref_distances[at_idx][mask] + + #################### + # Setup alignments # + #################### + + # ref_mdl_alns refers to full chain mapper trg and mdl structures + # => need to adapt mdl sequence that only contain residues in contact + # with ligand + cut_ref_mdl_alns = self._lddt_pli_cut_ref_mdl_alns(chem_groups, + chem_mapping, + mdl_bs, trg_bs) + + ############################################################### + # compute lDDT for all possible chain mappings and symmetries # + ############################################################### + + best_score = -1.0 + + # dummy alignment for ligand chains which is needed as input later on + l_aln = seq.CreateAlignment() + l_aln.AddSequence(seq.CreateSequence(trg_ligand_chain.name, + trg_ligand_res.GetOneLetterCode())) + l_aln.AddSequence(seq.CreateSequence(mdl_ligand_chain.name, + mdl_ligand_res.GetOneLetterCode())) + + mdl_ligand_pos = np.zeros((model_ligand.GetAtomCount(), 3)) + for a_idx, a in enumerate(model_ligand.atoms): + p = a.GetPos() + mdl_ligand_pos[a_idx, 0] = p[0] + mdl_ligand_pos[a_idx, 1] = p[1] + mdl_ligand_pos[a_idx, 2] = p[2] + + for mapping in chain_mapping._ChainMappings(chem_groups, chem_mapping): + + lddt_chain_mapping = dict() + lddt_alns = dict() + for ref_chem_group, mdl_chem_group in zip(chem_groups, mapping): + for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group): + # some mdl chains can be None + if mdl_ch is not None: + lddt_chain_mapping[mdl_ch] = ref_ch + lddt_alns[mdl_ch] = cut_ref_mdl_alns[(ref_ch, mdl_ch)] + + # add ligand to lddt_chain_mapping/lddt_alns + lddt_chain_mapping[mdl_ligand_chain.name] = trg_ligand_chain.name + lddt_alns[mdl_ligand_chain.name] = l_aln + + # already process model, positions will be manually hacked for each + # symmetry - small overhead for variables that are thrown away here + pos, _, _, _, _, _, lddt_symmetries = \ + scorer._ProcessModel(mdl_bs, lddt_chain_mapping, + residue_mapping = lddt_alns, + thresholds = self.lddt_pli_thresholds, + check_resnames = self.check_resnames) + + for (trg_sym, mdl_sym) in symmetries: + for mdl_i, trg_i in zip(mdl_sym, trg_sym): + pos[ligand_start_idx + trg_i, :] = mdl_ligand_pos[mdl_i, :] + # we can pass ref_indices/ref_distances as + # sym_ref_indices/sym_ref_distances in + # scorer._ResolveSymmetries as we only have distances of the bs + # to the ligand and ligand atoms are "non-symmetric" + scorer._ResolveSymmetries(pos, self.lddt_pli_thresholds, + lddt_symmetries, + ref_indices, + ref_distances) + # compute number of conserved distances for ligand atoms + conserved = np.sum(scorer._EvalAtoms(pos, ligand_at_indices, + self.lddt_pli_thresholds, + ref_indices, + ref_distances), axis=0) + score = np.mean(conserved/n_exp) + + if score > best_score: + best_score = score + + # fill misc info to result object + best_result = {"lddt_pli": best_score, + "lddt_pli_n_contacts": n_exp, + "target_ligand": target_ligand, + "model_ligand": model_ligand, + "bs_ref_res": trg_residues, + "bs_mdl_res": mdl_residues} + + return best_result + + def _lddt_pli_unmapped_chain_penalty(self, unmapped_chains, + non_mapped_cache, + mdl_bs, + mdl_ligand_res, + mdl_sym): + + n_exp = 0 + for ch_tuple in unmapped_chains: + if ch_tuple not in non_mapped_cache: + # for each ligand atom, we count the number of mappable atoms + # within lddt_pli_radius + counts = dict() + # the select statement also excludes the ligand in mdl_bs + # as it resides in a separate chain + mdl_cname = ch_tuple[1] + query = "cname=" + mol.QueryQuoteName(mdl_cname) + mdl_bs_ch = mdl_bs.Select(query) + for a in mdl_ligand_res.atoms: + close_atoms = \ + mdl_bs_ch.FindWithin(a.GetPos(), self.lddt_pli_radius) + N = 0 + for close_a in close_atoms: + at_key = (close_a.GetResidue().GetNumber(), + close_a.GetName()) + if at_key in self._mappable_atoms[ch_tuple]: + N += 1 + counts[a.hash_code] = N + + # fill cache + non_mapped_cache[ch_tuple] = counts + + # add number of mdl contacts which can be mapped to target + # as non-fulfilled contacts + counts = non_mapped_cache[ch_tuple] + lig_hash_codes = [a.hash_code for a in mdl_ligand_res.atoms] + for i in mdl_sym: + n_exp += counts[lig_hash_codes[i]] + + return n_exp + + + def _lddt_pli_get_mdl_data(self, model_ligand): + if model_ligand not in self._lddt_pli_model_data: + + mdl = self._chain_mapping_mdl + + mdl_residues = set() + for at in model_ligand.atoms: + close_atoms = mdl.FindWithin(at.GetPos(), self.lddt_pli_radius) + for close_at in close_atoms: + mdl_residues.add(close_at.GetResidue()) + + max_r = self.lddt_pli_radius + max(self.lddt_pli_thresholds) + for r in mdl.residues: + r.SetIntProp("bs", 0) + for at in model_ligand.atoms: + close_atoms = mdl.FindWithin(at.GetPos(), max_r) + for close_at in close_atoms: + close_at.GetResidue().SetIntProp("bs", 1) + + mdl_bs = mol.CreateEntityFromView(mdl.Select("grbs:0=1"), True) + mdl_chains = set([ch.name for ch in mdl_bs.chains]) + + mdl_editor = mdl_bs.EditXCS(mol.BUFFERED_EDIT) + mdl_ligand_chain = None + for cname in ["hugo_the_cat_terminator", "ida_the_cheese_monster"]: + try: + # I'm pretty sure, one of these chain names is not there... + mdl_ligand_chain = mdl_editor.InsertChain(cname) + break + except: + pass + if mdl_ligand_chain is None: + raise RuntimeError("Fuck this, I'm out...") + mdl_ligand_res = mdl_editor.AppendResidue(mdl_ligand_chain, + model_ligand, + deep=True) + mdl_editor.RenameResidue(mdl_ligand_res, "LIG") + mdl_editor.SetResidueNumber(mdl_ligand_res, mol.ResNum(1)) + + chem_mapping = list() + for m in self._chem_mapping: + chem_mapping.append([x for x in m if x in mdl_chains]) + + self._lddt_pli_model_data[model_ligand] = (mdl_residues, + mdl_bs, + mdl_chains, + mdl_ligand_chain, + mdl_ligand_res, + chem_mapping) + + return self._lddt_pli_model_data[model_ligand] + + + def _lddt_pli_get_trg_data(self, target_ligand, max_r = None): + if target_ligand not in self._lddt_pli_target_data: + + trg = self._chain_mapper.target + + if max_r is None: + max_r = self.lddt_pli_radius + max(self.lddt_pli_thresholds) + + trg_residues = set() + for at in target_ligand.atoms: + close_atoms = trg.FindWithin(at.GetPos(), max_r) + for close_at in close_atoms: + trg_residues.add(close_at.GetResidue()) + + for r in trg.residues: + r.SetIntProp("bs", 0) + + for r in trg_residues: + r.SetIntProp("bs", 1) + + trg_bs = mol.CreateEntityFromView(trg.Select("grbs:0=1"), True) + trg_chains = set([ch.name for ch in trg_bs.chains]) + + trg_editor = trg_bs.EditXCS(mol.BUFFERED_EDIT) + trg_ligand_chain = None + for cname in ["hugo_the_cat_terminator", "ida_the_cheese_monster"]: + try: + # I'm pretty sure, one of these chain names is not there yet + trg_ligand_chain = trg_editor.InsertChain(cname) + break + except: + pass + if trg_ligand_chain is None: + raise RuntimeError("Fuck this, I'm out...") + + trg_ligand_res = trg_editor.AppendResidue(trg_ligand_chain, + target_ligand, + deep=True) + trg_editor.RenameResidue(trg_ligand_res, "LIG") + trg_editor.SetResidueNumber(trg_ligand_res, mol.ResNum(1)) + + compound_name = trg_ligand_res.name + compound = lddt.CustomCompound.FromResidue(trg_ligand_res) + custom_compounds = {compound_name: compound} + + scorer = lddt.lDDTScorer(trg_bs, + custom_compounds = custom_compounds, + inclusion_radius = self.lddt_pli_radius) + + chem_groups = list() + for g in self._chain_mapper.chem_groups: + chem_groups.append([x for x in g if x in trg_chains]) + + self._lddt_pli_target_data[target_ligand] = (trg_residues, + trg_bs, + trg_chains, + trg_ligand_chain, + trg_ligand_res, + scorer, + chem_groups) + + return self._lddt_pli_target_data[target_ligand] + + + def _lddt_pli_cut_ref_mdl_alns(self, chem_groups, chem_mapping, mdl_bs, + ref_bs): + cut_ref_mdl_alns = dict() + for ref_chem_group, mdl_chem_group in zip(chem_groups, chem_mapping): + for ref_ch in ref_chem_group: + + ref_bs_chain = ref_bs.FindChain(ref_ch) + query = "cname=" + mol.QueryQuoteName(ref_ch) + ref_view = self._chain_mapper.target.Select(query) + + for mdl_ch in mdl_chem_group: + aln = self._ref_mdl_alns[(ref_ch, mdl_ch)] + + aln.AttachView(0, ref_view) + + mdl_bs_chain = mdl_bs.FindChain(mdl_ch) + query = "cname=" + mol.QueryQuoteName(mdl_ch) + aln.AttachView(1, self._chain_mapping_mdl.Select(query)) + + cut_mdl_seq = ['-'] * aln.GetLength() + cut_ref_seq = ['-'] * aln.GetLength() + for i, col in enumerate(aln): + + # check ref residue + r = col.GetResidue(0) + if r.IsValid(): + bs_r = ref_bs_chain.FindResidue(r.GetNumber()) + if bs_r.IsValid(): + cut_ref_seq[i] = col[0] + + # check mdl residue + r = col.GetResidue(1) + if r.IsValid(): + bs_r = mdl_bs_chain.FindResidue(r.GetNumber()) + if bs_r.IsValid(): + cut_mdl_seq[i] = col[1] + + cut_ref_seq = ''.join(cut_ref_seq) + cut_mdl_seq = ''.join(cut_mdl_seq) + cut_aln = seq.CreateAlignment() + cut_aln.AddSequence(seq.CreateSequence(ref_ch, cut_ref_seq)) + cut_aln.AddSequence(seq.CreateSequence(mdl_ch, cut_mdl_seq)) + cut_ref_mdl_alns[(ref_ch, mdl_ch)] = cut_aln + return cut_ref_mdl_alns + + @property + def _mappable_atoms(self): + """ Stores mappable atoms given a chain mapping + + Store for each ref_ch,mdl_ch pair all mdl atoms that can be + mapped. Don't store mappable atoms as hashes but rather as tuple + (mdl_r.GetNumber(), mdl_a.GetName()). Reason for that is that one might + operate on Copied EntityHandle objects without corresponding hashes. + Given a tuple defining c_pair: (ref_cname, mdl_cname), one + can check if a certain atom is mappable by evaluating: + if (mdl_r.GetNumber(), mdl_a.GetName()) in self._mappable_atoms(c_pair) + """ + if self.__mappable_atoms is None: + self.__mappable_atoms = dict() + for (ref_cname, mdl_cname), aln in self._ref_mdl_alns.items(): + self._mappable_atoms[(ref_cname, mdl_cname)] = set() + ref_query = f"cname={mol.QueryQuoteName(ref_cname)}" + mdl_query = f"cname={mol.QueryQuoteName(mdl_cname)}" + ref_ch = self._chain_mapper.target.Select(ref_query) + mdl_ch = self._chain_mapping_mdl.Select(mdl_query) + aln.AttachView(0, ref_ch) + aln.AttachView(1, mdl_ch) + for col in aln: + ref_r = col.GetResidue(0) + mdl_r = col.GetResidue(1) + if ref_r.IsValid() and mdl_r.IsValid(): + for mdl_a in mdl_r.atoms: + if ref_r.FindAtom(mdl_a.name).IsValid(): + c_key = (ref_cname, mdl_cname) + at_key = (mdl_r.GetNumber(), mdl_a.name) + self.__mappable_atoms[c_key].add(at_key) + + return self.__mappable_atoms + + @property + def _chem_mapping(self): + if self.__chem_mapping is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chem_mapping + + @property + def _chem_group_alns(self): + if self.__chem_group_alns is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chem_group_alns + + @property + def _ref_mdl_alns(self): + if self.__ref_mdl_alns is None: + self.__ref_mdl_alns = \ + chain_mapping._GetRefMdlAlns(self._chain_mapper.chem_groups, + self._chain_mapper.chem_group_alignments, + self._chem_mapping, + self._chem_group_alns) + return self.__ref_mdl_alns + + @property + def _chain_mapping_mdl(self): + if self.__chain_mapping_mdl is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chain_mapping_mdl + +# specify public interface +__all__ = ('LDDTPLIScorer',) diff --git a/modules/mol/alg/pymod/ligand_scoring_scrmsd.py b/modules/mol/alg/pymod/ligand_scoring_scrmsd.py new file mode 100644 index 0000000000000000000000000000000000000000..c76a23597af8c7d46b3cdcd7bf3772e3525cdecf --- /dev/null +++ b/modules/mol/alg/pymod/ligand_scoring_scrmsd.py @@ -0,0 +1,461 @@ +import numpy as np + +from ost import LogWarning +from ost import geom +from ost import mol + +from ost.mol.alg import ligand_scoring_base + +class SCRMSDScorer(ligand_scoring_base.LigandScorer): + """ :class:`LigandScorer` implementing symmetry corrected RMSD. + + :class:`SCRMSDScorer` computes a score for a specific pair of target/model + ligands. + + The returned RMSD is based on a binding site superposition. + The binding site of the target structure is defined as all residues with at + least one atom within *bs_radius* around the target ligand. + It only contains protein and nucleic acid residues from chains that + pass the criteria for the + :class:`chain mapping <ost.mol.alg.chain_mapping>`. This means ignoring + other ligands, waters, short polymers as well as any incorrectly connected + chains that may be in proximity. + The respective model binding site for superposition is identified by + naively enumerating all possible mappings of model chains onto their + chemically equivalent target counterparts from the target binding site. + The *binding_sites_topn* with respect to lDDT score are evaluated and + an RMSD is computed. + You can either try to map ALL model chains onto the target binding site by + enabling *full_bs_search* or restrict the model chains for a specific + target/model ligand pair to the chains with at least one atom within + *model_bs_radius* around the model ligand. The latter can be significantly + faster in case of large complexes. + Symmetry correction is achieved by simply computing an RMSD value for + each symmetry, i.e. atom-atom assignments of the ligand as given by + :class:`LigandScorer`. The lowest RMSD value is returned + + Populates :attr:`LigandScorer.aux_data` with following :class:`dict` keys: + + * rmsd: The score + * lddt_lp: lDDT of the binding site used for superposition + * bs_ref_res: :class:`list` of binding site residues in target + * bs_ref_res_mapped: :class:`list` of target binding site residues that + are mapped to model + * bs_mdl_res_mapped: :class:`list` of same length with respective model + residues + * bb_rmsd: Backbone RMSD (CA, C3' for nucleotides) for mapped residues + given transform + * target_ligand: The actual target ligand for which the score was computed + * model_ligand: The actual model ligand for which the score was computed + * chain_mapping: :class:`dict` with a chain mapping of chains involved in + binding site - key: trg chain name, value: mdl chain name + * transform: :class:`geom.Mat4` to transform model binding site onto target + binding site + * inconsistent_residues: :class:`list` of :class:`tuple` representing + residues with inconsistent residue names upon mapping (which is given by + bs_ref_res_mapped and bs_mdl_res_mapped). Tuples have two elements: + 1) trg residue 2) mdl residue + + :param model: Passed to parent constructor - see :class:`LigandScorer`. + :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param target: Passed to parent constructor - see :class:`LigandScorer`. + :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView` + :param model_ligands: Passed to parent constructor - see + :class:`LigandScorer`. + :type model_ligands: :class:`list` + :param target_ligands: Passed to parent constructor - see + :class:`LigandScorer`. + :type target_ligands: :class:`list` + :param resnum_alignments: Passed to parent constructor - see + :class:`LigandScorer`. + :type resnum_alignments: :class:`bool` + :param rename_ligand_chain: Passed to parent constructor - see + :class:`LigandScorer`. + :type rename_ligand_chain: :class:`bool` + :param substructure_match: Passed to parent constructor - see + :class:`LigandScorer`. + :type substructure_match: :class:`bool` + :param coverage_delta: Passed to parent constructor - see + :class:`LigandScorer`. + :type coverage_delta: :class:`float` + :param max_symmetries: Passed to parent constructor - see + :class:`LigandScorer`. + :type max_symmetries: :class:`int` + :param bs_radius: Inclusion radius for the binding site. Residues with + atoms within this distance of the ligand will be considered + for inclusion in the binding site. + :type bs_radius: :class:`float` + :param lddt_lp_radius: lDDT inclusion radius for lDDT-LP. + :type lddt_lp_radius: :class:`float` + :param model_bs_radius: inclusion radius for model binding sites. + Only used when full_bs_search=False, otherwise the + radius is effectively infinite. Only chains with + atoms within this distance of a model ligand will + be considered in the chain mapping. + :type model_bs_radius: :class:`float` + :param binding_sites_topn: maximum number of model binding site + representations to assess per target binding + site. + :type binding_sites_topn: :class:`int` + :param full_bs_search: If True, all potential binding sites in the model + are searched for each target binding site. If False, + the search space in the model is reduced to chains + around (`model_bs_radius` Å) model ligands. + This speeds up computations, but may result in + ligands not being scored if the predicted ligand + pose is too far from the actual binding site. + :type full_bs_search: :class:`bool` + """ + def __init__(self, model, target, model_ligands=None, target_ligands=None, + resnum_alignments=False, rename_ligand_chain=False, + substructure_match=False, coverage_delta=0.2, + max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0, + model_bs_radius=25, binding_sites_topn=100000, + full_bs_search=False): + + super().__init__(model, target, model_ligands = model_ligands, + target_ligands = target_ligands, + resnum_alignments = resnum_alignments, + rename_ligand_chain = rename_ligand_chain, + substructure_match = substructure_match, + coverage_delta = coverage_delta, + max_symmetries = max_symmetries) + + self.bs_radius = bs_radius + self.lddt_lp_radius = lddt_lp_radius + self.model_bs_radius = model_bs_radius + self.binding_sites_topn = binding_sites_topn + self.full_bs_search = full_bs_search + + # Residues that are in contact with a ligand => binding site + # defined as all residues with at least one atom within self.radius + # key: ligand.handle.hash_code, value: EntityView of whatever + # entity ligand belongs to + self._binding_sites = dict() + + # cache for GetRepr chain mapping calls + self._repr = dict() + + # lazily precomputed variables to speedup GetRepr chain mapping calls + # for localized GetRepr searches + self.__chem_mapping = None + self.__chem_group_alns = None + self.__ref_mdl_alns = None + self.__chain_mapping_mdl = None + self._get_repr_input = dict() + + # update state decoding from parent with subclass specific stuff + self.state_decoding[10] = ("binding_site", + "No residues were in proximity of the " + "target ligand.") + self.state_decoding[11] = ("model_representation", "No representation " + "of the reference binding site was found in " + "the model, i.e. the binding site was not " + "modeled or the model ligand was positioned " + "too far in combination with " + "full_bs_search=False.") + self.state_decoding[20] = ("unknown", + "Unknown error occured in SCRMSDScorer") + + def _compute(self, symmetries, target_ligand, model_ligand): + """ Implements interface from parent + """ + # set default to invalid scores + best_rmsd_result = {"rmsd": None, + "lddt_lp": None, + "bs_ref_res": list(), + "bs_ref_res_mapped": list(), + "bs_mdl_res_mapped": list(), + "bb_rmsd": None, + "target_ligand": target_ligand, + "model_ligand": model_ligand, + "chain_mapping": dict(), + "transform": geom.Mat4(), + "inconsistent_residues": list()} + + representations = self._get_repr(target_ligand, model_ligand) + + for r in representations: + rmsd = _SCRMSD_symmetries(symmetries, model_ligand, + target_ligand, transformation=r.transform) + + if best_rmsd_result["rmsd"] is None or \ + rmsd < best_rmsd_result["rmsd"]: + best_rmsd_result = {"rmsd": rmsd, + "lddt_lp": r.lDDT, + "bs_ref_res": r.substructure.residues, + "bs_ref_res_mapped": r.ref_residues, + "bs_mdl_res_mapped": r.mdl_residues, + "bb_rmsd": r.bb_rmsd, + "target_ligand": target_ligand, + "model_ligand": model_ligand, + "chain_mapping": r.GetFlatChainMapping(), + "transform": r.transform, + "inconsistent_residues": + r.inconsistent_residues} + + target_ligand_state = 0 + model_ligand_state = 0 + pair_state = 0 + + if best_rmsd_result["rmsd"] is not None: + best_rmsd = best_rmsd_result["rmsd"] + else: + # try to identify error states + best_rmsd = np.nan + error_state = 20 # unknown error + N = self._get_target_binding_site(target_ligand).GetResidueCount() + if N == 0: + pair_state = 6 # binding_site + target_ligand_state = 10 + elif len(representations) == 0: + pair_state = 11 + + return (best_rmsd, pair_state, target_ligand_state, model_ligand_state, + best_rmsd_result) + + def _score_dir(self): + """ Implements interface from parent + """ + return '-' + + def _get_repr(self, target_ligand, model_ligand): + + key = None + if self.full_bs_search: + # all possible binding sites, independent from actual model ligand + key = (target_ligand.handle.hash_code, 0) + else: + key = (target_ligand.handle.hash_code, + model_ligand.handle.hash_code) + + if key not in self._repr: + ref_bs = self._get_target_binding_site(target_ligand) + if self.full_bs_search: + reprs = self._chain_mapper.GetRepr( + ref_bs, self.model, inclusion_radius=self.lddt_lp_radius, + topn=self.binding_sites_topn) + else: + repr_in = self._get_get_repr_input(model_ligand) + radius = self.lddt_lp_radius + reprs = self._chain_mapper.GetRepr(ref_bs, self.model, + inclusion_radius=radius, + topn=self.binding_sites_topn, + chem_mapping_result=repr_in) + self._repr[key] = reprs + + return self._repr[key] + + def _get_target_binding_site(self, target_ligand): + + if target_ligand.handle.hash_code not in self._binding_sites: + + # create view of reference binding site + ref_residues_hashes = set() # helper to keep track of added residues + ignored_residue_hashes = {target_ligand.hash_code} + for ligand_at in target_ligand.atoms: + close_atoms = self.target.FindWithin(ligand_at.GetPos(), + self.bs_radius) + for close_at in close_atoms: + # Skip any residue not in the chain mapping target + ref_res = close_at.GetResidue() + h = ref_res.handle.GetHashCode() + if h not in ref_residues_hashes and \ + h not in ignored_residue_hashes: + view = self._chain_mapper.target.ViewForHandle(ref_res) + if view.IsValid(): + h = ref_res.handle.GetHashCode() + ref_residues_hashes.add(h) + elif ref_res.is_ligand: + msg = f"Ignoring ligand {ref_res.qualified_name} " + msg += "in binding site of " + msg += str(target_ligand.qualified_name) + LogWarning(msg) + ignored_residue_hashes.add(h) + elif ref_res.chem_type == mol.ChemType.WATERS: + pass # That's ok, no need to warn + else: + msg = f"Ignoring residue {ref_res.qualified_name} " + msg += "in binding site of " + msg += str(target_ligand.qualified_name) + LogWarning(msg) + ignored_residue_hashes.add(h) + + ref_bs = self.target.CreateEmptyView() + if ref_residues_hashes: + # reason for doing that separately is to guarantee same ordering + # of residues as in underlying entity. (Reorder by ResNum seems + # only available on ChainHandles) + for ch in self.target.chains: + for r in ch.residues: + if r.handle.GetHashCode() in ref_residues_hashes: + ref_bs.AddResidue(r, mol.ViewAddFlag.INCLUDE_ALL) + if len(ref_bs.residues) == 0: + raise RuntimeError("Failed to add proximity residues to " + "the reference binding site entity") + + self._binding_sites[target_ligand.handle.hash_code] = ref_bs + + return self._binding_sites[target_ligand.handle.hash_code] + + @property + def _chem_mapping(self): + if self.__chem_mapping is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chem_mapping + + @property + def _chem_group_alns(self): + if self.__chem_group_alns is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chem_group_alns + + @property + def _ref_mdl_alns(self): + if self.__ref_mdl_alns is None: + self.__ref_mdl_alns = \ + chain_mapping._GetRefMdlAlns(self._chain_mapper.chem_groups, + self._chain_mapper.chem_group_alignments, + self._chem_mapping, + self._chem_group_alns) + return self.__ref_mdl_alns + + @property + def _chain_mapping_mdl(self): + if self.__chain_mapping_mdl is None: + self.__chem_mapping, self.__chem_group_alns, \ + self.__chain_mapping_mdl = \ + self._chain_mapper.GetChemMapping(self.model) + return self.__chain_mapping_mdl + + def _get_get_repr_input(self, mdl_ligand): + if mdl_ligand.handle.hash_code not in self._get_repr_input: + + # figure out what chains in the model are in contact with the ligand + # that may give a non-zero contribution to lDDT in + # chain_mapper.GetRepr + radius = self.model_bs_radius + chains = set() + for at in mdl_ligand.atoms: + close_atoms = self._chain_mapping_mdl.FindWithin(at.GetPos(), + radius) + for close_at in close_atoms: + chains.add(close_at.GetChain().GetName()) + + if len(chains) > 0: + + # the chain mapping model which only contains close chains + query = "cname=" + query += ','.join([mol.QueryQuoteName(x) for x in chains]) + mdl = self._chain_mapping_mdl.Select(query) + + # chem mapping which is reduced to the respective chains + chem_mapping = list() + for m in self._chem_mapping: + chem_mapping.append([x for x in m if x in chains]) + + self._get_repr_input[mdl_ligand.handle.hash_code] = \ + (mdl, chem_mapping) + + else: + self._get_repr_input[mdl_ligand.handle.hash_code] = \ + (self._chain_mapping_mdl.CreateEmptyView(), + [list() for _ in self._chem_mapping]) + + return (self._get_repr_input[mdl_ligand.hash_code][1], + self._chem_group_alns, + self._get_repr_input[mdl_ligand.hash_code][0]) + + +def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(), + substructure_match=False, max_symmetries=1e6): + """Calculate symmetry-corrected RMSD. + + Binding site superposition must be computed separately and passed as + `transformation`. + + :param model_ligand: The model ligand + :type model_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param target_ligand: The target ligand + :type target_ligand: :class:`ost.mol.ResidueHandle` or + :class:`ost.mol.ResidueView` + :param transformation: Optional transformation to apply on each atom + position of model_ligand. + :type transformation: :class:`ost.geom.Mat4` + :param substructure_match: Set this to True to allow partial target + ligand. + :type substructure_match: :class:`bool` + :param max_symmetries: If more than that many isomorphisms exist, raise + a :class:`TooManySymmetriesError`. This can only be assessed by + generating at least that many isomorphisms and can take some time. + :type max_symmetries: :class:`int` + :rtype: :class:`float` + :raises: :class:`ost.mol.alg.ligand_scoring_base.NoSymmetryError` when no + symmetry can be found, + :class:`ost.mol.alg.ligand_scoring_base.DisconnectedGraphError` + when ligand graph is disconnected, + :class:`ost.mol.alg.ligand_scoring_base.TooManySymmetriesError` + when more than *max_symmetries* isomorphisms are found. + """ + + symmetries = ligand_scoring_base.ComputeSymmetries(model_ligand, + target_ligand, + substructure_match=substructure_match, + by_atom_index=True, + max_symmetries=max_symmetries) + return _SCRMSD_symmetries(symmetries, model_ligand, target_ligand, + transformation) + + +def _SCRMSD_symmetries(symmetries, model_ligand, target_ligand, + transformation): + """Compute SCRMSD with pre-computed symmetries. Internal. """ + + # setup numpy positions for model ligand and apply transformation + mdl_ligand_pos = np.ones((model_ligand.GetAtomCount(), 4)) + for a_idx, a in enumerate(model_ligand.atoms): + p = a.GetPos() + mdl_ligand_pos[a_idx, 0] = p[0] + mdl_ligand_pos[a_idx, 1] = p[1] + mdl_ligand_pos[a_idx, 2] = p[2] + np_transformation = np.zeros((4,4)) + for i in range(4): + for j in range(4): + np_transformation[i,j] = transformation[i,j] + mdl_ligand_pos = mdl_ligand_pos.dot(np_transformation.T)[:,:3] + + # setup numpy positions for target ligand + trg_ligand_pos = np.zeros((target_ligand.GetAtomCount(), 3)) + for a_idx, a in enumerate(target_ligand.atoms): + p = a.GetPos() + trg_ligand_pos[a_idx, 0] = p[0] + trg_ligand_pos[a_idx, 1] = p[1] + trg_ligand_pos[a_idx, 2] = p[2] + + # position matrices to iterate symmetries + # there is a guarantee that + # target_ligand.GetAtomCount() <= model_ligand.GetAtomCount() + # and that each target ligand atom is part of every symmetry + # => target_ligand.GetAtomCount() is size of both position matrices + rmsd_mdl_pos = np.zeros((target_ligand.GetAtomCount(), 3)) + rmsd_trg_pos = np.zeros((target_ligand.GetAtomCount(), 3)) + + # iterate symmetries and find the one with lowest RMSD + best_rmsd = np.inf + for i, (trg_sym, mdl_sym) in enumerate(symmetries): + for idx, (mdl_anum, trg_anum) in enumerate(zip(mdl_sym, trg_sym)): + rmsd_mdl_pos[idx,:] = mdl_ligand_pos[mdl_anum, :] + rmsd_trg_pos[idx,:] = trg_ligand_pos[trg_anum, :] + rmsd = np.sqrt(((rmsd_mdl_pos - rmsd_trg_pos)**2).sum(-1).mean()) + if rmsd < best_rmsd: + best_rmsd = rmsd + + return best_rmsd + +# specify public interface +__all__ = ('SCRMSDScorer', 'SCRMSD') diff --git a/modules/mol/alg/tests/test_ligand_scoring.py b/modules/mol/alg/tests/test_ligand_scoring.py index 6755b0e8ee10c4724c4730a5acbcc0d44661794b..934ef783407872ce57f5e4785505367a3d062f11 100644 --- a/modules/mol/alg/tests/test_ligand_scoring.py +++ b/modules/mol/alg/tests/test_ligand_scoring.py @@ -7,8 +7,10 @@ import ost from ost import io, mol, geom # check if we can import: fails if numpy or scipy not available try: - from ost.mol.alg.ligand_scoring import * - from ost.mol.alg import ligand_scoring + from ost.mol.alg.ligand_scoring_base import * + from ost.mol.alg import ligand_scoring_base + from ost.mol.alg import ligand_scoring_scrmsd + from ost.mol.alg import ligand_scoring_lddtpli except ImportError: print("Failed to import ligand_scoring.py. Happens when numpy, scipy or " "networkx is missing. Ignoring test_ligand_scoring.py tests.") @@ -41,7 +43,7 @@ def _LoadEntity(filename): return ent -class TestLigandScoring(unittest.TestCase): +class TestLigandScoringFancy(unittest.TestCase): def setUp(self): # Silence expected warnings about ignoring of ligands in binding site @@ -178,19 +180,19 @@ class TestLigandScoring(unittest.TestCase): """ mdl_lig = _LoadEntity("P84080_model_02_ligand_0.sdf") - graph = ligand_scoring._ResidueToGraph(mdl_lig.residues[0]) + graph = ligand_scoring_base._ResidueToGraph(mdl_lig.residues[0]) self.assertEqual(len(graph.edges), 34) self.assertEqual(len(graph.nodes), 32) # Check an arbitrary node self.assertEqual([a for a in graph.adj["14"].keys()], ["13", "29"]) - graph = ligand_scoring._ResidueToGraph(mdl_lig.residues[0], by_atom_index=True) + graph = ligand_scoring_base._ResidueToGraph(mdl_lig.residues[0], by_atom_index=True) self.assertEqual(len(graph.edges), 34) self.assertEqual(len(graph.nodes), 32) # Check an arbitrary node self.assertEqual([a for a in graph.adj[13].keys()], [12, 28]) - def test__ComputeSymmetries(self): + def test_ComputeSymmetries(self): """Test that _ComputeSymmetries works. """ trg = _LoadMMCIF("1r8q.cif.gz") @@ -202,15 +204,15 @@ class TestLigandScoring(unittest.TestCase): trg_g3d2 = trg.FindResidue("J", 1) mdl_g3d = mdl.FindResidue("L_2", 1) - sym = ligand_scoring._ComputeSymmetries(mdl_g3d, trg_g3d1) + sym = ligand_scoring_base.ComputeSymmetries(mdl_g3d, trg_g3d1) self.assertEqual(len(sym), 72) - sym = ligand_scoring._ComputeSymmetries(mdl_g3d, trg_g3d1, by_atom_index=True) + sym = ligand_scoring_base.ComputeSymmetries(mdl_g3d, trg_g3d1, by_atom_index=True) self.assertEqual(len(sym), 72) # Test that we can match ions read from SDF sdf_lig = _LoadEntity("1r8q_ligand_0.sdf") - sym = ligand_scoring._ComputeSymmetries(trg_mg1, sdf_lig.residues[0], by_atom_index=True) + sym = ligand_scoring_base.ComputeSymmetries(trg_mg1, sdf_lig.residues[0], by_atom_index=True) self.assertEqual(len(sym), 1) # Test that it works with views and only consider atoms in the view @@ -221,19 +223,19 @@ class TestLigandScoring(unittest.TestCase): mdl_g3d_sub_ent = mdl_g3d.Select("aindex>1447") mdl_g3d_sub = mdl_g3d_sub_ent.residues[0] - sym = ligand_scoring._ComputeSymmetries(mdl_g3d_sub, trg_g3d1_sub) + sym = ligand_scoring_base.ComputeSymmetries(mdl_g3d_sub, trg_g3d1_sub) self.assertEqual(len(sym), 6) - sym = ligand_scoring._ComputeSymmetries(mdl_g3d_sub, trg_g3d1_sub, by_atom_index=True) + sym = ligand_scoring_base.ComputeSymmetries(mdl_g3d_sub, trg_g3d1_sub, by_atom_index=True) self.assertEqual(len(sym), 6) # Substructure matches - sym = ligand_scoring._ComputeSymmetries(mdl_g3d, trg_g3d1_sub, substructure_match=True) + sym = ligand_scoring_base.ComputeSymmetries(mdl_g3d, trg_g3d1_sub, substructure_match=True) self.assertEqual(len(sym), 6) # Missing atoms only allowed in target, not in model with self.assertRaises(NoSymmetryError): - ligand_scoring._ComputeSymmetries(mdl_g3d_sub, trg_g3d1, substructure_match=True) + ligand_scoring_base.ComputeSymmetries(mdl_g3d_sub, trg_g3d1, substructure_match=True) def test_SCRMSD(self): """Test that SCRMSD works. @@ -247,53 +249,51 @@ class TestLigandScoring(unittest.TestCase): trg_g3d2 = trg.FindResidue("J", 1) mdl_g3d = mdl.FindResidue("L_2", 1) - rmsd = SCRMSD(mdl_g3d, trg_g3d1) + rmsd = ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_g3d1) self.assertAlmostEqual(rmsd, 2.21341e-06, 10) - rmsd = SCRMSD(mdl_g3d, trg_g3d2) + rmsd = ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_g3d2) self.assertAlmostEqual(rmsd, 61.21325, 4) # Ensure we raise a NoSymmetryError if the ligand is wrong with self.assertRaises(NoSymmetryError): - SCRMSD(mdl_g3d, trg_mg1) + ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_mg1) with self.assertRaises(NoSymmetryError): - SCRMSD(mdl_g3d, trg_afb1) + ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_afb1) # Assert that transform works trans = geom.Mat4(-0.999256, 0.00788487, -0.0377333, -15.4397, 0.0380652, 0.0473315, -0.998154, 29.9477, -0.00608426, -0.998848, -0.0475963, 28.8251, 0, 0, 0, 1) - rmsd = SCRMSD(mdl_g3d, trg_g3d2, transformation=trans) + rmsd = ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_g3d2, transformation=trans) self.assertAlmostEqual(rmsd, 0.293972, 5) # Assert that substructure matches work trg_g3d1_sub = trg_g3d1.Select("aindex>6019").residues[0] # Skip PA, PB and O[1-3]A and O[1-3]B. # mdl_g3d_sub = mdl_g3d.Select("aindex>1447").residues[0] # Skip PA, PB and O[1-3]A and O[1-3]B. - with self.assertRaises(NoSymmetryError): - SCRMSD(mdl_g3d, trg_g3d1_sub) # no full match + with self.assertRaises(NoIsomorphicSymmetryError): + ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_g3d1_sub) # no full match # But partial match is OK - rmsd = SCRMSD(mdl_g3d, trg_g3d1_sub, substructure_match=True) + rmsd = ligand_scoring_scrmsd.SCRMSD(mdl_g3d, trg_g3d1_sub, substructure_match=True) self.assertAlmostEqual(rmsd, 2.2376232209353475e-06, 8) # Ensure it doesn't work the other way around - ie incomplete model is invalid with self.assertRaises(NoSymmetryError): - SCRMSD(trg_g3d1_sub, mdl_g3d) # no full match + ligand_scoring_scrmsd.SCRMSD(trg_g3d1_sub, mdl_g3d) # no full match + - def test__compute_scores(self): + def test_compute_rmsd_scores(self): """Test that _compute_scores works. """ trg = _LoadMMCIF("1r8q.cif.gz") mdl = _LoadMMCIF("P84080_model_02.cif.gz") mdl_lig = io.LoadEntity(os.path.join('testfiles', "P84080_model_02_ligand_0.sdf")) - sc = LigandScorer(mdl, trg, [mdl_lig], None) + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig], None) # Note: expect warning about Binding site of H.ZN1 not mapped to the model - sc._compute_scores() - - # Check RMSD - self.assertEqual(sc.rmsd_matrix.shape, (7, 1)) - np.testing.assert_almost_equal(sc.rmsd_matrix, np.array( + self.assertEqual(sc.score_matrix.shape, (7, 1)) + np.testing.assert_almost_equal(sc.score_matrix, np.array( [[np.nan], [0.04244993], [np.nan], @@ -302,15 +302,21 @@ class TestLigandScoring(unittest.TestCase): [0.29399303], [np.nan]]), decimal=5) - # Check lDDT-PLI - self.assertEqual(sc.lddt_pli_matrix.shape, (7, 1)) - self.assertTrue(np.isnan(sc.lddt_pli_matrix[0, 0])) - self.assertAlmostEqual(sc.lddt_pli_matrix[1, 0], 0.99843, 5) - self.assertTrue(np.isnan(sc.lddt_pli_matrix[2, 0])) - self.assertTrue(np.isnan(sc.lddt_pli_matrix[3, 0])) - self.assertTrue(np.isnan(sc.lddt_pli_matrix[4, 0])) - self.assertAlmostEqual(sc.lddt_pli_matrix[5, 0], 1.0) - self.assertTrue(np.isnan(sc.lddt_pli_matrix[6, 0])) + def test_compute_lddtpli_scores(self): + trg = _LoadMMCIF("1r8q.cif.gz") + mdl = _LoadMMCIF("P84080_model_02.cif.gz") + mdl_lig = io.LoadEntity(os.path.join('testfiles', "P84080_model_02_ligand_0.sdf")) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], None, + add_mdl_contacts = False, + lddt_pli_binding_site_radius = 4.0) + self.assertEqual(sc.score_matrix.shape, (7, 1)) + self.assertTrue(np.isnan(sc.score_matrix[0, 0])) + self.assertAlmostEqual(sc.score_matrix[1, 0], 0.99843, 5) + self.assertTrue(np.isnan(sc.score_matrix[2, 0])) + self.assertTrue(np.isnan(sc.score_matrix[3, 0])) + self.assertTrue(np.isnan(sc.score_matrix[4, 0])) + self.assertAlmostEqual(sc.score_matrix[5, 0], 1.0) + self.assertTrue(np.isnan(sc.score_matrix[6, 0])) def test_check_resnames(self): """Test that the check_resname argument works. @@ -318,7 +324,7 @@ class TestLigandScoring(unittest.TestCase): When set to True, it should raise an error if any residue in the representation of the binding site in the model has a different name than in the reference. Here we manually modify a residue - name to achieve that effect. + name to achieve that effect. This is only relevant for the LDDTPLIScorer """ trg_4c0a = _LoadMMCIF("4c0a.cif.gz") trg = trg_4c0a.Select("cname=C or cname=I") @@ -331,124 +337,428 @@ class TestLigandScoring(unittest.TestCase): ed.UpdateXCS() with self.assertRaises(RuntimeError): - sc = LigandScorer(mdl, trg, [mdl.FindResidue("I", 1)], [trg.FindResidue("I", 1)], check_resnames=True) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl.FindResidue("I", 1)], [trg.FindResidue("I", 1)], check_resnames=True) sc._compute_scores() - sc = LigandScorer(mdl, trg, [mdl.FindResidue("I", 1)], [trg.FindResidue("I", 1)], check_resnames=False) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl.FindResidue("I", 1)], [trg.FindResidue("I", 1)], check_resnames=False) sc._compute_scores() - def test__scores(self): + def test_added_mdl_contacts(self): + + # binding site for ligand in chain G consists of chains A and B + prot = _LoadMMCIF("1r8q.cif.gz").Copy() + + # model has the full binding site + mdl = mol.CreateEntityFromView(prot.Select("cname=A,B,G"), True) + + # chain C has same sequence as chain A but is not in contact + # with ligand in chain G + # target has thus incomplete binding site only from chain B + trg = mol.CreateEntityFromView(prot.Select("cname=B,C,G"), True) + + # if added model contacts are not considered, the incomplete binding + # site only from chain B is perfectly reproduced by model which also has + # chain B + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=False) + self.assertAlmostEqual(sc.score_matrix[0,0], 1.0, 5) + + # if added model contacts are considered, contributions from chain B are + # perfectly reproduced but all contacts of ligand towards chain A are + # added as penalty + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True) + + lig = prot.Select("cname=G") + A_count = 0 + B_count = 0 + for a in lig.atoms: + close_atoms = mdl.FindWithin(a.GetPos(), sc.lddt_pli_radius) + for ca in close_atoms: + cname = ca.GetChain().GetName() + if cname == "G": + pass # its a ligand atom... + elif cname == "A": + A_count += 1 + elif cname == "B": + B_count += 1 + + self.assertAlmostEqual(sc.score_matrix[0,0], + B_count/(A_count + B_count), 5) + + # Same as before but additionally we remove residue TRP.66 + # from chain C in the target to test mapping magic... + # Chain C is NOT in contact with the ligand but we only + # add contacts from chain A as penalty that are mappable + # to the closest chain with same sequence. That would be + # chain C + query = "cname=B,G or (cname=C and rnum!=66)" + trg = mol.CreateEntityFromView(prot.Select(query), True) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True) + + TRP66_count = 0 + for a in lig.atoms: + close_atoms = mdl.FindWithin(a.GetPos(), sc.lddt_pli_radius) + for ca in close_atoms: + cname = ca.GetChain().GetName() + if cname == "A" and ca.GetResidue().GetNumber().GetNum() == 66: + TRP66_count += 1 + + self.assertEqual(TRP66_count, 134) + + # remove TRP66_count from original penalty + self.assertAlmostEqual(sc.score_matrix[0,0], + B_count/(A_count + B_count - TRP66_count), 5) + + # Move a random atom in the model from chain B towards the ligand center + # chain B is also present in the target and interacts with the ligand, + # but that atom would be far away and thus adds to the penalty. Since + # the ligand is small enough, the number of added contacts should be + # exactly the number of ligand atoms. + mdl_ed = mdl.EditXCS() + at = mdl.FindResidue("B", mol.ResNum(8)).FindAtom("NZ") + mdl_ed.SetAtomPos(at, lig.geometric_center) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True) + + # compared to the last assertAlmostEqual, we add the number of ligand + # atoms as additional penalties + self.assertAlmostEqual(sc.score_matrix[0,0], + B_count/(A_count + B_count - TRP66_count + \ + lig.GetAtomCount()), 5) + + def test_assignment(self): + trg = _LoadMMCIF("1r8q.cif.gz") + mdl = _LoadMMCIF("P84080_model_02.cif.gz") + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg) + self.assertEqual(sc.assignment, [(1, 0)]) + + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg) + self.assertEqual(sc.assignment, [(5, 0)]) + + def test_dict_results_rmsd(self): """Test that the scores are computed correctly """ # 4C0A has more ligands trg = _LoadMMCIF("1r8q.cif.gz") trg_4c0a = _LoadMMCIF("4c0a.cif.gz") - sc = LigandScorer(trg, trg_4c0a, None, None, check_resnames=False) - + sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg_4c0a, None, None) expected_keys = {"J", "F"} - self.assertFalse(expected_keys.symmetric_difference(sc.rmsd.keys())) - self.assertFalse(expected_keys.symmetric_difference(sc.rmsd_details.keys())) - self.assertFalse(expected_keys.symmetric_difference(sc.lddt_pli.keys())) - self.assertFalse(expected_keys.symmetric_difference(sc.lddt_pli_details.keys())) - + self.assertFalse(expected_keys.symmetric_difference(sc.score.keys())) + self.assertFalse(expected_keys.symmetric_difference(sc.aux.keys())) # rmsd - self.assertAlmostEqual(sc.rmsd["J"][mol.ResNum(1)], 0.8016608357429504, 5) - self.assertAlmostEqual(sc.rmsd["F"][mol.ResNum(1)], 0.9286373257637024, 5) + self.assertAlmostEqual(sc.score["J"][mol.ResNum(1)], 0.8016608357429504, 5) + self.assertAlmostEqual(sc.score["F"][mol.ResNum(1)], 0.9286373257637024, 5) # rmsd_details - self.assertEqual(sc.rmsd_details["J"][mol.ResNum(1)]["chain_mapping"], {'F': 'D', 'C': 'C'}) - self.assertEqual(len(sc.rmsd_details["J"][mol.ResNum(1)]["bs_ref_res"]), 15) - self.assertEqual(len(sc.rmsd_details["J"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) - self.assertEqual(len(sc.rmsd_details["J"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) - self.assertEqual(sc.rmsd_details["J"][mol.ResNum(1)]["target_ligand"].qualified_name, 'I.G3D1') - self.assertEqual(sc.rmsd_details["J"][mol.ResNum(1)]["model_ligand"].qualified_name, 'J.G3D1') - self.assertEqual(sc.rmsd_details["F"][mol.ResNum(1)]["chain_mapping"], {'B': 'B', 'G': 'A'}) - self.assertEqual(len(sc.rmsd_details["F"][mol.ResNum(1)]["bs_ref_res"]), 15) - self.assertEqual(len(sc.rmsd_details["F"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) - self.assertEqual(len(sc.rmsd_details["F"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) - self.assertEqual(sc.rmsd_details["F"][mol.ResNum(1)]["target_ligand"].qualified_name, 'K.G3D1') - self.assertEqual(sc.rmsd_details["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1') + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["chain_mapping"], {'F': 'D', 'C': 'C'}) + self.assertEqual(len(sc.aux["J"][mol.ResNum(1)]["bs_ref_res"]), 15) + self.assertEqual(len(sc.aux["J"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) + self.assertEqual(len(sc.aux["J"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["target_ligand"].qualified_name, 'I.G3D1') + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["model_ligand"].qualified_name, 'J.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["chain_mapping"], {'B': 'B', 'G': 'A'}) + self.assertEqual(len(sc.aux["F"][mol.ResNum(1)]["bs_ref_res"]), 15) + self.assertEqual(len(sc.aux["F"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) + self.assertEqual(len(sc.aux["F"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["target_ligand"].qualified_name, 'K.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1') + + def test_dict_results_lddtpli(self): + """Test that the scores are computed correctly + """ + # 4C0A has more ligands + trg = _LoadMMCIF("1r8q.cif.gz") + trg_4c0a = _LoadMMCIF("4c0a.cif.gz") + sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, None, None, + check_resnames=False, + add_mdl_contacts=False, + lddt_pli_binding_site_radius = 4.0) + expected_keys = {"J", "F"} + self.assertFalse(expected_keys.symmetric_difference(sc.score.keys())) + self.assertFalse(expected_keys.symmetric_difference(sc.aux.keys())) # lddt_pli - self.assertAlmostEqual(sc.lddt_pli["J"][mol.ResNum(1)], 0.9127105666156202, 5) - self.assertAlmostEqual(sc.lddt_pli["F"][mol.ResNum(1)], 0.915929203539823, 6) + self.assertAlmostEqual(sc.score["J"][mol.ResNum(1)], 0.9127105666156202, 5) + self.assertAlmostEqual(sc.score["F"][mol.ResNum(1)], 0.915929203539823, 5) # lddt_pli_details - self.assertAlmostEqual(sc.lddt_pli_details["J"][mol.ResNum(1)]["rmsd"], 0.8016608357429504, 4) - self.assertEqual(sc.lddt_pli_details["J"][mol.ResNum(1)]["lddt_pli_n_contacts"], 5224) - self.assertEqual(sc.lddt_pli_details["J"][mol.ResNum(1)]["chain_mapping"], {'F': 'D', 'C': 'C'}) - self.assertEqual(len(sc.lddt_pli_details["J"][mol.ResNum(1)]["bs_ref_res"]), 15) - self.assertEqual(len(sc.lddt_pli_details["J"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) - self.assertEqual(len(sc.lddt_pli_details["J"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) - self.assertEqual(sc.lddt_pli_details["J"][mol.ResNum(1)]["target_ligand"].qualified_name, 'I.G3D1') - self.assertEqual(sc.lddt_pli_details["J"][mol.ResNum(1)]["model_ligand"].qualified_name, 'J.G3D1') - self.assertAlmostEqual(sc.lddt_pli_details["F"][mol.ResNum(1)]["rmsd"], 0.9286373257637024, 4) - self.assertEqual(sc.lddt_pli_details["F"][mol.ResNum(1)]["lddt_pli_n_contacts"], 5424) - self.assertEqual(sc.lddt_pli_details["F"][mol.ResNum(1)]["chain_mapping"], {'B': 'B', 'G': 'A'}) - self.assertEqual(len(sc.lddt_pli_details["F"][mol.ResNum(1)]["bs_ref_res"]), 15) - self.assertEqual(len(sc.lddt_pli_details["F"][mol.ResNum(1)]["bs_ref_res_mapped"]), 15) - self.assertEqual(len(sc.lddt_pli_details["F"][mol.ResNum(1)]["bs_mdl_res_mapped"]), 15) - self.assertEqual(sc.lddt_pli_details["F"][mol.ResNum(1)]["target_ligand"].qualified_name, 'K.G3D1') - self.assertEqual(sc.lddt_pli_details["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1') - - def test_global_chain_mapping(self): - """Test that the global and local chain mappings works. - - For RMSD, A: A results in a better chain mapping. However, C: A is a - better global chain mapping from an lDDT perspective (and lDDT-PLI). - """ + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["lddt_pli_n_contacts"], 653) + self.assertEqual(len(sc.aux["J"][mol.ResNum(1)]["bs_ref_res"]), 15) + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["target_ligand"].qualified_name, 'I.G3D1') + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["model_ligand"].qualified_name, 'J.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["lddt_pli_n_contacts"], 678) + self.assertEqual(len(sc.aux["F"][mol.ResNum(1)]["bs_ref_res"]), 15) + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["target_ligand"].qualified_name, 'K.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1') + + # lddt_pli with added mdl contacts + sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, None, None, + check_resnames=False, + add_mdl_contacts=True) + self.assertAlmostEqual(sc.score["J"][mol.ResNum(1)], 0.8988340192043895, 5) + self.assertAlmostEqual(sc.score["F"][mol.ResNum(1)], 0.9039735099337749, 5) + # lddt_pli_details + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["lddt_pli_n_contacts"], 729) + self.assertEqual(len(sc.aux["J"][mol.ResNum(1)]["bs_ref_res"]), 63) + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["target_ligand"].qualified_name, 'I.G3D1') + self.assertEqual(sc.aux["J"][mol.ResNum(1)]["model_ligand"].qualified_name, 'J.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["lddt_pli_n_contacts"], 755) + self.assertEqual(len(sc.aux["F"][mol.ResNum(1)]["bs_ref_res"]), 62) + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["target_ligand"].qualified_name, 'K.G3D1') + self.assertEqual(sc.aux["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1') + + def test_ignore_binding_site(self): + """Test that we ignore non polymer stuff in the binding site. + NOTE: we should consider changing this behavior in the future and take + other ligands, peptides and short oligomers into account for superposition. + When that's the case this test should be adapter + """ + trg = _LoadMMCIF("1SSP.cif.gz") + sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg, None, None) + expected_bs_ref_res = ['C.GLY62', 'C.GLN63', 'C.ASP64', 'C.PRO65', 'C.TYR66', 'C.CYS76', 'C.PHE77', 'C.ASN123', 'C.HIS187'] + ost.PushVerbosityLevel(ost.LogLevel.Error) + self.assertEqual([str(r) for r in sc.aux["D"][1]["bs_ref_res"]], expected_bs_ref_res) + ost.PopVerbosityLevel() + + def test_substructure_match(self): + """Test that substructure_match=True works.""" trg = _LoadMMCIF("1r8q.cif.gz") mdl = _LoadMMCIF("P84080_model_02.cif.gz") - # Local by default - sc = LigandScorer(mdl, trg, None, None) - self.assertEqual(sc.rmsd_details["L_2"][1]["chain_mapping"], {'A': 'A'}) - self.assertEqual(sc.lddt_pli_details["L_2"][1]["chain_mapping"], {'C': 'A'}) + trg_g3d1 = trg.FindResidue("F", 1) + mdl_g3d = mdl.FindResidue("L_2", 1) - # Global - sc = LigandScorer(mdl, trg, None, None, global_chain_mapping=True) - self.assertEqual(sc.rmsd_details["L_2"][1]["chain_mapping"], {'C': 'A'}) - self.assertEqual(sc.lddt_pli_details["L_2"][1]["chain_mapping"], {'C': 'A'}) + # Skip PA, PB and O[1-3]A and O[1-3]B in target and model + # ie 8 / 32 atoms => coverage 0.75 + # We assume atom index are fixed and won't change + trg_g3d1_sub_ent = trg_g3d1.Select("aindex>6019") + trg_g3d1_sub = trg_g3d1_sub_ent.residues[0] + + # without enabling substructure matches + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl.Select("protein=True"), trg.Select("protein=True"), + model_ligands=[mdl_g3d], target_ligands=[trg_g3d1_sub], + substructure_match=False) + self.assertEqual(sc.coverage_matrix.shape, (1,1)) + self.assertTrue(np.isnan(sc.coverage_matrix[0,0])) + self.assertEqual(sc.state_matrix[0,0], 3) # error encoding for that particular issue + + # Substructure matches + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl.Select("protein=True"), trg.Select("protein=True"), + model_ligands=[mdl_g3d], target_ligands=[trg_g3d1_sub], + substructure_match=True) + self.assertEqual(sc.coverage_matrix.shape, (1,1)) + self.assertEqual(sc.coverage_matrix[0,0], 0.75) + self.assertEqual(sc.state_matrix[0,0], 0) # no error encoded in state + + def test_6jyf(self): + """6JYF initially caused issues in the CASP15-CAMEO/LIGATE paper where + the ligand RET was wrongly assigned to short copies of OLA that float + around and yielded higher scores. + Here we test that this is resolved correctly.""" + mdl = _LoadPDB("6jyf_mdl.pdb") + trg = _LoadMMCIF("6jyf_trg.cif") + mdl_lig = _LoadEntity("6jyf_RET_pred.sdf") + mdl_lig_full = _LoadEntity("6jyf_RET_pred_complete.sdf") - # Custom - sc = LigandScorer(mdl, trg, None, None, global_chain_mapping=True, custom_mapping={'A': 'A'}) - self.assertEqual(sc.rmsd_details["L_2"][1]["chain_mapping"], {'A': 'A'}) - self.assertEqual(sc.lddt_pli_details["L_2"][1]["chain_mapping"], {'A': 'A'}) + # Problem is easily fixed by just prioritizing full coverage + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig], + substructure_match=True) + self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment + trg_lig_idx, mdl_lig_idx = sc.assignment[0] + self.assertEqual(sc.coverage_matrix[trg_lig_idx, mdl_lig_idx], 1.0) + self.assertEqual(sc.aux['00001_'][1]["target_ligand"].name, "RET") + self.assertAlmostEqual(sc.score['00001_'][1], 15.56022, 4) + self.assertAlmostEqual(sc.coverage_matrix[0,0], 1.) + self.assertTrue(np.isnan(sc.coverage_matrix[1,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[2,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[3,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[4,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[5,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[6,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[7,0])) + self.assertAlmostEqual(sc.coverage_matrix[8,0], 0.5) + self.assertAlmostEqual(sc.coverage_matrix[9,0], 0.3) + self.assertAlmostEqual(sc.coverage_matrix[10,0], 0.45) + self.assertTrue(np.isnan(sc.coverage_matrix[11,0])) + self.assertTrue(np.isnan(sc.coverage_matrix[12,0])) + self.assertAlmostEqual(sc.coverage_matrix[13,0], 0.55) - # Custom only active with global chain mapping - sc = LigandScorer(mdl, trg, None, None, global_chain_mapping=False, custom_mapping={'A': 'A'}) - self.assertEqual(sc.rmsd_details["L_2"][1]["chain_mapping"], {'A': 'A'}) - self.assertEqual(sc.lddt_pli_details["L_2"][1]["chain_mapping"], {'C': 'A'}) + # We need to make sure that it also works if the match is partial. + # For that we load the complete ligand incl. the O missing in target + # with a coverage of around 95% only. + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig_full], + substructure_match=True) + self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment + trg_lig_idx, mdl_lig_idx = sc.assignment[0] + self.assertAlmostEqual(sc.coverage_matrix[trg_lig_idx, mdl_lig_idx],0.95238096) + self.assertEqual(sc.aux['00001_'][1]["target_ligand"].name, "RET") + self.assertAlmostEqual(sc.score['00001_'][1], 15.56022, 4) - def test_rmsd_assignment(self): - """Test that the RMSD-based assignment works. + # Next, we check that coverage_delta has an effect. With a large + # delta of 0.5 we will assign to OLA which has a higher RMSD + # but a coverage of 0.52 only. + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig_full], + substructure_match=True, + coverage_delta=0.5) + self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment + trg_lig_idx, mdl_lig_idx = sc.assignment[0] + self.assertAlmostEqual(sc.coverage_matrix[trg_lig_idx, mdl_lig_idx], 0.52380955) + self.assertEqual(sc.aux['00001_'][1]["target_ligand"].name, "OLA") + self.assertAlmostEqual(sc.score['00001_'][1], 6.13006878, 4) - For RMSD, A: A results in a better chain mapping. However, C: A is a - better global chain mapping from an lDDT perspective (and lDDT-PLI). + def test_skip_too_many_symmetries(self): + """ + Test behaviour of max_symmetries. """ trg = _LoadMMCIF("1r8q.cif.gz") mdl = _LoadMMCIF("P84080_model_02.cif.gz") - # By default, assignment differs between RMSD and lDDT-PLI in this - # specific test case, so we can first ensure it does. - # For now we skip as this is slow - # sc = LigandScorer(mdl, trg, None, None) - # assert sc.rmsd_details["L_2"][1]["target_ligand"] != sc.lddt_pli_details["L_2"][1]["target_ligand"] + # Pass entity views + trg_lig = [trg.Select("cname=F")] + mdl_lig = [mdl.Select("rname=G3D")] - # RMSD assignment forces the same assignment - sc = LigandScorer(mdl, trg, None, None, rmsd_assignment=True) - self.assertEqual(sc.rmsd_details["L_2"][1]["target_ligand"], sc.lddt_pli_details["L_2"][1]["target_ligand"]) + # G3D has 72 isomorphic mappings to itself. + # Limit to 10 to raise + symmetries = ligand_scoring_base.ComputeSymmetries(mdl_lig[0], trg_lig[0], max_symmetries=100) + self.assertEqual(len(symmetries), 72) + with self.assertRaises(TooManySymmetriesError): + ligand_scoring_base.ComputeSymmetries(mdl_lig[0], trg_lig[0], max_symmetries=10) - def test_ignore_binding_site(self): - """Test that we ignore non polymer stuff in the binding site. - NOTE: we should consider changing this behavior in the future and take - other ligands, peptides and short oligomers into account for superposition. - When that's the case this test should be adapter - """ - trg = _LoadMMCIF("1SSP.cif.gz") - sc = LigandScorer(trg, trg, None, None) - expected_bs_ref_res = ['C.GLY62', 'C.GLN63', 'C.ASP64', 'C.PRO65', 'C.TYR66', 'C.CYS76', 'C.PHE77', 'C.ASN123', 'C.HIS187'] - ost.PushVerbosityLevel(ost.LogLevel.Error) - self.assertEqual([str(r) for r in sc.rmsd_details["D"][1]["bs_ref_res"]], expected_bs_ref_res) - ost.PopVerbosityLevel() + # Check the unassignment + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, trg_lig, + max_symmetries=10) + + self.assertFalse("L_2" in sc.score) + self.assertEqual(sc.assignment, []) + self.assertEqual(sc.unassigned_target_ligands, [0]) + self.assertEqual(sc.unassigned_model_ligands, [0]) + + trg_report, trg_pair_report = sc.get_target_ligand_state_report(0) + mdl_report, mdl_pair_report = sc.get_model_ligand_state_report(0) + + # the individual ligands are OK + self.assertEqual(trg_report["short desc"], "OK") + self.assertEqual(mdl_report["short desc"], "OK") + + # but there are too many symmetries + self.assertEqual(len(trg_pair_report), 1) + self.assertEqual(len(mdl_pair_report), 1) + self.assertEqual(trg_pair_report[0]["short desc"], "symmetries") + self.assertEqual(mdl_pair_report[0]["short desc"], "symmetries") + + def test_no_binding_site(self): + """ + Test the behavior when there's no binding site in proximity of + the ligand. This test was introduced to identify some subtle issues + with the ligand assignment that can cause it to enter an infinite + loop when the data matrices are not filled properly. + """ + trg = _LoadMMCIF("1r8q.cif.gz").Copy() + mdl = trg.Copy() + + trg_zn = trg.FindResidue("H", 1) + trg_g3d = trg.FindResidue("F", 1) + + # Move the zinc out of the reference binding site... + ed = trg.EditXCS() + ed.SetAtomPos(trg_zn.FindAtom("ZN"), + trg_zn.FindAtom("ZN").pos + geom.Vec3(6, 0, 0)) + # Remove some atoms from G3D to decrease coverage. This messed up + # the assignment in the past. + ed.DeleteAtom(trg_g3d.FindAtom("O6")) + ed.UpdateICS() + + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, + target_ligands=[trg_zn, trg_g3d], + coverage_delta=0, substructure_match=True) + + self.assertTrue(np.isnan(sc.score_matrix[0, 3])) + + trg_report, trg_pair_report = sc.get_target_ligand_state_report(0) + + exp_lig_report = {'state': 10.0, + 'short desc': 'binding_site', + 'desc': 'No residues were in proximity of the target ligand.'} + + exp_pair_report = [{'state': 1, 'short desc': 'identity', + 'desc': 'Ligands could not be matched (by subgraph isomorphism)', + 'indices': [0, 1, 2, 4, 5, 6]}, + {'state': 6, 'short desc': 'single_ligand_issue', + 'desc': 'Cannot compute valid pairwise score as either model or target ligand have non-zero state.', + 'indices': [3]}] + + # order of report is fix + self.assertDictEqual(trg_report, exp_lig_report) + self.assertDictEqual(trg_pair_report[0], exp_pair_report[0]) + self.assertDictEqual(trg_pair_report[1], exp_pair_report[1]) + + + def test_no_lddt_pli_contact(self): + """ + Test behaviour where a binding site has no lDDT-PLI contacts. + + We give two copies of the target ligand which have binding site atoms + within radius=5A but no atoms at 4A. We set lddt_pli_radius=4 so that + there are no contacts for the lDDT-PLI computation, and lDDT is None. + + We check that: + - We don't get an lDDT-PLI assignment + - Both target ligands are unassigned and have the + - We get an RMSD assignment + - The second copy of the target and model ligands ensure that the + disambiguation code (which checks for the best lDDT-PLI when 2 RMSDs + are identical) works in this case (where there is no lDDT-PLI to + disambiguate the RMSDs). + - We get lDDT-PLI = None with RMSD assignment + """ + trg = _LoadPDB("T1118v1.pdb") + trg_lig = _LoadEntity("T1118v1_001.sdf") + mdl = _LoadPDB("T1118v1LG035_1.pdb") + mdl_lig = _LoadEntity("T1118v1LG035_1_1_1.sdf") + + # Ensure it's unassigned in lDDT + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig, mdl_lig], + [trg_lig, trg_lig], + lddt_pli_radius=4, + rename_ligand_chain=True) + + # assignment should be empty + self.assertEqual(len(sc.assignment), 0) + self.assertEqual(sc.unassigned_target_ligands, [0, 1]) + self.assertEqual(sc.unassigned_model_ligands, [0, 1]) + + expected_report = [{'state': 10, 'short desc': 'no_contact', + 'desc': 'There were no lDDT contacts between the binding site and the ligand, and lDDT-PLI is undefined.', + 'indices': [0, 1]}] + + # both target ligands are expected to have the same report => no_contact + report_1, report_pair_1 = sc.get_target_ligand_state_report(0) + report_2, report_pair_2 = sc.get_target_ligand_state_report(1) + self.assertEqual(len(report_pair_1), 1) + self.assertEqual(len(report_pair_2), 1) + self.assertDictEqual(report_pair_1[0], expected_report[0]) + self.assertDictEqual(report_pair_2[0], expected_report[0]) + + + # However RMSD should be assigned + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig, mdl_lig], + [trg_lig, trg_lig], + bs_radius=5, rename_ligand_chain=True) + + self.assertEqual(sc.assignment, [(0,0), (1,1)]) + self.assertEqual(sc.unassigned_target_ligands, []) + self.assertEqual(sc.unassigned_model_ligands, []) + + self.assertAlmostEqual(sc.score['00001_FE'][1], 2.1703, 4) + self.assertAlmostEqual(sc.score['00001_FE_2'][1], 2.1703, 4) + + expected_report = [{'state': 0, 'short desc': 'OK', + 'desc': 'OK', + 'indices': [0, 1]}] + + # both target ligands are expected to have the same report => no_contact + report_1, report_pair_1 = sc.get_target_ligand_state_report(0) + report_2, report_pair_2 = sc.get_target_ligand_state_report(1) + self.assertEqual(len(report_pair_1), 1) + self.assertEqual(len(report_pair_2), 1) + self.assertDictEqual(report_pair_1[0], expected_report[0]) + self.assertDictEqual(report_pair_2[0], expected_report[0]) def test_unassigned_reasons(self): """Test reasons for being unassigned.""" @@ -527,74 +837,59 @@ class TestLigandScoring(unittest.TestCase): mdl_ed.UpdateICS() trg_ed.UpdateICS() - sc = LigandScorer(mdl, trg, None, None, unassigned=True, - full_bs_search=True) + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, None, None) # Check unassigned targets # NA: not in contact with target trg_na = sc.target.FindResidue("L_NA", 1) - self.assertEqual(sc.unassigned_target_ligands["L_NA"][1], "binding_site") - # ZN: no representation - trg_zn = sc.target.FindResidue("H", 1) - self.assertEqual(sc.unassigned_target_ligands["H"][1], "model_representation") + self.assertEqual(sc.unassigned_target_ligands_reasons["L_NA"][1], "no_contact") # AFB: not identical to anything in the model trg_afb = sc.target.FindResidue("G", 1) - self.assertEqual(sc.unassigned_target_ligands["G"][1], "identity") + self.assertEqual(sc.unassigned_target_ligands_reasons["G"][1], "identity") # F.G3D1: J.G3D1 assigned instead trg_fg3d1 = sc.target.FindResidue("F", 1) - self.assertEqual(sc.unassigned_target_ligands["F"][1], "stoichiometry") + self.assertEqual(sc.unassigned_target_ligands_reasons["F"][1], "stoichiometry") # CMO: disconnected trg_cmo1 = sc.target.FindResidue("L_CMO", 1) - self.assertEqual(sc.unassigned_target_ligands["L_CMO"][1], "disconnected") - # J.G3D1: assigned to L_2.G3D1 => error - trg_jg3d1 = sc.target.FindResidue("J", 1) - with self.assertRaises(RuntimeError): - sc._find_unassigned_target_ligand_reason(trg_jg3d1) - self.assertNotIn("J", sc.unassigned_target_ligands) - # Raises with an invalid ligand - with self.assertRaises(ValueError): - sc._find_unassigned_target_ligand_reason(sc.model_ligands[0]) + self.assertEqual(sc.unassigned_target_ligands_reasons["L_CMO"][1], "disconnected") + # J.G3D1: assigned to L_2.G3D1 => check if it is assigned + self.assertTrue(5 not in sc.unassigned_target_ligands) + self.assertNotIn("J", sc.unassigned_target_ligands_reasons) # Check unassigned models # OXY: not identical to anything in the model mdl_oxy = sc.model.FindResidue("L_OXY", 1) - self.assertEqual(sc.unassigned_model_ligands["L_OXY"][1], "identity") - self.assertIsNone(sc.lddt_pli["L_OXY"][1]) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_OXY"][1], "identity") + self.assertTrue("L_OXY" not in sc.score) # NA: not in contact with target mdl_na = sc.model.FindResidue("L_NA", 1) - self.assertEqual(sc.unassigned_model_ligands["L_NA"][1], "binding_site") - self.assertIsNone(sc.lddt_pli["L_NA"][1]) - # ZN: no representation - mdl_zn = sc.model.FindResidue("L_ZN", 1) - self.assertEqual(sc.unassigned_model_ligands["L_ZN"][1], "model_representation") - self.assertIsNone(sc.lddt_pli["L_ZN"][1]) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_NA"][1], "no_contact") + self.assertTrue("L_NA" not in sc.score) + # MG in L_MG_2 has stupid coordinates and is not assigned mdl_mg_2 = sc.model.FindResidue("L_MG_2", 1) - self.assertEqual(sc.unassigned_model_ligands["L_MG_2"][1], "stoichiometry") - self.assertIsNone(sc.lddt_pli["L_MG_2"][1]) - # MG in L_MG_0: assigned to I.MG1 => error - mdl_mg_0 = sc.model.FindResidue("L_MG_0", 1) - with self.assertRaises(RuntimeError): - sc._find_unassigned_model_ligand_reason(mdl_mg_0) - self.assertNotIn("L_MG_0", sc.unassigned_model_ligands) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_MG_2"][1], "stoichiometry") + self.assertTrue("L_MG_2" not in sc.score) + + self.assertNotIn("L_MG_0", sc.unassigned_model_ligands_reasons) # CMO: disconnected mdl_cmo1 = sc.model.FindResidue("L_CMO", 1) - self.assertEqual(sc.unassigned_model_ligands["L_CMO"][1], "disconnected") - # Raises with an invalid ligand - with self.assertRaises(ValueError): - sc._find_unassigned_model_ligand_reason(sc.target_ligands[0]) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_CMO"][1], "disconnected") # Should work with rmsd_assignment too - sc = LigandScorer(mdl, trg, None, None, unassigned=True, - rmsd_assignment=True, full_bs_search=True) - self.assertEqual(sc.unassigned_model_ligands, { + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, None, None, + full_bs_search=True) + + + + self.assertDictEqual(sc.unassigned_model_ligands_reasons, { 'L_ZN': {1: 'model_representation'}, 'L_NA': {1: 'binding_site'}, 'L_OXY': {1: 'identity'}, 'L_MG_2': {1: 'stoichiometry'}, "L_CMO": {1: 'disconnected'} }) - self.assertEqual(sc.unassigned_target_ligands, { + self.assertDictEqual(sc.unassigned_target_ligands_reasons, { 'G': {1: 'identity'}, 'H': {1: 'model_representation'}, 'J': {1: 'stoichiometry'}, @@ -602,42 +897,39 @@ class TestLigandScoring(unittest.TestCase): 'L_NA': {1: 'binding_site'}, "L_CMO": {1: 'disconnected'} }) - self.assertIsNone(sc.lddt_pli["L_OXY"][1]) + self.assertTrue("L_OXY" not in sc.score) # With missing ligands - sc = LigandScorer(mdl.Select("cname=A"), trg, None, None) - self.assertEqual(sc.unassigned_target_ligands["E"][1], 'no_ligand') + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl.Select("cname=A"), trg, None, None) + self.assertEqual(sc.unassigned_target_ligands_reasons["E"][1], 'no_ligand') - sc = LigandScorer(mdl, trg.Select("cname=A"), None, None) - self.assertEqual(sc.unassigned_model_ligands["L_2"][1], 'no_ligand') + sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg.Select("cname=A"), None, None) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_2"][1], 'no_ligand') - sc = LigandScorer(mdl.Select("cname=A"), trg, None, None, - unassigned=True, rmsd_assignment=True) - self.assertEqual(sc.unassigned_target_ligands["E"][1], 'no_ligand') + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl.Select("cname=A"), trg, None, None) + self.assertEqual(sc.unassigned_target_ligands_reasons["E"][1], 'no_ligand') - sc = LigandScorer(mdl, trg.Select("cname=A"), None, None, - unassigned=True, rmsd_assignment=True) - self.assertEqual(sc.unassigned_model_ligands["L_2"][1], 'no_ligand') + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg.Select("cname=A"), None, None) + self.assertEqual(sc.unassigned_model_ligands_reasons["L_2"][1], 'no_ligand') # However not everything must be missing with self.assertRaises(ValueError): - sc = LigandScorer(mdl.Select("cname=A"), trg.Select("cname=A"), None, None, - unassigned=True, rmsd_assignment=True) - - # Test with partial bs search (full_bs_search=False) - # Here we expect L_MG_2 to be unassigned because of model_representation - # rather than stoichiometry, as it is so far from the binding site that - # there is no longer a model binding site. - sc = LigandScorer(mdl, trg, None, None, unassigned=True, - rmsd_assignment=True, full_bs_search=False) - self.assertEqual(sc.unassigned_model_ligands, { + sc = LigandScorer(mdl.Select("cname=A"), trg.Select("cname=A"), None, None) + + # Test with partial bs search (full_bs_search=True) + # Here we expect L_MG_2 to be unassigned because of stoichiometry + # rather than model_representation, as it no longer matters so far from + # the binding site. + sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, None, None, + full_bs_search=True) + self.assertEqual(sc.unassigned_model_ligands_reasons, { 'L_ZN': {1: 'model_representation'}, 'L_NA': {1: 'binding_site'}, 'L_OXY': {1: 'identity'}, - 'L_MG_2': {1: 'model_representation'}, + 'L_MG_2': {1: 'stoichiometry'}, "L_CMO": {1: 'disconnected'} }) - self.assertEqual(sc.unassigned_target_ligands, { + self.assertEqual(sc.unassigned_target_ligands_reasons, { 'G': {1: 'identity'}, 'H': {1: 'model_representation'}, 'J': {1: 'stoichiometry'}, @@ -646,170 +938,9 @@ class TestLigandScoring(unittest.TestCase): "L_CMO": {1: 'disconnected'} }) - - def test_substructure_match(self): - """Test that substructure_match=True works.""" - trg = _LoadMMCIF("1r8q.cif.gz") - mdl = _LoadMMCIF("P84080_model_02.cif.gz") - - trg_g3d1 = trg.FindResidue("F", 1) - mdl_g3d = mdl.FindResidue("L_2", 1) - - # Skip PA, PB and O[1-3]A and O[1-3]B in target and model - # ie 8 / 32 atoms => coverage 0.75 - # We assume atom index are fixed and won't change - trg_g3d1_sub_ent = trg_g3d1.Select("aindex>6019") - trg_g3d1_sub = trg_g3d1_sub_ent.residues[0] - - # Substructure matches - sc = LigandScorer(mdl.Select("protein=True"), trg.Select("protein=True"), - model_ligands=[mdl_g3d], target_ligands=[trg_g3d1_sub], - substructure_match=True) - self.assertEqual(sc.rmsd_details["L_2"][1]["coverage"], 0.75) - - def test_6jyf(self): - """6JYF initially caused issues in the CASP15-CAMEO/LIGATE paper where - the ligand RET was wrongly assigned to short copies of OLA that float - around and yielded higher scores. - Here we test that this is resolved correctly.""" - mdl = _LoadPDB("6jyf_mdl.pdb") - trg = _LoadMMCIF("6jyf_trg.cif") - mdl_lig = _LoadEntity("6jyf_RET_pred.sdf") - mdl_lig_full = _LoadEntity("6jyf_RET_pred_complete.sdf") - - # Problem is easily fixed by just prioritizing full coverage - sc = LigandScorer(mdl, trg, model_ligands=[mdl_lig], - substructure_match=True) - self.assertEqual(sc.rmsd_details['00001_'][1]["coverage"], 1.0) - self.assertEqual(sc.rmsd_details['00001_'][1]["target_ligand"].name, "RET") - self.assertAlmostEqual(sc.rmsd['00001_'][1], 15.56022, 4) - self.assertTrue(np.array_equal(sc.coverage_matrix, - np.array([[1, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.3, 0.45, 0, 0, 0.55]]).transpose())) - - # We need to make sure that it also works if the match is partial. - # For that we load the complete ligand incl. the O missing in target - # with a coverage of around 95% only. - sc = LigandScorer(mdl, trg, model_ligands=[mdl_lig_full], - substructure_match=True) - self.assertTrue(sc.rmsd_details['00001_'][1]["coverage"] > 0.95) - self.assertEqual(sc.rmsd_details['00001_'][1]["target_ligand"].name, "RET") - self.assertAlmostEqual(sc.rmsd['00001_'][1], 15.56022, 4) - - # Next, we check that coverage_delta has an effect. With a large - # delta of 0.5 we will assign to OLA which has a higher RMSD - # but a coverage of 0.52 only. - sc = LigandScorer(mdl, trg, model_ligands=[mdl_lig_full], - substructure_match=True, - coverage_delta=0.5) - self.assertTrue(sc.rmsd_details['00001_'][1]["coverage"] > 0.5) - self.assertEqual(sc.rmsd_details['00001_'][1]["target_ligand"].name, "OLA") - self.assertAlmostEqual(sc.rmsd['00001_'][1], 6.13006878, 4) - - - def test_skip_too_many_symmetries(self): - """ - Test behaviour of max_symmetries. - """ - trg = _LoadMMCIF("1r8q.cif.gz") - mdl = _LoadMMCIF("P84080_model_02.cif.gz") - - # Pass entity views - trg_lig = [trg.Select("cname=F")] - mdl_lig = [mdl.Select("rname=G3D")] - - # G3D has 72 isomorphic mappings to itself. - # Limit to 10 to raise - symmetries = ligand_scoring._ComputeSymmetries(mdl_lig[0], trg_lig[0], max_symmetries=100) - assert len(symmetries) == 72 - with self.assertRaises(TooManySymmetriesError): - ligand_scoring._ComputeSymmetries(mdl_lig[0], trg_lig[0], max_symmetries=10) - - # Check the unassignment - sc = LigandScorer(mdl, trg, mdl_lig, trg_lig, max_symmetries=10) - assert "L_2" not in sc.rmsd - sc.unassigned_model_ligands["L_2"][1] == "symmetries" - sc.unassigned_target_ligands["F"][1] == "symmetries" - - - def test_no_binding_site(self): - """ - Test the behavior when there's no binding site in proximity of - the ligand. This test was introduced to identify some subtle issues - with the ligand assignment that can cause it to enter an infinite - loop when the data matrices are not filled properly. - """ - trg = _LoadMMCIF("1r8q.cif.gz").Copy() - mdl = trg.Copy() - - trg_zn = trg.FindResidue("H", 1) - trg_g3d = trg.FindResidue("F", 1) - - - # Move the zinc out of the reference binding site... - ed = trg.EditXCS() - ed.SetAtomPos(trg_zn.FindAtom("ZN"), - trg_zn.FindAtom("ZN").pos + geom.Vec3(6, 0, 0)) - # Remove some atoms from G3D to decrease coverage. This messed up - # the assignment in the past. - ed.DeleteAtom(trg_g3d.FindAtom("O6")) - ed.UpdateICS() - - sc = LigandScorer(mdl, trg, target_ligands=[trg_zn, trg_g3d], - coverage_delta=0, substructure_match=True) - self.assertTrue(np.isnan(sc.rmsd_matrix[0, 3])) - self.assertEqual(sc.unassigned_target_ligands["H"][1], "binding_site") - - - def test_no_lddt_pli_contact(self): - """ - Test behaviour where a binding site has no lDDT-PLI contacts. - - We give two copies of the target ligand which have binding site atoms - within radius=5A but no atoms at 4A. We set lddt_pli_radius=4 so that - there are no contacts for the lDDT-PLI computation, and lDDT is None. - - We check that: - - We don't get an lDDT-PLI assignment - - Both target ligands are unassigned and have the - - We get an RMSD assignment - - The second copy of the target and model ligands ensure that the - disambiguation code (which checks for the best lDDT-PLI when 2 RMSDs - are identical) works in this case (where there is no lDDT-PLI to - disambiguate the RMSDs). - - We get lDDT-PLI = None with RMSD assignment - """ - trg = _LoadPDB("T1118v1.pdb") - trg_lig = _LoadEntity("T1118v1_001.sdf") - mdl = _LoadPDB("T1118v1LG035_1.pdb") - mdl_lig = _LoadEntity("T1118v1LG035_1_1_1.sdf") - - # Ensure it's unassigned in lDDT - sc = LigandScorer(mdl, trg, [mdl_lig, mdl_lig], [trg_lig, trg_lig], - radius=5, lddt_pli_radius=4, rename_ligand_chain=True) - self.assertEqual(sc.lddt_pli, {}) - self.assertEqual(sc.unassigned_target_ligands['00001_C'][1], "no_contact") - self.assertEqual(sc.unassigned_target_ligands['00001_C_2'][1], "no_contact") - self.assertEqual(sc.unassigned_model_ligands['00001_FE'][1], "no_contact") - self.assertEqual(sc.unassigned_model_ligands['00001_FE_2'][1], "no_contact") - # However RMSD should be assigned - self.assertAlmostEqual(sc.rmsd['00001_FE'][1], 2.1703, 4) - self.assertAlmostEqual(sc.rmsd['00001_FE_2'][1], 2.1703, 4) - - # Check with RMSD assignment - sc2 = LigandScorer(mdl, trg, [mdl_lig, mdl_lig], [trg_lig, trg_lig], - radius=5, lddt_pli_radius=4, rename_ligand_chain=True, - rmsd_assignment=True) - self.assertEqual(sc2.unassigned_target_ligands, {}) - self.assertEqual(sc2.unassigned_model_ligands, {}) - self.assertIsNone(sc2.lddt_pli['00001_FE'][1]) - self.assertIsNone(sc2.lddt_pli['00001_FE_2'][1]) - self.assertAlmostEqual(sc2.rmsd['00001_FE'][1], 2.1703, 4) - self.assertAlmostEqual(sc2.rmsd['00001_FE_2'][1], 2.1703, 4) - - if __name__ == "__main__": from ost import testutils if testutils.DefaultCompoundLibIsSet(): testutils.RunTests() else: - print('No compound lib available. Ignoring test_ligand_scoring.py tests.') + print('No compound lib available. Ignoring test_ligand_scoring.py tests.') \ No newline at end of file