diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures index cb5527b9e781ee6297c935f4cd1d52bfc358fb26..f09003588167a6af2e3cad4022c985ebd18a256e 100644 --- a/actions/ost-compare-ligand-structures +++ b/actions/ost-compare-ligand-structures @@ -35,10 +35,13 @@ Ligands can be given as path to SDF files containing the ligand for both model ligands are optionally detected from a structure file if it is given in mmCIF format. This is based on "non-polymer" _entity.type annotation and the respective entries must exist in the PDB component dictionary in order to get -connectivity information. For example, receptor structure and ligand(s) are -loaded from the same mmCIF file given as '-m'/'-r'. This does not work for -structures provided in PDB format and an error is raised if ligands are not -explitely given in SDF format. +connectivity information. You can avoid the requirement of the PDB component +dictionary by enabling --allow-heuristic-conn. In this case, connectivity +is established through a distance based heuristic if the ligand is not found in +the component dictionary. Be aware that this might be an issue in ligand +matching. +If you provide structures in PDB format, an error is raised if ligands are not +explicitely given in SDF format. Ligands undergo gentle processing where hydrogens are removed. Connectivity is relevant for scoring. It is read directly from SDF input. If ligands are @@ -486,6 +489,19 @@ def _ParseArgs(): "specify this mapping with: --trg-seqres-mapping A:1 B:1") ) + parser.add_argument( + "--allow-heuristic-conn", + dest="allow_heuristic_conn", + default=False, + action="store_true", + help=("Default: False - Only relevant if ligands are extracted from " + "ref/mdl in mmCIF format. Connectivity in these cases is based " + "on the chemical component dictionary. If you enable this flag, " + "connectivity can be established by a distance based heuristic " + "if the ligand is not present in the component dictionary. This " + "might cause issues in ligand matching, i.e. graph matching.") + ) + args = parser.parse_args() if args.output is None: args.output = "out.%s" % args.output_format @@ -564,7 +580,8 @@ def _LoadStructureData(receptor_path, ligand_path, sformat = None, bu_id = None, - fault_tolerant = False): + fault_tolerant = False, + allow_heuristic_conn = False): receptor = None ligands = None @@ -587,7 +604,8 @@ def _LoadStructureData(receptor_path, receptor, ligands = ligand_scoring_base.MMCIFPrep(receptor_path, biounit = bu_id, extract_nonpoly = True, - fault_tolerant = fault_tolerant) + fault_tolerant = fault_tolerant, + allow_heuristic_conn = allow_heuristic_conn) else: receptor = ligand_scoring_base.MMCIFPrep(receptor_path, biounit = bu_id, @@ -997,14 +1015,16 @@ def _Main(): args.reference_ligands, sformat = args.reference_format, bu_id = args.reference_biounit, - fault_tolerant = args.fault_tolerant) + fault_tolerant = args.fault_tolerant, + allow_heuristic_conn = args.allow_heuristic_conn) LogInfo("Loading model data") model, model_ligands = _LoadStructureData(args.model, args.model_ligands, sformat = args.model_format, bu_id = args.model_biounit, - fault_tolerant = args.fault_tolerant) + fault_tolerant = args.fault_tolerant, + allow_heuristic_conn = args.allow_heuristic_conn) out = _Process(model, model_ligands, reference, reference_ligands, args) @@ -1028,6 +1048,7 @@ def _Main(): out["trg_seqres_mapping"] = tmp else: out["trg_seqres_mapping"] = None + out["allow_heuristic_conn"] = args.allow_heuristic_conn # only add lddtpli if actually computed if args.lddt_pli: diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index 1610738437c33a857d52025858714e840e4a67bb..e8263fb17d91810f0ea9045647895d0c6ed36323 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -543,6 +543,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH] [--seqres SEQRES] [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]] + [--allow-heuristic-conn] Evaluate model with non-polymer/small molecule ligands against reference. @@ -580,10 +581,13 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): ligands are optionally detected from a structure file if it is given in mmCIF format. This is based on "non-polymer" _entity.type annotation and the respective entries must exist in the PDB component dictionary in order to get - connectivity information. For example, receptor structure and ligand(s) are - loaded from the same mmCIF file given as '-m'/'-r'. This does not work for - structures provided in PDB format and an error is raised if ligands are not - explitely given in SDF format. + connectivity information. You can avoid the requirement of the PDB component + dictionary by enabling --allow-heuristic-conn. In this case, connectivity + is established through a distance based heuristic if the ligand is not found in + the component dictionary. Be aware that this might be an issue in ligand + matching. + If you provide structures in PDB format, an error is raised if ligands are not + explicitely given in SDF format. Ligands undergo gentle processing where hydrogens are removed. Connectivity is relevant for scoring. It is read directly from SDF input. If ligands are @@ -613,10 +617,10 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): in chain mapping. That is 1) pass the same size threshold as for chem_groups 2) can be aligned to any of the chem groups with a sequence identity threshold that can be controlled by --chem-map-seqid-thresh. - * "mdl_chains_without_chem_mapping": Model chains that could be considered in chain mapping, - i.e. are long enough, but could not be mapped to any chem group. - Depends on --chem-map-seqid-thresh. A mapping for each model chain can be - enforced by setting it to 0. + * "mdl_chains_without_chem_mapping": Model chains that could be considered in + chain mapping, i.e. are long enough, but could not be mapped to any chem + group. Depends on --chem-map-seqid-thresh. A mapping for each model chain can + be enforced by setting it to 0. * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". @@ -691,7 +695,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): Path to model ligand files. -r REFERENCE, --ref REFERENCE, --reference REFERENCE Path to reference file. - -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [ REFERENCE_LIGANDS ...] + -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...] Path to reference ligand files. -o OUTPUT, --out OUTPUT, --output OUTPUT Output file name. Default depends on format: out.json @@ -810,4 +814,14 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): which you provide a seqres file containing one sequence with name "1". You can specify this mapping with: --trg-seqres-mapping A:1 B:1 + --allow-heuristic-conn + Default: False - Only relevant if ligands are + extracted from ref/mdl in mmCIF format. Connectivity + in these cases is based on the chemical component + dictionary. If you enable this flag, connectivity can + be established by a distance based heuristic if the + ligand is not present in the component dictionary. + This might cause issues in ligand matching, i.e. graph + matching. + diff --git a/modules/mol/alg/pymod/ligand_scoring_base.py b/modules/mol/alg/pymod/ligand_scoring_base.py index cf0a73f498dae5e42381fd1372983333f4a29cd1..bdda481e0355eb3632b77c80b8936af8de741216 100644 --- a/modules/mol/alg/pymod/ligand_scoring_base.py +++ b/modules/mol/alg/pymod/ligand_scoring_base.py @@ -77,7 +77,7 @@ def CleanHydrogens(ent, clib): def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, - fault_tolerant=False): + fault_tolerant=False, allow_heuristic_conn=False): """ Ligand scoring helper - Prepares :class:`LigandScorer` input from mmCIF Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated @@ -95,6 +95,15 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, :type extract_nonpoly: :class:`bool` :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF` :type fault_tolerant: :class:`bool` + :param allow_heuristic_conn: Only relevant if extract_nonpoly is True. + The chemical component dictionary is relevant + for connectivity information. By default, we + enforce the presence of each non-polymer in + the dictionary to ensure correct connectity. + If you enable this flag, you allow the use + of a distance based heuristic as fallback. + With all its consequences in ligand matching. + :type allow_heuristic_conn: :class:`bool` :returns: :class:`ost.mol.EntityHandle` which only contains polymer entities representing the receptor structure. If *extract_nonpoly* is True, a tuple is returned which additionally contains a @@ -175,12 +184,13 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, f"{mmcif_path} to contain exactly 1 " f"residue. Got {ch.GetResidueCount()} " f"in chain {ch.name}") - compound = clib.FindCompound(view.residues[0].name) - if compound is None: - raise RuntimeError(f"Can only extract non-polymer entities if " - f"respective residues are available in PDB " - f"component dictionary. Can't find " - f"\"{view.residues[0].name}\"") + if not allow_heuristic_conn: + compound = clib.FindCompound(view.residues[0].name) + if compound is None: + raise RuntimeError(f"Can only extract non-polymer entities if " + f"respective residues are available in PDB " + f"component dictionary. Can't find " + f"\"{view.residues[0].name}\"") non_poly_entities.append(mol.CreateEntityFromView(view, True))