From bf7dbe0b8ece98b0eae85022fe8fcd244d4625ec Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Wed, 5 Mar 2025 14:02:19 +0100
Subject: [PATCH] ligand scoring: allow dist based connectivity heuristic for
 ligands from mmCIF

The default behaviour is to only allow ligands from mmCIF if they're
present in the PDB chemical component dictionary. Reason for that is
the connectivity information we get from there. Actively enabling this
flag in the compare-ligand-structures action allows to use a distance
based heuristic as fallback if the ligand is not in the dicitonary.
---
 actions/ost-compare-ligand-structures        | 37 +++++++++++++++-----
 modules/doc/actions.rst                      | 32 ++++++++++++-----
 modules/mol/alg/pymod/ligand_scoring_base.py | 24 +++++++++----
 3 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures
index cb5527b9e..f09003588 100644
--- a/actions/ost-compare-ligand-structures
+++ b/actions/ost-compare-ligand-structures
@@ -35,10 +35,13 @@ Ligands can be given as path to SDF files containing the ligand for both model
 ligands are optionally detected from a structure file if it is given in mmCIF
 format. This is based on "non-polymer" _entity.type annotation and the
 respective entries must exist in the PDB component dictionary in order to get
-connectivity information. For example, receptor structure and ligand(s) are
-loaded from the same mmCIF file given as '-m'/'-r'. This does not work for
-structures provided in PDB format and an error is raised if ligands are not
-explitely given in SDF format.
+connectivity information. You can avoid the requirement of the PDB component
+dictionary by enabling --allow-heuristic-conn. In this case, connectivity
+is established through a distance based heuristic if the ligand is not found in
+the component dictionary. Be aware that this might be an issue in ligand
+matching.
+If you provide structures in PDB format, an error is raised if ligands are not
+explicitely given in SDF format.
 
 Ligands undergo gentle processing where hydrogens are removed. Connectivity
 is relevant for scoring. It is read directly from SDF input. If ligands are
@@ -486,6 +489,19 @@ def _ParseArgs():
               "specify this mapping with: --trg-seqres-mapping A:1 B:1")
     )
 
+    parser.add_argument(
+        "--allow-heuristic-conn",
+        dest="allow_heuristic_conn",
+        default=False,
+        action="store_true",
+        help=("Default: False - Only relevant if ligands are extracted from "
+              "ref/mdl in mmCIF format. Connectivity in these cases is based "
+              "on the chemical component dictionary. If you enable this flag, "
+              "connectivity can be established by a distance based heuristic "
+              "if the ligand is not present in the component dictionary. This "
+              "might cause issues in ligand matching, i.e. graph matching.")
+    )
+
     args = parser.parse_args()
     if args.output is None:
         args.output = "out.%s" % args.output_format
@@ -564,7 +580,8 @@ def _LoadStructureData(receptor_path,
                        ligand_path,
                        sformat = None,
                        bu_id = None,
-                       fault_tolerant = False):
+                       fault_tolerant = False,
+                       allow_heuristic_conn = False):
 
     receptor = None
     ligands = None
@@ -587,7 +604,8 @@ def _LoadStructureData(receptor_path,
             receptor, ligands = ligand_scoring_base.MMCIFPrep(receptor_path,
                                                               biounit = bu_id,
                                                               extract_nonpoly = True,
-                                                              fault_tolerant = fault_tolerant)
+                                                              fault_tolerant = fault_tolerant,
+                                                              allow_heuristic_conn = allow_heuristic_conn)
         else:
             receptor = ligand_scoring_base.MMCIFPrep(receptor_path,
                                                      biounit = bu_id,
@@ -997,14 +1015,16 @@ def _Main():
                                                           args.reference_ligands,
                                                           sformat = args.reference_format,
                                                           bu_id = args.reference_biounit,
-                                                          fault_tolerant = args.fault_tolerant)
+                                                          fault_tolerant = args.fault_tolerant,
+                                                          allow_heuristic_conn = args.allow_heuristic_conn)
 
         LogInfo("Loading model data")
         model, model_ligands = _LoadStructureData(args.model,
                                                   args.model_ligands,
                                                   sformat = args.model_format,
                                                   bu_id = args.model_biounit,
-                                                  fault_tolerant = args.fault_tolerant)
+                                                  fault_tolerant = args.fault_tolerant,
+                                                  allow_heuristic_conn = args.allow_heuristic_conn)
 
         out = _Process(model, model_ligands, reference, reference_ligands, args)
 
@@ -1028,6 +1048,7 @@ def _Main():
             out["trg_seqres_mapping"] = tmp
         else:
             out["trg_seqres_mapping"] = None
+        out["allow_heuristic_conn"] = args.allow_heuristic_conn
 
         # only add lddtpli if actually computed
         if args.lddt_pli:
diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst
index 161073843..e8263fb17 100644
--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -543,6 +543,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                                        [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
                                        [--seqres SEQRES]
                                        [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]
+                                       [--allow-heuristic-conn]
 
   Evaluate model with non-polymer/small molecule ligands against reference.
 
@@ -580,10 +581,13 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
   ligands are optionally detected from a structure file if it is given in mmCIF
   format. This is based on "non-polymer" _entity.type annotation and the
   respective entries must exist in the PDB component dictionary in order to get
-  connectivity information. For example, receptor structure and ligand(s) are
-  loaded from the same mmCIF file given as '-m'/'-r'. This does not work for
-  structures provided in PDB format and an error is raised if ligands are not
-  explitely given in SDF format.
+  connectivity information. You can avoid the requirement of the PDB component
+  dictionary by enabling --allow-heuristic-conn. In this case, connectivity
+  is established through a distance based heuristic if the ligand is not found in
+  the component dictionary. Be aware that this might be an issue in ligand
+  matching.
+  If you provide structures in PDB format, an error is raised if ligands are not
+  explicitely given in SDF format.
 
   Ligands undergo gentle processing where hydrogens are removed. Connectivity
   is relevant for scoring. It is read directly from SDF input. If ligands are
@@ -613,10 +617,10 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
      in chain mapping. That is 1) pass the same size threshold as for chem_groups
      2) can be aligned to any of the chem groups with a sequence identity
      threshold that can be controlled by --chem-map-seqid-thresh.
-   * "mdl_chains_without_chem_mapping": Model chains that could be considered in chain mapping,
-     i.e. are long enough, but could not be mapped to any chem group.
-     Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
-     enforced by setting it to 0.
+   * "mdl_chains_without_chem_mapping": Model chains that could be considered in
+     chain mapping, i.e. are long enough, but could not be mapped to any chem
+     group. Depends on --chem-map-seqid-thresh. A mapping for each model chain can
+     be enforced by setting it to 0.
    * "status": SUCCESS if everything ran through. In case of failure, the only
      content of the JSON output will be "status" set to FAILURE and an
      additional key: "traceback".
@@ -691,7 +695,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                           Path to model ligand files.
     -r REFERENCE, --ref REFERENCE, --reference REFERENCE
                           Path to reference file.
-    -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [  REFERENCE_LIGANDS ...]
+    -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...]
                           Path to reference ligand files.
     -o OUTPUT, --out OUTPUT, --output OUTPUT
                           Output file name. Default depends on format: out.json
@@ -810,4 +814,14 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                           which you provide a seqres file containing one
                           sequence with name "1". You can specify this mapping
                           with: --trg-seqres-mapping A:1 B:1
+    --allow-heuristic-conn
+                          Default: False - Only relevant if ligands are
+                          extracted from ref/mdl in mmCIF format. Connectivity
+                          in these cases is based on the chemical component
+                          dictionary. If you enable this flag, connectivity can
+                          be established by a distance based heuristic if the
+                          ligand is not present in the component dictionary.
+                          This might cause issues in ligand matching, i.e. graph
+                          matching.
+
 
diff --git a/modules/mol/alg/pymod/ligand_scoring_base.py b/modules/mol/alg/pymod/ligand_scoring_base.py
index cf0a73f49..bdda481e0 100644
--- a/modules/mol/alg/pymod/ligand_scoring_base.py
+++ b/modules/mol/alg/pymod/ligand_scoring_base.py
@@ -77,7 +77,7 @@ def CleanHydrogens(ent, clib):
 
 
 def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
-              fault_tolerant=False):
+              fault_tolerant=False, allow_heuristic_conn=False):
     """ Ligand scoring helper - Prepares :class:`LigandScorer` input from mmCIF
 
     Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
@@ -95,6 +95,15 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
     :type extract_nonpoly: :class:`bool`
     :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
     :type fault_tolerant: :class:`bool`
+    :param allow_heuristic_conn: Only relevant if extract_nonpoly is True.
+                                 The chemical component dictionary is relevant
+                                 for connectivity information. By default, we
+                                 enforce the presence of each non-polymer in
+                                 the dictionary to ensure correct connectity.
+                                 If you enable this flag, you allow the use
+                                 of a distance based heuristic as fallback.
+                                 With all its consequences in ligand matching.
+    :type allow_heuristic_conn: :class:`bool`
     :returns: :class:`ost.mol.EntityHandle` which only contains polymer
               entities representing the receptor structure. If *extract_nonpoly*
               is True, a tuple is returned which additionally contains a
@@ -175,12 +184,13 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
                                f"{mmcif_path} to contain exactly 1 "
                                f"residue. Got {ch.GetResidueCount()} "
                                f"in chain {ch.name}")
-        compound = clib.FindCompound(view.residues[0].name)
-        if compound is None:
-            raise RuntimeError(f"Can only extract non-polymer entities if "
-                               f"respective residues are available in PDB "
-                               f"component dictionary. Can't find "
-                               f"\"{view.residues[0].name}\"")
+        if not allow_heuristic_conn:
+            compound = clib.FindCompound(view.residues[0].name)
+            if compound is None:
+                raise RuntimeError(f"Can only extract non-polymer entities if "
+                                   f"respective residues are available in PDB "
+                                   f"component dictionary. Can't find "
+                                   f"\"{view.residues[0].name}\"")
 
         non_poly_entities.append(mol.CreateEntityFromView(view, True))
 
-- 
GitLab