From 341783cce4abe39ef1a33db8f844571efe7b30a6 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Thu, 11 Jan 2024 18:30:27 +0100
Subject: [PATCH] scoring: min_pep_length/min_nuc_length in compare-structures
 action

Minimum length for a chain in the target structure
to be considered in chain mapping. The chain mapping algorithm first
performs an all vs. all pairwise sequence alignment to identify
"equal" chains within the target structure. We go for simple sequence
identity there. Short sequences can be problematic as they may
produce high sequence identity alignments by pure chance.

BUT: if you're scoring peptides or short nucleotides, you
really want to be able to reduce the default thresholds
(pep: 10, nuc: 4)
---
 actions/ost-compare-structures   | 38 +++++++++++++++++++++++++++++++-
 modules/doc/actions.rst          | 31 ++++++++++++++++++++++++--
 modules/mol/alg/pymod/scoring.py | 29 ++++++++++++++++++++++--
 3 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures
index cfaabbd37..9606e32f4 100644
--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -57,6 +57,8 @@ results:
  * "cad_exec"
  * "usalign_exec"
  * "lddt_no_stereochecks"
+ * "min_pep_length"
+ * "min_nuc_length"
 
 The pairwise sequence alignments are computed with Needleman-Wunsch using
 BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -492,6 +494,36 @@ def _ParseArgs():
         action="store_true",
         help=("Dump additional info on model and reference residues that occur "
               "in pepnuc alignments."))
+    
+    parser.add_argument(
+        "--min-pep-length",
+        dest="min_pep_length",
+        default = 10,
+        type=int,
+        help=("Relevant parameter if short peptides are involved in scoring."
+              "Minimum peptide length for a chain in the target structure to "
+              "be considered in chain mapping. The chain mapping algorithm "
+              "first performs an all vs. all pairwise sequence alignment to "
+              "identify \"equal\" chains within the target structure. We go "
+              "for simple sequence identity there. Short sequences can be "
+              "problematic as they may produce high sequence identity "
+              "alignments by pure chance.")
+    )
+
+    parser.add_argument(
+        "--min-nuc-length",
+        dest="min_nuc_length",
+        default = 4,
+        type=int,
+        help=("Relevant parameter if short nucleotides are involved in scoring."
+              "Minimum nucleotide length for a chain in the target structure to "
+              "be considered in chain mapping. The chain mapping algorithm "
+              "first performs an all vs. all pairwise sequence alignment to "
+              "identify \"equal\" chains within the target structure. We go "
+              "for simple sequence identity there. Short sequences can be "
+              "problematic as they may produce high sequence identity "
+              "alignments by pure chance.")
+    )
  
     return parser.parse_args()
 
@@ -695,7 +727,9 @@ def _Process(model, reference, args):
                             usalign_exec = args.usalign_exec,
                             lddt_no_stereochecks = args.lddt_no_stereochecks,
                             n_max_naive = args.n_max_naive,
-                            oum = args.oum)
+                            oum = args.oum,
+                            min_pep_length = args.min_pep_length,
+                            min_nuc_length = args.min_nuc_length)
 
     ir = _GetInconsistentResidues(scorer.aln)
     if len(ir) > 0 and args.enforce_consistency:
@@ -876,6 +910,8 @@ def _Main():
         out["cad_exec"] = args.cad_exec
         out["usalign_exec"] = args.usalign_exec
         out["lddt_no_stereochecks"] = args.lddt_no_stereochecks
+        out["min_pep_length"] = args.min_pep_length
+        out["min_nuc_length"] = args.min_nuc_length
         out["status"] = "SUCCESS"
         with open(args.output, 'w') as fh:
             json.dump(out, fh, indent=4, sort_keys=False)
diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst
index eee1181a1..523926fc3 100644
--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -40,11 +40,16 @@ Details on the usage (output of ``ost compare-structures --help``):
                                 [--local-lddt] [--bb-lddt] [--bb-local-lddt]
                                 [--cad-score] [--local-cad-score]
                                 [--cad-exec CAD_EXEC]
-                                [--usalign-exec USALIGN_EXEC] [--qs-score]
-                                [--dockq] [--contact-scores] [--rigid-scores]
+                                [--usalign-exec USALIGN_EXEC]
+                                [--override-usalign-mapping] [--qs-score]
+                                [--dockq] [--ics] [--ips] [--rigid-scores]
                                 [--patch-scores] [--tm-score]
                                 [--lddt-no-stereochecks]
                                 [--n-max-naive N_MAX_NAIVE]
+                                [--dump-aligned-residues] [--dump-pepnuc-alns]
+                                [--dump-pepnuc-aligned-residues]
+                                [--min-pep-length MIN_PEP_LENGTH]
+                                [--min-nuc-length MIN_NUC_LENGTH]
 
   Evaluate model against reference 
 
@@ -104,6 +109,8 @@ Details on the usage (output of ``ost compare-structures --help``):
    * "cad_exec"
    * "usalign_exec"
    * "lddt_no_stereochecks"
+   * "min_pep_length"
+   * "min_nuc_length"
 
   The pairwise sequence alignments are computed with Needleman-Wunsch using
   BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -351,6 +358,26 @@ Details on the usage (output of ``ost compare-structures --help``):
     --dump-pepnuc-aligned-residues
                           Dump additional info on model and reference residues
                           that occur in pepnuc alignments.
+    --min-pep-length MIN_PEP_LENGTH
+                          Relevant parameter if short peptides are involved in
+                          scoring.Minimum peptide length for a chain in the
+                          target structure to be considered in chain mapping.
+                          The chain mapping algorithm first performs an all vs.
+                          all pairwise sequence alignment to identify "equal"
+                          chains within the target structure. We go for simple
+                          sequence identity there. Short sequences can be
+                          problematic as they may produce high sequence identity
+                          alignments by pure chance.
+    --min-nuc-length MIN_NUC_LENGTH
+                          Relevant parameter if short nucleotides are involved
+                          in scoring.Minimum nucleotide length for a chain in
+                          the target structure to be considered in chain
+                          mapping. The chain mapping algorithm first performs an
+                          all vs. all pairwise sequence alignment to identify
+                          "equal" chains within the target structure. We go for
+                          simple sequence identity there. Short sequences can be
+                          problematic as they may produce high sequence identity
+                          alignments by pure chance.
 
 
 .. _ost compare ligand structures:
diff --git a/modules/mol/alg/pymod/scoring.py b/modules/mol/alg/pymod/scoring.py
index 6aaa2276e..e568fdf3a 100644
--- a/modules/mol/alg/pymod/scoring.py
+++ b/modules/mol/alg/pymod/scoring.py
@@ -141,12 +141,33 @@ class Scorer:
                 object into USalign to compute TM-score. Experimental feature
                 with limitations.
     :type oum: :class:`bool`
+    :param min_pep_length: Relevant parameter if short peptides are involved in
+                           scoring. Minimum peptide length for a chain in the
+                           target structure to be considered in chain mapping.
+                           The chain mapping algorithm first performs an all vs.
+                           all pairwise sequence alignment to identify \"equal\"
+                           chains within the target structure. We go for simple
+                           sequence identity there. Short sequences can be
+                           problematic as they may produce high sequence identity
+                           alignments by pure chance.
+    :type min_pep_length: :class:`int`
+    :param min_nuc_length: Relevant parameter if short nucleotides are involved
+                           in scoring. Minimum nucleotide length for a chain in
+                           the target structure to be considered in chain
+                           mapping. The chain mapping algorithm first performs
+                           an all vs. all pairwise sequence alignment to
+                           identify \"equal\" chains within the target
+                           structure. We go for simple sequence identity there.
+                           Short sequences can be problematic as they may
+                           produce high sequence identity alignments by pure
+                           chance.
+    :type min_nuc_length: :class:`int`
     """
     def __init__(self, model, target, resnum_alignments=False,
                  molck_settings = None, cad_score_exec = None,
                  custom_mapping=None, usalign_exec = None,
                  lddt_no_stereochecks=False, n_max_naive=40320,
-                 oum=False):
+                 oum=False, min_pep_length = 10, min_nuc_length = 4):
 
         self._target_orig = target
         self._model_orig = model
@@ -231,6 +252,8 @@ class Scorer:
         self.lddt_no_stereochecks = lddt_no_stereochecks
         self.n_max_naive = n_max_naive
         self.oum = oum
+        self.min_pep_length = min_pep_length
+        self.min_nuc_length = min_nuc_length
 
         # lazily evaluated attributes
         self._stereochecked_model = None
@@ -491,7 +514,9 @@ class Scorer:
         if self._chain_mapper is None:
             self._chain_mapper = chain_mapping.ChainMapper(self.target,
                                                            n_max_naive=1e9,
-                                                           resnum_alignments=self.resnum_alignments)
+                                                           resnum_alignments=self.resnum_alignments,
+                                                           min_pep_length=self.min_pep_length,
+                                                           min_nuc_length=self.min_nuc_length)
         return self._chain_mapper
 
     @property
-- 
GitLab