scoring: min_pep_length/min_nuc_length in compare-structures action

Minimum length for a chain in the target structure to be considered in chain mapping. The chain mapping algorithm first performs an all vs. all pairwise sequence alignment to identify "equal" chains within the target structure. We go for simple sequence identity there. Short sequences can be problematic as they may produce high sequence identity alignments by pure chance. BUT: if you're scoring peptides or short nucleotides, you really want to be able to reduce the default thresholds (pep: 10, nuc: 4).

scoring: min_pep_length/min_nuc_length in compare-structures action
341783cc · Studer Gabriel · 6a67fea1 · 341783cc · 341783cc · 341783cc
Commit 341783cc authored Jan 11, 2024 by Studer Gabriel
--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -57,6 +57,8 @@ results:
 * "cad_exec"
 * "usalign_exec"
 * "lddt_no_stereochecks"
+ * "min_pep_length"
+ * "min_nuc_length"

 The pairwise sequence alignments are computed with Needleman-Wunsch using
 BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -493,6 +495,36 @@ def _ParseArgs():
        help=("Dump additional info on model and reference residues that occur "
              "in pepnuc alignments."))
    
+    parser.add_argument(
+        "--min-pep-length",
+        dest="min_pep_length",
+        default = 10,
+        type=int,
+        help=("Relevant parameter if short peptides are involved in scoring. "
+              "Minimum peptide length for a chain in the target structure to "
+              "be considered in chain mapping. The chain mapping algorithm "
+              "first performs an all vs. all pairwise sequence alignment to "
+              "identify \"equal\" chains within the target structure. We go "
+              "for simple sequence identity there. Short sequences can be "
+              "problematic as they may produce high sequence identity "
+              "alignments by pure chance.")
+    )
+
+    parser.add_argument(
+        "--min-nuc-length",
+        dest="min_nuc_length",
+        default = 4,
+        type=int,
+        help=("Relevant parameter if short nucleotides are involved in scoring. "
+              "Minimum nucleotide length for a chain in the target structure to "
+              "be considered in chain mapping. The chain mapping algorithm "
+              "first performs an all vs. all pairwise sequence alignment to "
+              "identify \"equal\" chains within the target structure. We go "
+              "for simple sequence identity there. Short sequences can be "
+              "problematic as they may produce high sequence identity "
+              "alignments by pure chance.")
+    )
+ 
    return parser.parse_args()

 def _RoundOrNone(num, decimals = 3):
@@ -695,7 +727,9 @@ def _Process(model, reference, args):
                            usalign_exec = args.usalign_exec,
                            lddt_no_stereochecks = args.lddt_no_stereochecks,
                            n_max_naive = args.n_max_naive,
-                            oum = args.oum)
+                            oum = args.oum,
+                            min_pep_length = args.min_pep_length,
+                            min_nuc_length = args.min_nuc_length)

    ir = _GetInconsistentResidues(scorer.aln)
    if len(ir) > 0 and args.enforce_consistency:
@@ -876,6 +910,8 @@ def _Main():
        out["cad_exec"] = args.cad_exec
        out["usalign_exec"] = args.usalign_exec
        out["lddt_no_stereochecks"] = args.lddt_no_stereochecks
+        out["min_pep_length"] = args.min_pep_length
+        out["min_nuc_length"] = args.min_nuc_length
        out["status"] = "SUCCESS"
        with open(args.output, 'w') as fh:
            json.dump(out, fh, indent=4, sort_keys=False)


--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -40,11 +40,16 @@ Details on the usage (output of ``ost compare-structures --help``):
                                [--local-lddt] [--bb-lddt] [--bb-local-lddt]
                                [--cad-score] [--local-cad-score]
                                [--cad-exec CAD_EXEC]
-                                [--usalign-exec USALIGN_EXEC] [--qs-score]
-                                [--dockq] [--contact-scores] [--rigid-scores]
+                                [--usalign-exec USALIGN_EXEC]
+                                [--override-usalign-mapping] [--qs-score]
+                                [--dockq] [--ics] [--ips] [--rigid-scores]
                                [--patch-scores] [--tm-score]
                                [--lddt-no-stereochecks]
                                [--n-max-naive N_MAX_NAIVE]
+                                [--dump-aligned-residues] [--dump-pepnuc-alns]
+                                [--dump-pepnuc-aligned-residues]
+                                [--min-pep-length MIN_PEP_LENGTH]
+                                [--min-nuc-length MIN_NUC_LENGTH]

  Evaluate model against reference 

@@ -104,6 +109,8 @@ Details on the usage (output of ``ost compare-structures --help``):
   * "cad_exec"
   * "usalign_exec"
   * "lddt_no_stereochecks"
+   * "min_pep_length"
+   * "min_nuc_length"

  The pairwise sequence alignments are computed with Needleman-Wunsch using
  BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -351,6 +358,26 @@ Details on the usage (output of ``ost compare-structures --help``):
    --dump-pepnuc-aligned-residues
                          Dump additional info on model and reference residues
                          that occur in pepnuc alignments.
+    --min-pep-length MIN_PEP_LENGTH
+                          Relevant parameter if short peptides are involved in
+                          scoring. Minimum peptide length for a chain in the
+                          target structure to be considered in chain mapping.
+                          The chain mapping algorithm first performs an all vs.
+                          all pairwise sequence alignment to identify "equal"
+                          chains within the target structure. We go for simple
+                          sequence identity there. Short sequences can be
+                          problematic as they may produce high sequence identity
+                          alignments by pure chance.
+    --min-nuc-length MIN_NUC_LENGTH
+                          Relevant parameter if short nucleotides are involved
+                          in scoring. Minimum nucleotide length for a chain in
+                          the target structure to be considered in chain
+                          mapping. The chain mapping algorithm first performs an
+                          all vs. all pairwise sequence alignment to identify
+                          "equal" chains within the target structure. We go for
+                          simple sequence identity there. Short sequences can be
+                          problematic as they may produce high sequence identity
+                          alignments by pure chance.


 .. _ost compare ligand structures:


--- a/modules/mol/alg/pymod/scoring.py
+++ b/modules/mol/alg/pymod/scoring.py
@@ -141,12 +141,33 @@ class Scorer:
                object into USalign to compute TM-score. Experimental feature
                with limitations.
    :type oum: :class:`bool`
+    :param min_pep_length: Relevant parameter if short peptides are involved in
+                           scoring. Minimum peptide length for a chain in the
+                           target structure to be considered in chain mapping.
+                           The chain mapping algorithm first performs an all vs.
+                           all pairwise sequence alignment to identify "equal"
+                           chains within the target structure. We go for simple
+                           sequence identity there. Short sequences can be
+                           problematic as they may produce high sequence identity
+                           alignments by pure chance.
+    :type min_pep_length: :class:`int`
+    :param min_nuc_length: Relevant parameter if short nucleotides are involved
+                           in scoring. Minimum nucleotide length for a chain in
+                           the target structure to be considered in chain
+                           mapping. The chain mapping algorithm first performs
+                           an all vs. all pairwise sequence alignment to
+                           identify "equal" chains within the target
+                           structure. We go for simple sequence identity there.
+                           Short sequences can be problematic as they may
+                           produce high sequence identity alignments by pure
+                           chance.
+    :type min_nuc_length: :class:`int`
    """
    def __init__(self, model, target, resnum_alignments=False,
                 molck_settings = None, cad_score_exec = None,
                 custom_mapping=None, usalign_exec = None,
                 lddt_no_stereochecks=False, n_max_naive=40320,
-                 oum=False):
+                 oum=False, min_pep_length = 10, min_nuc_length = 4):

        self._target_orig = target
        self._model_orig = model
@@ -231,6 +252,8 @@ class Scorer:
        self.lddt_no_stereochecks = lddt_no_stereochecks
        self.n_max_naive = n_max_naive
        self.oum = oum
+        self.min_pep_length = min_pep_length
+        self.min_nuc_length = min_nuc_length

        # lazily evaluated attributes
        self._stereochecked_model = None
@@ -491,7 +514,9 @@ class Scorer:
        if self._chain_mapper is None:
            self._chain_mapper = chain_mapping.ChainMapper(self.target,
                                                           n_max_naive=1e9,
-                                                           resnum_alignments=self.resnum_alignments)
+                                                           resnum_alignments=self.resnum_alignments,
+                                                           min_pep_length=self.min_pep_length,
+                                                           min_nuc_length=self.min_nuc_length)
        return self._chain_mapper

    @property