From fe259ddf4acbf83d38fcef2a435fb6070568c006 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Fri, 9 Aug 2024 15:25:36 +0200
Subject: [PATCH] chain mapping: reduce likelihood of grouping dissimilar
 sequences together

In principle one can have the following alignment:

XXXXXXXXXXA--------
----------AYYYYYYYY

It has a 100% sequence identity! The previously implemented logic of gap
thresholds was also not very helpfil to filter out these cases as it
operated on fraction of gaps between first and last aligned column in
the alignment. That's 0.0 and thus perfect.

This commit simplifies this logic and simply checks for a sequence identity
threshold and a minimum number of aligned columns when grouping sequences
together. This should make grouping these cases together very unlikely.
---
 modules/mol/alg/pymod/chain_mapping.py      | 82 ++++++++-------------
 modules/mol/alg/tests/test_chain_mapping.py | 10 +--
 2 files changed, 30 insertions(+), 62 deletions(-)

diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py
index 9d025f316..4aabece65 100644
--- a/modules/mol/alg/pymod/chain_mapping.py
+++ b/modules/mol/alg/pymod/chain_mapping.py
@@ -522,10 +522,9 @@ class ChainMapper:
       simply derived from residue numbers (*resnum_alignments* flag).
       In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
       and their nucleotide equivalents are relevant. Two chains are
-      considered identical if they fulfill the thresholds given by
-      *pep_seqid_thr*, *pep_gap_thr*, their nucleotide equivalents
-      respectively. The grouping information is available as
-      attributes of this class.
+      considered identical if they fulfill the *pep_seqid_thr* and
+      have at least *min_pep_length* aligned residues. Same logic
+      is applied for nucleotides with respective thresholds.
 
     * Map chains in an input model to these groups. Generating alignments
       and the similarity criteria are the same as above. You can either
@@ -551,19 +550,8 @@ class ChainMapper:
                           identical. 95 percent tolerates the few mutations
                           crystallographers like to do.
     :type pep_seqid_thr:  :class:`float`
-    :param pep_gap_thr: Additional threshold to avoid gappy alignments with
-                        high seqid. By default this is disabled (set to 1.0).
-                        This threshold checks for a maximum allowed fraction
-                        of gaps in any of the two sequences after stripping
-                        terminal gaps. The reason for not just normalizing
-                        seqid by the longer sequence is that one sequence
-                        might be a perfect subsequence of the other but only
-                        cover half of it. 
-    :type pep_gap_thr:  :class:`float`
     :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
     :type nuc_seqid_thr:  :class:`float`
-    :param nuc_gap_thr: Nucleotide equivalent for *nuc_gap_thr*
-    :type nuc_gap_thr:  :class:`float`
     :param pep_subst_mat: Substitution matrix to align peptide sequences,
                           irrelevant if *resnum_alignments* is True,
                           defaults to seq.alg.BLOSUM62
@@ -595,8 +583,7 @@ class ChainMapper:
     :type n_max_naive: :class:`int`
     """
     def __init__(self, target, resnum_alignments=False,
-                 pep_seqid_thr = 95., pep_gap_thr = 1.0,
-                 nuc_seqid_thr = 95., nuc_gap_thr = 1.0,
+                 pep_seqid_thr = 95., nuc_seqid_thr = 95., 
                  pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
                  pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
                  nuc_gap_open = -4, nuc_gap_ext = -4,
@@ -606,9 +593,7 @@ class ChainMapper:
         # attributes
         self.resnum_alignments = resnum_alignments
         self.pep_seqid_thr = pep_seqid_thr
-        self.pep_gap_thr = pep_gap_thr
         self.nuc_seqid_thr = nuc_seqid_thr
-        self.nuc_gap_thr = nuc_gap_thr
         self.min_pep_length = min_pep_length
         self.min_nuc_length = min_nuc_length
         self.n_max_naive = n_max_naive
@@ -697,9 +682,9 @@ class ChainMapper:
             _GetChemGroupAlignments(self.polypep_seqs, self.polynuc_seqs,
                                     self.aligner,
                                     pep_seqid_thr=self.pep_seqid_thr,
-                                    pep_gap_thr=self.pep_gap_thr,
+                                    min_pep_length=self.min_pep_length,
                                     nuc_seqid_thr=self.nuc_seqid_thr,
-                                    nuc_gap_thr=self.nuc_gap_thr)
+                                    min_nuc_length=self.min_nuc_length)
 
         return self._chem_group_alignments
 
@@ -738,9 +723,9 @@ class ChainMapper:
             _GetChemGroupAlignments(self.polypep_seqs, self.polynuc_seqs,
                                     self.aligner,
                                     pep_seqid_thr=self.pep_seqid_thr,
-                                    pep_gap_thr=self.pep_gap_thr,
+                                    min_pep_length=self.min_pep_length,
                                     nuc_seqid_thr=self.nuc_seqid_thr,
-                                    nuc_gap_thr=self.nuc_gap_thr)
+                                    min_nuc_length=self.min_nuc_length)
 
         return self._chem_group_types
         
@@ -1760,20 +1745,17 @@ def _GetAlnPropsOne(aln):
 
     :param aln: Alignment to compute properties
     :type aln: :class:`seq.AlignmentHandle`
-    :returns: Tuple with 3 elements. 1) sequence identify in range [0, 100] 
-              considering aligned columns 2) Fraction of gaps between
-              first and last aligned column in s1 3) same for s2.
+    :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100] 
+              considering aligned columns 2) Number of aligned columns.
     """
     assert(aln.GetCount() == 2)
-    n_gaps_1 = str(aln.GetSequence(0)).strip('-').count('-')
-    n_gaps_2 = str(aln.GetSequence(1)).strip('-').count('-')
-    gap_frac_1 = float(n_gaps_1)/len(aln.GetSequence(0).GetGaplessString())
-    gap_frac_2 = float(n_gaps_2)/len(aln.GetSequence(1).GetGaplessString())
-    return (seq.alg.SequenceIdentity(aln), gap_frac_1, gap_frac_2) 
+    seqid = seq.alg.SequenceIdentity(aln)
+    n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
+    return (seqid, n_aligned) 
 
 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
-                            pep_gap_thr=0.1, nuc_seqid_thr=95.,
-                            nuc_gap_thr=0.1):
+                            min_pep_length=6, nuc_seqid_thr=95.,
+                            min_nuc_length=4):
     """Returns alignments with groups of chemically equivalent chains
 
     :param pep_seqs: List of polypeptide sequences
@@ -1786,17 +1768,14 @@ def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
                           identical. 95 percent tolerates the few mutations
                           crystallographers like to do.
     :type pep_seqid_thr:  :class:`float`
-    :param pep_gap_thr: Additional threshold to avoid gappy alignments with high
-                        seqid. The reason for not just normalizing seqid by the
-                        longer sequence is that one sequence might be a perfect
-                        subsequence of the other but only cover half of it. This
-                        threshold checks for a maximum allowed fraction of gaps
-                        in any of the two sequences after stripping terminal gaps.
-    :type pep_gap_thr: :class:`float`
+    :param min_pep_length: Additional threshold to avoid gappy alignments with high
+                           seqid. Number of aligned columns must be at least this
+                           number.
+    :type min_pep_length: :class:`int`
     :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
     :type nuc_seqid_thr:  :class:`float`
-    :param nuc_gap_thr: Nucleotide equivalent of *nuc_gap_thr*
-    :type nuc_gap_thr: :class:`float`
+    :param min_nuc_length: Nucleotide equivalent of *min_pep_length*
+    :type min_nuc_length: :class:`int`
     :returns: Tuple with first element being an AlignmentList. Each alignment
               represents a group of chemically equivalent chains and the first
               sequence is the longest. Second element is a list of equivalent
@@ -1804,9 +1783,9 @@ def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
               [:class:`ost.ChemType.AMINOACIDS`,
               :class:`ost.ChemType.NUCLEOTIDES`] 
     """
-    pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, pep_gap_thr, aligner,
+    pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, min_pep_length, aligner,
                                  mol.ChemType.AMINOACIDS)
-    nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, nuc_gap_thr, aligner,
+    nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, min_nuc_length, aligner,
                                  mol.ChemType.NUCLEOTIDES)
     group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
     group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
@@ -1814,18 +1793,15 @@ def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
     groups.extend(nuc_groups)
     return (groups, group_types)
 
-def _GroupSequences(seqs, seqid_thr, gap_thr, aligner, chem_type):
+def _GroupSequences(seqs, seqid_thr, min_length, aligner, chem_type):
     """Get list of alignments representing groups of equivalent sequences
 
     :param seqid_thr: Threshold used to decide when two chains are identical.
     :type seqid_thr:  :class:`float`
     :param gap_thr: Additional threshold to avoid gappy alignments with high
-                    seqid. The reason for not just normalizing seqid by the
-                    longer sequence is that one sequence might be a perfect
-                    subsequence of the other but only cover half of it. This
-                    threshold checks for a maximum allowed fraction of gaps
-                    in any of the two sequences after stripping terminal gaps.
-    :type gap_thr: :class:`float`
+                    seqid. Number of aligned columns must be at least this
+                    number.
+    :type gap_thr: :class:`int`
     :param aligner: Helper class to generate pairwise alignments
     :type aligner: :class:`_Aligner`
     :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
@@ -1843,8 +1819,8 @@ def _GroupSequences(seqs, seqid_thr, gap_thr, aligner, chem_type):
             for g_s_idx in range(len(groups[g_idx])):
                 aln  = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
                                      chem_type)
-                sid, frac_i, frac_j = _GetAlnPropsOne(aln)
-                if sid >= seqid_thr and frac_i < gap_thr and frac_j < gap_thr:
+                sid, n_aligned = _GetAlnPropsOne(aln)
+                if sid >= seqid_thr and n_aligned >= min_length:
                     matching_group = g_idx
                     break
             if matching_group is not None:
diff --git a/modules/mol/alg/tests/test_chain_mapping.py b/modules/mol/alg/tests/test_chain_mapping.py
index 6e17a67e6..b71676b26 100644
--- a/modules/mol/alg/tests/test_chain_mapping.py
+++ b/modules/mol/alg/tests/test_chain_mapping.py
@@ -133,13 +133,8 @@ class TestChainMapper(unittest.TestCase):
     ed.SetResidueNumber(r, mol.ResNum(r.GetNumber().GetNum() + 42))
     self.assertRaises(Exception, ChainMapper, tmp_ent, resnum_alignments=True)
 
-    # chain B has a missing Valine... set pep_gap_thr to 0.0 should give an
-    # additional chem group
-    mapper = ChainMapper(ent, pep_gap_thr=0.0)
-    self.assertEqual(len(mapper.chem_groups), 4)
-
     # introduce single point mutation in ent... increasing pep_seqid_thr to 100
-    # should give an additional chem group too
+    # should give an additional chem group
     mapper = ChainMapper(ent, pep_seqid_thr=100.)
     self.assertEqual(len(mapper.chem_groups), 3)
     tmp_ent = ent.Copy()
@@ -166,9 +161,6 @@ class TestChainMapper(unittest.TestCase):
     mapper = ChainMapper(tmp_ent, nuc_seqid_thr=100.)
     self.assertEqual(len(mapper.polynuc_seqs), 3)
     self.assertEqual(len(mapper.chem_groups), 4)
-    mapper = ChainMapper(tmp_ent, nuc_gap_thr=0.0)
-    self.assertEqual(len(mapper.polynuc_seqs), 3)
-    self.assertEqual(len(mapper.chem_groups), 4)
 
   def test_chem_mapping(self):
 
-- 
GitLab