diff --git a/modelling/doc/pipeline.rst b/modelling/doc/pipeline.rst
index 3e90542c05f05af6398f4dd9bc514be793c4dcaf..bd114c4575734a8a2ecbbf73c0699e19e01334e1 100644
--- a/modelling/doc/pipeline.rst
+++ b/modelling/doc/pipeline.rst
@@ -443,3 +443,11 @@ Modelling Steps
:members:
.. autofunction:: AddModellingIssue
+
+
+Alignment Fiddling
+--------------------------------------------------------------------------------
+
+.. autofunction:: DeleteGapCols
+
+.. autofunction:: PullTerminalDeletions
diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt
index e75ef2691e15c3fbba1e4d6ca2d5566f49750634..d000ba36e7b68adea62bb41ce064ca9b0b03b901 100644
--- a/modelling/pymod/CMakeLists.txt
+++ b/modelling/pymod/CMakeLists.txt
@@ -25,6 +25,7 @@ set(MODELLING_PYMOD
_reconstruct_sidechains.py
_monte_carlo.py
_mhandle_helper.py
+ _alignment_fiddling.py
)
pymod(NAME modelling
diff --git a/modelling/pymod/__init__.py b/modelling/pymod/__init__.py
index 359767927e84ae846742430b9e8474d476273076..54749fcabdaf32fe3864307ec7ae2ce7a8320a5f 100644
--- a/modelling/pymod/__init__.py
+++ b/modelling/pymod/__init__.py
@@ -15,6 +15,7 @@
"""Initialise the modelling module."""
+from ._alignment_fiddling import *
from ._modelling import *
from ._closegaps import *
from ._molprobity import *
diff --git a/modelling/pymod/_alignment_fiddling.py b/modelling/pymod/_alignment_fiddling.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ca25aa2f80c4837db04e781aa7d176b58803950
--- /dev/null
+++ b/modelling/pymod/_alignment_fiddling.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and
+# Biozentrum - University of Basel
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ost import seq
+
+
+def _PullLeftToRight(s0, s1, min_terminal_anchor_size):
+
+ s_len = len(s0)
+
+ # search for first seqres residue which is covered by a template residue
+ first_idx = None
+ for i in range(s_len):
+ if s0[i] != '-' and s1[i] != '-':
+ first_idx = i
+ break
+ if first_idx is None:
+ return False # not a single residue in SEQRES is covered by ATOMSEQ...
+ # nothing to do...
+ # search for first gap start in both sequences
+ try:
+ s0_first_gap = s0.index('-', first_idx)
+ except:
+ return False # no gap in SEQRES... nothing to do...
+ try:
+ s1_first_gap = s1.index('-', first_idx)
+ except:
+ s1_first_gap = s_len # no gap in ATOMSEQ
+
+ if s0_first_gap < s1_first_gap:
+ # the first gap is in the SEQRES => its a deletion!
+ l = s0_first_gap - first_idx
+ if l < min_terminal_anchor_size:
+ del_end = None
+ for i in range(s0_first_gap, s_len):
+ if s0[i] != '-':
+ del_end = i
+ break
+ if del_end is None:
+ return False # gap reaches c-terminus... nothing to do...
+ # shift it!
+ anchor = list(s0[first_idx:first_idx+l])
+ s0[first_idx:first_idx+l] = l*['-']
+ s0[del_end-l:del_end] = anchor
+ return True
+ return False
+
+
+def DeleteGapCols(aln):
+ '''Deletes alignment columns that only contain gaps.
+
+ Columns that only contain gaps ('-') are removed. If no such column can be
+ identified, the input alignment gets returned. A new alignment gets
+ constructed otherwise. The sequences of the new alignment retain name, offset
+ and the potentially attached :class:`ost.mol.EntityView` from the original
+ sequences.
+
+ :param aln: Input alignment
+ :type aln: :class:`ost.seq.AlignmentHandle`
+ :returns: The processed alignment
+ :rtype: :class:`ost.seq.AlignmentHandle`
+ '''
+ n_sequences = aln.GetCount()
+ gap_cols = list()
+ for c_idx, c in enumerate(aln):
+ gaps = [int(c[idx] == '-') for idx in range(n_sequences)]
+ if sum(gaps) == n_sequences:
+ gap_cols.append(c_idx)
+ if len(gap_cols) == 0:
+ return aln # nothing to do
+
+ # remove the found gap columns
+ s_lists = list()
+ for s_idx in range(n_sequences):
+ s_lists.append([c for c in str(aln.GetSequence(s_idx))])
+ for gap_col_idx in reversed(gap_cols):
+ for sl in s_lists:
+ sl.pop(gap_col_idx)
+
+ # construct a return alignment with the new sequences but names, offsets
+ # and attached views from the old alignment
+ return_aln = seq.CreateAlignment()
+ for s_idx in range(n_sequences):
+ s = aln.GetSequence(s_idx)
+ s_processed = seq.CreateSequence(s.GetName(), ''.join(s_lists[s_idx]))
+ s_processed.SetOffset(s.GetOffset())
+ if s.HasAttachedView():
+ s_processed.AttachView(s.GetAttachedView())
+ return_aln.AddSequence(s_processed)
+
+ return return_aln
+
+
+def PullTerminalDeletions(aln, min_terminal_anchor_size = 4):
+ '''Fixes deletions close to termini.
+
+ Some alignment tools may produce alignments with deletions close to the
+ termini. For example:
+
+ .. code-block:: none
+
+ SEQRES: A-----BCDE...
+ ATOMSEQ: ABCDEFGHIJ...
+
+ where A is the anchor residue. The default loop modelling pipeline would
+ keep the position of A fixed and start to omit structural information from
+ B, C, ... until it is able to resolve the deletion. If the anchor is very
+ short, a shift typically results in a better model:
+
+ .. code-block:: none
+
+ SEQRES: -----ABCDE...
+ ATOMSEQ: ABCDEFGHIJ...
+
+ This function checks whether the gap closest to any termini is a deletion
+ (one or several '-' in the first sequence) and estimates the anchor size
+ (number of aligned residues towards the respective termini from that gap).
+ If the anchor size is smaller than *min_terminal_anchor_size*, a shift is
+ applied and the deletion removed.
+
+ This is done iteratively, until no deletion can be removed anymore.
+
+ .. code-block:: none
+
+ SEQRES: A-B--CDEF...
+ ATOMSEQ: ABCDEFGHI...
+
+ becomes
+
+ .. code-block:: none
+
+ SEQRES: ---ABCDEF...
+ ATOMSEQ: ABCDEFGHI...
+
+ given a *min_terminal_anchor_size*>2. If no shift can be performed, the
+ input alignment gets returned. A new alignment gets constructed otherwise.
+ The sequences of the new alignment retain name, offset and the potentially
+ attached :class:`ost.mol.EntityView` from the original sequences.
+
+ :param aln: Input alignment
+ :type aln: :class:`ost.seq.AlignmentHandle`
+ :returns: The processed alignment
+ :rtype: :class:`ost.seq.AlignmentHandle`
+ '''
+ if aln.GetCount() != 2:
+ raise RuntimeError("Expect 2 seqs for aln in PullTerminalDeletions")
+
+ if min_terminal_anchor_size < 1:
+ raise RuntimeError("min_terminal_anchor_size must be > 0 in PullTerminalDeletions")
+
+ aln_no_gap_cols = DeleteGapCols(aln)
+ s0 = [c for c in str(aln_no_gap_cols.GetSequence(0))]
+ s1 = [c for c in str(aln_no_gap_cols.GetSequence(1))]
+ something_happened = False
+
+ # Pull in deletions from the n-terminus
+ while _PullLeftToRight(s0, s1, min_terminal_anchor_size):
+ something_happened = True
+
+ # Pull in deletions from the c-terminus
+ s0.reverse()
+ s1.reverse()
+ while _PullLeftToRight(s0, s1, min_terminal_anchor_size):
+ something_happened = True
+
+ if not something_happened:
+ return aln # return original alignment as no pulling happened...
+
+ s0.reverse()
+ s1.reverse()
+
+ return_aln = seq.CreateAlignment()
+ s = aln.GetSequence(0)
+ s_processed = seq.CreateSequence(s.GetName(), ''.join(s0))
+ s_processed.SetOffset(s.GetOffset())
+ if s.HasAttachedView():
+ s_processed.AttachView(s.GetAttachedView())
+ return_aln.AddSequence(s_processed)
+ s = aln.GetSequence(1)
+ s_processed = seq.CreateSequence(s.GetName(), ''.join(s1))
+ s_processed.SetOffset(s.GetOffset())
+ if s.HasAttachedView():
+ s_processed.AttachView(s.GetAttachedView())
+ return_aln.AddSequence(s_processed)
+
+ return return_aln
diff --git a/modelling/tests/CMakeLists.txt b/modelling/tests/CMakeLists.txt
index 72ce3f9aff830412fd28f48bd6fe2b4511cf7c6e..5afe06ee9f50df622b2524d9f67c432741af04bf 100644
--- a/modelling/tests/CMakeLists.txt
+++ b/modelling/tests/CMakeLists.txt
@@ -5,6 +5,7 @@ set(MODELLING_UNIT_TESTS
test_pipeline.py
test_ring_punches.py
test_sidechain_reconstruction.py
+ test_alignment_fiddling.py
test_loop_candidates.cc
test_loop_closing.cc
test_sidechain_reconstructor.cc
@@ -69,6 +70,8 @@ set(MODELLING_TEST_DATA
data/seq.fasta
data/smtl2bl4.1.pdb
data/ter.fasta
+ data/terminal_deletion_aln.fasta
+ data/double_gap_aln.fasta
)
promod3_unittest(MODULE modelling
diff --git a/modelling/tests/data/double_gap_aln.fasta b/modelling/tests/data/double_gap_aln.fasta
new file mode 100644
index 0000000000000000000000000000000000000000..51c7a742943594456123dfcbb468ad94a524b38a
--- /dev/null
+++ b/modelling/tests/data/double_gap_aln.fasta
@@ -0,0 +1,4 @@
+>target
+-ABCDE-FG-H----I-
+>template
+-ABCDE-FGHI-JK-L-
\ No newline at end of file
diff --git a/modelling/tests/data/terminal_deletion_aln.fasta b/modelling/tests/data/terminal_deletion_aln.fasta
new file mode 100644
index 0000000000000000000000000000000000000000..095e547d4986c68cdfe94a9aa3daf02dced5a93a
--- /dev/null
+++ b/modelling/tests/data/terminal_deletion_aln.fasta
@@ -0,0 +1,4 @@
+>target
+GSHMGD----LK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAI-ENNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG
+>template
+-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------
diff --git a/modelling/tests/test_alignment_fiddling.py b/modelling/tests/test_alignment_fiddling.py
new file mode 100644
index 0000000000000000000000000000000000000000..e015cb3c2f7f6d2f7d4331764c7f87e9d108cf63
--- /dev/null
+++ b/modelling/tests/test_alignment_fiddling.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and
+# Biozentrum - University of Basel
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+Unit tests for alignment fiddling.
+"""
+import unittest
+from promod3 import modelling
+from ost import seq, io
+
+class ModellingTests(unittest.TestCase):
+
+ def testDeleteGapCols(self):
+ aln = io.LoadAlignment('data/double_gap_aln.fasta')
+ s0_orig = aln.GetSequence(0)
+ s1_orig = aln.GetSequence(1)
+ # recreate aln with added offsets and attach some random structure to
+ # both sequences
+ s0 = seq.CreateSequence(s0_orig.GetName(), str(s0_orig))
+ s1 = seq.CreateSequence(s1_orig.GetName(), str(s1_orig))
+ s0.SetOffset(1)
+ s1.SetOffset(2)
+ random_structure = io.LoadPDB('data/1CRN.pdb')
+ s0.AttachView(random_structure.CreateFullView())
+ s1.AttachView(random_structure.CreateFullView())
+ pimped_aln = seq.CreateAlignment()
+ pimped_aln.AddSequence(s0)
+ pimped_aln.AddSequence(s1)
+ processed_aln = modelling.DeleteGapCols(pimped_aln)
+ processed_s0 = processed_aln.GetSequence(0)
+ processed_s1 = processed_aln.GetSequence(1)
+
+ # the following things should be equal between sx and processed_sx:
+ # - name
+ # - offset
+ # - view attached
+ self.assertEqual(s0.GetName(), processed_s0.GetName())
+ self.assertEqual(s1.GetName(), processed_s1.GetName())
+ self.assertEqual(s0.GetOffset(), processed_s0.GetOffset())
+ self.assertEqual(s1.GetOffset(), processed_s1.GetOffset())
+ self.assertTrue(processed_s0.HasAttachedView())
+ self.assertTrue(processed_s1.HasAttachedView())
+
+ # the actual sequences changed and are checked with the expected outcome
+ self.assertEqual(str(processed_s0), 'ABCDEFG-H--I')
+ self.assertEqual(str(processed_s1), 'ABCDEFGHIJKL')
+
+
+ def testPullTerminalDeletions(self):
+
+ aln = io.LoadAlignment('data/terminal_deletion_aln.fasta')
+
+ # min_terminal_anchor_sizes which are <= 0 are disallowed
+ self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, -10)
+ self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, 0)
+
+ # check expected results for different min_terminal_anchor_sizes
+
+ # nothing should happen for anchor size 1
+ processed_aln = modelling.PullTerminalDeletions(aln, 1)
+ self.assertEqual(str(aln.GetSequence(0)), str(processed_aln.GetSequence(0)))
+ self.assertEqual(str(aln.GetSequence(1)), str(processed_aln.GetSequence(1)))
+
+ processed_aln = modelling.PullTerminalDeletions(aln, 2)
+ exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
+ exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
+ self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
+ self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
+
+ processed_aln = modelling.PullTerminalDeletions(aln, 3)
+ exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
+ exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
+ self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
+ self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
+
+ processed_aln = modelling.PullTerminalDeletions(aln, 4)
+ exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
+ exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
+ self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
+ self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
+
+ processed_aln = modelling.PullTerminalDeletions(aln, 5)
+ exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVEAAIE---NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
+ exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
+ self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
+ self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
+
+ # also check, whether sequence name, offsets and attached views are preserved
+ random_structure = io.LoadPDB('data/1CRN.pdb')
+ s0 = seq.CreateSequence(aln.GetSequence(0).GetName(), str(aln.GetSequence(0)))
+ s1 = seq.CreateSequence(aln.GetSequence(1).GetName(), str(aln.GetSequence(1)))
+ s0.AttachView(random_structure.CreateFullView())
+ s1.AttachView(random_structure.CreateFullView())
+ pimped_aln = seq.CreateAlignment()
+ pimped_aln.AddSequence(s0)
+ pimped_aln.AddSequence(s1)
+ processed_aln = modelling.PullTerminalDeletions(pimped_aln, 5)
+ processed_s0 = processed_aln.GetSequence(0)
+ processed_s1 = processed_aln.GetSequence(1)
+
+ # the following things should be equal between sx and processed_sx:
+ # - name
+ # - offset
+ # - view attached
+ self.assertEqual(s0.GetName(), processed_s0.GetName())
+ self.assertEqual(s1.GetName(), processed_s1.GetName())
+ self.assertEqual(s0.GetOffset(), processed_s0.GetOffset())
+ self.assertEqual(s1.GetOffset(), processed_s1.GetOffset())
+ self.assertTrue(processed_s0.HasAttachedView())
+ self.assertTrue(processed_s1.HasAttachedView())
+
+
+if __name__ == "__main__":
+ from ost import testutils
+ testutils.RunTests()
+