diff --git a/modelling/doc/pipeline.rst b/modelling/doc/pipeline.rst index 3e90542c05f05af6398f4dd9bc514be793c4dcaf..bd114c4575734a8a2ecbbf73c0699e19e01334e1 100644 --- a/modelling/doc/pipeline.rst +++ b/modelling/doc/pipeline.rst @@ -443,3 +443,11 @@ Modelling Steps :members: .. autofunction:: AddModellingIssue + + +Alignment Fiddling +-------------------------------------------------------------------------------- + +.. autofunction:: DeleteGapCols + +.. autofunction:: PullTerminalDeletions diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt index e75ef2691e15c3fbba1e4d6ca2d5566f49750634..d000ba36e7b68adea62bb41ce064ca9b0b03b901 100644 --- a/modelling/pymod/CMakeLists.txt +++ b/modelling/pymod/CMakeLists.txt @@ -25,6 +25,7 @@ set(MODELLING_PYMOD _reconstruct_sidechains.py _monte_carlo.py _mhandle_helper.py + _alignment_fiddling.py ) pymod(NAME modelling diff --git a/modelling/pymod/__init__.py b/modelling/pymod/__init__.py index 359767927e84ae846742430b9e8474d476273076..54749fcabdaf32fe3864307ec7ae2ce7a8320a5f 100644 --- a/modelling/pymod/__init__.py +++ b/modelling/pymod/__init__.py @@ -15,6 +15,7 @@ """Initialise the modelling module.""" +from ._alignment_fiddling import * from ._modelling import * from ._closegaps import * from ._molprobity import * diff --git a/modelling/pymod/_alignment_fiddling.py b/modelling/pymod/_alignment_fiddling.py new file mode 100644 index 0000000000000000000000000000000000000000..4ca25aa2f80c4837db04e781aa7d176b58803950 --- /dev/null +++ b/modelling/pymod/_alignment_fiddling.py @@ -0,0 +1,198 @@ +# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and +# Biozentrum - University of Basel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ost import seq + + +def _PullLeftToRight(s0, s1, min_terminal_anchor_size): + + s_len = len(s0) + + # search for first seqres residue which is covered by a template residue + first_idx = None + for i in range(s_len): + if s0[i] != '-' and s1[i] != '-': + first_idx = i + break + if first_idx is None: + return False # not a single residue in SEQRES is covered by ATOMSEQ... + # nothing to do... + # search for first gap start in both sequences + try: + s0_first_gap = s0.index('-', first_idx) + except: + return False # no gap in SEQRES... nothing to do... + try: + s1_first_gap = s1.index('-', first_idx) + except: + s1_first_gap = s_len # no gap in ATOMSEQ + + if s0_first_gap < s1_first_gap: + # the first gap is in the SEQRES => its a deletion! + l = s0_first_gap - first_idx + if l < min_terminal_anchor_size: + del_end = None + for i in range(s0_first_gap, s_len): + if s0[i] != '-': + del_end = i + break + if del_end is None: + return False # gap reaches c-terminus... nothing to do... + # shift it! + anchor = list(s0[first_idx:first_idx+l]) + s0[first_idx:first_idx+l] = l*['-'] + s0[del_end-l:del_end] = anchor + return True + return False + + +def DeleteGapCols(aln): + '''Deletes alignment columns that only contain gaps. + + Columns that only contain gaps ('-') are removed. If no such column can be + identified, the input alignment gets returned. A new alignment gets + constructed otherwise. The sequences of the new alignment retain name, offset + and the potentially attached :class:`ost.mol.EntityView` from the original + sequences. + + :param aln: Input alignment + :type aln: :class:`ost.seq.AlignmentHandle` + :returns: The processed alignment + :rtype: :class:`ost.seq.AlignmentHandle` + ''' + n_sequences = aln.GetCount() + gap_cols = list() + for c_idx, c in enumerate(aln): + gaps = [int(c[idx] == '-') for idx in range(n_sequences)] + if sum(gaps) == n_sequences: + gap_cols.append(c_idx) + if len(gap_cols) == 0: + return aln # nothing to do + + # remove the found gap columns + s_lists = list() + for s_idx in range(n_sequences): + s_lists.append([c for c in str(aln.GetSequence(s_idx))]) + for gap_col_idx in reversed(gap_cols): + for sl in s_lists: + sl.pop(gap_col_idx) + + # construct a return alignment with the new sequences but names, offsets + # and attached views from the old alignment + return_aln = seq.CreateAlignment() + for s_idx in range(n_sequences): + s = aln.GetSequence(s_idx) + s_processed = seq.CreateSequence(s.GetName(), ''.join(s_lists[s_idx])) + s_processed.SetOffset(s.GetOffset()) + if s.HasAttachedView(): + s_processed.AttachView(s.GetAttachedView()) + return_aln.AddSequence(s_processed) + + return return_aln + + +def PullTerminalDeletions(aln, min_terminal_anchor_size = 4): + '''Fixes deletions close to termini. + + Some alignment tools may produce alignments with deletions close to the + termini. For example: + + .. code-block:: none + + SEQRES: A-----BCDE... + ATOMSEQ: ABCDEFGHIJ... + + where A is the anchor residue. The default loop modelling pipeline would + keep the position of A fixed and start to omit structural information from + B, C, ... until it is able to resolve the deletion. If the anchor is very + short, a shift typically results in a better model: + + .. code-block:: none + + SEQRES: -----ABCDE... + ATOMSEQ: ABCDEFGHIJ... + + This function checks whether the gap closest to any termini is a deletion + (one or several '-' in the first sequence) and estimates the anchor size + (number of aligned residues towards the respective termini from that gap). + If the anchor size is smaller than *min_terminal_anchor_size*, a shift is + applied and the deletion removed. + + This is done iteratively, until no deletion can be removed anymore. + + .. code-block:: none + + SEQRES: A-B--CDEF... + ATOMSEQ: ABCDEFGHI... + + becomes + + .. code-block:: none + + SEQRES: ---ABCDEF... + ATOMSEQ: ABCDEFGHI... + + given a *min_terminal_anchor_size*>2. If no shift can be performed, the + input alignment gets returned. A new alignment gets constructed otherwise. + The sequences of the new alignment retain name, offset and the potentially + attached :class:`ost.mol.EntityView` from the original sequences. + + :param aln: Input alignment + :type aln: :class:`ost.seq.AlignmentHandle` + :returns: The processed alignment + :rtype: :class:`ost.seq.AlignmentHandle` + ''' + if aln.GetCount() != 2: + raise RuntimeError("Expect 2 seqs for aln in PullTerminalDeletions") + + if min_terminal_anchor_size < 1: + raise RuntimeError("min_terminal_anchor_size must be > 0 in PullTerminalDeletions") + + aln_no_gap_cols = DeleteGapCols(aln) + s0 = [c for c in str(aln_no_gap_cols.GetSequence(0))] + s1 = [c for c in str(aln_no_gap_cols.GetSequence(1))] + something_happened = False + + # Pull in deletions from the n-terminus + while _PullLeftToRight(s0, s1, min_terminal_anchor_size): + something_happened = True + + # Pull in deletions from the c-terminus + s0.reverse() + s1.reverse() + while _PullLeftToRight(s0, s1, min_terminal_anchor_size): + something_happened = True + + if not something_happened: + return aln # return original alignment as no pulling happened... + + s0.reverse() + s1.reverse() + + return_aln = seq.CreateAlignment() + s = aln.GetSequence(0) + s_processed = seq.CreateSequence(s.GetName(), ''.join(s0)) + s_processed.SetOffset(s.GetOffset()) + if s.HasAttachedView(): + s_processed.AttachView(s.GetAttachedView()) + return_aln.AddSequence(s_processed) + s = aln.GetSequence(1) + s_processed = seq.CreateSequence(s.GetName(), ''.join(s1)) + s_processed.SetOffset(s.GetOffset()) + if s.HasAttachedView(): + s_processed.AttachView(s.GetAttachedView()) + return_aln.AddSequence(s_processed) + + return return_aln diff --git a/modelling/tests/CMakeLists.txt b/modelling/tests/CMakeLists.txt index 72ce3f9aff830412fd28f48bd6fe2b4511cf7c6e..5afe06ee9f50df622b2524d9f67c432741af04bf 100644 --- a/modelling/tests/CMakeLists.txt +++ b/modelling/tests/CMakeLists.txt @@ -5,6 +5,7 @@ set(MODELLING_UNIT_TESTS test_pipeline.py test_ring_punches.py test_sidechain_reconstruction.py + test_alignment_fiddling.py test_loop_candidates.cc test_loop_closing.cc test_sidechain_reconstructor.cc @@ -69,6 +70,8 @@ set(MODELLING_TEST_DATA data/seq.fasta data/smtl2bl4.1.pdb data/ter.fasta + data/terminal_deletion_aln.fasta + data/double_gap_aln.fasta ) promod3_unittest(MODULE modelling diff --git a/modelling/tests/data/double_gap_aln.fasta b/modelling/tests/data/double_gap_aln.fasta new file mode 100644 index 0000000000000000000000000000000000000000..51c7a742943594456123dfcbb468ad94a524b38a --- /dev/null +++ b/modelling/tests/data/double_gap_aln.fasta @@ -0,0 +1,4 @@ +>target +-ABCDE-FG-H----I- +>template +-ABCDE-FGHI-JK-L- \ No newline at end of file diff --git a/modelling/tests/data/terminal_deletion_aln.fasta b/modelling/tests/data/terminal_deletion_aln.fasta new file mode 100644 index 0000000000000000000000000000000000000000..095e547d4986c68cdfe94a9aa3daf02dced5a93a --- /dev/null +++ b/modelling/tests/data/terminal_deletion_aln.fasta @@ -0,0 +1,4 @@ +>target +GSHMGD----LK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAI-ENNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG +>template +-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE------------------------------------------- diff --git a/modelling/tests/test_alignment_fiddling.py b/modelling/tests/test_alignment_fiddling.py new file mode 100644 index 0000000000000000000000000000000000000000..e015cb3c2f7f6d2f7d4331764c7f87e9d108cf63 --- /dev/null +++ b/modelling/tests/test_alignment_fiddling.py @@ -0,0 +1,129 @@ +# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and +# Biozentrum - University of Basel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Unit tests for alignment fiddling. +""" +import unittest +from promod3 import modelling +from ost import seq, io + +class ModellingTests(unittest.TestCase): + + def testDeleteGapCols(self): + aln = io.LoadAlignment('data/double_gap_aln.fasta') + s0_orig = aln.GetSequence(0) + s1_orig = aln.GetSequence(1) + # recreate aln with added offsets and attach some random structure to + # both sequences + s0 = seq.CreateSequence(s0_orig.GetName(), str(s0_orig)) + s1 = seq.CreateSequence(s1_orig.GetName(), str(s1_orig)) + s0.SetOffset(1) + s1.SetOffset(2) + random_structure = io.LoadPDB('data/1CRN.pdb') + s0.AttachView(random_structure.CreateFullView()) + s1.AttachView(random_structure.CreateFullView()) + pimped_aln = seq.CreateAlignment() + pimped_aln.AddSequence(s0) + pimped_aln.AddSequence(s1) + processed_aln = modelling.DeleteGapCols(pimped_aln) + processed_s0 = processed_aln.GetSequence(0) + processed_s1 = processed_aln.GetSequence(1) + + # the following things should be equal between sx and processed_sx: + # - name + # - offset + # - view attached + self.assertEqual(s0.GetName(), processed_s0.GetName()) + self.assertEqual(s1.GetName(), processed_s1.GetName()) + self.assertEqual(s0.GetOffset(), processed_s0.GetOffset()) + self.assertEqual(s1.GetOffset(), processed_s1.GetOffset()) + self.assertTrue(processed_s0.HasAttachedView()) + self.assertTrue(processed_s1.HasAttachedView()) + + # the actual sequences changed and are checked with the expected outcome + self.assertEqual(str(processed_s0), 'ABCDEFG-H--I') + self.assertEqual(str(processed_s1), 'ABCDEFGHIJKL') + + + def testPullTerminalDeletions(self): + + aln = io.LoadAlignment('data/terminal_deletion_aln.fasta') + + # min_terminal_anchor_sizes which are <= 0 are disallowed + self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, -10) + self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, 0) + + # check expected results for different min_terminal_anchor_sizes + + # nothing should happen for anchor size 1 + processed_aln = modelling.PullTerminalDeletions(aln, 1) + self.assertEqual(str(aln.GetSequence(0)), str(processed_aln.GetSequence(0))) + self.assertEqual(str(aln.GetSequence(1)), str(processed_aln.GetSequence(1))) + + processed_aln = modelling.PullTerminalDeletions(aln, 2) + exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG' + exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------' + self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0) + self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1) + + processed_aln = modelling.PullTerminalDeletions(aln, 3) + exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG' + exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------' + self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0) + self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1) + + processed_aln = modelling.PullTerminalDeletions(aln, 4) + exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG' + exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------' + self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0) + self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1) + + processed_aln = modelling.PullTerminalDeletions(aln, 5) + exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVEAAIE---NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG' + exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------' + self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0) + self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1) + + # also check, whether sequence name, offsets and attached views are preserved + random_structure = io.LoadPDB('data/1CRN.pdb') + s0 = seq.CreateSequence(aln.GetSequence(0).GetName(), str(aln.GetSequence(0))) + s1 = seq.CreateSequence(aln.GetSequence(1).GetName(), str(aln.GetSequence(1))) + s0.AttachView(random_structure.CreateFullView()) + s1.AttachView(random_structure.CreateFullView()) + pimped_aln = seq.CreateAlignment() + pimped_aln.AddSequence(s0) + pimped_aln.AddSequence(s1) + processed_aln = modelling.PullTerminalDeletions(pimped_aln, 5) + processed_s0 = processed_aln.GetSequence(0) + processed_s1 = processed_aln.GetSequence(1) + + # the following things should be equal between sx and processed_sx: + # - name + # - offset + # - view attached + self.assertEqual(s0.GetName(), processed_s0.GetName()) + self.assertEqual(s1.GetName(), processed_s1.GetName()) + self.assertEqual(s0.GetOffset(), processed_s0.GetOffset()) + self.assertEqual(s1.GetOffset(), processed_s1.GetOffset()) + self.assertTrue(processed_s0.HasAttachedView()) + self.assertTrue(processed_s1.HasAttachedView()) + + +if __name__ == "__main__": + from ost import testutils + testutils.RunTests() +