Skip to content
Snippets Groups Projects
Commit 77579a0f authored by Studer Gabriel's avatar Studer Gabriel
Browse files

simple alignment manipulation algorithms

parent 8e443842
Branches
Tags
No related merge requests found
......@@ -443,3 +443,11 @@ Modelling Steps
:members:
.. autofunction:: AddModellingIssue
Alignment Fiddling
--------------------------------------------------------------------------------
.. autofunction:: DeleteGapCols
.. autofunction:: PullTerminalDeletions
......@@ -25,6 +25,7 @@ set(MODELLING_PYMOD
_reconstruct_sidechains.py
_monte_carlo.py
_mhandle_helper.py
_alignment_fiddling.py
)
pymod(NAME modelling
......
......@@ -15,6 +15,7 @@
"""Initialise the modelling module."""
from ._alignment_fiddling import *
from ._modelling import *
from ._closegaps import *
from ._molprobity import *
......
# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ost import seq
def _PullLeftToRight(s0, s1, min_terminal_anchor_size):
s_len = len(s0)
# search for first seqres residue which is covered by a template residue
first_idx = None
for i in range(s_len):
if s0[i] != '-' and s1[i] != '-':
first_idx = i
break
if first_idx is None:
return False # not a single residue in SEQRES is covered by ATOMSEQ...
# nothing to do...
# search for first gap start in both sequences
try:
s0_first_gap = s0.index('-', first_idx)
except:
return False # no gap in SEQRES... nothing to do...
try:
s1_first_gap = s1.index('-', first_idx)
except:
s1_first_gap = s_len # no gap in ATOMSEQ
if s0_first_gap < s1_first_gap:
# the first gap is in the SEQRES => its a deletion!
l = s0_first_gap - first_idx
if l < min_terminal_anchor_size:
del_end = None
for i in range(s0_first_gap, s_len):
if s0[i] != '-':
del_end = i
break
if del_end is None:
return False # gap reaches c-terminus... nothing to do...
# shift it!
anchor = list(s0[first_idx:first_idx+l])
s0[first_idx:first_idx+l] = l*['-']
s0[del_end-l:del_end] = anchor
return True
return False
def DeleteGapCols(aln):
'''Deletes alignment columns that only contain gaps.
Columns that only contain gaps ('-') are removed. If no such column can be
identified, the input alignment gets returned. A new alignment gets
constructed otherwise. The sequences of the new alignment retain name, offset
and the potentially attached :class:`ost.mol.EntityView` from the original
sequences.
:param aln: Input alignment
:type aln: :class:`ost.seq.AlignmentHandle`
:returns: The processed alignment
:rtype: :class:`ost.seq.AlignmentHandle`
'''
n_sequences = aln.GetCount()
gap_cols = list()
for c_idx, c in enumerate(aln):
gaps = [int(c[idx] == '-') for idx in range(n_sequences)]
if sum(gaps) == n_sequences:
gap_cols.append(c_idx)
if len(gap_cols) == 0:
return aln # nothing to do
# remove the found gap columns
s_lists = list()
for s_idx in range(n_sequences):
s_lists.append([c for c in str(aln.GetSequence(s_idx))])
for gap_col_idx in reversed(gap_cols):
for sl in s_lists:
sl.pop(gap_col_idx)
# construct a return alignment with the new sequences but names, offsets
# and attached views from the old alignment
return_aln = seq.CreateAlignment()
for s_idx in range(n_sequences):
s = aln.GetSequence(s_idx)
s_processed = seq.CreateSequence(s.GetName(), ''.join(s_lists[s_idx]))
s_processed.SetOffset(s.GetOffset())
if s.HasAttachedView():
s_processed.AttachView(s.GetAttachedView())
return_aln.AddSequence(s_processed)
return return_aln
def PullTerminalDeletions(aln, min_terminal_anchor_size = 4):
'''Fixes deletions close to termini.
Some alignment tools may produce alignments with deletions close to the
termini. For example:
.. code-block:: none
SEQRES: A-----BCDE...
ATOMSEQ: ABCDEFGHIJ...
where A is the anchor residue. The default loop modelling pipeline would
keep the position of A fixed and start to omit structural information from
B, C, ... until it is able to resolve the deletion. If the anchor is very
short, a shift typically results in a better model:
.. code-block:: none
SEQRES: -----ABCDE...
ATOMSEQ: ABCDEFGHIJ...
This function checks whether the gap closest to any termini is a deletion
(one or several '-' in the first sequence) and estimates the anchor size
(number of aligned residues towards the respective termini from that gap).
If the anchor size is smaller than *min_terminal_anchor_size*, a shift is
applied and the deletion removed.
This is done iteratively, until no deletion can be removed anymore.
.. code-block:: none
SEQRES: A-B--CDEF...
ATOMSEQ: ABCDEFGHI...
becomes
.. code-block:: none
SEQRES: ---ABCDEF...
ATOMSEQ: ABCDEFGHI...
given a *min_terminal_anchor_size*>2. If no shift can be performed, the
input alignment gets returned. A new alignment gets constructed otherwise.
The sequences of the new alignment retain name, offset and the potentially
attached :class:`ost.mol.EntityView` from the original sequences.
:param aln: Input alignment
:type aln: :class:`ost.seq.AlignmentHandle`
:returns: The processed alignment
:rtype: :class:`ost.seq.AlignmentHandle`
'''
if aln.GetCount() != 2:
raise RuntimeError("Expect 2 seqs for aln in PullTerminalDeletions")
if min_terminal_anchor_size < 1:
raise RuntimeError("min_terminal_anchor_size must be > 0 in PullTerminalDeletions")
aln_no_gap_cols = DeleteGapCols(aln)
s0 = [c for c in str(aln_no_gap_cols.GetSequence(0))]
s1 = [c for c in str(aln_no_gap_cols.GetSequence(1))]
something_happened = False
# Pull in deletions from the n-terminus
while _PullLeftToRight(s0, s1, min_terminal_anchor_size):
something_happened = True
# Pull in deletions from the c-terminus
s0.reverse()
s1.reverse()
while _PullLeftToRight(s0, s1, min_terminal_anchor_size):
something_happened = True
if not something_happened:
return aln # return original alignment as no pulling happened...
s0.reverse()
s1.reverse()
return_aln = seq.CreateAlignment()
s = aln.GetSequence(0)
s_processed = seq.CreateSequence(s.GetName(), ''.join(s0))
s_processed.SetOffset(s.GetOffset())
if s.HasAttachedView():
s_processed.AttachView(s.GetAttachedView())
return_aln.AddSequence(s_processed)
s = aln.GetSequence(1)
s_processed = seq.CreateSequence(s.GetName(), ''.join(s1))
s_processed.SetOffset(s.GetOffset())
if s.HasAttachedView():
s_processed.AttachView(s.GetAttachedView())
return_aln.AddSequence(s_processed)
return return_aln
......@@ -5,6 +5,7 @@ set(MODELLING_UNIT_TESTS
test_pipeline.py
test_ring_punches.py
test_sidechain_reconstruction.py
test_alignment_fiddling.py
test_loop_candidates.cc
test_loop_closing.cc
test_sidechain_reconstructor.cc
......@@ -69,6 +70,8 @@ set(MODELLING_TEST_DATA
data/seq.fasta
data/smtl2bl4.1.pdb
data/ter.fasta
data/terminal_deletion_aln.fasta
data/double_gap_aln.fasta
)
promod3_unittest(MODULE modelling
......
>target
-ABCDE-FG-H----I-
>template
-ABCDE-FGHI-JK-L-
\ No newline at end of file
>target
GSHMGD----LK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAI-ENNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG
>template
-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------
# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for alignment fiddling.
"""
import unittest
from promod3 import modelling
from ost import seq, io
class ModellingTests(unittest.TestCase):
def testDeleteGapCols(self):
aln = io.LoadAlignment('data/double_gap_aln.fasta')
s0_orig = aln.GetSequence(0)
s1_orig = aln.GetSequence(1)
# recreate aln with added offsets and attach some random structure to
# both sequences
s0 = seq.CreateSequence(s0_orig.GetName(), str(s0_orig))
s1 = seq.CreateSequence(s1_orig.GetName(), str(s1_orig))
s0.SetOffset(1)
s1.SetOffset(2)
random_structure = io.LoadPDB('data/1CRN.pdb')
s0.AttachView(random_structure.CreateFullView())
s1.AttachView(random_structure.CreateFullView())
pimped_aln = seq.CreateAlignment()
pimped_aln.AddSequence(s0)
pimped_aln.AddSequence(s1)
processed_aln = modelling.DeleteGapCols(pimped_aln)
processed_s0 = processed_aln.GetSequence(0)
processed_s1 = processed_aln.GetSequence(1)
# the following things should be equal between sx and processed_sx:
# - name
# - offset
# - view attached
self.assertEqual(s0.GetName(), processed_s0.GetName())
self.assertEqual(s1.GetName(), processed_s1.GetName())
self.assertEqual(s0.GetOffset(), processed_s0.GetOffset())
self.assertEqual(s1.GetOffset(), processed_s1.GetOffset())
self.assertTrue(processed_s0.HasAttachedView())
self.assertTrue(processed_s1.HasAttachedView())
# the actual sequences changed and are checked with the expected outcome
self.assertEqual(str(processed_s0), 'ABCDEFG-H--I')
self.assertEqual(str(processed_s1), 'ABCDEFGHIJKL')
def testPullTerminalDeletions(self):
aln = io.LoadAlignment('data/terminal_deletion_aln.fasta')
# min_terminal_anchor_sizes which are <= 0 are disallowed
self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, -10)
self.assertRaises(RuntimeError, modelling.PullTerminalDeletions, aln, 0)
# check expected results for different min_terminal_anchor_sizes
# nothing should happen for anchor size 1
processed_aln = modelling.PullTerminalDeletions(aln, 1)
self.assertEqual(str(aln.GetSequence(0)), str(processed_aln.GetSequence(0)))
self.assertEqual(str(aln.GetSequence(1)), str(processed_aln.GetSequence(1)))
processed_aln = modelling.PullTerminalDeletions(aln, 2)
exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
processed_aln = modelling.PullTerminalDeletions(aln, 3)
exp_s0 = 'GSHMG----DLK--------------YSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
processed_aln = modelling.PullTerminalDeletions(aln, 4)
exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVE--AAIE-NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
processed_aln = modelling.PullTerminalDeletions(aln, 5)
exp_s0 = 'GSHMG------------------DLKYSLERLREILERLEENPSEKQIVEAIRAIVENNAQIVEAAIE---NNAQIVENNRAIIEALEAIGVDQKILEEMKKQLKDLKRSLERG'
exp_s1 = '-----DAQDKLKYLVKQLERALRELKKSLDELERSLEELEKNPSEDALVENNRLNVENNKIIVEVLRIILE-------------------------------------------'
self.assertEqual(str(processed_aln.GetSequence(0)), exp_s0)
self.assertEqual(str(processed_aln.GetSequence(1)), exp_s1)
# also check, whether sequence name, offsets and attached views are preserved
random_structure = io.LoadPDB('data/1CRN.pdb')
s0 = seq.CreateSequence(aln.GetSequence(0).GetName(), str(aln.GetSequence(0)))
s1 = seq.CreateSequence(aln.GetSequence(1).GetName(), str(aln.GetSequence(1)))
s0.AttachView(random_structure.CreateFullView())
s1.AttachView(random_structure.CreateFullView())
pimped_aln = seq.CreateAlignment()
pimped_aln.AddSequence(s0)
pimped_aln.AddSequence(s1)
processed_aln = modelling.PullTerminalDeletions(pimped_aln, 5)
processed_s0 = processed_aln.GetSequence(0)
processed_s1 = processed_aln.GetSequence(1)
# the following things should be equal between sx and processed_sx:
# - name
# - offset
# - view attached
self.assertEqual(s0.GetName(), processed_s0.GetName())
self.assertEqual(s1.GetName(), processed_s1.GetName())
self.assertEqual(s0.GetOffset(), processed_s0.GetOffset())
self.assertEqual(s1.GetOffset(), processed_s1.GetOffset())
self.assertTrue(processed_s0.HasAttachedView())
self.assertTrue(processed_s1.HasAttachedView())
if __name__ == "__main__":
from ost import testutils
testutils.RunTests()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment