From 23a01b0b32a5b0b401b0e4d5fcc6981f3576ddbf Mon Sep 17 00:00:00 2001 From: Martino Bertoni <martino.bertoni@unibas.ch> Date: Fri, 8 Nov 2013 15:29:14 +0100 Subject: [PATCH] added a semiglobal alignment. equal to global alignment but without starting and ending gaps penalty. good when 1 sequence is much smaller then the other or you expect the 2 sequences just to overlap a bit. --- modules/seq/alg/pymod/wrap_seq_alg.cc | 3 + modules/seq/alg/src/CMakeLists.txt | 2 + modules/seq/alg/src/semiglobal_align.cc | 103 ++++++++++++++++++ modules/seq/alg/src/semiglobal_align.hh | 35 ++++++ modules/seq/alg/tests/CMakeLists.txt | 1 + .../seq/alg/tests/test_semiglobal_align.py | 23 ++++ 6 files changed, 167 insertions(+) create mode 100644 modules/seq/alg/src/semiglobal_align.cc create mode 100644 modules/seq/alg/src/semiglobal_align.hh create mode 100644 modules/seq/alg/tests/test_semiglobal_align.py diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index f22866761..418defe25 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -26,6 +26,7 @@ #include <ost/seq/alg/subst_weight_matrix.hh> #include <ost/seq/alg/local_align.hh> #include <ost/seq/alg/global_align.hh> +#include <ost/seq/alg/semiglobal_align.hh> #include <ost/seq/alg/entropy.hh> using namespace boost::python; using namespace ost::seq; @@ -59,6 +60,8 @@ BOOST_PYTHON_MODULE(_ost_seq_alg) arg("gap_open")=-5, arg("gap_ext")=-2)); def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), arg("gap_open")=-5, arg("gap_ext")=-2)); + def("SemiGlobalAlign", &SemiGlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), + arg("gap_open")=-5, arg("gap_ext")=-2)); def("ShannonEntropy", &ShannonEntropy, (arg("aln"), arg("ignore_gaps")=true)); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index ea6284a98..76549ab0d 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -9,12 +9,14 @@ merge_pairwise_alignments.hh conservation.hh local_align.hh global_align.hh +semiglobal_align.hh ) set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc local_align.cc global_align.cc +semiglobal_align.cc entropy.cc sequence_identity.cc ins_del.cc diff --git a/modules/seq/alg/src/semiglobal_align.cc b/modules/seq/alg/src/semiglobal_align.cc new file mode 100644 index 000000000..4b89fd15c --- /dev/null +++ b/modules/seq/alg/src/semiglobal_align.cc @@ -0,0 +1,103 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <ost/log.hh> +#include "impl/align_impl.hh" +#include "semiglobal_align.hh" + +namespace ost { namespace seq { namespace alg { + +void SemiTraceback(impl::AlnMat& mat, int max_i, int max_j, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + AlignmentList& alignments) +{ + int i = (max_i > 0 ? max_i-1 : 0); + int j = (max_j > 0 ? max_j-1 : 0); + String aln_str1; + String aln_str2; + while (i > 0 || j > 0) { + impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2); + } + impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments); +} + +AlignmentList SemiGlobalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + alg::SubstWeightMatrixPtr& subst, + int gap_open,int gap_ext) +{ + impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1); + // init first column of matrix no start gap penalty + if (mat.GetHeight() > 1) { + mat(0, 1).score = mat(0, 0).score; + mat(0, 1).from = impl::INS1; + } + for (int j = 2; j < mat.GetHeight(); ++j) { + mat(0, j).score = mat(0, j-1).score; + mat(0, j).from = impl::INS1; + } + // fill the alignment matrix + for (int i = 0; i < mat.GetWidth()-1; ++i) { + mat(i+1, 0).score = mat(i, 0).score; + mat(i+1, 0).from = impl::INS2; + for (int j = 0; j < mat.GetHeight()-1; ++j) { + char c1=s1[i]; + char c2=s2[j]; + short weight=subst->GetWeight(c1, c2); + int diag=weight+mat(i, j).score; + int ins1=mat(i+1, j).score + +(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open); + int ins2=mat(i, j+1).score + +(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open); + // ignore end gaps (if one of the two seqs is done) + if (j+1==mat.GetHeight()-1){ + ins2=mat(i, j+1).score; + } + if (i+1==mat.GetWidth()-1){ + ins1=mat(i+1, j).score; + } + if (diag>=ins1) { + if (diag>=ins2) { + mat(i+1, j+1).score=diag; + mat(i+1, j+1).from=impl::DIAG; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=impl::INS2; + } + } else if (ins1>ins2) { + mat(i+1, j+1).score=ins1; + mat(i+1, j+1).from=impl::INS1; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=impl::INS2; + } + } + } + // write traceback matrix in debug mode +#if !defined(NDEBUG) + DbgWriteAlnMatrix(mat,s1, s2); +#endif + AlignmentList alignments; + SemiTraceback(mat, mat.GetWidth(), mat.GetHeight(), s1, s2, alignments); + LOG_DEBUG(alignments.back().ToString(80)); + + return alignments; +} + +}}} diff --git a/modules/seq/alg/src/semiglobal_align.hh b/modules/seq/alg/src/semiglobal_align.hh new file mode 100644 index 000000000..6d1ca5002 --- /dev/null +++ b/modules/seq/alg/src/semiglobal_align.hh @@ -0,0 +1,35 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_SEMIGLOBAL_ALIGN_HH +#define OST_SEQ_ALG_SEMIGLOBAL_ALIGN_HH + +#include <ost/seq/alignment_handle.hh> +#include <ost/seq/alg/subst_weight_matrix.hh> +#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { + + +AlignmentList DLLEXPORT_OST_SEQ_ALG SemiGlobalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + SubstWeightMatrixPtr& subst, + int gap_open=-5,int gap_ext=-2); +}}} + +#endif diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt index 76b599790..ca0e72988 100644 --- a/modules/seq/alg/tests/CMakeLists.txt +++ b/modules/seq/alg/tests/CMakeLists.txt @@ -5,6 +5,7 @@ set(OST_SEQ_ALG_UNIT_TESTS test_renumber.py test_local_align.py test_global_align.py + test_semiglobal_align.py test_weight_matrix.py test_aligntoseqres.py ) diff --git a/modules/seq/alg/tests/test_semiglobal_align.py b/modules/seq/alg/tests/test_semiglobal_align.py new file mode 100644 index 000000000..e50198928 --- /dev/null +++ b/modules/seq/alg/tests/test_semiglobal_align.py @@ -0,0 +1,23 @@ +import unittest +from ost import * +from ost import settings +from ost import seq +from ost.bindings.clustalw import * + +class TestSemiGlobalAlign(unittest.TestCase): + def testSemiGlobalAlignment(self): + seq_a=seq.CreateSequence('A', 'abcdefghijklmnok') + seq_b=seq.CreateSequence('B', 'cdehijk') + alns=seq.alg.SemiGlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + self.assertEqual(str(alns[0].sequences[0]), 'abcdefghijklmnok') + self.assertEqual(str(alns[0].sequences[1]), '--cde--hijk-----') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + + +if __name__ == "__main__": + from ost import testutils + testutils.RunTests() -- GitLab