diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst index 66cbb0f41eb3f10c9455b7aad7f9e821cacaf27d..4475202dd9c4248b6b75df89491c5d32029df26e 100644 --- a/modules/seq/alg/doc/seqalg.rst +++ b/modules/seq/alg/doc/seqalg.rst @@ -1,7 +1,8 @@ :mod:`seq.alg <ost.seq.alg>` -- Algorithms for Sequences ================================================================================ -.. currentmodule:: ost.seq.alg +.. module:: ost.seq.alg + :synopsis: Algorithms for sequences .. function:: MergePairwiseAlignments(pairwise_alns, ref_seq) @@ -32,4 +33,35 @@ .. autofunction:: AlignToSEQRES -.. autofunction:: AlignmentFromChainView \ No newline at end of file +.. autofunction:: AlignmentFromChainView + + + +.. function:: LocalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2) + + Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns the + best-scoring alignments as a list of pairwise alignments. + + **Example:** + + .. code-block:: python + + seq_a=seq.CreateSequence('A', 'acdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.LocalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + print alns[0].ToString(80) + # >>> A acdefghiklmn + # >>> B acd---hiklmn + + :param seq1: A valid sequence + :type seq1: :class:`~ost.seq.ConstSequenceHandle` + :param seq2: A valid sequence + :type seq2: :class:`~ost.seq.ConstSequenceHandle` + + :param subst_weigth: The substitution weights matrix + :type subst_weight: :class:`SubstWeightMatrix` + :param gap_open: The gap opening penalty. Must be a negative number + :param gap_ext: The gap extension penalty. Must be a negative number + :returns: list of best-scoring, non-overlapping alignments of *seq1* and + *seq2*. The start of the alignments is stored in the sequence offset of the + two sequences. \ No newline at end of file diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index c39a75a80855bba1fe43d79331c1f13a8a364cae..212ad8e35cb520c51757e1874eb027f4acc2f9f9 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -24,6 +24,7 @@ #include <ost/seq/alg/ins_del.hh> #include <ost/seq/alg/conservation.hh> #include <ost/seq/alg/subst_weight_matrix.hh> +#include <ost/seq/alg/local_align.hh> using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; @@ -52,5 +53,7 @@ BOOST_PYTHON_MODULE(_seq_alg) ; def("MergePairwiseAlignments", &MergePairwiseAlignments); def("Conservation", &Conservation, (arg("assign")=true, arg("prop_name")="cons")); + def("LocalAlign", &LocalAlign, (arg("seq1"), arg("seq2"), arg("subst_weight"), + arg("gap_open")=-10, arg("gap_ext")=-5)); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index 7b4f86f53e5bb8c08feeba088deb1c8319691d0d..b45f036cb89c35002c6cda176eb483eabdf0e01b 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -6,10 +6,12 @@ subst_weight_matrix.hh alignment_opts.hh merge_pairwise_alignments.hh conservation.hh +local_align.hh ) set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc +local_align.cc sequence_identity.cc ins_del.cc subst_weight_matrix.cc diff --git a/modules/seq/alg/src/local_align.cc b/modules/seq/alg/src/local_align.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a043d4c3d4dfad1bc29b0c4f8a073eaf67a8692 --- /dev/null +++ b/modules/seq/alg/src/local_align.cc @@ -0,0 +1,188 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <ost/log.hh> +#include "local_align.hh" + +namespace ost { namespace seq { namespace alg { + +typedef enum { + DIAG, + INS1, + INS2, + UNKN +} Path; + +struct AlnPos { + AlnPos(): score(0), from(UNKN) { } + int score; + Path from; +}; + +struct AlnMat { + AlnMat(int width, int height): + mat_(width*height), width_(width), height_(height) + { } + + AlnPos& operator()(int x, int y) { return mat_[x*height_+y]; } + + const AlnPos& operator()(int x, int y) const { return mat_[x*height_+y]; } + + int GetWidth() const { return width_; } + int GetHeight() const { return height_; } + +private: + std::vector<AlnPos> mat_; + int width_; + int height_; +}; + +bool CollectPatches(AlnMat& mat, int min_i, int max_i, int min_j, int max_j, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + AlignmentList& alignments) +{ + int max_val=0, mmax_i=0, mmax_j=0; + for (int i=min_i; i<max_i; ++i) { + for (int j=min_j; j<max_j; ++j) { + if (mat(i, j).score>=max_val) { + mmax_i=i; + mmax_j=j; + max_val=mat(i, j).score; + } + } + } + int i=mmax_i; int j=mmax_j; + String aln_str1; + String aln_str2; + while (i>min_i && j>min_j && mat(i, j).score>0) { + switch (mat(i, j).from) { + case DIAG: + --i; + --j; + aln_str1.push_back(s1[i]); + aln_str2.push_back(s2[j]); + break; + case INS1: + --j; + aln_str1.push_back('-'); + aln_str2.push_back(s2[j]); + break; + case INS2: + --i; + aln_str1.push_back(s1[i]); + aln_str2.push_back('-'); + break; + default: + assert(0 && "should never get here"); + } + } + if (aln_str1.size()<2) { + return false; + } + for (size_t x=0; x<aln_str1.size()/2; ++x) { + std::swap(aln_str1[x], aln_str1[aln_str1.size()-x-1]); + std::swap(aln_str2[x], aln_str2[aln_str1.size()-x-1]); + } + AlignmentHandle aln=CreateAlignment(); + aln.AddSequence(CreateSequence(s1.GetName(), aln_str1)); + aln.AddSequence(CreateSequence(s2.GetName(), aln_str2)); + aln.SetSequenceOffset(0, i); + aln.SetSequenceOffset(1, j); + alignments.push_back(aln); + // clear used regions + for (int x=i; x<=mmax_i; ++x) { + for (int y=0; y<mat.GetHeight(); ++y) { + mat(x, y).score=0; + } + } + for (int x=0; x<mat.GetWidth(); ++x) { + for (int y=j; y<mmax_j; ++y) { + mat(x, y).score=0; + } + } + return true; +} + +AlignmentList LocalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + alg::SubstWeightMatrixPtr& subst, + int gap_open, int gap_ext) +{ + AlnMat mat(s1.GetLength()+1, s2.GetLength()+1); + for (int i=0; i<mat.GetWidth()-1; ++i) { + for (int j=0; j<mat.GetHeight()-1; ++j) { + char c1=s1[i]; + char c2=s2[j]; + short weight=subst->GetWeight(c1, c2); + int diag=weight+mat(i, j).score; + int ins1=mat(i+1, j).score+(mat(i+1, j).from==INS1 ? gap_ext : gap_open); + ins1=std::max(0, ins1); + int ins2=mat(i, j+1).score+(mat(i, j+1).from==INS2 ? gap_ext : gap_open); + ins2=std::max(0, ins2); + if (diag>=ins1) { + if (diag>=ins2) { + mat(i+1, j+1).score=diag; + mat(i+1, j+1).from=DIAG; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=INS2; + } + } else if (ins1>ins2) { + mat(i+1, j+1).score=ins1; + mat(i+1, j+1).from=INS1; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=INS2; + } + } + } +#if !defined(NDEBUG) + std::stringstream ss; + for (int i=0; i<mat.GetWidth(); ++i) { + for (int j=0; j<mat.GetHeight(); ++j) { + if (mat(i, j).from==DIAG) { + ss << "\\"; + } else if (mat(i, j).from==INS1) { + ss << "-"; + } else if (mat(i, j).from==INS2) { + ss << "|"; + } + if (i==0 && j>0) { + ss << s2[j-1]; + } + if (j==0 && i>0) { + ss << s1[i-1] << " "; + } + if (i==0 && j==0) { + ss << " "; + } + } + ss << std::endl; + } + LOG_DEBUG(ss.str()); +#endif + AlignmentList alignments; + while (CollectPatches(mat, 0, mat.GetWidth(), 0, mat.GetHeight(), + s1, s2, alignments)) { + LOG_DEBUG(alignments.back().aln.ToString(80)); + } + return alignments; +} + +}}} \ No newline at end of file diff --git a/modules/seq/alg/src/local_align.hh b/modules/seq/alg/src/local_align.hh new file mode 100644 index 0000000000000000000000000000000000000000..14b8f9af5f797f41f46296d2623ebb1d52a1b8f6 --- /dev/null +++ b/modules/seq/alg/src/local_align.hh @@ -0,0 +1,35 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_LOCAL_ALIGN_HH +#define OST_SEQ_ALG_LOCAL_ALIGN_HH + +#include <ost/seq/alignment_handle.hh> +#include <ost/seq/alg/subst_weight_matrix.hh> +#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { + + +AlignmentList DLLEXPORT_OST_SEQ_ALG LocalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + SubstWeightMatrixPtr& subst, + int gap_open=-5, int gap_ext=-2); +}}} + +#endif diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt index aafdabc4b723b7ee0ac6b978eb28cf34f95b5c23..53053090ee969e747f5f81c6f048f31c33424f29 100644 --- a/modules/seq/alg/tests/CMakeLists.txt +++ b/modules/seq/alg/tests/CMakeLists.txt @@ -3,6 +3,7 @@ set(OST_SEQ_ALG_UNIT_TESTS test_sequence_identity.cc tests.cc test_renumber.py + test_local_align.py ) ost_unittest(seq_alg "${OST_SEQ_ALG_UNIT_TESTS}") diff --git a/modules/seq/alg/tests/test_local_align.py b/modules/seq/alg/tests/test_local_align.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6872f9df22b261947e9abdbda12319e22a6d5a --- /dev/null +++ b/modules/seq/alg/tests/test_local_align.py @@ -0,0 +1,49 @@ +import unittest +from ost import * +from ost import settings +from ost import seq +from ost.bindings.clustalw import * + +class TestLocalAlign(unittest.TestCase): + def testDeletionInSeqB(self): + seq_a=seq.CreateSequence('A', 'acdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.LocalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + self.assertEqual(str(alns[0].sequences[0]), 'acdefghiklmn') + self.assertEqual(str(alns[0].sequences[1]), 'acd---hiklmn') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + + def testDeletionInSeqA(self): + seq_a=seq.CreateSequence('A', 'acdhiklmn') + seq_b=seq.CreateSequence('B', 'acdefghiklmn') + alns=seq.alg.LocalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + + self.assertEqual(str(alns[0].sequences[0]), 'acd---hiklmn') + self.assertEqual(str(alns[0].sequences[1]), 'acdefghiklmn') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + def testOffset(self): + seq_a=seq.CreateSequence('A', 'acdhiklmn') + seq_b=seq.CreateSequence('B', 'ggiklmn') + alns=seq.alg.LocalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + + self.assertEqual(str(alns[0].sequences[0]), 'iklmn') + self.assertEqual(str(alns[0].sequences[1]), 'iklmn') + self.assertEqual(alns[0].sequences[0].offset, 4) + self.assertEqual(alns[0].sequences[1].offset, 2) + +if __name__ == "__main__": + try: + unittest.main() + except Exception, e: + print e