diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst index 4475202dd9c4248b6b75df89491c5d32029df26e..cf3587d06c21d3041cbe350bad899dde7dfdef82 100644 --- a/modules/seq/alg/doc/seqalg.rst +++ b/modules/seq/alg/doc/seqalg.rst @@ -39,8 +39,8 @@ .. function:: LocalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2) - Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns the - best-scoring alignments as a list of pairwise alignments. + Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns + the best-scoring alignments as a list of pairwise alignments. **Example:** @@ -63,5 +63,33 @@ :param gap_open: The gap opening penalty. Must be a negative number :param gap_ext: The gap extension penalty. Must be a negative number :returns: list of best-scoring, non-overlapping alignments of *seq1* and - *seq2*. The start of the alignments is stored in the sequence offset of the - two sequences. \ No newline at end of file + *seq2*. Since alignments always start with a replacement, the start is + stored in the sequence offset of the two sequences. + + +.. function:: GlobalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2) + + Performs a Needleman/Wunsch global alignment of *seq1* and *seq2* and returns + the best-scoring alignment. + + **Example:** + + .. code-block:: python + + seq_a=seq.CreateSequence('A', 'acdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + print alns[0].ToString(80) + # >>> A acdefghiklmn + # >>> B acd---hiklmn + + :param seq1: A valid sequence + :type seq1: :class:`~ost.seq.ConstSequenceHandle` + :param seq2: A valid sequence + :type seq2: :class:`~ost.seq.ConstSequenceHandle` + + :param subst_weigth: The substitution weights matrix + :type subst_weight: :class:`SubstWeightMatrix` + :param gap_open: The gap opening penalty. Must be a negative number + :param gap_ext: The gap extension penalty. Must be a negative number + :returns: best-scoring alignment of *seq1* and *seq2*. diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index 6257b736f0868d48334e9e17585964c48875c199..f8324daa8694614925630a94decf6d5c6d35d739 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -25,6 +25,7 @@ #include <ost/seq/alg/conservation.hh> #include <ost/seq/alg/subst_weight_matrix.hh> #include <ost/seq/alg/local_align.hh> +#include <ost/seq/alg/global_align.hh> using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; @@ -53,7 +54,9 @@ BOOST_PYTHON_MODULE(_seq_alg) ; def("MergePairwiseAlignments", &MergePairwiseAlignments); def("Conservation", &Conservation, (arg("assign")=true, arg("prop_name")="cons")); - def("LocalAlign", &LocalAlign, (arg("seq1"), arg("seq2"), arg("subst_weight"), + def("LocalAlign", &LocalAlign, (arg("seq1"), arg("seq2"),arg("subst_weight"), + arg("gap_open")=-5, arg("gap_ext")=-2)); + def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), arg("gap_open")=-5, arg("gap_ext")=-2)); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index b45f036cb89c35002c6cda176eb483eabdf0e01b..507869bcbe6f2bcccd6c13a4c941bbe805ede9f6 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -7,11 +7,13 @@ alignment_opts.hh merge_pairwise_alignments.hh conservation.hh local_align.hh +global_align.hh ) set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc local_align.cc +global_align.cc sequence_identity.cc ins_del.cc subst_weight_matrix.cc diff --git a/modules/seq/alg/src/global_align.cc b/modules/seq/alg/src/global_align.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8275c9c8688128aaede9b0cc5a15b19d92652b --- /dev/null +++ b/modules/seq/alg/src/global_align.cc @@ -0,0 +1,97 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <ost/log.hh> +#include "impl/align_impl.hh" +#include "global_align.hh" + +namespace ost { namespace seq { namespace alg { + +void Traceback(impl::AlnMat& mat, int max_i, int max_j, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + AlignmentList& alignments) +{ + int i = (max_i > 0 ? max_i-1 : 0); + int j = (max_j > 0 ? max_j-1 : 0); + String aln_str1; + String aln_str2; + while (i > 0 || j > 0) { + impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2); + } + impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments); +} + +AlignmentList GlobalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + alg::SubstWeightMatrixPtr& subst, + int gap_open,int gap_ext) +{ + impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1); + // init first column of matrix + if (mat.GetHeight() > 1) { + mat(0, 1).score = mat(0, 0).score + gap_open; + mat(0, 1).from = impl::INS1; + } + for (int j = 2; j < mat.GetHeight(); ++j) { + mat(0, j).score = mat(0, j-1).score + gap_ext; + mat(0, j).from = impl::INS1; + } + // fill the alignment matrix + for (int i = 0; i < mat.GetWidth()-1; ++i) { + mat(i+1, 0).score = mat(i, 0).score + + (mat(i, 0).from==impl::INS2 ? gap_ext : gap_open); + mat(i+1,0).from = impl::INS2; + for (int j = 0; j < mat.GetHeight()-1; ++j) { + char c1=s1[i]; + char c2=s2[j]; + short weight=subst->GetWeight(c1, c2); + int diag=weight+mat(i, j).score; + int ins1=mat(i+1, j).score + +(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open); + int ins2=mat(i, j+1).score + +(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open); + if (diag>=ins1) { + if (diag>=ins2) { + mat(i+1, j+1).score=diag; + mat(i+1, j+1).from=impl::DIAG; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=impl::INS2; + } + } else if (ins1>ins2) { + mat(i+1, j+1).score=ins1; + mat(i+1, j+1).from=impl::INS1; + } else { + mat(i+1, j+1).score=ins2; + mat(i+1, j+1).from=impl::INS2; + } + } + } + // write traceback matrix in debug mode +#if !defined(NDEBUG) + DbgWriteAlnMatrix(mat,s1, s2); +#endif + AlignmentList alignments; + Traceback(mat, mat.GetWidth(), mat.GetHeight(), s1, s2, alignments); + LOG_DEBUG(alignments.back().ToString(80)); + + return alignments; +} + +}}} diff --git a/modules/seq/alg/src/global_align.hh b/modules/seq/alg/src/global_align.hh new file mode 100644 index 0000000000000000000000000000000000000000..a920832d7e97f66a6a60d211fa5911106262ac54 --- /dev/null +++ b/modules/seq/alg/src/global_align.hh @@ -0,0 +1,35 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_GLOBAL_ALIGN_HH +#define OST_SEQ_ALG_GLOBAL_ALIGN_HH + +#include <ost/seq/alignment_handle.hh> +#include <ost/seq/alg/subst_weight_matrix.hh> +#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { + + +AlignmentList DLLEXPORT_OST_SEQ_ALG GlobalAlign(const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + SubstWeightMatrixPtr& subst, + int gap_open=-5,int gap_ext=-2); +}}} + +#endif diff --git a/modules/seq/alg/src/impl/align_impl.hh b/modules/seq/alg/src/impl/align_impl.hh new file mode 100644 index 0000000000000000000000000000000000000000..866b569ae4295250ea295501178a51e0fdf8dc3a --- /dev/null +++ b/modules/seq/alg/src/impl/align_impl.hh @@ -0,0 +1,143 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_ALIGN_IMPL_HH +#define OST_ALIGN_IMPL_HH + +#include <ost/seq/alignment_handle.hh> +#include <ost/seq/alg/subst_weight_matrix.hh> +//#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { namespace impl { + + typedef enum { + DIAG, + INS1, + INS2, + UNKN + } Path; + + struct DLLEXPORT AlnPos { + AlnPos(): score(0), from(impl::UNKN) { } + int score; + impl::Path from; + }; + + struct DLLEXPORT AlnMat { + AlnMat(int width, int height): + mat_(width*height), width_(width), height_(height) + { } + + impl::AlnPos& operator()(int x, int y) { return mat_[x*height_+y]; } + + const impl::AlnPos& operator()(int x, int y) const { + return mat_[x*height_+y]; + } + + int GetWidth() const { return width_; } + int GetHeight() const { return height_; } + + private: + std::vector<impl::AlnPos> mat_; + int width_; + int height_; +}; + +inline void DLLEXPORT SetRoute(impl::AlnMat& mat, int& i, int& j, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + String& aln_str1, String& aln_str2) +{ + switch (mat(i, j).from) { + case impl::DIAG: + --i; + --j; + aln_str1.push_back(s1[i]); + aln_str2.push_back(s2[j]); + break; + case impl::INS1: + --j; + aln_str1.push_back('-'); + aln_str2.push_back(s2[j]); + break; + case impl::INS2: + --i; + aln_str1.push_back(s1[i]); + aln_str2.push_back('-'); + break; + default: + assert(0 && "should never get here"); + } +}; + +inline void DLLEXPORT StoreStrAsAln(String& aln_str1, String& aln_str2, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2, + const int i, const int j, + AlignmentList& alignments) +{ + for (size_t x=0; x<aln_str1.size()/2; ++x) { + std::swap(aln_str1[x], aln_str1[aln_str1.size()-x-1]); + std::swap(aln_str2[x], aln_str2[aln_str1.size()-x-1]); + } + AlignmentHandle aln=CreateAlignment(); + aln.AddSequence(CreateSequence(s1.GetName(), aln_str1)); + aln.AddSequence(CreateSequence(s2.GetName(), aln_str2)); + aln.SetSequenceOffset(0, i); + aln.SetSequenceOffset(1, j); + alignments.push_back(aln); +}; + +#if !defined(NDEBUG) +inline void DLLEXPORT DbgWriteAlnMatrix(impl::AlnMat& mat, + const ConstSequenceHandle& s1, + const ConstSequenceHandle& s2) +{ + std::stringstream ss; + ss << " "; + for (int j = 1; j < mat.GetHeight(); ++j) { + ss << s2[j-1]; + } + ss << std::endl; + for (int i = 0; i < mat.GetWidth(); ++i) { + for (int j = 0; j < mat.GetHeight(); ++j) { + if (j==0 && i>0) { + ss << s1[i-1] << " "; + } + if (i==0 && j==0) { + ss << " "; + } + if (mat(i, j).from==impl::DIAG) { + ss << "\\"; + } else if (mat(i, j).from==impl::INS1) { + ss << "-"; + } else if (mat(i, j).from==impl::INS2) { + ss << "|"; + } else if (mat(i, j).from==impl::UNKN) { + ss << "$"; + } + } + ss << std::endl; + } + LOG_DEBUG(ss.str()); +}; +#endif + +}}}} + +#endif diff --git a/modules/seq/alg/src/local_align.cc b/modules/seq/alg/src/local_align.cc index 963c81eb60ebb078ce9c6b9b84ab0dd0f10447e2..aec7a5932a26adbcbae8fae3cd5d13008182198e 100644 --- a/modules/seq/alg/src/local_align.cc +++ b/modules/seq/alg/src/local_align.cc @@ -17,42 +17,12 @@ // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA //------------------------------------------------------------------------------ #include <ost/log.hh> +#include "impl/align_impl.hh" #include "local_align.hh" namespace ost { namespace seq { namespace alg { -typedef enum { - DIAG, - INS1, - INS2, - UNKN -} Path; - -struct AlnPos { - AlnPos(): score(0), from(UNKN) { } - int score; - Path from; -}; - -struct AlnMat { - AlnMat(int width, int height): - mat_(width*height), width_(width), height_(height) - { } - - AlnPos& operator()(int x, int y) { return mat_[x*height_+y]; } - - const AlnPos& operator()(int x, int y) const { return mat_[x*height_+y]; } - - int GetWidth() const { return width_; } - int GetHeight() const { return height_; } - -private: - std::vector<AlnPos> mat_; - int width_; - int height_; -}; - -bool CollectPatches(AlnMat& mat, int min_i, int max_i, int min_j, int max_j, +bool CollectPatches(impl::AlnMat& mat, int min_i, int max_i, int min_j, int max_j, const ConstSequenceHandle& s1, const ConstSequenceHandle& s2, AlignmentList& alignments) @@ -71,41 +41,12 @@ bool CollectPatches(AlnMat& mat, int min_i, int max_i, int min_j, int max_j, String aln_str1; String aln_str2; while (i>min_i && j>min_j && mat(i, j).score>0) { - switch (mat(i, j).from) { - case DIAG: - --i; - --j; - aln_str1.push_back(s1[i]); - aln_str2.push_back(s2[j]); - break; - case INS1: - --j; - aln_str1.push_back('-'); - aln_str2.push_back(s2[j]); - break; - case INS2: - --i; - aln_str1.push_back(s1[i]); - aln_str2.push_back('-'); - break; - default: - assert(0 && "should never get here"); - } + impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2); } if (aln_str1.size()<2) { return false; } - for (size_t x=0; x<aln_str1.size()/2; ++x) { - std::swap(aln_str1[x], aln_str1[aln_str1.size()-x-1]); - std::swap(aln_str2[x], aln_str2[aln_str1.size()-x-1]); - } - AlignmentHandle aln=CreateAlignment(); - aln.AddSequence(CreateSequence(s1.GetName(), aln_str1)); - aln.AddSequence(CreateSequence(s2.GetName(), aln_str2)); - aln.SetSequenceOffset(0, i); - aln.SetSequenceOffset(1, j); - alignments.push_back(aln); - // clear used regions + impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments); for (int x=i; x<=mmax_i; ++x) { for (int y=0; y<mat.GetHeight(); ++y) { mat(x, y).score=0; @@ -124,58 +65,38 @@ AlignmentList LocalAlign(const ConstSequenceHandle& s1, alg::SubstWeightMatrixPtr& subst, int gap_open, int gap_ext) { - AlnMat mat(s1.GetLength()+1, s2.GetLength()+1); + impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1); for (int i=0; i<mat.GetWidth()-1; ++i) { for (int j=0; j<mat.GetHeight()-1; ++j) { char c1=s1[i]; char c2=s2[j]; short weight=subst->GetWeight(c1, c2); int diag=weight+mat(i, j).score; - int ins1=mat(i+1, j).score+(mat(i+1, j).from==INS1 ? gap_ext : gap_open); + int ins1=mat(i+1, j).score + +(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open); ins1=std::max(0, ins1); - int ins2=mat(i, j+1).score+(mat(i, j+1).from==INS2 ? gap_ext : gap_open); + int ins2=mat(i, j+1).score + +(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open); ins2=std::max(0, ins2); if (diag>=ins1) { if (diag>=ins2) { mat(i+1, j+1).score=diag; - mat(i+1, j+1).from=DIAG; + mat(i+1, j+1).from=impl::DIAG; } else { mat(i+1, j+1).score=ins2; - mat(i+1, j+1).from=INS2; + mat(i+1, j+1).from=impl::INS2; } } else if (ins1>ins2) { mat(i+1, j+1).score=ins1; - mat(i+1, j+1).from=INS1; + mat(i+1, j+1).from=impl::INS1; } else { mat(i+1, j+1).score=ins2; - mat(i+1, j+1).from=INS2; + mat(i+1, j+1).from=impl::INS2; } } } #if !defined(NDEBUG) - std::stringstream ss; - for (int i=0; i<mat.GetWidth(); ++i) { - for (int j=0; j<mat.GetHeight(); ++j) { - if (mat(i, j).from==DIAG) { - ss << "\\"; - } else if (mat(i, j).from==INS1) { - ss << "-"; - } else if (mat(i, j).from==INS2) { - ss << "|"; - } - if (i==0 && j>0) { - ss << s2[j-1]; - } - if (j==0 && i>0) { - ss << s1[i-1] << " "; - } - if (i==0 && j==0) { - ss << " "; - } - } - ss << std::endl; - } - LOG_DEBUG(ss.str()); + DbgWriteAlnMatrix(mat,s1, s2); #endif AlignmentList alignments; while (CollectPatches(mat, 0, mat.GetWidth(), 0, mat.GetHeight(), diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt index 53053090ee969e747f5f81c6f048f31c33424f29..f6d8ebe1493d5198882f57f101956baa8e6ff830 100644 --- a/modules/seq/alg/tests/CMakeLists.txt +++ b/modules/seq/alg/tests/CMakeLists.txt @@ -4,6 +4,7 @@ set(OST_SEQ_ALG_UNIT_TESTS tests.cc test_renumber.py test_local_align.py + test_global_align.py ) ost_unittest(seq_alg "${OST_SEQ_ALG_UNIT_TESTS}") diff --git a/modules/seq/alg/tests/test_global_align.py b/modules/seq/alg/tests/test_global_align.py new file mode 100644 index 0000000000000000000000000000000000000000..aed51741b8ef35f0afdbd02b250cc93ea1c05f23 --- /dev/null +++ b/modules/seq/alg/tests/test_global_align.py @@ -0,0 +1,50 @@ +import unittest +from ost import * +from ost import settings +from ost import seq +from ost.bindings.clustalw import * + +class TestGlobalAlign(unittest.TestCase): + def testDeletionInSeqB(self): + seq_a=seq.CreateSequence('A', 'aacdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + self.assertEqual(str(alns[0].sequences[0]), 'aacdefghiklmn') + self.assertEqual(str(alns[0].sequences[1]), '-acd---hiklmn') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + + def testDeletionInSeqA(self): + seq_a=seq.CreateSequence('A', 'acdhiklmn') + seq_b=seq.CreateSequence('B', 'acdefghiklmn') + alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + + self.assertEqual(str(alns[0].sequences[0]), 'acd---hiklmn') + self.assertEqual(str(alns[0].sequences[1]), 'acdefghiklmn') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + + def testOffset(self): + seq_a=seq.CreateSequence('A', 'acdhiklmn') + seq_b=seq.CreateSequence('B', 'ggiklmn') + alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + self.assertEqual(len(alns), 1) + self.assertEqual(alns[0].sequences[0].name, 'A') + self.assertEqual(alns[0].sequences[1].name, 'B') + + self.assertEqual(str(alns[0].sequences[0]), 'acdhiklmn') + self.assertEqual(str(alns[0].sequences[1]), 'g--giklmn') + self.assertEqual(alns[0].sequences[0].offset, 0) + self.assertEqual(alns[0].sequences[1].offset, 0) + +if __name__ == "__main__": + try: + unittest.main() + except Exception, e: + print e