Skip to content
Snippets Groups Projects
Commit 0546f80f authored by Bienchen's avatar Bienchen
Browse files

Added function seq.alg.GlobalAlign

parent 82e13203
No related branches found
No related tags found
No related merge requests found
......@@ -39,8 +39,8 @@
.. function:: LocalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2)
Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns the
best-scoring alignments as a list of pairwise alignments.
Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns
the best-scoring alignments as a list of pairwise alignments.
**Example:**
......@@ -63,5 +63,33 @@
:param gap_open: The gap opening penalty. Must be a negative number
:param gap_ext: The gap extension penalty. Must be a negative number
:returns: list of best-scoring, non-overlapping alignments of *seq1* and
*seq2*. The start of the alignments is stored in the sequence offset of the
two sequences.
\ No newline at end of file
*seq2*. Since alignments always start with a replacement, the start is
stored in the sequence offset of the two sequences.
.. function:: GlobalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2)
Performs a Needleman/Wunsch global alignment of *seq1* and *seq2* and returns
the best-scoring alignment.
**Example:**
.. code-block:: python
seq_a=seq.CreateSequence('A', 'acdefghiklmn')
seq_b=seq.CreateSequence('B', 'acdhiklmn')
alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62)
print alns[0].ToString(80)
# >>> A acdefghiklmn
# >>> B acd---hiklmn
:param seq1: A valid sequence
:type seq1: :class:`~ost.seq.ConstSequenceHandle`
:param seq2: A valid sequence
:type seq2: :class:`~ost.seq.ConstSequenceHandle`
:param subst_weigth: The substitution weights matrix
:type subst_weight: :class:`SubstWeightMatrix`
:param gap_open: The gap opening penalty. Must be a negative number
:param gap_ext: The gap extension penalty. Must be a negative number
:returns: best-scoring alignment of *seq1* and *seq2*.
......@@ -25,6 +25,7 @@
#include <ost/seq/alg/conservation.hh>
#include <ost/seq/alg/subst_weight_matrix.hh>
#include <ost/seq/alg/local_align.hh>
#include <ost/seq/alg/global_align.hh>
using namespace boost::python;
using namespace ost::seq;
using namespace ost::seq::alg;
......@@ -53,7 +54,9 @@ BOOST_PYTHON_MODULE(_seq_alg)
;
def("MergePairwiseAlignments", &MergePairwiseAlignments);
def("Conservation", &Conservation, (arg("assign")=true, arg("prop_name")="cons"));
def("LocalAlign", &LocalAlign, (arg("seq1"), arg("seq2"), arg("subst_weight"),
def("LocalAlign", &LocalAlign, (arg("seq1"), arg("seq2"),arg("subst_weight"),
arg("gap_open")=-5, arg("gap_ext")=-2));
def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"),
arg("gap_open")=-5, arg("gap_ext")=-2));
}
......@@ -7,11 +7,13 @@ alignment_opts.hh
merge_pairwise_alignments.hh
conservation.hh
local_align.hh
global_align.hh
)
set(OST_SEQ_ALG_SOURCES
merge_pairwise_alignments.cc
local_align.cc
global_align.cc
sequence_identity.cc
ins_del.cc
subst_weight_matrix.cc
......
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2011 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#include <ost/log.hh>
#include "impl/align_impl.hh"
#include "global_align.hh"
namespace ost { namespace seq { namespace alg {
void Traceback(impl::AlnMat& mat, int max_i, int max_j,
const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
AlignmentList& alignments)
{
int i = (max_i > 0 ? max_i-1 : 0);
int j = (max_j > 0 ? max_j-1 : 0);
String aln_str1;
String aln_str2;
while (i > 0 || j > 0) {
impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2);
}
impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments);
}
AlignmentList GlobalAlign(const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
alg::SubstWeightMatrixPtr& subst,
int gap_open,int gap_ext)
{
impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1);
// init first column of matrix
if (mat.GetHeight() > 1) {
mat(0, 1).score = mat(0, 0).score + gap_open;
mat(0, 1).from = impl::INS1;
}
for (int j = 2; j < mat.GetHeight(); ++j) {
mat(0, j).score = mat(0, j-1).score + gap_ext;
mat(0, j).from = impl::INS1;
}
// fill the alignment matrix
for (int i = 0; i < mat.GetWidth()-1; ++i) {
mat(i+1, 0).score = mat(i, 0).score
+ (mat(i, 0).from==impl::INS2 ? gap_ext : gap_open);
mat(i+1,0).from = impl::INS2;
for (int j = 0; j < mat.GetHeight()-1; ++j) {
char c1=s1[i];
char c2=s2[j];
short weight=subst->GetWeight(c1, c2);
int diag=weight+mat(i, j).score;
int ins1=mat(i+1, j).score
+(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open);
int ins2=mat(i, j+1).score
+(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open);
if (diag>=ins1) {
if (diag>=ins2) {
mat(i+1, j+1).score=diag;
mat(i+1, j+1).from=impl::DIAG;
} else {
mat(i+1, j+1).score=ins2;
mat(i+1, j+1).from=impl::INS2;
}
} else if (ins1>ins2) {
mat(i+1, j+1).score=ins1;
mat(i+1, j+1).from=impl::INS1;
} else {
mat(i+1, j+1).score=ins2;
mat(i+1, j+1).from=impl::INS2;
}
}
}
// write traceback matrix in debug mode
#if !defined(NDEBUG)
DbgWriteAlnMatrix(mat,s1, s2);
#endif
AlignmentList alignments;
Traceback(mat, mat.GetWidth(), mat.GetHeight(), s1, s2, alignments);
LOG_DEBUG(alignments.back().ToString(80));
return alignments;
}
}}}
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2011 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#ifndef OST_SEQ_ALG_GLOBAL_ALIGN_HH
#define OST_SEQ_ALG_GLOBAL_ALIGN_HH
#include <ost/seq/alignment_handle.hh>
#include <ost/seq/alg/subst_weight_matrix.hh>
#include "module_config.hh"
namespace ost { namespace seq { namespace alg {
AlignmentList DLLEXPORT_OST_SEQ_ALG GlobalAlign(const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
SubstWeightMatrixPtr& subst,
int gap_open=-5,int gap_ext=-2);
}}}
#endif
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2011 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#ifndef OST_ALIGN_IMPL_HH
#define OST_ALIGN_IMPL_HH
#include <ost/seq/alignment_handle.hh>
#include <ost/seq/alg/subst_weight_matrix.hh>
//#include "module_config.hh"
namespace ost { namespace seq { namespace alg { namespace impl {
typedef enum {
DIAG,
INS1,
INS2,
UNKN
} Path;
struct DLLEXPORT AlnPos {
AlnPos(): score(0), from(impl::UNKN) { }
int score;
impl::Path from;
};
struct DLLEXPORT AlnMat {
AlnMat(int width, int height):
mat_(width*height), width_(width), height_(height)
{ }
impl::AlnPos& operator()(int x, int y) { return mat_[x*height_+y]; }
const impl::AlnPos& operator()(int x, int y) const {
return mat_[x*height_+y];
}
int GetWidth() const { return width_; }
int GetHeight() const { return height_; }
private:
std::vector<impl::AlnPos> mat_;
int width_;
int height_;
};
inline void DLLEXPORT SetRoute(impl::AlnMat& mat, int& i, int& j,
const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
String& aln_str1, String& aln_str2)
{
switch (mat(i, j).from) {
case impl::DIAG:
--i;
--j;
aln_str1.push_back(s1[i]);
aln_str2.push_back(s2[j]);
break;
case impl::INS1:
--j;
aln_str1.push_back('-');
aln_str2.push_back(s2[j]);
break;
case impl::INS2:
--i;
aln_str1.push_back(s1[i]);
aln_str2.push_back('-');
break;
default:
assert(0 && "should never get here");
}
};
inline void DLLEXPORT StoreStrAsAln(String& aln_str1, String& aln_str2,
const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
const int i, const int j,
AlignmentList& alignments)
{
for (size_t x=0; x<aln_str1.size()/2; ++x) {
std::swap(aln_str1[x], aln_str1[aln_str1.size()-x-1]);
std::swap(aln_str2[x], aln_str2[aln_str1.size()-x-1]);
}
AlignmentHandle aln=CreateAlignment();
aln.AddSequence(CreateSequence(s1.GetName(), aln_str1));
aln.AddSequence(CreateSequence(s2.GetName(), aln_str2));
aln.SetSequenceOffset(0, i);
aln.SetSequenceOffset(1, j);
alignments.push_back(aln);
};
#if !defined(NDEBUG)
inline void DLLEXPORT DbgWriteAlnMatrix(impl::AlnMat& mat,
const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2)
{
std::stringstream ss;
ss << " ";
for (int j = 1; j < mat.GetHeight(); ++j) {
ss << s2[j-1];
}
ss << std::endl;
for (int i = 0; i < mat.GetWidth(); ++i) {
for (int j = 0; j < mat.GetHeight(); ++j) {
if (j==0 && i>0) {
ss << s1[i-1] << " ";
}
if (i==0 && j==0) {
ss << " ";
}
if (mat(i, j).from==impl::DIAG) {
ss << "\\";
} else if (mat(i, j).from==impl::INS1) {
ss << "-";
} else if (mat(i, j).from==impl::INS2) {
ss << "|";
} else if (mat(i, j).from==impl::UNKN) {
ss << "$";
}
}
ss << std::endl;
}
LOG_DEBUG(ss.str());
};
#endif
}}}}
#endif
......@@ -17,42 +17,12 @@
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#include <ost/log.hh>
#include "impl/align_impl.hh"
#include "local_align.hh"
namespace ost { namespace seq { namespace alg {
typedef enum {
DIAG,
INS1,
INS2,
UNKN
} Path;
struct AlnPos {
AlnPos(): score(0), from(UNKN) { }
int score;
Path from;
};
struct AlnMat {
AlnMat(int width, int height):
mat_(width*height), width_(width), height_(height)
{ }
AlnPos& operator()(int x, int y) { return mat_[x*height_+y]; }
const AlnPos& operator()(int x, int y) const { return mat_[x*height_+y]; }
int GetWidth() const { return width_; }
int GetHeight() const { return height_; }
private:
std::vector<AlnPos> mat_;
int width_;
int height_;
};
bool CollectPatches(AlnMat& mat, int min_i, int max_i, int min_j, int max_j,
bool CollectPatches(impl::AlnMat& mat, int min_i, int max_i, int min_j, int max_j,
const ConstSequenceHandle& s1,
const ConstSequenceHandle& s2,
AlignmentList& alignments)
......@@ -71,41 +41,12 @@ bool CollectPatches(AlnMat& mat, int min_i, int max_i, int min_j, int max_j,
String aln_str1;
String aln_str2;
while (i>min_i && j>min_j && mat(i, j).score>0) {
switch (mat(i, j).from) {
case DIAG:
--i;
--j;
aln_str1.push_back(s1[i]);
aln_str2.push_back(s2[j]);
break;
case INS1:
--j;
aln_str1.push_back('-');
aln_str2.push_back(s2[j]);
break;
case INS2:
--i;
aln_str1.push_back(s1[i]);
aln_str2.push_back('-');
break;
default:
assert(0 && "should never get here");
}
impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2);
}
if (aln_str1.size()<2) {
return false;
}
for (size_t x=0; x<aln_str1.size()/2; ++x) {
std::swap(aln_str1[x], aln_str1[aln_str1.size()-x-1]);
std::swap(aln_str2[x], aln_str2[aln_str1.size()-x-1]);
}
AlignmentHandle aln=CreateAlignment();
aln.AddSequence(CreateSequence(s1.GetName(), aln_str1));
aln.AddSequence(CreateSequence(s2.GetName(), aln_str2));
aln.SetSequenceOffset(0, i);
aln.SetSequenceOffset(1, j);
alignments.push_back(aln);
// clear used regions
impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments);
for (int x=i; x<=mmax_i; ++x) {
for (int y=0; y<mat.GetHeight(); ++y) {
mat(x, y).score=0;
......@@ -124,58 +65,38 @@ AlignmentList LocalAlign(const ConstSequenceHandle& s1,
alg::SubstWeightMatrixPtr& subst,
int gap_open, int gap_ext)
{
AlnMat mat(s1.GetLength()+1, s2.GetLength()+1);
impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1);
for (int i=0; i<mat.GetWidth()-1; ++i) {
for (int j=0; j<mat.GetHeight()-1; ++j) {
char c1=s1[i];
char c2=s2[j];
short weight=subst->GetWeight(c1, c2);
int diag=weight+mat(i, j).score;
int ins1=mat(i+1, j).score+(mat(i+1, j).from==INS1 ? gap_ext : gap_open);
int ins1=mat(i+1, j).score
+(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open);
ins1=std::max(0, ins1);
int ins2=mat(i, j+1).score+(mat(i, j+1).from==INS2 ? gap_ext : gap_open);
int ins2=mat(i, j+1).score
+(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open);
ins2=std::max(0, ins2);
if (diag>=ins1) {
if (diag>=ins2) {
mat(i+1, j+1).score=diag;
mat(i+1, j+1).from=DIAG;
mat(i+1, j+1).from=impl::DIAG;
} else {
mat(i+1, j+1).score=ins2;
mat(i+1, j+1).from=INS2;
mat(i+1, j+1).from=impl::INS2;
}
} else if (ins1>ins2) {
mat(i+1, j+1).score=ins1;
mat(i+1, j+1).from=INS1;
mat(i+1, j+1).from=impl::INS1;
} else {
mat(i+1, j+1).score=ins2;
mat(i+1, j+1).from=INS2;
mat(i+1, j+1).from=impl::INS2;
}
}
}
#if !defined(NDEBUG)
std::stringstream ss;
for (int i=0; i<mat.GetWidth(); ++i) {
for (int j=0; j<mat.GetHeight(); ++j) {
if (mat(i, j).from==DIAG) {
ss << "\\";
} else if (mat(i, j).from==INS1) {
ss << "-";
} else if (mat(i, j).from==INS2) {
ss << "|";
}
if (i==0 && j>0) {
ss << s2[j-1];
}
if (j==0 && i>0) {
ss << s1[i-1] << " ";
}
if (i==0 && j==0) {
ss << " ";
}
}
ss << std::endl;
}
LOG_DEBUG(ss.str());
DbgWriteAlnMatrix(mat,s1, s2);
#endif
AlignmentList alignments;
while (CollectPatches(mat, 0, mat.GetWidth(), 0, mat.GetHeight(),
......
......@@ -4,6 +4,7 @@ set(OST_SEQ_ALG_UNIT_TESTS
tests.cc
test_renumber.py
test_local_align.py
test_global_align.py
)
ost_unittest(seq_alg "${OST_SEQ_ALG_UNIT_TESTS}")
......
import unittest
from ost import *
from ost import settings
from ost import seq
from ost.bindings.clustalw import *
class TestGlobalAlign(unittest.TestCase):
def testDeletionInSeqB(self):
seq_a=seq.CreateSequence('A', 'aacdefghiklmn')
seq_b=seq.CreateSequence('B', 'acdhiklmn')
alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62)
self.assertEqual(len(alns), 1)
self.assertEqual(alns[0].sequences[0].name, 'A')
self.assertEqual(alns[0].sequences[1].name, 'B')
self.assertEqual(str(alns[0].sequences[0]), 'aacdefghiklmn')
self.assertEqual(str(alns[0].sequences[1]), '-acd---hiklmn')
self.assertEqual(alns[0].sequences[0].offset, 0)
self.assertEqual(alns[0].sequences[1].offset, 0)
def testDeletionInSeqA(self):
seq_a=seq.CreateSequence('A', 'acdhiklmn')
seq_b=seq.CreateSequence('B', 'acdefghiklmn')
alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62)
self.assertEqual(len(alns), 1)
self.assertEqual(alns[0].sequences[0].name, 'A')
self.assertEqual(alns[0].sequences[1].name, 'B')
self.assertEqual(str(alns[0].sequences[0]), 'acd---hiklmn')
self.assertEqual(str(alns[0].sequences[1]), 'acdefghiklmn')
self.assertEqual(alns[0].sequences[0].offset, 0)
self.assertEqual(alns[0].sequences[1].offset, 0)
def testOffset(self):
seq_a=seq.CreateSequence('A', 'acdhiklmn')
seq_b=seq.CreateSequence('B', 'ggiklmn')
alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62)
self.assertEqual(len(alns), 1)
self.assertEqual(alns[0].sequences[0].name, 'A')
self.assertEqual(alns[0].sequences[1].name, 'B')
self.assertEqual(str(alns[0].sequences[0]), 'acdhiklmn')
self.assertEqual(str(alns[0].sequences[1]), 'g--giklmn')
self.assertEqual(alns[0].sequences[0].offset, 0)
self.assertEqual(alns[0].sequences[1].offset, 0)
if __name__ == "__main__":
try:
unittest.main()
except Exception, e:
print e
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment