diff --git a/modules/seq/alg/pymod/CMakeLists.txt b/modules/seq/alg/pymod/CMakeLists.txt index 93dd1b348e19c104bc5364eb04dbde4238bd5dfa..3b0a705877194de18c04de5d1b84561cccf22bfc 100644 --- a/modules/seq/alg/pymod/CMakeLists.txt +++ b/modules/seq/alg/pymod/CMakeLists.txt @@ -1,6 +1,5 @@ set(OST_SEQ_ALG_PYMOD_SOURCES wrap_seq_alg.cc - export_align.cc ) pymod(NAME seq_alg OUTPUT_DIR ost/seq/alg diff --git a/modules/seq/alg/pymod/export_align.cc b/modules/seq/alg/pymod/export_align.cc deleted file mode 100644 index 49a8d1c8bc5dffb55230accdbaf988d8014827bd..0000000000000000000000000000000000000000 --- a/modules/seq/alg/pymod/export_align.cc +++ /dev/null @@ -1,59 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2010 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -#include <boost/python.hpp> -#include <boost/python/register_ptr_to_python.hpp> -#include <boost/python/suite/indexing/vector_indexing_suite.hpp> -using namespace boost::python; - -#include <ost/info/info.hh> - -#include <ost/mol/mol.hh> -#include <ost/seq/alg/subst_weight_matrix.hh> -#include <ost/seq/alg/local_align.hh> -#include <ost/seq/alg/global_align.hh> -using namespace ost; -using namespace ost::seq; -using namespace ost::seq::alg; - -void export_Align() -{ - class_<SubstWeightMatrix, SubstWeightMatrixPtr>("SubstWeightMatrix", init<>()) - .def("GetWeight", &SubstWeightMatrix::GetWeight) - .def("SetWeight", &SubstWeightMatrix::SetWeight) - ; -class_<AlignmentOpts>("AlignmentOpts", init<int,int, SubstWeightMatrixPtr>()) - .def_readwrite("gap_open_penalty", &AlignmentOpts::gap_open_penalty) - .def_readwrite("gap_extension_penalty", - &AlignmentOpts::gap_extension_penalty) - .def_readwrite("subst_weights", &AlignmentOpts::subst_weights) -; - def("SubstWeightMatrixToInfo", &SubstWeightMatrixToInfo); - def("SubstWeightMatrixFromInfo", &SubstWeightMatrixFromInfo); - - class_<AlignedPatch>("AlignedPatch", no_init) - .def_readwrite("alignment", &AlignedPatch::alignment) - .def_readwrite("offset_a", &AlignedPatch::offset_a) - .def_readwrite("offset_b", &AlignedPatch::offset_b) - ; - class_<AlignedPatchList>("AlignedPatchList", init<>()) - .def(vector_indexing_suite<AlignedPatchList>()) - ; - def("LocalSequenceAlign", &LocalSequenceAlign); - def("GlobalSequenceAlign", &GlobalSequenceAlign); -} diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index 561d0379c24210f86a9bee8a321f2ee7431c7723..b9e4b683e0b3b6a599b420488fc08b5f552c1736 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -27,8 +27,6 @@ using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; -void export_Align(); - BOOST_PYTHON_MODULE(_seq_alg) { enum_<RefMode::Type>("RefMode") @@ -48,5 +46,5 @@ BOOST_PYTHON_MODULE(_seq_alg) ; def("MergePairwiseAlignments", &MergePairwiseAlignments); def("Conservation", &Conservation, (arg("assign")=true, arg("prop_name")="cons")); - export_Align(); + } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index 5f3cac427b7c4b20094856c135accdd854aac184..7b4f86f53e5bb8c08feeba088deb1c8319691d0d 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -3,20 +3,16 @@ sequence_identity.hh module_config.hh ins_del.hh subst_weight_matrix.hh -local_align.hh alignment_opts.hh -global_align.hh merge_pairwise_alignments.hh conservation.hh ) set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc -global_align.cc sequence_identity.cc ins_del.cc subst_weight_matrix.cc -local_align.cc conservation.cc ) diff --git a/modules/seq/alg/src/global_align.cc b/modules/seq/alg/src/global_align.cc deleted file mode 100644 index 6003ab84fe79bfbb754913f906e7b8440abf5b51..0000000000000000000000000000000000000000 --- a/modules/seq/alg/src/global_align.cc +++ /dev/null @@ -1,128 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2010 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -/* - Author: Marco Biasini - */ -#include <ost/seq/alg/global_align.hh> - -namespace ost { namespace seq { namespace alg { - -namespace { - -class GlobalAlnMatrix { -public: - GlobalAlnMatrix(int num_cols, int num_rows) - : num_rows_(num_rows), num_cols_(num_cols) { - data_.resize(num_cols_*num_rows_,0); - } - short Get(int x, int y) const { - assert(x>=0 && x<num_cols_); - assert(y>=0 && y<num_rows_); - return data_[num_cols_*y+x]; - } - void Set(int x, int y, short value) { - data_[num_cols_*y+x]=value; - } - - int num_rows_; - int num_cols_; - std::vector<short> data_; -}; - -} - -AlignmentHandle GlobalSequenceAlign(const SequenceHandle& seq_a, - const SequenceHandle& seq_b, - const AlignmentOpts& opts) -{ - // using the strings directly is much better for performance reasons - String ssa=seq_a.GetString(); - String ssb=seq_b.GetString(); - GlobalAlnMatrix mat(ssa.size()+1, ssb.size()+1); - - // build-up aln mattrix - for (size_t i=0; i<ssa.size()+1; ++i) { - mat.Set(i, 0, i*opts.gap_open_penalty); - } - for (size_t i=0; i<ssb.size()+1; ++i) { - mat.Set(0, i, i*opts.gap_open_penalty); - } - for (size_t i=1; i<ssa.size()+1; ++i) { - for (size_t j=1; j<ssb.size()+1; ++j) { - short mm=opts.subst_weights->GetWeight(ssa[i-1], ssb[j-1]); - short m11=mat.Get(i-1, j-1)+mm; - short m10=mat.Get(i-1, j)-opts.gap_open_penalty; - short m01=mat.Get(i, j-1)-opts.gap_open_penalty; - mat.Set(i, j, std::max(m11, std::max(m10, m01))); - } - } - - // backtracking - size_t i=ssa.size(); - size_t j=ssb.size(); - String asa; - String asb; - asa.reserve(ssa.size()); - asb.reserve(ssb.size()); - - while (i>0 && j>0) { - // find out where we came from - short m11=mat.Get(i-1, j-1); - short m10=mat.Get(i-1, j); - short m01=mat.Get(i, j-1); - short m00=mat.Get(i, j); - short mm=opts.subst_weights->GetWeight(ssa[i-1], ssb[j-1]); - if (m11+mm==m00) { // match - asa.append(1, ssa[i-1]); - asb.append(1, ssb[j-1]); - --j; - --i; - } else if (m10-opts.gap_open_penalty==m00) { // seq b has a gap - asa.append(1, ssa[i-1]); - asb.append(1, '-'); - --i; - } else if (m01-opts.gap_open_penalty==m00) { // seq a has a gap - asa.append(1, '-'); - asb.append(1, ssb[j-1]); - --j; - } - } - while (i>0) { - asa.append(1, ssa[i-1]); - asb.append(1, '-'); - --i; - } - while (j>0) { - asa.append(1, '-'); - asb.append(1, ssb[j-1]); - --j; - } - // reverse the two strings - String rasa, rasb; - rasa.reserve(asa.size()); - rasb.reserve(asb.size()); - std::copy(asa.rbegin(), asa.rend(), std::back_inserter(rasa)); - std::copy(asb.rbegin(), asb.rend(), std::back_inserter(rasb)); - AlignmentHandle aln=CreateAlignment(); - aln.AddSequence(CreateSequence(seq_a.GetName(), rasa)); - aln.AddSequence(CreateSequence(seq_b.GetName(), rasb)); - return aln; -} - -}}} diff --git a/modules/seq/alg/src/global_align.hh b/modules/seq/alg/src/global_align.hh deleted file mode 100644 index 9b477fa9bd2c4c06755b7858b5f1f0d2cea6dcee..0000000000000000000000000000000000000000 --- a/modules/seq/alg/src/global_align.hh +++ /dev/null @@ -1,42 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2010 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -#ifndef OST_SEQ_ALG_GLOBAL_ALIGN_HH -#define OST_SEQ_ALG_GLOBAL_ALIGN_HH -/* - Author: Marco Biasini - */ -#include <ost/seq/alignment_handle.hh> -#include <ost/seq/alg/module_config.hh> -#include <ost/seq/alg/alignment_opts.hh> - -namespace ost { namespace seq { namespace alg { - -/// \brief globally align two sequences -/// -/// the optimal global alignment of the two sequences, given the substitution -/// weights and gap opening penalties is computed by using a classical -/// Needleman-Wunsch algorithm. The algorithm has O(n*m) complexity in both -/// space and time. -AlignmentHandle DLLEXPORT_OST_SEQ_ALG -GlobalSequenceAlign(const SequenceHandle& seq_a, const SequenceHandle& seq_b, - const AlignmentOpts& opts); - -}}} - -#endif \ No newline at end of file diff --git a/modules/seq/alg/src/local_align.cc b/modules/seq/alg/src/local_align.cc deleted file mode 100644 index 197a34226b214c09c960be1ab8eb046c1b7ac1cf..0000000000000000000000000000000000000000 --- a/modules/seq/alg/src/local_align.cc +++ /dev/null @@ -1,160 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2010 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -#include "local_align.hh" - -namespace ost { namespace seq { namespace alg { - -namespace { - -class AliMatrix { -public: - AliMatrix(int num_rows, int num_cols) - : num_rows_(num_rows), num_cols_(num_cols) { - data_.resize(num_cols_*num_rows_,0); - } - short Get(int x, int y) const { - assert(x>=0 && x<num_rows_); - assert(y>=0 && y<num_cols_); - return data_[num_cols_*x+y]; - } - void Set(int x, int y, short value) { - data_[num_cols_*x+y]=value; - } - - int num_rows_; - int num_cols_; - std::vector<short> data_; -}; - -std::pair<int, int> FindMax(const AliMatrix& mat, int min_col, - int max_col, int min_row, int max_row) { - int max_i=0, max_j=0, max_value=-1; - for (int i=min_col; i<max_col; ++i) { - for (int j=min_row; j<max_row; ++j) { - int x=mat.Get(i, j); - if (x>max_value) { - max_value=x; - max_i=i; max_j=j; - } - } - } - return std::make_pair(max_i, max_j); -} - -// divide and conquer the alignment matrix to get all patches. -void CollectPatches(AlignedPatchList& patches, const String& seq_a, - const String& seq_b, const AliMatrix& ali_mat, int min_col, - int max_col, int min_row, int max_row) { - if (max_col-min_col<2 || max_row-min_row<2) - return; - String seq_c, seq_d; - seq_c.reserve(std::max(seq_a.length(), seq_b.length())); - seq_d.reserve(std::max(seq_a.length(), seq_b.length())); - std::pair<int, int> best=FindMax(ali_mat, min_col, max_col, min_row, max_row); - std::pair<int, int> max_pair=best; - - typedef std::pair<int, int> XY; - std::vector<XY> stack; - while(ali_mat.Get(best.first, best.second)>0 && best.first>=min_col && - best.second>=min_row) { - stack.push_back(XY(best.first, best.second)); - int a=ali_mat.Get(best.first-1, best.second-1); - int b=ali_mat.Get(best.first-1, best.second); - int c=ali_mat.Get(best.first, best.second-1); - if (a>=c) { - if (a>=b) { - best.first-=1; - best.second-=1; - } else { - best.first-=1; - } - } else { - if (c>b) { - best.second-=1; - } else { - best.first-=1; - } - } - } - for (int i=static_cast<int>(stack.size())-2; i>=0; --i) { - XY next=stack[i]; - XY curr=stack[i+1]; - if (next.first==curr.first+1) { - if (next.second==curr.second+1) { - seq_c.append(1, seq_a[next.first-1]); - seq_d.append(1, seq_b[next.second-1]); - } else { - seq_c.append(1, seq_a[next.first-1]); - seq_d.append(1, '-'); - } - } else { - if (next.second==curr.second+1) { - seq_c.append(1, '-'); - seq_d.append(1, seq_b[next.second-1]); - } else { - seq_c.append(1, seq_a[next.first-1]); - seq_d.append(1, '-'); - } - } - } - if (seq_c.size()>2 && seq_c.size()>2) { - patches.push_back(AlignedPatch()); - patches.back().offset_a=best.first; - patches.back().offset_b=best.second; - if (seq_c[0]=='-' || seq_d[0]=='-') { - seq_c.erase(seq_c.begin()); - seq_d.erase(seq_d.begin()); - } - patches.back().alignment=CreateAlignment(); - patches.back().alignment.AddSequence(CreateSequence("seq1", seq_c)); - patches.back().alignment.AddSequence(CreateSequence("seq2 ", seq_d)); - CollectPatches(patches, seq_a, seq_b, ali_mat, max_pair.first+1, max_col, - max_pair.second+1, max_row); - } -} - -} - -AlignedPatchList LocalSequenceAlign(const SequenceHandle& sseq_a, - const SequenceHandle& sseq_b, - const AlignmentOpts& opts) -{ - - String seq_a=sseq_a.GetString(); - String seq_b=sseq_b.GetString(); - AliMatrix ali_mat(seq_a.length()+1, seq_b.length()+1); - // build alignment matrix. - for (size_t i=0; i<seq_a.length(); ++i) { - for (size_t j=0; j<seq_b.length(); ++j) { - int w=opts.subst_weights->GetWeight(seq_a[i], seq_b[j]); - int match_mism=std::max(0, w+int(ali_mat.Get(i, j))); - int del=ali_mat.Get(i, j+1)-opts.gap_open_penalty; - int ins=ali_mat.Get(i+1, j)-opts.gap_open_penalty; - ali_mat.Set(i+1, j+1, std::max(match_mism, std::max(del, ins))); - } - } - AlignedPatchList patches; - // backtracking - CollectPatches(patches, seq_a, seq_b, ali_mat, 0, seq_a.length()+1, - 0, seq_b.length()+1); - return patches; -} - -}}} - diff --git a/modules/seq/alg/src/local_align.hh b/modules/seq/alg/src/local_align.hh deleted file mode 100644 index b685851d39994c8106c3368b6127c356898d5428..0000000000000000000000000000000000000000 --- a/modules/seq/alg/src/local_align.hh +++ /dev/null @@ -1,70 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2010 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -#ifndef OST_SEQ_LOCAL_ALIGN_HH -#define OST_SEQ_LOCAL_ALIGN_HH - - -#include <ost/seq/alignment_handle.hh> -#include <ost/seq/alg/module_config.hh> -#include <ost/seq/alg/alignment_opts.hh> -#include <ost/seq/alg/subst_weight_matrix.hh> -/* - Author: Marco Biasini - */ - -namespace ost { namespace seq { namespace alg { - -/// \brief holds the result of a local sequence alignment -struct DLLEXPORT_OST_SEQ_ALG AlignedPatch { - /// \brief offset of aligned patch relative to the start of the first sequence - int offset_a; - /// \brief offset of aligned patch relative to the start of the first sequence - int offset_b; - /// \brief the aligned patch - AlignmentHandle alignment; - bool operator==(const AlignedPatch& rhs) const { - return offset_a==rhs.offset_a && offset_b==rhs.offset_b && - rhs.alignment==alignment; - } - bool operator!=(const AlignedPatch& rhs) const { - return !this->operator==(rhs); - } -}; - -typedef std::vector<AlignedPatch> AlignedPatchList; - -/// \brief Align pair of sequences locally with Waterman-Smith algorithm -/// -/// The alignment algorithm will return the set of optimal alignments given the -/// weights of the substitution matrix and gap opening and extension penalties. -/// The sequence patches are non-overlapping -/// -/// \param seq_a is the first sequence to be aligned -/// \param seq_b is the second sequence to be aligned -/// -/// \param opts holds a set of alignment options such as gap opening and -/// extension penalties and the substitution matrix -/// -/// \return list of locally aligned sequence patches. -AlignedPatchList DLLEXPORT_OST_SEQ_ALG -LocalSequenceAlign(const SequenceHandle& seq_a, const SequenceHandle& seq_b, - const AlignmentOpts& opts); -}}} - -#endif