diff --git a/modules/seq/alg/CMakeLists.txt b/modules/seq/alg/CMakeLists.txt index 6955f55f9b8d0237224a2d4aab25aa3b5310003d..83b3b1278b615370144ec4b094fa0bf0bc279874 100644 --- a/modules/seq/alg/CMakeLists.txt +++ b/modules/seq/alg/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(src) -add_subdirectory(pymod) \ No newline at end of file +add_subdirectory(pymod) +add_subdirectory(tests) \ No newline at end of file diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index 6d610a04a90eacd6d6b5da5f47e5322cf88a08ab..d4c4462c861fdee993e124db514b39991a49d4db 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -18,9 +18,12 @@ //------------------------------------------------------------------------------ #include <boost/python.hpp> #include <boost/python/suite/indexing/vector_indexing_suite.hpp> -using namespace boost::python; + +#include <ost/seq/alg/merge_pairwise_alignments.hh> #include <ost/seq/alg/sequence_identity.hh> #include <ost/seq/alg/ins_del.hh> + +using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; @@ -43,5 +46,6 @@ BOOST_PYTHON_MODULE(_seq_alg) .def("GetDeletions", &InsDel::GetDeletions) .def("GetInsertions", &InsDel::GetInsertions) ; + def("MergePairwiseAlignments", &MergePairwiseAlignments); export_Align(); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index 038e982523cff1706e933e311dfeb5a15222d0e7..ae43bd38d1d9bea4f7fbacd7d50be2d9eef46775 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -6,9 +6,11 @@ subst_weight_matrix.hh local_align.hh alignment_opts.hh global_align.hh +merge_pairwise_alignments.hh ) set(OST_SEQ_ALG_SOURCES +merge_pairwise_alignments.cc global_align.cc sequence_identity.cc ins_del.cc diff --git a/modules/seq/alg/src/merge_pairwise_alignments.cc b/modules/seq/alg/src/merge_pairwise_alignments.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7a7c000ecebf9cc96711b59dff976a6c9620816 --- /dev/null +++ b/modules/seq/alg/src/merge_pairwise_alignments.cc @@ -0,0 +1,139 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2010 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <sstream> +#include <map> + +#include <ost/integrity_error.hh> +#include "merge_pairwise_alignments.hh" + +/* + Author: Marco Biasini + */ +namespace ost { namespace seq { namespace alg { + +typedef std::map<int, int> ShiftMap; + +namespace { + +void update_shifts(const AlignmentHandle& aln, + const SequenceHandle& ref_seq, + ShiftMap& shifts) +{ + ConstSequenceHandle s1=aln.GetSequence(0); + if (s1.GetGaplessString()!=ref_seq.GetString()) { + throw IntegrityError("The gapless version of '"+s1.GetString()+ + "' is not identical to the reference sequence."); + } + for (int i=0; i<s1.GetLength(); ++i) { + int shift=0; + int j=i; + while (j<s1.GetLength() && s1.GetOneLetterCode(j)=='-') { + shift++; + j++; + } + if (shift>0) { + assert(i>0); + int res_index=s1.GetResidueIndex(i-1); + ShiftMap::iterator p=shifts.find(res_index); + if (p!=shifts.end()) { + p->second=std::max(p->second, shift); + } else { + shifts.insert(std::make_pair(res_index, shift)); + } + } + i=j; + } +} + +SequenceHandle shift_reference(const SequenceHandle& ref_seq, + const ShiftMap& shifts) +{ + std::ostringstream new_sequence; + String ref_str=ref_seq.GetString(); + int last=0; + for (ShiftMap::const_iterator i=shifts.begin(),e=shifts.end();i!=e; ++i) { + new_sequence << ref_str.substr(last, i->first-last+1) + << String(i->second, '-'); + last=i->first+1; + } + new_sequence << ref_str.substr(last); + SequenceHandle s=CreateSequence(ref_seq.GetName(), + new_sequence.str()); + if (ref_seq.HasAttachedView()) + s.AttachView(ref_seq.GetAttachedView()); + s.SetSequenceOffset(s.GetSequenceOffset()); + return s; +} + +SequenceHandle realign_sequence(const AlignmentHandle& aln, + const ShiftMap& shifts) +{ + std::ostringstream new_sequence; + ConstSequenceHandle s1=aln.GetSequence(0); + ConstSequenceHandle s2=aln.GetSequence(1); + for (int i=0; i<s1.GetLength(); ++i) { + int shift=0; + int j=i; + while (j<s1.GetLength() && s1.GetOneLetterCode(j)=='-') { + shift++; + ++j; + } + if (i>0 && s1.GetOneLetterCode(i-1)!='-') { + ShiftMap::const_iterator p=shifts.find(s1.GetResidueIndex(i-1)); + if (p!=shifts.end()) { + int d=p->second-shift; + assert(d>=0); + new_sequence << String(d, '-'); + } + } + new_sequence << s2.GetOneLetterCode(i); + } + SequenceHandle s=CreateSequence(s2.GetName(), new_sequence.str()); + if (s2.HasAttachedView()) + s.AttachView(s2.GetAttachedView()); + s.SetSequenceOffset(s2.GetSequenceOffset()); + return s; +} + +} + +AlignmentHandle MergePairwiseAlignments(const AlignmentList& pairwise_alns, + const SequenceHandle& ref_seq) +{ + + ShiftMap shifts; + for (AlignmentList::const_iterator i=pairwise_alns.begin(), + e=pairwise_alns.end(); i!=e; ++i) { + update_shifts(*i, ref_seq, shifts); + } + + AlignmentHandle merged=CreateAlignment(); + merged.AddSequence(shift_reference(ref_seq, shifts)); + for (AlignmentList::const_iterator i=pairwise_alns.begin(), + e=pairwise_alns.end(); i!=e; ++i) { + SequenceHandle new_seq=realign_sequence(*i, shifts); + merged.AddSequence(new_seq); + } + return merged; +} + + +}}} diff --git a/modules/seq/alg/src/merge_pairwise_alignments.hh b/modules/seq/alg/src/merge_pairwise_alignments.hh new file mode 100644 index 0000000000000000000000000000000000000000..e021dbe09569a95d3cbb2bd0cfedac66ea29958b --- /dev/null +++ b/modules/seq/alg/src/merge_pairwise_alignments.hh @@ -0,0 +1,49 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2010 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_MERGE_PAIRWISE_ALIGNMENTS_HH +#define OST_SEQ_ALG_MERGE_PAIRWISE_ALIGNMENTS_HH + +#include <ost/seq/alignment_handle.hh> +#include <ost/seq/alg/module_config.hh> +/* + Author: Marco Biasini + */ +namespace ost { namespace seq { namespace alg { + + +/// \brief merge a list of pairwise alignments into one multiple sequence +/// alignment +/// +/// All sequences in the pairwise sequence alignments are a realigned to the +/// reference sequence. This is useful to merge the results of a BLAST or HMM +/// database search into one multiple sequence alignment. +/// +/// The method does not produce the optimal multiple sequence alignemnt for all +/// the sequences. +/// +/// \param pairwise_alignments is a list of AlignmentHandles, each containing +/// two sequences +/// \param ref_seq is the reference sequence. The reference sequence must not +/// contain any gaps. +AlignmentHandle DLLEXPORT_OST_SEQ_ALG +MergePairwiseAlignments(const AlignmentList& pairwise_alns, + const SequenceHandle& ref_seq); +}}} + +#endif diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..512de085684ed3392d006a2f3151f2174b1a744c --- /dev/null +++ b/modules/seq/alg/tests/CMakeLists.txt @@ -0,0 +1,7 @@ +set(OST_SEQ_ALG_UNIT_TESTS + test_merge_pairwise_alignments.cc + tests.cc +) + +ost_unittest(seq_alg "${OST_SEQ_ALG_UNIT_TESTS}") + diff --git a/modules/seq/alg/tests/test_merge_pairwise_alignments.cc b/modules/seq/alg/tests/test_merge_pairwise_alignments.cc new file mode 100644 index 0000000000000000000000000000000000000000..37d58e7073eebf2726fd5b8e170bb9096fec78bf --- /dev/null +++ b/modules/seq/alg/tests/test_merge_pairwise_alignments.cc @@ -0,0 +1,85 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2010 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +/* + * Authors: Marco Biasini, Juergen Haas + */ + +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> + +#include <ost/seq/alg/merge_pairwise_alignments.hh> + + + +using namespace ost; +using namespace ost::seq; + +BOOST_AUTO_TEST_SUITE(ost_seq_alg) + +BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_one) +{ + SequenceHandle ref=CreateSequence("REF", "abcdefghijklmn"); + SequenceHandle s1=CreateSequence("S1", "abcd---efghijklmn"); + SequenceHandle s2=CreateSequence("S2", "abcdxyzefghijklmn"); + AlignmentHandle aln1=CreateAlignment(); + aln1.AddSequence(s1); + aln1.AddSequence(s2); + + SequenceHandle s3=CreateSequence("S1", "abcdefghij---klmn"); + SequenceHandle s4=CreateSequence("S2", "abcdefghijxyzklmn"); + + AlignmentHandle aln2=CreateAlignment(); + aln2.AddSequence(s3); + aln2.AddSequence(s4); + AlignmentList l; + l.push_back(aln1); + l.push_back(aln2); + AlignmentHandle m=alg::MergePairwiseAlignments(l, ref); + ConstSequenceList seqs=m.GetSequences(); + BOOST_CHECK_EQUAL(seqs[0].GetString(), "abcd---efghij---klmn"); + BOOST_CHECK_EQUAL(seqs[1].GetString(), "abcdxyzefghij---klmn"); + BOOST_CHECK_EQUAL(seqs[2].GetString(), "abcd---efghijxyzklmn"); +} + +BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_two) +{ + SequenceHandle ref=CreateSequence("REF", "abcdefghijklmn"); + SequenceHandle s1=CreateSequence("S1", "abcd---efghijklmn"); + SequenceHandle s2=CreateSequence("S2", "abcdxyzefghijklmn"); + AlignmentHandle aln1=CreateAlignment(); + aln1.AddSequence(s1); + aln1.AddSequence(s2); + + SequenceHandle s3=CreateSequence("S1", "abcd-efghijklmn"); + SequenceHandle s4=CreateSequence("S2", "abcdyefghijklmn"); + + AlignmentHandle aln2=CreateAlignment(); + aln2.AddSequence(s3); + aln2.AddSequence(s4); + AlignmentList l; + l.push_back(aln1); + l.push_back(aln2); + AlignmentHandle m=alg::MergePairwiseAlignments(l, ref); + ConstSequenceList seqs=m.GetSequences(); + BOOST_CHECK_EQUAL(seqs[0].GetString(), "abcd---efghijklmn"); + BOOST_CHECK_EQUAL(seqs[1].GetString(), "abcdxyzefghijklmn"); + BOOST_CHECK_EQUAL(seqs[2].GetString(), "abcd--yefghijklmn"); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/modules/seq/alg/tests/tests.cc b/modules/seq/alg/tests/tests.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7e455b67d5f2c41282391e3d362f860c992290d --- /dev/null +++ b/modules/seq/alg/tests/tests.cc @@ -0,0 +1,22 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2010 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE ost_seq_alg +#include <boost/test/unit_test.hpp> +