diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst new file mode 100644 index 0000000000000000000000000000000000000000..79e6ca62bb3d97082b37fcbb9afee719e12b276f --- /dev/null +++ b/modules/seq/alg/doc/seqalg.rst @@ -0,0 +1,99 @@ +:mod:`seq.alg <ost.seq.alg>` -- Algorithms for Sequences +================================================================================ + +.. module:: ost.seq.alg + :synopsis: Algorithms for sequences + + +.. function:: MergePairwiseAlignments(pairwise_alns, ref_seq) + + Merge a list of pairwise alignments into a multiple sequence alignments. This + function uses the reference sequence as the anchor and inserts gaps where + needed. This is also known as the *star method*. + + The resulting multiple sequence alignment provides a simple way to map between + residues of pairwise alignments, e.g. to compare distances in two structural + templates. + + There are a few things to keep in mind when using this function: + + - The reference sequence mustn't contain any gaps + + - The first sequence of each pairwise alignments corresponds to the reference + sequence. Apart from the presence of gaps, these two sequences must be + completely identical. + + - If the reference sequence has an offset, the first sequence of each pairwise alignment + must have the same offset. This offset is inherited by the first sequence of the final + output alignment. + + - The resulting multiple sequence alignment is by no means optimal. For + better results, consider using a multiple-sequence alignment program such + as MUSCLE or ClustalW. + + - Residues in columns where the reference sequence has gaps should not be + considered as aligned. There is no information in the pairwise alignment to + guide the merging, the result is undefined. + +.. autofunction:: AlignToSEQRES + +.. autofunction:: AlignmentFromChainView + + + +.. function:: LocalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2) + + Performs a Smith/Waterman local alignment of *seq1* and *seq2* and returns + the best-scoring alignments as a list of pairwise alignments. + + **Example:** + + .. code-block:: python + + seq_a=seq.CreateSequence('A', 'acdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.LocalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + print alns[0].ToString(80) + # >>> A acdefghiklmn + # >>> B acd---hiklmn + + :param seq1: A valid sequence + :type seq1: :class:`~ost.seq.ConstSequenceHandle` + :param seq2: A valid sequence + :type seq2: :class:`~ost.seq.ConstSequenceHandle` + + :param subst_weigth: The substitution weights matrix + :type subst_weight: :class:`SubstWeightMatrix` + :param gap_open: The gap opening penalty. Must be a negative number + :param gap_ext: The gap extension penalty. Must be a negative number + :returns: list of best-scoring, non-overlapping alignments of *seq1* and + *seq2*. Since alignments always start with a replacement, the start is + stored in the sequence offset of the two sequences. + + +.. function:: GlobalAlign(seq1, seq2, subst_weight, gap_open=-5, gap_ext=-2) + + Performs a Needleman/Wunsch global alignment of *seq1* and *seq2* and returns + the best-scoring alignment. + + **Example:** + + .. code-block:: python + + seq_a=seq.CreateSequence('A', 'acdefghiklmn') + seq_b=seq.CreateSequence('B', 'acdhiklmn') + alns=seq.alg.GlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62) + print alns[0].ToString(80) + # >>> A acdefghiklmn + # >>> B acd---hiklmn + + :param seq1: A valid sequence + :type seq1: :class:`~ost.seq.ConstSequenceHandle` + :param seq2: A valid sequence + :type seq2: :class:`~ost.seq.ConstSequenceHandle` + + :param subst_weigth: The substitution weights matrix + :type subst_weight: :class:`SubstWeightMatrix` + :param gap_open: The gap opening penalty. Must be a negative number + :param gap_ext: The gap extension penalty. Must be a negative number + :returns: best-scoring alignment of *seq1* and *seq2*. diff --git a/modules/seq/alg/src/merge_pairwise_alignments.cc b/modules/seq/alg/src/merge_pairwise_alignments.cc index 032e92ac9de3d0d994fb7449247c033707198cdb..3b45e335fc9a8749f2eb824f39cdec4b4fb27df2 100644 --- a/modules/seq/alg/src/merge_pairwise_alignments.cc +++ b/modules/seq/alg/src/merge_pairwise_alignments.cc @@ -90,7 +90,7 @@ SequenceHandle shift_reference(const ConstSequenceHandle& ref_seq, new_sequence.str()); if (ref_seq.HasAttachedView()) s.AttachView(ref_seq.GetAttachedView()); - s.SetOffset(s.GetOffset()); + s.SetOffset(ref_seq.GetOffset()); return s; } @@ -134,7 +134,23 @@ SequenceHandle realign_sequence(const AlignmentHandle& aln, AlignmentHandle MergePairwiseAlignments(const AlignmentList& pairwise_alns, const ConstSequenceHandle& ref_seq) { - + + int ref_offset=ref_seq.GetOffset(); + int alignment_counter=0; + for (AlignmentList::const_iterator i=pairwise_alns.begin(), e=pairwise_alns.end(); i!=e; ++i) { + AlignmentHandle ali_handle=*i; + ConstSequenceHandle seq_handle=ali_handle.GetSequence(0); + if (seq_handle.GetOffset()!=ref_offset) { + String error_string; + std::ostringstream error_ss(error_string); + error_ss << "The offset of the first sequence in alignment " << alignment_counter << + "is not identical to the offset of the reference sequence." << std::endl; + throw IntegrityError(error_string); + } + alignment_counter+=1; + } + + ShiftMap shifts; for (AlignmentList::const_iterator i=pairwise_alns.begin(), e=pairwise_alns.end(); i!=e; ++i) { diff --git a/modules/seq/alg/tests/test_merge_pairwise_alignments.cc b/modules/seq/alg/tests/test_merge_pairwise_alignments.cc index 3f67ac5d161840199ac352ffb09d5bf7c883603d..851f5c9f163911f1166492a25fd94ebcf4c5ea5a 100644 --- a/modules/seq/alg/tests/test_merge_pairwise_alignments.cc +++ b/modules/seq/alg/tests/test_merge_pairwise_alignments.cc @@ -24,7 +24,7 @@ #include <boost/test/unit_test.hpp> #include <ost/seq/alg/merge_pairwise_alignments.hh> - +#include <ost/integrity_error.hh> using namespace ost; @@ -107,4 +107,86 @@ BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_three) BOOST_CHECK_EQUAL(seqs[1].GetString(), "xyabcdefghijk"); BOOST_CHECK_EQUAL(seqs[2].GetString(), "-zabcdefghijk"); } + +BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_four) +{ + SequenceHandle ref=CreateSequence("REF", "abcdefghijk"); + SequenceHandle s1=CreateSequence("S1", "abcdefghijk--"); + SequenceHandle s2=CreateSequence("S2", "abcdefghijkxy"); + AlignmentHandle aln1=CreateAlignment(); + aln1.AddSequence(s1); + aln1.AddSequence(s2); + + SequenceHandle s3=CreateSequence("S1", "-abcdefghijk"); + SequenceHandle s4=CreateSequence("S2", "zabcdefghijk"); + + AlignmentHandle aln2=CreateAlignment(); + aln2.AddSequence(s3); + aln2.AddSequence(s4); + AlignmentList l; + l.push_back(aln1); + l.push_back(aln2); + AlignmentHandle m=alg::MergePairwiseAlignments(l, ref); + ConstSequenceList seqs=m.GetSequences(); + BOOST_CHECK_EQUAL(seqs[0].GetString(), "-abcdefghijk--"); + BOOST_CHECK_EQUAL(seqs[1].GetString(), "-abcdefghijkxy"); + BOOST_CHECK_EQUAL(seqs[2].GetString(), "zabcdefghijk--"); +} + +BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_five) +{ + SequenceHandle ref=CreateSequence("REF", "abcdefghijk"); + ref.SetOffset(4); + SequenceHandle s1=CreateSequence("S1", "abcdefghijk--"); + SequenceHandle s2=CreateSequence("S2", "abcdefghijkxy"); + s1.SetOffset(4); + AlignmentHandle aln1=CreateAlignment(); + aln1.AddSequence(s1); + aln1.AddSequence(s2); + + SequenceHandle s3=CreateSequence("S1", "-abcdefghijk"); + SequenceHandle s4=CreateSequence("S2", "zabcdefghijk"); + s3.SetOffset(4); + + AlignmentHandle aln2=CreateAlignment(); + aln2.AddSequence(s3); + aln2.AddSequence(s4); + AlignmentList l; + l.push_back(aln1); + l.push_back(aln2); + AlignmentHandle m=alg::MergePairwiseAlignments(l, ref); + ConstSequenceList seqs=m.GetSequences(); + + BOOST_CHECK_EQUAL(seqs[0].GetString(), "-abcdefghijk--"); + BOOST_CHECK_EQUAL(seqs[0].GetOffset(), 4); + BOOST_CHECK_EQUAL(seqs[1].GetString(), "-abcdefghijkxy"); + BOOST_CHECK_EQUAL(seqs[2].GetString(), "zabcdefghijk--"); +} + +BOOST_AUTO_TEST_CASE(merge_pairwise_alignments_six) +{ + SequenceHandle ref=CreateSequence("REF", "abcdefghijk"); + ref.SetOffset(4); + SequenceHandle s1=CreateSequence("S1", "abcdefghijk--"); + SequenceHandle s2=CreateSequence("S2", "abcdefghijkxy"); + s1.SetOffset(5); + AlignmentHandle aln1=CreateAlignment(); + aln1.AddSequence(s1); + aln1.AddSequence(s2); + + SequenceHandle s3=CreateSequence("S1", "-abcdefghijk"); + SequenceHandle s4=CreateSequence("S2", "zabcdefghijk"); + s3.SetOffset(4); + + AlignmentHandle aln2=CreateAlignment(); + aln2.AddSequence(s3); + aln2.AddSequence(s4); + AlignmentList l; + l.push_back(aln1); + l.push_back(aln2); + BOOST_CHECK_THROW (alg::MergePairwiseAlignments(l, ref),IntegrityError); + +} + + BOOST_AUTO_TEST_SUITE_END()