From 23a01b0b32a5b0b401b0e4d5fcc6981f3576ddbf Mon Sep 17 00:00:00 2001
From: Martino Bertoni <martino.bertoni@unibas.ch>
Date: Fri, 8 Nov 2013 15:29:14 +0100
Subject: [PATCH] added a semiglobal alignment. equal to global alignment but
 without starting and ending gaps penalty. good when 1 sequence is much
 smaller then the other or you expect the 2 sequences just to overlap a bit.

---
 modules/seq/alg/pymod/wrap_seq_alg.cc         |   3 +
 modules/seq/alg/src/CMakeLists.txt            |   2 +
 modules/seq/alg/src/semiglobal_align.cc       | 103 ++++++++++++++++++
 modules/seq/alg/src/semiglobal_align.hh       |  35 ++++++
 modules/seq/alg/tests/CMakeLists.txt          |   1 +
 .../seq/alg/tests/test_semiglobal_align.py    |  23 ++++
 6 files changed, 167 insertions(+)
 create mode 100644 modules/seq/alg/src/semiglobal_align.cc
 create mode 100644 modules/seq/alg/src/semiglobal_align.hh
 create mode 100644 modules/seq/alg/tests/test_semiglobal_align.py

diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc
index f22866761..418defe25 100644
--- a/modules/seq/alg/pymod/wrap_seq_alg.cc
+++ b/modules/seq/alg/pymod/wrap_seq_alg.cc
@@ -26,6 +26,7 @@
 #include <ost/seq/alg/subst_weight_matrix.hh>
 #include <ost/seq/alg/local_align.hh>
 #include <ost/seq/alg/global_align.hh>
+#include <ost/seq/alg/semiglobal_align.hh>
 #include <ost/seq/alg/entropy.hh>
 using namespace boost::python;
 using namespace ost::seq;
@@ -59,6 +60,8 @@ BOOST_PYTHON_MODULE(_ost_seq_alg)
       arg("gap_open")=-5, arg("gap_ext")=-2));
   def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), 
       arg("gap_open")=-5, arg("gap_ext")=-2));
+  def("SemiGlobalAlign", &SemiGlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), 
+      arg("gap_open")=-5, arg("gap_ext")=-2));
   def("ShannonEntropy", &ShannonEntropy, (arg("aln"), arg("ignore_gaps")=true));
 
 }
diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt
index ea6284a98..76549ab0d 100644
--- a/modules/seq/alg/src/CMakeLists.txt
+++ b/modules/seq/alg/src/CMakeLists.txt
@@ -9,12 +9,14 @@ merge_pairwise_alignments.hh
 conservation.hh
 local_align.hh
 global_align.hh
+semiglobal_align.hh
 )
 
 set(OST_SEQ_ALG_SOURCES
 merge_pairwise_alignments.cc
 local_align.cc
 global_align.cc
+semiglobal_align.cc
 entropy.cc
 sequence_identity.cc
 ins_del.cc
diff --git a/modules/seq/alg/src/semiglobal_align.cc b/modules/seq/alg/src/semiglobal_align.cc
new file mode 100644
index 000000000..4b89fd15c
--- /dev/null
+++ b/modules/seq/alg/src/semiglobal_align.cc
@@ -0,0 +1,103 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2011 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+#include <ost/log.hh>
+#include "impl/align_impl.hh"
+#include "semiglobal_align.hh"
+
+namespace ost { namespace seq { namespace alg {
+
+void SemiTraceback(impl::AlnMat& mat, int max_i, int max_j,
+                  const ConstSequenceHandle& s1, 
+                  const ConstSequenceHandle& s2,
+                  AlignmentList& alignments)
+{
+  int i = (max_i > 0 ? max_i-1 : 0);
+  int j = (max_j > 0 ? max_j-1 : 0);
+  String aln_str1;
+  String aln_str2;
+  while (i > 0 || j > 0) {
+    impl::SetRoute(mat, i, j, s1, s2, aln_str1, aln_str2);
+  }
+  impl::StoreStrAsAln(aln_str1, aln_str2, s1, s2, i, j, alignments);
+}
+
+AlignmentList SemiGlobalAlign(const ConstSequenceHandle& s1, 
+                              const ConstSequenceHandle& s2,
+                              alg::SubstWeightMatrixPtr& subst,
+                              int gap_open,int gap_ext)
+{
+  impl::AlnMat mat(s1.GetLength()+1, s2.GetLength()+1);
+  // init first column of matrix no start gap penalty
+  if (mat.GetHeight() > 1) {
+    mat(0, 1).score = mat(0, 0).score;
+    mat(0, 1).from = impl::INS1;
+  }
+  for (int j = 2; j < mat.GetHeight(); ++j) {
+    mat(0, j).score = mat(0, j-1).score;
+    mat(0, j).from = impl::INS1;
+  }
+  // fill the alignment matrix
+  for (int i = 0; i < mat.GetWidth()-1; ++i) {
+    mat(i+1, 0).score = mat(i, 0).score;
+    mat(i+1, 0).from = impl::INS2;
+    for (int j = 0; j < mat.GetHeight()-1; ++j) {
+      char c1=s1[i];
+      char c2=s2[j];
+      short weight=subst->GetWeight(c1, c2);
+      int diag=weight+mat(i, j).score;
+      int ins1=mat(i+1, j).score
+              +(mat(i+1, j).from==impl::INS1 ? gap_ext : gap_open);
+      int ins2=mat(i, j+1).score
+              +(mat(i, j+1).from==impl::INS2 ? gap_ext : gap_open);
+      // ignore end gaps (if one of the two seqs is done)
+      if (j+1==mat.GetHeight()-1){
+        ins2=mat(i, j+1).score;
+      }
+      if (i+1==mat.GetWidth()-1){
+        ins1=mat(i+1, j).score;
+      }
+      if (diag>=ins1) {
+        if (diag>=ins2) {
+          mat(i+1, j+1).score=diag;
+          mat(i+1, j+1).from=impl::DIAG;
+        } else {
+          mat(i+1, j+1).score=ins2;
+          mat(i+1, j+1).from=impl::INS2;
+        }
+      } else if (ins1>ins2) {
+        mat(i+1, j+1).score=ins1;
+        mat(i+1, j+1).from=impl::INS1;
+      } else {
+        mat(i+1, j+1).score=ins2;
+        mat(i+1, j+1).from=impl::INS2;
+      }
+    }
+  }
+  // write traceback matrix in debug mode
+#if !defined(NDEBUG)
+  DbgWriteAlnMatrix(mat,s1, s2);
+#endif
+  AlignmentList alignments;
+  SemiTraceback(mat, mat.GetWidth(), mat.GetHeight(), s1, s2, alignments);
+  LOG_DEBUG(alignments.back().ToString(80));
+
+  return alignments;
+}
+
+}}}
diff --git a/modules/seq/alg/src/semiglobal_align.hh b/modules/seq/alg/src/semiglobal_align.hh
new file mode 100644
index 000000000..6d1ca5002
--- /dev/null
+++ b/modules/seq/alg/src/semiglobal_align.hh
@@ -0,0 +1,35 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2011 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+#ifndef OST_SEQ_ALG_SEMIGLOBAL_ALIGN_HH
+#define OST_SEQ_ALG_SEMIGLOBAL_ALIGN_HH
+
+#include <ost/seq/alignment_handle.hh>
+#include <ost/seq/alg/subst_weight_matrix.hh>
+#include "module_config.hh"
+
+namespace ost { namespace seq { namespace alg {
+
+
+AlignmentList DLLEXPORT_OST_SEQ_ALG SemiGlobalAlign(const ConstSequenceHandle& s1,
+                                                    const ConstSequenceHandle& s2,
+                                                    SubstWeightMatrixPtr& subst,
+                                                    int gap_open=-5,int gap_ext=-2);
+}}}
+
+#endif
diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt
index 76b599790..ca0e72988 100644
--- a/modules/seq/alg/tests/CMakeLists.txt
+++ b/modules/seq/alg/tests/CMakeLists.txt
@@ -5,6 +5,7 @@ set(OST_SEQ_ALG_UNIT_TESTS
   test_renumber.py
   test_local_align.py
   test_global_align.py
+  test_semiglobal_align.py
   test_weight_matrix.py
   test_aligntoseqres.py
 )
diff --git a/modules/seq/alg/tests/test_semiglobal_align.py b/modules/seq/alg/tests/test_semiglobal_align.py
new file mode 100644
index 000000000..e50198928
--- /dev/null
+++ b/modules/seq/alg/tests/test_semiglobal_align.py
@@ -0,0 +1,23 @@
+import unittest
+from ost import *
+from ost import settings
+from ost import seq
+from ost.bindings.clustalw import *
+
+class TestSemiGlobalAlign(unittest.TestCase):
+  def testSemiGlobalAlignment(self):
+    seq_a=seq.CreateSequence('A', 'abcdefghijklmnok')
+    seq_b=seq.CreateSequence('B', 'cdehijk')
+    alns=seq.alg.SemiGlobalAlign(seq_a, seq_b, seq.alg.BLOSUM62)
+    self.assertEqual(len(alns), 1)
+    self.assertEqual(alns[0].sequences[0].name, 'A')
+    self.assertEqual(alns[0].sequences[1].name, 'B')
+    self.assertEqual(str(alns[0].sequences[0]), 'abcdefghijklmnok')
+    self.assertEqual(str(alns[0].sequences[1]), '--cde--hijk-----')
+    self.assertEqual(alns[0].sequences[0].offset, 0)
+    self.assertEqual(alns[0].sequences[1].offset, 0)
+
+
+if __name__ == "__main__":
+  from ost import testutils
+  testutils.RunTests()
-- 
GitLab