From 54b004566a3ec88f306ddaa4658178caca98ef1a Mon Sep 17 00:00:00 2001
From: marco <marco@5a81b35b-ba03-0410-adc8-b2c5c5119f08>
Date: Mon, 30 Aug 2010 07:30:37 +0000
Subject: [PATCH] speedup and unit test for sequence identity calculation

git-svn-id: https://dng.biozentrum.unibas.ch/svn/openstructure/trunk@2676 5a81b35b-ba03-0410-adc8-b2c5c5119f08
---
 modules/seq/alg/pymod/wrap_seq_alg.cc         |   4 +-
 modules/seq/alg/src/sequence_identity.cc      |  33 +++--
 modules/seq/alg/tests/CMakeLists.txt          |   1 +
 .../seq/alg/tests/test_sequence_identity.cc   | 132 ++++++++++++++++++
 4 files changed, 159 insertions(+), 11 deletions(-)
 create mode 100644 modules/seq/alg/tests/test_sequence_identity.cc

diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc
index 7ec249db1..f632d3d6e 100644
--- a/modules/seq/alg/pymod/wrap_seq_alg.cc
+++ b/modules/seq/alg/pymod/wrap_seq_alg.cc
@@ -35,8 +35,8 @@ BOOST_PYTHON_MODULE(_seq_alg)
     .value("LONGER_SEQUENCE", RefMode::LONGER_SEQUENCE)
     .export_values()
   ;
-  def("SequenceIdentity", &SequenceIdentity, arg("seq_a")=0, arg("seq_b")=1);
-  
+  def("SequenceIdentity", &SequenceIdentity, 
+      (arg("ref_mode")=RefMode::ALIGNMENT, arg("seq_a")=0, arg("seq_b")=1));
   class_<AlignedRegionList>("AlignedRegionList", init<>())
     .def(vector_indexing_suite<AlignedRegionList>())
   ;
diff --git a/modules/seq/alg/src/sequence_identity.cc b/modules/seq/alg/src/sequence_identity.cc
index 855d3ec48..5d0a7dc2d 100644
--- a/modules/seq/alg/src/sequence_identity.cc
+++ b/modules/seq/alg/src/sequence_identity.cc
@@ -30,28 +30,43 @@ Real SequenceIdentity(const AlignmentHandle& aln, RefMode::Type ref_mode,
 {
   int non_gap_count=0;
   int identical_count=0;
-  for (AlignedColumnIterator i=aln.begin(), e=aln.end(); i!=e; ++i) {
-    AlignedColumn col=*i;
-    if (!(col[seq_a] == '-' && col[seq_b]=='-')) {
+  const String& sa=aln.GetSequence(seq_a).GetString();
+  const String& sb=aln.GetSequence(seq_b).GetString();  
+  for (String::const_iterator 
+       i=sa.begin(), e=sa.end(), j=sb.begin(); i!=e; ++i, ++j) {
+    if (!((*i)== '-' || (*j)=='-')) {
       non_gap_count++;
     }
-    if (col[seq_a]!='-' && col[seq_a]==col[seq_b]) {
+    if ((*i)!='-' && (*i)==(*j)) {
       identical_count++;
     }
   }
   Real seq_id=0.0;
-  // devide by length of longer sequence:
+  // divide by length of longer sequence:
   if (ref_mode==RefMode::LONGER_SEQUENCE) {
     int wo_gaps_a=aln.GetSequence(seq_a).GetGaplessString().length();
     int wo_gaps_b=aln.GetSequence(seq_b).GetGaplessString().length();
     if(wo_gaps_a>=wo_gaps_b) {
-      seq_id=100*static_cast<Real>(identical_count)/wo_gaps_a;
+      if (wo_gaps_a==0) {
+        seq_id=0;
+      } else {
+        seq_id=100*static_cast<Real>(identical_count)/wo_gaps_a;        
+      }
+
     } else {
-      seq_id=100*static_cast<Real>(identical_count)/wo_gaps_b;
+      if (wo_gaps_b==0) {
+        seq_id=0;
+      } else {
+        seq_id=100*static_cast<Real>(identical_count)/wo_gaps_b;        
+      }
     }
   } else {
-    // divide by alignment length:
-    seq_id=100*static_cast<Real>(identical_count)/non_gap_count;
+    if (non_gap_count==0) {
+      seq_id=0;
+    } else {
+      // divide by alignment length:
+      seq_id=100*static_cast<Real>(identical_count)/non_gap_count;      
+    }
   }
   return seq_id;
 }
diff --git a/modules/seq/alg/tests/CMakeLists.txt b/modules/seq/alg/tests/CMakeLists.txt
index 512de0856..37969a869 100644
--- a/modules/seq/alg/tests/CMakeLists.txt
+++ b/modules/seq/alg/tests/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(OST_SEQ_ALG_UNIT_TESTS
   test_merge_pairwise_alignments.cc
+  test_sequence_identity.cc
   tests.cc
 )
 
diff --git a/modules/seq/alg/tests/test_sequence_identity.cc b/modules/seq/alg/tests/test_sequence_identity.cc
new file mode 100644
index 000000000..fa2d58eaf
--- /dev/null
+++ b/modules/seq/alg/tests/test_sequence_identity.cc
@@ -0,0 +1,132 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2010 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+/*
+ *  Authors: Marco Biasini
+ */
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include <ost/seq/alg/sequence_identity.hh>
+
+
+
+using namespace ost;
+using namespace ost::seq;
+
+BOOST_AUTO_TEST_SUITE(ost_seq_alg)
+
+BOOST_AUTO_TEST_CASE(seqid_one) 
+{
+  SequenceHandle s1=CreateSequence("S1", "abcdefgh");
+  SequenceHandle s2=CreateSequence("S2", "abcd----");
+  
+  AlignmentHandle aln=CreateAlignment();
+  aln.AddSequence(s1);
+  aln.AddSequence(s2);
+  
+  Real seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE, 0, 1);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);
+
+  // check the default sequence indices
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);
+
+  // check the default value for reference mode
+  seqid=alg::SequenceIdentity(aln);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);  
+  
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::ALIGNMENT);
+  BOOST_CHECK_CLOSE(seqid, 100.0, 1e-6);  
+}
+
+BOOST_AUTO_TEST_CASE(seqid_two) 
+{
+  SequenceHandle s1=CreateSequence("S1", "abcd");
+  SequenceHandle s2=CreateSequence("S2", "xbcx");
+  
+  AlignmentHandle aln=CreateAlignment();
+  aln.AddSequence(s1);
+  aln.AddSequence(s2);
+  
+  Real seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE, 0, 1);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);
+
+  // check the default sequence indices
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);
+
+  // check the default value for reference mode
+  seqid=alg::SequenceIdentity(aln);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);  
+  
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::ALIGNMENT);
+  BOOST_CHECK_CLOSE(seqid, 50.0, 1e-6);  
+}
+
+BOOST_AUTO_TEST_CASE(seqid_three) 
+{
+  SequenceHandle s1=CreateSequence("S1", "a--d");
+  SequenceHandle s2=CreateSequence("S2", "x--x");
+  
+  AlignmentHandle aln=CreateAlignment();
+  aln.AddSequence(s1);
+  aln.AddSequence(s2);
+  
+  Real seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE, 0, 1);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);
+
+  // check the default sequence indices
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);
+
+  // check the default value for reference mode
+  seqid=alg::SequenceIdentity(aln);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);  
+  
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::ALIGNMENT);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);  
+}
+
+BOOST_AUTO_TEST_CASE(seqid_empty) 
+{
+  SequenceHandle s1=CreateSequence("S1", "---");
+  SequenceHandle s2=CreateSequence("S2", "---");
+  
+  AlignmentHandle aln=CreateAlignment();
+  aln.AddSequence(s1);
+  aln.AddSequence(s2);
+  
+  Real seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE, 0, 1);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);
+
+  // check the default sequence indices
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::LONGER_SEQUENCE);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);
+
+  // check the default value for reference mode
+  seqid=alg::SequenceIdentity(aln);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);  
+  
+  seqid=alg::SequenceIdentity(aln, alg::RefMode::ALIGNMENT);
+  BOOST_CHECK_CLOSE(seqid, 0.0, 1e-6);  
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab