From 24927ca428e844b92b8b570d2a72acaa04ab10c5 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Fri, 27 Sep 2013 11:34:27 +0200
Subject: [PATCH] Added function 'Normalise()' for sequences

---
 modules/seq/base/doc/seq.rst               | 11 +++++++----
 modules/seq/base/pymod/export_sequence.cc  |  1 +
 modules/seq/base/src/impl/sequence_impl.cc | 21 +++++++++++++++++++++
 modules/seq/base/src/impl/sequence_impl.hh |  3 +++
 modules/seq/base/src/sequence_handle.cc    |  5 +++++
 modules/seq/base/src/sequence_handle.hh    |  8 ++++++--
 modules/seq/base/tests/test_seq.py         | 10 ++++++++++
 modules/seq/base/tests/test_sequence.cc    | 20 ++++++++++++++++++++
 8 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/modules/seq/base/doc/seq.rst b/modules/seq/base/doc/seq.rst
index 042f1d09c..e5a89a5f1 100644
--- a/modules/seq/base/doc/seq.rst
+++ b/modules/seq/base/doc/seq.rst
@@ -30,7 +30,7 @@ Sequence Offset
 
 When using sequences and structures together, often the start of the structure 
 and the beginning of the sequence do not fall together. In the following case, 
-the alignment of sequences B and C only covers a subpart of structure A::
+the alignment of sequences B and C only covers a subsequence of structure A::
 
   A acefghiklmnpqrstuvwy
   B     ghiklm
@@ -167,6 +167,10 @@ The SequenceHandle
     Returns a string version of this sequence with all hyphens removed. Also
     available as the property :attr:`gapless_string`.
      
+  .. method:: Normalise()
+     
+    Remove the gap characters '-' and '.' from the sequence and convert the
+    remaining characters to upper case. The sequence is modified in place.
    
   .. method:: SetName()
   
@@ -205,7 +209,7 @@ The SequenceHandle
 
   Check whether the two sequences s1 and s2 match. This function performs are
   case-insensitive comparison of the two sequences. The character  'X' is
-  interpreted as a wildcard character that always matches the other sequence.
+  interpreted as a wild card character that always matches the other sequence.
 
 The SequenceList    
 --------------------------------------------------------------------------------
@@ -220,7 +224,7 @@ The AlignmentHandle
 --------------------------------------------------------------------------------
 
 The :class:`AlignmentHandle` represents a list of aligned sequences. In
-constrast to :class:`SequenceList`, an alignment requires all sequences to be of 
+contrast to :class:`SequenceList`, an alignment requires all sequences to be of 
 the same length. New instances of alignments are created with 
 :func:`CreateAlignment` and :func:`AlignmentFromSequenceList`.
 
@@ -431,4 +435,3 @@ an alignment:
   .. method:: RemoveSequence(index)
 
     Remove sequence at *index* from the alignment.
-
diff --git a/modules/seq/base/pymod/export_sequence.cc b/modules/seq/base/pymod/export_sequence.cc
index 6cdf081cd..1919290e7 100644
--- a/modules/seq/base/pymod/export_sequence.cc
+++ b/modules/seq/base/pymod/export_sequence.cc
@@ -300,6 +300,7 @@ void export_sequence()
     .def("AttachView", attach_two)
     .def("Append", &SequenceHandle::Append)
     .def("SetString", &SequenceHandle::SetString)
+    .def("Normalise", &SequenceHandle::Normalise)
     .add_property("string",
                   make_function(&SequenceHandle::GetString,
                                 return_value_policy<copy_const_reference>()),
diff --git a/modules/seq/base/src/impl/sequence_impl.cc b/modules/seq/base/src/impl/sequence_impl.cc
index 733eaf1b8..9864fe8e5 100644
--- a/modules/seq/base/src/impl/sequence_impl.cc
+++ b/modules/seq/base/src/impl/sequence_impl.cc
@@ -331,6 +331,27 @@ void SequenceImpl::Replace(const String& str,int start, int end)
   this->ShiftsFromSequence();
 }
 
+/// Removes the gap characters '.' and '-' (plus any embedded NUL) from the
+/// sequence string, upper-cases what is left, then calls ShiftsFromSequence().
+void SequenceImpl::Normalise() {
+  String normalised;
+  // Reserve once and append in place so that building the result stays
+  // linear in the sequence length.
+  normalised.reserve(seq_string_.length());
+  for (size_t i=0; i<seq_string_.length(); ++i) {
+    // toupper() is undefined for negative arguments other than EOF, so the
+    // character is handled as unsigned char throughout.
+    const unsigned char c = static_cast<unsigned char>(seq_string_[i]);
+    // '\0' is treated as a gap as well, i.e. embedded NULs are dropped.
+    const bool is_gap = (c=='.' || c=='-' || c=='\0');
+    if (!is_gap) {
+      normalised.push_back(static_cast<char>(toupper(c)));
+    }
+  }
+  seq_string_.swap(normalised);
+  this->ShiftsFromSequence();
+}
+
 void SequenceImpl::ShiftRegion(int start, int end, int amount)
 {
   if(start > end || start + amount < 0 || end + amount > this->GetLength()){
diff --git a/modules/seq/base/src/impl/sequence_impl.hh b/modules/seq/base/src/impl/sequence_impl.hh
index fecad2ca9..ccb64b775 100644
--- a/modules/seq/base/src/impl/sequence_impl.hh
+++ b/modules/seq/base/src/impl/sequence_impl.hh
@@ -81,6 +81,9 @@ public:
   /// \brief Set sequence String
   void SetString(const String& seq);
 
+  /// \brief Remove '.' and '-' gaps and make sequence all upper case, in place
+  void Normalise();
+
   /// \brief replace substring starting from start to end
   void Replace(const String& str,int start, int end);
 
diff --git a/modules/seq/base/src/sequence_handle.cc b/modules/seq/base/src/sequence_handle.cc
index 627ebdc36..f57a624e7 100644
--- a/modules/seq/base/src/sequence_handle.cc
+++ b/modules/seq/base/src/sequence_handle.cc
@@ -295,6 +295,11 @@ String SequenceHandle::GetGaplessString() const
   return Impl()->GetGaplessString();
 }
 
+void SequenceHandle::Normalise() {  // in place; forwards to the impl object
+  CheckValidity();
+  this->Impl()->Normalise();
+}
+
 int SequenceHandle::GetOffset() const
 {
   this->CheckValidity();
diff --git a/modules/seq/base/src/sequence_handle.hh b/modules/seq/base/src/sequence_handle.hh
index d7247cbd1..cf9aa732d 100644
--- a/modules/seq/base/src/sequence_handle.hh
+++ b/modules/seq/base/src/sequence_handle.hh
@@ -123,7 +123,7 @@ public:
   bool HasAttachedView() const;
   
   const String& GetRole() const;
-  
+
   bool operator==(const ConstSequenceHandle& rhs) const;
   bool operator!=(const ConstSequenceHandle& rhs) const;  
   
@@ -231,7 +231,11 @@ public:
   /// \brief get attached view. may be an invalid entity view
   /// \sa SequenceHandle::AttachView(const mol::EntityView&, const String&)
   mol::EntityView GetAttachedView() const;
-  
+
+  /// \brief remove '.', '-' as gaps and make sequence all-uppercase
+  /// Changes happen in place.
+  void Normalise();
+
   /// \brief create copy sequence
   /// The newly created sequence has the same attached view.
   SequenceHandle Copy() const;
diff --git a/modules/seq/base/tests/test_seq.py b/modules/seq/base/tests/test_seq.py
index bfd80d7bf..f202fe522 100644
--- a/modules/seq/base/tests/test_seq.py
+++ b/modules/seq/base/tests/test_seq.py
@@ -175,6 +175,16 @@ class TestSeq(unittest.TestCase):
     self.assertEqual(string_a, 'BDFH')
     self.assertEqual(string_b, 'BDFH')
 
+  def testNormalise(self):
+    # Normalise() has to strip '-' and '.' and upper-case the rest, in place.
+    # Each case: (raw input, expected string after normalisation).
+    cases = (("B-D-FGH", "BDFGH"), ("b.d-fgh", "BDFGH"))
+    for raw, expected in cases:
+      seq_a = seq.CreateSequence("A", raw)
+      self.assertEqual(raw, seq_a.GetString())
+      seq_a.Normalise()
+      self.assertEqual(expected, seq_a.GetString())
+
 if __name__== '__main__':
   from ost import testutils
   testutils.RunTests()
diff --git a/modules/seq/base/tests/test_sequence.cc b/modules/seq/base/tests/test_sequence.cc
index 462747784..309939db0 100644
--- a/modules/seq/base/tests/test_sequence.cc
+++ b/modules/seq/base/tests/test_sequence.cc
@@ -106,6 +106,26 @@ BOOST_AUTO_TEST_CASE(seq_string)
   BOOST_CHECK_EQUAL(s.GetGaplessString(),"");
 }
 
+BOOST_AUTO_TEST_CASE(seq_normalise)
+{
+  // Each row holds the raw string handed to CreateSequence() and the string
+  // GetString() has to return once Normalise() has been called.
+  const char* const cases[][2] = {
+    {"abfcdadeaf", "ABFCDADEAF"},  // lower case only, no gaps
+    {".afc..de.f", "AFCDEF"},      // '.' gaps
+    {"-afc--de-f", "AFCDEF"},      // '-' gaps
+    {".afc-.de-f", "AFCDEF"}       // both gap characters mixed
+  };
+  const int n_cases = sizeof(cases) / sizeof(cases[0]);
+  for (int i = 0; i < n_cases; ++i) {
+    SequenceHandle s = CreateSequence("S1", cases[i][0]);
+    // the freshly created sequence still carries the raw string
+    BOOST_CHECK_EQUAL(s.GetString(), cases[i][0]);
+    s.Normalise();
+    BOOST_CHECK_EQUAL(s.GetString(), cases[i][1]);
+  }
+}
+
 BOOST_AUTO_TEST_CASE(seq_onelettercode)
 {
   SequenceHandle s=CreateSequence("S1", "abfcdadeaf");
-- 
GitLab