From 4ac959df53fa2050b46127c753c36a129e619e10 Mon Sep 17 00:00:00 2001
From: marco <marco@5a81b35b-ba03-0410-adc8-b2c5c5119f08>
Date: Thu, 19 Aug 2010 11:00:47 +0000
Subject: [PATCH] added import of PIR files

This is BZDNG-153

git-svn-id: https://dng.biozentrum.unibas.ch/svn/openstructure/trunk@2656 5a81b35b-ba03-0410-adc8-b2c5c5119f08
---
 modules/io/src/seq/pir_io_handler.cc         | 89 +++++++++++++-------
 modules/io/tests/CMakeLists.txt              |  1 +
 modules/io/tests/test_pir.cc                 | 73 ++++++++++++++++
 modules/io/tests/testfiles/pir/no-star.pir   | 21 +++++
 modules/io/tests/testfiles/pir/seq-types.pir | 21 +++++
 modules/io/tests/testfiles/pir/simple.pir    |  4 +
 modules/seq/base/src/invalid_sequence.hh     |  1 +
 7 files changed, 180 insertions(+), 30 deletions(-)
 create mode 100644 modules/io/tests/test_pir.cc
 create mode 100644 modules/io/tests/testfiles/pir/no-star.pir
 create mode 100644 modules/io/tests/testfiles/pir/seq-types.pir
 create mode 100644 modules/io/tests/testfiles/pir/simple.pir

diff --git a/modules/io/src/seq/pir_io_handler.cc b/modules/io/src/seq/pir_io_handler.cc
index a5d07c9ee..2974dcd51 100644
--- a/modules/io/src/seq/pir_io_handler.cc
+++ b/modules/io/src/seq/pir_io_handler.cc
@@ -72,50 +72,102 @@ bool PirIOHandler::ProvidesExport(const boost::filesystem::path& loc,
   return PirIOHandler::ProvidesImport(loc, format);
 }
 
-void PirIOHandler::Import(seq::SequenceList& aln,
-                            std::istream& instream)
+// Reads every PIR record from instream and appends one sequence per record
+// to aln.
+//
+// A record is a header line of the form ">TT;name" (TT being one of the
+// two-letter sequence types checked below), followed by one description
+// line that is read and discarded, followed by the sequence, which may span
+// several lines and ends at the first '*'. Whitespace inside the sequence
+// is dropped.
+//
+// Throws IOException when a header is malformed, the sequence type is
+// unknown, the input ends before the closing '*', a sequence is empty, or
+// no sequence was read at all. seq::InvalidSequence raised by
+// seq::CreateSequence() is passed on to the caller.
+void PirIOHandler::Import(seq::SequenceList& aln, std::istream& instream)
 {
-  const char* error_msg="Bad Pir file: Expected '>', but '%1%' found.";  
-  
+  const char* error_msg="Expected %1%, but '%2%' found.";
+  const char* premature_end="Premature end in PIR file on line %1%";
+  const char* empty_seq="PIR file contains empty sequence on line %1%";
   String line;
-  std::getline(instream, line);    
-  int seq_count=0;
-  //not yet supported
-  throw IOException("Import of PIR format is not yet supported!");
+  int line_num=1, seq_count=0;
+  // we can't use a normal while(std::getline(...)) here, because the PIR format
+  // requires us to perform a one-line look-ahead.
+  std::getline(instream, line);
   while (!instream.eof()) {
-    // parse header information. cut after first "|"
+    // Skip blank lines. The next line has to be fetched before looping again,
+    // otherwise the same blank line would be examined forever.
     if (line.find_first_not_of("\n\t ")==String::npos) {
-      std::getline(instream, line);
+      if (!std::getline(instream, line)) {
+        break;
+      }
+      line_num+=1;
       continue;
     }
-    if (line.length()==0 || line[0]!='>') {
-      String error=str(format(error_msg) % line);
+    if (line[0]!='>') {
+      String error=str(format(error_msg) % "'>'" % line);
+      throw IOException(error);
+    }
+    // the header must be at least ">TT;" with ';' separating type and name
+    if (line.length()<4 || line[3]!=';') {
+      String error=str(format(error_msg) % "seq-type;identifier" % line);
       throw IOException(error);
     }
-    String identifier=line.substr(1);
-    String sequence_str;
-    while (std::getline(instream, line) && line.size()>0 && line[0]!='>') {
-      if (line.find_first_not_of("\n\t ")==String::npos) {
-        continue;
+    String seq_type=line.substr(1, 2);
+    if (!(seq_type=="P1" || seq_type=="F1" || seq_type=="DL" ||
+          seq_type=="DC" || seq_type=="RL" || seq_type=="RC" ||
+          seq_type=="XX")) {
+      throw IOException(str(format("Bad PIR file: Unknown sequence type "
+                                   "'%1%' on line %2%")
+                            % seq_type % line_num));
+    }
+    String name=line.substr(4);
+    int subject_line_num=line_num;
+    // the line after the header is a free-text description; it is not kept
+    if (!std::getline(instream, line)) {
+      throw IOException(str(format(premature_end) % line_num));
+    }
+    line_num+=1;
+    std::stringstream seq_string;
+    bool end_seq=false;
+    // collect sequence characters up to, but not including, the closing '*'
+    while (!end_seq && std::getline(instream, line)) {
+      line_num+=1;
+      for (String::iterator i=line.begin(), e=line.end(); i!=e; ++i) {
+        // cast first: passing a negative char to isspace() is undefined
+        if (isspace(static_cast<unsigned char>(*i))) {
+          continue;
+        }
+        if (*i=='*') {
+          end_seq=true;
+          break;
+        }
+        seq_string << *i;
       }
-      sequence_str+=line;
     }
-    if (sequence_str.length()>0) {
-      try {
-        seq::SequenceHandle seq=seq::CreateSequence(identifier, sequence_str);
-        aln.AddSequence(seq);          
-        seq_count+=1;
-      } catch (seq::InvalidSequence& e) {
-        throw e;
+    // look-ahead: advance to the header of the next record, if there is one
+    while (std::getline(instream, line)) {
+      line_num+=1;
+      if (!line.empty() && line[0]=='>') {
+        break;
       }
-    } else {
-      throw IOException("Bad Pir file: Sequence is empty.");
     }
-  }    
+    if (!end_seq) {
+      throw IOException(str(format(premature_end) % line_num));
+    }
+    if (seq_string.str().empty()) {
+      throw IOException(str(format(empty_seq) % subject_line_num));
+    }
+    // no try/catch here: a seq::InvalidSequence thrown by CreateSequence()
+    // reaches the caller as is, without the copy made by "throw e;"
+    seq::SequenceHandle ss=seq::CreateSequence(name, seq_string.str());
+    aln.AddSequence(ss);
+    seq_count+=1;
+  }
   if (seq_count==0) {
     throw IOException("Bad Pir file: File is empty");
-  }                        
-
+  }
 }
 
 void PirIOHandler::Export(const seq::ConstSequenceList& seqs,
diff --git a/modules/io/tests/CMakeLists.txt b/modules/io/tests/CMakeLists.txt
index f0c230801..1cb26f5fd 100644
--- a/modules/io/tests/CMakeLists.txt
+++ b/modules/io/tests/CMakeLists.txt
@@ -3,6 +3,7 @@ set(OST_IO_UNIT_TESTS
   test_io_pdb.cc
   test_io_crd.cc
   test_io_sdf.cc
+  test_pir.cc
   test_iomanager.cc
   tests.cc
 )
diff --git a/modules/io/tests/test_pir.cc b/modules/io/tests/test_pir.cc
new file mode 100644
index 000000000..1829404cc
--- /dev/null
+++ b/modules/io/tests/test_pir.cc
@@ -0,0 +1,73 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2010 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+
+/*
+  Author: Marco Biasini
+ */
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include <ost/seq/invalid_sequence.hh>
+#include <ost/io/seq/pir_io_handler.hh>
+#include <ost/io/seq/load.hh>
+#include <ost/io/io_exception.hh>
+
+
+using namespace ost;
+using namespace ost::io;
+
+
+BOOST_AUTO_TEST_SUITE( io )
+
+// ProvidesImport() has to accept both the explicit "pir" format and a .pir path
+BOOST_AUTO_TEST_CASE(pir_filetypes) 
+{
+  BOOST_CHECK(PirIOHandler::ProvidesImport("","pir"));
+  BOOST_CHECK(PirIOHandler::ProvidesImport("alignment.pir"));
+}
+// simple.pir holds one record whose sequence spans two lines with blanks in it
+BOOST_AUTO_TEST_CASE(pir_simple) 
+{
+  seq::SequenceHandle s=LoadSequence("testfiles/pir/simple.pir");
+  BOOST_CHECK_EQUAL(s.GetName(), "name");
+  BOOST_CHECK_EQUAL(s.GetString(), "ABCDEFGHIJKLMNOP");
+}
+// Smoke test: seq-types.pir has one record per accepted type; just load it
+BOOST_AUTO_TEST_CASE(pir_seq_types) 
+{
+  seq::SequenceList s=LoadSequenceList("testfiles/pir/seq-types.pir");
+}
+// Inputs that stop before a complete record must raise IOException
+BOOST_AUTO_TEST_CASE(pir_errors)
+{
+  BOOST_CHECK_THROW(SequenceFromString(">", "pir"), IOException);
+  BOOST_CHECK_THROW(SequenceFromString(">P1", "pir"), IOException);
+  BOOST_CHECK_THROW(SequenceFromString(">P1;", "pir"), IOException);
+  BOOST_CHECK_THROW(SequenceFromString(">P1;name\n", "pir"), IOException);
+  BOOST_CHECK_THROW(SequenceFromString(">P1;name\ndescription\n", "pir"), 
+                    IOException);  
+}
+// no-star.pir has records without the closing '*'; loading it must fail
+BOOST_AUTO_TEST_CASE(pir_no_star) 
+{
+  BOOST_CHECK_THROW(LoadSequence("testfiles/pir/no-star.pir"),  
+                    seq::InvalidSequence); // NOTE(review): presumably raised by CreateSequence(), not the parser - confirm
+}
+
+BOOST_AUTO_TEST_SUITE_END() // io; file now ends with a newline
diff --git a/modules/io/tests/testfiles/pir/no-star.pir b/modules/io/tests/testfiles/pir/no-star.pir
new file mode 100644
index 000000000..98bc371aa
--- /dev/null
+++ b/modules/io/tests/testfiles/pir/no-star.pir
@@ -0,0 +1,21 @@
+>P1;name
+description
+ABCD EFGH*
+>F1;name
+description
+ABCD EFGH
+>DL;name*
+description
+ABCD EFGH
+>DC;name*
+description
+ABCD EFGH
+>RL;name*
+description
+ABCD EFGH
+>RC;name*
+description
+ABCD EFGH
+>XX;name*
+description
+ABCD EFGH*
\ No newline at end of file
diff --git a/modules/io/tests/testfiles/pir/seq-types.pir b/modules/io/tests/testfiles/pir/seq-types.pir
new file mode 100644
index 000000000..ed8569f43
--- /dev/null
+++ b/modules/io/tests/testfiles/pir/seq-types.pir
@@ -0,0 +1,21 @@
+>P1;name
+description
+ABCD EFGH*
+>F1;name
+description
+ABCD EFGH*
+>DL;name
+description
+ABCD EFGH*
+>DC;name
+description
+ABCD EFGH*
+>RL;name
+description
+ABCD EFGH*
+>RC;name*
+description
+ABCD EFGH*
+>XX;name
+description
+ABCD EFGH*
\ No newline at end of file
diff --git a/modules/io/tests/testfiles/pir/simple.pir b/modules/io/tests/testfiles/pir/simple.pir
new file mode 100644
index 000000000..6e8f03e4e
--- /dev/null
+++ b/modules/io/tests/testfiles/pir/simple.pir
@@ -0,0 +1,4 @@
+>P1;name
+description
+ABCD EFGH
+IJKL MNOP*
\ No newline at end of file
diff --git a/modules/seq/base/src/invalid_sequence.hh b/modules/seq/base/src/invalid_sequence.hh
index b112df8f3..453916403 100644
--- a/modules/seq/base/src/invalid_sequence.hh
+++ b/modules/seq/base/src/invalid_sequence.hh
@@ -23,6 +23,7 @@
   Author: Marco Biasini
  */
 #include <ost/seq/module_config.hh>
+#include <ost/message.hh>
 namespace ost { namespace seq {
 
 class DLLEXPORT InvalidSequence : public Error {
-- 
GitLab