From a79a6debc7e5b10cbb6887120662eb48a2b7e9d3 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Thu, 3 Sep 2015 13:18:09 +0200
Subject: [PATCH] Testing ParseHHBlitsOutput

---
 modules/bindings/pymod/hhblits.py      | 268 ++++++++++++-------------
 modules/bindings/tests/test_hhblits.py |  82 ++++++++
 2 files changed, 210 insertions(+), 140 deletions(-)

diff --git a/modules/bindings/pymod/hhblits.py b/modules/bindings/pymod/hhblits.py
index 98c7d0e4b..0eac983c9 100644
--- a/modules/bindings/pymod/hhblits.py
+++ b/modules/bindings/pymod/hhblits.py
@@ -32,34 +32,34 @@ class HHblitsHit:
     :type: :class:`ost.seq.AlignmentHandle`
 
   .. attribute:: evalue
-  
+
     The E-value of the alignment
-  
+
   .. attribute:: prob
-  
+
     The probability of the alignment (between 0 and 100)
-  
+
   .. attribute:: score
   
     The alignment score
   """
   def __init__(self, hit_id, aln, score, ss_score,evalue, pvalue, prob):
-    self.hit_id=hit_id
-    self.aln=aln
-    self.score=score
-    self.ss_score=ss_score
-    self.evalue=evalue
-    self.prob=prob
-    self.pvalue=pvalue
+      self.hit_id = hit_id
+      self.aln = aln
+      self.score = score
+      self.ss_score = ss_score
+      self.evalue = evalue
+      self.prob = prob
+      self.pvalue = pvalue
 
 class HHblitsHeader:
-  def __init__(self):
-    self.query=''
-    self.match_columns=0
-    self.n_eff=0
-    self.searched_hmms=0
-    self.date=None
-    self.command=''
+    def __init__(self):
+        self.query = ''
+        self.match_columns = 0
+        self.n_eff = 0
+        self.searched_hmms = 0
+        self.date = None
+        self.command = ''
 
 def ParseHeaderLine(line):
   # First, we seek the start of the identifier, that is, the first whitespace
@@ -72,7 +72,6 @@ def ParseHeaderLine(line):
     if line[i] == ' ':
       break
   assert len(line)-i >= 31 and line[i+1] != ' '
-  #hit_id=line[split_pos+1:35].strip()
   hit_id = line[i+1:i+31].strip()
   fields = line[i+32:].split()
   prob = float(fields[0])
@@ -80,132 +79,121 @@ def ParseHeaderLine(line):
   pvalue = float(fields[2])
   score = float(fields[3])
   ss_score = float(fields[4])
-  # cols = int(fields[5])
   offsets = (int(fields[6].split('-')[0]), int(fields[7].split('-')[0]))
-  #split_pos=line.find(' ', 3)
-  #assert split_pos!=-1 and split_pos<34
-  #prob=float(line[35:40].strip())
-  #evalue=float(line[41:49].strip())
-  #pvalue=float(line[48:57].strip())
-  #score=float(line[56:64].strip())
-  #ss_score=float(line[63:69].strip())
-  #offsets=(int(line[76:79].strip()), int(line[86:89].strip()))
   return (HHblitsHit(hit_id, None, score, ss_score, evalue, pvalue, prob), offsets)
 
 def ParseHHblitsOutput(output):
-  """
-  Parses the HHblits output and returns a tuple of HHblitsHeader and
-  a list of HHblitsHit instances.
-  """
-  lines=iter(output)
-  def _ParseHeaderSection(lines):
-    value_start_column=14
-    date_pattern='%a %b %d %H:%M:%S %Y'
-    header=HHblitsHeader()
-    line=lines.next()
-    assert line.startswith('Query')
-    header.query=line[value_start_column:].strip()
-    line=lines.next()
-    assert line.startswith('Match_columns')
-    header.match_columns=int(line[value_start_column:].strip())
-
-    line=lines.next()
-    assert line.startswith('No_of_seqs')
-    
-    line=lines.next()
-    assert line.startswith('Neff')
-    header.n_eff=float(line[value_start_column:].strip())
-    
-    line=lines.next()
-    assert line.startswith('Searched_HMMs')
-    header.searched_hmms=int(line[value_start_column:].strip())
-    
-    line=lines.next()
-    assert line.startswith('Date')
-    value=line[value_start_column:].strip()
-    header.date=datetime.datetime.strptime(value, date_pattern)
-    
-    line=lines.next()
-    assert line.startswith('Command')
-    header.command=line[value_start_column:].strip()
-    
-    line=lines.next()
-    assert len(line.strip())==0
-    return header
-
-  def _ParseTableOfContents(lines):
-    assert lines.next().startswith(' No Hit')
-    hits=[]
-    while True:
-      line=lines.next()
-      if len(line.strip())==0:
-        return hits
-      hits.append(ParseHeaderLine(line))
-    return hits
-  def _ParseResultBody(query_id, hits, lines):
-    entry_index=None
-    query_str=''
-    templ_str=''
-    def _MakeAln(query_id, hit_id, query_string, templ_string, 
-                 q_offset, t_offset):
-        s1=seq.CreateSequence(query_id, query_string)
-        s1.offset=q_offset-1
-        s2=seq.CreateSequence(hit_id, templ_string)
-        s2.offset=t_offset-1
-        return seq.CreateAlignment(s1, s2)
-    try:
-      while True:
+    """
+    Parses the HHblits output and returns a tuple of HHblitsHeader and
+    a list of HHblitsHit instances.
+    """
+    lines=iter(output)
+    def _ParseHeaderSection(lines):
+        value_start_column = 14
+        date_pattern = '%a %b %d %H:%M:%S %Y'
+        header = HHblitsHeader()
+        line = lines.next()
+        assert line.startswith('Query')
+        header.query=line[value_start_column:].strip()
+        line=lines.next()
+        assert line.startswith('Match_columns')
+        header.match_columns=int(line[value_start_column:].strip())
+      
+        line=lines.next()
+        assert line.startswith('No_of_seqs')
+        
+        line=lines.next()
+        assert line.startswith('Neff')
+        header.n_eff=float(line[value_start_column:].strip())
+        
         line=lines.next()
-        if len(line.strip())==0:
-          continue
-        if line.startswith('Done!'):
-          if len(query_str)>0:
-            hits[entry_index][0].aln=_MakeAln(query_id, hits[entry_index][0].hit_id,
-                                              query_str, templ_str,
-                                              *hits[entry_index][1]) 
-          return [h for h,o in hits]
-        if line.startswith('No '):
-          if len(query_str)>0:
-            hits[entry_index][0].aln=_MakeAln(query_id, 
-                                              hits[entry_index][0].hit_id,
-                                              query_str, templ_str,
-                                              *hits[entry_index][1])
-          entry_index=int(line[3:].strip())-1
-          hits[entry_index][0].hit_id=lines.next()[1:].strip()
-          query_str=''
-          templ_str=''
-          # skip the next line. It doesn't contain information we don't 
-          # already know
-          lines.next()
-          continue
-        assert entry_index!=None
-        if line[1:].startswith(' Consensus'): continue
-        if line[1:].startswith(' ss_pred'): continue
-        if line[1:].startswith(' ss_conf'): continue
-        if line.startswith('T '):
-
-          end_pos=line.find(' ', 22)
-          assert end_pos!=-1
-          templ_str+=line[22:end_pos]
-        if line.startswith('Q '):
-          end_pos=line.find(' ', 22)
-          assert end_pos!=-1
-          query_str+=line[22:end_pos]
-
-        #hit.aln=seq.CreateAlignment(query_seq, templ_seq)
-    except StopIteration:
-      if len(query_str)>0:
-        hits[entry_index][0].aln=_MakeAln(query_id, hits[entry_index][0].hit_id,
-                                          query_str, templ_str,
-                                          *hits[entry_index][1]) 
-      return [h for h,o in hits]  
-  header=_ParseHeaderSection(lines)
-  # parse the table of contents. This is neccessary as some of the properties 
-  # (i.e. start of alignment) we need are only given there. From the TOC we 
-  # create a list of hits that is then further filled with data when we parse 
-  # the actual result body
-  hits=_ParseTableOfContents(lines)
-  return header, _ParseResultBody(header.query, hits, lines)
+        assert line.startswith('Searched_HMMs')
+        header.searched_hmms=int(line[value_start_column:].strip())
+        
+        line=lines.next()
+        assert line.startswith('Date')
+        value=line[value_start_column:].strip()
+        header.date=datetime.datetime.strptime(value, date_pattern)
+        
+        line=lines.next()
+        assert line.startswith('Command')
+        header.command=line[value_start_column:].strip()
+        
+        line=lines.next()
+        assert len(line.strip())==0
+        return header
+
+    def _ParseTableOfContents(lines):
+        assert lines.next().startswith(' No Hit')
+        hits=[]
+        while True:
+            line=lines.next()
+            if len(line.strip())==0:
+                return hits
+            hits.append(ParseHeaderLine(line))
+        return hits
+
+    def _ParseResultBody(query_id, hits, lines):
+        entry_index = None
+        query_str = ''
+        templ_str = ''
+        def _MakeAln(query_id, hit_id, query_string, templ_string, 
+                     q_offset, t_offset):
+            s1=seq.CreateSequence(query_id, query_string)
+            s1.offset=q_offset-1
+            s2=seq.CreateSequence(hit_id, templ_string)
+            s2.offset=t_offset-1
+            return seq.CreateAlignment(s1, s2)
+        try:
+            while True:
+                line=lines.next()
+                if len(line.strip()) == 0:
+                    continue
+                if line.startswith('Done!'):
+                    if len(query_str)>0:
+                        hits[entry_index][0].aln = _MakeAln(\
+                            query_id, hits[entry_index][0].hit_id,
+                            query_str, templ_str, *hits[entry_index][1])
+                    return [h for h,o in hits]
+                if line.startswith('No '):
+                    if len(query_str)>0:
+                        hits[entry_index][0].aln=_MakeAln(\
+                            query_id, hits[entry_index][0].hit_id,
+                            query_str, templ_str, *hits[entry_index][1])
+                    entry_index=int(line[3:].strip())-1
+                    hits[entry_index][0].hit_id=lines.next()[1:].strip()
+                    query_str=''
+                    templ_str=''
+                    # skip the next line. It doesn't contain information we
+                    # don't already know
+                    lines.next()
+                    continue
+                assert entry_index!=None
+                if line[1:].startswith(' Consensus'): continue
+                if line[1:].startswith(' ss_pred'): continue
+                if line[1:].startswith(' ss_conf'): continue
+                if line.startswith('T '):
+                    end_pos=line.find(' ', 22)
+                    assert end_pos!=-1
+                    templ_str+=line[22:end_pos]
+                if line.startswith('Q '):
+                    end_pos=line.find(' ', 22)
+                    assert end_pos!=-1
+                    query_str+=line[22:end_pos]
+        except StopIteration:
+            if len(query_str)>0:
+                hits[entry_index][0].aln=_MakeAln(query_id,
+                                                  hits[entry_index][0].hit_id,
+                                                  query_str, templ_str,
+                                                  *hits[entry_index][1]) 
+            return [h for h,o in hits]
+    header=_ParseHeaderSection(lines)
+    # parse the table of contents. This is neccessary as some of the properties 
+    # (i.e. start of alignment) we need are only given there. From the TOC we 
+    # create a list of hits that is then further filled with data when we parse 
+    # the actual result body
+    hits=_ParseTableOfContents(lines)
+    return header, _ParseResultBody(header.query, hits, lines)
 
 
 def ParseHHM(profile):
diff --git a/modules/bindings/tests/test_hhblits.py b/modules/bindings/tests/test_hhblits.py
index 92276d44b..61c4ac0de 100644
--- a/modules/bindings/tests/test_hhblits.py
+++ b/modules/bindings/tests/test_hhblits.py
@@ -7,6 +7,7 @@ import sys
 import tempfile
 import shutil
 import filecmp
+import datetime
 
 import ost
 from ost import seq
@@ -290,6 +291,87 @@ class TestHHblitsBindings(unittest.TestCase):
                          'Profile file "testfiles/testali.a3m" is missing '+
                          'the "Consensus" section')
 
+    def testParseHHblitsOutput(self):
+        header, hits = hhblits.ParseHHblitsOutput(open("testfiles/test.hhr"))
+        self.assertEqual(header.query, 'Test')
+        self.assertEqual(header.match_columns, 141)
+        self.assertEqual(header.n_eff, 9.4)
+        self.assertEqual(header.searched_hmms, 5)
+        self.assertEqual(header.date,
+                         datetime.datetime.strptime('Mon Aug 31 16:45:45 2015',
+                                                    '%a %b %d %H:%M:%S %Y'))
+        self.assertEqual(header.command, '/import/bc2/apps/HH-suite/2.0.16-go'+
+                         'olf-1.4.10/bin/hhblits -cpu 1 -n 1 -e 0.001 -Z 1000'+
+                         '0 -B 10000 -i /import/bc2/home/schwede/bienert/git/'+
+                         'ost_newenv.git/modules/bindings/tests/testfiles/tes'+
+                         'tali.a3m -o /scratch/tmp9E5dor/testali_cpu1_n1.hhr '+
+                         '-d /import/bc2/home/schwede/bienert/git/ost_newenv.'+
+                         'git/modules/bindings/tests/testfiles/hhblitsdb/hhbl'+
+                         'itsdb')
+        self.assertEqual(len(hits), 4)
+        # hit 1
+        self.assertEqual(hits[0].hit_id, '3e7b90809bd446a538f9eb1a1ca0e551')
+        self.assertEqual(hits[0].score, float('222.3'))
+        self.assertEqual(hits[0].ss_score, float('16.4'))
+        self.assertEqual(hits[0].evalue, float('2.7e-42'))
+        self.assertEqual(hits[0].pvalue, float('6.9E-45'))
+        self.assertEqual(hits[0].prob, float('100.0'))
+        self.assertEqual(str(hits[0].aln),
+                         'Test                VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+
+                         'MFLSFPTTKTYFPHF-DL-S----HGSAQ\n3e7b90809bd446a5... '+
+                         'VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT'+
+                         'EAEMKASED\n\nTest                VKGHGKKVADALTNAVAH'+
+                         'VDDMPNALSALSDLHAHK-LRVDPVNFKLLSHCLLVTLAAHL\n3e7b908'+
+                         '09bd446a5... LKKHGVTVLTALGAILKKKGHHEAELKPLAQSHA-TKH'+
+                         'KIPIKYLEFISEAIIHVLHSRH\n\nTest                PAEFT'+
+                         'PAVHASLDKFLASVSTVLTSKYR\n3e7b90809bd446a5... PGDFGA'+
+                         'DAQGAMNKALELFRKDIAAKYK\n')
+        # hit 2
+        self.assertEqual(hits[1].hit_id, 'af828e69a5f2d0fd42a2a213e09eb64f')
+        self.assertEqual(hits[1].score, float('215.7'))
+        self.assertEqual(hits[1].ss_score, float('17.2'))
+        self.assertEqual(hits[1].evalue, float('1.5e-41'))
+        self.assertEqual(hits[1].pvalue, float('3.9E-44'))
+        self.assertEqual(hits[1].prob, float('100.0'))
+        self.assertEqual(str(hits[1].aln),
+                         'Test                VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+
+                         'MFLSFPTTKTYFPHFDLSHGSAQVKGHGK\naf828e69a5f2d0fd... '+
+                         'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHG'+
+                         'SAQVKGHGK\n\nTest                KVADALTNAVAHVDDMPN'+
+                         'ALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPA\naf828e6'+
+                         '9a5f2d0fd... KVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNF'+
+                         'KLLSHCLLVTLAAHLPAEFTPA\n\nTest                VHASL'+
+                         'DKFLASVSTVLTSKYR\naf828e69a5f2d0fd... VHASLDKFLASVS'+
+                         'TVLTSKYR\n')
+        # hit 3
+        self.assertEqual(hits[2].hit_id, '9287755aa6aa27583da6be3b2408bfcc')
+        self.assertEqual(hits[2].score, float('215.1'))
+        self.assertEqual(hits[2].ss_score, float('16.5'))
+        self.assertEqual(hits[2].evalue, float('3.8e-41'))
+        self.assertEqual(hits[2].pvalue, float('9.7E-44'))
+        self.assertEqual(hits[2].prob, float('100.0'))
+        self.assertEqual(str(hits[2].aln),
+                         'Test                VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+
+                         'MFLSFPTTKTYFPHF-DLS-----HGSAQ\n9287755aa6aa2758... '+
+                         'HLTPEEKSAVTALWGKVN--VDEVGGEALGRLLVVYPWTQRFFESFGDLST'+
+                         'PDAVMGNPK\n\nTest                VKGHGKKVADALTNAVAH'+
+                         'VDDMPNALSALSDLHAHK-LRVDPVNFKLLSHCLLVTLAAHL\n9287755'+
+                         'aa6aa2758... VKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHC-DKL'+
+                         'HVDPENFRLLGNVLVCVLAHHF\n\nTest                PAEFT'+
+                         'PAVHASLDKFLASVSTVLTSKYR\n9287755aa6aa2758... GKEFTP'+
+                         'PVQAAYQKVVAGVANALAHKYH\n')
+        # hit 4
+        self.assertEqual(hits[3].hit_id, 'e69e1ac0a4b2554df25f2c2183b0fba0')
+        self.assertEqual(hits[3].score, float('12.6'))
+        self.assertEqual(hits[3].ss_score, float('2.6'))
+        self.assertEqual(hits[3].evalue, float('64.0'))
+        self.assertEqual(hits[3].pvalue, float('0.16'))
+        self.assertEqual(hits[3].prob, float('0.6'))
+        self.assertEqual(str(hits[3].aln), 'Test                VDPVNFKLLSHCL'+
+                         'LVTLAAHL\ne69e1ac0a4b2554d... ATPEQAQLVHKEIRKIVKDTC'+
+                         '\n')
+
+# ParseHHblitsOutput
 
 if __name__ == "__main__":
     hhsuite_root_dir =  os.getenv('EBROOTHHMINSUITE')
-- 
GitLab