From a79a6debc7e5b10cbb6887120662eb48a2b7e9d3 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Thu, 3 Sep 2015 13:18:09 +0200 Subject: [PATCH] Testing ParseHHBlitsOutput --- modules/bindings/pymod/hhblits.py | 268 ++++++++++++------------- modules/bindings/tests/test_hhblits.py | 82 ++++++++ 2 files changed, 210 insertions(+), 140 deletions(-) diff --git a/modules/bindings/pymod/hhblits.py b/modules/bindings/pymod/hhblits.py index 98c7d0e4b..0eac983c9 100644 --- a/modules/bindings/pymod/hhblits.py +++ b/modules/bindings/pymod/hhblits.py @@ -32,34 +32,34 @@ class HHblitsHit: :type: :class:`ost.seq.AlignmentHandle` .. attribute:: evalue - + The E-value of the alignment - + .. attribute:: prob - + The probability of the alignment (between 0 and 100) - + .. attribute:: score The alignment score """ def __init__(self, hit_id, aln, score, ss_score,evalue, pvalue, prob): - self.hit_id=hit_id - self.aln=aln - self.score=score - self.ss_score=ss_score - self.evalue=evalue - self.prob=prob - self.pvalue=pvalue + self.hit_id = hit_id + self.aln = aln + self.score = score + self.ss_score = ss_score + self.evalue = evalue + self.prob = prob + self.pvalue = pvalue class HHblitsHeader: - def __init__(self): - self.query='' - self.match_columns=0 - self.n_eff=0 - self.searched_hmms=0 - self.date=None - self.command='' + def __init__(self): + self.query = '' + self.match_columns = 0 + self.n_eff = 0 + self.searched_hmms = 0 + self.date = None + self.command = '' def ParseHeaderLine(line): # First, we seek the start of the identifier, that is, the first whitespace @@ -72,7 +72,6 @@ def ParseHeaderLine(line): if line[i] == ' ': break assert len(line)-i >= 31 and line[i+1] != ' ' - #hit_id=line[split_pos+1:35].strip() hit_id = line[i+1:i+31].strip() fields = line[i+32:].split() prob = float(fields[0]) @@ -80,132 +79,121 @@ def ParseHeaderLine(line): pvalue = float(fields[2]) score = float(fields[3]) ss_score = float(fields[4]) - # cols = int(fields[5]) offsets = (int(fields[6].split('-')[0]), int(fields[7].split('-')[0])) - #split_pos=line.find(' ', 3) - #assert split_pos!=-1 and split_pos<34 - #prob=float(line[35:40].strip()) - #evalue=float(line[41:49].strip()) - #pvalue=float(line[48:57].strip()) - #score=float(line[56:64].strip()) - #ss_score=float(line[63:69].strip()) - #offsets=(int(line[76:79].strip()), int(line[86:89].strip())) return (HHblitsHit(hit_id, None, score, ss_score, evalue, pvalue, prob), offsets) def ParseHHblitsOutput(output): - """ - Parses the HHblits output and returns a tuple of HHblitsHeader and - a list of HHblitsHit instances. - """ - lines=iter(output) - def _ParseHeaderSection(lines): - value_start_column=14 - date_pattern='%a %b %d %H:%M:%S %Y' - header=HHblitsHeader() - line=lines.next() - assert line.startswith('Query') - header.query=line[value_start_column:].strip() - line=lines.next() - assert line.startswith('Match_columns') - header.match_columns=int(line[value_start_column:].strip()) - - line=lines.next() - assert line.startswith('No_of_seqs') - - line=lines.next() - assert line.startswith('Neff') - header.n_eff=float(line[value_start_column:].strip()) - - line=lines.next() - assert line.startswith('Searched_HMMs') - header.searched_hmms=int(line[value_start_column:].strip()) - - line=lines.next() - assert line.startswith('Date') - value=line[value_start_column:].strip() - header.date=datetime.datetime.strptime(value, date_pattern) - - line=lines.next() - assert line.startswith('Command') - header.command=line[value_start_column:].strip() - - line=lines.next() - assert len(line.strip())==0 - return header - - def _ParseTableOfContents(lines): - assert lines.next().startswith(' No Hit') - hits=[] - while True: - line=lines.next() - if len(line.strip())==0: - return hits - hits.append(ParseHeaderLine(line)) - return hits - def _ParseResultBody(query_id, hits, lines): - entry_index=None - query_str='' - templ_str='' - def _MakeAln(query_id, hit_id, query_string, templ_string, - q_offset, t_offset): - s1=seq.CreateSequence(query_id, query_string) - s1.offset=q_offset-1 - s2=seq.CreateSequence(hit_id, templ_string) - s2.offset=t_offset-1 - return seq.CreateAlignment(s1, s2) - try: - while True: + """ + Parses the HHblits output and returns a tuple of HHblitsHeader and + a list of HHblitsHit instances. + """ + lines=iter(output) + def _ParseHeaderSection(lines): + value_start_column = 14 + date_pattern = '%a %b %d %H:%M:%S %Y' + header = HHblitsHeader() + line = lines.next() + assert line.startswith('Query') + header.query=line[value_start_column:].strip() + line=lines.next() + assert line.startswith('Match_columns') + header.match_columns=int(line[value_start_column:].strip()) + + line=lines.next() + assert line.startswith('No_of_seqs') + + line=lines.next() + assert line.startswith('Neff') + header.n_eff=float(line[value_start_column:].strip()) + line=lines.next() - if len(line.strip())==0: - continue - if line.startswith('Done!'): - if len(query_str)>0: - hits[entry_index][0].aln=_MakeAln(query_id, hits[entry_index][0].hit_id, - query_str, templ_str, - *hits[entry_index][1]) - return [h for h,o in hits] - if line.startswith('No '): - if len(query_str)>0: - hits[entry_index][0].aln=_MakeAln(query_id, - hits[entry_index][0].hit_id, - query_str, templ_str, - *hits[entry_index][1]) - entry_index=int(line[3:].strip())-1 - hits[entry_index][0].hit_id=lines.next()[1:].strip() - query_str='' - templ_str='' - # skip the next line. It doesn't contain information we don't - # already know - lines.next() - continue - assert entry_index!=None - if line[1:].startswith(' Consensus'): continue - if line[1:].startswith(' ss_pred'): continue - if line[1:].startswith(' ss_conf'): continue - if line.startswith('T '): - - end_pos=line.find(' ', 22) - assert end_pos!=-1 - templ_str+=line[22:end_pos] - if line.startswith('Q '): - end_pos=line.find(' ', 22) - assert end_pos!=-1 - query_str+=line[22:end_pos] - - #hit.aln=seq.CreateAlignment(query_seq, templ_seq) - except StopIteration: - if len(query_str)>0: - hits[entry_index][0].aln=_MakeAln(query_id, hits[entry_index][0].hit_id, - query_str, templ_str, - *hits[entry_index][1]) - return [h for h,o in hits] - header=_ParseHeaderSection(lines) - # parse the table of contents. This is neccessary as some of the properties - # (i.e. start of alignment) we need are only given there. From the TOC we - # create a list of hits that is then further filled with data when we parse - # the actual result body - hits=_ParseTableOfContents(lines) - return header, _ParseResultBody(header.query, hits, lines) + assert line.startswith('Searched_HMMs') + header.searched_hmms=int(line[value_start_column:].strip()) + + line=lines.next() + assert line.startswith('Date') + value=line[value_start_column:].strip() + header.date=datetime.datetime.strptime(value, date_pattern) + + line=lines.next() + assert line.startswith('Command') + header.command=line[value_start_column:].strip() + + line=lines.next() + assert len(line.strip())==0 + return header + + def _ParseTableOfContents(lines): + assert lines.next().startswith(' No Hit') + hits=[] + while True: + line=lines.next() + if len(line.strip())==0: + return hits + hits.append(ParseHeaderLine(line)) + return hits + + def _ParseResultBody(query_id, hits, lines): + entry_index = None + query_str = '' + templ_str = '' + def _MakeAln(query_id, hit_id, query_string, templ_string, + q_offset, t_offset): + s1=seq.CreateSequence(query_id, query_string) + s1.offset=q_offset-1 + s2=seq.CreateSequence(hit_id, templ_string) + s2.offset=t_offset-1 + return seq.CreateAlignment(s1, s2) + try: + while True: + line=lines.next() + if len(line.strip()) == 0: + continue + if line.startswith('Done!'): + if len(query_str)>0: + hits[entry_index][0].aln = _MakeAln(\ + query_id, hits[entry_index][0].hit_id, + query_str, templ_str, *hits[entry_index][1]) + return [h for h,o in hits] + if line.startswith('No '): + if len(query_str)>0: + hits[entry_index][0].aln=_MakeAln(\ + query_id, hits[entry_index][0].hit_id, + query_str, templ_str, *hits[entry_index][1]) + entry_index=int(line[3:].strip())-1 + hits[entry_index][0].hit_id=lines.next()[1:].strip() + query_str='' + templ_str='' + # skip the next line. It doesn't contain information we + # don't already know + lines.next() + continue + assert entry_index!=None + if line[1:].startswith(' Consensus'): continue + if line[1:].startswith(' ss_pred'): continue + if line[1:].startswith(' ss_conf'): continue + if line.startswith('T '): + end_pos=line.find(' ', 22) + assert end_pos!=-1 + templ_str+=line[22:end_pos] + if line.startswith('Q '): + end_pos=line.find(' ', 22) + assert end_pos!=-1 + query_str+=line[22:end_pos] + except StopIteration: + if len(query_str)>0: + hits[entry_index][0].aln=_MakeAln(query_id, + hits[entry_index][0].hit_id, + query_str, templ_str, + *hits[entry_index][1]) + return [h for h,o in hits] + header=_ParseHeaderSection(lines) + # parse the table of contents. This is neccessary as some of the properties + # (i.e. start of alignment) we need are only given there. From the TOC we + # create a list of hits that is then further filled with data when we parse + # the actual result body + hits=_ParseTableOfContents(lines) + return header, _ParseResultBody(header.query, hits, lines) def ParseHHM(profile): diff --git a/modules/bindings/tests/test_hhblits.py b/modules/bindings/tests/test_hhblits.py index 92276d44b..61c4ac0de 100644 --- a/modules/bindings/tests/test_hhblits.py +++ b/modules/bindings/tests/test_hhblits.py @@ -7,6 +7,7 @@ import sys import tempfile import shutil import filecmp +import datetime import ost from ost import seq @@ -290,6 +291,87 @@ class TestHHblitsBindings(unittest.TestCase): 'Profile file "testfiles/testali.a3m" is missing '+ 'the "Consensus" section') + def testParseHHblitsOutput(self): + header, hits = hhblits.ParseHHblitsOutput(open("testfiles/test.hhr")) + self.assertEqual(header.query, 'Test') + self.assertEqual(header.match_columns, 141) + self.assertEqual(header.n_eff, 9.4) + self.assertEqual(header.searched_hmms, 5) + self.assertEqual(header.date, + datetime.datetime.strptime('Mon Aug 31 16:45:45 2015', + '%a %b %d %H:%M:%S %Y')) + self.assertEqual(header.command, '/import/bc2/apps/HH-suite/2.0.16-go'+ + 'olf-1.4.10/bin/hhblits -cpu 1 -n 1 -e 0.001 -Z 1000'+ + '0 -B 10000 -i /import/bc2/home/schwede/bienert/git/'+ + 'ost_newenv.git/modules/bindings/tests/testfiles/tes'+ + 'tali.a3m -o /scratch/tmp9E5dor/testali_cpu1_n1.hhr '+ + '-d /import/bc2/home/schwede/bienert/git/ost_newenv.'+ + 'git/modules/bindings/tests/testfiles/hhblitsdb/hhbl'+ + 'itsdb') + self.assertEqual(len(hits), 4) + # hit 1 + self.assertEqual(hits[0].hit_id, '3e7b90809bd446a538f9eb1a1ca0e551') + self.assertEqual(hits[0].score, float('222.3')) + self.assertEqual(hits[0].ss_score, float('16.4')) + self.assertEqual(hits[0].evalue, float('2.7e-42')) + self.assertEqual(hits[0].pvalue, float('6.9E-45')) + self.assertEqual(hits[0].prob, float('100.0')) + self.assertEqual(str(hits[0].aln), + 'Test VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+ + 'MFLSFPTTKTYFPHF-DL-S----HGSAQ\n3e7b90809bd446a5... '+ + 'VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT'+ + 'EAEMKASED\n\nTest VKGHGKKVADALTNAVAH'+ + 'VDDMPNALSALSDLHAHK-LRVDPVNFKLLSHCLLVTLAAHL\n3e7b908'+ + '09bd446a5... LKKHGVTVLTALGAILKKKGHHEAELKPLAQSHA-TKH'+ + 'KIPIKYLEFISEAIIHVLHSRH\n\nTest PAEFT'+ + 'PAVHASLDKFLASVSTVLTSKYR\n3e7b90809bd446a5... PGDFGA'+ + 'DAQGAMNKALELFRKDIAAKYK\n') + # hit 2 + self.assertEqual(hits[1].hit_id, 'af828e69a5f2d0fd42a2a213e09eb64f') + self.assertEqual(hits[1].score, float('215.7')) + self.assertEqual(hits[1].ss_score, float('17.2')) + self.assertEqual(hits[1].evalue, float('1.5e-41')) + self.assertEqual(hits[1].pvalue, float('3.9E-44')) + self.assertEqual(hits[1].prob, float('100.0')) + self.assertEqual(str(hits[1].aln), + 'Test VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+ + 'MFLSFPTTKTYFPHFDLSHGSAQVKGHGK\naf828e69a5f2d0fd... '+ + 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHG'+ + 'SAQVKGHGK\n\nTest KVADALTNAVAHVDDMPN'+ + 'ALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPA\naf828e6'+ + '9a5f2d0fd... KVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNF'+ + 'KLLSHCLLVTLAAHLPAEFTPA\n\nTest VHASL'+ + 'DKFLASVSTVLTSKYR\naf828e69a5f2d0fd... VHASLDKFLASVS'+ + 'TVLTSKYR\n') + # hit 3 + self.assertEqual(hits[2].hit_id, '9287755aa6aa27583da6be3b2408bfcc') + self.assertEqual(hits[2].score, float('215.1')) + self.assertEqual(hits[2].ss_score, float('16.5')) + self.assertEqual(hits[2].evalue, float('3.8e-41')) + self.assertEqual(hits[2].pvalue, float('9.7E-44')) + self.assertEqual(hits[2].prob, float('100.0')) + self.assertEqual(str(hits[2].aln), + 'Test VLSPADKTNVKAAWGKVGAHAGEYGAEALER'+ + 'MFLSFPTTKTYFPHF-DLS-----HGSAQ\n9287755aa6aa2758... '+ + 'HLTPEEKSAVTALWGKVN--VDEVGGEALGRLLVVYPWTQRFFESFGDLST'+ + 'PDAVMGNPK\n\nTest VKGHGKKVADALTNAVAH'+ + 'VDDMPNALSALSDLHAHK-LRVDPVNFKLLSHCLLVTLAAHL\n9287755'+ + 'aa6aa2758... VKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHC-DKL'+ + 'HVDPENFRLLGNVLVCVLAHHF\n\nTest PAEFT'+ + 'PAVHASLDKFLASVSTVLTSKYR\n9287755aa6aa2758... GKEFTP'+ + 'PVQAAYQKVVAGVANALAHKYH\n') + # hit 4 + self.assertEqual(hits[3].hit_id, 'e69e1ac0a4b2554df25f2c2183b0fba0') + self.assertEqual(hits[3].score, float('12.6')) + self.assertEqual(hits[3].ss_score, float('2.6')) + self.assertEqual(hits[3].evalue, float('64.0')) + self.assertEqual(hits[3].pvalue, float('0.16')) + self.assertEqual(hits[3].prob, float('0.6')) + self.assertEqual(str(hits[3].aln), 'Test VDPVNFKLLSHCL'+ + 'LVTLAAHL\ne69e1ac0a4b2554d... ATPEQAQLVHKEIRKIVKDTC'+ + '\n') + +# ParseHHblitsOutput if __name__ == "__main__": hhsuite_root_dir = os.getenv('EBROOTHHMINSUITE') -- GitLab