diff --git a/modules/bindings/pymod/CMakeLists.txt b/modules/bindings/pymod/CMakeLists.txt index f91dc53d8e46178f80f41f6140a3fdcc8424ee01..89244b23f96c53c63e8b44385d3c64a1c07e197e 100644 --- a/modules/bindings/pymod/CMakeLists.txt +++ b/modules/bindings/pymod/CMakeLists.txt @@ -1,2 +1,13 @@ -pymod(NAME bindings PY __init__.py lga.py hbplus.py msms.py tmtools.py - dssp.py clustalw.py utils.py naccess.py) +set(OST_BINDINGS +__init__.py +lga.py +hbplus.py +msms.py +tmtools.py +dssp.py +clustalw.py +utils.py +naccess.py +blast.py +) +pymod(NAME bindings PY ${OST_BINDINGS}) diff --git a/modules/bindings/pymod/blast.py b/modules/bindings/pymod/blast.py new file mode 100644 index 0000000000000000000000000000000000000000..32e36e3bf37fd4860111b76425efea56c8547659 --- /dev/null +++ b/modules/bindings/pymod/blast.py @@ -0,0 +1,122 @@ +from ost import settings +import subprocess +from xml.dom import minidom +from ost import io, seq +import ost +import re +import os +class AlignedPatch: + def __init__(self, aln, bit_score, score, evalue): + self.aln=aln + self.bit_score=bit_score + self.score=score + self.evalue=evalue + +class BlastHit: + """ + A positive match found by BLAST. + + Each blast hit consists of one or more aligned patches. + + """ + def __init__(self, identifier, aligned_patches): + self.identifier=identifier + self.aligned_patches=aligned_patches + +def ParseBlastOutput(string): + """ + Parses the blast output and returns a list of BlastHits + """ + def _GetText(node): + rc='' + for child in node.childNodes: + if child.nodeType==child.TEXT_NODE: + rc+=child.data + return rc + + def _GetValue(node, tag_name): + tag=node.getElementsByTagName(tag_name) + assert len(tag)==1 + return _GetText(tag[0]) + + def _GetInt(node, tag_name): + return int(_GetValue(node, tag_name)) + + def _ParseHsp(query_id, hit_id, hsp): + bit_score=float(_GetValue(hsp, 'Hsp_bit-score')) + score=float(_GetValue(hsp, 'Hsp_score')) + evalue=float(_GetValue(hsp, 'Hsp_evalue')) + query_offset=_GetInt(hsp, 'Hsp_query-from')-1 + hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1 + query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq'))) + query_seq.offset=query_offset + hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq'))) + hit_seq.offset=hit_offset + aln=seq.CreateAlignment(query_seq, hit_seq) + return AlignedPatch(aln, bit_score, score, evalue) + try: + doc=minidom.parseString(string) + except Exception, e: + ost.LogError('Error while parsing BLAST output: %s' % str(e)) + return None + hits=[] + query_id=_GetValue(doc, 'BlastOutput_query-def') + for hit in doc.getElementsByTagName('Hit'): + hit_id=_GetValue(hit, 'Hit_def') + hsps=hit.getElementsByTagName('Hsp') + aligned_patches=[_ParseHsp(query_id, hit_id, hsp) for hsp in hsps] + hits.append(BlastHit(hit_id, aligned_patches)) + return hits + + + +class BlastError(RuntimeError): + def __init__(self, brief, details): + self.brief=brief + self.details=details + + def __str__(self): + if self.details: + return '%s\n%s' % (self.brief, self.details) + else: + return self.brief + +def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62', + blast_location=None): + """ + Runs a protein vs. protein blast search. The results are returned as a + list of :class:`BlastHit` instances. + + :param query: the query sequence + :type query: :class:`seq.ConstSequenceHandle` + + :param database: The filename of the sequence database. Make sure that + formatdb has been run on the database and the <database>.pin file exists. + :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45', + 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'. + :param gap_open: Gap opening penalty. Note that only a subset of gap opening + penalties is supported for each substitutition matrix. Consult the blast + docs for more information. + :param gap_ext: Gap extension penalty. Only a subset of gap extension + penalties are supported for each of the substitution matrices. Consult the + blast docs for more information. + """ + subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',) + if matrix not in subst_mats: + raise ValueError('matrix must be one of %s' % ', '.join(subst_mats)) + if not os.path.exists('%s.pin' % database): + raise IOError("Database %s does not exist" % database) + blastall_exe=settings.Locate('blastall', explicit_file_name=blast_location) + args=[blastall_exe, '-d', database, '-p', 'blastp', + '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)] + ost.LogInfo('running BLAST (%s)' % ' '.join(args)) + blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta')) + if len(stderr)>0: + pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)') + lines=stderr.split('\n') + error_message=pattern.match(lines[0]) + if error_message: + raise BlastError(error_message.group(1), '\n'.join(lines[1:])) + return ParseBlastOutput(stdout) diff --git a/modules/bindings/tests/CMakeLists.txt b/modules/bindings/tests/CMakeLists.txt index 5a3f44e3d827f7571b5aaed58fde891abc8543bb..a93d8aeaa8b0a21c4bd0370fba5bf51a06988800 100644 --- a/modules/bindings/tests/CMakeLists.txt +++ b/modules/bindings/tests/CMakeLists.txt @@ -1,6 +1,7 @@ set(OST_BINDINGS_UNIT_TESTS test_msms.py test_clustalw.py + test_blast.py ) ost_unittest(bindings "${OST_BINDINGS_UNIT_TESTS}") diff --git a/modules/bindings/tests/test_blast.py b/modules/bindings/tests/test_blast.py new file mode 100644 index 0000000000000000000000000000000000000000..2b42bfd18ec4a612a602e86834f52e383149ee65 --- /dev/null +++ b/modules/bindings/tests/test_blast.py @@ -0,0 +1,50 @@ +import unittest +from ost import * +from ost import settings +from ost.bindings import blast + +class TestBlastBindings(unittest.TestCase): + def setUp(self): + self.query=seq.CreateSequence('A', 'MKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPG'+ + 'DFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELV'+ + 'DYHRSTSVSRNQQIFLRDIEQVP') + def testAllowedMatricesAndPenalties(self): + self.assertRaises(blast.BlastError, blast.Blast, self.query, + 'testfiles/seqdb', matrix='BLOSUM45') + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM45', + gap_open=13, gap_ext=3) + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM80') + blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM70') + blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM30', gap_open=7, + gap_ext=2) + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM62') + self.assertRaises(ValueError, blast.Blast, self.query, 'testfiles/seqdb', + matrix='MUAHA') + def testMissingDB(self): + self.assertRaises(IOError, blast.Blast, self.query, + 'testfiles/nonexistentdb') + + def testParseBlastOutput(self): + hits=blast.Blast(self.query, 'testfiles/seqdb') + expected_output=[{'evalue':2.366130E-59,'bitscore':211.460,'score':537}, + {'evalue':4.808930E-59,'bitscore':210.305,'score':534}, + {'evalue':5.361450E-58,'bitscore':206.838,'score':525}, + {'evalue':3.277500E+00,'bitscore':15.0086,'score':27}, + {'evalue':9.696520E+00,'bitscore':13.4678,'score':23}] + self.assertEqual(len(hits), 4) + for expected, hit in zip(expected_output, hits): + patch=hit.aligned_patches[0] + self.assertAlmostEqual(patch.evalue, expected['evalue']) + self.assertAlmostEqual(patch.bit_score, expected['bitscore']) + self.assertAlmostEqual(patch.score, expected['score']) +if __name__ == "__main__": + # test if blast package is available on system, otherwise ignore tests + try: + blastpath=settings.Locate('blastall') + except(settings.FileNotFound): + print "Could not find blastall executable: ignoring unit tests" + exit(0) + try: + unittest.main() + except Exception, e: + print e \ No newline at end of file diff --git a/modules/bindings/tests/testfiles/seqdb.fasta b/modules/bindings/tests/testfiles/seqdb.fasta new file mode 100644 index 0000000000000000000000000000000000000000..5e665b80f9857d04dc2440182bd226931dcc13c3 --- /dev/null +++ b/modules/bindings/tests/testfiles/seqdb.fasta @@ -0,0 +1,41 @@ +>1fhsA +GIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF +GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI +EQVPQQPTYVQA +>1griA +MEAIAKYDFKATADDELSFKRGDILKVQNWYKAELNGKDGFIPKNYIEMK +PHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKFGNDVQ +HFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQ +QPTYVQALFDFDPQEDGELGFRRGDFIHVMDNSDPNWWKGACHGQTGMFP +RNYVTPVNRNV +>3n84E +MIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF +GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI +EQ +>3dwgB +RHMTRYDSLLQALGNTPLVGLQRLSPRWDDGRDGPHVRLWAKLEDRNPTG +SIKDRPAVRMIEQAEADGLLRPGATILEPTSGNTGISLAMAARLKGYRLI +CVMPENTSVERRQLLELYGAQIIFSAANTAVATAKELAATNPSWVMLYQY +GNPANTDSHYCGTGPELLADLPEITHFVAGLGTTGTLMGTGRFLREHVAN +VKIVAAEPRYGEGVYALRNMDEGFVPELYDPEILTARYSVGAVDAVRRTR +ELVHTEGIFAGISTGAVLHAALGVGAGALAAGERADIALVVADAGWKYLS +TGAYAGSLDDAETALEGQLWA +>3hkfA +SSVFIFPPKPKDVLTITLTPKVTCVVVDISKDDPEVQFSWFVDDVEVHTA +QTQPREEQFNSTFRSVSELPIMHQDWLNGKEFKCRVNSAAFPAPIEKTIS +KTKGRPKAPQVYTIPPPKEQMAKDKVSLTCMITDFFPEDITVEWQWNGQP +AENYKNTQPIMDTDGSYFVYSKLNVQKSNWEAGNTFTCSVLHEGLHNHHT +EKSLS +>1mw9X +GKALVIVESPAKAKTINKYLGSDYVVKSSVGHIRDLPTERGALVNRMGVD +PWHNWEAHYEVLPGKEKVVSELKQLAEKADHIYLATDLDREGEAIAWHLR +EVIGGDDARYSRVVFNEITKNAIRQAFNKPGELNIDRVNAQQARRFMDRV +VGYMVSPLLWKKIARGLSAGRVQSVAVRLVVEREREIKAFVPEEFWEVDA +STTTPSGEALALQVTHQNDKPFRPVNKEQTQAAVSLLEKARYSVLEREDK +PTTSKPGAPFITSTLQQAASTRLGFGVKKTMMMAQRLYEAGYITYMRTDS +TNLSQDAVNMVRGYISDNFGKKYLPESPNQYAREAIRPSDVNVMAESLKD +MEADAQKLYQLIWRQFVACQMTPAKYDSTTLTVGAGDFRLKARGRILRFD +GWTKVMPALEDRILPAVNKGDALTLVELTPAQHFTKPPARFSEASLVKEL +EKRGIGRPSTYASIISTIQDRGYVRVENRRFYAEKMGEIVTDRLEENFRE +LMNYDFTAQMENNLDQVANHEAEWKAVLDHFFSDFTQQLDKAEKDPEEGG +MRPNQM \ No newline at end of file diff --git a/modules/bindings/tests/testfiles/seqdb.phr b/modules/bindings/tests/testfiles/seqdb.phr new file mode 100644 index 0000000000000000000000000000000000000000..b4cfd67e27cc18f5e7b199476a4083e6f4ff8a56 Binary files /dev/null and b/modules/bindings/tests/testfiles/seqdb.phr differ diff --git a/modules/bindings/tests/testfiles/seqdb.pin b/modules/bindings/tests/testfiles/seqdb.pin new file mode 100644 index 0000000000000000000000000000000000000000..1f6f5be753acd11950e085c4971e9fe0ea7d78e2 Binary files /dev/null and b/modules/bindings/tests/testfiles/seqdb.pin differ diff --git a/modules/bindings/tests/testfiles/seqdb.psq b/modules/bindings/tests/testfiles/seqdb.psq new file mode 100644 index 0000000000000000000000000000000000000000..b55326094e5594efaa5f3a695ece541dd46669ee Binary files /dev/null and b/modules/bindings/tests/testfiles/seqdb.psq differ