From 0a49556bba1e804d159e23a0d173f87388ebd28d Mon Sep 17 00:00:00 2001 From: Marco Biasini <marco.biasini@unibas.ch> Date: Tue, 8 Feb 2011 09:08:09 +0100 Subject: [PATCH] added bindings for blast --- modules/bindings/pymod/CMakeLists.txt | 15 ++- modules/bindings/pymod/blast.py | 122 +++++++++++++++++++ modules/bindings/tests/CMakeLists.txt | 1 + modules/bindings/tests/test_blast.py | 50 ++++++++ modules/bindings/tests/testfiles/seqdb.fasta | 41 +++++++ modules/bindings/tests/testfiles/seqdb.phr | Bin 0 -> 408 bytes modules/bindings/tests/testfiles/seqdb.pin | Bin 0 -> 120 bytes modules/bindings/tests/testfiles/seqdb.psq | Bin 0 -> 1514 bytes 8 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 modules/bindings/pymod/blast.py create mode 100644 modules/bindings/tests/test_blast.py create mode 100644 modules/bindings/tests/testfiles/seqdb.fasta create mode 100644 modules/bindings/tests/testfiles/seqdb.phr create mode 100644 modules/bindings/tests/testfiles/seqdb.pin create mode 100644 modules/bindings/tests/testfiles/seqdb.psq diff --git a/modules/bindings/pymod/CMakeLists.txt b/modules/bindings/pymod/CMakeLists.txt index f91dc53d8..89244b23f 100644 --- a/modules/bindings/pymod/CMakeLists.txt +++ b/modules/bindings/pymod/CMakeLists.txt @@ -1,2 +1,13 @@ -pymod(NAME bindings PY __init__.py lga.py hbplus.py msms.py tmtools.py - dssp.py clustalw.py utils.py naccess.py) +set(OST_BINDINGS +__init__.py +lga.py +hbplus.py +msms.py +tmtools.py +dssp.py +clustalw.py +utils.py +naccess.py +blast.py +) +pymod(NAME bindings PY ${OST_BINDINGS}) diff --git a/modules/bindings/pymod/blast.py b/modules/bindings/pymod/blast.py new file mode 100644 index 000000000..32e36e3bf --- /dev/null +++ b/modules/bindings/pymod/blast.py @@ -0,0 +1,122 @@ +from ost import settings +import subprocess +from xml.dom import minidom +from ost import io, seq +import ost +import re +import os +class AlignedPatch: + def __init__(self, aln, bit_score, score, evalue): + self.aln=aln + self.bit_score=bit_score + self.score=score + self.evalue=evalue + +class BlastHit: + """ + A positive match found by BLAST. + + Each blast hit consists of one or more aligned patches. + + """ + def __init__(self, identifier, aligned_patches): + self.identifier=identifier + self.aligned_patches=aligned_patches + +def ParseBlastOutput(string): + """ + Parses the blast output and returns a list of BlastHits + """ + def _GetText(node): + rc='' + for child in node.childNodes: + if child.nodeType==child.TEXT_NODE: + rc+=child.data + return rc + + def _GetValue(node, tag_name): + tag=node.getElementsByTagName(tag_name) + assert len(tag)==1 + return _GetText(tag[0]) + + def _GetInt(node, tag_name): + return int(_GetValue(node, tag_name)) + + def _ParseHsp(query_id, hit_id, hsp): + bit_score=float(_GetValue(hsp, 'Hsp_bit-score')) + score=float(_GetValue(hsp, 'Hsp_score')) + evalue=float(_GetValue(hsp, 'Hsp_evalue')) + query_offset=_GetInt(hsp, 'Hsp_query-from')-1 + hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1 + query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq'))) + query_seq.offset=query_offset + hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq'))) + hit_seq.offset=hit_offset + aln=seq.CreateAlignment(query_seq, hit_seq) + return AlignedPatch(aln, bit_score, score, evalue) + try: + doc=minidom.parseString(string) + except Exception, e: + ost.LogError('Error while parsing BLAST output: %s' % str(e)) + return None + hits=[] + query_id=_GetValue(doc, 'BlastOutput_query-def') + for hit in doc.getElementsByTagName('Hit'): + hit_id=_GetValue(hit, 'Hit_def') + hsps=hit.getElementsByTagName('Hsp') + aligned_patches=[_ParseHsp(query_id, hit_id, hsp) for hsp in hsps] + hits.append(BlastHit(hit_id, aligned_patches)) + return hits + + + +class BlastError(RuntimeError): + def __init__(self, brief, details): + self.brief=brief + self.details=details + + def __str__(self): + if self.details: + return '%s\n%s' % (self.brief, self.details) + else: + return self.brief + +def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62', + blast_location=None): + """ + Runs a protein vs. protein blast search. The results are returned as a + list of :class:`BlastHit` instances. + + :param query: the query sequence + :type query: :class:`seq.ConstSequenceHandle` + + :param database: The filename of the sequence database. Make sure that + formatdb has been run on the database and the <database>.pin file exists. + :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45', + 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'. + :param gap_open: Gap opening penalty. Note that only a subset of gap opening + penalties is supported for each substitutition matrix. Consult the blast + docs for more information. + :param gap_ext: Gap extension penalty. Only a subset of gap extension + penalties are supported for each of the substitution matrices. Consult the + blast docs for more information. + """ + subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',) + if matrix not in subst_mats: + raise ValueError('matrix must be one of %s' % ', '.join(subst_mats)) + if not os.path.exists('%s.pin' % database): + raise IOError("Database %s does not exist" % database) + blastall_exe=settings.Locate('blastall', explicit_file_name=blast_location) + args=[blastall_exe, '-d', database, '-p', 'blastp', + '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)] + ost.LogInfo('running BLAST (%s)' % ' '.join(args)) + blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta')) + if len(stderr)>0: + pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)') + lines=stderr.split('\n') + error_message=pattern.match(lines[0]) + if error_message: + raise BlastError(error_message.group(1), '\n'.join(lines[1:])) + return ParseBlastOutput(stdout) diff --git a/modules/bindings/tests/CMakeLists.txt b/modules/bindings/tests/CMakeLists.txt index 5a3f44e3d..a93d8aeaa 100644 --- a/modules/bindings/tests/CMakeLists.txt +++ b/modules/bindings/tests/CMakeLists.txt @@ -1,6 +1,7 @@ set(OST_BINDINGS_UNIT_TESTS test_msms.py test_clustalw.py + test_blast.py ) ost_unittest(bindings "${OST_BINDINGS_UNIT_TESTS}") diff --git a/modules/bindings/tests/test_blast.py b/modules/bindings/tests/test_blast.py new file mode 100644 index 000000000..2b42bfd18 --- /dev/null +++ b/modules/bindings/tests/test_blast.py @@ -0,0 +1,50 @@ +import unittest +from ost import * +from ost import settings +from ost.bindings import blast + +class TestBlastBindings(unittest.TestCase): + def setUp(self): + self.query=seq.CreateSequence('A', 'MKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPG'+ + 'DFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELV'+ + 'DYHRSTSVSRNQQIFLRDIEQVP') + def testAllowedMatricesAndPenalties(self): + self.assertRaises(blast.BlastError, blast.Blast, self.query, + 'testfiles/seqdb', matrix='BLOSUM45') + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM45', + gap_open=13, gap_ext=3) + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM80') + blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM70') + blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM30', gap_open=7, + gap_ext=2) + blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM62') + self.assertRaises(ValueError, blast.Blast, self.query, 'testfiles/seqdb', + matrix='MUAHA') + def testMissingDB(self): + self.assertRaises(IOError, blast.Blast, self.query, + 'testfiles/nonexistentdb') + + def testParseBlastOutput(self): + hits=blast.Blast(self.query, 'testfiles/seqdb') + expected_output=[{'evalue':2.366130E-59,'bitscore':211.460,'score':537}, + {'evalue':4.808930E-59,'bitscore':210.305,'score':534}, + {'evalue':5.361450E-58,'bitscore':206.838,'score':525}, + {'evalue':3.277500E+00,'bitscore':15.0086,'score':27}, + {'evalue':9.696520E+00,'bitscore':13.4678,'score':23}] + self.assertEqual(len(hits), 4) + for expected, hit in zip(expected_output, hits): + patch=hit.aligned_patches[0] + self.assertAlmostEqual(patch.evalue, expected['evalue']) + self.assertAlmostEqual(patch.bit_score, expected['bitscore']) + self.assertAlmostEqual(patch.score, expected['score']) +if __name__ == "__main__": + # test if blast package is available on system, otherwise ignore tests + try: + blastpath=settings.Locate('blastall') + except(settings.FileNotFound): + print "Could not find blastall executable: ignoring unit tests" + exit(0) + try: + unittest.main() + except Exception, e: + print e \ No newline at end of file diff --git a/modules/bindings/tests/testfiles/seqdb.fasta b/modules/bindings/tests/testfiles/seqdb.fasta new file mode 100644 index 000000000..5e665b80f --- /dev/null +++ b/modules/bindings/tests/testfiles/seqdb.fasta @@ -0,0 +1,41 @@ +>1fhsA +GIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF +GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI +EQVPQQPTYVQA +>1griA +MEAIAKYDFKATADDELSFKRGDILKVQNWYKAELNGKDGFIPKNYIEMK +PHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKFGNDVQ +HFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQ +QPTYVQALFDFDPQEDGELGFRRGDFIHVMDNSDPNWWKGACHGQTGMFP +RNYVTPVNRNV +>3n84E +MIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF +GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI +EQ +>3dwgB +RHMTRYDSLLQALGNTPLVGLQRLSPRWDDGRDGPHVRLWAKLEDRNPTG +SIKDRPAVRMIEQAEADGLLRPGATILEPTSGNTGISLAMAARLKGYRLI +CVMPENTSVERRQLLELYGAQIIFSAANTAVATAKELAATNPSWVMLYQY +GNPANTDSHYCGTGPELLADLPEITHFVAGLGTTGTLMGTGRFLREHVAN +VKIVAAEPRYGEGVYALRNMDEGFVPELYDPEILTARYSVGAVDAVRRTR +ELVHTEGIFAGISTGAVLHAALGVGAGALAAGERADIALVVADAGWKYLS +TGAYAGSLDDAETALEGQLWA +>3hkfA +SSVFIFPPKPKDVLTITLTPKVTCVVVDISKDDPEVQFSWFVDDVEVHTA +QTQPREEQFNSTFRSVSELPIMHQDWLNGKEFKCRVNSAAFPAPIEKTIS +KTKGRPKAPQVYTIPPPKEQMAKDKVSLTCMITDFFPEDITVEWQWNGQP +AENYKNTQPIMDTDGSYFVYSKLNVQKSNWEAGNTFTCSVLHEGLHNHHT +EKSLS +>1mw9X +GKALVIVESPAKAKTINKYLGSDYVVKSSVGHIRDLPTERGALVNRMGVD +PWHNWEAHYEVLPGKEKVVSELKQLAEKADHIYLATDLDREGEAIAWHLR +EVIGGDDARYSRVVFNEITKNAIRQAFNKPGELNIDRVNAQQARRFMDRV +VGYMVSPLLWKKIARGLSAGRVQSVAVRLVVEREREIKAFVPEEFWEVDA +STTTPSGEALALQVTHQNDKPFRPVNKEQTQAAVSLLEKARYSVLEREDK +PTTSKPGAPFITSTLQQAASTRLGFGVKKTMMMAQRLYEAGYITYMRTDS +TNLSQDAVNMVRGYISDNFGKKYLPESPNQYAREAIRPSDVNVMAESLKD +MEADAQKLYQLIWRQFVACQMTPAKYDSTTLTVGAGDFRLKARGRILRFD +GWTKVMPALEDRILPAVNKGDALTLVELTPAQHFTKPPARFSEASLVKEL +EKRGIGRPSTYASIISTIQDRGYVRVENRRFYAEKMGEIVTDRLEENFRE +LMNYDFTAQMENNLDQVANHEAEWKAVLDHFFSDFTQQLDKAEKDPEEGG +MRPNQM \ No newline at end of file diff --git a/modules/bindings/tests/testfiles/seqdb.phr b/modules/bindings/tests/testfiles/seqdb.phr new file mode 100644 index 0000000000000000000000000000000000000000..b4cfd67e27cc18f5e7b199476a4083e6f4ff8a56 GIT binary patch literal 408 zcmXqLFlboNAjN8!mQn1;z_75vpkWn6h||d@-ap7C-qQsnyr6-JkpUSjLI}XkNH5AH zWCn^MSj;favoLW5S%}9SOa$CvoKl|dM8FJYLS|%SrxEfC3js3>bIUCw2)Kh4hZz7A CiChZ+ literal 0 HcmV?d00001 diff --git a/modules/bindings/tests/testfiles/seqdb.pin b/modules/bindings/tests/testfiles/seqdb.pin new file mode 100644 index 0000000000000000000000000000000000000000..1f6f5be753acd11950e085c4971e9fe0ea7d78e2 GIT binary patch literal 120 zcmZQzU|?ZjU|?imU|=myElf#bU|^7TOHER+&`~flFf>$9u&^>RRB-fVfCIM2tS~l{ z4wU7>z`)SKz`$^Zfq_wgfq^lEfq`)b)Z8Kl21Yjq2FA4v3{3AC7?}4mFtEM?05{GK AC;$Ke literal 0 HcmV?d00001 diff --git a/modules/bindings/tests/testfiles/seqdb.psq b/modules/bindings/tests/testfiles/seqdb.psq new file mode 100644 index 0000000000000000000000000000000000000000..b55326094e5594efaa5f3a695ece541dd46669ee GIT binary patch literal 1514 zcmZQz=VayK;^W{GVPj+G;^Y%x<YHuH<>40O;uqjxVP|CH<`iHRWEEuOV`pI#<Q5d> zVq@oJ5$5M$;}Yf;U}0xu=MrP%77-TaV&fI$=4Is;W)b5M5EK#=78Ky+=jUYO7GU9I z<rn7T=jRg=6Xs`R;9+ItWaJWKVdG*HVq{@q<rZY)5@2WH<mM9Q=M@p-Vr1p!W#?jH zXXE7K;uWK_>$uri*jV`ZSy|Xwx!KtS1lU>FI5~uQSa=0l_;^J`xY!w)IoSDy*m>Ca z1bD@Sh4_Sd1$c!Sc<ACx1_2HpAptQKL2hn-Ms9XqAwF(lc5Z$FZb3c)5f&D90Ty;X z4q*Xq5k@X<Ru%zXJ|T8NPA(P!K1N{y9!^$%Mpi}^c5ZF~K6XYSPHt8{AwhOtA$Cqd zZblwPMgeXvb}<2NPG(^qK2}~KL19(_0e)_7R&FtNMt)9CHbF*4ULi(dMj=KnR&GW{ zAznT~5n&!~F@7<2UOq-%Ar?UnF=lolc0N{aZblYvK2}a44mM#%c5ZedA$B2d9(Eyi z0XA*{Rt{lCUSTdyVMaz)J^?XyR(4@AMs5LK9u`)1Heo(iZZQ@<R!(jqMgcKFVRlAg z7DizK0U-faZeb1~R(4J{Ms`j?A$CS#ZVpC9ZgydIMs`MSMn-m40Y(;1Ms8tYMixeP z5iT)qkRmZgc0q0y7DiSfMs8Mier^#)20=k#HcmD^J}y2k7GZ87P9bg~J}zM)W?^Ar z7EVDf78X8MVSYA25jJ5K7GYLl4k1Q<A$~pqR#tvCUO^!?0YPCwR&G8{9u9sM5l}K^ zW#eKN5atzRWMt!G<l|)J65<r(65?VP;NxQC;};ea;^gDw<6`CKVdP@r5*FkZV&>r# zVqs(BV`bqK5@r?Q7vW{+=VN5$731O+;^*V!VG&|s7ZhU?78B&+<`w4W666(OWn||S zViRH(6z1k&W#{JL<=_xv<r3r;WMJoF<QC=>W)<XP<YMF!;^gHL<7O9R5fc{X5)>3> z=in4z;pP)!6<}xN7UmV;VHalM6X5_E$RWlm%+1Hn#mXfN@(>q4HzO+-BMS$o7&oI3 z3pa}ZD?2Ecig0iXunKdsv$L>(;zK}Kn2nc}Q;3U~kyC)5k&Ty&kDZm9my<<6n3s{C zpHV=7jfX`*SeRXmM_7=Ln_Gm7i<41+om-HRT|k&$P?%9zfLmCYRe)81m6MAR6x*z9 zBCNtJjDkW!LVSYktc=`@-2B2q9Q?d2TzqT-e8Rk3to%a!jEus9+}x~OAU6nevkI`X zaPbKV33BnVGxD)<3JD5v^Yb$@3JM8uv$L@ab8!jr@bED53vi3EGO~+t3W@Ou2(btX z@p23DvoH$t@(2sCi*X9F@UpRUafxyBu?q6>@{2JFurhKA@CmXA^9u7YvI=r@vGA}m zvM};<af|VDbBYM?vk5aY^YaMtfyypHAt7!dP~u}@6X51z6kr$N<Q8CKVHXkN66WD! z1m!+XZazj~UM_YPMs6W)VODM-K1O~HHX$xPK1KmHK~_dVZecD~ZdNV<c20HyK0zTd zMnO(aK_O0l76EoKVF6)QUI76%F-BG{9(GnvVIdX)ZdO)aHUU;{9$qmPHX%lS9#&pn wZWexFMqUn9Mph9nMqzFi4mLJH7B(S%er^^nMpiBsK2}zCb{+vfUVa`101J>oi2wiq literal 0 HcmV?d00001 -- GitLab