From 0a49556bba1e804d159e23a0d173f87388ebd28d Mon Sep 17 00:00:00 2001
From: Marco Biasini <marco.biasini@unibas.ch>
Date: Tue, 8 Feb 2011 09:08:09 +0100
Subject: [PATCH] added bindings for blast

---
 modules/bindings/pymod/CMakeLists.txt        |  15 ++-
 modules/bindings/pymod/blast.py              | 122 +++++++++++++++++++
 modules/bindings/tests/CMakeLists.txt        |   1 +
 modules/bindings/tests/test_blast.py         |  50 ++++++++
 modules/bindings/tests/testfiles/seqdb.fasta |  41 +++++++
 modules/bindings/tests/testfiles/seqdb.phr   | Bin 0 -> 408 bytes
 modules/bindings/tests/testfiles/seqdb.pin   | Bin 0 -> 120 bytes
 modules/bindings/tests/testfiles/seqdb.psq   | Bin 0 -> 1514 bytes
 8 files changed, 227 insertions(+), 2 deletions(-)
 create mode 100644 modules/bindings/pymod/blast.py
 create mode 100644 modules/bindings/tests/test_blast.py
 create mode 100644 modules/bindings/tests/testfiles/seqdb.fasta
 create mode 100644 modules/bindings/tests/testfiles/seqdb.phr
 create mode 100644 modules/bindings/tests/testfiles/seqdb.pin
 create mode 100644 modules/bindings/tests/testfiles/seqdb.psq

diff --git a/modules/bindings/pymod/CMakeLists.txt b/modules/bindings/pymod/CMakeLists.txt
index f91dc53d8..89244b23f 100644
--- a/modules/bindings/pymod/CMakeLists.txt
+++ b/modules/bindings/pymod/CMakeLists.txt
@@ -1,2 +1,13 @@
-pymod(NAME bindings PY __init__.py lga.py hbplus.py msms.py tmtools.py 
-                       dssp.py clustalw.py utils.py naccess.py)
+set(OST_BINDINGS
+__init__.py
+lga.py
+hbplus.py
+msms.py
+tmtools.py 
+dssp.py
+clustalw.py
+utils.py
+naccess.py
+blast.py
+)
+pymod(NAME bindings PY ${OST_BINDINGS})
diff --git a/modules/bindings/pymod/blast.py b/modules/bindings/pymod/blast.py
new file mode 100644
index 000000000..32e36e3bf
--- /dev/null
+++ b/modules/bindings/pymod/blast.py
@@ -0,0 +1,122 @@
+from ost import settings
+import subprocess
+from xml.dom import minidom
+from ost import io, seq
+import ost
+import re
+import os
+class AlignedPatch:
+  def __init__(self, aln, bit_score, score, evalue):
+    self.aln=aln
+    self.bit_score=bit_score
+    self.score=score
+    self.evalue=evalue
+
+class BlastHit:
+  """
+  A positive match found by BLAST.
+  
+  Each blast hit consists of one or more aligned patches. 
+  
+  """
+  def __init__(self, identifier, aligned_patches):
+    self.identifier=identifier
+    self.aligned_patches=aligned_patches
+
+def ParseBlastOutput(string):
+  """
+  Parses the blast output and returns a list of BlastHits
+  """
+  def _GetText(node):
+    rc=''
+    for child in node.childNodes:
+        if child.nodeType==child.TEXT_NODE:
+            rc+=child.data
+    return rc
+      
+  def _GetValue(node, tag_name):
+    tag=node.getElementsByTagName(tag_name)
+    assert len(tag)==1
+    return _GetText(tag[0])
+    
+  def _GetInt(node, tag_name):
+    return int(_GetValue(node, tag_name))
+
+  def _ParseHsp(query_id, hit_id, hsp):
+    bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
+    score=float(_GetValue(hsp, 'Hsp_score'))
+    evalue=float(_GetValue(hsp, 'Hsp_evalue'))
+    query_offset=_GetInt(hsp, 'Hsp_query-from')-1
+    hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
+    query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
+    query_seq.offset=query_offset
+    hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
+    hit_seq.offset=hit_offset
+    aln=seq.CreateAlignment(query_seq, hit_seq)
+    return AlignedPatch(aln, bit_score, score, evalue)
+  try:
+    doc=minidom.parseString(string)
+  except Exception, e:
+    ost.LogError('Error while parsing BLAST output: %s' % str(e))
+    return None
+  hits=[]
+  query_id=_GetValue(doc, 'BlastOutput_query-def')
+  for hit in doc.getElementsByTagName('Hit'):
+    hit_id=_GetValue(hit, 'Hit_def')
+    hsps=hit.getElementsByTagName('Hsp')
+    aligned_patches=[_ParseHsp(query_id, hit_id, hsp) for hsp in hsps]
+    hits.append(BlastHit(hit_id, aligned_patches))
+  return hits
+
+
+
+class BlastError(RuntimeError):
+  def __init__(self, brief, details):
+    self.brief=brief
+    self.details=details
+
+  def __str__(self):
+    if self.details:
+      return '%s\n%s' % (self.brief, self.details)
+    else:
+      return self.brief
+
+def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
+         blast_location=None):
+  """
+  Runs a protein vs. protein blast search. The results are returned as a 
+  list of :class:`BlastHit` instances.
+  
+  :param query: the query sequence
+  :type query: :class:`seq.ConstSequenceHandle`
+  
+  :param database: The filename of the sequence database. Make sure that 
+      formatdb has been run on the database and the <database>.pin file exists.
+  :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
+     'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
+  :param gap_open: Gap opening penalty. Note that only a subset of gap opening 
+     penalties is supported for each substitutition matrix. Consult the blast 
+     docs for more information.
+  :param gap_ext: Gap extension penalty. Only a subset of gap extension 
+     penalties are supported for each of the substitution matrices. Consult the 
+     blast docs for more information.
+  """
+  subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
+  if matrix not in subst_mats:
+    raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
+  if not os.path.exists('%s.pin' % database):
+    raise IOError("Database %s does not exist" % database)
+  blastall_exe=settings.Locate('blastall', explicit_file_name=blast_location)
+  args=[blastall_exe, '-d', database, '-p', 'blastp', 
+        '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
+  ost.LogInfo('running BLAST (%s)' % ' '.join(args))
+  blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE, 
+                              stdout=subprocess.PIPE, stdin=subprocess.PIPE)
+  stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta'))
+  if len(stderr)>0:
+     pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
+     lines=stderr.split('\n')
+     error_message=pattern.match(lines[0])
+     if error_message:
+       raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
+  return ParseBlastOutput(stdout)
diff --git a/modules/bindings/tests/CMakeLists.txt b/modules/bindings/tests/CMakeLists.txt
index 5a3f44e3d..a93d8aeaa 100644
--- a/modules/bindings/tests/CMakeLists.txt
+++ b/modules/bindings/tests/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(OST_BINDINGS_UNIT_TESTS
   test_msms.py
   test_clustalw.py
+  test_blast.py
 )
 
 ost_unittest(bindings "${OST_BINDINGS_UNIT_TESTS}")
diff --git a/modules/bindings/tests/test_blast.py b/modules/bindings/tests/test_blast.py
new file mode 100644
index 000000000..2b42bfd18
--- /dev/null
+++ b/modules/bindings/tests/test_blast.py
@@ -0,0 +1,50 @@
+import unittest
+from ost import *
+from ost import settings
+from ost.bindings import blast
+
+class TestBlastBindings(unittest.TestCase):
+  def setUp(self):
+    self.query=seq.CreateSequence('A', 'MKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPG'+
+                                       'DFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELV'+
+                                       'DYHRSTSVSRNQQIFLRDIEQVP')
+  def testAllowedMatricesAndPenalties(self):
+    self.assertRaises(blast.BlastError, blast.Blast, self.query, 
+                      'testfiles/seqdb', matrix='BLOSUM45')
+    blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM45', 
+                gap_open=13, gap_ext=3)
+    blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM80')
+    blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM70')
+    blast.Blast(self.query, 'testfiles/seqdb', matrix='PAM30', gap_open=7,
+                gap_ext=2)
+    blast.Blast(self.query, 'testfiles/seqdb', matrix='BLOSUM62')
+    self.assertRaises(ValueError, blast.Blast, self.query, 'testfiles/seqdb', 
+                      matrix='MUAHA')
+  def testMissingDB(self):
+    self.assertRaises(IOError, blast.Blast, self.query, 
+                      'testfiles/nonexistentdb')
+  
+  def testParseBlastOutput(self):
+    hits=blast.Blast(self.query, 'testfiles/seqdb')
+    expected_output=[{'evalue':2.366130E-59,'bitscore':211.460,'score':537}, 
+                     {'evalue':4.808930E-59,'bitscore':210.305,'score':534}, 
+                     {'evalue':5.361450E-58,'bitscore':206.838,'score':525}, 
+                     {'evalue':3.277500E+00,'bitscore':15.0086,'score':27}, 
+                     {'evalue':9.696520E+00,'bitscore':13.4678,'score':23}]
+    self.assertEqual(len(hits), 4)
+    for expected, hit in zip(expected_output, hits):
+      patch=hit.aligned_patches[0]
+      self.assertAlmostEqual(patch.evalue, expected['evalue'])
+      self.assertAlmostEqual(patch.bit_score, expected['bitscore'])
+      self.assertAlmostEqual(patch.score, expected['score'])
+if __name__ == "__main__":
+  # test if blast package is available on system, otherwise ignore tests
+  try:
+    blastpath=settings.Locate('blastall')
+  except(settings.FileNotFound):
+    print "Could not find blastall executable: ignoring unit tests"
+    exit(0)
+  try:
+    unittest.main()
+  except Exception, e:
+    print e
\ No newline at end of file
diff --git a/modules/bindings/tests/testfiles/seqdb.fasta b/modules/bindings/tests/testfiles/seqdb.fasta
new file mode 100644
index 000000000..5e665b80f
--- /dev/null
+++ b/modules/bindings/tests/testfiles/seqdb.fasta
@@ -0,0 +1,41 @@
+>1fhsA
+GIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF
+GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI
+EQVPQQPTYVQA
+>1griA
+MEAIAKYDFKATADDELSFKRGDILKVQNWYKAELNGKDGFIPKNYIEMK
+PHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKFGNDVQ
+HFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQ
+QPTYVQALFDFDPQEDGELGFRRGDFIHVMDNSDPNWWKGACHGQTGMFP
+RNYVTPVNRNV
+>3n84E
+MIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKF
+GNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDI
+EQ
+>3dwgB
+RHMTRYDSLLQALGNTPLVGLQRLSPRWDDGRDGPHVRLWAKLEDRNPTG
+SIKDRPAVRMIEQAEADGLLRPGATILEPTSGNTGISLAMAARLKGYRLI
+CVMPENTSVERRQLLELYGAQIIFSAANTAVATAKELAATNPSWVMLYQY
+GNPANTDSHYCGTGPELLADLPEITHFVAGLGTTGTLMGTGRFLREHVAN
+VKIVAAEPRYGEGVYALRNMDEGFVPELYDPEILTARYSVGAVDAVRRTR
+ELVHTEGIFAGISTGAVLHAALGVGAGALAAGERADIALVVADAGWKYLS
+TGAYAGSLDDAETALEGQLWA
+>3hkfA
+SSVFIFPPKPKDVLTITLTPKVTCVVVDISKDDPEVQFSWFVDDVEVHTA
+QTQPREEQFNSTFRSVSELPIMHQDWLNGKEFKCRVNSAAFPAPIEKTIS
+KTKGRPKAPQVYTIPPPKEQMAKDKVSLTCMITDFFPEDITVEWQWNGQP
+AENYKNTQPIMDTDGSYFVYSKLNVQKSNWEAGNTFTCSVLHEGLHNHHT
+EKSLS
+>1mw9X
+GKALVIVESPAKAKTINKYLGSDYVVKSSVGHIRDLPTERGALVNRMGVD
+PWHNWEAHYEVLPGKEKVVSELKQLAEKADHIYLATDLDREGEAIAWHLR
+EVIGGDDARYSRVVFNEITKNAIRQAFNKPGELNIDRVNAQQARRFMDRV
+VGYMVSPLLWKKIARGLSAGRVQSVAVRLVVEREREIKAFVPEEFWEVDA
+STTTPSGEALALQVTHQNDKPFRPVNKEQTQAAVSLLEKARYSVLEREDK
+PTTSKPGAPFITSTLQQAASTRLGFGVKKTMMMAQRLYEAGYITYMRTDS
+TNLSQDAVNMVRGYISDNFGKKYLPESPNQYAREAIRPSDVNVMAESLKD
+MEADAQKLYQLIWRQFVACQMTPAKYDSTTLTVGAGDFRLKARGRILRFD
+GWTKVMPALEDRILPAVNKGDALTLVELTPAQHFTKPPARFSEASLVKEL
+EKRGIGRPSTYASIISTIQDRGYVRVENRRFYAEKMGEIVTDRLEENFRE
+LMNYDFTAQMENNLDQVANHEAEWKAVLDHFFSDFTQQLDKAEKDPEEGG
+MRPNQM
\ No newline at end of file
diff --git a/modules/bindings/tests/testfiles/seqdb.phr b/modules/bindings/tests/testfiles/seqdb.phr
new file mode 100644
index 0000000000000000000000000000000000000000..b4cfd67e27cc18f5e7b199476a4083e6f4ff8a56
GIT binary patch
literal 408
zcmXqLFlboNAjN8!mQn1;z_75vpkWn6h||d@-ap7C-qQsnyr6-JkpUSjLI}XkNH5AH
zWCn^MSj;favoLW5S%}9SOa$CvoKl|dM8FJYLS|%SrxEfC3js3>bIUCw2)Kh4hZz7A
CiChZ+

literal 0
HcmV?d00001

diff --git a/modules/bindings/tests/testfiles/seqdb.pin b/modules/bindings/tests/testfiles/seqdb.pin
new file mode 100644
index 0000000000000000000000000000000000000000..1f6f5be753acd11950e085c4971e9fe0ea7d78e2
GIT binary patch
literal 120
zcmZQzU|?ZjU|?imU|=myElf#bU|^7TOHER+&`~flFf>$9u&^>RRB-fVfCIM2tS~l{
z4wU7>z`)SKz`$^Zfq_wgfq^lEfq`)b)Z8Kl21Yjq2FA4v3{3AC7?}4mFtEM?05{GK
AC;$Ke

literal 0
HcmV?d00001

diff --git a/modules/bindings/tests/testfiles/seqdb.psq b/modules/bindings/tests/testfiles/seqdb.psq
new file mode 100644
index 0000000000000000000000000000000000000000..b55326094e5594efaa5f3a695ece541dd46669ee
GIT binary patch
literal 1514
zcmZQz=VayK;^W{GVPj+G;^Y%x<YHuH<>40O;uqjxVP|CH<`iHRWEEuOV`pI#<Q5d>
zVq@oJ5$5M$;}Yf;U}0xu=MrP%77-TaV&fI$=4Is;W)b5M5EK#=78Ky+=jUYO7GU9I
z<rn7T=jRg=6Xs`R;9+ItWaJWKVdG*HVq{@q<rZY)5@2WH<mM9Q=M@p-Vr1p!W#?jH
zXXE7K;uWK_>$uri*jV`ZSy|Xwx!KtS1lU>FI5~uQSa=0l_;^J`xY!w)IoSDy*m>Ca
z1bD@Sh4_Sd1$c!Sc<ACx1_2HpAptQKL2hn-Ms9XqAwF(lc5Z$FZb3c)5f&D90Ty;X
z4q*Xq5k@X<Ru%zXJ|T8NPA(P!K1N{y9!^$%Mpi}^c5ZF~K6XYSPHt8{AwhOtA$Cqd
zZblwPMgeXvb}<2NPG(^qK2}~KL19(_0e)_7R&FtNMt)9CHbF*4ULi(dMj=KnR&GW{
zAznT~5n&!~F@7<2UOq-%Ar?UnF=lolc0N{aZblYvK2}a44mM#%c5ZedA$B2d9(Eyi
z0XA*{Rt{lCUSTdyVMaz)J^?XyR(4@AMs5LK9u`)1Heo(iZZQ@<R!(jqMgcKFVRlAg
z7DizK0U-faZeb1~R(4J{Ms`j?A$CS#ZVpC9ZgydIMs`MSMn-m40Y(;1Ms8tYMixeP
z5iT)qkRmZgc0q0y7DiSfMs8Mier^#)20=k#HcmD^J}y2k7GZ87P9bg~J}zM)W?^Ar
z7EVDf78X8MVSYA25jJ5K7GYLl4k1Q<A$~pqR#tvCUO^!?0YPCwR&G8{9u9sM5l}K^
zW#eKN5atzRWMt!G<l|)J65<r(65?VP;NxQC;};ea;^gDw<6`CKVdP@r5*FkZV&>r#
zVqs(BV`bqK5@r?Q7vW{+=VN5$731O+;^*V!VG&|s7ZhU?78B&+<`w4W666(OWn||S
zViRH(6z1k&W#{JL<=_xv<r3r;WMJoF<QC=>W)<XP<YMF!;^gHL<7O9R5fc{X5)>3>
z=in4z;pP)!6<}xN7UmV;VHalM6X5_E$RWlm%+1Hn#mXfN@(>q4HzO+-BMS$o7&oI3
z3pa}ZD?2Ecig0iXunKdsv$L>(;zK}Kn2nc}Q;3U~kyC)5k&Ty&kDZm9my<<6n3s{C
zpHV=7jfX`*SeRXmM_7=Ln_Gm7i<41+om-HRT|k&$P?%9zfLmCYRe)81m6MAR6x*z9
zBCNtJjDkW!LVSYktc=`@-2B2q9Q?d2TzqT-e8Rk3to%a!jEus9+}x~OAU6nevkI`X
zaPbKV33BnVGxD)<3JD5v^Yb$@3JM8uv$L@ab8!jr@bED53vi3EGO~+t3W@Ou2(btX
z@p23DvoH$t@(2sCi*X9F@UpRUafxyBu?q6>@{2JFurhKA@CmXA^9u7YvI=r@vGA}m
zvM};<af|VDbBYM?vk5aY^YaMtfyypHAt7!dP~u}@6X51z6kr$N<Q8CKVHXkN66WD!
z1m!+XZazj~UM_YPMs6W)VODM-K1O~HHX$xPK1KmHK~_dVZecD~ZdNV<c20HyK0zTd
zMnO(aK_O0l76EoKVF6)QUI76%F-BG{9(GnvVIdX)ZdO)aHUU;{9$qmPHX%lS9#&pn
wZWexFMqUn9Mph9nMqzFi4mLJH7B(S%er^^nMpiBsK2}zCb{+vfUVa`101J>oi2wiq

literal 0
HcmV?d00001

-- 
GitLab