binging for protein sequence clustering program kclust. No unit tests

yet.

binging for protein sequence clustering program kclust. No unit tests
9d4c35ba · Gabriel Studer · b7c9d184 · 9d4c35ba · 9d4c35ba
Commit 9d4c35ba authored 12 years ago by Gabriel Studer
--- a/modules/bindings/pymod/CMakeLists.txt
+++ b/modules/bindings/pymod/CMakeLists.txt
@@ -10,5 +10,6 @@ utils.py
 naccess.py
 blast.py
 cadscore.py
+kclust.py
 )
 pymod(NAME bindings PY ${OST_BINDINGS})
--- a/modules/bindings/pymod/kclust.py
+++ b/modules/bindings/pymod/kclust.py
+import subprocess, os, tempfile
+from ost import io, seq, settings
+"""
+Wrapper for kClust a protein sequence clustering algorithm
+unpublished, but mentioned in:
+Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
+HHblits: lighting-fast iterative protein sequence searching by
+HMM-HMM alignment
+Nature Methods 9, 173-175 (2012)
+"""
+class cluster:
+  """
+  Holds the information of one cluster
+  .. attribute:: sequences
+    SequenceList containing all sequences of the cluster
+  .. attribute:: representative_id
+    a string, that contains the name of the representative sequence of the cluster
+    as estimated by kClust. 
+  .. attribute:: alignment
+    alignment handle containing a multiple sequence alignment of all sequences of
+    the cluster. Gets only calculated when binding is called with appropriate flag.
+  """
+  def __init__(self, sequences, representative_id):
+    self.sequences=sequences
+    self.representative_id=representative_id
+    self.alignment=None
+def _SetupFiles(sequences):
+  # create temporary directory
+  tmp_dir_name=tempfile.mkdtemp()
+  io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
+  return tmp_dir_name
+def _CleanupFiles(tmp_dir_name):
+  import shutil
+  shutil.rmtree(tmp_dir_name)
+def _ParseOutput(tmp_dir_name):
+  header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines()
+  cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines()
+  sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
+  clusters=dict()
+  header_mapper=dict()
+  for line in header_data:
+    header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
+  #find numeric ids of the representatives of the clusters
+  unique_representatives=list()
+  for line in cluster_data[1:]:
+    actual_cluster=int(line.split()[1])
+    try:
+      unique_representatives.index(actual_cluster)
+    except:
+      unique_representatives.append(actual_cluster)
+  #assign every header to its corresponding cluster, where the
+  #cluster id is given by the id of the representative of the cluster
+  for idx in unique_representatives:
+    clusters[idx]=seq.CreateSequenceList()
+  for line in cluster_data[1:]:
+    clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
+  #translate into final output
+  res=list()
+  for k, v in clusters.iteritems():
+    res.append(cluster(v, header_mapper[k]))
+  return res
+def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
+  bitscore=clustering_thresh*0.060269-0.68498
+  executable=settings.Locate('kClust') 
+  cmd=[]
+  cmd.append('kClust')
+  cmd.append('-i')
+  cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
+  cmd.append('-d')
+  cmd.append(tmp_dir_name)
+  cmd.append('-s')
+  cmd.append(str(bitscore))
+  cmd=' '.join(cmd)
+  ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+  ps.stdout.readlines()
+  result=_ParseOutput(tmp_dir_name)
+  if(create_alignments):
+    from ost.bindings import clustalw
+    for c in result:
+      if len(c.sequences)>1:
+        c.alignment=clustalw.ClustalW(c.sequences)
+      else:
+        aln=seq.CreateAlignment()
+        aln.AddSequence(c.sequences[0])
+        c.alignment=aln
+  return result
+def kClust(sequences, clustering_thresh=30, create_alignments=False):
+  """
+  Uses kClust to generate clusters of amino acid sequences.
+  :param sequences: All sequences you want to cluster. 
+  :type sequences: :class:`ost.seq.SequenceList`
+  :param clustering_thres: Sequence identity threshold to build clusters. Note,
+                           that clustering_thresh is more a rule of thumb, since
+                           compositional bias in the sequence can also play a role.
+                           The value gets transformed in a bitscore, that is used
+                           as an input parameter of kClust 
+  :param create_alignments: Flag, wether the alignments of the clusters get calculated.
+                            Requires clustalw in the path.
+  :returns: A list of cluster instances
+  :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
+  """
+  tmp_dir_name=_SetupFiles(sequences)
+  result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
+  _CleanupFiles(tmp_dir_name)
+  return result