Skip to content
Snippets Groups Projects
Commit 9d4c35ba authored by Gabriel Studer's avatar Gabriel Studer
Browse files

binging for protein sequence clustering program kclust. No unit tests

yet.
parent b7c9d184
No related branches found
No related tags found
No related merge requests found
......@@ -10,5 +10,6 @@ utils.py
naccess.py
blast.py
cadscore.py
kclust.py
)
pymod(NAME bindings PY ${OST_BINDINGS})
import subprocess, os, tempfile
from ost import io, seq, settings
"""
Wrapper for kClust a protein sequence clustering algorithm
unpublished, but mentioned in:
Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
HHblits: lighting-fast iterative protein sequence searching by
HMM-HMM alignment
Nature Methods 9, 173-175 (2012)
"""
class cluster:
"""
Holds the information of one cluster
.. attribute:: sequences
SequenceList containing all sequences of the cluster
.. attribute:: representative_id
a string, that contains the name of the representative sequence of the cluster
as estimated by kClust.
.. attribute:: alignment
alignment handle containing a multiple sequence alignment of all sequences of
the cluster. Gets only calculated when binding is called with appropriate flag.
"""
def __init__(self, sequences, representative_id):
self.sequences=sequences
self.representative_id=representative_id
self.alignment=None
def _SetupFiles(sequences):
# create temporary directory
tmp_dir_name=tempfile.mkdtemp()
io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
return tmp_dir_name
def _CleanupFiles(tmp_dir_name):
import shutil
shutil.rmtree(tmp_dir_name)
def _ParseOutput(tmp_dir_name):
header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines()
cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines()
sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
clusters=dict()
header_mapper=dict()
for line in header_data:
header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
#find numeric ids of the representatives of the clusters
unique_representatives=list()
for line in cluster_data[1:]:
actual_cluster=int(line.split()[1])
try:
unique_representatives.index(actual_cluster)
except:
unique_representatives.append(actual_cluster)
#assign every header to its corresponding cluster, where the
#cluster id is given by the id of the representative of the cluster
for idx in unique_representatives:
clusters[idx]=seq.CreateSequenceList()
for line in cluster_data[1:]:
clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
#translate into final output
res=list()
for k, v in clusters.iteritems():
res.append(cluster(v, header_mapper[k]))
return res
def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
bitscore=clustering_thresh*0.060269-0.68498
executable=settings.Locate('kClust')
cmd=[]
cmd.append('kClust')
cmd.append('-i')
cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
cmd.append('-d')
cmd.append(tmp_dir_name)
cmd.append('-s')
cmd.append(str(bitscore))
cmd=' '.join(cmd)
ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
ps.stdout.readlines()
result=_ParseOutput(tmp_dir_name)
if(create_alignments):
from ost.bindings import clustalw
for c in result:
if len(c.sequences)>1:
c.alignment=clustalw.ClustalW(c.sequences)
else:
aln=seq.CreateAlignment()
aln.AddSequence(c.sequences[0])
c.alignment=aln
return result
def kClust(sequences, clustering_thresh=30, create_alignments=False):
"""
Uses kClust to generate clusters of amino acid sequences.
:param sequences: All sequences you want to cluster.
:type sequences: :class:`ost.seq.SequenceList`
:param clustering_thres: Sequence identity threshold to build clusters. Note,
that clustering_thresh is more a rule of thumb, since
compositional bias in the sequence can also play a role.
The value gets transformed in a bitscore, that is used
as an input parameter of kClust
:param create_alignments: Flag, wether the alignments of the clusters get calculated.
Requires clustalw in the path.
:returns: A list of cluster instances
:raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
"""
tmp_dir_name=_SetupFiles(sequences)
result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
_CleanupFiles(tmp_dir_name)
return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment