diff --git a/modules/bindings/pymod/CMakeLists.txt b/modules/bindings/pymod/CMakeLists.txt index 8cd78a87cdf3491c6839bdbb33e0e76d86dc91dc..52c855f25e792401fde6f6befa48ca443009f91f 100644 --- a/modules/bindings/pymod/CMakeLists.txt +++ b/modules/bindings/pymod/CMakeLists.txt @@ -10,5 +10,6 @@ utils.py naccess.py blast.py cadscore.py +kclust.py ) pymod(NAME bindings PY ${OST_BINDINGS}) diff --git a/modules/bindings/pymod/kclust.py b/modules/bindings/pymod/kclust.py new file mode 100644 index 0000000000000000000000000000000000000000..944c73701f248688bf02afb3be9fa87146aae0be --- /dev/null +++ b/modules/bindings/pymod/kclust.py @@ -0,0 +1,148 @@ +import subprocess, os, tempfile +from ost import io, seq, settings + +""" +Wrapper for kClust a protein sequence clustering algorithm + +unpublished, but mentioned in: + +Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding +HHblits: lighting-fast iterative protein sequence searching by +HMM-HMM alignment +Nature Methods 9, 173-175 (2012) + +""" + +class cluster: + + """ + Holds the information of one cluster + + .. attribute:: sequences + + SequenceList containing all sequences of the cluster + + .. attribute:: representative_id + + a string, that contains the name of the representative sequence of the cluster + as estimated by kClust. + + .. attribute:: alignment + + alignment handle containing a multiple sequence alignment of all sequences of + the cluster. Gets only calculated when binding is called with appropriate flag. + """ + + def __init__(self, sequences, representative_id): + + + self.sequences=sequences + self.representative_id=representative_id + self.alignment=None + +def _SetupFiles(sequences): + + # create temporary directory + tmp_dir_name=tempfile.mkdtemp() + io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta')) + return tmp_dir_name + +def _CleanupFiles(tmp_dir_name): + + import shutil + shutil.rmtree(tmp_dir_name) + +def _ParseOutput(tmp_dir_name): + + header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines() + cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines() + sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta')) + + clusters=dict() + header_mapper=dict() + for line in header_data: + header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>') + + #find numeric ids of the representatives of the clusters + unique_representatives=list() + for line in cluster_data[1:]: + actual_cluster=int(line.split()[1]) + try: + unique_representatives.index(actual_cluster) + except: + unique_representatives.append(actual_cluster) + + #assign every header to its corresponding cluster, where the + #cluster id is given by the id of the representative of the cluster + for idx in unique_representatives: + clusters[idx]=seq.CreateSequenceList() + for line in cluster_data[1:]: + clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])])) + + #translate into final output + + res=list() + for k, v in clusters.iteritems(): + res.append(cluster(v, header_mapper[k])) + + return res + +def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments): + + bitscore=clustering_thresh*0.060269-0.68498 + + executable=settings.Locate('kClust') + + cmd=[] + cmd.append('kClust') + cmd.append('-i') + cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta')) + cmd.append('-d') + cmd.append(tmp_dir_name) + cmd.append('-s') + cmd.append(str(bitscore)) + + cmd=' '.join(cmd) + ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + ps.stdout.readlines() + + result=_ParseOutput(tmp_dir_name) + + if(create_alignments): + from ost.bindings import clustalw + for c in result: + if len(c.sequences)>1: + c.alignment=clustalw.ClustalW(c.sequences) + else: + aln=seq.CreateAlignment() + aln.AddSequence(c.sequences[0]) + c.alignment=aln + + return result + +def kClust(sequences, clustering_thresh=30, create_alignments=False): + + """ + Uses kClust to generate clusters of amino acid sequences. + + :param sequences: All sequences you want to cluster. + :type sequences: :class:`ost.seq.SequenceList` + + :param clustering_thres: Sequence identity threshold to build clusters. Note, + that clustering_thresh is more a rule of thumb, since + compositional bias in the sequence can also play a role. + The value gets transformed in a bitscore, that is used + as an input parameter of kClust + + :param create_alignments: Flag, wether the alignments of the clusters get calculated. + Requires clustalw in the path. + + :returns: A list of cluster instances + + :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located. + """ + + tmp_dir_name=_SetupFiles(sequences) + result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments) + _CleanupFiles(tmp_dir_name) + return result