Skip to content
Snippets Groups Projects
Select Git revision
  • 5d0894fa41b3b4c49d52358af1208e8b0f60c1fe
  • master default protected
  • develop protected
  • cmake_boost_refactor
  • ubuntu_ci
  • mmtf
  • non-orthogonal-maps
  • no_boost_filesystem
  • data_viewer
  • 2.11.1
  • 2.11.0
  • 2.10.0
  • 2.9.3
  • 2.9.2
  • 2.9.1
  • 2.9.0
  • 2.8.0
  • 2.7.0
  • 2.6.1
  • 2.6.0
  • 2.6.0-rc4
  • 2.6.0-rc3
  • 2.6.0-rc2
  • 2.6.0-rc
  • 2.5.0
  • 2.5.0-rc2
  • 2.5.0-rc
  • 2.4.0
  • 2.4.0-rc2
29 results

blast.py

Blame
  • kclust.py 4.36 KiB
    import subprocess, os, tempfile
    from ost import io, seq, settings
    
    """
    Wrapper for kClust a protein sequence clustering algorithm
    
    unpublished, but mentioned in:
    
    Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
    HHblits: lighting-fast iterative protein sequence searching by
    HMM-HMM alignment
    Nature Methods 9, 173-175 (2012)
    
    source code can be downloaded from:
    
    ftp://toolkit.genzentrum.lmu.de/pub/
    """
    
    class cluster:
    
      """
      Holds the information of one cluster
      
      .. attribute:: sequences
        
        SequenceList containing all sequences of the cluster
      
      .. attribute:: representative_id
      
        a string, that contains the name of the representative sequence of the cluster
        as estimated by kClust. 
        
      .. attribute:: alignment
    
        alignment handle containing a multiple sequence alignment of all sequences of
        the cluster. Gets only calculated when binding is called with appropriate flag.
      """
    
      def __init__(self, sequences, representative_id):
    
        self.sequences=sequences
        self.representative_id=representative_id
        self.alignment=None
    
    def _SetupFiles(sequences):
    
      # create temporary directory
      tmp_dir_name=tempfile.mkdtemp()
      io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
      return tmp_dir_name
    
    def _CleanupFiles(tmp_dir_name):
    
      import shutil
      shutil.rmtree(tmp_dir_name)
    
    def _ParseOutput(tmp_dir_name):
    
      header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines()
      cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines()
      sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
    
      clusters=dict()
      header_mapper=dict()
      for line in header_data:
        header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
    
      #find numeric ids of the representatives of the clusters
      unique_representatives=list()
      for line in cluster_data[1:]:
        actual_cluster=int(line.split()[1])
        try:
          unique_representatives.index(actual_cluster)
        except:
          unique_representatives.append(actual_cluster)
    
      #assign every header to its corresponding cluster, where the
      #cluster id is given by the id of the representative of the cluster
      for idx in unique_representatives:
        clusters[idx]=seq.CreateSequenceList()
      for line in cluster_data[1:]:
        clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
    
      #translate into final output
    
      res=list()
      for k, v in clusters.iteritems():
        res.append(cluster(v, header_mapper[k]))
    
      return res
      
    def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
    
      bitscore=clustering_thresh*0.060269-0.68498
    
      executable=settings.Locate('kClust') 
    
      cmd=[]
      cmd.append(executable)
      cmd.append('-i')
      cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
      cmd.append('-d')
      cmd.append(tmp_dir_name)
      cmd.append('-s')
      cmd.append(str(bitscore))
    
      cmd=' '.join(cmd)
      ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      stdout, stderr = ps.communicate()
    
      result=_ParseOutput(tmp_dir_name)
    
      if(create_alignments):
        from ost.bindings import clustalw
        for c in result:
          if len(c.sequences)>1:
            c.alignment=clustalw.ClustalW(c.sequences)
          else:
            aln=seq.CreateAlignment()
            aln.AddSequence(c.sequences[0])
            c.alignment=aln
    
      return result
    
    def kClust(sequences, clustering_thresh=30, create_alignments=False):
    
      """
      Uses kClust to generate clusters of amino acid sequences.
      
      :param sequences: All sequences you want to cluster. 
      :type sequences: :class:`ost.seq.SequenceList`
    
      :param clustering_thres: Sequence identity threshold to build clusters. Note,
                               that clustering_thresh is more a rule of thumb, since
                               compositional bias in the sequence can also play a role.
                               The value gets transformed in a bitscore, that is used
                               as an input parameter of kClust 
    
      :param create_alignments: Flag, wether the alignments of the clusters get calculated.
                                Requires clustalw in the path.
    
      :returns: A list of cluster instances
      
      :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
      """
    
      tmp_dir_name=_SetupFiles(sequences)
      result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
      _CleanupFiles(tmp_dir_name)
      return result