from ost.bindings import utils from ost import settings, io, seq, LogError import os import subprocess def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False, clustalw_option_string=False): ''' Runs a ClustalW multiple sequence alignment. The results are returned as a :class:`~ost.seq.AlignmentHandle` instance. There are two ways to use this function: - align exactly two sequences: :param seq1: sequence_one :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str` :param seq2: sequence_two :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str` The two sequences can be specified as two separate function parameters (`seq1`, `seq2`). The type of both parameters can be either :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for both parameters. - align two or more sequences: :param seq1: sequence_list :type seq1: :class:`~ost.seq.SequenceList` :param seq2: must be :class:`None` Two or more sequences can be specified by using a :class:`~ost.seq.SequenceList`. It is then passed as the first function parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`. :param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`) :type clustalw: :class:`str` :param nopgap: turn residue-specific gaps off :type nopgap: :class:`bool` :param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt) :type clustalw_option_string: :class:`str` :param keep_files: do not delete temporary files :type keep_files: :class:`bool` .. note :: - In the passed sequences ClustalW will convert lowercase to uppercase, and change all '.' to '-'. OST will convert and '?' to 'X' before aligning sequences with ClustalW. - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces, only the part before the space is considered as sequence name. To avoid surprises, you should remove spaces from the sequence name. - Sequence names must be unique (:class:`ValueError` exception raised otherwise). ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes: ======= ======================= ======= ============================ Residue Name Residue Name ======= ======================= ======= ============================ A alanine P proline B aspartate or asparagine Q glutamine C cystine R arginine D aspartate S serine E glutamate T threonine F phenylalanine U selenocysteine G glycine V valine H histidine W tryptophan I isoleucine Y tyrosine K lysine Z glutamate or glutamine L leucine X any M methionine \* translation stop N asparagine \- gap of indeterminate length ======= ======================= ======= ============================ ''' clustalw_path=settings.Locate(('clustalw', 'clustalw2'), explicit_file_name=clustalw) if seq2!=None: if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle): seq_list=seq.CreateSequenceList() seq_list.AddSequence(seq1) seq_list.AddSequence(seq2) elif isinstance(seq1, str) and isinstance(seq2, str): seqh1=seq.CreateSequence("seq1", seq1) seqh2=seq.CreateSequence("seq2", seq2) seq_list=seq.CreateSequenceList() seq_list.AddSequence(seqh1) seq_list.AddSequence(seqh2) else: LogError("WARNING: Specify at least two Sequences") return elif isinstance(seq1, seq.SequenceList): seq_list=seq1 else: LogError("WARNING: Specify either two SequenceHandles or one SequenceList") return sequence_names = set() for s in seq_list: # we cut out anything after a space to be consistent with ClustalW behaviour sequence_names.add(s.GetName().split(' ')[0]) if len(sequence_names) < len(seq_list): raise ValueError("ClustalW can only process sequences with unique identifiers!") new_list = seq.CreateSequenceList() for s in seq_list: ss = s.Copy() for i,c in enumerate(ss): if c=='?': ss[i]='X' new_list.AddSequence(ss) seq_list = new_list temp_dir=utils.TempDirWithFiles((seq_list,)) out=os.path.join(temp_dir.dirname, 'out.fasta') command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path, temp_dir.files[0], out) if nopgap: command+=" -nopgap" if clustalw_option_string!=False: command=command+" "+clustalw_option_string #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params subprocess.run(command, shell=True, stdout=subprocess.DEVNULL) aln=io.LoadAlignment(out) for sequence in seq_list: for seq_num,aln_seq in enumerate(aln.sequences): if aln_seq.GetName()==sequence.GetName(): break aln.SetSequenceOffset(seq_num,sequence.offset) if sequence.HasAttachedView(): aln.AttachView(seq_num,sequence.GetAttachedView().Copy()) if not keep_files: temp_dir.Cleanup() return aln