Select Git revision
export_non_standard.cc
clustalw.py 5.49 KiB
from ost.bindings import utils
from ost import settings, io, seq, LogError
import os
import subprocess
def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
clustalw_option_string=False):
'''
Runs a ClustalW multiple sequence alignment. The results are returned as a
:class:`~ost.seq.AlignmentHandle` instance.
There are two ways to use this function:
- align exactly two sequences:
:param seq1: sequence_one
:type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
:param seq2: sequence_two
:type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
The two sequences can be specified as two separate function parameters
(`seq1`, `seq2`). The type of both parameters can be either
:class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
both parameters.
- align two or more sequences:
:param seq1: sequence_list
:type seq1: :class:`~ost.seq.SequenceList`
:param seq2: must be :class:`None`
Two or more sequences can be specified by using a
:class:`~ost.seq.SequenceList`. It is then passed as the first function
parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
:param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
:type clustalw: :class:`str`
:param nopgap: turn residue-specific gaps off
:type nopgap: :class:`bool`
:param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
:type clustalw_option_string: :class:`str`
:param keep_files: do not delete temporary files
:type keep_files: :class:`bool`
.. note ::
- In the passed sequences ClustalW will convert lowercase to uppercase, and
change all '.' to '-'. OST will convert and '?' to 'X' before aligning
sequences with ClustalW.
- If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
only the part before the space is considered as sequence name. To avoid
surprises, you should remove spaces from the sequence name.
- Sequence names must be unique (:class:`ValueError` exception raised
otherwise).
ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:
======= ======================= ======= ============================
Residue Name Residue Name
======= ======================= ======= ============================
A alanine P proline
B aspartate or asparagine Q glutamine
C cystine R arginine
D aspartate S serine
E glutamate T threonine
F phenylalanine U selenocysteine
G glycine V valine
H histidine W tryptophan
I isoleucine Y tyrosine
K lysine Z glutamate or glutamine
L leucine X any
M methionine \\* translation stop
N asparagine \\- gap of indeterminate length
======= ======================= ======= ============================
'''
clustalw_path=settings.Locate(('clustalw', 'clustalw2'),
explicit_file_name=clustalw)
if seq2!=None:
if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
seq_list=seq.CreateSequenceList()
seq_list.AddSequence(seq1)
seq_list.AddSequence(seq2)
elif isinstance(seq1, str) and isinstance(seq2, str):
seqh1=seq.CreateSequence("seq1", seq1)
seqh2=seq.CreateSequence("seq2", seq2)
seq_list=seq.CreateSequenceList()
seq_list.AddSequence(seqh1)
seq_list.AddSequence(seqh2)
else:
LogError("WARNING: Specify at least two Sequences")
return
elif isinstance(seq1, seq.SequenceList):
seq_list=seq1
else:
LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
return
sequence_names = set()
for s in seq_list:
# we cut out anything after a space to be consistent with ClustalW behaviour
sequence_names.add(s.GetName().split(' ')[0])
if len(sequence_names) < len(seq_list):
raise ValueError("ClustalW can only process sequences with unique identifiers!")
new_list = seq.CreateSequenceList()
for s in seq_list:
ss = s.Copy()
for i,c in enumerate(ss):
if c=='?':
ss[i]='X'
new_list.AddSequence(ss)
seq_list = new_list
temp_dir=utils.TempDirWithFiles((seq_list,))
out=os.path.join(temp_dir.dirname, 'out.fasta')
command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
temp_dir.files[0],
out)
if nopgap:
command+=" -nopgap"
if clustalw_option_string!=False:
command=command+" "+clustalw_option_string #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params
subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
aln=io.LoadAlignment(out)
for sequence in seq_list:
for seq_num,aln_seq in enumerate(aln.sequences):
if aln_seq.GetName()==sequence.GetName():
break
aln.SetSequenceOffset(seq_num,sequence.offset)
if sequence.HasAttachedView():
aln.AttachView(seq_num,sequence.GetAttachedView().Copy())
if not keep_files:
temp_dir.Cleanup()
return aln