From 0a3ee11fc6b278b7f04d1d14e4492d4c312c237c Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Mon, 17 Aug 2015 16:18:56 +0200
Subject: [PATCH] --fasta option
---
core/pymod/core/pm3argparse.py | 236 +++++++++++++++++++++++++++++++--
1 file changed, 227 insertions(+), 9 deletions(-)
diff --git a/core/pymod/core/pm3argparse.py b/core/pymod/core/pm3argparse.py
index 670c4453..3d099d21 100644
--- a/core/pymod/core/pm3argparse.py
+++ b/core/pymod/core/pm3argparse.py
@@ -5,6 +5,41 @@ Extensions for the argparse module.
import argparse
import sys
import os
+import gzip
+import tempfile
+
+import ost
+from ost import io, seq
+
+from promod3.core import helper
+
+def _AssembleTrgTplAln(target, template):
+ """
+ Internal function: Assemble a target-template alignment without leading/
+ final gaps in the target sequence. Set the offset for the template sequence.
+ """
+ # count leading gaps to get the start position
+ start = 0
+ for i in range(0, target.length):
+ if target[i] != '-':
+ start = i
+ break
+ # get rid of closing gaps at the end
+ end = target.length
+ for i in range(target.length, 1, -1):
+ if target[i-1] != '-':
+ end = i
+ break
+ # assemble template sequence
+ tpl_str = ''
+ for i in range(start, end):
+ tpl_str += template[i]
+ new_aln = seq.CreateAlignment(seq.CreateSequence(target.name.strip(),
+ str(target)[start:end]),
+ seq.CreateSequence(template.name.strip(),
+ tpl_str))
+ new_aln.SetSequenceOffset(1, start)
+ return new_aln
class PM3ArgumentParser(argparse.ArgumentParser):
"""
@@ -12,7 +47,7 @@ class PM3ArgumentParser(argparse.ArgumentParser):
set of standard arguments which can be activated, rather than added via the
traditional way. This helps keeping up a common naming scheme throughout
all |project| actions. As a real extension, this subclass provides checking
- of input parameters on :meth:`~pm3argparse.PM3ArgumentParser.Parse`. Beside
+ of input parameters on :meth:`Parse`. Beside
this, everything you can do with a 'real' :class:`~argparse.ArgumentParser`
instance is possible here.
@@ -54,24 +89,207 @@ class PM3ArgumentParser(argparse.ArgumentParser):
formatter_class=\
argparse.ArgumentDefaultsHelpFormatter)
self.action = action
+ self.activate = dict()
+ def _print_message(self, message, file=None):
+ #pylint: disable=redefined-builtin
+ """
+ This is like a welcome message to the "country of bad style"... we are
+ overwriting a "_" function from the parent-class. Those guys should not
+ be used outside of the housing module, never... but here it is a single
+ function to bend :mod:`argparse` to use :class:`ost.Logger`.
+ """
+ if message:
+ if file is None or file is sys.stderr:
+ ost.LogError(message)
+ else:
+ ost.LogScript(message)
- def Parse(self, args=None, namespace=None):
+ def Parse(self, args=None):
"""
Parse an argument string.
:param args: The argument string. As default |sysargv|_ is used.
:type args: :class:`list`
- :param namespace: The same as for
- :meth:`argparse.ArgumentParser.parse_args`.
-
- :returns: If :attr:`namespace` is not given,
- :class:`argparse.Namespace`.
+ :returns: :class:`promod3.core.pm3argparse.PM3OptionsNamespace`.
"""
- opts = self.parse_args(args=args, namespace=namespace)
+ opts = PM3OptionsNamespace()
+ self.parse_args(args=args, namespace=opts)
+
+ opts.PostProcess(self.activate.keys())
return opts
+ def AssembleParser(self):
+ """
+ When adding options via the :meth:`Add*` methods, call this after you
+ are done. Everything before just tells the parser that it should
+ contain those option sets but does not actually add anything.
+ :meth:`AssembleParser` will put everything in place, in the right order
+ and with the right constraints.
+ """
+ if 'ALIGNMENT' in self.activate.keys():
+ self._AssembleAlignment()
+
+ def AddAlignment(self):
+ """
+ Add everything needed to load alignments to the argument parser. Creates
+ several options/ arguments and adds some checks for post processing.
+ This method only adds a flag to the parser to add alignment options on
+ :meth:`AssembleParser`. Depending on which options you activate, things
+ need to be added in a different order or have other constraints.
+
+ Options/ arguments added:
+
+ * ``--fasta trg:<NAME> <FILE>`` - describing a target-template alignment
+ with ``trg:`` marking the target sequence inside :file:`<FILE>`
+
+ Exit codes related to alignment input:
+
+ * 11 - no prefix ``trg:`` found for an argument to '--fasta'
+
+ * 12 - a given alignment file does not exist
+
+ * 13 - never raised (parameter for checking gzip files)
+
+ * 14 - empty target name found (``trg:``)
+
+ * 15 - found an empty alignment file
+
+ * 16 - alignment with more than 2 sequences found
+
+ * 17 - target sequence name not found in alignment
+
+ * 18 - sequences in the alignment have different length
+
+ Attributes added to the namespace returned by
+ :meth:`Parse`:
+
+ * :attr:`fasta` - filled with the input of the '--fasta' argument, a
+ :class:`list` with multiple :class:`list` objects
+
+ * :attr:`alignments` - :class:`ost.AlignmentList`, same order as
+ :attr:`fasta`
+
+ * :attr:`aln_sources` - the original source of the alignment, may be
+ filename(s) or a string in JSON format,
+ :class:`list` of all sources
+ """
+ self.activate['ALIGNMENT'] = 1
+
+ def _AssembleAlignment(self):
+ """
+ Actually add alignment arguments/ options
+ """
+ # FastA input: - always pairwise alignments
+ # - callable multiple times
+ # - goes by 'trg:<SEQNAME> <FILE>'
+ # - excludes JSON file/ object
+ # - leading whitespaces will be deleted
+ self.add_argument('-f', '--fasta', nargs=2, action='append',
+ metavar=('trg:<NAME>', '<FILE>'),
+ help='Pairwise alignment in FastA format, needs to '+
+ 'declare what is the target sequence.')
+ # input: FastA/ JSON
+ # determined by extension: if we are wrong, the whole loading fails
+ # possibility to add JSON: mention limitation!
+
+class PM3OptionsNamespace(object):
+ """
+ This one is mainly for internal use. You can use it like everything that
+ comes out of :meth:`argparse.ArgumentParser.parse_args`. Attributes are
+ added regarding how you assembled your argument parser.
+ """
+ def __init__(self):
+ pass
+
+ def PostProcess(self, activated):
+ """
+ Post processing of activated option packs.
+ """
+ if 'ALIGNMENT' in activated:
+ self._PostProcessAlignment()
+
+ def _PostProcessAlignment(self):
+ #pylint: disable=no-member
+ #pylint: disable=attribute-defined-outside-init
+ """
+ Doing some extra work after parsing.
+ """
+ self.aln_sources = list()
+ self.alignments = seq.AlignmentList()
+ if self.fasta:
+ for src in self.fasta:
+ if src[0].startswith('trg:'):
+ trgname = src[0][4:]
+ seqfile = src[1]
+ elif src[1].startswith('trg:'):
+ trgname = src[1][4:]
+ seqfile = src[0]
+ else:
+ helper.MsgErrorAndExit("'--fasta' requires one argument "+
+ "prefixed with 'trg:' marking the "+
+ "target sequence name", 11)
+ if not len(trgname):
+ helper.MsgErrorAndExit("'--fasta' requires argument "+
+ "'trg:' defining the "+
+ "target sequence name, empty one "+
+ "found: '%s'" % ' '.join(src), 14)
+ helper.FileExists("Alignment", 12, seqfile)
+ is_gz = helper.FileGzip("Alignment", 13, seqfile)
+ readfile = seqfile
+ if is_gz:
+ zip_fh = gzip.open(seqfile)
+ unzip_str = zip_fh.read()
+ zip_fh.close()
+ unzip_file = tempfile.NamedTemporaryFile(mode='w',
+ suffix='.fas')
+ unzip_file.write(unzip_str)
+ unzip_file.flush()
+ readfile = unzip_file.name
+ try:
+ aln = io.LoadAlignment(readfile, format="fasta")
+ except Exception, exc: #pylint: disable=broad-except
+ if exc.message == 'Bad FASTA file: File is empty':
+ helper.MsgErrorAndExit("'--fasta' refers to an empty "+\
+ "file or its in the wrong "+
+ "format: %s" % seqfile, 15)
+ elif exc.message == 'sequences have different lengths':
+ helper.MsgErrorAndExit("'--fasta %s': " % ' '.join(src)+
+ "sequences in the alignment "+
+ "have different length.", 18)
+ else:
+ raise
+ finally:
+ if is_gz:
+ unzip_file.close()
+ # check alignment
+ nos = aln.GetCount()
+ if nos > 2:
+ helper.MsgErrorAndExit("'--fasta %s' " % ' '.join(src)+
+ "points to an alignment with "+
+ "more than 2 sequences.", 16)
+ fst_seq = aln.GetSequence(0)
+ snd_seq = aln.GetSequence(1)
+ if fst_seq.name.strip() == trgname:
+ new_aln = _AssembleTrgTplAln(fst_seq, snd_seq)
+ elif snd_seq.name.strip() == trgname:
+ new_aln = _AssembleTrgTplAln(snd_seq, fst_seq)
+ else:
+ helper.MsgErrorAndExit("'--fasta %s' " % ' '.join(src)+
+ "does not define a target name "+
+ "found in the alignment.", 17)
+
+ self.alignments.append(new_aln)
+ self.aln_sources.append(seqfile)
+
# LocalWords: param attr prog argparse ArgumentParser bool sys os init str
# LocalWords: progattr descattr argpinit argv formatter meth args namespace
-# LocalWords: ArgumentDefaultsHelpFormatter sysargv
+# LocalWords: ArgumentDefaultsHelpFormatter sysargv AssembleParser fasta io
+# LocalWords: metavar trg tpl FastA gzip tempfile ost promod aln stderr src
+# LocalWords: AssembleTrgTplAln CreateSequence SetSequenceOffset LogError
+# LocalWords: LogScript OptionsNamespace PostProcess AssembleAlignment JSON
+# LocalWords: AddAlignment AlignmentList SEQNAME whitespaces nargs trgname
+# LocalWords: PostProcessAlignment startswith seqfile elif MsgErrorAndExit
+# LocalWords: len FileExists gz FileGzip readfile fh NamedTemporaryFile fas
+# LocalWords: LoadAlignment exc GetCount fst GetSequence snd
--
GitLab