From 0a3ee11fc6b278b7f04d1d14e4492d4c312c237c Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Mon, 17 Aug 2015 16:18:56 +0200
Subject: [PATCH] --fasta option

---
 core/pymod/core/pm3argparse.py | 236 +++++++++++++++++++++++++++++++--
 1 file changed, 227 insertions(+), 9 deletions(-)

diff --git a/core/pymod/core/pm3argparse.py b/core/pymod/core/pm3argparse.py
index 670c4453..3d099d21 100644
--- a/core/pymod/core/pm3argparse.py
+++ b/core/pymod/core/pm3argparse.py
@@ -5,6 +5,41 @@ Extensions for the argparse module.
 import argparse
 import sys
 import os
+import gzip
+import tempfile
+
+import ost
+from ost import io, seq
+
+from promod3.core import helper
+
+def _AssembleTrgTplAln(target, template):
+    """
+    Internal function: Assemble a target-template alignment without leading/
+    final gaps in the target sequence. Set the offset for the template sequence.
+    Both sequences are cut to the same column range, the offset of the
+    template is set to the number of columns removed at the start.
+    """
+    trg_str = str(target)
+    # first non-gap column of the target, stays 0 for an all-gap target
+    start = 0
+    for i, olc in enumerate(trg_str):
+        if olc != '-':
+            start = i
+            break
+    # one past the last non-gap column; the scan has to reach column 0 or a
+    # target with its only residue up front keeps all its closing gaps
+    end = len(trg_str)
+    for i in range(len(trg_str), 0, -1):
+        if trg_str[i-1] != '-':
+            end = i
+            break
+    new_aln = seq.CreateAlignment(seq.CreateSequence(target.name.strip(),
+                                                     trg_str[start:end]),
+                                  seq.CreateSequence(template.name.strip(),
+                                                     str(template)[start:end]))
+    new_aln.SetSequenceOffset(1, start)
+    return new_aln
 
 class PM3ArgumentParser(argparse.ArgumentParser):
     """
@@ -12,7 +47,7 @@ class PM3ArgumentParser(argparse.ArgumentParser):
     set of standard arguments which can be activated, rather than added via the
     traditional way. This helps keeping up a common naming scheme throughout
     all |project| actions. As a real extension, this subclass provides checking
-    of input parameters on :meth:`~pm3argparse.PM3ArgumentParser.Parse`. Beside
+    of input parameters on :meth:`Parse`. Beside
     this, everything you can do with a 'real' :class:`~argparse.ArgumentParser`
     instance is possible here.
 
@@ -54,24 +89,207 @@ class PM3ArgumentParser(argparse.ArgumentParser):
                                          formatter_class=\
                                          argparse.ArgumentDefaultsHelpFormatter)
         self.action = action
+        self.activate = dict()
 
+    def _print_message(self, message, file=None):
+        #pylint: disable=redefined-builtin
+        """
+        Hand messages of :mod:`argparse` over to :class:`ost.Logger`. This
+        overrides a "_" function of the parent-class, which is bad style, but
+        it is a single function to redirect the output of the parser.
+        """
+        if not message:
+            return
+        if file is not None and file is not sys.stderr:
+            ost.LogScript(message)
+        else:
+            ost.LogError(message)
 
-    def Parse(self, args=None, namespace=None):
+    def Parse(self, args=None):
         """
         Parse an argument string.
 
         :param args: The argument string. As default |sysargv|_ is used.
         :type args: :class:`list`
 
-        :param namespace: The same as for
-                          :meth:`argparse.ArgumentParser.parse_args`.
-
-        :returns: If :attr:`namespace` is not given,
-                  :class:`argparse.Namespace`.
+        :returns: :class:`promod3.core.pm3argparse.PM3OptionsNamespace`.
         """
-        opts = self.parse_args(args=args, namespace=namespace)
+        opts = PM3OptionsNamespace()
+        self.parse_args(args=args, namespace=opts)
+
+        opts.PostProcess(self.activate.keys())
         return opts
 
+    def AssembleParser(self):
+        """
+        Finalise the parser after all :meth:`Add*` methods were called. Those
+        methods only note down which option sets are wanted, nothing is added
+        to the parser until this point. Here, the recorded sets get their
+        arguments installed, in the right order and with the right
+        constraints.
+        """
+        if 'ALIGNMENT' in self.activate:
+            self._AssembleAlignment()
+
+    def AddAlignment(self):
+        """
+        Add everything needed to load alignments to the argument parser. Creates
+        several options/ arguments and adds some checks for post processing.
+        This method only adds a flag to the parser to add alignment options on
+        :meth:`AssembleParser`. Depending on which options you activate, things
+        need to be added in a different order or have other constraints.
+
+        Options/ arguments added:
+
+        * ``--fasta trg:<NAME> <FILE>`` - describing a target-template alignment
+          with ``trg:`` marking the target sequence inside :file:`<FILE>`
+
+        Exit codes related to alignment input:
+
+        * 11 - no prefix ``trg:`` found for an argument to '--fasta'
+
+        * 12 - a given alignment file does not exist
+
+        * 13 - never raised (parameter for checking gzip files)
+
+        * 14 - empty target name found (``trg:``)
+
+        * 15 - found an empty alignment file
+
+        * 16 - alignment with more than 2 sequences found
+
+        * 17 - target sequence name not found in alignment
+
+        * 18 - sequences in the alignment have different length
+
+        Attributes added to the namespace returned by
+        :meth:`Parse`:
+
+        * :attr:`fasta` - filled with the input of the '--fasta' argument, a
+                          :class:`list` with multiple :class:`list` objects
+
+        * :attr:`alignments` - :class:`ost.seq.AlignmentList`, same order as
+                               :attr:`fasta`
+
+        * :attr:`aln_sources` - the original source of the alignment, may be
+                                filename(s) or a string in JSON format,
+                                :class:`list` of all sources
+        """
+        self.activate['ALIGNMENT'] = 1
+
+    def _AssembleAlignment(self):
+        """
+        Put the arguments/ options for alignment input into the parser.
+        """
+        # FastA input: - always pairwise alignments
+        #              - callable multiple times
+        #              - goes by 'trg:<SEQNAME> <FILE>'
+        #              - excludes JSON file/ object
+        #              - leading whitespaces will be deleted
+        fasta_help = ('Pairwise alignment in FastA format, needs to declare '
+                      'what is the target sequence.')
+        self.add_argument('-f', '--fasta', nargs=2, action='append',
+                          metavar=('trg:<NAME>', '<FILE>'), help=fasta_help)
+        # input: FastA/ JSON
+        # determined by extension: if we are wrong, the whole loading fails
+        # possibility to add JSON: mention limitation!
+
+class PM3OptionsNamespace(object):
+    """
+    This one is mainly for internal use. You can use it like everything that
+    comes out of :meth:`argparse.ArgumentParser.parse_args`. Attributes are
+    added regarding how you assembled your argument parser.
+    """
+    def __init__(self):
+        pass
+
+    def PostProcess(self, activated):
+        """
+        Post processing of activated option packs.
+        """
+        if 'ALIGNMENT' in activated:
+            self._PostProcessAlignment()
+
+    def _PostProcessAlignment(self):
+        #pylint: disable=no-member,attribute-defined-outside-init
+        """
+        Load the alignments named by '--fasta' into :attr:`alignments`, the
+        file names go to :attr:`aln_sources`. Exits on invalid input.
+        """
+        self.aln_sources = list()
+        self.alignments = seq.AlignmentList()
+        for src in self.fasta or ():
+            if src[0].startswith('trg:'):
+                trgname = src[0][4:]
+                seqfile = src[1]
+            elif src[1].startswith('trg:'):
+                trgname = src[1][4:]
+                seqfile = src[0]
+            else:
+                helper.MsgErrorAndExit("'--fasta' requires one argument "+
+                                       "prefixed with 'trg:' marking the "+
+                                       "target sequence name", 11)
+            if not trgname:
+                helper.MsgErrorAndExit("'--fasta' requires argument "+
+                                       "'trg:' defining the "+
+                                       "target sequence name, empty one "+
+                                       "found: '%s'" % ' '.join(src), 14)
+            helper.FileExists("Alignment", 12, seqfile)
+            is_gz = helper.FileGzip("Alignment", 13, seqfile)
+            readfile = seqfile
+            tmp = None
+            try:
+                if is_gz:
+                    # unzip into a temporary file which goes to the loader
+                    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.fas')
+                    zip_fh = gzip.open(seqfile)
+                    try:
+                        tmp.write(zip_fh.read())
+                    finally:
+                        zip_fh.close()
+                    tmp.flush()
+                    readfile = tmp.name
+                aln = io.LoadAlignment(readfile, format="fasta")
+            except Exception, exc: #pylint: disable=broad-except
+                if exc.message == 'Bad FASTA file: File is empty':
+                    helper.MsgErrorAndExit("'--fasta' refers to an empty "+
+                                           "file or its in the wrong "+
+                                           "format: %s" % seqfile, 15)
+                elif exc.message == 'sequences have different lengths':
+                    helper.MsgErrorAndExit("'--fasta %s': " % ' '.join(src)+
+                                           "sequences in the alignment "+
+                                           "have different length.", 18)
+                else:
+                    raise
+            finally:
+                if tmp is not None:
+                    tmp.close()
+            # NOTE(review): alignments of less than 2 sequences go unchecked
+            nos = aln.GetCount()
+            if nos > 2:
+                helper.MsgErrorAndExit("'--fasta %s' " % ' '.join(src)+
+                                       "points to an alignment with "+
+                                       "more than 2 sequences.", 16)
+            fst_seq = aln.GetSequence(0)
+            snd_seq = aln.GetSequence(1)
+            if fst_seq.name.strip() == trgname:
+                new_aln = _AssembleTrgTplAln(fst_seq, snd_seq)
+            elif snd_seq.name.strip() == trgname:
+                new_aln = _AssembleTrgTplAln(snd_seq, fst_seq)
+            else:
+                helper.MsgErrorAndExit("'--fasta %s' " % ' '.join(src)+
+                                       "does not define a target name "+
+                                       "found in the alignment.", 17)
+            self.alignments.append(new_aln)
+            self.aln_sources.append(seqfile)
+
 #  LocalWords:  param attr prog argparse ArgumentParser bool sys os init str
 #  LocalWords:  progattr descattr argpinit argv formatter meth args namespace
-#  LocalWords:  ArgumentDefaultsHelpFormatter sysargv
+#  LocalWords:  ArgumentDefaultsHelpFormatter sysargv AssembleParser fasta io
+#  LocalWords:  metavar trg tpl FastA gzip tempfile ost promod aln stderr src
+#  LocalWords:  AssembleTrgTplAln CreateSequence SetSequenceOffset LogError
+#  LocalWords:  LogScript OptionsNamespace PostProcess AssembleAlignment JSON
+#  LocalWords:  AddAlignment AlignmentList SEQNAME whitespaces nargs trgname
+#  LocalWords:  PostProcessAlignment startswith seqfile elif MsgErrorAndExit
+#  LocalWords:  len FileExists gz FileGzip readfile fh NamedTemporaryFile fas
+#  LocalWords:  LoadAlignment exc GetCount fst GetSequence snd
-- 
GitLab