From 2fe0e865393dd01ee28922b69363a7612c2fd557 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Thu, 30 May 2019 23:33:16 +0200
Subject: [PATCH] setup fragger handles in argument parser

---
 core/pymod/pm3argparse.py      | 153 ++++++++++++++++++++++++++++++++-
 core/tests/test_pm3argparse.py |  65 ++++++++++++++
 2 files changed, 214 insertions(+), 4 deletions(-)

diff --git a/core/pymod/pm3argparse.py b/core/pymod/pm3argparse.py
index d4c9643f..e97776d1 100644
--- a/core/pymod/pm3argparse.py
+++ b/core/pymod/pm3argparse.py
@@ -32,6 +32,7 @@ import ost
 from ost import io, seq
 
 from promod3.core import helper
+from promod3 import loop, modelling
 
 def _TmpForGZip(filename, suffix, msg_prefix):
     """Unpack a file to a tmp file if gzipped.
@@ -259,6 +260,18 @@ def _FetchProfileFromFile(filename):
                                str(exc), 52)        
     return prof
 
+def _FetchPsipredFromFile(filename):
+    """Load psipred prediction from filename and return it."""
+    argstr = filename + ": "
+    helper.FileExists("Profile", 51, filename)
+    try:
+        pred = loop.PsipredPrediction.FromHHM(filename)
+    except Exception, exc:
+        helper.MsgErrorAndExit(argstr + ": failure to parse psipred " +
+                               "prediction: " + str(exc), 56)        
+    return pred
+
+
 def _GetChains(structures, structure_sources):
     """Get chain id to entity view (single chain) mapping (dict)."""
     # IDs: (file_base = base file name with no extensions)
@@ -427,6 +440,8 @@ class PM3ArgumentParser(argparse.ArgumentParser):
             self._AssembleStructure()
         if 'PROFILE' in self.activate:
             self._AssembleProfile()
+        if 'FRAGMENTS' in self.activate:
+            self._AssembleFragments()
 
     def AddAlignment(self, allow_multitemplate=False):
         """Commandline options for alignments.
@@ -603,6 +618,51 @@ class PM3ArgumentParser(argparse.ArgumentParser):
         """
         self.activate.add('PROFILE')
 
+
+    def AddFragments(self):
+        """Commandline option for usage of Fragments
+
+        Activate everything needed to setup 
+        :class:`promod3.modelling.FraggerHandle` objects in the argument parser.
+        Command line arguments are then added in :meth:`AssembleParser` and the
+        input is post processed and checked in :meth:`Parse`.
+
+        Options/arguments added:
+
+        * ``--use_fragments`` - Boolean flag whether to setup fragger handles.
+
+        Notes:
+
+        * Fragger handles are setup to identify fragments in a 
+          :class:`promod3.loop.StructureDB`.
+
+        * If no profiles are provided as additional argument 
+          (``-s/--seqprof <FILE>``), fragments are identified based on BLOSUM62 
+          sequence similarity.
+
+        * If you provide profiles that are not in hhm format, fragments are 
+          identified based on BLOSUM62 sequence similarity, sequence profile 
+          scoring and structural profile scoring.
+
+        * If you provide profiles in hhm format (optimal case), psipred 
+          predictions are fetched from there and fragments are identified based
+          on secondary structure agreement, secondary structure dependent
+          torsion probabilities, sequence profile scoring and structure 
+          profile scoring.
+
+        Attributes added to the namespace returned by :meth:`Parse`:
+
+        * :attr:`fragger_handles` - :class:`list` of 
+          :class:`promod3.modelling.FraggerHandle`, ordered to match the target 
+          sequences.
+
+        Exit codes related to fragments input:
+
+        * 56 - cannot read psipred prediction from hhm file
+        """
+        self.activate.add('FRAGMENTS')
+
+
     def _AssembleAlignment(self):
         """Actually add alignment arguments/options."""
         aln_grp = self.add_mutually_exclusive_group(required=True)
@@ -650,6 +710,15 @@ class PM3ArgumentParser(argparse.ArgumentParser):
                           ".hhm, .hhm.gz, .pssm, .pssm.gz", action='append',
                           default=list())
 
+    def _AssembleFragments(self):
+        self.add_argument('--use_fragments',
+                          help="Use fragments instead of torsion angle "+
+                          "based sampling for Monte Carlo approaches. "+
+                          "For optimal performance you should provide "+
+                          "sequence profiles in hhm format. (File "+
+                          "extensions: .hhm or .hhm.gz). BUT: be aware of"+
+                          "increased runtime.", action="store_true")
+
 class PM3OptionsNamespace(object):
     # class will grow, so for the moment pylint is ignored
     #pylint: disable=too-few-public-methods
@@ -672,6 +741,8 @@ class PM3OptionsNamespace(object):
             self._AttachViews()
         if 'PROFILE' in activated:
             self._PostProcessProfile()
+        if 'FRAGMENTS' in activated:
+            self._PostProcessFragments()
 
     def _PostProcessAlignment(self):
         #pylint: disable=no-member
@@ -730,11 +801,11 @@ class PM3OptionsNamespace(object):
             # so not having any profile is fine
             return
 
-        loaded_profiles = list()
+        self.loaded_profiles = list()
         for src in self.seqprof:
-            loaded_profiles.append(_FetchProfileFromFile(src))
+            self.loaded_profiles.append(_FetchProfileFromFile(src))
 
-        prof_sequences = [p.sequence for p in loaded_profiles]
+        prof_sequences = [p.sequence for p in self.loaded_profiles]
 
         # check uniqueness of loaded profiles
         if len(set(prof_sequences)) != len(prof_sequences):
@@ -746,7 +817,7 @@ class PM3OptionsNamespace(object):
                          for aln in self.alignments]
         for s in trg_sequences:
             try:
-                self.profiles.append(loaded_profiles[prof_sequences.index(s)])
+                self.profiles.append(self.loaded_profiles[prof_sequences.index(s)])
             except Exception, exc:
                 helper.MsgErrorAndExit("Could not find profile with sequence " +
                                        "that exactly matches trg seq: " + s, 55)
@@ -758,6 +829,80 @@ class PM3OptionsNamespace(object):
             helper.MsgErrorAndExit("Could not map every profile to a target " +
                                    "sequence", 53)
 
+    def _PostProcessFragments(self):
+
+        self.fragger_handles = list()
+
+        if not self.use_fragments:
+            # no fragments requested, so lets just return
+            return
+
+        trg_sequences = [aln.GetSequence(0).GetGaplessString() \
+                         for aln in self.alignments]
+
+        # we only want to setup a Fragger for every unique target sequence
+        unique_trg_sequences = list(set(trg_sequences))
+
+        # already setup variables, fill later if required data is present
+        profiles = [None] * len(unique_trg_sequences)
+        psipred_predictions = [None] * len(unique_trg_sequences)
+        ts_coil = None
+        ts_helix = None
+        ts_extended = None
+
+        # a structure db we need anyway. Load once and assign the same to all 
+        # fraggers to avoid memory explosion
+        structure_db = loop.LoadStructureDB()
+
+        # load the profiles
+        if hasattr(self, "profiles") and len(self.profiles) > 0:
+            profile_dict = dict()
+            for p in self.loaded_profiles:
+                profile_dict[p.sequence] = p
+            # as we already mapped the profiles in _PostProcessProfiles,
+            # the following is guaranteed to find the right profile
+            # for every unique target sequence
+            for s_idx, s in enumerate(unique_trg_sequences):
+                profiles[s_idx] = profile_dict[s]
+
+            # For the psipred predictions we have to go back to the
+            # input files. If they all end with .hhm or hhm.gz we're ready to go
+            file_endings_ok = True
+            for src in self.seqprof:
+                if not (src.endswith(".hhm") or src.endswith(".hhm.gz")):
+                    file_endings_ok = False
+                    break
+
+            if file_endings_ok:
+                # lets load the torsion samplers now as they are only required
+                # if we also add psipred handlers
+                ts_coil = loop.LoadTorsionSamplerCoil()
+                ts_extended = loop.LoadTorsionSamplerExtended()
+                ts_helix = loop.LoadTorsionSamplerHelical()
+
+                # to get the right filenames we use the sequences of the 
+                # loaded profiles that are in the same order as self.seqprof
+                profile_sequences = [p.sequence for p in self.loaded_profiles]
+                for s_idx, s in enumerate(unique_trg_sequences):
+                    fn = self.seqprof[profile_sequences.index(s)]
+                    psipred_predictions[s_idx] = _FetchPsipredFromFile(fn)
+        
+        # setup one fragger handle for each unique sequence
+        fraggers = list()
+        for i in range(len(unique_trg_sequences)):
+            fraggers.append(modelling.FraggerHandle(unique_trg_sequences[i],
+                                                    profile = profiles[i],
+                                                    psipred_pred = psipred_predictions[i],
+                                                    rmsd_thresh = 0.02,
+                                                    structure_db = structure_db,
+                                                    torsion_sampler_coil = ts_coil,
+                                                    torsion_sampler_helix = ts_helix,
+                                                    torsion_sampler_extended = ts_extended))
+        # map them to the chains
+        for s in trg_sequences:
+            self.fragger_handles.append(fraggers[unique_trg_sequences.index(s)])
+
+
 #  LocalWords:  param attr prog argparse ArgumentParser bool sys os init str
 #  LocalWords:  progattr descattr argpinit argv formatter meth args namespace
 #  LocalWords:  ArgumentDefaultsHelpFormatter sysargv AssembleParser fasta io
diff --git a/core/tests/test_pm3argparse.py b/core/tests/test_pm3argparse.py
index a9626bf4..878008b5 100644
--- a/core/tests/test_pm3argparse.py
+++ b/core/tests/test_pm3argparse.py
@@ -1225,6 +1225,71 @@ class PM3ArgParseTests(unittest.TestCase):
                          opts.profiles[1].sequence)
 
 
+    def testFraggerNotRequested(self):
+        parser = pm3argparse.PM3ArgumentParser(__doc__, action=False)
+        parser.AddAlignment()
+        parser.AddFragments()
+        parser.AssembleParser()
+        opts =  parser.Parse(['-f', 'data/aln_tpl/1crn.fasta',
+                              '-f', 'data/aln_tpl/5ua4_B.fasta'])
+        self.assertEqual(len(opts.alignments), 2)
+        self.assertEqual(len(opts.fragger_handles), 0)
+
+    def testFraggerAttachedWithoutProfile(self):
+        parser = pm3argparse.PM3ArgumentParser(__doc__, action=False)
+        parser.AddAlignment()
+        parser.AddFragments()
+        parser.AssembleParser()
+        opts =  parser.Parse(['-f', 'data/aln_tpl/1crn.fasta',
+                              '-f', 'data/aln_tpl/5ua4_B.fasta',
+                              '--use_fragments'])
+        self.assertEqual(len(opts.alignments), 2)
+        self.assertEqual(len(opts.fragger_handles), 2)
+        self.assertEqual(opts.alignments[0].GetSequence(0).GetGaplessString(),
+                         opts.fragger_handles[0].sequence)
+        self.assertEqual(opts.alignments[1].GetSequence(0).GetGaplessString(),
+                         opts.fragger_handles[1].sequence)
+        # most of the stuff in the fragger handles should be None
+        self.assertIsNone(opts.fragger_handles[0].profile)
+        self.assertIsNone(opts.fragger_handles[0].psipred_pred)
+        self.assertIsNone(opts.fragger_handles[0].torsion_sampler_coil)
+        self.assertIsNone(opts.fragger_handles[0].torsion_sampler_helix)
+        self.assertIsNone(opts.fragger_handles[0].torsion_sampler_extended)
+        self.assertIsNone(opts.fragger_handles[1].profile)
+        self.assertIsNone(opts.fragger_handles[1].psipred_pred)
+        self.assertIsNone(opts.fragger_handles[1].torsion_sampler_coil)
+        self.assertIsNone(opts.fragger_handles[1].torsion_sampler_helix)
+        self.assertIsNone(opts.fragger_handles[1].torsion_sampler_extended)
+
+    def testFraggerAttachedWithProfile(self):
+        parser = pm3argparse.PM3ArgumentParser(__doc__, action=False)
+        parser.AddAlignment()
+        parser.AddProfile()
+        parser.AddFragments()
+        parser.AssembleParser()
+        opts =  parser.Parse(['-f', 'data/aln_tpl/1crn.fasta',
+                              '-f', 'data/aln_tpl/5ua4_B.fasta',
+                              '-s', 'data/aln_tpl/1crn.hhm',
+                              '-s', 'data/aln_tpl/5ua4_B.hhm',
+                              '--use_fragments'])
+        self.assertEqual(len(opts.alignments), 2)
+        self.assertEqual(len(opts.fragger_handles), 2)
+        self.assertEqual(opts.alignments[0].GetSequence(0).GetGaplessString(),
+                         opts.fragger_handles[0].sequence)
+        self.assertEqual(opts.alignments[1].GetSequence(0).GetGaplessString(),
+                         opts.fragger_handles[1].sequence)
+        # most of the stuff in the fragger handles should be None
+        self.assertIsNotNone(opts.fragger_handles[0].profile)
+        self.assertIsNotNone(opts.fragger_handles[0].psipred_pred)
+        self.assertIsNotNone(opts.fragger_handles[0].torsion_sampler_coil)
+        self.assertIsNotNone(opts.fragger_handles[0].torsion_sampler_helix)
+        self.assertIsNotNone(opts.fragger_handles[0].torsion_sampler_extended)
+        self.assertIsNotNone(opts.fragger_handles[1].profile)
+        self.assertIsNotNone(opts.fragger_handles[1].psipred_pred)
+        self.assertIsNotNone(opts.fragger_handles[1].torsion_sampler_coil)
+        self.assertIsNotNone(opts.fragger_handles[1].torsion_sampler_helix)
+        self.assertIsNotNone(opts.fragger_handles[1].torsion_sampler_extended)
+
 # test options: --disable-aln check (for amino acids)
 # test options: --disable-input-checks (for all)
 # test option: --disable-mm-check (macromolecule)
-- 
GitLab