From 0d675bb90b13037a78b4677a225fa0dbe3728ac6 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Tue, 21 Jul 2020 12:04:34 +0200
Subject: [PATCH] hhblits docu updates

---
 modules/bindings/doc/hhblits.rst   | 131 ++++++++++++-----------------
 modules/bindings/pymod/hhblits3.py |   7 +-
 2 files changed, 57 insertions(+), 81 deletions(-)

diff --git a/modules/bindings/doc/hhblits.rst b/modules/bindings/doc/hhblits.rst
index 62b3262a0..e39e71eab 100644
--- a/modules/bindings/doc/hhblits.rst
+++ b/modules/bindings/doc/hhblits.rst
@@ -4,118 +4,91 @@
 Introduction
 --------------------------------------------------------------------------------
 
-HHblits is a sequence search tool like BLAST, able to find more distant
-homologs This is achieved by performing `profile-profile searches`. In BLAST, a
-\query sequence is searched against a sequence database. That is a
-`sequence-sequence search`. HHblits works on a profile database, usually that
-one is provided, queried with a sequence profile. The latter one needs to be
-calculated before the actual search. In very simple words, HHblits is using
-per-sequence scoring functions to be more sensitive, in this particular case
-Hidden Markov models. The software suite needed for HHblits can be found
+HHblits is a sequence search tool like BLAST but able to find more distant
+homologs. This is achieved by aligning hidden Markov models (HMM)
+in the search process as opposed to `sequence-sequence` searches in BLAST.
+HHblits works on an HMM database, usually that one is provided, queried with
+an HMM representing your target sequence. The latter one needs to be calculated
+before the actual search. The software suite needed for HHblits can be found on
+`github <https://github.com/soedinglab/hh-suite>`_.
+Alternatively, the deprecated HHblits 2.x suite can be found
 `here <http://wwwuser.gwdg.de/~compbiol/data/hhsuite/releases/all/>`_.
 
-
-Examples
+On HHblits Versions
 --------------------------------------------------------------------------------
 
-A typical search: Get an instance of the bindings, build the search profile out
-of the query sequence, run the search and iterate results. Since
-:class:`~HHblits` works with a :class:`~ost.seq.SequenceHandle` or a FastA
-file, both ways are shown.
-
-First query by sequence:
+The binding for HHblits 3 has internally been forked from the HHblits 2 binding. 
+The binding for HHblits 2 is considered deprecated and doesn't receive bugfixes
+anymore. Also the documentation refers to the HHblits 3 binding. The different
+bindings can be imported explicitly:
 
 .. code-block:: python
 
-  from ost.bindings import hhblits
+  from ost.bindings import hhblits2  
+  from ost.bindings import hhblits3
 
-  # get a SequenceHandle
-  query_seq = seq.CreateSequence('Query',
-                                 'MRIILLGAPGAGKGTQAQFIMEKYGIPQISTGDMLRAAVKSGS'+
-                                 'ELGKQAKDIMDAGKLVTDELVIALVKERIAQEDCRNGFLLDGF'+
-                                 'PRTIPQADAMKEAGINVDYVLEFDVPDELIVDRIVGRRVHAPS'+
-                                 'GRVYHVKFNPPKVEGKDDVTGEELTTRKDDQEETVRKRLVEYH'+
-                                 'QMTAPLIGYYYYSKEAEAGNTKYAKVDGTKPVAEVRADLEKILG')
+Alternatively you can let OpenStructure figure out the HHblits version you're
+using and import the appropriate binding for you under the base name hhblits. 
+This assumes the HHblits binary (hhblits) to be in your path and raises an error 
+otherwise.
 
-  # set up the search environment
-  # $EBROOTHHMINSUITE points to the root of a HHsuite installation
-  hh = hhblits.HHblits(query_seq, os.getenv('EBROOTHHMINSUITE'))
-
-  # now create a search profile for the query sequence against the NR20 db
-  # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to
-  # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them
-  a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11')
+.. code-block:: python
 
-  # search time! we just search against NR20 again, but every HHblits db is
-  # working here, e.g. one build from all the sequences in PDB
-  hit_file = hh.Search(a3m_file, 'hhtools/nr20_12Aug11')
+  from ost.bindings import hhblits
 
-  # lets have a look at the resuls
-  with open(hit_file) as hit_fh:
-      header, hits = hhblits.ParseHHblitsOutput(hit_fh)
-  for hit in hits:
-      print(hit.aln)
 
-  # cleanup
-  hh.Cleanup()
+Examples
+--------------------------------------------------------------------------------
 
-Very similar going by file:
+A typical search: Get an instance of the binding, build the search profile out
+of the query sequence, run the search and iterate results. 
 
 .. code-block:: python
 
-  from ost.bindings import hhblits
-
-  # set up the search environment
-  #  $EBROOTHHMINSUITE points to the root of a HHsuite installation
-  hh = hhblits.HHblits('query.fas', os.getenv('EBROOTHHMINSUITE'))
+  from ost.bindings import hhblits3
 
-  # now create a search profile for the query sequence against the NR20 db
-  # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to
-  # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them
-  a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11')
+  # Create a SequenceHandle, alternatively you can load any sequence in 
+  # FASTA format using ost.io.LoadSequence(<PATH_TO_SEQUENCE>)
+  query_seq = seq.CreateSequence('Query',
+                                 'TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN')
 
-  # search time! we just search against NR20 again, but every HHblits db is
+  # set up the search environment
+  # let's assume a default installation with hhblits binary at
+  # <PATH_TO_HHBLITS_INSTALL>/bin/hhblits
+  hh = hhblits3.HHblits(query_seq, '<PATH_TO_HHBLITS_INSTALL>')
+
+  # now create a search profile for the query sequence against uniclust30 
+  # which you can get with instructions in the hh-suite user guide (github)
+  # <PATH_TO_DB>/uniclust30_2018_08 is just the prefix common to
+  # all db files, so `ls <PATH_TO_DB>/uniclust30_2018_08*` would list all 
+  # of them
+  a3m_file = hh.BuildQueryMSA(nrdb='<PATH_TO_DB>/uniclust30_2018_08')
+
+  # let's load the data in the a3m_file and display the generated
+  # multiple sequence alignment; note that ParseA3M is not a class method
+  # but a module function
+  a3m_data = hhblits3.ParseA3M(open(a3m_file))
+  print(a3m_data['msa'])
+
+  # search time! we just search against uniclust again, but every HHblits db is
   # working here, e.g. one build from all the sequences in PDB
-  hit_file = hh.Search(a3m_file, 'hhtools/nr20_12Aug11')
+  hit_file = hh.Search(a3m_file, '<PATH_TO_DB>/uniclust30_2018_08')
 
   # lets have a look at the resuls
   with open(hit_file) as hit_fh:
-      header, hits = hhblits.ParseHHblitsOutput(hit_fh)
+      header, hits = hhblits3.ParseHHblitsOutput(hit_fh)
   for hit in hits:
       print(hit.aln)
 
   # cleanup
   hh.Cleanup()
 
-The alignments produced by HHblits are sometimes slightly better than by BLAST,
-so one may want to extract them:
-
-.. code-block:: python
-
-  from ost.bindings import hhblits
-
-  # set up the search environment
-  #  $EBROOTHHMINSUITE points to the root of a HHsuite installation
-  hh = hhblits.HHblits('query.fas', os.getenv('EBROOTHHMINSUITE'))
-
-  # now create a search profile for the query sequence against the NR20 db
-  # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to
-  # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them
-  a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11')
-
-  # note that ParseA3M is not a class method but a module function
-  output = hhblits.ParseA3M(open(a3m_file))
-
-  print(output['msa'])
-
-  # cleanup
-  hh.Cleanup()
 
 
 Binding API
 --------------------------------------------------------------------------------
 
-.. automodule:: ost.bindings.hhblits
+.. automodule:: ost.bindings.hhblits3
    :synopsis: Search related sequences in databases
    :members:
 
diff --git a/modules/bindings/pymod/hhblits3.py b/modules/bindings/pymod/hhblits3.py
index 0e34a97c1..82c632c9e 100644
--- a/modules/bindings/pymod/hhblits3.py
+++ b/modules/bindings/pymod/hhblits3.py
@@ -542,7 +542,8 @@ class HHblits:
                         front of every key. Boolean True values add flag without
                         value. Merged with default options 
                         {'cpu': 1, 'n': 1, 'e': 0.001}, where 'n' defines the 
-                        number of iterations.
+                        number of iterations and 'e' the E-value cutoff for 
+                        inclusion of sequences in the result alignment.
         :type options: :class:`dict`
 
         :param a3m_file: a path of a3m_file to be used, optional
@@ -553,7 +554,9 @@ class HHblits:
                           addss.pl script provided by the HH-suite. However, 
                           your HH-suite installation requires you to specify
                           paths to PSIRED etc. We refer to the HH-suite user
-                          guide for further instructions.
+                          guide for further instructions. Assignment is done
+                          by calling :func:`HHblits.AssignSSToA3M`.
+
         :type assign_ss:  :class:`bool`
 
         :return: The path to the A3M file containing the MSA
-- 
GitLab