From 0d675bb90b13037a78b4677a225fa0dbe3728ac6 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Tue, 21 Jul 2020 12:04:34 +0200 Subject: [PATCH] hhblits docu updates --- modules/bindings/doc/hhblits.rst | 131 ++++++++++++----------------- modules/bindings/pymod/hhblits3.py | 7 +- 2 files changed, 57 insertions(+), 81 deletions(-) diff --git a/modules/bindings/doc/hhblits.rst b/modules/bindings/doc/hhblits.rst index 62b3262a0..e39e71eab 100644 --- a/modules/bindings/doc/hhblits.rst +++ b/modules/bindings/doc/hhblits.rst @@ -4,118 +4,91 @@ Introduction -------------------------------------------------------------------------------- -HHblits is a sequence search tool like BLAST, able to find more distant -homologs This is achieved by performing `profile-profile searches`. In BLAST, a -\query sequence is searched against a sequence database. That is a -`sequence-sequence search`. HHblits works on a profile database, usually that -one is provided, queried with a sequence profile. The latter one needs to be -calculated before the actual search. In very simple words, HHblits is using -per-sequence scoring functions to be more sensitive, in this particular case -Hidden Markov models. The software suite needed for HHblits can be found +HHblits is a sequence search tool like BLAST but able to find more distant +homologs. This is achieved by aligning hidden Markov models (HMM) +in the search process as opposed to `sequence-sequence` searches in BLAST. +HHblits works on an HMM database, usually that one is provided, queried with +an HMM representing your target sequence. The latter one needs to be calculated +before the actual search. The software suite needed for HHblits can be found on +`github <https://github.com/soedinglab/hh-suite>`_. +Alternatively, the deprecated HHblits 2.x suite can be found `here <http://wwwuser.gwdg.de/~compbiol/data/hhsuite/releases/all/>`_. 
- -Examples +On HHblits Versions -------------------------------------------------------------------------------- -A typical search: Get an instance of the bindings, build the search profile out -of the query sequence, run the search and iterate results. Since -:class:`~HHblits` works with a :class:`~ost.seq.SequenceHandle` or a FastA -file, both ways are shown. - -First query by sequence: +The binding for HHblits 3 has internally been forked from the HHblits 2 binding. +The binding for HHblits 2 is considered deprecated and doesn't receive bugfixes +anymore. Also the documentation refers to the HHblits 3 binding. The different +bindings can be imported explicitly: .. code-block:: python - from ost.bindings import hhblits + from ost.bindings import hhblits2 + from ost.bindings import hhblits3 - # get a SequenceHandle - query_seq = seq.CreateSequence('Query', - 'MRIILLGAPGAGKGTQAQFIMEKYGIPQISTGDMLRAAVKSGS'+ - 'ELGKQAKDIMDAGKLVTDELVIALVKERIAQEDCRNGFLLDGF'+ - 'PRTIPQADAMKEAGINVDYVLEFDVPDELIVDRIVGRRVHAPS'+ - 'GRVYHVKFNPPKVEGKDDVTGEELTTRKDDQEETVRKRLVEYH'+ - 'QMTAPLIGYYYYSKEAEAGNTKYAKVDGTKPVAEVRADLEKILG') +Alternatively you can let OpenStructure figure out the HHblits version you're +using and import the appropriate binding for you under the base name hhblits. +This assumes the HHblits binary (hhblits) to be in your path and raises an error +otherwise. - # set up the search environment - # $EBROOTHHMINSUITE points to the root of a HHsuite installation - hh = hhblits.HHblits(query_seq, os.getenv('EBROOTHHMINSUITE')) - - # now create a search profile for the query sequence against the NR20 db - # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to - # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them - a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11') +.. code-block:: python - # search time! we just search against NR20 again, but every HHblits db is - # working here, e.g. 
one build from all the sequences in PDB - hit_file = hh.Search(a3m_file, 'hhtools/nr20_12Aug11') + from ost.bindings import hhblits - # lets have a look at the resuls - with open(hit_file) as hit_fh: - header, hits = hhblits.ParseHHblitsOutput(hit_fh) - for hit in hits: - print(hit.aln) - # cleanup - hh.Cleanup() +Examples +-------------------------------------------------------------------------------- -Very similar going by file: +A typical search: Get an instance of the binding, build the search profile out +of the query sequence, run the search and iterate results. .. code-block:: python - from ost.bindings import hhblits - - # set up the search environment - # $EBROOTHHMINSUITE points to the root of a HHsuite installation - hh = hhblits.HHblits('query.fas', os.getenv('EBROOTHHMINSUITE')) + from ost.bindings import hhblits3 - # now create a search profile for the query sequence against the NR20 db - # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to - # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them - a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11') + # Create a SequenceHandle, alternatively you can load any sequence in + # FASTA format using ost.io.LoadSequence(<PATH_TO_SEQUENCE>) + query_seq = seq.CreateSequence('Query', + 'TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN') - # search time! 
we just search against NR20 again, but every HHblits db is + # set up the search environment + # let's assume a default installation with hhblits binary at + # <PATH_TO_HHBLITS_INSTALL>/bin/hhblits + hh = hhblits3.HHblits(query_seq, '<PATH_TO_HHBLITS_INSTALL>') + + # now create a search profile for the query sequence against uniclust30 + # which you can get with instructions in the hh-suite user guide (github) + # <PATH_TO_DB>/uniclust30_2018_08 is just the prefix common to + # all db files, so `ls <PATH_TO_DB>/uniclust30_2018_08*` would list all + # of them + a3m_file = hh.BuildQueryMSA(nrdb='<PATH_TO_DB>/uniclust30_2018_08') + + # let's load the data in the a3m_file and display the generated + # multiple sequence alignment. Note that ParseA3M is not a class method + # but a module function + a3m_data = hhblits3.ParseA3M(open(a3m_file)) + print(a3m_data['msa']) + + # search time! we just search against uniclust again, but every HHblits db is # working here, e.g. one build from all the sequences in PDB - hit_file = hh.Search(a3m_file, 'hhtools/nr20_12Aug11') + hit_file = hh.Search(a3m_file, '<PATH_TO_DB>/uniclust30_2018_08') # lets have a look at the resuls with open(hit_file) as hit_fh: - header, hits = hhblits.ParseHHblitsOutput(hit_fh) + header, hits = hhblits3.ParseHHblitsOutput(hit_fh) for hit in hits: print(hit.aln) # cleanup hh.Cleanup() -The alignments produced by HHblits are sometimes slightly better than by BLAST, -so one may want to extract them: - -.. 
code-block:: python - - from ost.bindings import hhblits - - # set up the search environment - # $EBROOTHHMINSUITE points to the root of a HHsuite installation - hh = hhblits.HHblits('query.fas', os.getenv('EBROOTHHMINSUITE')) - - # now create a search profile for the query sequence against the NR20 db - # provided on the HHblits web page, nr20_12Aug11 is just the prefix common to - # all db files, so `ls hhtools/nr20_12Aug11*` would list all of them - a3m_file = hh.BuildQueryMSA(nrdb='hhtools/nr20_12Aug11') - - # note that ParseA3M is not a class method but a module function - output = hhblits.ParseA3M(open(a3m_file)) - - print(output['msa']) - - # cleanup - hh.Cleanup() Binding API -------------------------------------------------------------------------------- -.. automodule:: ost.bindings.hhblits +.. automodule:: ost.bindings.hhblits3 :synopsis: Search related sequences in databases :members: diff --git a/modules/bindings/pymod/hhblits3.py b/modules/bindings/pymod/hhblits3.py index 0e34a97c1..82c632c9e 100644 --- a/modules/bindings/pymod/hhblits3.py +++ b/modules/bindings/pymod/hhblits3.py @@ -542,7 +542,8 @@ class HHblits: front of every key. Boolean True values add flag without value. Merged with default options {'cpu': 1, 'n': 1, 'e': 0.001}, where 'n' defines the - number of iterations. + number of iterations and 'e' the E-value cutoff for + inclusion of sequences in result alignment. :type options: :class:`dict` :param a3m_file: a path of a3m_file to be used, optional @@ -553,7 +554,9 @@ class HHblits: addss.pl script provided by the HH-suite. However, your HH-suite installation requires you to specify paths to PSIRED etc. We refer to the HH-suite user - guide for further instructions. + guide for further instructions. Assignment is done + by calling :func:`HHblits.AssignSSToA3M` + :type assign_ss: :class:`bool` :return: The path to the A3M file containing the MSA -- GitLab