diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 00bd9b3e2765afaf9c8de816bfbbc5882b8c2eca..c517b5637ff1e422282281668c60f7c07944bb3b 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -12,6 +12,7 @@ Changes in Release <RELEASE NUMBER> * Extended lDDT API in ost.mol.alg module to reproduce functionality of lddt binary. * Added `actions` interface including one action to compare structures. + * Updated HHblits binding (minor changes for optional arguments). Changes in Release 1.7.1 -------------------------------------------------------------------------------- diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index 3b6b9107bbab5e5adcf22c9ea75286d1ca8959d7..b4a192631f961f54e07059a56057c839f2651007 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -697,6 +697,7 @@ def _Main(): reference_results["info"]["residue_names_consistent"] = is_cons reference_results["info"]["mapping"] = { "chain_mapping": qs_scorer.chain_mapping, + "chain_mapping_scheme": qs_scorer.chain_mapping_scheme, "alignments": _GetAlignmentsAsFasta(qs_scorer.alignments)} skip_score = False if opts.consistency_checks: diff --git a/docker/Dockerfile b/docker/Dockerfile index c3fa238362271d281a3416bac646b77ea1ea6be5..66a56b23585f308c76c5ce46554c92c5f90d6704 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,7 +95,7 @@ RUN cd ${SRC_FOLDER} && \ # COMPILE AND INSTALL DSSP ########################## RUN cd ${SRC_FOLDER} && \ - wget ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-${DSSP_VERSION}.tgz && \ + wget ftp://ftp.cmbi.umcn.nl/pub/molbio/software/dssp-2/dssp-${DSSP_VERSION}.tgz && \ tar -xvzf dssp-${DSSP_VERSION}.tgz && \ cd dssp-${DSSP_VERSION} && \ make -j ${CPUS_FOR_MAKE} && \ diff --git a/modules/bindings/doc/hhblits.rst b/modules/bindings/doc/hhblits.rst index c822d5bd0641d1fc6a863066f4196e3fe77f467a..9ffb1c0c6f5d81b2a35b4299e54a30165bf656e1 100644 --- a/modules/bindings/doc/hhblits.rst +++ b/modules/bindings/doc/hhblits.rst @@ -1,9 +1,6 @@ :mod:`~ost.bindings.hhblits` - Search related sequences in databases ================================================================================ -.. module:: ost.bindings.hhblits - :synopsis: Search related sequences in databases - Introduction -------------------------------------------------------------------------------- @@ -15,7 +12,7 @@ one is provided, queried with a sequence profile. The latter one needs to be calculated before the actual search. In very simple words, HHblits is using per-sequence scoring functions to be more sensitive, in this particular case Hidden Markov models. The software suite needed for HHblits can be found -`here <http://toolkit.tuebingen.mpg.de/hhblits>`_. +`here <http://wwwuser.gwdg.de/~compbiol/data/hhsuite/releases/all/>`_. Examples @@ -59,6 +56,9 @@ First query by sequence: for hit in hits: print hit.aln + # cleanup + hh.Cleanup() + Very similar going by file: .. code-block:: python @@ -84,6 +84,9 @@ Very similar going by file: for hit in hits: print hit.aln + # cleanup + hh.Cleanup() + The alignments produced by HHblits are sometimes slightly better than by BLAST, so one may want to extract them: @@ -105,24 +108,15 @@ so one may want to extract them: print output['msa'] + # cleanup + hh.Cleanup() + Binding API -------------------------------------------------------------------------------- -.. autoclass:: ost.bindings.hhblits.HHblits +.. automodule:: ost.bindings.hhblits + :synopsis: Search related sequences in databases :members: -.. 
autoclass:: ost.bindings.hhblits.HHblitsHit - -.. autoclass:: ost.bindings.hhblits.HHblitsHeader - -.. autofunction:: ost.bindings.hhblits.ParseHHblitsOutput - -.. autofunction:: ost.bindings.hhblits.ParseA3M - -.. autofunction:: ost.bindings.hhblits.ParseHeaderLine - -.. autofunction:: ost.bindings.hhblits.ParseHHM - -.. autofunction:: ost.bindings.hhblits.EstimateMemConsumption .. LocalWords: HHblits homologs diff --git a/modules/bindings/pymod/hhblits.py b/modules/bindings/pymod/hhblits.py index dddaa7404688f414eaa1a3915c6141b049ff52ce..0a1045171c167a5e24e958d4256739443b37970b 100644 --- a/modules/bindings/pymod/hhblits.py +++ b/modules/bindings/pymod/hhblits.py @@ -1,4 +1,4 @@ -'''HHblits wrapper. +'''HHblits wrapper classes and functions. ''' import subprocess @@ -124,8 +124,8 @@ def ParseHeaderLine(line): :param line: Line from the output header. :type line: :class:`str` - :return: Hit information - :rtype: :class:`HHblitsHit` + :return: Hit information and query/template offsets + :rtype: (:class:`HHblitsHit`, (:class:`int`, :class:`int`)) ''' for i in range(0, len(line)): if line[i].isdigit(): @@ -147,15 +147,14 @@ def ParseHeaderLine(line): def ParseHHblitsOutput(output): """ - Parses the HHblits output and returns a tuple of :class:`HHblitsHeader` and - a list of :class:`HHblitsHit` instances. + Parses the HHblits output as produced by :meth:`HHblits.Search` and returns + the header of the search results and a list of hits. - :param output: output of a :meth:`HHblits.Search`, needs to be iteratable, - e.g. an open file handle - :type output: :class:`file`/ iteratable + :param output: Iterable containing the lines of the HHblits output file + :type output: iterable (e.g. an open file handle) :return: a tuple of the header of the search results and the hits - :rtype: (:class:`HHblitsHeader`, :class:`HHblitsHit`) + :rtype: (:class:`HHblitsHeader`, :class:`list` of :class:`HHblitsHit`) """ lines = iter(output) def _ParseHeaderSection(lines): @@ -216,6 +215,12 @@ def ParseHHblitsOutput(output): return seq.CreateAlignment(s1, s2) try: while True: + # Lines which we are interested in: + # - "Done!" -> end of list + # - "No ..." -> next item in list + # - "T <hit_id> <start> <data> <end>" + # - "Q <query_id> <start> <data> <end>" + # -> rest is to be skipped line = lines.next() if len(line.strip()) == 0: continue @@ -239,19 +244,30 @@ def ParseHHblitsOutput(output): lines.next() continue assert entry_index != None + # Skip all "T ..." and "Q ..." lines besides the one we want if line[1:].startswith(' Consensus'): continue if line[1:].startswith(' ss_pred'): continue if line[1:].startswith(' ss_conf'): continue + if line[1:].startswith(' ss_dssp'): + continue if line.startswith('T '): end_pos = line.find(' ', 22) - assert end_pos != -1 + # this can fail if we didn't skip all other "T ..." lines + if end_pos == -1: + error_str = "Unparsable line '%s' for entry No %d" \ + % (line.strip(), entry_index + 1) + raise AssertionError(error_str) templ_str += line[22:end_pos] if line.startswith('Q '): end_pos = line.find(' ', 22) - assert end_pos != -1 + # this can fail if we didn't skip all other "Q ..." 
lines + if end_pos == -1: + error_str = "Unparsable line '%s' for entry No %d" \ + % (line.strip(), entry_index + 1) + raise AssertionError(error_str) query_str += line[22:end_pos] except StopIteration: if len(query_str) > 0: @@ -271,10 +287,10 @@ def ParseHHblitsOutput(output): def ParseA3M(a3m_file): ''' Parse secondary structure information and the multiple sequence alignment - out of an A3M file. + out of an A3M file as produced by :meth:`HHblits.BuildQueryMSA`. - :param a3m_file: Iteratable containing the lines of the A3M file - :type a3m_file: iteratable, e.g. an opened file + :param a3m_file: Iterable containing the lines of the A3M file + :type a3m_file: iterable (e.g. an open file handle) :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf" (:class:`list`) and "msa" (:class:`~ost.seq.AlignmentHandle`). @@ -323,36 +339,36 @@ def ParseA3M(a3m_file): t = msa_seq[0] al = seq.AlignmentList() for i in range(1, len(msa_seq)): - qs = '' - ts = '' - k = 0 - for c in msa_seq[i]: - if c.islower(): - qs += '-' - ts += c.upper() - else: - qs += t[k] - ts += c - k += 1 - nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs), - seq.CreateSequence(msa_head[i], ts)) - al.append(nl) + qs = '' + ts = '' + k = 0 + for c in msa_seq[i]: + if c.islower(): + qs += '-' + ts += c.upper() + else: + qs += t[k] + ts += c + k += 1 + nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs), + seq.CreateSequence(msa_head[i], ts)) + al.append(nl) profile_dict['msa'] = seq.alg.MergePairwiseAlignments(\ - al, - seq.CreateSequence(msa_head[0], - t)) + al, seq.CreateSequence(msa_head[0], t)) return profile_dict def ParseHHM(profile): - '''Parse secondary structure information and the MSA out of an HHM profile. + ''' + Parse secondary structure information and the MSA out of an HHM profile as + produced by :meth:`HHblits.A3MToProfile`. :param profile: Opened file handle holding the profile. :type profile: :class:`file` :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf" (:class:`list`), "msa" (:class:`~ost.seq.AlignmentHandle`) and - "consensus" (~ost.seq.SequenceHandle). + "consensus" (:class:`~ost.seq.SequenceHandle`). ''' profile_dict = dict() state = 'NONE' @@ -423,25 +439,13 @@ def ParseHHM(profile): seq.CreateSequence(msa_head[i], ts)) al.append(nl) profile_dict['msa'] = seq.alg.MergePairwiseAlignments(\ - al, - seq.CreateSequence(msa_head[0], t)) - #print profile_dict['msa'].ToString(80) + al, seq.CreateSequence(msa_head[0], t)) + #print profile_dict['msa'].ToString(80) # Consensus profile_dict['consensus'] = seq.CreateSequence('Consensus', consensus_txt) return profile_dict -def EstimateMemConsumption(): - """ - Estimate the memory needed by HHblits. By default it uses not more than 3G. - Also for small sequences it already uses quite some memnmory (46AA, 1.48G). - And since the memory consumption could depend on the iterative search runs, - how many hits are found in each step, we just go with 4G, here. - - :return: Assumed memory consumtion - :rtype: (:class:`float`, :class:`str`) - """ - return 4.0, 'G' class HHblits: """ @@ -461,7 +465,6 @@ class HHblits: :param working_dir: Directory for temporary files. Will be created if not present but **not** automatically deleted. 
:type working_dir: :class:`str` - """ OUTPUT_PREFIX = 'query_hhblits' def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None): @@ -474,6 +477,8 @@ class HHblits: self.hhblits_bin = settings.Locate('hhblits', explicit_file_name=hhblits_bin) self.bin_dir = os.path.dirname(self.hhblits_bin) + # guess root folder (note: this may fail in future) + self.hhsuite_root = os.path.dirname(self.bin_dir) self.hhlib_dir = os.path.join(self.hhsuite_root, 'lib', 'hh') if working_dir: self.needs_cleanup = False @@ -501,63 +506,63 @@ class HHblits: self.working_dir = tmp_dir.dirname self.filename = tmp_dir.files[0] - def Cleanup(self): - """Delete temporary data. + def BuildQueryMSA(self, nrdb, options={}, a3m_file=None): + """Builds the MSA for the query sequence. - Delete temporary data if no working dir was given. Controlled by - :attr:`needs_cleanup`. - """ - if self.needs_cleanup and os.path.exists(self.working_dir): - shutil.rmtree(self.working_dir) + This function directly uses hhblits of hhtools. While in theory it would + be possible to do this by PSI-blasting on our own, hhblits is supposed + to be faster. Also it is supposed to prevent alignment corruption. The + alignment corruption is caused by low-scoring terminal alignments that + draw the sequences found by PSI-blast away from the optimum. By removing + these low scoring ends, part of the alignment corruption can be + suppressed. - def BuildQueryMSA(self, nrdb, iterations=1, mact=None, cpu=1): - """Builds the MSA for the query sequence + hhblits does **not** call PSIPRED on the MSA to predict the secondary + structure of the query sequence. This is done by addss.pl of hhtools. + The predicted secondary structure is stored together with the sequences + identified by hhblits. - This function directly uses hhblits of hhtools. While in theory it - would be possible to do this by PSI-blasting on our own, hhblits is - supposed to be faster. Also it is supposed to prevent alignment - corruption. The alignment corruption is caused by low-scoring terminal - alignments that draw the sequences found by PSI-blast away from the - optimum. By removing these low scoring ends, part of the alignment - corruption can be suppressed. hhblits does **not** call PSIPRED on the - MSA to predict the secondary structure of the query sequence. This is - done by addss.pl of hhtools. The predicted secondary structure is - stored together with the sequences identified by hhblits. + The produced A3M file can be parsed by :func:`ParseA3M`. If the file was + already produced, hhblits is not called again and the existing file path + is returned. :param nrdb: Database to be align against; has to be an hhblits database :type nrdb: :class:`str` - :param iterations: Number of hhblits iterations - :type iterations: :class:`int` - - :param mact: ``-mact`` of hhblits - :type mact: :class:`float` + :param options: Dictionary of options to *hhblits*, one "-" is added in + front of every key. Boolean True values add flag without + value. Merged with default options {'cpu': 1, 'n': 1}, + where 'n' defines the number of iterations. 
+ :type options: :class:`dict` - :param cpu: ``-cpu`` of hhblits - :type cpu: :class:`int` + :param a3m_file: a path of a3m_file to be used, optional + :type a3m_file: :class:`str` - :return: the path to the MSA file + :return: The path to the A3M file containing the MSA :rtype: :class:`str` """ - a3m_file = '%s.a3m' % os.path.splitext(self.filename)[0] + if a3m_file is None: + a3m_file = '%s.a3m' % os.path.splitext(self.filename)[0] + if os.path.exists(a3m_file): + ost.LogInfo('Reusing already existing query alignment (%s)' % a3m_file) + return a3m_file ost.LogInfo('Using hhblits from "%s"' % self.hhsuite_root) full_nrdb = os.path.join(os.path.abspath(os.path.split(nrdb)[0]), os.path.split(nrdb)[1]) # create MSA - hhblits_cmd = '%s -e 0.001 -cpu %d -i %s -oa3m %s -d %s -n %d' % \ - (self.hhblits_bin, cpu, self.filename, a3m_file, - full_nrdb, iterations) - if mact: - hhblits_cmd += '-mact %f' % mact + opts = {'cpu' : 1, # no. of cpus used + 'n' : 1} # no. of iterations + opts.update(options) + opt_cmd, _ = _ParseOptions(opts) + hhblits_cmd = '%s -e 0.001 -i %s -oa3m %s -d %s %s' % \ + (self.hhblits_bin, self.filename, a3m_file, full_nrdb, + opt_cmd) job = subprocess.Popen(hhblits_cmd, shell=True, cwd=self.working_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, _ = job.communicate() - #lines = sout.splitlines() - #for l in lines: - # print l.strip() - #lines = serr.splitlines() - #for l in lines: - # print l.strip() + lines = sout.splitlines() + for line in lines: + ost.LogVerbose(line.strip()) if not os.path.exists(a3m_file): ost.LogWarning('Building query profile failed, no output') return a3m_file @@ -580,7 +585,7 @@ class HHblits: if 'error' in line.lower(): ost.LogWarning('Predicting secondary structure for MSA '+ '(%s) failed, on command: %s' % (a3m_file, line)) - return a3m_file + return a3m_file return a3m_file def A3MToProfile(self, a3m_file, hhm_file=None): @@ -588,13 +593,18 @@ class HHblits: Converts the A3M alignment file to a hhm profile. If hhm_file is not given, the output file will be set to <:attr:`a3m_file`-basename>.hhm. - :param a3m_file: input MSA + The produced A3M file can be parsed by :func:`ParseHHM`. + + If the file was already produced, the existing file path is returned + without recomputing it. + + :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA` :type a3m_file: :class:`str` - :param hhm_file: output file name + :param hhm_file: Desired output file name :type hhm_file: :class:`str` - :return: the path to the profile + :return: Path to the profile file :rtype: :class:`str` """ hhmake = os.path.join(self.bin_dir, 'hhmake') @@ -609,24 +619,27 @@ class HHblits: raise IOError('could not convert a3m to hhm file') return hhm_file - def A3MToCS(self, a3m_file, cs_file=None, options={}): """ Converts the A3M alignment file to a column state sequence file. If cs_file is not given, the output file will be set to <:attr:`a3m_file`-basename>.seq219. - :param a3m_file: A3M file to be converted + If the file was already produced, the existing file path is returned + without recomputing it. + + :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA` :type a3m_file: :class:`str` - :param cs_file: output file name (may be omitted) + :param cs_file: Output file name (may be omitted) :type cs_file: :class:`str` - :param options: dictionary of options to *cstranslate*, must come with - the right amount of '-' in front. + :param options: Dictionary of options to *cstranslate*, one "-" is added + in front of every key. 
Boolean True values add flag + without value. :type options: :class:`dict` - :return: the path to the column state sequence file + :return: Path to the column state sequence file :rtype: :class:`str` """ cstranslate = os.path.join(self.hhlib_dir, 'bin', 'cstranslate') @@ -634,31 +647,36 @@ class HHblits: cs_file = '%s.seq219' % os.path.splitext(a3m_file)[0] if os.path.exists(cs_file): return cs_file - opt_cmd = list() - for k, val in options.iteritems(): - if type(val) == type(True): - if val == True: - opt_cmd.append('%s' % str(k)) - else: - opt_cmd.append('%s %s' % (str(k), str(val))) - opt_cmd = ' '.join(opt_cmd) - cs_cmd = '%s -i %s -o %s %s' % (cstranslate, a3m_file, cs_file, opt_cmd) + opt_cmd, _ = _ParseOptions(options) + cs_cmd = '%s -i %s -o %s %s' % ( + cstranslate, + os.path.abspath(a3m_file), + os.path.abspath(cs_file), + opt_cmd) ost.LogVerbose('converting %s to %s' % (a3m_file, cs_file)) - job = subprocess.Popen(cs_cmd, shell=True, + job = subprocess.Popen(cs_cmd, shell=True, cwd=self.working_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, _ = job.communicate() - #lines = serr.splitlines() - #for l in lines: - # print l lines = sout.splitlines() for line in lines: - if line in 'Wrote abstract state sequence to %s' % cs_file: + if 'Wrote abstract state sequence to' in line: return cs_file ost.LogWarning('Creating column state sequence file (%s) failed' % \ cs_file) + def Cleanup(self): + """Delete temporary data. + + Delete temporary data if no working dir was given. Controlled by + :attr:`needs_cleanup`. + """ + if self.needs_cleanup and os.path.exists(self.working_dir): + shutil.rmtree(self.working_dir) + def CleanupFailed(self): '''In case something went wrong, call to make sure everything is clean. + + This will delete the working dir independently of :attr:`needs_cleanup`. ''' store_needs_cleanup = self.needs_cleanup self.needs_cleanup = True @@ -667,50 +685,41 @@ class HHblits: def Search(self, a3m_file, database, options={}, prefix=''): """ - Searches for templates in the given database. Before running the - search, the hhm file is copied. This makes it possible to launch - several hhblits instances at once. Upon success, the filename of the - result file is returned. This file may be parsed with - :func:`ParseHHblitsOutput`. + Searches for templates in the given database. Before running the search, + the hhm file is copied. This makes it possible to launch several hhblits + instances at once. Upon success, the filename of the result file is + returned. This file may be parsed with :func:`ParseHHblitsOutput`. - :param a3m_file: input MSA file + :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA` :type a3m_file: :class:`str` - :param database: search database, needs to be the common prefix of the + :param database: Search database, needs to be the common prefix of the database files :type database: :class:`str` - :param options: dictionary of options, must come with the right amount - of '-' in front. + :param options: Dictionary of options to *hhblits*, one "-" is added in + front of every key. Boolean True values add flag without + value. Merged with default options {'cpu': 1, 'n': 1}, + where 'n' defines the number of iterations. :type options: :class:`dict` - :param prefix: prefix to the result file + :param prefix: Prefix to the result file :type prefix: :class:`str` - :return: the path to the result file + :return: The path to the result file :rtype: :class:`str` """ opts = {'cpu' : 1, # no. of cpus used 'n' : 1} # no. 
of iterations opts.update(options) - opt_cmd = [] - opt_str = [] - for k, val in opts.iteritems(): - if type(val) == type(True): - if val == True: - opt_cmd.append('-%s' % str(k)) - opt_str.append(str(k)) - else: - opt_cmd.append('-%s %s' % (str(k), str(val))) - opt_str.append('%s%s' % (str(k), str(val))) - opt_cmd = ' '.join(opt_cmd) - opt_str = '_'.join(opt_str) + opt_cmd, opt_str = _ParseOptions(opts) base = os.path.basename(os.path.splitext(a3m_file)[0]) hhr_file = '%s%s_%s.hhr' % (prefix, base, opt_str) hhr_file = os.path.join(self.working_dir, hhr_file) - search_cmd = '%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s'%( + search_cmd = '%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s' % ( self.hhblits_bin, - opt_cmd, os.path.abspath(a3m_file), + opt_cmd, + os.path.abspath(a3m_file), hhr_file, os.path.join(os.path.abspath(os.path.split(database)[0]), os.path.split(database)[1])) @@ -722,20 +731,44 @@ class HHblits: if job.returncode != 0: lines = sout.splitlines() for line in lines: - print line.strip() + ost.LogError(line.strip()) lines = serr.splitlines() for line in lines: - print line.strip() + ost.LogError(line.strip()) return None return hhr_file -__all__ = ['HHblits', 'HHblitsHit', 'HHblitsHeader', 'ParseHeaderLine', +def _ParseOptions(opts): + """ + :return: Tuple of strings (opt_cmd, opt_str), where opt_cmd can be + passed to command ("-" added in front of keys, options + separated by space) and opt_str (options separated by "_") + can be used for filenames. + :param opts: Dictionary of options, one "-" is added in front of every + key. Boolean True values add flag without value. + """ + opt_cmd = list() + opt_str = list() + for k, val in opts.iteritems(): + if type(val) == type(True): + if val == True: + opt_cmd.append('-%s' % str(k)) + opt_str.append(str(k)) + else: + opt_cmd.append('-%s %s' % (str(k), str(val))) + opt_str.append('%s%s' % (str(k), str(val))) + opt_cmd = ' '.join(opt_cmd) + opt_str = '_'.join(opt_str) + return opt_cmd, opt_str + + +__all__ = ['HHblits', 'HHblitsHit', 'HHblitsHeader', 'ParseHHblitsOutput', 'ParseA3M', 'ParseHHM', - 'EstimateMemConsumption'] + 'ParseHeaderLine'] -# LocalWords: HHblits MSA hhblits hhtools PSIPRED addss param nrdb str mact +# LocalWords: HHblits MSA hhblits hhtools PSIPRED addss param nrdb str # LocalWords: cpu hhm func ParseHHblitsOutput ss pred conf msa hhsuite dir # LocalWords: attr basename rtype cstranslate tuple HHblitsHeader meth aln -# LocalWords: HHblitsHit iteratable evalue pvalue neff hmms datetime +# LocalWords: HHblitsHit iterable evalue pvalue neff hmms datetime # LocalWords: whitespace whitespaces diff --git a/modules/bindings/pymod/utils.py b/modules/bindings/pymod/utils.py index 02edd598f9b664c165f385b92429b619d7bf09fd..e476d04a9a752c64f86cf0f0e4d0a4418cfa405c 100644 --- a/modules/bindings/pymod/utils.py +++ b/modules/bindings/pymod/utils.py @@ -29,7 +29,7 @@ def SaveToTempDir(objects, seq_format='fasta', structure_format='pdb'): file_names.append(name) continue if isinstance(obj, mol.EntityView) or isinstance(obj, mol.EntityHandle): - name=os.path.join(tmp_dir_name, tmp_dir_name, 'mol%02d.pdb' % (index+1)) + name=os.path.join(tmp_dir_name, 'mol%02d.pdb' % (index+1)) io.SaveEntity(obj, name, structure_format) file_names.append(name) continue diff --git a/modules/bindings/tests/test_hhblits.py b/modules/bindings/tests/test_hhblits.py index c396528b8c623d620e07f7611431760afcfd1725..2ba099ced0c4e7f5dd7d0856431cd48d22a326b2 100644 --- a/modules/bindings/tests/test_hhblits.py +++ 
b/modules/bindings/tests/test_hhblits.py @@ -196,7 +196,7 @@ class TestHHblitsBindings(unittest.TestCase): _, self.tmpfile = tempfile.mkstemp(suffix='.seq219') os.remove(self.tmpfile) csfile = self.hh.A3MToCS("testfiles/testali.a3m", - cs_file=self.tmpfile, options={'--alphabet' : + cs_file=self.tmpfile, options={'-alphabet' : os.path.join(self.hh.hhlib_dir, 'data', 'cs219.lib')}) @@ -211,7 +211,7 @@ class TestHHblitsBindings(unittest.TestCase): 'TSKYR') self.hh = hhblits.HHblits(query_seq, self.hhroot) csfile = self.hh.A3MToCS("testfiles/testali.a3m", - options={'--alphabet' : + options={'-alphabet' : os.path.join(self.hh.hhlib_dir, 'data', 'cs219.lib')}) @@ -228,7 +228,7 @@ class TestHHblitsBindings(unittest.TestCase): self.hh = hhblits.HHblits(query_seq, self.hhroot) csfile = self.hh.A3MToCS("testfiles/testali.a3m", cs_file='testfiles/test.seq219', - options={'--alphabet' : + options={'-alphabet' : os.path.join(self.hh.hhlib_dir, 'data', 'cs219.lib')}) @@ -305,6 +305,17 @@ class TestHHblitsBindings(unittest.TestCase): 'HHHHHHHHHHHHCC') self.assertEqual(prof['msa'].GetCount(), 253) + def fastParseHeader(self): + header_line = ' 1 814cbc1899f35c872169524af30fc2 100.0 5E-100' + \ + ' 5E-104 710.5 34.1 277 3-293 2-280 (281)' + hit, offset = hhblits.ParseHeaderLine(header_line) + self.assertEqual(hit.hit_id, '814cbc1899f35c872169524af30fc2') + self.assertAlmostEqual(hit.evalue, 0) + self.assertAlmostEqual(hit.prob, 100.0) + self.assertAlmostEqual(hit.pvalue, 0) + self.assertAlmostEqual(hit.score, 710.5) + self.assertAlmostEqual(hit.ss_score, 34.1) + def testParseHHblitsOutput(self): header, hits = hhblits.ParseHHblitsOutput(open("testfiles/test.hhr")) self.assertEqual(header.query, 'Test') @@ -385,8 +396,6 @@ class TestHHblitsBindings(unittest.TestCase): 'Test VDPVNFKLLSHCLLVTLAAHL\ne69e1ac0'+ 'a4b2554d... ATPEQAQLVHKEIRKIVKDTC\n') -# ParseHHblitsOutput - if __name__ == "__main__": hhsuite_root_dir = os.getenv('EBROOTHHMINSUITE') if not hhsuite_root_dir: diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index 779a95ea47b393a7ede6dd395f1f6d650fda576d..1d22faf44e6cc8890ffd2a621c49fb0d35da0ed3 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -44,8 +44,10 @@ The output file has following format: "<MODEL NAME>": { # Model name extracted from the file name "<REFERENCE NAME>": { # Reference name extracted from the file name "info": { - "residue_names_consistent": <Are the residue numbers consistent? true or false>, - "mapping": <Mapping of chains eg. {"A": "B", "B": "A"}>, + "residue_names_consistent": <Are the residue numbers consistent? true or false>, + "mapping": { + "chain_mapping": <Mapping of chains eg. {"A": "B", "B": "A"}>, + "chain_mapping_scheme": <Scheme used to get mapping, check mapping manually if "permissive" or "extensive">, "alignments": <list of chain-chain alignments in FASTA format> } }, @@ -72,10 +74,10 @@ The output file has following format: "per_residue_scores": [ # per-residue lDDT scores - calculated when --save-per-residue-scores (-spr) option is selected { "total_contacts": <total number of contacts between model and reference>, - "residue_name": <three letter code of the residue>, + "residue_name": <three letter code of the residue in reference chain>, "lddt": <residue lDDT score>, "conserved_contacts": <number of conserved contacts between model and reference for given residue>, - "residue_number": <total number of contacts between model and reference for given residue> + "residue_number": <residue number in reference chain> }, . . 
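The JSON layout documented above, including the new "chain_mapping_scheme" field, can be consumed directly from Python. The sketch below is not part of the patch and only illustrates a possible downstream check; the file name output.json is an assumption, while the key layout follows the schema documented above.

.. code-block:: python

  import json

  # load the JSON written by the compare-structures action
  # (the file name is an assumption for this example)
  with open('output.json') as json_file:
      data = json.load(json_file)

  # walk result -> model name -> reference name, as documented above
  for model_name, references in data['result'].items():
      for reference_name, result in references.items():
          mapping = result['info']['mapping']
          scheme = mapping.get('chain_mapping_scheme')
          if scheme in ('permissive', 'extensive'):
              # these schemes are flagged above as "check mapping manually"
              print('check mapping of %s vs %s manually: %s'
                    % (model_name, reference_name, mapping['chain_mapping']))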
@@ -164,51 +166,57 @@ Example usage: ################################################################################ Performing structural checks --> for reference(s) - Checking + Checking reference.pdb Checking stereo-chemistry - Average Z-Score for bond lengths: -nan - Bonds outside of tolerance range: 0 out of 0 + Average Z-Score for bond lengths: 0.13694 + Bonds outside of tolerance range: 0 out of 2654 Bond Avg Length Avg zscore Num Bonds - Average Z-Score angle widths: 0.00000 - Angles outside of tolerance range: 0 out of 1 + C-C 1.50876 0.09299 1501 + C-N 1.42978 0.17690 635 + C-O 1.25079 0.21528 518 + Average Z-Score angle widths: 0.07562 + Angles outside of tolerance range: 0 out of 2941 Filtering non-bonded clashes 0 non-bonded short-range distances shorter than tolerance distance Distances shorter than tolerance are on average shorter by: 0.00000 --> for model(s) - Checking + Checking model.pdb Checking stereo-chemistry - Average Z-Score for bond lengths: -nan - Bonds outside of tolerance range: 0 out of 0 + Average Z-Score for bond lengths: -0.22524 + Bonds outside of tolerance range: 0 out of 2774 Bond Avg Length Avg zscore Num Bonds - Average Z-Score angle widths: 0.00000 - Angles outside of tolerance range: 0 out of 1 + C-C 1.50225 -0.20158 1558 + C-N 1.42294 -0.12261 666 + C-O 1.24232 -0.42115 546 + C-S 1.80215 0.20858 4 + Average Z-Score angle widths: -0.06767 + Angles outside of tolerance range: 0 out of 3079 Filtering non-bonded clashes 0 non-bonded short-range distances shorter than tolerance distance Distances shorter than tolerance are on average shorter by: 0.00000 ################################################################################ - Comparing to - Chains removed from : _ - Chains in : AB - Chains in : AB - Chemically equivalent chain-groups in pdb_1: [['B', 'A']] - Chemically equivalent chain-groups in pdb_2: [['A', 'B']] + Comparing model.pdb to reference.pdb + Chains in reference.pdb: AB + Chains in model.pdb: AB + Chemically equivalent chain-groups in reference.pdb: [['B', 'A']] + Chemically equivalent chain-groups in model.pdb: [['A', 'B']] Chemical chain-groups mapping: {('B', 'A'): ('A', 'B')} Identifying Symmetry Groups... - Symmetry threshold 0.1 used for angles of pdb_1 - Symmetry threshold 0.1 used for axis of pdb_1 - Symmetry threshold 0.1 used for angles of pdb_2 - Symmetry threshold 0.1 used for axis of pdb_2 + Symmetry threshold 0.1 used for angles of reference.pdb + Symmetry threshold 0.1 used for axis of reference.pdb + Symmetry threshold 0.1 used for angles of model.pdb + Symmetry threshold 0.1 used for axis of model.pdb Selecting Symmetry Groups... 
- Symmetry-groups used in pdb_1: [('B',), ('A',)] - Symmetry-groups used in pdb_2: [('A',), ('B',)] + Symmetry-groups used in reference.pdb: [('B',), ('A',)] + Symmetry-groups used in model.pdb: [('A',), ('B',)] Closed Symmetry with strict parameters Mapping found: {'A': 'B', 'B': 'A'} -------------------------------------------------------------------------------- - Checking consistency between and + Checking consistency between model.pdb and reference.pdb Consistency check: OK -------------------------------------------------------------------------------- Computing QS-score - QSscore pdb_1, pdb_2: best: 0.90, global: 0.90 + QSscore reference.pdb, model.pdb: best: 0.90, global: 0.90 -------------------------------------------------------------------------------- Computing lDDT scores lDDT settings: @@ -226,8 +234,8 @@ Example usage: Global LDDT score: 0.7854 (904568 conserved distances out of 1151664 checked, over 4 thresholds) --> Computing oligomeric lDDT score - Reference pdb_1 has: 2 chains - Model pdb_2 has: 2 chains + Reference reference.pdb has: 2 chains + Model model.pdb has: 2 chains Coverage: 1 (384 out of 384 residues) Oligo lDDT score: 0.8025 --> Computing weighted lDDT score @@ -243,8 +251,8 @@ In the example above the output file looks as follows: { "result": { - "": { - "": { + "model.pdb": { + "reference.pdb": { "info": { "residue_names_consistent": true, "mapping": { @@ -252,6 +260,7 @@ In the example above the output file looks as follows: "A": "B", "B": "A" }, + "chain_mapping_scheme": "strict", "alignments": [ ">reference:A\n-PGLFLTLEGLDGSGKTTQARRLAAFLEAQGRPVLLTREPGGGLPEVRSL---QELSPEAEYLLFSADRAEHVRKVILPGLAAGKVVISDRYLDSSLAYQGYGRGLPLPWLREVAREATRGLKPRLTFLLDLPPEAALRRVR-------LGLEFFRRVREGYLALARAEPGRFVVLDATLPEEEIARAIQAHLRPLLP\n>model:B\nMPGLFLTLEGLDGSGKTTQARRLAAFLEAQGRPVLLTREPGGGLPEVRSLLLTQELSPEAEYLLFSADRAEHVRKVILPGLAAGKVVISDRYLDSSLAYQGYGRGLPLPWLREVAREATRGLKPRLTFLLDLPPEAALRRVRRPDRLEGLGLEFFRRVREGYLALARAEPGRFVVLDATLPEEEIARAIQAHLRPLLP", ">reference:B\n-PGLFLTLEGLDGSGKTTQARRLAAFLEAQGRPVLLTREPGGGLPEVRSLLLTQELSPEAEYLLFSADRAEHVRKVILPGLAAGKVVISDRYLDSSLAYQGYGRGLPLPWLREVAREATRGLKPRLTFLLDLPPEAALRRVRRPDRLEGLGLEFFRRVREGYLALARAEPGRFVVLDATLPEEEIARAIQAHLRPLLP\n>model:A\nMPGLFLTLEGLDGSGKTTQARRLAAFLEAQGRPVLLTREPGGGLPEVRSLLLTQELSPEAEYLLFSADRAEHVRKVILPGLAAGKVVISDRYLDSSLAYQGYGRGLPLPWLREVAREATRGLKPRLTFLLDLPPEAALRRVRRPDRLEGLGLEFFRRVREGYLALARAEPGRFVVLDATLPEEEIARAIQAHLRPLLP" @@ -261,7 +270,7 @@ In the example above the output file looks as follows: "lddt": { "oligo_lddt": { "status": "SUCCESS", - "global_score": 0.8025223016738892, + "global_score": 0.8025223275721413, "error": "" }, "weighted_lddt": { @@ -311,6 +320,7 @@ In the example above the output file looks as follows: "save_per_residue_scores": false, "fault_tolerant": false, "reference_selection": "", + "qs_rmsd": false, "cwd": "CWD", "inclusion_radius": 15.0, "angle_tolerance": 15.0, @@ -340,7 +350,8 @@ calculate eg. QS-score directly: .. code:: console - $OST_ROOT/bin/ost compare-structures --model model.pdb --reference reference.pdb --output output_qs.json --qs-score --residue-number-alignment + $ $OST_ROOT/bin/ost compare-structures --model model.pdb --reference reference.pdb --output output_qs.json --qs-score --residue-number-alignment + ################################################################################ Reading input files (fault_tolerant=False) --> reading model from model.pdb @@ -349,7 +360,6 @@ calculate eg. 
QS-score directly: imported 3 chains, 408 residues, 3011 atoms; with 0 helices and 0 strands ################################################################################ Comparing model.pdb to reference.pdb - Chains removed from reference.pdb: _ Chains in reference.pdb: AB Chains in model.pdb: AB Chemically equivalent chain-groups in reference.pdb: [['B', 'A']] diff --git a/modules/io/doc/io.rst b/modules/io/doc/io.rst index 580c6650a8ead19afc4d372bc65b106fdb7027f2..ccff89f86ea08263adbed07e6bf45d598541594a 100644 --- a/modules/io/doc/io.rst +++ b/modules/io/doc/io.rst @@ -142,8 +142,8 @@ Loading sequence or alignment files myseq = io.LoadSequence('seq.fasta') # for obtaining a SequenceList seqlist = io.LoadSequenceList('seqs.fasta') - # or for multiple aligned fasta files use - aln = io.LoadAlignment('algnm.aln',format="clustal") + # or for multiple alignments (here from CLUSTAL) + aln = io.LoadAlignment('algnm.aln', format="clustal") For a list of file formats supported by :func:`LoadSequence` see :doc:`sequence_formats`. @@ -212,11 +212,11 @@ Saving Sequence Data .. code-block:: python # recognizes FASTA file by file extension - io.SaveSequence(myseq,'seq.fasta') + io.SaveSequence(myseq, 'seq.fasta') # for saving a SequenceList - io.SaveSequenceList(seqlist,'seqlist.fasta') - # or multiple aligned fasta files - io.SaveAlignment(aln,'algnm.aln',format="clustal") + io.SaveSequenceList(seqlist, 'seqlist.fasta') + # or for multiple alignments (here in FASTA format) + io.SaveAlignment(aln, 'aln.fasta') For a list of file formats supported by :func:`SaveSequence` see :doc:`sequence_formats`. diff --git a/modules/mol/alg/pymod/qsscoring.py b/modules/mol/alg/pymod/qsscoring.py index dbe8c1dba6c332440915ac058b1d8e75cf8f7f4f..228fbb43208b9fee2b1b763275636e6a64849945 100644 --- a/modules/mol/alg/pymod/qsscoring.py +++ b/modules/mol/alg/pymod/qsscoring.py @@ -181,6 +181,7 @@ class QSscorer: self._symm_1 = None self._symm_2 = None self._chain_mapping = None + self._chain_mapping_scheme = None self._alignments = None self._mapped_residues = None self._global_score = None @@ -207,9 +208,8 @@ class QSscorer: :attr:`qs_ent_1` and value = :class:`tuple` of chain names in :attr:`qs_ent_2`. - :raises: :class:`QSscoreError` if we end up having less than 2 chains for - either entity in the mapping (can happen if chains do not have CA - atoms). + :raises: :class:`QSscoreError` if we end up having no chains for either + entity in the mapping (can happen if chains do not have CA atoms). """ if self._chem_mapping is None: self._chem_mapping = _GetChemGroupsMapping(self.qs_ent_1, self.qs_ent_2) @@ -364,12 +364,41 @@ class QSscorer: to find a chain mapping. """ if self._chain_mapping is None: - self._chain_mapping = _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2, - self.symm_1, self.symm_2, - self.chem_mapping) + self._chain_mapping, self._chain_mapping_scheme = \ + _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2, self.symm_1, + self.symm_2, self.chem_mapping) LogInfo('Mapping found: %s' % str(self._chain_mapping)) return self._chain_mapping + @property + def chain_mapping_scheme(self): + """Mapping scheme used to get :attr:`chain_mapping`. + + Possible values: + + - 'strict': 80% overlap needed within 4 Angstrom (overlap based mapping). + - 'tolerant': 40% overlap needed within 6 Angstrom (overlap based mapping). + - 'permissive': 20% overlap needed within 8 Angstrom (overlap based + mapping). It's best if you check mapping manually! 
+ - 'extensive': Extensive search used for mapping detection (fallback). This + approach has known limitations and may be removed in future versions. + Mapping should be checked manually! + - 'user': :attr:`chain_mapping` was set by user before first use of this + attribute. + + :getter: Computed with :attr:`chain_mapping` on first use (cached) + :type: :class:`str` + :raises: :class:`QSscoreError` as in :attr:`chain_mapping`. + """ + if self._chain_mapping_scheme is None: + # default: user provided + self._chain_mapping_scheme = 'user' + # get chain mapping and make sure internal variable is set + # -> will not compute and only update _chain_mapping if user provided + # -> will compute and overwrite _chain_mapping_scheme else + self._chain_mapping = self.chain_mapping + return self._chain_mapping_scheme + @property def alignments(self): """List of successful sequence alignments using :attr:`chain_mapping`. @@ -424,6 +453,7 @@ class QSscorer: :getter: Computed on first use (cached) :type: :class:`float` + :raises: :class:`QSscoreError` if only one chain is mapped """ if self._global_score is None: self._ComputeScores() @@ -439,6 +469,7 @@ class QSscorer: :getter: Computed on first use (cached) :type: :class:`float` + :raises: :class:`QSscoreError` if only one chain is mapped """ if self._best_score is None: self._ComputeScores() @@ -1308,7 +1339,7 @@ class MappedLDDTScorer(object): existing in model and reference: - "residue_number": Residue number in reference chain - - "residue_name": Residue number in reference chain + - "residue_name": Residue name in reference chain - "lddt": local lDDT - "conserved_contacts": number of conserved contacts - "total_contacts": total number of contacts @@ -1782,7 +1813,9 @@ def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping): def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping): """ - :return: Mapping from *ent_1* to *ent_2* (see :attr:`QSscorer.chain_mapping`) + :return: Tuple with mapping from *ent_1* to *ent_2* (see + :attr:`QSscorer.chain_mapping`) and scheme used (see + :attr:`QSscorer.chain_mapping_scheme`) :param ent_1: See :attr:`QSscorer.ent_to_cm_1` :param ent_2: See :attr:`QSscorer.ent_to_cm_2` @@ -1808,7 +1841,7 @@ def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping): if scheme == 'permissive': LogWarning('Permissive thresholds used for overlap based mapping ' + \ 'detection: check mapping manually: %s' % mapping) - return mapping + return mapping, scheme # NOTE that what follows below is sub-optimal: # - if the two structures don't fit at all, we may map chains rather randomly @@ -1908,7 +1941,7 @@ def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping): LogWarning('Extensive search used for mapping detection (fallback). This ' + \ 'approach has known limitations. Check mapping manually: %s' \ % mapping) - return mapping + return mapping, 'extensive' def _GetSymmetrySubgroups(qs_ent, ent, chem_groups): diff --git a/singularity/Singularity.1.7.1 b/singularity/Singularity.1.7.1 index 8a96d0a8f0ed575e5b2a5c0ad007076cddd03a4b..b80190001bf093f2b10c7cd55886e19db95b7ee4 100644 --- a/singularity/Singularity.1.7.1 +++ b/singularity/Singularity.1.7.1 @@ -65,7 +65,7 @@ locale-gen en_US.UTF-8 # INSTALL SOME PYTHON PACKAGES GLOBALY ###################################### -pip install --upgrade pip && pip install --no-cache-dir numpy==1.10.4 \ +pip install --no-cache-dir numpy==1.10.4 \ scipy==1.0.0 \ pandas==0.22.0 @@ -110,7 +110,7 @@ fi ############## cd ${SRC_FOLDER} if [ ! 
-f dssp-${DSSP_VERSION}.tgz ]; then - wget ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-${DSSP_VERSION}.tgz + wget ftp://ftp.cmbi.umcn.nl/pub/molbio/software/dssp-2/dssp-${DSSP_VERSION}.tgz tar -xvzf dssp-${DSSP_VERSION}.tgz cd dssp-${DSSP_VERSION} make -j ${CPUS_FOR_MAKE}
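With the hhblits.py changes above, BuildQueryMSA(), A3MToCS() and Search() share the option handling consolidated in _ParseOptions(): one "-" is prepended to every key, boolean True values become bare flags, and a second, filename-friendly string is built in parallel. The standalone sketch below mirrors that behaviour for illustration only; the name format_options and the example option keys are not part of the patch.

.. code-block:: python

  # Standalone sketch mirroring the _ParseOptions() helper added above
  # (format_options is a hypothetical name used only for illustration).
  def format_options(opts):
      opt_cmd = []
      opt_str = []
      for key, val in opts.items():
          if isinstance(val, bool):
              if val:
                  opt_cmd.append('-%s' % key)   # boolean True -> bare flag
                  opt_str.append(str(key))
          else:
              opt_cmd.append('-%s %s' % (key, val))
              opt_str.append('%s%s' % (key, val))
      return ' '.join(opt_cmd), '_'.join(opt_str)

  # defaults merged by BuildQueryMSA/Search plus illustrative extra settings
  print(format_options({'cpu': 2, 'n': 3, 'mact': 0.35, 'norealign': True}))
  # ('-cpu 2 -n 3 -mact 0.35 -norealign', 'cpu2_n3_mact0.35_norealign')
  # (order follows the dict)

This automatic leading "-" is also why the tests above now pass '-alphabet' instead of '--alphabet' to A3MToCS(): the dash added by the helper restores the full '--alphabet' flag on the cstranslate command line.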