Skip to content
Snippets Groups Projects
Commit adb141f9 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

enable pairwise sequence alignments with parasail

Parasail: https://github.com/jeffdaily/parasail

It's faster than the pairwise sequence algorithms we have in
OpenStructure. However, the main reason to check out another solution
is the fact that OpenStructure may return suboptimal alignment
results. This is a result of how the dynamic programming table is built.
If for a certain cell in the dynamic programming table a match state and
insertion state score equally, match is preferred. There is a chance
that we would get a better score later on if we would stick with the
insertion state (gap open penalties vs. gap extension penalties).
Not sure if this will be fixed in the near future.
parent dc21d482
No related branches found
No related tags found
No related merge requests found
...@@ -46,6 +46,7 @@ option(ENABLE_STATIC "whether static libraries should be compiled" OFF) ...@@ -46,6 +46,7 @@ option(ENABLE_STATIC "whether static libraries should be compiled" OFF)
option(UBUNTU_LAYOUT "whether Debian/Ubuntu's lib and libexec directory layout should be used" OFF) option(UBUNTU_LAYOUT "whether Debian/Ubuntu's lib and libexec directory layout should be used" OFF)
option(HIDDEN_VISIBILITY "on gcc, use -fvisibility=hidden" OFF) option(HIDDEN_VISIBILITY "on gcc, use -fvisibility=hidden" OFF)
option(ENABLE_MM "whether openmm support is added" OFF) option(ENABLE_MM "whether openmm support is added" OFF)
option(ENABLE_PARASAIL, "whether parasail alignment library should be included" OFF)
if (CXX) if (CXX)
set(CMAKE_CXX_COMPILER ${CXX}) set(CMAKE_CXX_COMPILER ${CXX})
...@@ -130,6 +131,11 @@ if(ENABLE_MM) ...@@ -130,6 +131,11 @@ if(ENABLE_MM)
else() else()
set(_OPENMM OFF) set(_OPENMM OFF)
endif() endif()
if(ENABLE_PARASAIL)
set(_PARASAIL ON)
else()
set(_PARASAIL OFF)
endif()
if (UBUNTU_LAYOUT) if (UBUNTU_LAYOUT)
...@@ -228,6 +234,10 @@ else(ENABLE_MM) ...@@ -228,6 +234,10 @@ else(ENABLE_MM)
set(_OPENMM_PLUGINS "NONE") set(_OPENMM_PLUGINS "NONE")
endif(ENABLE_MM) endif(ENABLE_MM)
if(ENABLE_PARASAIL)
find_package(Parasail REQUIRED)
endif(ENABLE_PARASAIL)
if (ENABLE_STATIC) if (ENABLE_STATIC)
set(Boost_LIBRARIES) set(Boost_LIBRARIES)
set(Boost_USE_STATIC_LIBS ON) set(Boost_USE_STATIC_LIBS ON)
...@@ -290,7 +300,7 @@ include_directories(${Boost_INCLUDE_DIRS} ...@@ -290,7 +300,7 @@ include_directories(${Boost_INCLUDE_DIRS}
${PNG_INCLUDE_DIRS} ${PNG_INCLUDE_DIRS}
${OPEN_MM_INCLUDE_DIRS} ${OPEN_MM_INCLUDE_DIRS}
${SQLITE3_INCLUDE_DIRS} ${SQLITE3_INCLUDE_DIRS}
) ${PARASAIL_INCLUDE_DIR})
if (UNIX) if (UNIX)
SET(CMAKE_SKIP_BUILD_RPATH FALSE) SET(CMAKE_SKIP_BUILD_RPATH FALSE)
...@@ -342,6 +352,7 @@ message(STATUS ...@@ -342,6 +352,7 @@ message(STATUS
" SpaceNav Device support (-DENABLE_SPNAV) : ${_SPNAV}\n" " SpaceNav Device support (-DENABLE_SPNAV) : ${_SPNAV}\n"
" OpenMM support (-DENABLE_MM) : ${_OPENMM}\n" " OpenMM support (-DENABLE_MM) : ${_OPENMM}\n"
" OpenMM plugins (-DOPEN_MM_PLUGIN_DIR) : ${_OPENMM_PLUGINS}\n" " OpenMM plugins (-DOPEN_MM_PLUGIN_DIR) : ${_OPENMM_PLUGINS}\n"
" Parasail alignment library (-DENABLE_PARASAIL) : ${_PARASAIL}\n"
" Optimize (-DOPTIMIZE) : ${_OPT}\n" " Optimize (-DOPTIMIZE) : ${_OPT}\n"
" Profiling support (-DPROFILE) : ${_PROFILE}\n" " Profiling support (-DPROFILE) : ${_PROFILE}\n"
" Double Precision (-DUSE_DOUBLE_PRECISION) : ${_DOUBLE_PREC}\n" " Double Precision (-DUSE_DOUBLE_PRECISION) : ${_DOUBLE_PREC}\n"
......
# Try to find Parasail
#
# Find the native Parasail include and library. Sets:
#
# PARASAIL_INCLUDE_DIR - Parasail include dir.
# PARASAIL_LIBRARY - Parasail library.
find_path(PARASAIL_INCLUDE_DIR parasail.h HINTS PARASAIL_INCLUDE_DIR REQUIRED)
find_library(PARASAIL_LIBRARY NAMES parasail HINTS PARASAIL_LIBRARY REQUIRED)
mark_as_advanced (PARASAIL_LIBRARY PARASAIL_INCLUDE_DIR)
...@@ -25,3 +25,4 @@ So far, the binding module includes: ...@@ -25,3 +25,4 @@ So far, the binding module includes:
lga lga
cadscore cadscore
dockq dockq
parasail
Parasail - Pairwise sequence alignments
================================================================================
Basic access to pairwise sequence alignment functionality in
`parasail <https://github.com/jeffdaily/parasail/>`_ from
within OpenStructure.
Citation:
Jeff Daily. Parasail: SIMD C library for global, semi-global,
and local pairwise sequence alignments. (2016) BMC Bioinformatics
Parasail support must be enabled at compile time - see installation
instructions. Parasail allows to choose from various strategies but
for the sake of simplicity, this Python binding always calls
``parasail_<mode>_trace_scan_sat`` which seems reasonably fast
across the global, semi-global and local modes.
See parasail documentation for more information. Example usage:
.. code-block:: python
from ost import bindings
print("parasail available?", bindings.ParasailAvailable())
s1 = seq.CreateSequence("A", "HEAGAWGHEE")
s2 = seq.CreateSequence("B", "PAWHEA")
aln = bindings.ParaGlobalAlign(s1, s2, gap_open_penalty=11,
gap_extension_penalty=1,
matrix="blosum62")
print(aln.ToString())
.. method:: ParasailAvailable()
returns if OpenStructure has been compiled with parasail support
.. method:: ParaGlobalAlign(s1, s2, gap_open_penalty=11, \
gap_extension_penalty=1, matrix="blosum62")
Calls parasail_nw_trace_scan_sat to perform a Needleman-Wunsch global
alignment.
:param s1: First sequence
:type s1: :class:`ost.seq.SequenceHandle`
:param s2: Second sequence
:type s2: :class:`ost.seq.SequenceHandle`
:param gap_open_penalty: Gap opening penalty for dynamic programming table
:type gap_open_penalty: :class:`int`
:param gap_extension_penalty: Gap extension penalty for dynamic programming
table
:type gap_extension_penalty: :class:`int`
:param matrix: Substution matrix - Anything that can be resolved by parasail.
E.g. pretty much the full BLOSUM/PAM families of matrices
("blosum62", "pam100",...) for protein sequences, or
"nuc44"/"dnafull" for nucleotides.
:returns: :class:`ost.seq.AlignmentHandle`
.. method:: ParaSemiGlobalAlign(s1, s2, gap_open_penalty=11, \
gap_extension_penalty=1, matrix="blosum62")
Same as :meth:`ParaGlobalAlign` but calling parasail_sg_trace_scan_sat to
perform a semi-global alignment.
.. method:: ParaLocalAlign(s1, s2, gap_open_penalty=11, \
gap_extension_penalty=1, matrix="blosum62")
Same as :meth:`ParaGlobalAlign` but calling parasail_sw_trace_scan_sat to
perform a Smith-Waterman local alignment. The sequences in the returned
alignment might be subsequences of *s1*/*s2* but have offsets set.
...@@ -21,6 +21,7 @@ dockq.py ...@@ -21,6 +21,7 @@ dockq.py
set(OST_BINDINGS_PYMOD_SOURCES set(OST_BINDINGS_PYMOD_SOURCES
export_tmalign.cc export_tmalign.cc
export_parasail.cc
wrap_bindings.cc wrap_bindings.cc
) )
......
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2020 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#include <boost/python.hpp>
#include <ost/bindings/wrap_parasail.hh>
using namespace boost::python;
void export_parasail() {
def("ParaLocalAlign", &ost::bindings::ParaLocalAlign, (arg("s1"), arg("s2"),
arg("gap_open_penalty")=11,
arg("gap_extension_penalty")=1,
arg("matrix")="blosum62"));
def("ParaGlobalAlign", &ost::bindings::ParaGlobalAlign, (arg("s1"), arg("s2"),
arg("gap_open_penalty")=11,
arg("gap_extension_penalty")=1,
arg("matrix")="blosum62"));
def("ParaSemiGlobalAlign", &ost::bindings::ParaSemiGlobalAlign, (arg("s1"), arg("s2"),
arg("gap_open_penalty")=11,
arg("gap_extension_penalty")=1,
arg("matrix")="blosum62"));
def("ParasailAvailable", &ost::bindings::ParasailAvailable);
}
...@@ -24,8 +24,10 @@ using namespace boost::python; ...@@ -24,8 +24,10 @@ using namespace boost::python;
void export_TMAlign(); void export_TMAlign();
void export_parasail();
BOOST_PYTHON_MODULE(_ost_bindings) BOOST_PYTHON_MODULE(_ost_bindings)
{ {
export_TMAlign(); export_TMAlign();
export_parasail();
} }
...@@ -5,11 +5,13 @@ if (COMPILE_TMTOOLS) ...@@ -5,11 +5,13 @@ if (COMPILE_TMTOOLS)
endif() endif()
set(OST_BINDINGS_SOURCES set(OST_BINDINGS_SOURCES
wrap_tmalign.cc) wrap_tmalign.cc
wrap_parasail.cc)
set(OST_BINDINGS_HEADERS set(OST_BINDINGS_HEADERS
wrap_tmalign.hh) wrap_tmalign.hh
wrap_parasail.hh)
module(NAME bindings SOURCES ${OST_BINDINGS_SOURCES} module(NAME bindings SOURCES ${OST_BINDINGS_SOURCES}
HEADERS ${OST_BINDINGS_HEADERS} HEADER_OUTPUT_DIR ost/bindings HEADERS ${OST_BINDINGS_HEADERS} HEADER_OUTPUT_DIR ost/bindings
DEPENDS_ON ost_geom ost_mol ost_seq) DEPENDS_ON ost_geom ost_mol ost_seq LINK ${PARASAIL_LIBRARY})
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2024 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#include <ost/bindings/wrap_parasail.hh>
#if OST_PARASAIL_ENABLED
#include <parasail.h> // include for external parasail
typedef parasail_result_t* (*parasail_aln_fn)(const char*, int, const char*,
int, int, int,
const parasail_matrix_t *);
namespace{
ost::seq::AlignmentHandle Align(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty, int gap_extension_penalty,
const parasail_matrix_t* matrix,
parasail_aln_fn aln_fn,
bool set_offset) {
parasail_result_t *r = NULL;
parasail_traceback_t *traceback_r = NULL;
String s1_str = s1.GetGaplessString();
String s2_str = s2.GetGaplessString();
const char* s1_c_str = s1_str.c_str();
int s1_len = s1_str.size();
const char* s2_c_str = s2_str.c_str();
int s2_len = s2_str.size();
r = aln_fn(s1_c_str, s1_len, s2_c_str, s2_len, 11, 1, matrix);
traceback_r = parasail_result_get_traceback(r, s1_c_str, s1_len,
s2_c_str, s2_len, matrix,
'|', ':', '.');
ost::seq::AlignmentHandle aln = ost::seq::CreateAlignment();
ost::seq::SequenceHandle aln_s1 =
ost::seq::CreateSequence(s1.GetName(), String(traceback_r->query));
ost::seq::SequenceHandle aln_s2 =
ost::seq::CreateSequence(s2.GetName(), String(traceback_r->ref));
if(set_offset) {
/*
Tried the code below to set offsets but observed wrongly reported
offset which can be triggered from Python:
from ost import io
from ost import seq
from ost import bindings
test_seq = seq.CreateSequence("s", "MMDKHKYRVEIQQMMFVSGEINDPPVETTSLIEDIVRGQVIEILLQSNKTAHLRGSRSIL"
"PEDVIFLIRHDKAKVNRLRTYLSWKDLRKNAKDQDASAGVASGTGNPGAGGEDDLKKAGG"
"GEKDEKDGGNMMKVKKSQIKLPWELQFMFNEHPLENNDDNDDMDEDEREANIVTLKRLKM"
"ADDRTRNMTKEEYVHWSDCRQASFTFRKNKRFKDWSGISQLTEGKPHDDVIDILGFLTFE"
"IVCSLTETALKIKQREQVLQTQKDKSQQSSQDNTNFEFASSTLHRKKRLFDGPENVINPL"
"KPRHIEEAWRVLQTIDMRHRALTNFKGGRLSSKPIIM")
s = seq.CreateSequence("asdf", "MTKSIYIIIGYMLHDEEFFYFFFISFYTLWIVFFLLHLSFFSTLSFGIFHDFDTDVYIKVGNYIL"
"HFLELSKNSNLLKNSSEMLKHFRLASLMYMYVYTQMICPSLLGIRN")
aln = bindings.ParaLocalAlign(test_seq, s)
print(aln.ToString())
print(aln.GetSequence(0).offset)
print(aln.GetSequence(1).offset)
// Not sure if this is the most efficient way to set offsets,
// i.e. the index of start characters which are actually
// present in returned alignments
parasail_cigar_t* cigar = parasail_result_get_cigar(r, s1_c_str, s1_len,
s2_c_str, s2_len,
matrix);
aln_s1.SetOffset(cigar->beg_query);
aln_s2.SetOffset(cigar->beg_ref);
parasail_cigar_free(cigar);
*/
// this is far from optimal but doesn't have the issue described above
aln_s1.SetOffset(s1.GetString().find(aln_s1.GetGaplessString()));
aln_s2.SetOffset(s2.GetString().find(aln_s2.GetGaplessString()));
}
aln.AddSequence(aln_s1);
aln.AddSequence(aln_s2);
parasail_result_free(r);
parasail_traceback_free(traceback_r);
return aln;
}
}
namespace ost{ namespace bindings{
ost::seq::AlignmentHandle ParaLocalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
const parasail_matrix_t* m = parasail_matrix_lookup(matrix.c_str());
return Align(s1, s2, gap_open_penalty, gap_extension_penalty,
m, parasail_sw_trace_scan_sat, true);
}
ost::seq::AlignmentHandle ParaGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
const parasail_matrix_t* m = parasail_matrix_lookup(matrix.c_str());
return Align(s1, s2, gap_open_penalty, gap_extension_penalty,
m, parasail_nw_trace_scan_sat, false);
}
ost::seq::AlignmentHandle ParaSemiGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
const parasail_matrix_t* m = parasail_matrix_lookup(matrix.c_str());
return Align(s1, s2, gap_open_penalty, gap_extension_penalty,
m, parasail_sg_trace_scan_sat, false);
}
bool ParasailAvailable() {
return true;
}
}} // ns
#else
namespace ost{ namespace bindings{
ost::seq::AlignmentHandle ParaLocalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
throw ost::Error("Must enable Parasail when compiling OpenStructure to use "
"ParaLocalAlign");
}
ost::seq::AlignmentHandle ParaGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
throw ost::Error("Must enable Parasail when compiling OpenStructure to use "
"ParaGlobalAlign");
}
ost::seq::AlignmentHandle ParaSemiGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty,
int gap_extension_penalty,
const String& matrix) {
throw ost::Error("Must enable Parasail when compiling OpenStructure to use "
"ParaSemiGlobalAlign");
}
bool ParasailAvailable() {
return false;
}
}} // ns
#endif
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2024 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#ifndef OST_BINDINGS_PARASAIL_H
#define OST_BINDINGS_PARASAIL_H
#include <ost/seq/sequence_handle.hh>
#include <ost/seq/alignment_handle.hh>
namespace ost { namespace bindings {
ost::seq::AlignmentHandle ParaLocalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty = 11,
int gap_extension_penalty = 1,
const String& matrix="blosum62");
ost::seq::AlignmentHandle ParaGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty = 11,
int gap_extension_penalty = 1,
const String& matrix="blosum62");
ost::seq::AlignmentHandle ParaSemiGlobalAlign(const ost::seq::SequenceHandle& s1,
const ost::seq::SequenceHandle& s2,
int gap_open_penalty = 11,
int gap_extension_penalty = 1,
const String& matrix="blosum62");
bool ParasailAvailable();
}} //ns
#endif
...@@ -50,6 +50,11 @@ if (ENABLE_INFO) ...@@ -50,6 +50,11 @@ if (ENABLE_INFO)
else() else()
set(info_enabled 0) set(info_enabled 0)
endif() endif()
if (ENABLE_PARASAIL)
set(parasail_enabled 1)
else()
set(parasail_enabled 0)
endif()
set(config_hh_generator "CMake") set(config_hh_generator "CMake")
set(CONFIG_HH_FILE "${CMAKE_CURRENT_SOURCE_DIR}/config.hh") set(CONFIG_HH_FILE "${CMAKE_CURRENT_SOURCE_DIR}/config.hh")
......
...@@ -31,5 +31,6 @@ ...@@ -31,5 +31,6 @@
#define OST_FFT_USE_THREADS @fftw_use_threads@ #define OST_FFT_USE_THREADS @fftw_use_threads@
#define OST_UBUNTU_LAYOUT @ubuntu_layout@ #define OST_UBUNTU_LAYOUT @ubuntu_layout@
#define OST_INFO_ENABLED @info_enabled@ #define OST_INFO_ENABLED @info_enabled@
#define OST_PARASAIL_ENABLED @parasail_enabled@
#endif #endif
...@@ -57,6 +57,11 @@ If you would like to use the :mod:`molecular mechanics <ost.mol.mm>` module: ...@@ -57,6 +57,11 @@ If you would like to use the :mod:`molecular mechanics <ost.mol.mm>` module:
* `OpenMM <https://simtk.org/home/openmm>`_ (7.7.0) * `OpenMM <https://simtk.org/home/openmm>`_ (7.7.0)
If you would like to do pairwise sequence alignments with parasail
in the :mod:`bindings <ost.bindings>` module:
* `parasail <https://github.com/jeffdaily/parasail/>`_ (2.6.2)
We do not provide backwards compatibility to Python 2.7. The last We do not provide backwards compatibility to Python 2.7. The last
release supporting Python 2.7 is 1.11.0. release supporting Python 2.7 is 1.11.0.
...@@ -187,6 +192,13 @@ can influence it. ...@@ -187,6 +192,13 @@ can influence it.
* `OPEN_MM_PLUGIN_DIR`: the path for OpenMM plugins * `OPEN_MM_PLUGIN_DIR`: the path for OpenMM plugins
* see example below for commonly used paths * see example below for commonly used paths
* `ENABLE_PARASAIL` controls whether parasail should be enabled. By default,
this is switched off. If it is turned on, you must also set the paths
to your parasail installation:
* `PARASAIL_INCLUDE_DIR`: the include path containing the file parasail.h
* `PARASAIL_LIBRARY`: the parasail library
* Several paths to other libraries can be set if they are not in the expected * Several paths to other libraries can be set if they are not in the expected
locations: locations:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment