From b12075586d1bb23166518946c5d4a872e19b08b0 Mon Sep 17 00:00:00 2001 From: Marco Biasini <marco.biasini@unibas.ch> Date: Mon, 11 Feb 2013 18:14:10 +0100 Subject: [PATCH] Shannon entropy calculation for sequence alignments --- modules/seq/alg/doc/seqalg.rst | 15 ++++++ modules/seq/alg/pymod/wrap_seq_alg.cc | 2 + modules/seq/alg/src/CMakeLists.txt | 2 + modules/seq/alg/src/entropy.cc | 68 +++++++++++++++++++++++++++ modules/seq/alg/src/entropy.hh | 36 ++++++++++++++ 5 files changed, 123 insertions(+) create mode 100644 modules/seq/alg/src/entropy.cc create mode 100644 modules/seq/alg/src/entropy.hh diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst index 02f74c4f7..b846283f3 100644 --- a/modules/seq/alg/doc/seqalg.rst +++ b/modules/seq/alg/doc/seqalg.rst @@ -118,3 +118,18 @@ :param gap_open: The gap opening penalty. Must be a negative number :param gap_ext: The gap extension penalty. Must be a negative number :returns: best-scoring alignment of *seq1* and *seq2*. + +.. function:: ShannonEntropy(aln, ignore_gaps=True) + + Returns the per-column Shannon entropies of the alignment. The entropy + describes how conserved a certain column in the alignment is . The higher + the entropy is, the less conserved the column. For column with no amino + aids, the entropy value is set to NAN + + :param aln: multiple sequence alignment + :type aln: :class:`~ost.seq.AlignmentHandle` + :param ignore_gaps: whether to ignore gaps in the column. + :type ignore_gaps: bool + + :returns: list of column entropies + diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index 9239d1ec3..f22866761 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -26,6 +26,7 @@ #include <ost/seq/alg/subst_weight_matrix.hh> #include <ost/seq/alg/local_align.hh> #include <ost/seq/alg/global_align.hh> +#include <ost/seq/alg/entropy.hh> using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; @@ -58,5 +59,6 @@ BOOST_PYTHON_MODULE(_ost_seq_alg) arg("gap_open")=-5, arg("gap_ext")=-2)); def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), arg("gap_open")=-5, arg("gap_ext")=-2)); + def("ShannonEntropy", &ShannonEntropy, (arg("aln"), arg("ignore_gaps")=true)); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index ccffcacf4..ea6284a98 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -4,6 +4,7 @@ module_config.hh ins_del.hh subst_weight_matrix.hh alignment_opts.hh +entropy.hh merge_pairwise_alignments.hh conservation.hh local_align.hh @@ -14,6 +15,7 @@ set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc local_align.cc global_align.cc +entropy.cc sequence_identity.cc ins_del.cc conservation.cc diff --git a/modules/seq/alg/src/entropy.cc b/modules/seq/alg/src/entropy.cc new file mode 100644 index 000000000..788151257 --- /dev/null +++ b/modules/seq/alg/src/entropy.cc @@ -0,0 +1,68 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <limits> + +#include "entropy.hh" + +#include <ost/seq/aligned_column.hh> +#include <ost/seq/aligned_column_iterator.hh> + +namespace ost { namespace seq { namespace alg { + +int OneLetterCodeToIndex(char olc, bool ignore_gaps) +{ + if (olc >= 'A' && olc <= 'Z') return (olc-'A'); + if (olc >= 'a' && olc <= 'z') return (olc-'a'); + if (olc == '-' && !ignore_gaps) return 26; + return -1; +} + +std::vector<Real> ShannonEntropy(const AlignmentHandle& aln, bool ignore_gaps) { + std::vector<Real> entropies(aln.GetLength(), 0); + int aa_counts[27]; + int k = 0; + for (AlignmentHandle::iterator + i = aln.begin(), e = aln.end(); i != e; ++i, ++k) { + const AlignedColumn& col = *i; + memset(aa_counts, 0, sizeof(int)*27); + int counts = 0; + for (int j=0; j<aln.GetCount(); ++j) { + int index = OneLetterCodeToIndex(col[j], ignore_gaps); + if (index==-1) { + continue; + } + counts += 1; + aa_counts[index] += 1; + } + Real entropy = 0.0; + for (int j = 0; j < 27; ++j) { + if (aa_counts[j]) { + Real freq = static_cast<Real>(aa_counts[j])/counts; + entropy-= log(freq)*freq; + } + } + if (counts) + entropies[k] = entropy; + else + entropies[k] = std::numeric_limits<Real>::quiet_NaN(); + } + return entropies; +} + +}}} diff --git a/modules/seq/alg/src/entropy.hh b/modules/seq/alg/src/entropy.hh new file mode 100644 index 000000000..85ec4ac0d --- /dev/null +++ b/modules/seq/alg/src/entropy.hh @@ -0,0 +1,36 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_ENTROPY_HH +#define OST_SEQ_ALG_ENTROPY_HH + +#include <vector> + +#include <ost/seq/alignment_handle.hh> + +#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { + +/// \brief calculates the Shannon entropy for each column in the alignment +std::vector<Real> DLLEXPORT_OST_SEQ_ALG ShannonEntropy(const AlignmentHandle& aln, + bool ignore_gaps=true); + +}}} +#endif + -- GitLab