diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst index 02f74c4f78015926f5faae38c54d5a4c72f24218..b846283f30ff5b7a2ec7d319cf91b7c65195d4cb 100644 --- a/modules/seq/alg/doc/seqalg.rst +++ b/modules/seq/alg/doc/seqalg.rst @@ -118,3 +118,18 @@ :param gap_open: The gap opening penalty. Must be a negative number :param gap_ext: The gap extension penalty. Must be a negative number :returns: best-scoring alignment of *seq1* and *seq2*. + +.. function:: ShannonEntropy(aln, ignore_gaps=True) + + Returns the per-column Shannon entropies of the alignment. The entropy + describes how conserved a certain column in the alignment is . The higher + the entropy is, the less conserved the column. For column with no amino + aids, the entropy value is set to NAN + + :param aln: multiple sequence alignment + :type aln: :class:`~ost.seq.AlignmentHandle` + :param ignore_gaps: whether to ignore gaps in the column. + :type ignore_gaps: bool + + :returns: list of column entropies + diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index 9239d1ec3d2e759729cfb1bcde0fb9005e52ded1..f22866761d38ac9c19fbe44b7f6d97cc41c27827 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -26,6 +26,7 @@ #include <ost/seq/alg/subst_weight_matrix.hh> #include <ost/seq/alg/local_align.hh> #include <ost/seq/alg/global_align.hh> +#include <ost/seq/alg/entropy.hh> using namespace boost::python; using namespace ost::seq; using namespace ost::seq::alg; @@ -58,5 +59,6 @@ BOOST_PYTHON_MODULE(_ost_seq_alg) arg("gap_open")=-5, arg("gap_ext")=-2)); def("GlobalAlign", &GlobalAlign,(arg("seq1"),arg("seq2"),arg("subst_weight"), arg("gap_open")=-5, arg("gap_ext")=-2)); + def("ShannonEntropy", &ShannonEntropy, (arg("aln"), arg("ignore_gaps")=true)); } diff --git a/modules/seq/alg/src/CMakeLists.txt b/modules/seq/alg/src/CMakeLists.txt index ccffcacf464cc22d85f62a6d5190067b76622cb7..ea6284a98d46e553879c2f7266e8d6d27289e19b 100644 --- a/modules/seq/alg/src/CMakeLists.txt +++ b/modules/seq/alg/src/CMakeLists.txt @@ -4,6 +4,7 @@ module_config.hh ins_del.hh subst_weight_matrix.hh alignment_opts.hh +entropy.hh merge_pairwise_alignments.hh conservation.hh local_align.hh @@ -14,6 +15,7 @@ set(OST_SEQ_ALG_SOURCES merge_pairwise_alignments.cc local_align.cc global_align.cc +entropy.cc sequence_identity.cc ins_del.cc conservation.cc diff --git a/modules/seq/alg/src/entropy.cc b/modules/seq/alg/src/entropy.cc new file mode 100644 index 0000000000000000000000000000000000000000..7881512578187af9a0439ffe1ef44d5bab46dfe9 --- /dev/null +++ b/modules/seq/alg/src/entropy.cc @@ -0,0 +1,68 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <limits> + +#include "entropy.hh" + +#include <ost/seq/aligned_column.hh> +#include <ost/seq/aligned_column_iterator.hh> + +namespace ost { namespace seq { namespace alg { + +int OneLetterCodeToIndex(char olc, bool ignore_gaps) +{ + if (olc >= 'A' && olc <= 'Z') return (olc-'A'); + if (olc >= 'a' && olc <= 'z') return (olc-'a'); + if (olc == '-' && !ignore_gaps) return 26; + return -1; +} + +std::vector<Real> ShannonEntropy(const AlignmentHandle& aln, bool ignore_gaps) { + std::vector<Real> entropies(aln.GetLength(), 0); + int aa_counts[27]; + int k = 0; + for (AlignmentHandle::iterator + i = aln.begin(), e = aln.end(); i != e; ++i, ++k) { + const AlignedColumn& col = *i; + memset(aa_counts, 0, sizeof(int)*27); + int counts = 0; + for (int j=0; j<aln.GetCount(); ++j) { + int index = OneLetterCodeToIndex(col[j], ignore_gaps); + if (index==-1) { + continue; + } + counts += 1; + aa_counts[index] += 1; + } + Real entropy = 0.0; + for (int j = 0; j < 27; ++j) { + if (aa_counts[j]) { + Real freq = static_cast<Real>(aa_counts[j])/counts; + entropy-= log(freq)*freq; + } + } + if (counts) + entropies[k] = entropy; + else + entropies[k] = std::numeric_limits<Real>::quiet_NaN(); + } + return entropies; +} + +}}} diff --git a/modules/seq/alg/src/entropy.hh b/modules/seq/alg/src/entropy.hh new file mode 100644 index 0000000000000000000000000000000000000000..85ec4ac0d6a9ab60cda514d64e4e277f6d6f80a9 --- /dev/null +++ b/modules/seq/alg/src/entropy.hh @@ -0,0 +1,36 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2011 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEQ_ALG_ENTROPY_HH +#define OST_SEQ_ALG_ENTROPY_HH + +#include <vector> + +#include <ost/seq/alignment_handle.hh> + +#include "module_config.hh" + +namespace ost { namespace seq { namespace alg { + +/// \brief calculates the Shannon entropy for each column in the alignment +std::vector<Real> DLLEXPORT_OST_SEQ_ALG ShannonEntropy(const AlignmentHandle& aln, + bool ignore_gaps=true); + +}}} +#endif +