diff --git a/modules/seq/base/doc/seq.rst b/modules/seq/base/doc/seq.rst index 658d163acb24ec2ef56d56c7da72ac65811e8ce9..cecd2fd1bc451497034bd8e3d415c52d1b87500f 100644 --- a/modules/seq/base/doc/seq.rst +++ b/modules/seq/base/doc/seq.rst @@ -436,3 +436,87 @@ an alignment: .. method:: RemoveSequence(index) Remove sequence at *index* from the alignment. + + +Handling Hidden Markov Models +-------------------------------------------------------------------------------- + +The HMM provides a simple container for hidden markov models in form of +single columns containing amino acid frequencies and transition probabilities. + +.. class:: HMMColumn + + .. method:: BLOSUMNullModel() + + Static method, that returns a new :class:`HMMColumn` with amino acid + frequencies given from the BLOSUM62 substitution matrix. + + .. method:: GetFreq(aa) + + :param aa: One letter code of standard amino acid + :type aa: :class:`str` + + :returns: Frequency of aa + + .. method:: SetFreq(aa,freq) + + :param aa: One letter code of standard amino acid + :param freq: The frequency of the given amino acid + :type aa: :class:`str` + :type freq: :class:`float` + + .. method:: GetTransitionFreq(from,to) + + :param from: Current state of HMM (HMM_MATCH, HMM_INSERT or HMM_DELETE) + :param to: Next state + + :returns: Frequency of given state transition + + .. method:: SetTransitionFreq(from,to,freq) + + :param from: Current state of HMM (HMM_MATCH, HMM_INSERT or HMM_DELETE) + :param to: Next state + :param freq: Frequency of transition + + .. attribute:: one_letter_code + + One letter code, this column is associated to + + .. attribute:: entropy + + Shannon entropy based on the columns amino acid frequencies + + +.. class:: HMM + + .. method:: Load(filename) + + Static method to load an hmm in the hhm format as it is in use in the HHSuite. + + :param filename: Name of file to load + :type filename: :class:`str` + + .. method:: AddColumn(col) + + Appends column in the internal column list. + + :param col: Column to add + :type col: :class:`HMMColumn` + + .. attribute:: columns + + Iterable columns of the HMM + + .. attribute:: null_model + + Null model of the HMM + + .. attribute:: avg_entropy + + Average entropy of all the columns + + + + + + diff --git a/modules/seq/base/pymod/CMakeLists.txt b/modules/seq/base/pymod/CMakeLists.txt index f2aac4ea51cb0cd57fed4d304199db0a420158fb..c3d565c25a3798b93ea74c9a7a428521b0706121 100644 --- a/modules/seq/base/pymod/CMakeLists.txt +++ b/modules/seq/base/pymod/CMakeLists.txt @@ -1,5 +1,6 @@ set(OST_SEQ_PYMOD_SOURCES export_sequence.cc + export_hmm.cc wrap_seq.cc ) diff --git a/modules/seq/base/pymod/export_hmm.cc b/modules/seq/base/pymod/export_hmm.cc new file mode 100644 index 0000000000000000000000000000000000000000..944892f80237974f21463e7939d47aea5c67688b --- /dev/null +++ b/modules/seq/base/pymod/export_hmm.cc @@ -0,0 +1,48 @@ +#include <boost/python/suite/indexing/vector_indexing_suite.hpp> +#include <boost/python.hpp> +#include <ost/seq/hmm.hh> +using namespace ost::seq; +using namespace boost::python; + +void export_hmm() +{ + + enum_<HMMState>("HMMState") + .value("HMM_MATCH", HMM_MATCH) + .value("HMM_INSERT", HMM_INSERT) + .value("HMM_DELETE", HMM_DELETE) + ; + + class_<HMMColumn>("HMMColumn", init<>()) + .add_property("n_eff", &HMMColumn::GetNEff, + &HMMColumn::SetNEff) + .add_property("n_eff_ins", &HMMColumn::GetNEffIns, + &HMMColumn::SetNEffIns) + .add_property("n_eff_del", &HMMColumn::GetNEffDel, + &HMMColumn::SetNEffDel) + .add_property("one_letter_code", &HMMColumn::GetOneLetterCode, + &HMMColumn::SetOneLetterCode) + .add_property("entropy", &HMMColumn::GetEntropy) + .def("GetFreq", &HMMColumn::GetFreq) + .def("SetFreq", &HMMColumn::SetFreq) + .def("GetTransitionFreq",&HMMColumn::GetTransitionFreq) + .def("SetTransitionFreq",&HMMColumn::SetTransitionFreq) + .def("BLOSUMNullModel", + &HMMColumn::BLOSUMNullModel).staticmethod("BLOSUMNullModel") + ; + + class_<HMMColumnList>("HMMColumnList", init<>()) + .def(vector_indexing_suite<HMMColumnList>()) + ; + + class_<HMM, HMMPtr>("HMM", init<>()) + .def("Load", &HMM::Load).staticmethod("Load") + .def("AddColumn", &HMM::push_back) + .add_property("null_model", make_function(&HMM::GetNullModel, + return_value_policy<copy_const_reference>())) + .add_property("columns", + make_function(&HMM::GetColumns, + return_value_policy<copy_const_reference>())) + .add_property("avg_entropy", &HMM::GetAverageEntropy) + ; +} diff --git a/modules/seq/base/pymod/wrap_seq.cc b/modules/seq/base/pymod/wrap_seq.cc index c881dae0ab14a9678aabbacc575daba93bfcedd3..d676f8757034e235658105e852bd48056cfff3b6 100644 --- a/modules/seq/base/pymod/wrap_seq.cc +++ b/modules/seq/base/pymod/wrap_seq.cc @@ -20,9 +20,11 @@ using namespace boost::python; void export_sequence(); +void export_hmm(); BOOST_PYTHON_MODULE(_ost_seq) { export_sequence(); + export_hmm(); } diff --git a/modules/seq/base/src/CMakeLists.txt b/modules/seq/base/src/CMakeLists.txt index 8ef75076e739e4cf90e01e97f5e9582f22cd67cf..3ba4fcf21ec72479e4a300b551e91597553e51d8 100644 --- a/modules/seq/base/src/CMakeLists.txt +++ b/modules/seq/base/src/CMakeLists.txt @@ -16,6 +16,7 @@ aligned_column.hh aligned_column_iterator.hh invalid_sequence.hh views_from_sequences.hh +hmm.hh ) set(OST_SEQ_SOURCES @@ -28,6 +29,7 @@ sequence_list.cc alignment_handle.cc sequence_op.cc views_from_sequences.cc +hmm.cc ) if (ENABLE_INFO) @@ -38,4 +40,5 @@ endif() module(NAME seq SOURCES ${OST_SEQ_SOURCES} HEADERS ${OST_SEQ_IMPL_HEADERS} IN_DIR impl ${OST_SEQ_HEADERS} + LINK ${BOOST_IOSTREAM_LIBRARIES} DEPENDS_ON ost_mol ${INFO_DEPS}) \ No newline at end of file diff --git a/modules/seq/base/src/hmm.cc b/modules/seq/base/src/hmm.cc new file mode 100644 index 0000000000000000000000000000000000000000..795e6cd55c89bbdd080ec52cb419fb310ed8d9c9 --- /dev/null +++ b/modules/seq/base/src/hmm.cc @@ -0,0 +1,163 @@ +#include <fstream> +#include <iostream> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/filtering_stream.hpp> +#include <ost/string_ref.hh> +#include <stdexcept> +#include "hmm.hh" + +namespace ost { namespace seq{ + +int HMMColumn::GetIndex(char olc) const { + if (olc == 'A') return 0; + if (olc >= 'C' && olc <= 'I') return (olc-'A') - 1; + if (olc >= 'K' && olc <= 'N') return (olc-'A') - 2; + if (olc >= 'P' && olc <= 'T') return (olc-'A') - 3; + if (olc == 'V' ) return 17; + if (olc == 'W' ) return 18; + if (olc == 'Y' ) return 19; + return -1; +} + +HMMColumn HMMColumn::BLOSUMNullModel() { + HMMColumn col; + + col.freq_[0] = 0.0732; + col.freq_[1] = 0.0250; + col.freq_[2] = 0.0542; + col.freq_[3] = 0.0547; + col.freq_[4] = 0.0447; + col.freq_[5] = 0.0751; + col.freq_[6] = 0.0261; + col.freq_[7] = 0.0700; + col.freq_[8] = 0.1011; + col.freq_[9] = 0.0584; + col.freq_[10] = 0.0246; + col.freq_[11] = 0.0463; + col.freq_[12] = 0.0351; + col.freq_[13] = 0.0326; + col.freq_[14] = 0.0484; + col.freq_[15] = 0.0573; + col.freq_[16] = 0.0505; + col.freq_[17] = 0.0758; + col.freq_[18] = 0.0124; + col.freq_[19] = 0.0345; + + //it's an allaline... I know, it doesn;t make much sense + col.olc_ = 'A'; + + return col; +} + +Real HMMColumn::GetEntropy() const { + Real entropy = 0.0; + for (const Real* j = this->freqs_begin(), *e2 = this->freqs_end(); j != e2; ++j) { + + if (*j>0.0) { + entropy -= log(*j)**j; + } + } + return entropy; +} + +HMMPtr HMM::Load(const std::string& filename) { + HMMPtr hmm(new HMM); + boost::iostreams::filtering_stream<boost::iostreams::input> in; + std::ifstream stream(filename.c_str()); + if(!stream){ + throw std::runtime_error("Could not open file!"); + } + if (filename.rfind(".gz") == filename.size()-3) { + in.push(boost::iostreams::gzip_decompressor()); + } + + in.push(stream); + std::string line; + bool null_found = false; + ost::StringRef sline; + while (std::getline(in, line)) { + sline=ost::StringRef(line.c_str(), line.length()); + if (sline.length()>4 && + sline.substr(0, 5)==ost::StringRef("NULL ", 5)) { + null_found = true; + break; + } + } + const char* olcs = "ACDEFGHIKLMNPQRSTVWY"; + if (!null_found) { + throw std::runtime_error("no NULL found in file"); + } + std::vector<ost::StringRef> chunks = sline.split(); + for (int i=1; i<21; ++i) { + if (chunks[i] != ost::StringRef("*", 1)) { + Real freq = pow(2.0, -0.001*chunks[i].to_int().second); + hmm->null_model_.SetFreq(olcs[i-1], freq); + } + } + // read until we hit HMM, then skip 3 more lines + bool hmm_found = false; + while (std::getline(in, line)) { + sline=ost::StringRef(line.c_str(), line.length()); + if (sline.length()>3 && + sline.substr(0, 4)==ost::StringRef("HMM ", 4)) { + for (int i = 0; i<2; ++i ) { + std::getline(in, line); + } + hmm_found = true; + break; + } + } + if (!hmm_found) { + throw std::runtime_error("no HMM found in file"); + } + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + if (sline.trim().empty()) continue; + if (sline.trim() == ost::StringRef("//", 2)) break; + std::vector<ost::StringRef> freqs = sline.split(); + std::string line2; + std::getline(in, line2); + ost::StringRef sline2(line2.c_str(), line2.length()); + std::vector<ost::StringRef> trans = sline2.trim().split(); + hmm->columns_.push_back(HMMColumn()); + hmm->columns_.back().SetOneLetterCode(freqs[0][0]); + for (int i = 2; i < 22; ++i) { + if (freqs[i] != ost::StringRef("*", 1)) { + hmm->columns_.back().SetFreq(olcs[i-2], + pow(2.0, -0.001*freqs[i].to_int().second)); + } + } + HMMState from,to; + int trans_freq_pos = 0; + for(uint i = 0; i < 3; ++i){ + from = HMMState(i); + for(uint j = 0; j < 3; ++j){ + if((i == 1 && j == 2) || (i == 2 && j == 1)) continue; //transition not possible + if (trans[trans_freq_pos] != ost::StringRef("*", 1)){ + Real t_freq = pow(2.0, -0.001*trans[trans_freq_pos].to_int().second); + to = HMMState(j); + hmm->columns_.back().SetTransitionFreq(from, to, t_freq); + } + ++trans_freq_pos; + } + } + + hmm->columns_.back().SetNEff(0.001 * trans[trans_freq_pos].to_int().second); + hmm->columns_.back().SetNEffIns(0.001 * trans[trans_freq_pos+1].to_int().second); + hmm->columns_.back().SetNEffDel(0.001 * trans[trans_freq_pos+2].to_int().second); + } + return hmm; +} + +Real HMM::GetAverageEntropy() const { + Real n_eff=0.0; + int n = 0; + for (std::vector<HMMColumn>::const_iterator + i = this->columns_begin(), e = this->columns_end(); i != e; ++i) { + n += 1; + n_eff += i->GetEntropy(); + } + return (n > 0) ? n_eff/n : 0.0; +} + +}} //ns diff --git a/modules/seq/base/src/hmm.hh b/modules/seq/base/src/hmm.hh new file mode 100644 index 0000000000000000000000000000000000000000..13c24a1c7277b8bd25dc98af4747fda1f869ee92 --- /dev/null +++ b/modules/seq/base/src/hmm.hh @@ -0,0 +1,125 @@ +#ifndef OST_SEQ_HMM_HH +#define OST_SEQ_HMM_HH + +#include <string> +#include <vector> +#include <boost/shared_ptr.hpp> +#include <ost/base.hh> + +namespace ost { namespace seq { +typedef enum { + HMM_MATCH=0, + HMM_INSERT=1, + HMM_DELETE=2 +} HMMState; + + +class HMMColumn { + public: + HMMColumn() : n_eff_(0.0), n_eff_ins_(0.0), n_eff_del_(0.0) { + memset(freq_, 0, sizeof(Real)*20); + memset(trans_, 0, sizeof(Real)*9); + } + HMMColumn(const HMMColumn& rhs): olc_(rhs.olc_), n_eff_(rhs.n_eff_), + n_eff_ins_(rhs.n_eff_ins_), n_eff_del_(rhs.n_eff_del_) { + memcpy(freq_, rhs.freq_, sizeof(Real)*20); + memcpy(trans_, rhs.trans_, sizeof(Real)*9); + } + Real GetTransitionFreq(HMMState from, HMMState to) const { + return trans_[from][to]; + } + void SetTransitionFreq(HMMState from, HMMState to, Real freq){ + trans_[from][to] = freq; + } + void SetNEff(Real val) { n_eff_ = val; } + void SetNEffIns(Real val) { n_eff_ins_ = val; } + void SetNEffDel(Real val) { n_eff_del_ = val; } + Real GetNEff() const { return n_eff_; } + Real GetNEffIns() const { return n_eff_ins_; } + Real GetNEffDel() const { return n_eff_del_; } + + Real GetFreq(char ch) const { + return freq_[this->GetIndex(ch)]; + } + + void SetFreq(char ch, Real freq) { + freq_[this->GetIndex(ch)]=freq; + } + + bool operator==(const HMMColumn& rhs) const { + return !memcmp(freq_, rhs.freq_, sizeof(freq_)) && + !memcmp(trans_, rhs.trans_, sizeof(trans_)) && + n_eff_ == rhs.n_eff_ && + n_eff_ins_ == rhs.n_eff_ins_ && + n_eff_del_ == rhs.n_eff_del_; + } + + Real* freqs_begin() { return freq_; } + Real* freqs_end() { return freq_+20; } + const Real* freqs_begin() const { return freq_; } + const Real* freqs_end() const { return freq_+20; } + char GetOneLetterCode() const { return olc_; } + void SetOneLetterCode(char olc) { olc_ = olc; } + Real GetEntropy() const; + + static HMMColumn BLOSUMNullModel(); + private: + int GetIndex(char ch) const; + char olc_; + Real freq_[20]; + Real trans_[3][3]; + Real n_eff_; + Real n_eff_ins_; + Real n_eff_del_; +}; + + +class HMM; +typedef boost::shared_ptr<HMM> HMMPtr; +typedef std::vector<HMMColumn> HMMColumnList; +// Represents a HHsearch profile +// Minimalistic on Purpose. +class HMM { + public: + HMM() {} + static HMMPtr Load(const std::string& filename); + const std::vector<HMMColumn>& GetColumns() const { return columns_; } + const HMMColumn& GetNullModel() const { return null_model_; } + void SetNullModel(const HMMColumn& null_model) { null_model_ = null_model; } + + + //some functions to make it behave like a vector + + size_t size() const { return columns_.size(); } + + bool empty() const { return columns_.empty(); } + + void push_back(const HMMColumn& c) { columns_.push_back(c); } + + HMMColumn& operator[](size_t index) { return columns_[index]; } + + const HMMColumn& operator[](size_t index) const { return columns_[index]; } + + HMMColumn& at(size_t index) { return columns_.at(index); } + + const HMMColumn& at(size_t index) const { return columns_.at(index); } + + bool operator==(const HMM& other) const { return columns_ == other.columns_ && + null_model_ == other.null_model_; } + + bool operator!=(const HMM& other) const { return !(other == (*this)); } + + + HMMColumnList::const_iterator columns_end() const { return columns_.end(); } + HMMColumnList::iterator columns_end() { return columns_.end(); } + HMMColumnList::const_iterator columns_begin() const { return columns_.begin(); } + HMMColumnList::iterator columns_begin() { return columns_.begin(); } + Real GetAverageEntropy() const; + private: + HMMColumn null_model_; + HMMColumnList columns_; +}; + +}} + +#endif diff --git a/modules/seq/base/tests/CMakeLists.txt b/modules/seq/base/tests/CMakeLists.txt index 645dba066e39c51c1fc0b9211228e8490ed85a07..5fdf6e37c3b9a7149fe2b006942ad8df2f219590 100644 --- a/modules/seq/base/tests/CMakeLists.txt +++ b/modules/seq/base/tests/CMakeLists.txt @@ -4,6 +4,7 @@ set(OST_SEQ_UNIT_TESTS test_aligned_column.cc test_aligned_region.cc test_alignment.cc + test_hmm.cc tests.cc ) diff --git a/modules/seq/base/tests/test_hmm.cc b/modules/seq/base/tests/test_hmm.cc new file mode 100644 index 0000000000000000000000000000000000000000..47a15f9d0a56757f4926f8081499325786114146 --- /dev/null +++ b/modules/seq/base/tests/test_hmm.cc @@ -0,0 +1,142 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include <boost/test/auto_unit_test.hpp> +#include <iostream> + +#include <ost/seq/hmm.hh> + +using namespace ost::seq; + + +BOOST_AUTO_TEST_SUITE( hmm ); + + +BOOST_AUTO_TEST_CASE(hmm_loading) +{ + HMMPtr hmm = HMM::Load("testfiles/test_hmm.hhm"); + BOOST_CHECK_EQUAL(hmm->size(),5); + //check sequence with different methods of getting the columns + BOOST_CHECK_EQUAL((*hmm)[0].GetOneLetterCode(),'E'); + BOOST_CHECK_EQUAL((hmm->columns_begin()+1)->GetOneLetterCode(),'T'); + BOOST_CHECK_EQUAL(hmm->GetColumns()[2].GetOneLetterCode(),'G'); + BOOST_CHECK_EQUAL(hmm->at(3).GetOneLetterCode(),'S'); + BOOST_CHECK_EQUAL((hmm->columns_end()-1)->GetOneLetterCode(),'P'); + + //we first check, whether the null model has been read correctly + //and then randomly pick a column to check + + HMMColumn null_model = hmm->GetNullModel(); + + String olc = "ACDEFGHIKLMNPQRSTVWY"; + + Real correct_aa_freqs[] = {pow(2.0,-0.001*3706), pow(2.0,-0.001*5728), + pow(2.0,-0.001*4211), pow(2.0,-0.001*4064), + pow(2.0,-0.001*4839), pow(2.0,-0.001*3729), + pow(2.0,-0.001*4763), pow(2.0,-0.001*4308), + pow(2.0,-0.001*4069), pow(2.0,-0.001*3323), + pow(2.0,-0.001*5509), pow(2.0,-0.001*4640), + pow(2.0,-0.001*4464), pow(2.0,-0.001*4937), + pow(2.0,-0.001*4285), pow(2.0,-0.001*4423), + pow(2.0,-0.001*3815), pow(2.0,-0.001*3783), + pow(2.0,-0.001*6325), pow(2.0,-0.001*4665)}; + + for(int i = 0; i < 20; ++i){ + BOOST_CHECK_CLOSE(null_model.GetFreq(olc[i]),correct_aa_freqs[i],Real(1e-5)); + } + + //transition frequencies should all be zero in the null model + HMMState from,to; + for(int i = 0; i < 3; ++i){ + from = HMMState(i); + for(int j = 0; j < 3; ++j){ + to = HMMState(j); + BOOST_CHECK_EQUAL(null_model.GetTransitionFreq(from,to),0.0); + } + } + + //NEffs should also be zero + BOOST_CHECK_EQUAL(null_model.GetNEff(),0.0); + BOOST_CHECK_EQUAL(null_model.GetNEffIns(),0.0); + BOOST_CHECK_EQUAL(null_model.GetNEffDel(),0.0); + + + //let's check, whether a particular column has been read correctly + HMMColumn col = (*hmm)[2]; + BOOST_CHECK_EQUAL(col.GetOneLetterCode(),'G'); + BOOST_CHECK_CLOSE(col.GetNEff(),3.076,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetNEffIns(),1.097,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetNEffDel(),0.0,Real(1e-5)); + + memset(correct_aa_freqs,0,20*sizeof(Real)); + correct_aa_freqs[0] = pow(2.0,-0.001*3676); + correct_aa_freqs[1] = pow(2.0,-0.001*2597); + correct_aa_freqs[5] = pow(2.0,-0.001*5359); + correct_aa_freqs[6] = pow(2.0,-0.001*3275); + correct_aa_freqs[11] = pow(2.0,-0.001*3292); + correct_aa_freqs[12] = pow(2.0,-0.001*5077); + correct_aa_freqs[13] = pow(2.0,-0.001*3826); + correct_aa_freqs[15] = pow(2.0,-0.001*2409); + correct_aa_freqs[16] = pow(2.0,-0.001*3733); + correct_aa_freqs[17] = pow(2.0,-0.001*4503); + correct_aa_freqs[19] = pow(2.0,-0.001*3070); + + for(int i = 0; i < 20; ++i){ + BOOST_CHECK_CLOSE(col.GetFreq(olc[i]),correct_aa_freqs[i],Real(1e-5)); + } + + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_MATCH),pow(2.0,-0.001*46.0),Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_INSERT),pow(2.0,-0.001*4979.0),Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_DELETE),0.0,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_MATCH),pow(2.0,-0.001*2006.0),Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_INSERT),pow(2.0,-0.001*413.0),Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_MATCH),0.0,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_DELETE),0.0,Real(1e-5)); + //following transitions are impossible, so frequency must be zero anyway + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_INSERT),0.0,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_DELETE),0.0,Real(1e-5)); + + BOOST_CHECK_CLOSE(col.GetNEff(),3076*0.001,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetNEffIns(),1097*0.001,Real(1e-5)); + BOOST_CHECK_CLOSE(col.GetNEffDel(),0.0,Real(1e-5)); + + + //check, whether the entropy of the column above gets correctly calculated + Real entropy = 0.0; + entropy -= correct_aa_freqs[0] * log(correct_aa_freqs[0]); + entropy -= correct_aa_freqs[1] * log(correct_aa_freqs[1]); + entropy -= correct_aa_freqs[5] * log(correct_aa_freqs[5]); + entropy -= correct_aa_freqs[6] * log(correct_aa_freqs[6]); + entropy -= correct_aa_freqs[11] * log(correct_aa_freqs[11]); + entropy -= correct_aa_freqs[12] * log(correct_aa_freqs[12]); + entropy -= correct_aa_freqs[13] * log(correct_aa_freqs[13]); + entropy -= correct_aa_freqs[15] * log(correct_aa_freqs[15]); + entropy -= correct_aa_freqs[16] * log(correct_aa_freqs[16]); + entropy -= correct_aa_freqs[17] * log(correct_aa_freqs[17]); + entropy -= correct_aa_freqs[19] * log(correct_aa_freqs[19]); + BOOST_CHECK_CLOSE((*hmm)[2].GetEntropy(),entropy,Real(1e-5)); + + //look at avg entropy... + BOOST_CHECK_CLOSE(hmm->GetAverageEntropy(),1.48704576,Real(1e-5)); +} + + + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/modules/seq/base/tests/testfiles/test_hmm.hhm b/modules/seq/base/tests/testfiles/test_hmm.hhm new file mode 100644 index 0000000000000000000000000000000000000000..1b9a665d1e1ce09e89efa6f38f1648a38bdbe529 --- /dev/null +++ b/modules/seq/base/tests/testfiles/test_hmm.hhm @@ -0,0 +1,31 @@ +HHsearch 1.5 +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +# +NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 +HMM A C D E F G H I K L M N P Q R S T V W Y + M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D + 0 * * 0 * 0 * * * * +E 1 1780 * * 497 * * * * * * * * * * * * * * * * 1 + 0 * * * * * * 1610 0 0 + +T 2 1615 * * * * 2652 * * * * * * * * * * 3992 1147 * * 2 + 0 * * * * * * 2652 0 0 + +G 3 3676 2597 * * * 5359 3275 * * * * 3292 5077 3826 * 2409 3733 4503 * 3070 3 + 46 4979 * 2006 413 * * 3076 1097 0 + +S 4 2579 * * * * * * * * 4320 * 4532 3895 4519 * 951 5140 3595 * * 4 + 58 4660 * 2833 218 * * 3050 1146 0 + +P 5 2585 * 4184 * * 2345 4748 * * * * * 2839 1766 5056 3635 * * * * 5 + 0 * * * * * * 3036 0 0 + +//