diff --git a/modules/seq/base/pymod/CMakeLists.txt b/modules/seq/base/pymod/CMakeLists.txt index 4f304ceeb7d60afea9ba0dfb2a03d9b5db0fb7ba..c40a805fa2dccb7feba2d4e35476d50a5e02a954 100644 --- a/modules/seq/base/pymod/CMakeLists.txt +++ b/modules/seq/base/pymod/CMakeLists.txt @@ -1,6 +1,5 @@ set(OST_SEQ_PYMOD_SOURCES export_sequence.cc - export_hmm.cc export_profile_handle.cc wrap_seq.cc ) diff --git a/modules/seq/base/pymod/export_hmm.cc b/modules/seq/base/pymod/export_hmm.cc deleted file mode 100644 index 1d9b39e28d3d12f4cdfca73b9c66031c71200c2c..0000000000000000000000000000000000000000 --- a/modules/seq/base/pymod/export_hmm.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include <boost/python/suite/indexing/vector_indexing_suite.hpp> -#include <boost/python.hpp> -#include <ost/seq/hmm.hh> -using namespace ost::seq; -using namespace boost::python; - -namespace{ - -boost::python::list wrap_get_names(HMMDBPtr db){ - std::vector<String> v_names = db->GetNames(); - boost::python::list names; - for(std::vector<String>::iterator i = v_names.begin(); - i != v_names.end(); ++i){ - names.append(*i); - } - return names; -} - -} - -void export_hmm() -{ - - enum_<HMMState>("HMMState") - .value("HMM_MATCH", HMM_MATCH) - .value("HMM_INSERT", HMM_INSERT) - .value("HMM_DELETE", HMM_DELETE) - ; - - class_<HMMColumn>("HMMColumn", init<>()) - .add_property("n_eff", &HMMColumn::GetNEff, - &HMMColumn::SetNEff) - .add_property("n_eff_ins", &HMMColumn::GetNEffIns, - &HMMColumn::SetNEffIns) - .add_property("n_eff_del", &HMMColumn::GetNEffDel, - &HMMColumn::SetNEffDel) - .add_property("one_letter_code", &HMMColumn::GetOneLetterCode, - &HMMColumn::SetOneLetterCode) - .add_property("entropy", &HMMColumn::GetEntropy) - .def("GetFreq", &HMMColumn::GetFreq) - .def("SetFreq", &HMMColumn::SetFreq) - .def("GetTransitionFreq",&HMMColumn::GetTransitionFreq) - .def("SetTransitionFreq",&HMMColumn::SetTransitionFreq) - .def("BLOSUMNullModel", - &HMMColumn::BLOSUMNullModel).staticmethod("BLOSUMNullModel") - ; - - class_<HMMColumnList>("HMMColumnList", init<>()) - .def(vector_indexing_suite<HMMColumnList>()) - ; - - class_<HMM, HMMPtr>("HMM", init<>()) - .def("__len__",&HMM::size) - .def("Load", &HMM::Load).staticmethod("Load") - .def("AddColumn", &HMM::push_back) - .def("Extract", &HMM::Extract) - .add_property("null_model", make_function(&HMM::GetNullModel, - return_value_policy<copy_const_reference>())) - .add_property("columns", - make_function(&HMM::GetColumns, - return_value_policy<copy_const_reference>())) - .add_property("avg_entropy", &HMM::GetAverageEntropy) - .add_property("sequence",&HMM::GetSequence) - ; - - - class_<HMMDB, HMMDBPtr>("HMMDB", init<>()) - .def("Load", &HMMDB::Load).staticmethod("Load") - .def("Save", &HMMDB::Save) - .def("AddHMM", &HMMDB::AddHMM) - .def("GetHMM", &HMMDB::GetHMM) - .def("Size", &HMMDB::size) - .def("GetNames",&wrap_get_names) - ; -} diff --git a/modules/seq/base/pymod/wrap_seq.cc b/modules/seq/base/pymod/wrap_seq.cc index f937590d5cc2aa9e19e057e30b71bfc1170b5426..1dbc2ba17e65435db1480db4c8330d04718d1300 100644 --- a/modules/seq/base/pymod/wrap_seq.cc +++ b/modules/seq/base/pymod/wrap_seq.cc @@ -20,12 +20,10 @@ using namespace boost::python; void export_sequence(); -void export_hmm(); void export_profile_handle(); BOOST_PYTHON_MODULE(_ost_seq) { export_sequence(); - export_hmm(); export_profile_handle(); } diff --git a/modules/seq/base/src/CMakeLists.txt b/modules/seq/base/src/CMakeLists.txt index 80247afad75871d04a73b16dacec349687b3039f..ff359c8ad8dba949f3f0bfbf1aab122b973d4bd9 100644 --- a/modules/seq/base/src/CMakeLists.txt +++ b/modules/seq/base/src/CMakeLists.txt @@ -16,7 +16,6 @@ aligned_column.hh aligned_column_iterator.hh invalid_sequence.hh views_from_sequences.hh -hmm.hh profile_handle.hh ) @@ -30,7 +29,6 @@ sequence_list.cc alignment_handle.cc sequence_op.cc views_from_sequences.cc -hmm.cc profile_handle.cc ) diff --git a/modules/seq/base/src/hmm.cc b/modules/seq/base/src/hmm.cc deleted file mode 100644 index b009f61a05d9e417703b2f848a80cb5462f8db0e..0000000000000000000000000000000000000000 --- a/modules/seq/base/src/hmm.cc +++ /dev/null @@ -1,292 +0,0 @@ -#include <fstream> -#include <iostream> -#include <boost/iostreams/filter/gzip.hpp> -#include <boost/iostreams/filtering_stream.hpp> -#include <ost/string_ref.hh> -#include <stdexcept> -#include "hmm.hh" - -namespace ost { namespace seq{ - -int HMMColumn::GetIndex(char olc){ - if (olc == 'A') return 0; - if (olc >= 'C' && olc <= 'I') return (olc-'A') - 1; - if (olc >= 'K' && olc <= 'N') return (olc-'A') - 2; - if (olc >= 'P' && olc <= 'T') return (olc-'A') - 3; - if (olc == 'V' ) return 17; - if (olc == 'W' ) return 18; - if (olc == 'Y' ) return 19; - return -1; -} - -HMMColumn HMMColumn::BLOSUMNullModel() { - HMMColumn col; - - col.freq_[0] = 0.0732; - col.freq_[1] = 0.0250; - col.freq_[2] = 0.0542; - col.freq_[3] = 0.0547; - col.freq_[4] = 0.0447; - col.freq_[5] = 0.0751; - col.freq_[6] = 0.0261; - col.freq_[7] = 0.0700; - col.freq_[8] = 0.1011; - col.freq_[9] = 0.0584; - col.freq_[10] = 0.0246; - col.freq_[11] = 0.0463; - col.freq_[12] = 0.0351; - col.freq_[13] = 0.0326; - col.freq_[14] = 0.0484; - col.freq_[15] = 0.0573; - col.freq_[16] = 0.0505; - col.freq_[17] = 0.0758; - col.freq_[18] = 0.0124; - col.freq_[19] = 0.0345; - - //it's an allaline... I know, it doesn;t make much sense - col.olc_ = 'A'; - - return col; -} - -Real HMMColumn::GetEntropy() const { - Real entropy = 0.0; - for (const Real* j = this->freqs_begin(), *e2 = this->freqs_end(); j != e2; ++j) { - - if (*j>0.0) { - entropy -= log(*j)**j; - } - } - return entropy; -} - -Real HMMColumn::GetFreq(char ch) const{ - int idx = HMMColumn::GetIndex(ch); - if(idx == -1){ - throw std::runtime_error("Invalid One Letter Code observed when getting frequency in HMMColumn!"); - } - return freq_[idx]; -} - -void HMMColumn::SetFreq(char ch, Real freq){ - int idx = HMMColumn::GetIndex(ch); - if(idx == -1){ - throw std::runtime_error("Invalid One Letter Code observed when setting frequency in HMMColumn!"); - } - freq_[idx] = freq; -} - -HMMPtr HMM::Load(const std::string& filename) { - HMMPtr hmm(new HMM); - boost::iostreams::filtering_stream<boost::iostreams::input> in; - std::ifstream stream(filename.c_str()); - if(!stream){ - throw std::runtime_error("Could not open file!"); - } - if (filename.rfind(".gz") == filename.size()-3) { - in.push(boost::iostreams::gzip_decompressor()); - } - - in.push(stream); - std::string line; - bool null_found = false; - ost::StringRef sline; - while (std::getline(in, line)) { - sline=ost::StringRef(line.c_str(), line.length()); - if (sline.length()>4 && - sline.substr(0, 5)==ost::StringRef("NULL ", 5)) { - null_found = true; - break; - } - } - const char* olcs = "ACDEFGHIKLMNPQRSTVWY"; - if (!null_found) { - throw std::runtime_error("no NULL found in file"); - } - std::vector<ost::StringRef> chunks = sline.split(); - for (int i=1; i<21; ++i) { - if (chunks[i] != ost::StringRef("*", 1)) { - Real freq = pow(2.0, -0.001*chunks[i].to_int().second); - hmm->null_model_.SetFreq(olcs[i-1], freq); - } - } - // read until we hit HMM, then skip 3 more lines - bool hmm_found = false; - while (std::getline(in, line)) { - sline=ost::StringRef(line.c_str(), line.length()); - if (sline.length()>3 && - sline.substr(0, 4)==ost::StringRef("HMM ", 4)) { - for (int i = 0; i<2; ++i ) { - std::getline(in, line); - } - hmm_found = true; - break; - } - } - if (!hmm_found) { - throw std::runtime_error("no HMM found in file"); - } - while (std::getline(in, line)) { - sline = ost::StringRef(line.c_str(), line.length()); - if (sline.trim().empty()) continue; - if (sline.trim() == ost::StringRef("//", 2)) break; - std::vector<ost::StringRef> freqs = sline.split(); - std::string line2; - std::getline(in, line2); - ost::StringRef sline2(line2.c_str(), line2.length()); - std::vector<ost::StringRef> trans = sline2.trim().split(); - hmm->columns_.push_back(HMMColumn()); - hmm->columns_.back().SetOneLetterCode(freqs[0][0]); - for (int i = 2; i < 22; ++i) { - if (freqs[i] != ost::StringRef("*", 1)) { - hmm->columns_.back().SetFreq(olcs[i-2], - pow(2.0, -0.001*freqs[i].to_int().second)); - } - } - HMMState from,to; - int trans_freq_pos = 0; - for(uint i = 0; i < 3; ++i){ - from = HMMState(i); - for(uint j = 0; j < 3; ++j){ - if((i == 1 && j == 2) || (i == 2 && j == 1)) continue; //transition not possible - if (trans[trans_freq_pos] != ost::StringRef("*", 1)){ - Real t_freq = pow(2.0, -0.001*trans[trans_freq_pos].to_int().second); - to = HMMState(j); - hmm->columns_.back().SetTransitionFreq(from, to, t_freq); - } - ++trans_freq_pos; - } - } - - hmm->columns_.back().SetNEff(0.001 * trans[trans_freq_pos].to_int().second); - hmm->columns_.back().SetNEffIns(0.001 * trans[trans_freq_pos+1].to_int().second); - hmm->columns_.back().SetNEffDel(0.001 * trans[trans_freq_pos+2].to_int().second); - } - return hmm; -} - -String HMM::GetSequence() const{ - - std::stringstream ss; - for(HMMColumnList::const_iterator i = this->columns_begin(); - i != this->columns_end(); ++i){ - ss << i->GetOneLetterCode(); - } - return ss.str(); -} - -HMMPtr HMM::Extract(uint from, uint to){ - - if(to <= from){ - throw std::runtime_error("Second index must be bigger than first one!"); - } - - if(to > this->size()){ - throw std::runtime_error("Invalid index!"); - } - - HMMPtr return_hmm(new HMM); - return_hmm->SetNullModel(null_model_); - for(uint i = from; i < to; ++i){ - return_hmm->push_back(columns_[i]); - } - - return return_hmm; -} - - - - -Real HMM::GetAverageEntropy() const { - Real n_eff=0.0; - int n = 0; - for (std::vector<HMMColumn>::const_iterator - i = this->columns_begin(), e = this->columns_end(); i != e; ++i) { - n += 1; - n_eff += i->GetEntropy(); - } - return (n > 0) ? n_eff/n : 0.0; -} - -void HMMDB::Save(const String& filename) const{ - - std::ofstream out_stream(filename.c_str(), std::ios::binary); - - //write out total size - uint32_t total_size = data_.size(); - out_stream.write(reinterpret_cast<char*>(&total_size),4); - - //write out the data elements - char string_size; - for(std::map<String,HMMPtr>::const_iterator i = data_.begin(); - i != data_.end(); ++i){ - string_size = static_cast<char>(i->first.size()); - out_stream.write(reinterpret_cast<char*>(&string_size),1); - out_stream.write(i->first.c_str(),string_size); - out_stream << *i->second; - } - out_stream.close(); -} - -HMMDBPtr HMMDB::Load(const String& filename){ - - std::ifstream in_stream(filename.c_str(), std::ios::binary); - if (!in_stream){ - std::stringstream ss; - ss << "the file '" << filename << "' does not exist."; - throw std::runtime_error(ss.str()); - } - - HMMDBPtr db(new HMMDB); - - //read in the total size - uint32_t total_size; - in_stream.read(reinterpret_cast<char*>(&total_size),4); - - //read in the single hmms - char string_size; - for(uint i = 0; i < total_size; ++i){ - in_stream.read(&string_size,1); - String name(string_size,'X'); - in_stream.read(&name[0],string_size); - HMMPtr hmm(new HMM); - in_stream >> *hmm; - db->AddHMM(name,hmm); - } - - return db; -} - -void HMMDB::AddHMM(const String& name, HMMPtr hmm){ - if(name.size() > 255){ - throw std::runtime_error("Name of HMM must be smaller than 256!"); - } - if(name.empty()){ - throw std::runtime_error("Name must not be empty!"); - } - data_[name] = hmm; -} - -HMMPtr HMMDB::GetHMM(const String& name) const{ - std::map<String,HMMPtr>::const_iterator i = data_.find(name); - if(i == data_.end()){ - std::stringstream ss; - ss << "HMM database does not contain an entry with name "; - ss << name <<"!"; - throw std::runtime_error(ss.str()); - } - return i->second; -} - -std::vector<String> HMMDB::GetNames() const{ - std::vector<String> return_vec; - return_vec.reserve(this->size()); - for(std::map<String,HMMPtr>::const_iterator i = data_.begin(); - i != data_.end(); ++i){ - return_vec.push_back(i->first); - } - return return_vec; -} - - -}} //ns diff --git a/modules/seq/base/src/hmm.hh b/modules/seq/base/src/hmm.hh deleted file mode 100644 index f706911a6b07729fa551d7d2c87a40ee0688b2a5..0000000000000000000000000000000000000000 --- a/modules/seq/base/src/hmm.hh +++ /dev/null @@ -1,285 +0,0 @@ -#ifndef OST_SEQ_HMM_HH -#define OST_SEQ_HMM_HH - -#include <stdint.h> -#include <string> -#include <string.h> // for memcpy, etc -#include <vector> -#include <map> -#include <fstream> -#include <boost/shared_ptr.hpp> -#include <ost/base.hh> - -namespace ost { namespace seq { - -class HMM; -class HMMColumn; -class HMMDB; -typedef boost::shared_ptr<HMM> HMMPtr; -typedef boost::shared_ptr<HMMDB> HMMDBPtr; -typedef std::vector<HMMColumn> HMMColumnList; - -typedef enum { - HMM_MATCH=0, - HMM_INSERT=1, - HMM_DELETE=2 -} HMMState; - - -class HMMColumn { - public: - - HMMColumn() : n_eff_(0.0), n_eff_ins_(0.0), n_eff_del_(0.0) { - memset(freq_, 0, sizeof(Real)*20); - memset(trans_, 0, sizeof(Real)*9); - } - - HMMColumn(const HMMColumn& rhs): olc_(rhs.olc_), n_eff_(rhs.n_eff_), - n_eff_ins_(rhs.n_eff_ins_), n_eff_del_(rhs.n_eff_del_) { - memcpy(freq_, rhs.freq_, sizeof(Real)*20); - memcpy(trans_, rhs.trans_, sizeof(Real)*9); - } - - Real GetTransitionFreq(HMMState from, HMMState to) const { - return trans_[from][to]; - } - - void SetTransitionFreq(HMMState from, HMMState to, Real freq){ - trans_[from][to] = freq; - } - - void SetNEff(Real val) { n_eff_ = val; } - - void SetNEffIns(Real val) { n_eff_ins_ = val; } - - void SetNEffDel(Real val) { n_eff_del_ = val; } - - Real GetNEff() const { return n_eff_; } - - Real GetNEffIns() const { return n_eff_ins_; } - - Real GetNEffDel() const { return n_eff_del_; } - - Real GetFreq(char ch) const; - - void SetFreq(char ch, Real freq); - - bool operator==(const HMMColumn& rhs) const { - return !memcmp(freq_, rhs.freq_, sizeof(freq_)) && - !memcmp(trans_, rhs.trans_, sizeof(trans_)) && - n_eff_ == rhs.n_eff_ && - n_eff_ins_ == rhs.n_eff_ins_ && - n_eff_del_ == rhs.n_eff_del_; - } - - Real* freqs_begin() { return freq_; } - Real* freqs_end() { return freq_+20; } - const Real* freqs_begin() const { return freq_; } - const Real* freqs_end() const { return freq_+20; } - char GetOneLetterCode() const { return olc_; } - void SetOneLetterCode(char olc) { olc_ = olc; } - Real GetEntropy() const; - static int GetIndex(char ch); - - static HMMColumn BLOSUMNullModel(); - - //functions to feed streams with limited accuracy of internal data - //not intended for python export - - friend std::ofstream& operator<<(std::ofstream& os, HMMColumn& col){ - - char data[61]; - char* data_ptr = &data[0]; - - //transform aa_freq - for(uint i = 0; i < 20; ++i){ - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.freq_[i]*10000); - data_ptr+=2; - } - //transform transition freq - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[0][0]*10000); //M-M - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[0][1]*10000); //M-I - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[0][2]*10000); //M-D - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[1][0]*10000); //I-M - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[1][1]*10000); //I-I - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[2][0]*10000); //D-M - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.trans_[2][2]*10000); //D-D - data_ptr+=2; - - //transform neff values - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.n_eff_*1000); - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.n_eff_ins_*1000); - data_ptr+=2; - *(reinterpret_cast<int16_t*>(data_ptr)) = static_cast<int16_t>(col.n_eff_del_*1000); - data_ptr+=2; - - //finally write one letter code - *data_ptr = col.olc_; - - os.write(data,61); - return os; - } - - friend std::ifstream& operator>>(std::ifstream& is, HMMColumn& col){ - - char data[61]; - char* data_ptr = &data[0]; - is.read(data,61); - - //transform aa_freq - for(uint i = 0; i < 20; ++i){ - col.freq_[i] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; - data_ptr+=2; - } - //transform transition freq - col.trans_[0][0] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //M-M - data_ptr+=2; - col.trans_[0][1] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //M-I - data_ptr+=2; - col.trans_[0][2] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //M-D - data_ptr+=2; - col.trans_[1][0] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //I-M - data_ptr+=2; - col.trans_[1][1] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //I-I - data_ptr+=2; - col.trans_[2][0] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //D-M - data_ptr+=2; - col.trans_[2][2] = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.0001; //D-D - data_ptr+=2; - - //transform neff values - col.n_eff_ = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.001; - data_ptr+=2; - col.n_eff_ins_ = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.001; - data_ptr+=2; - col.n_eff_del_ = *(reinterpret_cast<int16_t*>(data_ptr)) * 0.001; - data_ptr+=2; - - //finally read one letter code - col.olc_ = *data_ptr; - - return is; - } - - private: - char olc_; - Real freq_[20]; - Real trans_[3][3]; - Real n_eff_; - Real n_eff_ins_; - Real n_eff_del_; -}; - - -class HMM { - public: - HMM() {} - - static HMMPtr Load(const std::string& filename); - - const std::vector<HMMColumn>& GetColumns() const { return columns_; } - - const HMMColumn& GetNullModel() const { return null_model_; } - - void SetNullModel(const HMMColumn& null_model) { null_model_ = null_model; } - - String GetSequence() const; - - HMMPtr Extract(uint from, uint to); - - //some functions to make it behave like a vector - - size_t size() const { return columns_.size(); } - - bool empty() const { return columns_.empty(); } - - void push_back(const HMMColumn& c) { columns_.push_back(c); } - - HMMColumn& operator[](size_t index) { return columns_[index]; } - - const HMMColumn& operator[](size_t index) const { return columns_[index]; } - - HMMColumn& at(size_t index) { return columns_.at(index); } - - const HMMColumn& at(size_t index) const { return columns_.at(index); } - - bool operator==(const HMM& other) const { return columns_ == other.columns_ && - null_model_ == other.null_model_; } - - bool operator!=(const HMM& other) const { return !(other == (*this)); } - - - HMMColumnList::const_iterator columns_end() const { return columns_.end(); } - HMMColumnList::iterator columns_end() { return columns_.end(); } - HMMColumnList::const_iterator columns_begin() const { return columns_.begin(); } - HMMColumnList::iterator columns_begin() { return columns_.begin(); } - - Real GetAverageEntropy() const; - - friend std::ofstream& operator<<(std::ofstream& os, HMM& hmm){ - - os << hmm.null_model_; - - uint32_t size = hmm.size(); - os.write(reinterpret_cast<char*>(&size),4); - - for(uint i = 0; i < size; ++i){ - os << hmm.columns_[i]; - } - - return os; - } - - friend std::ifstream& operator>>(std::ifstream& is, HMM& hmm){ - - is >> hmm.null_model_; - - uint32_t size; - is.read(reinterpret_cast<char*>(&size),4); - - hmm.columns_.resize(size); - - for(uint i = 0; i < size; ++i){ - is >> hmm.columns_[i]; - } - - return is; - } - - private: - HMMColumn null_model_; - HMMColumnList columns_; -}; - - -class HMMDB{ - -public: - - void Save(const String& filename) const; - - static HMMDBPtr Load(const String& filename); - - void AddHMM(const String& name, HMMPtr hmm); - - HMMPtr GetHMM(const String& name) const; - - size_t size() const { return data_.size(); } - - std::vector<String> GetNames() const; - -private: - - std::map<String,HMMPtr> data_; -}; - -}} - -#endif diff --git a/modules/seq/base/tests/CMakeLists.txt b/modules/seq/base/tests/CMakeLists.txt index 3bdf195757796a792f1efd69f5616d9d8f3ff70a..c160c4b320d178ce1be2255ad2d4da20d2e5c09c 100644 --- a/modules/seq/base/tests/CMakeLists.txt +++ b/modules/seq/base/tests/CMakeLists.txt @@ -4,7 +4,6 @@ set(OST_SEQ_UNIT_TESTS test_aligned_column.cc test_aligned_region.cc test_alignment.cc - test_hmm.cc test_profile.cc tests.cc ) diff --git a/modules/seq/base/tests/test_hmm.cc b/modules/seq/base/tests/test_hmm.cc deleted file mode 100644 index 47a15f9d0a56757f4926f8081499325786114146..0000000000000000000000000000000000000000 --- a/modules/seq/base/tests/test_hmm.cc +++ /dev/null @@ -1,142 +0,0 @@ -//------------------------------------------------------------------------------ -// This file is part of the OpenStructure project <www.openstructure.org> -// -// Copyright (C) 2008-2015 by the OpenStructure authors -// -// This library is free software; you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 3.0 of the License, or (at your option) -// any later version. -// This library is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this library; if not, write to the Free Software Foundation, Inc., -// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -//------------------------------------------------------------------------------ -#define BOOST_TEST_DYN_LINK -#include <boost/test/unit_test.hpp> -#include <boost/test/auto_unit_test.hpp> -#include <iostream> - -#include <ost/seq/hmm.hh> - -using namespace ost::seq; - - -BOOST_AUTO_TEST_SUITE( hmm ); - - -BOOST_AUTO_TEST_CASE(hmm_loading) -{ - HMMPtr hmm = HMM::Load("testfiles/test_hmm.hhm"); - BOOST_CHECK_EQUAL(hmm->size(),5); - //check sequence with different methods of getting the columns - BOOST_CHECK_EQUAL((*hmm)[0].GetOneLetterCode(),'E'); - BOOST_CHECK_EQUAL((hmm->columns_begin()+1)->GetOneLetterCode(),'T'); - BOOST_CHECK_EQUAL(hmm->GetColumns()[2].GetOneLetterCode(),'G'); - BOOST_CHECK_EQUAL(hmm->at(3).GetOneLetterCode(),'S'); - BOOST_CHECK_EQUAL((hmm->columns_end()-1)->GetOneLetterCode(),'P'); - - //we first check, whether the null model has been read correctly - //and then randomly pick a column to check - - HMMColumn null_model = hmm->GetNullModel(); - - String olc = "ACDEFGHIKLMNPQRSTVWY"; - - Real correct_aa_freqs[] = {pow(2.0,-0.001*3706), pow(2.0,-0.001*5728), - pow(2.0,-0.001*4211), pow(2.0,-0.001*4064), - pow(2.0,-0.001*4839), pow(2.0,-0.001*3729), - pow(2.0,-0.001*4763), pow(2.0,-0.001*4308), - pow(2.0,-0.001*4069), pow(2.0,-0.001*3323), - pow(2.0,-0.001*5509), pow(2.0,-0.001*4640), - pow(2.0,-0.001*4464), pow(2.0,-0.001*4937), - pow(2.0,-0.001*4285), pow(2.0,-0.001*4423), - pow(2.0,-0.001*3815), pow(2.0,-0.001*3783), - pow(2.0,-0.001*6325), pow(2.0,-0.001*4665)}; - - for(int i = 0; i < 20; ++i){ - BOOST_CHECK_CLOSE(null_model.GetFreq(olc[i]),correct_aa_freqs[i],Real(1e-5)); - } - - //transition frequencies should all be zero in the null model - HMMState from,to; - for(int i = 0; i < 3; ++i){ - from = HMMState(i); - for(int j = 0; j < 3; ++j){ - to = HMMState(j); - BOOST_CHECK_EQUAL(null_model.GetTransitionFreq(from,to),0.0); - } - } - - //NEffs should also be zero - BOOST_CHECK_EQUAL(null_model.GetNEff(),0.0); - BOOST_CHECK_EQUAL(null_model.GetNEffIns(),0.0); - BOOST_CHECK_EQUAL(null_model.GetNEffDel(),0.0); - - - //let's check, whether a particular column has been read correctly - HMMColumn col = (*hmm)[2]; - BOOST_CHECK_EQUAL(col.GetOneLetterCode(),'G'); - BOOST_CHECK_CLOSE(col.GetNEff(),3.076,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetNEffIns(),1.097,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetNEffDel(),0.0,Real(1e-5)); - - memset(correct_aa_freqs,0,20*sizeof(Real)); - correct_aa_freqs[0] = pow(2.0,-0.001*3676); - correct_aa_freqs[1] = pow(2.0,-0.001*2597); - correct_aa_freqs[5] = pow(2.0,-0.001*5359); - correct_aa_freqs[6] = pow(2.0,-0.001*3275); - correct_aa_freqs[11] = pow(2.0,-0.001*3292); - correct_aa_freqs[12] = pow(2.0,-0.001*5077); - correct_aa_freqs[13] = pow(2.0,-0.001*3826); - correct_aa_freqs[15] = pow(2.0,-0.001*2409); - correct_aa_freqs[16] = pow(2.0,-0.001*3733); - correct_aa_freqs[17] = pow(2.0,-0.001*4503); - correct_aa_freqs[19] = pow(2.0,-0.001*3070); - - for(int i = 0; i < 20; ++i){ - BOOST_CHECK_CLOSE(col.GetFreq(olc[i]),correct_aa_freqs[i],Real(1e-5)); - } - - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_MATCH),pow(2.0,-0.001*46.0),Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_INSERT),pow(2.0,-0.001*4979.0),Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_MATCH,HMM_DELETE),0.0,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_MATCH),pow(2.0,-0.001*2006.0),Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_INSERT),pow(2.0,-0.001*413.0),Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_MATCH),0.0,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_DELETE),0.0,Real(1e-5)); - //following transitions are impossible, so frequency must be zero anyway - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_DELETE,HMM_INSERT),0.0,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetTransitionFreq(HMM_INSERT,HMM_DELETE),0.0,Real(1e-5)); - - BOOST_CHECK_CLOSE(col.GetNEff(),3076*0.001,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetNEffIns(),1097*0.001,Real(1e-5)); - BOOST_CHECK_CLOSE(col.GetNEffDel(),0.0,Real(1e-5)); - - - //check, whether the entropy of the column above gets correctly calculated - Real entropy = 0.0; - entropy -= correct_aa_freqs[0] * log(correct_aa_freqs[0]); - entropy -= correct_aa_freqs[1] * log(correct_aa_freqs[1]); - entropy -= correct_aa_freqs[5] * log(correct_aa_freqs[5]); - entropy -= correct_aa_freqs[6] * log(correct_aa_freqs[6]); - entropy -= correct_aa_freqs[11] * log(correct_aa_freqs[11]); - entropy -= correct_aa_freqs[12] * log(correct_aa_freqs[12]); - entropy -= correct_aa_freqs[13] * log(correct_aa_freqs[13]); - entropy -= correct_aa_freqs[15] * log(correct_aa_freqs[15]); - entropy -= correct_aa_freqs[16] * log(correct_aa_freqs[16]); - entropy -= correct_aa_freqs[17] * log(correct_aa_freqs[17]); - entropy -= correct_aa_freqs[19] * log(correct_aa_freqs[19]); - BOOST_CHECK_CLOSE((*hmm)[2].GetEntropy(),entropy,Real(1e-5)); - - //look at avg entropy... - BOOST_CHECK_CLOSE(hmm->GetAverageEntropy(),1.48704576,Real(1e-5)); -} - - - -BOOST_AUTO_TEST_SUITE_END(); diff --git a/modules/seq/base/tests/testfiles/test_hmm.hhm b/modules/seq/base/tests/testfiles/test_hmm.hhm deleted file mode 100644 index 1b9a665d1e1ce09e89efa6f38f1648a38bdbe529..0000000000000000000000000000000000000000 --- a/modules/seq/base/tests/testfiles/test_hmm.hhm +++ /dev/null @@ -1,31 +0,0 @@ -HHsearch 1.5 -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -I don't get parsed -# -NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 -HMM A C D E F G H I K L M N P Q R S T V W Y - M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D - 0 * * 0 * 0 * * * * -E 1 1780 * * 497 * * * * * * * * * * * * * * * * 1 - 0 * * * * * * 1610 0 0 - -T 2 1615 * * * * 2652 * * * * * * * * * * 3992 1147 * * 2 - 0 * * * * * * 2652 0 0 - -G 3 3676 2597 * * * 5359 3275 * * * * 3292 5077 3826 * 2409 3733 4503 * 3070 3 - 46 4979 * 2006 413 * * 3076 1097 0 - -S 4 2579 * * * * * * * * 4320 * 4532 3895 4519 * 951 5140 3595 * * 4 - 58 4660 * 2833 218 * * 3050 1146 0 - -P 5 2585 * 4184 * * 2345 4748 * * * * * 2839 1766 5056 3635 * * * * 5 - 0 * * * * * * 3036 0 0 - -//