diff --git a/modules/seq/base/doc/seq.rst b/modules/seq/base/doc/seq.rst index da94a428bcd6f589c9cab4ae2c084c597de13e10..254082fe0930b87605b50d54e4f382d5690c1c54 100644 --- a/modules/seq/base/doc/seq.rst +++ b/modules/seq/base/doc/seq.rst @@ -519,6 +519,49 @@ single columns containing amino acid frequencies and transition probabilities. Average entropy of all the columns +.. class:: HMMDB + + A simple database to gather :class:`HMM` objects. It is possible + to save them to disk in a compressed format with limited accuracy + (4 digits for freq values). + + .. method:: Save(filename) + + :param filename: Name of file that will be generated on disk. + :type filename: :class:`str` + + .. method:: Load(filename) + + Static loading method + + :param filename: Name of file from which the database should be loaded. + :type filename: :class:`str` + :returns: The loaded database + + .. method:: AddHMM(name, hmm) + + :param name: Name of HMM to be added + :param hmm: HMM to be added + + :type name: :class:`str` + :type hmm: :class:`HMM` + :raises: :class:`Exception` when filename is longer than 255 characters. + + .. method:: GetHMM(name) + + :param name: Name of HMM to be returned + :type name: :class:`str` + :returns: The requested :class:`HMM` + :raises: :class:`Exception` when no :class:`HMM` for **name** exists. + + .. method:: Size() + + :returns: Number of :class:`HMM` objects in the database + + .. method:: GetNames() + + :returns: A nonsorted list of the names of all :class:`HMM` objects in the database + diff --git a/modules/seq/base/pymod/export_hmm.cc b/modules/seq/base/pymod/export_hmm.cc index fea380dcab1f3fb56d3077722d125f5b9020a55f..8ca85193958010d44d02c1b6506268ee7b2e7f1b 100644 --- a/modules/seq/base/pymod/export_hmm.cc +++ b/modules/seq/base/pymod/export_hmm.cc @@ -4,6 +4,20 @@ using namespace ost::seq; using namespace boost::python; +namespace{ + +boost::python::list wrap_get_names(HMMDBPtr db){ + std::vector<String> v_names = db->GetNames(); + boost::python::list names; + for(std::vector<String>::iterator i = v_names.begin(); + i != v_names.end(); ++i){ + names.append(*i); + } + return names; +} + +} + void export_hmm() { @@ -46,4 +60,14 @@ void export_hmm() .add_property("avg_entropy", &HMM::GetAverageEntropy) .add_property("sequence",&HMM::GetSequence) ; + + + class_<HMMDB, HMMDBPtr>("HMMDB", init<>()) + .def("Load", &HMMDB::Load).staticmethod("Load") + .def("Save", &HMMDB::Save) + .def("AddHMM", &HMMDB::AddHMM) + .def("GetHMM", &HMMDB::GetHMM) + .def("Size", &HMMDB::size) + .def("GetNames",&wrap_get_names) + ; } diff --git a/modules/seq/base/src/hmm.cc b/modules/seq/base/src/hmm.cc index e53b062731f668fd06bfcc5b17d41bad86c2fa93..390a55b82eb556ced56dbda62bf97de31dc8cb35 100644 --- a/modules/seq/base/src/hmm.cc +++ b/modules/seq/base/src/hmm.cc @@ -171,4 +171,85 @@ Real HMM::GetAverageEntropy() const { return (n > 0) ? n_eff/n : 0.0; } +void HMMDB::Save(const String& filename) const{ + + std::ofstream out_stream(filename.c_str(), std::ios::binary); + + //write out total size + uint32_t total_size = data_.size(); + out_stream.write(reinterpret_cast<char*>(&total_size),4); + + //write out the data elements + char string_size; + for(std::map<String,HMMPtr>::const_iterator i = data_.begin(); + i != data_.end(); ++i){ + string_size = static_cast<char>(i->first.size()); + out_stream.write(reinterpret_cast<char*>(&string_size),1); + out_stream.write(i->first.c_str(),string_size); + out_stream << *i->second; + } + out_stream.close(); +} + +HMMDBPtr HMMDB::Load(const String& filename){ + + std::ifstream in_stream(filename.c_str(), std::ios::binary); + if (!in_stream){ + std::stringstream ss; + ss << "the file '" << filename << "' does not exist."; + throw std::runtime_error(ss.str()); + } + + HMMDBPtr db(new HMMDB); + + //read in the total size + uint32_t total_size; + in_stream.read(reinterpret_cast<char*>(&total_size),4); + + //read in the single hmms + char string_size; + for(uint i = 0; i < total_size; ++i){ + in_stream.read(&string_size,1); + String name(string_size,'X'); + in_stream.read(&name[0],string_size); + HMMPtr hmm(new HMM); + in_stream >> *hmm; + db->AddHMM(name,hmm); + } + + return db; +} + +void HMMDB::AddHMM(const String& name, HMMPtr hmm){ + if(name.size() > 255){ + throw std::runtime_error("Name of HMM must be smaller than 256!"); + } + if(name.empty()){ + throw std::runtime_error("Name must not be empty!"); + } + data_[name] = hmm; +} + +HMMPtr HMMDB::GetHMM(const String& name) const{ + std::map<String,HMMPtr>::const_iterator i = data_.find(name); + if(i == data_.end()){ + std::stringstream ss; + ss << "HMM database does not contain an entry with name "; + ss << name <<"!"; + throw std::runtime_error(ss.str()); + } + return i->second; +} + +std::vector<String> HMMDB::GetNames() const{ + std::vector<String> return_vec; + return_vec.reserve(this->size()); + for(std::map<String,HMMPtr>::const_iterator i = data_.begin(); + i != data_.end(); ++i){ + return_vec.push_back(i->first); + } + return return_vec; +} + + }} //ns diff --git a/modules/seq/base/src/hmm.hh b/modules/seq/base/src/hmm.hh index 0dcaf36866e316ca1bd53b13d44406252fc21845..82826948e6a81e4d96d815357196dd0bfa5d2bdd 100644 --- a/modules/seq/base/src/hmm.hh +++ b/modules/seq/base/src/hmm.hh @@ -3,11 +3,20 @@ #include <string> #include <vector> +#include <map> #include <fstream> #include <boost/shared_ptr.hpp> #include <ost/base.hh> namespace ost { namespace seq { + +class HMM; +class HMMColumn; +class HMMDB; +typedef boost::shared_ptr<HMM> HMMPtr; +typedef boost::shared_ptr<HMMDB> HMMDBPtr; +typedef std::vector<HMMColumn> HMMColumnList; + typedef enum { HMM_MATCH=0, HMM_INSERT=1, @@ -171,11 +180,6 @@ class HMMColumn { }; -class HMM; -typedef boost::shared_ptr<HMM> HMMPtr; -typedef std::vector<HMMColumn> HMMColumnList; -// Represents a HHsearch profile -// Minimalistic on Purpose. class HMM { public: HMM() {} @@ -249,13 +253,33 @@ class HMM { return is; } - - private: HMMColumn null_model_; HMMColumnList columns_; }; + +class HMMDB{ + +public: + + void Save(const String& filename) const; + + static HMMDBPtr Load(const String& filename); + + void AddHMM(const String& name, HMMPtr hmm); + + HMMPtr GetHMM(const String& name) const; + + size_t size() const { return data_.size(); } + + std::vector<String> GetNames() const; + +private: + + std::map<String,HMMPtr> data_; +}; + }} #endif