diff --git a/modules/io/src/seq/hhm_io_handler.cc b/modules/io/src/seq/hhm_io_handler.cc index b524f3161bae3a85f541ad2213d7fcb9cf07b138..bc1effe40944a244b4b4f648f05d57f8278fcea8 100644 --- a/modules/io/src/seq/hhm_io_handler.cc +++ b/modules/io/src/seq/hhm_io_handler.cc @@ -58,6 +58,67 @@ seq::ProfileColumn GetColumn(const std::vector<ost::StringRef>& chunks, } return pc; } + +seq::HMMDataPtr GetHMMData(const std::vector<ost::StringRef>& chunks, + const std::vector<uint>& idx, + const std::vector<seq::HMMTransition>& transitions, + int neff_idx, + int neff_i_idx, + int neff_d_idx, + const String& exception) { + + // check chunks size + if(chunks.size() != 10) { + throw IOException(exception); + } + + seq::HMMDataPtr data(new seq::HMMData); + for (uint i = 0; i < idx.size(); ++i) { + if (chunks[idx[i]] != ost::StringRef("*", 1)) { + std::pair<bool, int> p = chunks[idx[i]].to_int(); + if (!p.first) { + throw IOException(exception); + } + data->SetProb(transitions[i], pow(2.0, -0.001 * p.second)); + } else { + data->SetProb(transitions[i], 0.0); + } + } + + if (chunks[neff_idx] != ost::StringRef("*", 1)) { + std::pair<bool, int> p = chunks[neff_idx].to_int(); + if (!p.first) { + throw IOException(exception); + } + data->SetNeff(0.001 * p.second); + } else { + data->SetNeff(0.0); + } + + if (chunks[neff_i_idx] != ost::StringRef("*", 1)) { + std::pair<bool, int> p = chunks[neff_i_idx].to_int(); + if (!p.first) { + throw IOException(exception); + } + data->SetNeff_I(0.001 * p.second); + } else { + data->SetNeff_I(0.0); + } + + if (chunks[neff_d_idx] != ost::StringRef("*", 1)) { + std::pair<bool, int> p = chunks[neff_d_idx].to_int(); + if (!p.first) { + throw IOException(exception); + } + data->SetNeff_D(0.001 * p.second); + } else { + data->SetNeff_D(0.0); + } + + return data; +} + + } // anon ns void HhmIOHandler::Import(seq::ProfileHandle& prof, @@ -81,11 +142,18 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, std::string line; ost::StringRef sline; std::string null_line; + std::string neff_line; char olcs[20]; + std::vector<ost::seq::HMMTransition> transitions; + std::vector<uint> transition_idx; + int neff_idx = -1; + int neff_i_idx = -1; + int neff_d_idx = -1; std::vector<ost::StringRef> chunks; - // read line-by-line looking for NULL + // read line-by-line looking for NEFF and NULL bool null_found = false; + bool neff_found = false; while (std::getline(in, line)) { sline = ost::StringRef(line.c_str(), line.length()); if (sline.length()>4 && @@ -94,6 +162,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, null_found = true; break; } + if (sline.length()>4 && + sline.substr(0, 5) == ost::StringRef("NEFF ", 5)) { + neff_line = line; + neff_found = true; + } } if (!null_found) { throw IOException("No NULL found in file " + loc.string()); @@ -115,8 +188,57 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } olcs[i-1] = chunks[i][0]; } - // skip 2 lines and get out + // extract hmm data std::getline(in, line); + sline = ost::StringRef(line.c_str(), line.length()); + chunks = sline.split(); + if (chunks.size() != 10) { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + + for (uint i = 0; i < 10; ++i) { + if(chunks[i] == ost::StringRef("M->M", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_M2M); + } else if(chunks[i] == ost::StringRef("M->I", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_M2I); + } else if(chunks[i] == ost::StringRef("M->D", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_M2D); + } else if(chunks[i] == ost::StringRef("I->M", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_I2M); + } else if(chunks[i] == ost::StringRef("I->I", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_I2I); + } else if(chunks[i] == ost::StringRef("D->M", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_D2M); + } else if(chunks[i] == ost::StringRef("D->D", 4)) { + transition_idx.push_back(i); + transitions.push_back(ost::seq::HMM_D2D); + } else if(chunks[i] == ost::StringRef("Neff", 4)) { + neff_idx = i; + } else if(chunks[i] == ost::StringRef("Neff_I", 6)) { + neff_i_idx = i; + } else if(chunks[i] == ost::StringRef("Neff_D", 6)) { + neff_d_idx = i; + } else { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + } + + // check whether we really got everything + if(neff_idx == -1 || neff_i_idx == -1 || neff_d_idx == -1) { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + + if(transition_idx.size() != 7) { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + + // skip one line std::getline(in, line); hmm_found = true; break; @@ -142,8 +264,42 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, // frequencies prof.AddColumn(GetColumn(chunks, 2, olcs, "Badly formatted line\n" + line + "\n in " + loc.string()), olc); - // skip line (trans. matrix) + // get transition probabilities std::getline(in, line); + sline = ost::StringRef(line.c_str(), line.length()); + if (sline.trim().empty()) continue; + if (sline.trim() == ost::StringRef("//", 2)) break; + chunks = sline.split(); + prof.back().SetHMMData(GetHMMData(chunks, transition_idx, + transitions, neff_idx, neff_i_idx, neff_d_idx, + "Badly formatted line\n" + line + "\n in " + + loc.string())); + } + + // parse neff if it's there, calculate the average of every column otherwise + if(neff_found) { + sline = ost::StringRef(neff_line.c_str(), neff_line.length()); + chunks = sline.split(); + if(chunks.size() != 2) { + throw IOException("Badly formatted line\n" + neff_line+ "\n in " + + loc.string()); + } + std::pair<bool, float> p = chunks[1].to_float(); + if (!p.first) { + throw IOException("Badly formatted line\n" + neff_line+ "\n in " + + loc.string()); + } + prof.SetNeff(p.second); + } else { + Real summed_neff = 0.0; + for(size_t i = 0; i < prof.size(); ++i) { + summed_neff += prof[i].GetHMMData()->GetNeff(); + } + if(prof.size() > 0) { + prof.SetNeff(summed_neff/prof.size()); + } else{ + prof.SetNeff(1.0); + } } } diff --git a/modules/seq/base/pymod/export_profile_handle.cc b/modules/seq/base/pymod/export_profile_handle.cc index 101446d4584f3c26ffd9e57f9241e51a60953e64..41834d556ac9fc84ad8bb5619875105d44a6899f 100644 --- a/modules/seq/base/pymod/export_profile_handle.cc +++ b/modules/seq/base/pymod/export_profile_handle.cc @@ -44,10 +44,30 @@ boost::python::list wrap_get_names(ProfileDBPtr db) { void export_profile_handle() { + + enum_<HMMTransition>("HMMTransition") + .value("HMM_M2M", HMM_M2M) + .value("HMM_M2I", HMM_M2I) + .value("HMM_M2D", HMM_M2D) + .value("HMM_I2M", HMM_I2M) + .value("HMM_I2I", HMM_I2I) + .value("HMM_D2M", HMM_D2M) + .value("HMM_D2D", HMM_D2D) + ; + + class_<HMMData>("HMMData", init<>()) + .add_property("neff", &HMMData::GetNeff, &HMMData::SetNeff) + .add_property("neff_i", &HMMData::GetNeff_I, &HMMData::SetNeff_I) + .add_property("neff_d", &HMMData::GetNeff_D, &HMMData::SetNeff_D) + .def("GetProb", &HMMData::GetProb, (arg("transition"))) + .def("SetProb", &HMMData::SetProb, (arg("transition"), arg("prob"))) + ; + class_<ProfileColumn>("ProfileColumn", init<>()) .add_property("entropy", &ProfileColumn::GetEntropy) .def("GetFreq", &ProfileColumn::GetFreq, (arg("aa"))) .def("SetFreq", &ProfileColumn::SetFreq, (arg("aa"), arg("freq"))) + .add_property("hmm_data", &ProfileColumn::GetHMMData, &ProfileColumn::SetHMMData) .def("GetScore", &ProfileColumn::GetScore, (arg("other"), arg("null_model"))) .def("BLOSUMNullModel", &ProfileColumn::BLOSUMNullModel) @@ -76,6 +96,8 @@ void export_profile_handle() .add_property("avg_entropy", &ProfileHandle::GetAverageEntropy) .add_property("sequence", &ProfileHandle::GetSequence, &ProfileHandle::SetSequence) + .add_property("neff", &ProfileHandle::GetNeff, + &ProfileHandle::SetNeff) ; class_<ProfileDB, ProfileDBPtr>("ProfileDB", init<>()) diff --git a/modules/seq/base/src/profile_handle.cc b/modules/seq/base/src/profile_handle.cc index c6aee7a587e45d95ee063e93472de52aafdd3a60..3daa7f185ac6678da8bea5ebf25bb48e6c40b456 100644 --- a/modules/seq/base/src/profile_handle.cc +++ b/modules/seq/base/src/profile_handle.cc @@ -137,7 +137,7 @@ Real ProfileColumn::GetScore(const ProfileColumn& other, // ProfileHandle ////////////////////////////////////////////////////////////////////////////// -ProfileHandlePtr ProfileHandle::Extract(uint from, uint to) { +ProfileHandlePtr ProfileHandle::Extract(uint from, uint to) const { // check if (to <= from) { throw Error("Second index must be bigger than first one!"); @@ -187,10 +187,17 @@ Real ProfileHandle::GetAverageScore(const ProfileHandle& other, // ProfileDB ////////////////////////////////////////////////////////////////////////////// + void ProfileDB::Save(const String& filename) const{ std::ofstream out_stream(filename.c_str(), std::ios::binary); + uint32_t magic_number = 42; + out_stream.write(reinterpret_cast<char*>(&magic_number), sizeof(uint32_t)); + + uint8_t version = 1; + out_stream.write(reinterpret_cast<char*>(&version), sizeof(uint8_t)); + //write out total size uint32_t total_size = data_.size(); out_stream.write(reinterpret_cast<char*>(&total_size),4); @@ -218,6 +225,26 @@ ProfileDBPtr ProfileDB::Load(const String& filename){ ProfileDBPtr db(new ProfileDB); + uint32_t magic_number; + in_stream.read(reinterpret_cast<char*>(&magic_number), sizeof(uint32_t)); + + if(magic_number != 42) { + std::stringstream ss; + ss << "Could not read magic number in " << filename<<". Either the file "; + ss << "is corrupt, does not contain a ProfileDB or is of an old version "; + ss << "which is not supported anymore."; + throw Error(ss.str()); + } + + uint8_t version; + in_stream.read(reinterpret_cast<char*>(&version), sizeof(uint8_t)); + if(version != 1) { + std::stringstream ss; + ss << "ProfileDB in " << filename << " is of version " << version; + ss << " but only version 1 can be read."; + throw Error(ss.str()); + } + //read in the total size uint32_t total_size; in_stream.read(reinterpret_cast<char*>(&total_size),4); diff --git a/modules/seq/base/src/profile_handle.hh b/modules/seq/base/src/profile_handle.hh index 4a06461b86940a96eb3b76945324984bdeb9e03b..0ec31d3de2dfa35d619d7b23f92f5ac258a9b438 100644 --- a/modules/seq/base/src/profile_handle.hh +++ b/modules/seq/base/src/profile_handle.hh @@ -39,12 +39,128 @@ namespace ost { namespace seq { class ProfileHandle; class ProfileColumn; +class HMMData; class ProfileDB; typedef boost::shared_ptr<ProfileHandle> ProfileHandlePtr; +typedef boost::shared_ptr<HMMData> HMMDataPtr; typedef std::vector<ProfileHandlePtr> ProfileHandleList; typedef boost::shared_ptr<ProfileDB> ProfileDBPtr; typedef std::vector<ProfileColumn> ProfileColumnList; +typedef enum { + HMM_M2M = 0, HMM_M2I = 1, HMM_M2D = 2, + HMM_I2M = 3, HMM_I2I = 4, + HMM_D2M = 5, HMM_D2D = 6 +} HMMTransition; + +class DLLEXPORT_OST_SEQ HMMData { +public: + HMMData() { + memset(trans_, 0, 7*sizeof(Real)); + trans_[HMM_M2M] = 1.0; + trans_[HMM_I2M] = 1.0; + trans_[HMM_D2M] = 1.0; + neff_ = 1.0; + neff_i_ = 1.0; + neff_d_ = 1.0; + } + + HMMData(const HMMData& rhs) { + memcpy(trans_, rhs.trans_, 7*sizeof(Real)); + neff_ = rhs.neff_; + neff_i_ = rhs.neff_i_; + neff_d_ = rhs.neff_d_; + } + + Real GetProb(HMMTransition transition) const { + return trans_[transition]; + } + + void SetProb(HMMTransition transition, Real prob) { + trans_[transition] = prob; + } + + Real GetNeff() const { + return neff_; + } + + void SetNeff(Real neff) { + neff_ = neff; + } + + Real GetNeff_I() const { + return neff_i_; + } + + void SetNeff_I(Real neff) { + neff_i_ = neff; + } + + Real GetNeff_D() const { + return neff_d_; + } + + void SetNeff_D(Real neff) { + neff_d_ = neff; + } + + bool operator==(const HMMData& rhs) const { + return (!memcmp(trans_, rhs.trans_, sizeof(trans_)) && + neff_ == rhs.neff_ && + neff_i_ == rhs.neff_i_ && + neff_d_ == rhs.neff_d_); + } + bool operator!=(const HMMData& rhs) const { return !(rhs == (*this)); } + + HMMData& operator=(const HMMData& rhs) { + memcpy(trans_, rhs.trans_, 7*sizeof(Real)); + neff_ = rhs.neff_; + neff_i_ = rhs.neff_i_; + neff_d_ = rhs.neff_d_; + return *this; + } + + friend std::ofstream& operator<<(std::ofstream& os, HMMData& dat) { + + int16_t p_data[7]; + for (uint i = 0; i < 7; ++i) { + p_data[i] = static_cast<int16_t>(dat.trans_[i]*10000); + } + os.write(reinterpret_cast<char*>(p_data), 7*sizeof(int16_t)); + + float neff_data[3]; + neff_data[0] = dat.neff_; + neff_data[1] = dat.neff_i_; + neff_data[2] = dat.neff_d_; + os.write(reinterpret_cast<char*>(neff_data), 3*sizeof(float)); + + return os; + } + + friend std::ifstream& operator>>(std::ifstream& is, HMMData& dat) { + + int16_t p_data[7]; + is.read(reinterpret_cast<char*>(p_data), 7*sizeof(int16_t)); + for (uint i = 0; i < 7; ++i) { + dat.trans_[i] = p_data[i] * 0.0001; + } + + float neff_data[3]; + is.read(reinterpret_cast<char*>(neff_data), 3*sizeof(float)); + dat.neff_ = neff_data[0]; + dat.neff_i_ = neff_data[1]; + dat.neff_d_ = neff_data[2]; + + return is; + } + +private: + Real trans_[7]; + Real neff_; + Real neff_i_; + Real neff_d_; +}; + /// \brief Defines profile of 20 frequencies for one residue. /// /// Frequencies are identified by the one-letter-code for that amino acid. @@ -59,15 +175,43 @@ public: ProfileColumn(const ProfileColumn& rhs) { memcpy(freq_, rhs.freq_, sizeof(freq_)); + if(rhs.hmm_data_) { + // do deep copy + hmm_data_ = HMMDataPtr(new HMMData(*rhs.hmm_data_)); + } } ProfileColumn& operator= (const ProfileColumn& rhs) { memcpy(freq_, rhs.freq_, sizeof(freq_)); + if(rhs.hmm_data_) { + // do deep copy + hmm_data_ = HMMDataPtr(new HMMData(*rhs.hmm_data_)); + } else if(hmm_data_) { + hmm_data_ = HMMDataPtr(); + } return *this; } static ProfileColumn BLOSUMNullModel(); static ProfileColumn HHblitsNullModel(); + void SetHMMData(HMMDataPtr p) { + hmm_data_ = p; + } + + HMMDataPtr GetHMMData() const{ + if(!hmm_data_) { + throw Error("ProfileColumn has no HMM data set!"); + } + return hmm_data_; + } + + Real GetTransProb(HMMTransition transition) const { + if(!hmm_data_) { + throw Error("ProfileColumn has no HMM data set!"); + } + return hmm_data_->GetProb(transition); + } + /// \brief Translate one-letter-code to index (0-indexing). static int GetIndex(char ch); @@ -102,6 +246,16 @@ public: data[i] = static_cast<int16_t>(col.freq_[i]*10000); } os.write(reinterpret_cast<char*>(data), sizeof(data)); + + if(col.hmm_data_) { + bool has_hmm_data = true; + os.write(reinterpret_cast<char*>(&has_hmm_data), 1); + os << *col.hmm_data_; + } else { + bool has_hmm_data = false; + os.write(reinterpret_cast<char*>(&has_hmm_data), 1); + } + return os; } @@ -112,11 +266,19 @@ public: for (uint i = 0; i < 20; ++i) { col.freq_[i] = data[i] * 0.0001; } + + bool has_hmm_data; + is.read(reinterpret_cast<char*>(&has_hmm_data), 1); + if(has_hmm_data) { + is >> *col.hmm_data_; + } + return is; } private: Real freq_[20]; + HMMDataPtr hmm_data_; }; /// \brief Provides a profile for a sequence. @@ -129,7 +291,7 @@ private: class DLLEXPORT_OST_SEQ ProfileHandle { public: /// \brief Constructs an empty profile handle (sequence = '', 0 columns). - ProfileHandle(): null_model_(ProfileColumn::HHblitsNullModel()) {} + ProfileHandle(): null_model_(ProfileColumn::HHblitsNullModel()), neff_(1.0) {} // uses compiler-generated copy- and assignment operators (work here!) @@ -151,10 +313,14 @@ public: seq_ = seq; } + Real GetNeff() const { return neff_; } + + void SetNeff(Real neff) { neff_ = neff; } + /// \brief Extract subset of profile for columns from until to-1 (0-indexing). /// Null model is copied from this profile. /// \throw Error if to <= from or to > size(). - ProfileHandlePtr Extract(uint from, uint to); + ProfileHandlePtr Extract(uint from, uint to) const; /// \brief Compute average entropy over all columns. Real GetAverageEntropy() const; @@ -186,6 +352,10 @@ public: const ProfileColumn& at(size_t index) const { return columns_.at(index); } + ProfileColumn& back() { return columns_.back(); } + + const ProfileColumn& back() const { return columns_.back(); } + bool operator==(const ProfileHandle& other) const { return seq_ == other.seq_ && columns_ == other.columns_ && @@ -251,6 +421,7 @@ private: String seq_; ProfileColumn null_model_; ProfileColumnList columns_; + Real neff_; }; /// \brief Contains a DB of profiles (identified by a unique name (String)).