Skip to content
Snippets Groups Projects
Commit a01f38e6 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

add support for hmm related data in ProfileHandle

Every column has optionally a HMMData object set that contains transition
probabilities and neff values. Any access when its not set throws an error.
The HHM file reader has been updated to read the data in.
parent d5c9ab52
No related branches found
No related tags found
No related merge requests found
......@@ -58,6 +58,67 @@ seq::ProfileColumn GetColumn(const std::vector<ost::StringRef>& chunks,
}
return pc;
}
seq::HMMDataPtr GetHMMData(const std::vector<ost::StringRef>& chunks,
const std::vector<uint>& idx,
const std::vector<seq::HMMTransition>& transitions,
int neff_idx,
int neff_i_idx,
int neff_d_idx,
const String& exception) {
// check chunks size
if(chunks.size() != 10) {
throw IOException(exception);
}
seq::HMMDataPtr data(new seq::HMMData);
for (uint i = 0; i < idx.size(); ++i) {
if (chunks[idx[i]] != ost::StringRef("*", 1)) {
std::pair<bool, int> p = chunks[idx[i]].to_int();
if (!p.first) {
throw IOException(exception);
}
data->SetProb(transitions[i], pow(2.0, -0.001 * p.second));
} else {
data->SetProb(transitions[i], 0.0);
}
}
if (chunks[neff_idx] != ost::StringRef("*", 1)) {
std::pair<bool, int> p = chunks[neff_idx].to_int();
if (!p.first) {
throw IOException(exception);
}
data->SetNeff(0.001 * p.second);
} else {
data->SetNeff(0.0);
}
if (chunks[neff_i_idx] != ost::StringRef("*", 1)) {
std::pair<bool, int> p = chunks[neff_i_idx].to_int();
if (!p.first) {
throw IOException(exception);
}
data->SetNeff_I(0.001 * p.second);
} else {
data->SetNeff_I(0.0);
}
if (chunks[neff_d_idx] != ost::StringRef("*", 1)) {
std::pair<bool, int> p = chunks[neff_d_idx].to_int();
if (!p.first) {
throw IOException(exception);
}
data->SetNeff_D(0.001 * p.second);
} else {
data->SetNeff_D(0.0);
}
return data;
}
} // anon ns
void HhmIOHandler::Import(seq::ProfileHandle& prof,
......@@ -81,11 +142,18 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof,
std::string line;
ost::StringRef sline;
std::string null_line;
std::string neff_line;
char olcs[20];
std::vector<ost::seq::HMMTransition> transitions;
std::vector<uint> transition_idx;
int neff_idx = -1;
int neff_i_idx = -1;
int neff_d_idx = -1;
std::vector<ost::StringRef> chunks;
// read line-by-line looking for NULL
// read line-by-line looking for NEFF and NULL
bool null_found = false;
bool neff_found = false;
while (std::getline(in, line)) {
sline = ost::StringRef(line.c_str(), line.length());
if (sline.length()>4 &&
......@@ -94,6 +162,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof,
null_found = true;
break;
}
if (sline.length()>4 &&
sline.substr(0, 5) == ost::StringRef("NEFF ", 5)) {
neff_line = line;
neff_found = true;
}
}
if (!null_found) {
throw IOException("No NULL found in file " + loc.string());
......@@ -115,8 +188,57 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof,
}
olcs[i-1] = chunks[i][0];
}
// skip 2 lines and get out
// extract hmm data
std::getline(in, line);
sline = ost::StringRef(line.c_str(), line.length());
chunks = sline.split();
if (chunks.size() != 10) {
throw IOException("Badly formatted HMM line in " + loc.string());
}
for (uint i = 0; i < 10; ++i) {
if(chunks[i] == ost::StringRef("M->M", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_M2M);
} else if(chunks[i] == ost::StringRef("M->I", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_M2I);
} else if(chunks[i] == ost::StringRef("M->D", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_M2D);
} else if(chunks[i] == ost::StringRef("I->M", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_I2M);
} else if(chunks[i] == ost::StringRef("I->I", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_I2I);
} else if(chunks[i] == ost::StringRef("D->M", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_D2M);
} else if(chunks[i] == ost::StringRef("D->D", 4)) {
transition_idx.push_back(i);
transitions.push_back(ost::seq::HMM_D2D);
} else if(chunks[i] == ost::StringRef("Neff", 4)) {
neff_idx = i;
} else if(chunks[i] == ost::StringRef("Neff_I", 6)) {
neff_i_idx = i;
} else if(chunks[i] == ost::StringRef("Neff_D", 6)) {
neff_d_idx = i;
} else {
throw IOException("Badly formatted HMM line in " + loc.string());
}
}
// check whether we really got everything
if(neff_idx == -1 || neff_i_idx == -1 || neff_d_idx == -1) {
throw IOException("Badly formatted HMM line in " + loc.string());
}
if(transition_idx.size() != 7) {
throw IOException("Badly formatted HMM line in " + loc.string());
}
// skip one line
std::getline(in, line);
hmm_found = true;
break;
......@@ -142,8 +264,42 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof,
// frequencies
prof.AddColumn(GetColumn(chunks, 2, olcs, "Badly formatted line\n"
+ line + "\n in " + loc.string()), olc);
// skip line (trans. matrix)
// get transition probabilities
std::getline(in, line);
sline = ost::StringRef(line.c_str(), line.length());
if (sline.trim().empty()) continue;
if (sline.trim() == ost::StringRef("//", 2)) break;
chunks = sline.split();
prof.back().SetHMMData(GetHMMData(chunks, transition_idx,
transitions, neff_idx, neff_i_idx, neff_d_idx,
"Badly formatted line\n" + line + "\n in " +
loc.string()));
}
// parse neff if it's there, calculate the average of every column otherwise
if(neff_found) {
sline = ost::StringRef(neff_line.c_str(), neff_line.length());
chunks = sline.split();
if(chunks.size() != 2) {
throw IOException("Badly formatted line\n" + neff_line+ "\n in " +
loc.string());
}
std::pair<bool, float> p = chunks[1].to_float();
if (!p.first) {
throw IOException("Badly formatted line\n" + neff_line+ "\n in " +
loc.string());
}
prof.SetNeff(p.second);
} else {
Real summed_neff = 0.0;
for(size_t i = 0; i < prof.size(); ++i) {
summed_neff += prof[i].GetHMMData()->GetNeff();
}
if(prof.size() > 0) {
prof.SetNeff(summed_neff/prof.size());
} else{
prof.SetNeff(1.0);
}
}
}
......
......@@ -44,10 +44,30 @@ boost::python::list wrap_get_names(ProfileDBPtr db) {
void export_profile_handle()
{
enum_<HMMTransition>("HMMTransition")
.value("HMM_M2M", HMM_M2M)
.value("HMM_M2I", HMM_M2I)
.value("HMM_M2D", HMM_M2D)
.value("HMM_I2M", HMM_I2M)
.value("HMM_I2I", HMM_I2I)
.value("HMM_D2M", HMM_D2M)
.value("HMM_D2D", HMM_D2D)
;
class_<HMMData>("HMMData", init<>())
.add_property("neff", &HMMData::GetNeff, &HMMData::SetNeff)
.add_property("neff_i", &HMMData::GetNeff_I, &HMMData::SetNeff_I)
.add_property("neff_d", &HMMData::GetNeff_D, &HMMData::SetNeff_D)
.def("GetProb", &HMMData::GetProb, (arg("transition")))
.def("SetProb", &HMMData::SetProb, (arg("transition"), arg("prob")))
;
class_<ProfileColumn>("ProfileColumn", init<>())
.add_property("entropy", &ProfileColumn::GetEntropy)
.def("GetFreq", &ProfileColumn::GetFreq, (arg("aa")))
.def("SetFreq", &ProfileColumn::SetFreq, (arg("aa"), arg("freq")))
.add_property("hmm_data", &ProfileColumn::GetHMMData, &ProfileColumn::SetHMMData)
.def("GetScore", &ProfileColumn::GetScore,
(arg("other"), arg("null_model")))
.def("BLOSUMNullModel", &ProfileColumn::BLOSUMNullModel)
......@@ -76,6 +96,8 @@ void export_profile_handle()
.add_property("avg_entropy", &ProfileHandle::GetAverageEntropy)
.add_property("sequence", &ProfileHandle::GetSequence,
&ProfileHandle::SetSequence)
.add_property("neff", &ProfileHandle::GetNeff,
&ProfileHandle::SetNeff)
;
class_<ProfileDB, ProfileDBPtr>("ProfileDB", init<>())
......
......@@ -137,7 +137,7 @@ Real ProfileColumn::GetScore(const ProfileColumn& other,
// ProfileHandle
//////////////////////////////////////////////////////////////////////////////
ProfileHandlePtr ProfileHandle::Extract(uint from, uint to) {
ProfileHandlePtr ProfileHandle::Extract(uint from, uint to) const {
// check
if (to <= from) {
throw Error("Second index must be bigger than first one!");
......@@ -187,10 +187,17 @@ Real ProfileHandle::GetAverageScore(const ProfileHandle& other,
// ProfileDB
//////////////////////////////////////////////////////////////////////////////
void ProfileDB::Save(const String& filename) const{
std::ofstream out_stream(filename.c_str(), std::ios::binary);
uint32_t magic_number = 42;
out_stream.write(reinterpret_cast<char*>(&magic_number), sizeof(uint32_t));
uint8_t version = 1;
out_stream.write(reinterpret_cast<char*>(&version), sizeof(uint8_t));
//write out total size
uint32_t total_size = data_.size();
out_stream.write(reinterpret_cast<char*>(&total_size),4);
......@@ -218,6 +225,26 @@ ProfileDBPtr ProfileDB::Load(const String& filename){
ProfileDBPtr db(new ProfileDB);
uint32_t magic_number;
in_stream.read(reinterpret_cast<char*>(&magic_number), sizeof(uint32_t));
if(magic_number != 42) {
std::stringstream ss;
ss << "Could not read magic number in " << filename<<". Either the file ";
ss << "is corrupt, does not contain a ProfileDB or is of an old version ";
ss << "which is not supported anymore.";
throw Error(ss.str());
}
uint8_t version;
in_stream.read(reinterpret_cast<char*>(&version), sizeof(uint8_t));
if(version != 1) {
std::stringstream ss;
ss << "ProfileDB in " << filename << " is of version " << version;
ss << " but only version 1 can be read.";
throw Error(ss.str());
}
//read in the total size
uint32_t total_size;
in_stream.read(reinterpret_cast<char*>(&total_size),4);
......
......@@ -39,12 +39,128 @@ namespace ost { namespace seq {
class ProfileHandle;
class ProfileColumn;
class HMMData;
class ProfileDB;
typedef boost::shared_ptr<ProfileHandle> ProfileHandlePtr;
typedef boost::shared_ptr<HMMData> HMMDataPtr;
typedef std::vector<ProfileHandlePtr> ProfileHandleList;
typedef boost::shared_ptr<ProfileDB> ProfileDBPtr;
typedef std::vector<ProfileColumn> ProfileColumnList;
typedef enum {
HMM_M2M = 0, HMM_M2I = 1, HMM_M2D = 2,
HMM_I2M = 3, HMM_I2I = 4,
HMM_D2M = 5, HMM_D2D = 6
} HMMTransition;
class DLLEXPORT_OST_SEQ HMMData {
public:
HMMData() {
memset(trans_, 0, 7*sizeof(Real));
trans_[HMM_M2M] = 1.0;
trans_[HMM_I2M] = 1.0;
trans_[HMM_D2M] = 1.0;
neff_ = 1.0;
neff_i_ = 1.0;
neff_d_ = 1.0;
}
HMMData(const HMMData& rhs) {
memcpy(trans_, rhs.trans_, 7*sizeof(Real));
neff_ = rhs.neff_;
neff_i_ = rhs.neff_i_;
neff_d_ = rhs.neff_d_;
}
Real GetProb(HMMTransition transition) const {
return trans_[transition];
}
void SetProb(HMMTransition transition, Real prob) {
trans_[transition] = prob;
}
Real GetNeff() const {
return neff_;
}
void SetNeff(Real neff) {
neff_ = neff;
}
Real GetNeff_I() const {
return neff_i_;
}
void SetNeff_I(Real neff) {
neff_i_ = neff;
}
Real GetNeff_D() const {
return neff_d_;
}
void SetNeff_D(Real neff) {
neff_d_ = neff;
}
bool operator==(const HMMData& rhs) const {
return (!memcmp(trans_, rhs.trans_, sizeof(trans_)) &&
neff_ == rhs.neff_ &&
neff_i_ == rhs.neff_i_ &&
neff_d_ == rhs.neff_d_);
}
bool operator!=(const HMMData& rhs) const { return !(rhs == (*this)); }
HMMData& operator=(const HMMData& rhs) {
memcpy(trans_, rhs.trans_, 7*sizeof(Real));
neff_ = rhs.neff_;
neff_i_ = rhs.neff_i_;
neff_d_ = rhs.neff_d_;
return *this;
}
friend std::ofstream& operator<<(std::ofstream& os, HMMData& dat) {
int16_t p_data[7];
for (uint i = 0; i < 7; ++i) {
p_data[i] = static_cast<int16_t>(dat.trans_[i]*10000);
}
os.write(reinterpret_cast<char*>(p_data), 7*sizeof(int16_t));
float neff_data[3];
neff_data[0] = dat.neff_;
neff_data[1] = dat.neff_i_;
neff_data[2] = dat.neff_d_;
os.write(reinterpret_cast<char*>(neff_data), 3*sizeof(float));
return os;
}
friend std::ifstream& operator>>(std::ifstream& is, HMMData& dat) {
int16_t p_data[7];
is.read(reinterpret_cast<char*>(p_data), 7*sizeof(int16_t));
for (uint i = 0; i < 7; ++i) {
dat.trans_[i] = p_data[i] * 0.0001;
}
float neff_data[3];
is.read(reinterpret_cast<char*>(neff_data), 3*sizeof(float));
dat.neff_ = neff_data[0];
dat.neff_i_ = neff_data[1];
dat.neff_d_ = neff_data[2];
return is;
}
private:
Real trans_[7];
Real neff_;
Real neff_i_;
Real neff_d_;
};
/// \brief Defines profile of 20 frequencies for one residue.
///
/// Frequencies are identified by the one-letter-code for that amino acid.
......@@ -59,15 +175,43 @@ public:
ProfileColumn(const ProfileColumn& rhs) {
memcpy(freq_, rhs.freq_, sizeof(freq_));
if(rhs.hmm_data_) {
// do deep copy
hmm_data_ = HMMDataPtr(new HMMData(*rhs.hmm_data_));
}
}
ProfileColumn& operator= (const ProfileColumn& rhs) {
memcpy(freq_, rhs.freq_, sizeof(freq_));
if(rhs.hmm_data_) {
// do deep copy
hmm_data_ = HMMDataPtr(new HMMData(*rhs.hmm_data_));
} else if(hmm_data_) {
hmm_data_ = HMMDataPtr();
}
return *this;
}
static ProfileColumn BLOSUMNullModel();
static ProfileColumn HHblitsNullModel();
void SetHMMData(HMMDataPtr p) {
hmm_data_ = p;
}
HMMDataPtr GetHMMData() const{
if(!hmm_data_) {
throw Error("ProfileColumn has no HMM data set!");
}
return hmm_data_;
}
Real GetTransProb(HMMTransition transition) const {
if(!hmm_data_) {
throw Error("ProfileColumn has no HMM data set!");
}
return hmm_data_->GetProb(transition);
}
/// \brief Translate one-letter-code to index (0-indexing).
static int GetIndex(char ch);
......@@ -102,6 +246,16 @@ public:
data[i] = static_cast<int16_t>(col.freq_[i]*10000);
}
os.write(reinterpret_cast<char*>(data), sizeof(data));
if(col.hmm_data_) {
bool has_hmm_data = true;
os.write(reinterpret_cast<char*>(&has_hmm_data), 1);
os << *col.hmm_data_;
} else {
bool has_hmm_data = false;
os.write(reinterpret_cast<char*>(&has_hmm_data), 1);
}
return os;
}
......@@ -112,11 +266,19 @@ public:
for (uint i = 0; i < 20; ++i) {
col.freq_[i] = data[i] * 0.0001;
}
bool has_hmm_data;
is.read(reinterpret_cast<char*>(&has_hmm_data), 1);
if(has_hmm_data) {
is >> *col.hmm_data_;
}
return is;
}
private:
Real freq_[20];
HMMDataPtr hmm_data_;
};
/// \brief Provides a profile for a sequence.
......@@ -129,7 +291,7 @@ private:
class DLLEXPORT_OST_SEQ ProfileHandle {
public:
/// \brief Constructs an empty profile handle (sequence = '', 0 columns).
ProfileHandle(): null_model_(ProfileColumn::HHblitsNullModel()) {}
ProfileHandle(): null_model_(ProfileColumn::HHblitsNullModel()), neff_(1.0) {}
// uses compiler-generated copy- and assignment operators (work here!)
......@@ -151,10 +313,14 @@ public:
seq_ = seq;
}
Real GetNeff() const { return neff_; }
void SetNeff(Real neff) { neff_ = neff; }
/// \brief Extract subset of profile for columns from until to-1 (0-indexing).
/// Null model is copied from this profile.
/// \throw Error if to <= from or to > size().
ProfileHandlePtr Extract(uint from, uint to);
ProfileHandlePtr Extract(uint from, uint to) const;
/// \brief Compute average entropy over all columns.
Real GetAverageEntropy() const;
......@@ -186,6 +352,10 @@ public:
const ProfileColumn& at(size_t index) const { return columns_.at(index); }
ProfileColumn& back() { return columns_.back(); }
const ProfileColumn& back() const { return columns_.back(); }
bool operator==(const ProfileHandle& other) const {
return seq_ == other.seq_ &&
columns_ == other.columns_ &&
......@@ -251,6 +421,7 @@ private:
String seq_;
ProfileColumn null_model_;
ProfileColumnList columns_;
Real neff_;
};
/// \brief Contains a DB of profiles (identified by a unique name (String)).
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment