diff --git a/modules/io/src/seq/hhm_io_handler.cc b/modules/io/src/seq/hhm_io_handler.cc index 395c97bd1f48c0bb6bdc0829497f0846afaadb78..b524f3161bae3a85f541ad2213d7fcb9cf07b138 100644 --- a/modules/io/src/seq/hhm_io_handler.cc +++ b/modules/io/src/seq/hhm_io_handler.cc @@ -132,21 +132,19 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, + null_line + "\n in " + loc.string())); // set columns - String seqres; while (std::getline(in, line)) { sline = ost::StringRef(line.c_str(), line.length()); if (sline.trim().empty()) continue; if (sline.trim() == ost::StringRef("//", 2)) break; chunks = sline.split(); // one letter code - seqres += chunks[0][0]; + const char olc = chunks[0][0]; // frequencies - prof.push_back(GetColumn(chunks, 2, olcs, "Badly formatted line\n" - + line + "\n in " + loc.string())); + prof.AddColumn(GetColumn(chunks, 2, olcs, "Badly formatted line\n" + + line + "\n in " + loc.string()), olc); // skip line (trans. matrix) std::getline(in, line); } - prof.SetSequence(seqres); } void HhmIOHandler::Export(const seq::ProfileHandle& prof, diff --git a/modules/io/src/seq/pssm_io_handler.cc b/modules/io/src/seq/pssm_io_handler.cc index 4b16ed6cb94cede54755a4c22ac95a56b6b9f0a3..746bae3223149b065b61b4a3eb432bf5aff4a8b5 100644 --- a/modules/io/src/seq/pssm_io_handler.cc +++ b/modules/io/src/seq/pssm_io_handler.cc @@ -89,20 +89,20 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } // parse table (assume: index olc 20xscore 20xfreq) - String seqres; while (std::getline(in, line)) { sline = ost::StringRef(line.c_str(), line.length()); chunks = sline.split(); // stop if not enough entries if (chunks.size() < 42) break; // parse row (freq in % as integers) - seqres += chunks[1][0]; + const char olc = chunks[1][0]; Real sum_freq = 0; Real freqs[20]; for (uint i = 22; i < 42; ++i) { std::pair<bool, int> pbi = chunks[i].to_int(); if (!pbi.first) { - throw IOException("Badly formatted line\n" + line + "\n in " + loc.string()); + throw IOException("Badly formatted line\n" + line + "\n in " + + loc.string()); } sum_freq += pbi.second; freqs[i-22] = pbi.second; @@ -112,9 +112,8 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, for (uint i = 0; i < 20; ++i) { pc.SetFreq(olcs[i], freqs[i]/sum_freq); } - prof.push_back(pc); + prof.AddColumn(pc, olc); } - prof.SetSequence(seqres); } void PssmIOHandler::Export(const seq::ProfileHandle& prof, diff --git a/modules/io/tests/test_io_sequence_profile.cc b/modules/io/tests/test_io_sequence_profile.cc index 85d2dc60201ba74d6627b0ed45c1c3acad738448..afddf8d3f1429db4782ce717c02ec6b9c123c285 100644 --- a/modules/io/tests/test_io_sequence_profile.cc +++ b/modules/io/tests/test_io_sequence_profile.cc @@ -168,7 +168,6 @@ BOOST_AUTO_TEST_CASE(pssm_loading) // compare manually ProfileHandle prof_pssm_ref; - prof_pssm_ref.SetSequence("RRSPP"); prof_pssm_ref.SetNullModel(ProfileColumn::BLOSUMNullModel()); ProfileColumn pc1, pc2, pc3, pc4, pc5; pc1.SetFreq('R', 0.74); @@ -241,11 +240,12 @@ BOOST_AUTO_TEST_CASE(pssm_loading) pc5.SetFreq('T', 0.02); pc5.SetFreq('V', 0.04); - prof_pssm_ref.push_back(pc1); - prof_pssm_ref.push_back(pc2); - prof_pssm_ref.push_back(pc3); - prof_pssm_ref.push_back(pc4); - prof_pssm_ref.push_back(pc5); + prof_pssm_ref.AddColumn(pc1); + prof_pssm_ref.AddColumn(pc2); + prof_pssm_ref.AddColumn(pc3); + prof_pssm_ref.AddColumn(pc4); + prof_pssm_ref.AddColumn(pc5); + prof_pssm_ref.SetSequence("RRSPP"); compareProfiles(*prof, prof_pssm_ref, 1e-3); diff --git a/modules/seq/base/doc/seq.rst b/modules/seq/base/doc/seq.rst index 3d112dfa92418658a73b9cf5b45eb3a4273954e5..f2e1edbf33088e1699b00e999db3d4c9f23b9173 100644 --- a/modules/seq/base/doc/seq.rst +++ b/modules/seq/base/doc/seq.rst @@ -446,6 +446,11 @@ residue. It mainly contains: Static method, that returns a new :class:`ProfileColumn` with amino acid frequencies given from the BLOSUM62 substitution matrix. + .. method:: HHblitsNullModel() + + Static method, that returns a new :class:`ProfileColumn` with amino acid + frequencies as set in HHblits output. + .. method:: GetFreq(aa) :return: Frequency of *aa* @@ -484,14 +489,16 @@ residue. It mainly contains: :rtype: :class:`int` - .. method:: AddColumn(col) + .. method:: AddColumn(col, olc='X') Appends column in the internal column list. - :param col: Column to add + :param col: Column to add to :attr:`columns` :type col: :class:`ProfileColumn` + :param olc: One letter code to add to :attr:`sequence` + :type col: :class:`str` - .. method:: Extract(from,to) + .. method:: Extract(from, to) :param from: Col Idx to start from :param to: End Idx, not included in sub-ProfileHandle @@ -519,37 +526,31 @@ residue. It mainly contains: :raises: :exc:`~exceptions.Error` if any *columns[i+offset]* out of bounds. - .. method:: SetSequence(sequence) - - Sets :attr:`sequence`. - - .. method:: SetNullModel(null_model) - - Sets :attr:`null_model`. .. attribute:: sequence - Sequence for which we have this profile. - Note: user must enforce consistency between sequence length and number of - profile columns. + Sequence for which we have this profile. When setting a new value, the + length and the number of profile columns must match (exception thrown + otherwise). :type: :class:`str` .. attribute:: columns - Iterable columns of the profile + Iterable columns of the profile (read-only). :type: :class:`ProfileColumnList` .. attribute:: null_model - Null model of the profile + Null model of the profile. By default this is set to + :meth:`ProfileColumn.HHblitsNullModel`. :type: :class:`ProfileColumn` .. attribute:: avg_entropy - Average entropy of all the columns + Average entropy of all the columns (read-only). :type: :class:`float` diff --git a/modules/seq/base/pymod/export_profile_handle.cc b/modules/seq/base/pymod/export_profile_handle.cc index 111442f6c1a072f0a3f625cdcb986bd8c5086322..101446d4584f3c26ffd9e57f9241e51a60953e64 100644 --- a/modules/seq/base/pymod/export_profile_handle.cc +++ b/modules/seq/base/pymod/export_profile_handle.cc @@ -46,12 +46,14 @@ void export_profile_handle() { class_<ProfileColumn>("ProfileColumn", init<>()) .add_property("entropy", &ProfileColumn::GetEntropy) - .def("GetFreq", &ProfileColumn::GetFreq) - .def("SetFreq", &ProfileColumn::SetFreq) + .def("GetFreq", &ProfileColumn::GetFreq, (arg("aa"))) + .def("SetFreq", &ProfileColumn::SetFreq, (arg("aa"), arg("freq"))) .def("GetScore", &ProfileColumn::GetScore, (arg("other"), arg("null_model"))) - .def("BLOSUMNullModel", - &ProfileColumn::BLOSUMNullModel).staticmethod("BLOSUMNullModel") + .def("BLOSUMNullModel", &ProfileColumn::BLOSUMNullModel) + .staticmethod("BLOSUMNullModel") + .def("HHblitsNullModel", &ProfileColumn::HHblitsNullModel) + .staticmethod("HHblitsNullModel") ; class_<ProfileColumnList>("ProfileColumnList", init<>()) @@ -60,26 +62,27 @@ void export_profile_handle() class_<ProfileHandle, ProfileHandlePtr>("ProfileHandle", init<>()) .def("__len__",&ProfileHandle::size) - .def("AddColumn", &ProfileHandle::push_back) - .def("Extract", &ProfileHandle::Extract) - .def("SetNullModel", &ProfileHandle::SetNullModel) - .def("SetSequence", &ProfileHandle::SetSequence) + .def("AddColumn", &ProfileHandle::AddColumn, (arg("col"), arg("olc")='X')) + .def("Extract", &ProfileHandle::Extract, (arg("from"), arg("to"))) .def("GetAverageScore", &ProfileHandle::GetAverageScore, (arg("other"), arg("offset")=0)) - .add_property("null_model", make_function(&ProfileHandle::GetNullModel, - return_value_policy<copy_const_reference>())) - .add_property("columns", + .add_property("null_model", + make_function(&ProfileHandle::GetNullModel, + return_value_policy<copy_const_reference>()), + &ProfileHandle::SetNullModel) + .add_property("columns", make_function(&ProfileHandle::GetColumns, - return_value_policy<copy_const_reference>())) + return_value_policy<copy_const_reference>())) .add_property("avg_entropy", &ProfileHandle::GetAverageEntropy) - .add_property("sequence", &ProfileHandle::GetSequence) + .add_property("sequence", &ProfileHandle::GetSequence, + &ProfileHandle::SetSequence) ; class_<ProfileDB, ProfileDBPtr>("ProfileDB", init<>()) - .def("Load", &ProfileDB::Load).staticmethod("Load") - .def("Save", &ProfileDB::Save) - .def("AddProfile", &ProfileDB::AddProfile) - .def("GetProfile", &ProfileDB::GetProfile) + .def("Load", &ProfileDB::Load, (arg("filename"))).staticmethod("Load") + .def("Save", &ProfileDB::Save, (arg("filename"))) + .def("AddProfile", &ProfileDB::AddProfile, (arg("name"), arg("prof"))) + .def("GetProfile", &ProfileDB::GetProfile, (arg("name"))) .def("Size", &ProfileDB::size) .def("GetNames",&wrap_get_names) ; diff --git a/modules/seq/base/src/profile_handle.cc b/modules/seq/base/src/profile_handle.cc index 5e9cf536f66a6b9ec2c7711bacaa589fc3b4b48c..c6aee7a587e45d95ee063e93472de52aafdd3a60 100644 --- a/modules/seq/base/src/profile_handle.cc +++ b/modules/seq/base/src/profile_handle.cc @@ -58,6 +58,33 @@ ProfileColumn ProfileColumn::BLOSUMNullModel() { return col; } +ProfileColumn ProfileColumn::HHblitsNullModel() { + ProfileColumn col; + + col.freq_[0] = 0.07662717998027802; + col.freq_[1] = 0.01886688359081745; + col.freq_[2] = 0.05399613827466965; + col.freq_[3] = 0.05978801101446152; + col.freq_[4] = 0.03493943437933922; + col.freq_[5] = 0.0754152461886406; + col.freq_[6] = 0.03682935610413551; + col.freq_[7] = 0.050485048443078995; + col.freq_[8] = 0.05958116054534912; + col.freq_[9] = 0.09992572665214539; + col.freq_[10] = 0.021959668025374413; + col.freq_[11] = 0.040107060223817825; + col.freq_[12] = 0.045310840010643005; + col.freq_[13] = 0.03264486789703369; + col.freq_[14] = 0.05129634961485863; + col.freq_[15] = 0.04661700129508972; + col.freq_[16] = 0.07105106115341187; + col.freq_[17] = 0.07264462858438492; + col.freq_[18] = 0.012473411858081818; + col.freq_[19] = 0.039418045431375504; + + return col; +} + int ProfileColumn::GetIndex(char olc) { if (olc == 'A') return 0; if (olc >= 'C' && olc <= 'I') return (olc-'A') - 1; @@ -110,22 +137,21 @@ Real ProfileColumn::GetScore(const ProfileColumn& other, // ProfileHandle ////////////////////////////////////////////////////////////////////////////// -ProfileHandlePtr ProfileHandle::Extract(uint from, uint to){ - +ProfileHandlePtr ProfileHandle::Extract(uint from, uint to) { + // check if (to <= from) { throw Error("Second index must be bigger than first one!"); } - if (to > this->size()) { throw Error("Invalid index!"); } + // get subset to return ProfileHandlePtr return_prof(new ProfileHandle); return_prof->SetNullModel(null_model_); - for(uint i = from; i < to; ++i){ - return_prof->push_back(columns_[i]); + for (uint i = from; i < to; ++i) { + return_prof->AddColumn(columns_[i], seq_[i]); } - return_prof->SetSequence(seq_.substr(from, to-from)); return return_prof; } diff --git a/modules/seq/base/src/profile_handle.hh b/modules/seq/base/src/profile_handle.hh index b09f1db9424edc71e2bb7bfcbc84fb4862ce06d6..4a06461b86940a96eb3b76945324984bdeb9e03b 100644 --- a/modules/seq/base/src/profile_handle.hh +++ b/modules/seq/base/src/profile_handle.hh @@ -66,6 +66,7 @@ public: } static ProfileColumn BLOSUMNullModel(); + static ProfileColumn HHblitsNullModel(); /// \brief Translate one-letter-code to index (0-indexing). static int GetIndex(char ch); @@ -128,7 +129,7 @@ private: class DLLEXPORT_OST_SEQ ProfileHandle { public: /// \brief Constructs an empty profile handle (sequence = '', 0 columns). - ProfileHandle() {} + ProfileHandle(): null_model_(ProfileColumn::HHblitsNullModel()) {} // uses compiler-generated copy- and assignment operators (work here!) @@ -136,11 +137,19 @@ public: const ProfileColumn& GetNullModel() const { return null_model_; } - void SetNullModel(const ProfileColumn& null_model) { null_model_ = null_model; } + void SetNullModel(const ProfileColumn& null_model) { + null_model_ = null_model; + } String GetSequence() const { return seq_; } - void SetSequence(const String& seq) { seq_ = seq; } + void SetSequence(const String& seq) { + if (seq.length() != columns_.size()) { + throw Error("ProfileHandle - Inconsistency between number of columns and " + " seq. length."); + } + seq_ = seq; + } /// \brief Extract subset of profile for columns from until to-1 (0-indexing). /// Null model is copied from this profile. @@ -155,6 +164,12 @@ public: /// result normalized by other.size() Real GetAverageScore(const ProfileHandle& other, uint offset = 0) const; + // \brief Can only add column with an associated olc + void AddColumn(const ProfileColumn& c, char olc='X') { + columns_.push_back(c); + seq_ += olc; + } + // some functions to make it behave like a vector void clear() { seq_ = ""; columns_.clear(); } @@ -163,8 +178,6 @@ public: bool empty() const { return columns_.empty(); } - void push_back(const ProfileColumn& c) { columns_.push_back(c); } - ProfileColumn& operator[](size_t index) { return columns_[index]; } const ProfileColumn& operator[](size_t index) const { return columns_[index]; } @@ -179,12 +192,18 @@ public: null_model_ == other.null_model_; } - bool operator!=(const ProfileHandle& other) const { return !(other == (*this)); } + bool operator!=(const ProfileHandle& other) const { + return !(other == (*this)); + } - ProfileColumnList::const_iterator columns_end() const { return columns_.end(); } - ProfileColumnList::iterator columns_end() { return columns_.end(); } - ProfileColumnList::const_iterator columns_begin() const { return columns_.begin(); } ProfileColumnList::iterator columns_begin() { return columns_.begin(); } + ProfileColumnList::iterator columns_end() { return columns_.end(); } + ProfileColumnList::const_iterator columns_begin() const { + return columns_.begin(); + } + ProfileColumnList::const_iterator columns_end() const { + return columns_.end(); + } // functions to feed streams with limited accuracy of internal data // not intended for python export diff --git a/modules/seq/base/tests/test_profile.cc b/modules/seq/base/tests/test_profile.cc index fb5ef983a5dbf433139b0004764c253581ccc86e..c48e65f36258baebbf1d1b829ce07f3e1fb7c741 100644 --- a/modules/seq/base/tests/test_profile.cc +++ b/modules/seq/base/tests/test_profile.cc @@ -32,9 +32,22 @@ using namespace ost::seq; - BOOST_AUTO_TEST_SUITE( profile ); +namespace { + +void CheckNullModel(const ProfileColumn& nm) { + BOOST_CHECK_GE(nm.GetEntropy(), Real(0)); + Real sum_freq = 0; + const Real* freq_ptr = nm.freqs_begin(); + for (uint i = 0; i < 20; ++i) { + BOOST_CHECK_GE(freq_ptr[i], Real(0)); + sum_freq += freq_ptr[i]; + } + BOOST_CHECK_CLOSE(sum_freq, Real(1), Real(1e-2)); +} + +} // anon ns BOOST_AUTO_TEST_CASE(profile_column) { @@ -53,20 +66,15 @@ BOOST_AUTO_TEST_CASE(profile_column) BOOST_CHECK_EQUAL(pc.GetFreq(olc[i]), tst); } - // check null model + // check null models ProfileColumn nm = ProfileColumn::BLOSUMNullModel(); - BOOST_CHECK_GE(pc.GetEntropy(), Real(0)); - Real sum_freq = 0; - freq_ptr = nm.freqs_begin(); - for (uint i = 0; i < 20; ++i) { - BOOST_CHECK_GE(freq_ptr[i], Real(0)); - sum_freq += freq_ptr[i]; - } - BOOST_CHECK_CLOSE(sum_freq, Real(1), Real(1e-3)); + CheckNullModel(nm); + CheckNullModel(ProfileColumn::HHblitsNullModel()); // check assign/copy ProfileColumn nm2(nm); BOOST_CHECK(nm == nm2); + freq_ptr = nm.freqs_begin(); Real* freq_ptr2 = nm2.freqs_begin(); for (uint i = 0; i < 20; ++i) { BOOST_CHECK_EQUAL(freq_ptr[i], freq_ptr2[i]); @@ -81,7 +89,7 @@ BOOST_AUTO_TEST_CASE(profile_handle) ProfileHandle prof; BOOST_CHECK_EQUAL(prof.size(), size_t(0)); BOOST_CHECK_EQUAL(prof.GetSequence(), ""); - prof.SetSequence("RRSPP"); + CheckNullModel(prof.GetNullModel()); ProfileColumn pc0 = ProfileColumn::BLOSUMNullModel(); prof.SetNullModel(pc0); ProfileColumn pc1, pc2, pc3, pc4; @@ -93,11 +101,18 @@ BOOST_AUTO_TEST_CASE(profile_handle) pc3.SetFreq('E', 0.5); pc4.SetFreq('A', 0.5); pc4.SetFreq('H', 0.5); - prof.push_back(pc0); - prof.push_back(pc1); - prof.push_back(pc2); - prof.push_back(pc3); - prof.push_back(pc4); + prof.AddColumn(pc0); + prof.AddColumn(pc1); + prof.AddColumn(pc2); + BOOST_CHECK_EQUAL(prof.size(), prof.GetSequence().length()); + prof.AddColumn(pc3); + prof.AddColumn(pc4); + prof.SetSequence("RRSPP"); + BOOST_CHECK_EQUAL(prof.size(), prof.GetSequence().length()); + + // check invalid sequence settings + BOOST_CHECK_THROW(prof.SetSequence("RRSP"), ost::Error); + BOOST_CHECK_THROW(prof.SetSequence("RRSPPT"), ost::Error); // check components BOOST_CHECK_EQUAL(prof.size(), size_t(5));