From eeaa935640b6611164a6a3a9407e4d810c80e6f6 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Mon, 22 Jun 2020 23:27:39 +0200 Subject: [PATCH] load sequence profiles from strings --- modules/io/doc/io.rst | 13 +++ modules/io/pymod/wrap_io.cc | 2 + modules/io/src/seq/hhm_io_handler.cc | 93 +++++++++++--------- modules/io/src/seq/hhm_io_handler.hh | 4 + modules/io/src/seq/load.cc | 11 +++ modules/io/src/seq/load.hh | 3 + modules/io/src/seq/profile_io_handler.hh | 3 + modules/io/src/seq/pssm_io_handler.cc | 73 ++++++++------- modules/io/src/seq/pssm_io_handler.hh | 4 + modules/io/tests/test_io_sequence_profile.cc | 51 +++++++++++ 10 files changed, 183 insertions(+), 74 deletions(-) diff --git a/modules/io/doc/io.rst b/modules/io/doc/io.rst index 39a713dc9..381771ad9 100644 --- a/modules/io/doc/io.rst +++ b/modules/io/doc/io.rst @@ -323,6 +323,19 @@ Loading sequence or alignment files :type format: string :rtype: :class:`~ost.seq.SequenceList` +.. function:: SequenceProfileFromString(data, format) + + Load sequence profile from string. + + The format argument is mandatory. + + :param data: String containing the data you would read from a file with + specified format. + :type data: string + :param format: Name of the format. Can either be "hhm" or "pssm". + :type format: string + :rtype: :class:`~ost.seq.ProfileHandle` + Saving Sequence Data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/modules/io/pymod/wrap_io.cc b/modules/io/pymod/wrap_io.cc index 0cf910a43..1a56254be 100644 --- a/modules/io/pymod/wrap_io.cc +++ b/modules/io/pymod/wrap_io.cc @@ -104,6 +104,8 @@ BOOST_PYTHON_MODULE(_ost_io) def("LoadSequenceProfile", &LoadSequenceProfile, (arg("filename"), arg("format")="auto")); + def("SequenceProfileFromString", &SequenceProfileFromString, + (arg("data"), arg("format"))); def("LoadSurface",LoadSurface,load_surface_ov()); def("LoadManagedSurface",LoadManagedSurface,load_msurface_ov()); diff --git a/modules/io/src/seq/hhm_io_handler.cc b/modules/io/src/seq/hhm_io_handler.cc index d8dfee180..3d976a571 100644 --- a/modules/io/src/seq/hhm_io_handler.cc +++ b/modules/io/src/seq/hhm_io_handler.cc @@ -135,6 +135,43 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } in.push(stream); + this->Import(prof, in); +} + +void HhmIOHandler::ImportFromString(seq::ProfileHandle& prof, + const String& data) { + std::stringstream ss(data); + this->Import(prof, ss); +} + +void HhmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write hhm files."); +} + +bool HhmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".hhm") + || detail::FilenameEndsWith(match_suf_string,".hhm.gz")) { + return true; + } + } else if (format == "hhm") { + return true; + } + return false; +} + +bool HhmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +void HhmIOHandler::Import(seq::ProfileHandle& prof, std::istream& in) { // reset profile prof.clear(); @@ -169,7 +206,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } if (!null_found) { - throw IOException("No NULL found in file " + loc.string()); + throw IOException("No NULL found in file"); } // read until we hit HMM, extract olcs, then skip 2 more lines @@ -180,11 +217,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, if (chunks[0] == ost::StringRef("HMM", 3)) { // extract olcs if (chunks.size() != 21) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } for (uint i = 1; i < 21; ++i) { if (chunks[i].length() != 1) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } olcs[i-1] = chunks[i][0]; } @@ -193,7 +230,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, sline = ost::StringRef(line.c_str(), line.length()); chunks = sline.split(); if (chunks.size() != 10) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } for (uint i = 0; i < 10; ++i) { @@ -225,17 +262,17 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } else if(chunks[i] == ost::StringRef("Neff_D", 6)) { neff_d_idx = i; } else { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } } // check whether we really got everything if(neff_idx == -1 || neff_i_idx == -1 || neff_d_idx == -1) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } if(transition_idx.size() != 7) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } // skip one line @@ -245,13 +282,13 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } if (!hmm_found) { - throw IOException("No HMM found in file " + loc.string()); + throw IOException("No HMM found in given input"); } // set null model chunks = ost::StringRef(null_line.c_str(), null_line.length()).split(); prof.SetNullModel(GetColumn(chunks, 1, olcs, "Badly formatted NULL line\n" - + null_line + "\n in " + loc.string())); + + null_line)); // set columns while (std::getline(in, line)) { @@ -263,7 +300,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, const char olc = chunks[0][0]; // frequencies prof.AddColumn(GetColumn(chunks, 2, olcs, "Badly formatted line\n" - + line + "\n in " + loc.string()), olc); + + line), olc); // get transition probabilities std::getline(in, line); sline = ost::StringRef(line.c_str(), line.length()); @@ -272,8 +309,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, chunks = sline.split(); prof.back().SetHMMData(GetHMMData(chunks, transition_idx, transitions, neff_idx, neff_i_idx, neff_d_idx, - "Badly formatted line\n" + line + "\n in " + - loc.string())); + "Badly formatted line\n" + line)); } // parse neff if it's there, calculate the average of every column otherwise @@ -281,13 +317,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, sline = ost::StringRef(neff_line.c_str(), neff_line.length()); chunks = sline.split(); if(chunks.size() != 2) { - throw IOException("Badly formatted line\n" + neff_line+ "\n in " + - loc.string()); + throw IOException("Badly formatted line\n" + neff_line); } std::pair<bool, float> p = chunks[1].to_float(); if (!p.first) { - throw IOException("Badly formatted line\n" + neff_line+ "\n in " + - loc.string()); + throw IOException("Badly formatted line\n" + neff_line); } prof.SetNeff(p.second); } else { @@ -303,31 +337,4 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } -void HhmIOHandler::Export(const seq::ProfileHandle& prof, - const boost::filesystem::path& loc) const { - throw IOException("Cannot write hhm files."); -} - -bool HhmIOHandler::ProvidesImport(const boost::filesystem::path& loc, - const String& format) { - if (format=="auto") { - String match_suf_string = loc.string(); - std::transform(match_suf_string.begin(), match_suf_string.end(), - match_suf_string.begin(), tolower); - if ( detail::FilenameEndsWith(match_suf_string,".hhm") - || detail::FilenameEndsWith(match_suf_string,".hhm.gz")) { - return true; - } - } else if (format == "hhm") { - return true; - } - return false; -} - -bool HhmIOHandler::ProvidesExport(const boost::filesystem::path& loc, - const String& format) { - // no writers here - return false; -} - }} diff --git a/modules/io/src/seq/hhm_io_handler.hh b/modules/io/src/seq/hhm_io_handler.hh index 97fe449cf..9d4b1d2ae 100644 --- a/modules/io/src/seq/hhm_io_handler.hh +++ b/modules/io/src/seq/hhm_io_handler.hh @@ -33,6 +33,8 @@ class DLLEXPORT_OST_IO HhmIOHandler : public ProfileIOHandler { public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc); + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data); virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const; @@ -43,6 +45,8 @@ public: static String GetFormatName() { return String("HHM"); } static String GetFormatDescription() { return String("HHM output of HHblits"); } +private: + void Import(seq::ProfileHandle& prof, std::istream& in); }; typedef ProfileIOHandlerFactory<HhmIOHandler> HhmIOHandlerFactory; diff --git a/modules/io/src/seq/load.cc b/modules/io/src/seq/load.cc index e652b845a..3be372e94 100644 --- a/modules/io/src/seq/load.cc +++ b/modules/io/src/seq/load.cc @@ -111,4 +111,15 @@ seq::ProfileHandlePtr LoadSequenceProfile(const String& file_name, return prof; } +seq::ProfileHandlePtr SequenceProfileFromString(const String& data, + const String& format) +{ + seq::ProfileHandlePtr prof(new seq::ProfileHandle); + String file_name = "fake." + format; + IOManager& m = IOManager::Instance(); + ProfileIOHandlerPtr prof_io = m.FindProfileImportHandler(file_name, format); + prof_io->ImportFromString(*prof, data); + return prof; +} + }} diff --git a/modules/io/src/seq/load.hh b/modules/io/src/seq/load.hh index 6c623241a..f0d52e599 100644 --- a/modules/io/src/seq/load.hh +++ b/modules/io/src/seq/load.hh @@ -61,6 +61,9 @@ SequenceFromString(const String& data, const String& format); seq::ProfileHandlePtr DLLEXPORT_OST_IO LoadSequenceProfile(const String& file_name, const String& format="auto"); +seq::ProfileHandlePtr DLLEXPORT_OST_IO +SequenceProfileFromString(const String& data, const String& format); + }} #endif diff --git a/modules/io/src/seq/profile_io_handler.hh b/modules/io/src/seq/profile_io_handler.hh index 19c78de8b..b84600c64 100644 --- a/modules/io/src/seq/profile_io_handler.hh +++ b/modules/io/src/seq/profile_io_handler.hh @@ -41,6 +41,9 @@ public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc) = 0; + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data) = 0; + virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const = 0; }; diff --git a/modules/io/src/seq/pssm_io_handler.cc b/modules/io/src/seq/pssm_io_handler.cc index 2dff01015..892bf10c7 100644 --- a/modules/io/src/seq/pssm_io_handler.cc +++ b/modules/io/src/seq/pssm_io_handler.cc @@ -36,7 +36,7 @@ namespace ost { namespace io { void PssmIOHandler::Import(seq::ProfileHandle& prof, - const boost::filesystem::path& loc) { + const boost::filesystem::path& loc) { // open it up boost::iostreams::filtering_stream<boost::iostreams::input> in; boost::filesystem::ifstream stream(loc); @@ -49,6 +49,45 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } in.push(stream); + this->Import(prof, in); +} + +void PssmIOHandler::ImportFromString(seq::ProfileHandle& prof, + const String& data) { + std::stringstream ss(data); + this->Import(prof, ss); +} + +void PssmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write pssm files."); +} + +bool PssmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".pssm") + || detail::FilenameEndsWith(match_suf_string,".pssm.gz")) { + return true; + } + } else if (format == "pssm") { + return true; + } + return false; +} + +bool PssmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +void PssmIOHandler::Import(seq::ProfileHandle& prof, + std::istream& in) { + // reset profile prof.clear(); @@ -85,7 +124,7 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } } if (!table_found) { - throw IOException("No ASCII table found in file " + loc.string()); + throw IOException("No ASCII table found in input"); } // parse table (assume: index olc 20xscore 20xfreq) @@ -101,8 +140,7 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, for (uint i = 22; i < 42; ++i) { std::pair<bool, int> pbi = chunks[i].to_int(); if (!pbi.first) { - throw IOException("Badly formatted line\n" + line + "\n in " - + loc.string()); + throw IOException("Badly formatted line\n" + line); } sum_freq += pbi.second; freqs[i-22] = pbi.second; @@ -116,31 +154,4 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } } -void PssmIOHandler::Export(const seq::ProfileHandle& prof, - const boost::filesystem::path& loc) const { - throw IOException("Cannot write pssm files."); -} - -bool PssmIOHandler::ProvidesImport(const boost::filesystem::path& loc, - const String& format) { - if (format=="auto") { - String match_suf_string = loc.string(); - std::transform(match_suf_string.begin(), match_suf_string.end(), - match_suf_string.begin(), tolower); - if ( detail::FilenameEndsWith(match_suf_string,".pssm") - || detail::FilenameEndsWith(match_suf_string,".pssm.gz")) { - return true; - } - } else if (format == "pssm") { - return true; - } - return false; -} - -bool PssmIOHandler::ProvidesExport(const boost::filesystem::path& loc, - const String& format) { - // no writers here - return false; -} - }} diff --git a/modules/io/src/seq/pssm_io_handler.hh b/modules/io/src/seq/pssm_io_handler.hh index ae8350731..b1b7d8c33 100644 --- a/modules/io/src/seq/pssm_io_handler.hh +++ b/modules/io/src/seq/pssm_io_handler.hh @@ -33,6 +33,8 @@ class DLLEXPORT_OST_IO PssmIOHandler : public ProfileIOHandler { public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc); + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data); virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const; @@ -45,6 +47,8 @@ public: static String GetFormatDescription() { return String("ASCII Table (PSSM) output of PSI-BLAST (flag -Q)"); } +private: + void Import(seq::ProfileHandle& prof, std::istream& in); }; typedef ProfileIOHandlerFactory<PssmIOHandler> PssmIOHandlerFactory; diff --git a/modules/io/tests/test_io_sequence_profile.cc b/modules/io/tests/test_io_sequence_profile.cc index b866c3949..0b597298d 100644 --- a/modules/io/tests/test_io_sequence_profile.cc +++ b/modules/io/tests/test_io_sequence_profile.cc @@ -29,6 +29,7 @@ #include <ost/io/seq/load.hh> #include <ost/io/seq/hhm_io_handler.hh> #include <ost/io/seq/pssm_io_handler.hh> +#include <ost/io/io_exception.hh> using namespace ost::seq; using namespace ost::io; @@ -267,4 +268,54 @@ BOOST_AUTO_TEST_CASE(pssm_loading) BOOST_CHECK(*prof == *prof2); } +BOOST_AUTO_TEST_CASE(hhm_loading_file_vs_str) +{ + // load from file + ProfileHandlePtr prof_from_file(new ProfileHandle); + ProfileIOHandlerPtr io_handler(new HhmIOHandler); + io_handler->Import(*prof_from_file, "testfiles/test_hmm.hhm"); + + // load from string + std::ifstream in("testfiles/test_hmm.hhm", std::ios::in | std::ios::binary); + std::string str; + if (in) { + in.seekg(0, std::ios::end); + str.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&str[0], str.size()); + in.close(); + } else { + throw IOException("Cannot read hhm file in unit test."); + } + ProfileHandlePtr prof_from_string(new ProfileHandle); + io_handler->ImportFromString(*prof_from_string, str); + + compareProfiles(*prof_from_file, *prof_from_string, 1e-3); +} + +BOOST_AUTO_TEST_CASE(pssm_loading_file_vs_str) +{ + // load from file + ProfileHandlePtr prof_from_file(new ProfileHandle); + ProfileIOHandlerPtr io_handler(new PssmIOHandler); + io_handler->Import(*prof_from_file, "testfiles/test_pssm.pssm"); + + // load from string + std::ifstream in("testfiles/test_pssm.pssm", std::ios::in | std::ios::binary); + std::string str; + if (in) { + in.seekg(0, std::ios::end); + str.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&str[0], str.size()); + in.close(); + } else { + throw IOException("Cannot read pssm file in unit test."); + } + ProfileHandlePtr prof_from_string(new ProfileHandle); + io_handler->ImportFromString(*prof_from_string, str); + + compareProfiles(*prof_from_file, *prof_from_string, 1e-3); +} + BOOST_AUTO_TEST_SUITE_END(); -- GitLab