diff --git a/modules/io/doc/io.rst b/modules/io/doc/io.rst index 39a713dc980ad887052418b4d4b4667fdf83bd4e..381771ad94e0c1ec43bc1f03e4ed0192ff18cef7 100644 --- a/modules/io/doc/io.rst +++ b/modules/io/doc/io.rst @@ -323,6 +323,19 @@ Loading sequence or alignment files :type format: string :rtype: :class:`~ost.seq.SequenceList` +.. function:: SequenceProfileFromString(data, format) + + Load sequence profile from string. + + The format argument is mandatory. + + :param data: String containing the data you would read from a file with + specified format. + :type data: string + :param format: Name of the format. Can either be "hhm" or "pssm". + :type format: string + :rtype: :class:`~ost.seq.ProfileHandle` + Saving Sequence Data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/modules/io/pymod/wrap_io.cc b/modules/io/pymod/wrap_io.cc index 0cf910a439ac54ea4c15d1f867c92fc2664a606c..1a56254bec76d365caaab073923d2bb7f6fa79eb 100644 --- a/modules/io/pymod/wrap_io.cc +++ b/modules/io/pymod/wrap_io.cc @@ -104,6 +104,8 @@ BOOST_PYTHON_MODULE(_ost_io) def("LoadSequenceProfile", &LoadSequenceProfile, (arg("filename"), arg("format")="auto")); + def("SequenceProfileFromString", &SequenceProfileFromString, + (arg("data"), arg("format"))); def("LoadSurface",LoadSurface,load_surface_ov()); def("LoadManagedSurface",LoadManagedSurface,load_msurface_ov()); diff --git a/modules/io/src/seq/hhm_io_handler.cc b/modules/io/src/seq/hhm_io_handler.cc index d8dfee180784591027507f8b7956b6382a80c520..3d976a57142b09be9e355bdcbfe0418cdc3b2f6d 100644 --- a/modules/io/src/seq/hhm_io_handler.cc +++ b/modules/io/src/seq/hhm_io_handler.cc @@ -135,6 +135,43 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } in.push(stream); + this->Import(prof, in); +} + +void HhmIOHandler::ImportFromString(seq::ProfileHandle& prof, + const String& data) { + std::stringstream ss(data); + this->Import(prof, ss); +} + +void HhmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write hhm files."); +} + +bool HhmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".hhm") + || detail::FilenameEndsWith(match_suf_string,".hhm.gz")) { + return true; + } + } else if (format == "hhm") { + return true; + } + return false; +} + +bool HhmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +void HhmIOHandler::Import(seq::ProfileHandle& prof, std::istream& in) { // reset profile prof.clear(); @@ -169,7 +206,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } if (!null_found) { - throw IOException("No NULL found in file " + loc.string()); + throw IOException("No NULL found in file"); } // read until we hit HMM, extract olcs, then skip 2 more lines @@ -180,11 +217,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, if (chunks[0] == ost::StringRef("HMM", 3)) { // extract olcs if (chunks.size() != 21) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } for (uint i = 1; i < 21; ++i) { if (chunks[i].length() != 1) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } olcs[i-1] = chunks[i][0]; } @@ -193,7 +230,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, sline = ost::StringRef(line.c_str(), line.length()); chunks = sline.split(); if (chunks.size() != 10) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } for (uint i = 0; i < 10; ++i) { @@ -225,17 +262,17 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } else if(chunks[i] == ost::StringRef("Neff_D", 6)) { neff_d_idx = i; } else { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } } // check whether we really got everything if(neff_idx == -1 || neff_i_idx == -1 || neff_d_idx == -1) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } if(transition_idx.size() != 7) { - throw IOException("Badly formatted HMM line in " + loc.string()); + throw IOException("Badly formatted HMM line: " + line); } // skip one line @@ -245,13 +282,13 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } if (!hmm_found) { - throw IOException("No HMM found in file " + loc.string()); + throw IOException("No HMM found in given input"); } // set null model chunks = ost::StringRef(null_line.c_str(), null_line.length()).split(); prof.SetNullModel(GetColumn(chunks, 1, olcs, "Badly formatted NULL line\n" - + null_line + "\n in " + loc.string())); + + null_line)); // set columns while (std::getline(in, line)) { @@ -263,7 +300,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, const char olc = chunks[0][0]; // frequencies prof.AddColumn(GetColumn(chunks, 2, olcs, "Badly formatted line\n" - + line + "\n in " + loc.string()), olc); + + line), olc); // get transition probabilities std::getline(in, line); sline = ost::StringRef(line.c_str(), line.length()); @@ -272,8 +309,7 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, chunks = sline.split(); prof.back().SetHMMData(GetHMMData(chunks, transition_idx, transitions, neff_idx, neff_i_idx, neff_d_idx, - "Badly formatted line\n" + line + "\n in " + - loc.string())); + "Badly formatted line\n" + line)); } // parse neff if it's there, calculate the average of every column otherwise @@ -281,13 +317,11 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, sline = ost::StringRef(neff_line.c_str(), neff_line.length()); chunks = sline.split(); if(chunks.size() != 2) { - throw IOException("Badly formatted line\n" + neff_line+ "\n in " + - loc.string()); + throw IOException("Badly formatted line\n" + neff_line); } std::pair<bool, float> p = chunks[1].to_float(); if (!p.first) { - throw IOException("Badly formatted line\n" + neff_line+ "\n in " + - loc.string()); + throw IOException("Badly formatted line\n" + neff_line); } prof.SetNeff(p.second); } else { @@ -303,31 +337,4 @@ void HhmIOHandler::Import(seq::ProfileHandle& prof, } } -void HhmIOHandler::Export(const seq::ProfileHandle& prof, - const boost::filesystem::path& loc) const { - throw IOException("Cannot write hhm files."); -} - -bool HhmIOHandler::ProvidesImport(const boost::filesystem::path& loc, - const String& format) { - if (format=="auto") { - String match_suf_string = loc.string(); - std::transform(match_suf_string.begin(), match_suf_string.end(), - match_suf_string.begin(), tolower); - if ( detail::FilenameEndsWith(match_suf_string,".hhm") - || detail::FilenameEndsWith(match_suf_string,".hhm.gz")) { - return true; - } - } else if (format == "hhm") { - return true; - } - return false; -} - -bool HhmIOHandler::ProvidesExport(const boost::filesystem::path& loc, - const String& format) { - // no writers here - return false; -} - }} diff --git a/modules/io/src/seq/hhm_io_handler.hh b/modules/io/src/seq/hhm_io_handler.hh index 97fe449cf6bdc451f41336c36b989af7720c40fc..9d4b1d2aeece199e101f480364fd707e591d0fa2 100644 --- a/modules/io/src/seq/hhm_io_handler.hh +++ b/modules/io/src/seq/hhm_io_handler.hh @@ -33,6 +33,8 @@ class DLLEXPORT_OST_IO HhmIOHandler : public ProfileIOHandler { public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc); + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data); virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const; @@ -43,6 +45,8 @@ public: static String GetFormatName() { return String("HHM"); } static String GetFormatDescription() { return String("HHM output of HHblits"); } +private: + void Import(seq::ProfileHandle& prof, std::istream& in); }; typedef ProfileIOHandlerFactory<HhmIOHandler> HhmIOHandlerFactory; diff --git a/modules/io/src/seq/load.cc b/modules/io/src/seq/load.cc index e652b845a53bd98840ee48909a9f1ba5aa173658..3be372e944e856ea2e75fdb3314caf35e3950c41 100644 --- a/modules/io/src/seq/load.cc +++ b/modules/io/src/seq/load.cc @@ -111,4 +111,15 @@ seq::ProfileHandlePtr LoadSequenceProfile(const String& file_name, return prof; } +seq::ProfileHandlePtr SequenceProfileFromString(const String& data, + const String& format) +{ + seq::ProfileHandlePtr prof(new seq::ProfileHandle); + String file_name = "fake." + format; + IOManager& m = IOManager::Instance(); + ProfileIOHandlerPtr prof_io = m.FindProfileImportHandler(file_name, format); + prof_io->ImportFromString(*prof, data); + return prof; +} + }} diff --git a/modules/io/src/seq/load.hh b/modules/io/src/seq/load.hh index 6c623241ad7c46f60ce2281498927b5996ec0036..f0d52e5999a48bb6b8b5cb6d6b2866f9293f8164 100644 --- a/modules/io/src/seq/load.hh +++ b/modules/io/src/seq/load.hh @@ -61,6 +61,9 @@ SequenceFromString(const String& data, const String& format); seq::ProfileHandlePtr DLLEXPORT_OST_IO LoadSequenceProfile(const String& file_name, const String& format="auto"); +seq::ProfileHandlePtr DLLEXPORT_OST_IO +SequenceProfileFromString(const String& data, const String& format); + }} #endif diff --git a/modules/io/src/seq/profile_io_handler.hh b/modules/io/src/seq/profile_io_handler.hh index 19c78de8b1c78e7b7c09222ff61f1b3d4412e104..b84600c646643a12e6d610d677f293d427ea7a93 100644 --- a/modules/io/src/seq/profile_io_handler.hh +++ b/modules/io/src/seq/profile_io_handler.hh @@ -41,6 +41,9 @@ public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc) = 0; + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data) = 0; + virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const = 0; }; diff --git a/modules/io/src/seq/pssm_io_handler.cc b/modules/io/src/seq/pssm_io_handler.cc index 2dff01015975b28e86944cd0eb63c14a39a95742..892bf10c74bc9d49c205f2b7be1673eea187cff3 100644 --- a/modules/io/src/seq/pssm_io_handler.cc +++ b/modules/io/src/seq/pssm_io_handler.cc @@ -36,7 +36,7 @@ namespace ost { namespace io { void PssmIOHandler::Import(seq::ProfileHandle& prof, - const boost::filesystem::path& loc) { + const boost::filesystem::path& loc) { // open it up boost::iostreams::filtering_stream<boost::iostreams::input> in; boost::filesystem::ifstream stream(loc); @@ -49,6 +49,45 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } in.push(stream); + this->Import(prof, in); +} + +void PssmIOHandler::ImportFromString(seq::ProfileHandle& prof, + const String& data) { + std::stringstream ss(data); + this->Import(prof, ss); +} + +void PssmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write pssm files."); +} + +bool PssmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".pssm") + || detail::FilenameEndsWith(match_suf_string,".pssm.gz")) { + return true; + } + } else if (format == "pssm") { + return true; + } + return false; +} + +bool PssmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +void PssmIOHandler::Import(seq::ProfileHandle& prof, + std::istream& in) { + // reset profile prof.clear(); @@ -85,7 +124,7 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } } if (!table_found) { - throw IOException("No ASCII table found in file " + loc.string()); + throw IOException("No ASCII table found in input"); } // parse table (assume: index olc 20xscore 20xfreq) @@ -101,8 +140,7 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, for (uint i = 22; i < 42; ++i) { std::pair<bool, int> pbi = chunks[i].to_int(); if (!pbi.first) { - throw IOException("Badly formatted line\n" + line + "\n in " - + loc.string()); + throw IOException("Badly formatted line\n" + line); } sum_freq += pbi.second; freqs[i-22] = pbi.second; @@ -116,31 +154,4 @@ void PssmIOHandler::Import(seq::ProfileHandle& prof, } } -void PssmIOHandler::Export(const seq::ProfileHandle& prof, - const boost::filesystem::path& loc) const { - throw IOException("Cannot write pssm files."); -} - -bool PssmIOHandler::ProvidesImport(const boost::filesystem::path& loc, - const String& format) { - if (format=="auto") { - String match_suf_string = loc.string(); - std::transform(match_suf_string.begin(), match_suf_string.end(), - match_suf_string.begin(), tolower); - if ( detail::FilenameEndsWith(match_suf_string,".pssm") - || detail::FilenameEndsWith(match_suf_string,".pssm.gz")) { - return true; - } - } else if (format == "pssm") { - return true; - } - return false; -} - -bool PssmIOHandler::ProvidesExport(const boost::filesystem::path& loc, - const String& format) { - // no writers here - return false; -} - }} diff --git a/modules/io/src/seq/pssm_io_handler.hh b/modules/io/src/seq/pssm_io_handler.hh index ae83507310ce3da4e4405baf06d9d25867bdf344..b1b7d8c331a766c0ef04a32d226183229d3e65da 100644 --- a/modules/io/src/seq/pssm_io_handler.hh +++ b/modules/io/src/seq/pssm_io_handler.hh @@ -33,6 +33,8 @@ class DLLEXPORT_OST_IO PssmIOHandler : public ProfileIOHandler { public: virtual void Import(seq::ProfileHandle& prof, const boost::filesystem::path& loc); + virtual void ImportFromString(seq::ProfileHandle& prof, + const String& data); virtual void Export(const seq::ProfileHandle& prof, const boost::filesystem::path& loc) const; @@ -45,6 +47,8 @@ public: static String GetFormatDescription() { return String("ASCII Table (PSSM) output of PSI-BLAST (flag -Q)"); } +private: + void Import(seq::ProfileHandle& prof, std::istream& in); }; typedef ProfileIOHandlerFactory<PssmIOHandler> PssmIOHandlerFactory; diff --git a/modules/io/tests/test_io_sequence_profile.cc b/modules/io/tests/test_io_sequence_profile.cc index b866c394928efb2fbe68315e4caee2add0629fa9..0b597298dad1b73d6f8df7d75d8bf13488b2c643 100644 --- a/modules/io/tests/test_io_sequence_profile.cc +++ b/modules/io/tests/test_io_sequence_profile.cc @@ -29,6 +29,7 @@ #include <ost/io/seq/load.hh> #include <ost/io/seq/hhm_io_handler.hh> #include <ost/io/seq/pssm_io_handler.hh> +#include <ost/io/io_exception.hh> using namespace ost::seq; using namespace ost::io; @@ -267,4 +268,54 @@ BOOST_AUTO_TEST_CASE(pssm_loading) BOOST_CHECK(*prof == *prof2); } +BOOST_AUTO_TEST_CASE(hhm_loading_file_vs_str) +{ + // load from file + ProfileHandlePtr prof_from_file(new ProfileHandle); + ProfileIOHandlerPtr io_handler(new HhmIOHandler); + io_handler->Import(*prof_from_file, "testfiles/test_hmm.hhm"); + + // load from string + std::ifstream in("testfiles/test_hmm.hhm", std::ios::in | std::ios::binary); + std::string str; + if (in) { + in.seekg(0, std::ios::end); + str.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&str[0], str.size()); + in.close(); + } else { + throw IOException("Cannot read hhm file in unit test."); + } + ProfileHandlePtr prof_from_string(new ProfileHandle); + io_handler->ImportFromString(*prof_from_string, str); + + compareProfiles(*prof_from_file, *prof_from_string, 1e-3); +} + +BOOST_AUTO_TEST_CASE(pssm_loading_file_vs_str) +{ + // load from file + ProfileHandlePtr prof_from_file(new ProfileHandle); + ProfileIOHandlerPtr io_handler(new PssmIOHandler); + io_handler->Import(*prof_from_file, "testfiles/test_pssm.pssm"); + + // load from string + std::ifstream in("testfiles/test_pssm.pssm", std::ios::in | std::ios::binary); + std::string str; + if (in) { + in.seekg(0, std::ios::end); + str.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&str[0], str.size()); + in.close(); + } else { + throw IOException("Cannot read pssm file in unit test."); + } + ProfileHandlePtr prof_from_string(new ProfileHandle); + io_handler->ImportFromString(*prof_from_string, str); + + compareProfiles(*prof_from_file, *prof_from_string, 1e-3); +} + BOOST_AUTO_TEST_SUITE_END();