From 22f082ddbc50b836dbcbc10a1a261bc37f733589 Mon Sep 17 00:00:00 2001 From: Gerardo Tauriello <gerardo.tauriello@unibas.ch> Date: Tue, 9 Feb 2016 17:50:22 +0100 Subject: [PATCH] Added seq.ProfileHandle class and io.LoadSequenceProfile to work with profiles. --- modules/io/pymod/wrap_io.cc | 3 + modules/io/src/io_manager.cc | 47 +++- modules/io/src/io_manager.hh | 24 +- modules/io/src/seq/CMakeLists.txt | 5 + modules/io/src/seq/hhm_io_handler.cc | 179 ++++++++++++ modules/io/src/seq/hhm_io_handler.hh | 52 ++++ modules/io/src/seq/load.cc | 10 + modules/io/src/seq/load.hh | 6 +- modules/io/src/seq/profile_io_handler.hh | 93 +++++++ modules/io/src/seq/pssm_io_handler.cc | 147 ++++++++++ modules/io/src/seq/pssm_io_handler.hh | 54 ++++ modules/io/tests/CMakeLists.txt | 1 + modules/io/tests/test_io_sequence_profile.cc | 261 ++++++++++++++++++ modules/io/tests/testfiles/test_hmm.hhm | 31 +++ modules/io/tests/testfiles/test_pssm.pssm | 14 + modules/seq/base/pymod/CMakeLists.txt | 1 + .../seq/base/pymod/export_profile_handle.cc | 80 ++++++ modules/seq/base/pymod/wrap_seq.cc | 3 +- modules/seq/base/src/CMakeLists.txt | 2 + modules/seq/base/src/hmm.hh | 1 + modules/seq/base/src/profile_handle.cc | 220 +++++++++++++++ modules/seq/base/src/profile_handle.hh | 250 +++++++++++++++++ modules/seq/base/tests/CMakeLists.txt | 2 +- modules/seq/base/tests/test_profile.cc | 144 ++++++++++ 24 files changed, 1618 insertions(+), 12 deletions(-) create mode 100644 modules/io/src/seq/hhm_io_handler.cc create mode 100644 modules/io/src/seq/hhm_io_handler.hh create mode 100644 modules/io/src/seq/profile_io_handler.hh create mode 100644 modules/io/src/seq/pssm_io_handler.cc create mode 100644 modules/io/src/seq/pssm_io_handler.hh create mode 100644 modules/io/tests/test_io_sequence_profile.cc create mode 100644 modules/io/tests/testfiles/test_hmm.hhm create mode 100644 modules/io/tests/testfiles/test_pssm.pssm create mode 100644 modules/seq/base/pymod/export_profile_handle.cc create mode 100644 modules/seq/base/src/profile_handle.cc create mode 100644 modules/seq/base/src/profile_handle.hh create mode 100644 modules/seq/base/tests/test_profile.cc diff --git a/modules/io/pymod/wrap_io.cc b/modules/io/pymod/wrap_io.cc index 1e2b51869..6d16af3e9 100644 --- a/modules/io/pymod/wrap_io.cc +++ b/modules/io/pymod/wrap_io.cc @@ -99,6 +99,9 @@ BOOST_PYTHON_MODULE(_ost_io) def("SequenceListFromString", &SequenceListFromString); def("SequenceFromString", &SequenceFromString); def("SaveAlignment", &SaveAlignment, arg("format")="auto"); + + def("LoadSequenceProfile", &LoadSequenceProfile, arg("format")="auto"); + def("LoadSurface",LoadSurface,load_surface_ov()); def("LoadManagedSurface",LoadManagedSurface,load_msurface_ov()); diff --git a/modules/io/src/io_manager.cc b/modules/io/src/io_manager.cc index 6edc7fe43..cfd288c75 100644 --- a/modules/io/src/io_manager.cc +++ b/modules/io/src/io_manager.cc @@ -25,6 +25,8 @@ #include <ost/io/seq/fasta_io_handler.hh> #include <ost/io/seq/pir_io_handler.hh> #include <ost/io/seq/promod_io_handler.hh> +#include <ost/io/seq/hhm_io_handler.hh> +#include <ost/io/seq/pssm_io_handler.hh> #include <ost/io/mol/surface_io_msms_handler.hh> #include <ost/io/seq/clustal_io_handler.hh> #if OST_IMG_ENABLED @@ -50,10 +52,12 @@ IOManager::IOManager() RegisterFactory(EntityIOHandlerFactoryBaseP(new EntityIOCRDHandlerFactory)); RegisterFactory(EntityIOHandlerFactoryBaseP(new EntityIOSDFHandlerFactory)); RegisterFactory(EntityIOHandlerFactoryBaseP(new EntityIOMAEHandlerFactory)); - RegisterFactory(SequenceIOHandlerFactoryBasePtr(new FastaIOHandlerFactory)); + RegisterFactory(SequenceIOHandlerFactoryBasePtr(new FastaIOHandlerFactory)); RegisterFactory(SequenceIOHandlerFactoryBasePtr(new PirIOHandlerFactory)); - RegisterFactory(SequenceIOHandlerFactoryBasePtr(new ClustalIOHandlerFactory)); - RegisterFactory(SequenceIOHandlerFactoryBasePtr(new PromodIOHandlerFactory)); + RegisterFactory(SequenceIOHandlerFactoryBasePtr(new ClustalIOHandlerFactory)); + RegisterFactory(SequenceIOHandlerFactoryBasePtr(new PromodIOHandlerFactory)); + RegisterFactory(ProfileIOHandlerFactoryBasePtr(new HhmIOHandlerFactory)); + RegisterFactory(ProfileIOHandlerFactoryBasePtr(new PssmIOHandlerFactory)); RegisterFactory(SurfaceIOHandlerFactoryBasePtr(new SurfaceIOMSMSHandlerFactory)); #if OST_IMG_ENABLED RegisterFactory(MapIOHandlerFactoryBasePtr(new MapIODxHandlerFactory)); @@ -103,7 +107,7 @@ EntityIOHandlerP IOManager::FindEntityExportHandler(const String& filename, } SequenceIOHandlerPtr IOManager::FindAlignmentImportHandler(const String& filename, - const String& format) { + const String& format) { for(AlignmentIOFList::const_iterator it=alignment_io_list_.begin(); it!=alignment_io_list_.end();++it) { if( (*it)->ProvidesImport(filename,format)) { @@ -114,7 +118,7 @@ SequenceIOHandlerPtr IOManager::FindAlignmentImportHandler(const String& filenam } SequenceIOHandlerPtr IOManager::FindAlignmentExportHandler(const String& filename, - const String& format) + const String& format) { for(AlignmentIOFList::const_iterator it=alignment_io_list_.begin(); it!=alignment_io_list_.end();++it) { @@ -125,6 +129,29 @@ SequenceIOHandlerPtr IOManager::FindAlignmentExportHandler(const String& filenam throw IOUnknownFormatException("no suitable alignment io handler found for "+filename); } +ProfileIOHandlerPtr IOManager::FindProfileImportHandler(const String& filename, + const String& format) { + for(ProfileIOFList::const_iterator it=profile_io_list_.begin(); + it!=profile_io_list_.end();++it) { + if( (*it)->ProvidesImport(filename,format)) { + return (*it)->Create(); + } + } + throw IOUnknownFormatException("no suitable profile io handler found for "+filename); +} + +ProfileIOHandlerPtr IOManager::FindProfileExportHandler(const String& filename, + const String& format) +{ + for(ProfileIOFList::const_iterator it=profile_io_list_.begin(); + it!=profile_io_list_.end();++it) { + if( (*it)->ProvidesExport(filename,format)) { + return (*it)->Create(); + } + } + throw IOUnknownFormatException("no suitable profile io handler found for "+filename); +} + SurfaceIOHandlerPtr IOManager::FindSurfaceImportHandler(const String& filename, const String& format) { @@ -267,6 +294,11 @@ void IOManager::RegisterFactory(const SequenceIOHandlerFactoryBasePtr& f) alignment_io_list_.push_back(f); } +void IOManager::RegisterFactory(const ProfileIOHandlerFactoryBasePtr& f) +{ + profile_io_list_.push_back(f); +} + void IOManager::RegisterFactory(const SurfaceIOHandlerFactoryBasePtr& f) { surface_io_list_.push_back(f); @@ -282,6 +314,11 @@ const AlignmentIOFList& IOManager::GetAvailableAlignmentHandler() const return alignment_io_list_; } +const ProfileIOFList& IOManager::GetAvailableProfileHandler() const +{ + return profile_io_list_; +} + const SurfaceIOFList& IOManager::GetAvailableSurfaceHandler() const { return surface_io_list_; diff --git a/modules/io/src/io_manager.hh b/modules/io/src/io_manager.hh index bfebe32bd..18297af6d 100644 --- a/modules/io/src/io_manager.hh +++ b/modules/io/src/io_manager.hh @@ -26,6 +26,7 @@ #include <ost/io/mol/entity_io_handler.hh> #include <ost/io/seq/sequence_io_handler.hh> +#include <ost/io/seq/profile_io_handler.hh> #include <ost/io/mol/surface_io_handler.hh> #if OST_IMG_ENABLED @@ -33,10 +34,12 @@ #endif #include <ost/io/io_exception.hh> + namespace ost { namespace io { typedef std::vector<EntityIOHandlerFactoryBaseP> EntityIOHFList; typedef std::vector<SequenceIOHandlerFactoryBasePtr> AlignmentIOFList; +typedef std::vector<ProfileIOHandlerFactoryBasePtr> ProfileIOFList; typedef std::vector<SurfaceIOHandlerFactoryBasePtr> SurfaceIOFList; #if OST_IMG_ENABLED @@ -77,11 +80,17 @@ public: /// \brief get sequence io handler that is able to import the given file. /// \sa FindEntityImportHandler(filename) SequenceIOHandlerPtr FindAlignmentImportHandler(const String& filename, - const String& format="auto"); + const String& format="auto"); SequenceIOHandlerPtr FindAlignmentExportHandler(const String& filename, - const String& format="auto"); - + const String& format="auto"); + + ProfileIOHandlerPtr FindProfileImportHandler(const String& filename, + const String& format="auto"); + + ProfileIOHandlerPtr FindProfileExportHandler(const String& filename, + const String& format="auto"); + SurfaceIOHandlerPtr FindSurfaceImportHandler(const String& filename, const String& format="auto"); //@} @@ -90,7 +99,10 @@ public: void RegisterFactory(const EntityIOHandlerFactoryBaseP&); /// \brief register aligment io handler factory /// \sa adding_io_handler - void RegisterFactory(const SequenceIOHandlerFactoryBasePtr&); + void RegisterFactory(const SequenceIOHandlerFactoryBasePtr&); + /// \brief register profile io handler factory + /// \sa adding_io_handler + void RegisterFactory(const ProfileIOHandlerFactoryBasePtr&); /// \brief register surface io handler factory /// \sa adding_io_handler void RegisterFactory(const SurfaceIOHandlerFactoryBasePtr&); @@ -101,6 +113,9 @@ public: /// \brief Get a list with all available AlignmentHandler const AlignmentIOFList& GetAvailableAlignmentHandler() const; + /// \brief Get a list with all available ProfileHandler + const ProfileIOFList& GetAvailableProfileHandler() const; + /// \brief Get a list with all available SurfaceHandler const SurfaceIOFList& GetAvailableSurfaceHandler() const; @@ -137,6 +152,7 @@ private: EntityIOHFList entity_iohf_list_; AlignmentIOFList alignment_io_list_; + ProfileIOFList profile_io_list_; SurfaceIOFList surface_io_list_; #if OST_IMG_ENABLED diff --git a/modules/io/src/seq/CMakeLists.txt b/modules/io/src/seq/CMakeLists.txt index 9dd14b52a..605eb9546 100644 --- a/modules/io/src/seq/CMakeLists.txt +++ b/modules/io/src/seq/CMakeLists.txt @@ -5,6 +5,8 @@ save.cc fasta_io_handler.cc pir_io_handler.cc promod_io_handler.cc +hhm_io_handler.cc +pssm_io_handler.cc PARENT_SCOPE ) @@ -13,6 +15,9 @@ sequence_io_handler.hh fasta_io_handler.hh pir_io_handler.hh promod_io_handler.hh +profile_io_handler.hh +hhm_io_handler.hh +pssm_io_handler.hh clustal_io_handler.hh load.hh save.hh diff --git a/modules/io/src/seq/hhm_io_handler.cc b/modules/io/src/seq/hhm_io_handler.cc new file mode 100644 index 000000000..395c97bd1 --- /dev/null +++ b/modules/io/src/seq/hhm_io_handler.cc @@ -0,0 +1,179 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#include "hhm_io_handler.hh" + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/filesystem/convenience.hpp> +#include <boost/filesystem/fstream.hpp> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/filtering_stream.hpp> + +#include <ost/string_ref.hh> +#include <ost/io/io_exception.hh> +#include <ost/io/io_utils.hh> + +namespace ost { namespace io { + +namespace { +// reset pc with freqs in chunks[i+offset] (i in [0,20[) +// checks for consistency of chunks and values read +seq::ProfileColumn GetColumn(const std::vector<ost::StringRef>& chunks, + const uint offset, const char* olcs, + const String& exception) { + // check chunks size + if (chunks.size() < offset+20) { + throw IOException(exception); + } + // reset pc + seq::ProfileColumn pc; // all 0 + for (uint i = 0; i < 20; ++i) { + if (chunks[i+offset] != ost::StringRef("*", 1)) { + std::pair<bool, int> pbi = chunks[i+offset].to_int(); + if (!pbi.first) { + throw IOException(exception); + } + pc.SetFreq(olcs[i], pow(2.0, -0.001 * pbi.second)); + } + } + return pc; +} +} // anon ns + +void HhmIOHandler::Import(seq::ProfileHandle& prof, + const boost::filesystem::path& loc) { + // open it up + boost::iostreams::filtering_stream<boost::iostreams::input> in; + boost::filesystem::ifstream stream(loc); + if (!stream) { + throw IOException("Could not open " + loc.string()); + } + // add unzip if necessary + if (boost::iequals(".gz", boost::filesystem::extension(loc))) { + in.push(boost::iostreams::gzip_decompressor()); + } + in.push(stream); + + // reset profile + prof.clear(); + + // tmp. storage + std::string line; + ost::StringRef sline; + std::string null_line; + char olcs[20]; + std::vector<ost::StringRef> chunks; + + // read line-by-line looking for NULL + bool null_found = false; + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + if (sline.length()>4 && + sline.substr(0, 5) == ost::StringRef("NULL ", 5)) { + null_line = line; + null_found = true; + break; + } + } + if (!null_found) { + throw IOException("No NULL found in file " + loc.string()); + } + + // read until we hit HMM, extract olcs, then skip 2 more lines + bool hmm_found = false; + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + chunks = sline.split(); + if (chunks[0] == ost::StringRef("HMM", 3)) { + // extract olcs + if (chunks.size() != 21) { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + for (uint i = 1; i < 21; ++i) { + if (chunks[i].length() != 1) { + throw IOException("Badly formatted HMM line in " + loc.string()); + } + olcs[i-1] = chunks[i][0]; + } + // skip 2 lines and get out + std::getline(in, line); + std::getline(in, line); + hmm_found = true; + break; + } + } + if (!hmm_found) { + throw IOException("No HMM found in file " + loc.string()); + } + + // set null model + chunks = ost::StringRef(null_line.c_str(), null_line.length()).split(); + prof.SetNullModel(GetColumn(chunks, 1, olcs, "Badly formatted NULL line\n" + + null_line + "\n in " + loc.string())); + + // set columns + String seqres; + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + if (sline.trim().empty()) continue; + if (sline.trim() == ost::StringRef("//", 2)) break; + chunks = sline.split(); + // one letter code + seqres += chunks[0][0]; + // frequencies + prof.push_back(GetColumn(chunks, 2, olcs, "Badly formatted line\n" + + line + "\n in " + loc.string())); + // skip line (trans. matrix) + std::getline(in, line); + } + prof.SetSequence(seqres); +} + +void HhmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write hhm files."); +} + +bool HhmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".hhm") + || detail::FilenameEndsWith(match_suf_string,".hhm.gz")) { + return true; + } + } else if (format == "hhm") { + return true; + } + return false; +} + +bool HhmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +}} diff --git a/modules/io/src/seq/hhm_io_handler.hh b/modules/io/src/seq/hhm_io_handler.hh new file mode 100644 index 000000000..a3acd275c --- /dev/null +++ b/modules/io/src/seq/hhm_io_handler.hh @@ -0,0 +1,52 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello + */ + +#ifndef OST_HHM_IO_HANDLER_HH +#define OST_HHM_IO_HANDLER_HH + +#include <ost/io/module_config.hh> +#include "profile_io_handler.hh" + +namespace ost { namespace io { + +class DLLEXPORT_OST_IO HhmIOHandler : public ProfileIOHandler { +public: + virtual void Import(seq::ProfileHandle& prof, + const boost::filesystem::path& loc); + virtual void Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const; + + static bool ProvidesImport(const boost::filesystem::path& loc, + const String& format="auto"); + static bool ProvidesExport(const boost::filesystem::path& loc, + const String& format="auto"); + + static String GetFormatName() { return String("HHM"); } + static String GetFormatDescription() { return String("HHM output of HHblits"); } +}; + +typedef ProfileIOHandlerFactory<HhmIOHandler> HhmIOHandlerFactory; + +}} + +#endif diff --git a/modules/io/src/seq/load.cc b/modules/io/src/seq/load.cc index 896c5fa97..9bca49efe 100644 --- a/modules/io/src/seq/load.cc +++ b/modules/io/src/seq/load.cc @@ -101,4 +101,14 @@ seq::SequenceHandle SequenceFromString(const String& data, return get_one_seq(SequenceListFromString(data, format)); } +seq::ProfileHandlePtr LoadSequenceProfile(const String& file_name, + const String& format) +{ + seq::ProfileHandlePtr prof(new seq::ProfileHandle); + IOManager& m = IOManager::Instance(); + ProfileIOHandlerPtr prof_io = m.FindProfileImportHandler(file_name, format); + prof_io->Import(*prof, file_name); + return prof; +} + }} diff --git a/modules/io/src/seq/load.hh b/modules/io/src/seq/load.hh index 485fef1d8..a72cee4b2 100644 --- a/modules/io/src/seq/load.hh +++ b/modules/io/src/seq/load.hh @@ -26,8 +26,9 @@ #include <ost/io/module_config.hh> #include <ost/seq/sequence_list.hh> #include <ost/seq/alignment_handle.hh> -namespace ost { namespace io { +#include <ost/seq/profile_handle.hh> +namespace ost { namespace io { seq::AlignmentHandle DLLEXPORT_OST_IO LoadAlignment(const String& file_name, const String& format="auto"); @@ -57,6 +58,9 @@ SequenceFromStream(std::istream& stream, const String& format); seq::SequenceHandle DLLEXPORT_OST_IO SequenceFromString(const String& data, const String& format); +seq::ProfileHandlePtr DLLEXPORT_OST_IO +LoadSequenceProfile(const String& file_name, const String& format="auto"); + }} #endif diff --git a/modules/io/src/seq/profile_io_handler.hh b/modules/io/src/seq/profile_io_handler.hh new file mode 100644 index 000000000..783d69762 --- /dev/null +++ b/modules/io/src/seq/profile_io_handler.hh @@ -0,0 +1,93 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello + */ + +#ifndef OST_IO_PROFILE_IO_HANDLER_HH +#define OST_IO_PROFILE_IO_HANDLER_HH + +#include <boost/shared_ptr.hpp> +#include <boost/filesystem/operations.hpp> + +#include <ost/io/module_config.hh> +#include <ost/io/io_utils.hh> +#include <ost/seq/profile_handle.hh> + +namespace ost { namespace io { + +//! pure abstract base class for profile io handlers +class DLLEXPORT_OST_IO ProfileIOHandler { +public: + virtual ~ProfileIOHandler() {} + + virtual void Import(seq::ProfileHandle& prof, + const boost::filesystem::path& loc) = 0; + + virtual void Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const = 0; +}; + +typedef boost::shared_ptr<ProfileIOHandler> ProfileIOHandlerPtr; + +//! pure abstract base class for creation of a specfic io handler +class DLLEXPORT_OST_IO ProfileIOHandlerFactoryBase { +public: + virtual ~ProfileIOHandlerFactoryBase() {} + virtual bool ProvidesImport(const boost::filesystem::path& loc, + const String& format="auto") const = 0; + virtual bool ProvidesExport(const boost::filesystem::path& loc, + const String& format="auto") const = 0; + virtual ProfileIOHandlerPtr Create() const = 0; + virtual String GetFormatName() const =0; + virtual String GetFormatDescription() const =0; +}; + +typedef boost::shared_ptr<ProfileIOHandlerFactoryBase> ProfileIOHandlerFactoryBasePtr; + +template <class HANDLER> +class ProfileIOHandlerFactory: public ProfileIOHandlerFactoryBase { + virtual bool ProvidesImport(const boost::filesystem::path& loc, + const String& format="auto") const { + return HANDLER::ProvidesImport(loc, format); + } + + virtual bool ProvidesExport(const boost::filesystem::path& loc, + const String& format="auto") const { + return HANDLER::ProvidesExport(loc, format); + } + + virtual ProfileIOHandlerPtr Create() const { + return ProfileIOHandlerPtr(new HANDLER); + } + + virtual String GetFormatName() const { + return HANDLER::GetFormatName(); + } + + virtual String GetFormatDescription() const { + return HANDLER::GetFormatDescription(); + } + +}; + +}} // ns + +#endif diff --git a/modules/io/src/seq/pssm_io_handler.cc b/modules/io/src/seq/pssm_io_handler.cc new file mode 100644 index 000000000..4b16ed6cb --- /dev/null +++ b/modules/io/src/seq/pssm_io_handler.cc @@ -0,0 +1,147 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello + */ + +#include "pssm_io_handler.hh" + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/filesystem/convenience.hpp> +#include <boost/filesystem/fstream.hpp> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/filtering_stream.hpp> + +#include <ost/string_ref.hh> +#include <ost/io/io_exception.hh> +#include <ost/io/io_utils.hh> + +namespace ost { namespace io { + +void PssmIOHandler::Import(seq::ProfileHandle& prof, + const boost::filesystem::path& loc) { + // open it up + boost::iostreams::filtering_stream<boost::iostreams::input> in; + boost::filesystem::ifstream stream(loc); + if (!stream) { + throw IOException("Could not open " + loc.string()); + } + // add unzip if necessary + if (boost::iequals(".gz", boost::filesystem::extension(loc))) { + in.push(boost::iostreams::gzip_decompressor()); + } + in.push(stream); + + // reset profile + prof.clear(); + + // set null model + prof.SetNullModel(seq::ProfileColumn::BLOSUMNullModel()); + + // tmp. storage + std::string line; + ost::StringRef sline; + char olcs[20]; + std::vector<ost::StringRef> chunks; + + // read line-by-line looking for table + // -> start = line with 40 1-char tokens + bool table_found = false; + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + chunks = sline.split(); + if (chunks.size() == 40) { + bool got_it = true; + for (uint i = 0; i < 40; ++i) { + if (chunks[i].length() != 1) { + got_it = false; + break; + } + if (i >= 20) { + olcs[i-20] = chunks[i][0]; + } + } + if (got_it) { + table_found = true; + break; + } + } + } + if (!table_found) { + throw IOException("No ASCII table found in file " + loc.string()); + } + + // parse table (assume: index olc 20xscore 20xfreq) + String seqres; + while (std::getline(in, line)) { + sline = ost::StringRef(line.c_str(), line.length()); + chunks = sline.split(); + // stop if not enough entries + if (chunks.size() < 42) break; + // parse row (freq in % as integers) + seqres += chunks[1][0]; + Real sum_freq = 0; + Real freqs[20]; + for (uint i = 22; i < 42; ++i) { + std::pair<bool, int> pbi = chunks[i].to_int(); + if (!pbi.first) { + throw IOException("Badly formatted line\n" + line + "\n in " + loc.string()); + } + sum_freq += pbi.second; + freqs[i-22] = pbi.second; + } + // fill normalized frequencies + seq::ProfileColumn pc; + for (uint i = 0; i < 20; ++i) { + pc.SetFreq(olcs[i], freqs[i]/sum_freq); + } + prof.push_back(pc); + } + prof.SetSequence(seqres); +} + +void PssmIOHandler::Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const { + throw IOException("Cannot write pssm files."); +} + +bool PssmIOHandler::ProvidesImport(const boost::filesystem::path& loc, + const String& format) { + if (format=="auto") { + String match_suf_string = loc.string(); + std::transform(match_suf_string.begin(), match_suf_string.end(), + match_suf_string.begin(), tolower); + if ( detail::FilenameEndsWith(match_suf_string,".pssm") + || detail::FilenameEndsWith(match_suf_string,".pssm.gz")) { + return true; + } + } else if (format == "pssm") { + return true; + } + return false; +} + +bool PssmIOHandler::ProvidesExport(const boost::filesystem::path& loc, + const String& format) { + // no writers here + return false; +} + +}} diff --git a/modules/io/src/seq/pssm_io_handler.hh b/modules/io/src/seq/pssm_io_handler.hh new file mode 100644 index 000000000..87a00673b --- /dev/null +++ b/modules/io/src/seq/pssm_io_handler.hh @@ -0,0 +1,54 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello + */ + +#ifndef OST_PSSM_IO_HANDLER_HH +#define OST_PSSM_IO_HANDLER_HH + +#include <ost/io/module_config.hh> +#include "profile_io_handler.hh" + +namespace ost { namespace io { + +class DLLEXPORT_OST_IO PssmIOHandler : public ProfileIOHandler { +public: + virtual void Import(seq::ProfileHandle& prof, + const boost::filesystem::path& loc); + virtual void Export(const seq::ProfileHandle& prof, + const boost::filesystem::path& loc) const; + + static bool ProvidesImport(const boost::filesystem::path& loc, + const String& format="auto"); + static bool ProvidesExport(const boost::filesystem::path& loc, + const String& format="auto"); + + static String GetFormatName() { return String("PSSM"); } + static String GetFormatDescription() { + return String("ASCII Table (PSSM) output of PSI-BLAST (flag -Q)"); + } +}; + +typedef ProfileIOHandlerFactory<PssmIOHandler> PssmIOHandlerFactory; + +}} + +#endif diff --git a/modules/io/tests/CMakeLists.txt b/modules/io/tests/CMakeLists.txt index 64a310415..0efbc1058 100644 --- a/modules/io/tests/CMakeLists.txt +++ b/modules/io/tests/CMakeLists.txt @@ -6,6 +6,7 @@ set(OST_IO_UNIT_TESTS test_io_crd.cc test_io_dcd.cc test_io_sdf.cc + test_io_sequence_profile.cc test_pir.cc test_iomanager.cc tests.cc diff --git a/modules/io/tests/test_io_sequence_profile.cc b/modules/io/tests/test_io_sequence_profile.cc new file mode 100644 index 000000000..85d2dc602 --- /dev/null +++ b/modules/io/tests/test_io_sequence_profile.cc @@ -0,0 +1,261 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include <boost/test/auto_unit_test.hpp> +#include <iostream> + +#include <ost/seq/profile_handle.hh> +#include <ost/io/seq/load.hh> +#include <ost/io/seq/hhm_io_handler.hh> +#include <ost/io/seq/pssm_io_handler.hh> + +using namespace ost::seq; +using namespace ost::io; + +BOOST_AUTO_TEST_SUITE( io ); + +// helper to compare profiles element by element (with delta) +// delta given in % ! +template <typename T, typename T2> +void compareProfiles(T& t, T2& t2, Real delta = 0) { + BOOST_REQUIRE_EQUAL(t.size(), t2.size()); + BOOST_CHECK_EQUAL(t.GetSequence(), t2.GetSequence()); + BOOST_CHECK_CLOSE(t.GetAverageEntropy(), t2.GetAverageEntropy(), delta*20); + const Real* f = t.GetNullModel().freqs_begin(); + const Real* f2 = t2.GetNullModel().freqs_begin(); + for (uint j = 0; j < 20; ++j) { + BOOST_CHECK_CLOSE(f[j], f2[j], delta); + } + uint N = t.size(); + for (uint i = 0; i < N; ++i) { + f = t[i].freqs_begin(); + f2 = t2[i].freqs_begin(); + for (uint j = 0; j < 20; ++j) { + BOOST_CHECK_CLOSE(f[j], f2[j], delta); + } + } +} + +BOOST_AUTO_TEST_CASE(hhm_loading) +{ + // check hhm factory + ProfileIOHandlerFactoryBasePtr fac(new HhmIOHandlerFactory); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.hhm"), true); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.hhm.gz"), true); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.pssm"), false); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.pssm.gz"), false); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test", "hhm"), true); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test.hhm"), false); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test.hhm.gz"), false); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test", "hhm"), false); + ProfileIOHandlerPtr io_handler = fac->Create(); + + // load file + ProfileHandlePtr prof(new ProfileHandle); + io_handler->Import(*prof, "testfiles/test_hmm.hhm"); + BOOST_CHECK_EQUAL(prof->size(), size_t(5)); + BOOST_CHECK_EQUAL(prof->GetSequence(), "ETGSP"); + + // we first check, whether the null model has been read correctly + // and then randomly pick a column to check + + ProfileColumn null_model = prof->GetNullModel(); + + String olc = "ACDEFGHIKLMNPQRSTVWY"; + + Real correct_aa_freqs[] = {pow(2.0,-0.001*3706), pow(2.0,-0.001*5728), + pow(2.0,-0.001*4211), pow(2.0,-0.001*4064), + pow(2.0,-0.001*4839), pow(2.0,-0.001*3729), + pow(2.0,-0.001*4763), pow(2.0,-0.001*4308), + pow(2.0,-0.001*4069), pow(2.0,-0.001*3323), + pow(2.0,-0.001*5509), pow(2.0,-0.001*4640), + pow(2.0,-0.001*4464), pow(2.0,-0.001*4937), + pow(2.0,-0.001*4285), pow(2.0,-0.001*4423), + pow(2.0,-0.001*3815), pow(2.0,-0.001*3783), + pow(2.0,-0.001*6325), pow(2.0,-0.001*4665)}; + + for (int i = 0; i < 20; ++i) { + BOOST_CHECK_CLOSE(null_model.GetFreq(olc[i]), correct_aa_freqs[i], + Real(1e-5)); + } + + // let's check, whether a particular column has been read correctly + ProfileColumn col = (*prof)[2]; + + memset(correct_aa_freqs, 0, 20*sizeof(Real)); + correct_aa_freqs[0] = pow(2.0,-0.001*3676); + correct_aa_freqs[1] = pow(2.0,-0.001*2597); + correct_aa_freqs[5] = pow(2.0,-0.001*5359); + correct_aa_freqs[6] = pow(2.0,-0.001*3275); + correct_aa_freqs[11] = pow(2.0,-0.001*3292); + correct_aa_freqs[12] = pow(2.0,-0.001*5077); + correct_aa_freqs[13] = pow(2.0,-0.001*3826); + correct_aa_freqs[15] = pow(2.0,-0.001*2409); + correct_aa_freqs[16] = pow(2.0,-0.001*3733); + correct_aa_freqs[17] = pow(2.0,-0.001*4503); + correct_aa_freqs[19] = pow(2.0,-0.001*3070); + + for(int i = 0; i < 20; ++i){ + BOOST_CHECK_CLOSE(col.GetFreq(olc[i]), correct_aa_freqs[i], Real(1e-5)); + } + + // check, whether the entropy of the column above gets correctly calculated + Real entropy = 0.0; + entropy -= correct_aa_freqs[0] * log(correct_aa_freqs[0]); + entropy -= correct_aa_freqs[1] * log(correct_aa_freqs[1]); + entropy -= correct_aa_freqs[5] * log(correct_aa_freqs[5]); + entropy -= correct_aa_freqs[6] * log(correct_aa_freqs[6]); + entropy -= correct_aa_freqs[11] * log(correct_aa_freqs[11]); + entropy -= correct_aa_freqs[12] * log(correct_aa_freqs[12]); + entropy -= correct_aa_freqs[13] * log(correct_aa_freqs[13]); + entropy -= correct_aa_freqs[15] * log(correct_aa_freqs[15]); + entropy -= correct_aa_freqs[16] * log(correct_aa_freqs[16]); + entropy -= correct_aa_freqs[17] * log(correct_aa_freqs[17]); + entropy -= correct_aa_freqs[19] * log(correct_aa_freqs[19]); + BOOST_CHECK_CLOSE((*prof)[2].GetEntropy(), entropy, Real(1e-5)); + + // look at avg entropy... + BOOST_CHECK_CLOSE(prof->GetAverageEntropy(), 1.48704, Real(1e-2)); + + // reload with LoadSequenceProfile + ProfileHandlePtr prof2 = LoadSequenceProfile("testfiles/test_hmm.hhm"); + BOOST_CHECK(*prof == *prof2); + + // reload + io_handler->Import(*prof2, "testfiles/test_hmm.hhm"); + BOOST_CHECK(*prof == *prof2); +} + +BOOST_AUTO_TEST_CASE(pssm_loading) +{ + // check pssm factory + ProfileIOHandlerFactoryBasePtr fac(new PssmIOHandlerFactory); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.hhm"), false); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.hhm.gz"), false); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.pssm"), true); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test.pssm.gz"), true); + BOOST_CHECK_EQUAL(fac->ProvidesImport("test", "pssm"), true); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test.pssm"), false); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test.pssm.gz"), false); + BOOST_CHECK_EQUAL(fac->ProvidesExport("test", "pssm"), false); + ProfileIOHandlerPtr io_handler = fac->Create(); + + // load file + ProfileHandlePtr prof(new ProfileHandle); + io_handler->Import(*prof, "testfiles/test_pssm.pssm"); + + // compare manually + ProfileHandle prof_pssm_ref; + prof_pssm_ref.SetSequence("RRSPP"); + prof_pssm_ref.SetNullModel(ProfileColumn::BLOSUMNullModel()); + ProfileColumn pc1, pc2, pc3, pc4, pc5; + pc1.SetFreq('R', 0.74); + pc1.SetFreq('Q', 0.01); + pc1.SetFreq('K', 0.25); + + pc2.SetFreq('A', 0.02/0.99); + pc2.SetFreq('R', 0.61/0.99); + pc2.SetFreq('N', 0.03/0.99); + pc2.SetFreq('Q', 0.02/0.99); + pc2.SetFreq('E', 0.02/0.99); + pc2.SetFreq('G', 0.02/0.99); + pc2.SetFreq('H', 0.01/0.99); + pc2.SetFreq('I', 0.01/0.99); + pc2.SetFreq('L', 0.01/0.99); + pc2.SetFreq('K', 0.13/0.99); + pc2.SetFreq('P', 0.03/0.99); + pc2.SetFreq('S', 0.06/0.99); + pc2.SetFreq('T', 0.01/0.99); + pc2.SetFreq('V', 0.01/0.99); + pc3.SetFreq('A', 0.17/0.99); + pc3.SetFreq('R', 0.07/0.99); + pc3.SetFreq('N', 0.03/0.99); + + pc3.SetFreq('A', 0.17); + pc3.SetFreq('R', 0.07); + pc3.SetFreq('N', 0.03); + pc3.SetFreq('D', 0.02); + pc3.SetFreq('C', 0.01); + pc3.SetFreq('Q', 0.03); + pc3.SetFreq('E', 0.02); + pc3.SetFreq('G', 0.03); + pc3.SetFreq('L', 0.02); + pc3.SetFreq('K', 0.04); + pc3.SetFreq('M', 0.01); + pc3.SetFreq('F', 0.01); + pc3.SetFreq('P', 0.06); + pc3.SetFreq('S', 0.37); + pc3.SetFreq('T', 0.08); + pc3.SetFreq('V', 0.03); + + pc4.SetFreq('A', 0.11/0.98); + pc4.SetFreq('R', 0.06/0.98); + pc4.SetFreq('D', 0.01/0.98); + pc4.SetFreq('Q', 0.02/0.98); + pc4.SetFreq('E', 0.02/0.98); + pc4.SetFreq('I', 0.01/0.98); + pc4.SetFreq('L', 0.03/0.98); + pc4.SetFreq('K', 0.04/0.98); + pc4.SetFreq('F', 0.01/0.98); + pc4.SetFreq('P', 0.58/0.98); + pc4.SetFreq('S', 0.03/0.98); + pc4.SetFreq('T', 0.05/0.98); + pc4.SetFreq('V', 0.01/0.98); + + pc5.SetFreq('A', 0.04); + pc5.SetFreq('R', 0.01); + pc5.SetFreq('N', 0.02); + pc5.SetFreq('D', 0.03); + pc5.SetFreq('Q', 0.02); + pc5.SetFreq('E', 0.03); + pc5.SetFreq('H', 0.01); + pc5.SetFreq('I', 0.01); + pc5.SetFreq('L', 0.04); + pc5.SetFreq('K', 0.05); + pc5.SetFreq('M', 0.01); + pc5.SetFreq('F', 0.01); + pc5.SetFreq('P', 0.63); + pc5.SetFreq('S', 0.03); + pc5.SetFreq('T', 0.02); + pc5.SetFreq('V', 0.04); + + prof_pssm_ref.push_back(pc1); + prof_pssm_ref.push_back(pc2); + prof_pssm_ref.push_back(pc3); + prof_pssm_ref.push_back(pc4); + prof_pssm_ref.push_back(pc5); + + compareProfiles(*prof, prof_pssm_ref, 1e-3); + + // reload with LoadSequenceProfile + ProfileHandlePtr prof2 = LoadSequenceProfile("testfiles/test_pssm.pssm"); + BOOST_CHECK(*prof == *prof2); + + // reload + io_handler->Import(*prof2, "testfiles/test_pssm.pssm"); + BOOST_CHECK(*prof == *prof2); +} + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/modules/io/tests/testfiles/test_hmm.hhm b/modules/io/tests/testfiles/test_hmm.hhm new file mode 100644 index 000000000..1b9a665d1 --- /dev/null +++ b/modules/io/tests/testfiles/test_hmm.hhm @@ -0,0 +1,31 @@ +HHsearch 1.5 +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +I don't get parsed +# +NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 +HMM A C D E F G H I K L M N P Q R S T V W Y + M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D + 0 * * 0 * 0 * * * * +E 1 1780 * * 497 * * * * * * * * * * * * * * * * 1 + 0 * * * * * * 1610 0 0 + +T 2 1615 * * * * 2652 * * * * * * * * * * 3992 1147 * * 2 + 0 * * * * * * 2652 0 0 + +G 3 3676 2597 * * * 5359 3275 * * * * 3292 5077 3826 * 2409 3733 4503 * 3070 3 + 46 4979 * 2006 413 * * 3076 1097 0 + +S 4 2579 * * * * * * * * 4320 * 4532 3895 4519 * 951 5140 3595 * * 4 + 58 4660 * 2833 218 * * 3050 1146 0 + +P 5 2585 * 4184 * * 2345 4748 * * * * * 2839 1766 5056 3635 * * * * 5 + 0 * * * * * * 3036 0 0 + +// diff --git a/modules/io/tests/testfiles/test_pssm.pssm b/modules/io/tests/testfiles/test_pssm.pssm new file mode 100644 index 000000000..e3f72045c --- /dev/null +++ b/modules/io/tests/testfiles/test_pssm.pssm @@ -0,0 +1,14 @@ + +Last position-specific scoring matrix computed, weighted observed percentages rounded down, information per position, and relative weight of gapless real matches to pseudocounts + A R N D C Q E G H I L K M F P S T W Y V A R N D C Q E G H I L K M F P S T W Y V + 1 R -4 7 -3 -4 -7 -1 -3 -5 -3 -6 -5 4 -4 -6 -5 -4 -4 -6 -5 -6 0 74 0 0 0 1 0 0 0 0 0 25 0 0 0 0 0 0 0 0 1.96 0.65 + 2 R -2 7 0 -4 -6 0 -1 -3 -1 -4 -4 2 -4 -6 -1 0 -3 -6 -5 -4 2 61 3 0 0 2 2 2 1 1 1 13 0 0 3 6 1 0 0 1 1.22 0.52 + 3 S 2 1 0 -1 -2 0 -1 -1 -2 -4 -3 0 -2 -2 1 4 1 -5 -4 -2 17 7 3 2 1 3 2 3 0 0 2 4 1 1 6 37 8 0 0 3 0.42 0.31 + 4 P 1 0 -5 -3 -6 -2 -3 -5 -3 -4 -3 -1 -5 -3 7 -1 0 -3 -6 -4 11 6 0 1 0 2 2 0 0 1 3 4 0 1 58 3 5 0 0 1 1.42 0.62 + 5 P -1 -3 -2 -2 -7 -2 -2 -6 -3 -4 -2 0 -2 -4 7 -2 -2 -7 -6 -1 4 1 2 3 0 2 3 0 1 1 4 5 1 1 63 3 2 0 0 4 1.56 0.71 + + K Lambda +Standard Ungapped 0.1342 0.3155 +Standard Gapped 0.0410 0.2670 +PSI Ungapped 0.1607 0.3175 +PSI Gapped 0.0492 0.2670 diff --git a/modules/seq/base/pymod/CMakeLists.txt b/modules/seq/base/pymod/CMakeLists.txt index c3d565c25..4f304ceeb 100644 --- a/modules/seq/base/pymod/CMakeLists.txt +++ b/modules/seq/base/pymod/CMakeLists.txt @@ -1,6 +1,7 @@ set(OST_SEQ_PYMOD_SOURCES export_sequence.cc export_hmm.cc + export_profile_handle.cc wrap_seq.cc ) diff --git a/modules/seq/base/pymod/export_profile_handle.cc b/modules/seq/base/pymod/export_profile_handle.cc new file mode 100644 index 000000000..77479ac31 --- /dev/null +++ b/modules/seq/base/pymod/export_profile_handle.cc @@ -0,0 +1,80 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#include <boost/python/suite/indexing/vector_indexing_suite.hpp> +#include <boost/python.hpp> +#include <ost/seq/profile_handle.hh> + +using namespace ost::seq; +using namespace boost::python; + +namespace { + +boost::python::list wrap_get_names(ProfileDBPtr db) { + std::vector<String> v_names = db->GetNames(); + boost::python::list names; + for (std::vector<String>::iterator i = v_names.begin(); + i != v_names.end(); ++i) { + names.append(*i); + } + return names; +} + +} // anon ns + +void export_profile_handle() +{ + class_<ProfileColumn>("ProfileColumn", init<>()) + .add_property("entropy", &ProfileColumn::GetEntropy) + .def("GetFreq", &ProfileColumn::GetFreq) + .def("SetFreq", &ProfileColumn::SetFreq) + .def("BLOSUMNullModel", + &ProfileColumn::BLOSUMNullModel).staticmethod("BLOSUMNullModel") + ; + + class_<ProfileColumnList>("ProfileColumnList", init<>()) + .def(vector_indexing_suite<ProfileColumnList>()) + ; + + class_<ProfileHandle, ProfileHandlePtr>("ProfileHandle", init<>()) + .def("__len__",&ProfileHandle::size) + .def("AddColumn", &ProfileHandle::push_back) + .def("Extract", &ProfileHandle::Extract) + .add_property("null_model", make_function(&ProfileHandle::GetNullModel, + return_value_policy<copy_const_reference>())) + .add_property("columns", + make_function(&ProfileHandle::GetColumns, + return_value_policy<copy_const_reference>())) + .add_property("avg_entropy", &ProfileHandle::GetAverageEntropy) + .add_property("sequence", &ProfileHandle::GetSequence) + ; + + class_<ProfileDB, ProfileDBPtr>("ProfileDB", init<>()) + .def("Load", &ProfileDB::Load).staticmethod("Load") + .def("Save", &ProfileDB::Save) + .def("AddProfile", &ProfileDB::AddProfile) + .def("GetProfile", &ProfileDB::GetProfile) + .def("Size", &ProfileDB::size) + .def("GetNames",&wrap_get_names) + ; +} diff --git a/modules/seq/base/pymod/wrap_seq.cc b/modules/seq/base/pymod/wrap_seq.cc index d676f8757..f937590d5 100644 --- a/modules/seq/base/pymod/wrap_seq.cc +++ b/modules/seq/base/pymod/wrap_seq.cc @@ -21,10 +21,11 @@ using namespace boost::python; void export_sequence(); void export_hmm(); +void export_profile_handle(); BOOST_PYTHON_MODULE(_ost_seq) { export_sequence(); export_hmm(); - + export_profile_handle(); } diff --git a/modules/seq/base/src/CMakeLists.txt b/modules/seq/base/src/CMakeLists.txt index 3ba4fcf21..80247afad 100644 --- a/modules/seq/base/src/CMakeLists.txt +++ b/modules/seq/base/src/CMakeLists.txt @@ -17,6 +17,7 @@ aligned_column_iterator.hh invalid_sequence.hh views_from_sequences.hh hmm.hh +profile_handle.hh ) set(OST_SEQ_SOURCES @@ -30,6 +31,7 @@ alignment_handle.cc sequence_op.cc views_from_sequences.cc hmm.cc +profile_handle.cc ) if (ENABLE_INFO) diff --git a/modules/seq/base/src/hmm.hh b/modules/seq/base/src/hmm.hh index 408dc5bc6..f706911a6 100644 --- a/modules/seq/base/src/hmm.hh +++ b/modules/seq/base/src/hmm.hh @@ -3,6 +3,7 @@ #include <stdint.h> #include <string> +#include <string.h> // for memcpy, etc #include <vector> #include <map> #include <fstream> diff --git a/modules/seq/base/src/profile_handle.cc b/modules/seq/base/src/profile_handle.cc new file mode 100644 index 000000000..0feebe958 --- /dev/null +++ b/modules/seq/base/src/profile_handle.cc @@ -0,0 +1,220 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#include <fstream> +#include <iostream> +#include <ost/seq/profile_handle.hh> + +namespace ost { namespace seq{ + +////////////////////////////////////////////////////////////////////////////// +// ProfileColumn +////////////////////////////////////////////////////////////////////////////// + +ProfileColumn ProfileColumn::BLOSUMNullModel() { + ProfileColumn col; + + col.freq_[0] = 0.0732; + col.freq_[1] = 0.0250; + col.freq_[2] = 0.0542; + col.freq_[3] = 0.0547; + col.freq_[4] = 0.0447; + col.freq_[5] = 0.0751; + col.freq_[6] = 0.0261; + col.freq_[7] = 0.0700; + col.freq_[8] = 0.1011; + col.freq_[9] = 0.0584; + col.freq_[10] = 0.0246; + col.freq_[11] = 0.0463; + col.freq_[12] = 0.0351; + col.freq_[13] = 0.0326; + col.freq_[14] = 0.0484; + col.freq_[15] = 0.0573; + col.freq_[16] = 0.0505; + col.freq_[17] = 0.0758; + col.freq_[18] = 0.0124; + col.freq_[19] = 0.0345; + + return col; +} + +int ProfileColumn::GetIndex(char olc) { + if (olc == 'A') return 0; + if (olc >= 'C' && olc <= 'I') return (olc-'A') - 1; + if (olc >= 'K' && olc <= 'N') return (olc-'A') - 2; + if (olc >= 'P' && olc <= 'T') return (olc-'A') - 3; + if (olc == 'V' ) return 17; + if (olc == 'W' ) return 18; + if (olc == 'Y' ) return 19; + return -1; +} + +Real ProfileColumn::GetFreq(char ch) const{ + int idx = ProfileColumn::GetIndex(ch); + if (idx == -1) { + throw Error("Invalid One Letter Code observed when getting frequency in " + "ProfileColumn!"); + } + return freq_[idx]; +} + +void ProfileColumn::SetFreq(char ch, Real freq){ + int idx = ProfileColumn::GetIndex(ch); + if (idx == -1) { + throw Error("Invalid One Letter Code observed when setting frequency in " + "ProfileColumn!"); + } + freq_[idx] = freq; +} + +Real ProfileColumn::GetEntropy() const { + Real entropy = 0.0; + for (uint i = 0; i < 20; ++i) { + if (freq_[i] > 0.0) { + entropy -= log(freq_[i]) * freq_[i]; + } + } + return entropy; +} + +////////////////////////////////////////////////////////////////////////////// +// ProfileHandle +////////////////////////////////////////////////////////////////////////////// + +ProfileHandlePtr ProfileHandle::Extract(uint from, uint to){ + + if (to <= from) { + throw Error("Second index must be bigger than first one!"); + } + + if (to > this->size()) { + throw Error("Invalid index!"); + } + + ProfileHandlePtr return_prof(new ProfileHandle); + return_prof->SetNullModel(null_model_); + for(uint i = from; i < to; ++i){ + return_prof->push_back(columns_[i]); + } + return_prof->SetSequence(seq_.substr(from, to-from)); + + return return_prof; +} + +Real ProfileHandle::GetAverageEntropy() const { + Real n_eff = 0.0; + int n = 0; + for (std::vector<ProfileColumn>::const_iterator + i = this->columns_begin(), e = this->columns_end(); i != e; ++i) { + n += 1; + n_eff += i->GetEntropy(); + } + return (n > 0) ? n_eff/n : 0.0; +} + +////////////////////////////////////////////////////////////////////////////// +// ProfileDB +////////////////////////////////////////////////////////////////////////////// + +void ProfileDB::Save(const String& filename) const{ + + std::ofstream out_stream(filename.c_str(), std::ios::binary); + + //write out total size + uint32_t total_size = data_.size(); + out_stream.write(reinterpret_cast<char*>(&total_size),4); + + //write out the data elements + char string_size; + for (std::map<String, ProfileHandlePtr>::const_iterator i = data_.begin(); + i != data_.end(); ++i){ + string_size = static_cast<char>(i->first.size()); + out_stream.write(reinterpret_cast<char*>(&string_size),1); + out_stream.write(i->first.c_str(),string_size); + out_stream << *i->second; + } + out_stream.close(); +} + +ProfileDBPtr ProfileDB::Load(const String& filename){ + + std::ifstream in_stream(filename.c_str(), std::ios::binary); + if (!in_stream){ + std::stringstream ss; + ss << "the file '" << filename << "' does not exist."; + throw Error(ss.str()); + } + + ProfileDBPtr db(new ProfileDB); + + //read in the total size + uint32_t total_size; + in_stream.read(reinterpret_cast<char*>(&total_size),4); + + //read in the single profiles + char string_size; + for(uint i = 0; i < total_size; ++i){ + in_stream.read(&string_size,1); + String name(string_size,'X'); + in_stream.read(&name[0],string_size); + ProfileHandlePtr prof(new ProfileHandle); + in_stream >> *prof; + db->AddProfile(name, prof); + } + + return db; +} + +void ProfileDB::AddProfile(const String& name, ProfileHandlePtr prof) { + if (name.size() > 255) { + throw Error("Name of Profile must be smaller than 256!"); + } + if (name.empty()) { + throw Error("Name must not be empty!"); + } + data_[name] = prof; +} + +ProfileHandlePtr ProfileDB::GetProfile(const String& name) const{ + std::map<String,ProfileHandlePtr>::const_iterator i = data_.find(name); + if (i == data_.end()) { + std::stringstream ss; + ss << "Profile database does not contain an entry with name "; + ss << name << "!"; + throw Error(ss.str()); + } + return i->second; +} + +std::vector<String> ProfileDB::GetNames() const{ + std::vector<String> return_vec; + return_vec.reserve(this->size()); + for (std::map<String, ProfileHandlePtr>::const_iterator i = data_.begin(); + i != data_.end(); ++i){ + return_vec.push_back(i->first); + } + return return_vec; +} + + +}} //ns diff --git a/modules/seq/base/src/profile_handle.hh b/modules/seq/base/src/profile_handle.hh new file mode 100644 index 000000000..e3642b6e1 --- /dev/null +++ b/modules/seq/base/src/profile_handle.hh @@ -0,0 +1,250 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#ifndef OST_SEQ_PROFILE_HANDLE_HH +#define OST_SEQ_PROFILE_HANDLE_HH + +#include <ost/base.hh> +#include <ost/stdint.hh> +#include <ost/message.hh> +#include <ost/seq/module_config.hh> + +#include <string.h> // for memcpy, etc +#include <vector> +#include <map> +#include <fstream> +#include <boost/shared_ptr.hpp> + +namespace ost { namespace seq { + +class ProfileHandle; +class ProfileColumn; +class ProfileDB; +typedef boost::shared_ptr<ProfileHandle> ProfileHandlePtr; +typedef boost::shared_ptr<ProfileDB> ProfileDBPtr; +typedef std::vector<ProfileColumn> ProfileColumnList; + +/// \brief Defines profile of 20 frequencies for one residue. +/// +/// Frequencies are identified by the one-letter-code for that amino acid. +/// (possible codes: ACDEFGHIKLMNPQRSTVWY) +class DLLEXPORT_OST_SEQ ProfileColumn { +public: + + /// \brief Construct a profile with all frequencies set to 0. + ProfileColumn() { + memset(freq_, 0, sizeof(freq_)); + } + + ProfileColumn(const ProfileColumn& rhs) { + memcpy(freq_, rhs.freq_, sizeof(freq_)); + } + ProfileColumn& operator= (const ProfileColumn& rhs) { + memcpy(freq_, rhs.freq_, sizeof(freq_)); + return *this; + } + + static ProfileColumn BLOSUMNullModel(); + + /// \brief Translate one-letter-code to index (0-indexing). + static int GetIndex(char ch); + + Real GetFreq(char ch) const; + + void SetFreq(char ch, Real freq); + + bool operator==(const ProfileColumn& rhs) const { + return !memcmp(freq_, rhs.freq_, sizeof(freq_)); + } + bool operator!=(const ProfileColumn& rhs) const { return !(rhs == (*this)); } + + Real* freqs_begin() { return freq_; } + Real* freqs_end() { return freq_ + 20; } + const Real* freqs_begin() const { return freq_; } + const Real* freqs_end() const { return freq_ + 20; } + + /// \brief Get entropy for this column. + Real GetEntropy() const; + + // functions to feed streams with limited accuracy of internal data + // not intended for python export + + friend std::ofstream& operator<<(std::ofstream& os, ProfileColumn& col) { + int16_t data[20]; + //transform aa_freq + for (uint i = 0; i < 20; ++i) { + data[i] = static_cast<int16_t>(col.freq_[i]*10000); + } + os.write(reinterpret_cast<char*>(data), sizeof(data)); + return os; + } + + friend std::ifstream& operator>>(std::ifstream& is, ProfileColumn& col) { + int16_t data[20]; + is.read(reinterpret_cast<char*>(data), sizeof(data)); + //transform aa_freq + for (uint i = 0; i < 20; ++i) { + col.freq_[i] = data[i] * 0.0001; + } + return is; + } + +private: + Real freq_[20]; +}; + +/// \brief Provides a profile for a sequence. +/// +/// Properties: +/// - consists of N ProfileColumn (N = #residues in sequence) +/// - contains sequence (String) of length N +/// - user must enforce consistency between sequence length and #columns +/// - contains a null_model to use for this sequence +class DLLEXPORT_OST_SEQ ProfileHandle { +public: + /// \brief Constructs an empty profile handle (sequence = '', 0 columns). + ProfileHandle() {} + + // uses compiler-generated copy- and assignment operators (work here!) + + const std::vector<ProfileColumn>& GetColumns() const { return columns_; } + + const ProfileColumn& GetNullModel() const { return null_model_; } + + void SetNullModel(const ProfileColumn& null_model) { null_model_ = null_model; } + + String GetSequence() const { return seq_; } + + void SetSequence(const String& seq) { seq_ = seq; } + + /// \brief Extract subset of profile for columns from until to-1 (0-indexing). + /// Null model is copied from this profile. + /// \throw Error if to <= from or to > size(). + ProfileHandlePtr Extract(uint from, uint to); + + /// \brief Compute average entropy over all columns. + Real GetAverageEntropy() const; + + // some functions to make it behave like a vector + + void clear() { seq_ = ""; columns_.clear(); } + + size_t size() const { return columns_.size(); } + + bool empty() const { return columns_.empty(); } + + void push_back(const ProfileColumn& c) { columns_.push_back(c); } + + ProfileColumn& operator[](size_t index) { return columns_[index]; } + + const ProfileColumn& operator[](size_t index) const { return columns_[index]; } + + ProfileColumn& at(size_t index) { return columns_.at(index); } + + const ProfileColumn& at(size_t index) const { return columns_.at(index); } + + bool operator==(const ProfileHandle& other) const { + return seq_ == other.seq_ && + columns_ == other.columns_ && + null_model_ == other.null_model_; + } + + bool operator!=(const ProfileHandle& other) const { return !(other == (*this)); } + + ProfileColumnList::const_iterator columns_end() const { return columns_.end(); } + ProfileColumnList::iterator columns_end() { return columns_.end(); } + ProfileColumnList::const_iterator columns_begin() const { return columns_.begin(); } + ProfileColumnList::iterator columns_begin() { return columns_.begin(); } + + // functions to feed streams with limited accuracy of internal data + // not intended for python export + + friend std::ofstream& operator<<(std::ofstream& os, ProfileHandle& prof) { + // null model + os << prof.null_model_; + // num. columns/residues + uint32_t size = prof.size(); + os.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + // sequence + if (prof.seq_.length() != size) { + throw Error("ProfileHandle - Inconsistency between number of columns and " + " seq. length."); + } + os.write(prof.seq_.c_str(), size); + // columns + for(uint i = 0; i < size; ++i){ + os << prof.columns_[i]; + } + + return os; + } + + friend std::ifstream& operator>>(std::ifstream& is, ProfileHandle& prof) { + // null model + is >> prof.null_model_; + // num. columns/residues + uint32_t size; + is.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + // sequence + std::vector<char> tmp_buf(size); + is.read(&tmp_buf[0], size); + prof.seq_.assign(&tmp_buf[0], size); + // columns + prof.columns_.resize(size); + for(uint i = 0; i < size; ++i){ + is >> prof.columns_[i]; + } + + return is; + } + +private: + String seq_; + ProfileColumn null_model_; + ProfileColumnList columns_; +}; + +/// \brief Contains a DB of profiles (identified by a unique name (String)). +class DLLEXPORT_OST_SEQ ProfileDB { +public: + /// \brief Saves all profiles in DB with limited accuracy of internal data. + /// Binary format with fixed-width integers (should be portable). + void Save(const String& filename) const; + + static ProfileDBPtr Load(const String& filename); + + void AddProfile(const String& name, ProfileHandlePtr prof); + + ProfileHandlePtr GetProfile(const String& name) const; + + size_t size() const { return data_.size(); } + + std::vector<String> GetNames() const; + +private: + std::map<String, ProfileHandlePtr> data_; +}; + +}} + +#endif diff --git a/modules/seq/base/tests/CMakeLists.txt b/modules/seq/base/tests/CMakeLists.txt index 5fdf6e37c..3bdf19575 100644 --- a/modules/seq/base/tests/CMakeLists.txt +++ b/modules/seq/base/tests/CMakeLists.txt @@ -5,8 +5,8 @@ set(OST_SEQ_UNIT_TESTS test_aligned_region.cc test_alignment.cc test_hmm.cc + test_profile.cc tests.cc ) ost_unittest(MODULE seq SOURCES "${OST_SEQ_UNIT_TESTS}") - diff --git a/modules/seq/base/tests/test_profile.cc b/modules/seq/base/tests/test_profile.cc new file mode 100644 index 000000000..ef2292bc7 --- /dev/null +++ b/modules/seq/base/tests/test_profile.cc @@ -0,0 +1,144 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2015 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +/* + Author: Gerardo Tauriello, Gabriel Studer + */ + +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include <boost/test/auto_unit_test.hpp> +#include <iostream> + +#include <ost/seq/profile_handle.hh> +#include <ost/io/seq/hhm_io_handler.hh> +#include <ost/io/seq/pssm_io_handler.hh> + +using namespace ost::seq; + + +BOOST_AUTO_TEST_SUITE( profile ); + + +BOOST_AUTO_TEST_CASE(profile_column) +{ + const char * olc = "ACDEFGHIKLMNPQRSTVWY"; + + // check empty column + ProfileColumn pc; + BOOST_CHECK_EQUAL(pc.GetEntropy(), Real(0)); + Real* freq_ptr = pc.freqs_begin(); + for (uint i = 0; i < 20; ++i) { + BOOST_CHECK_EQUAL(freq_ptr[i], Real(0)); + BOOST_CHECK_EQUAL(pc.GetFreq(olc[i]), Real(0)); + // check overwrite + const Real tst = Real(i)/Real(20); + pc.SetFreq(olc[i], tst); + BOOST_CHECK_EQUAL(pc.GetFreq(olc[i]), tst); + } + + // check null model + ProfileColumn nm = ProfileColumn::BLOSUMNullModel(); + BOOST_CHECK_GE(pc.GetEntropy(), Real(0)); + Real sum_freq = 0; + freq_ptr = nm.freqs_begin(); + for (uint i = 0; i < 20; ++i) { + BOOST_CHECK_GE(freq_ptr[i], Real(0)); + sum_freq += freq_ptr[i]; + } + BOOST_CHECK_CLOSE(sum_freq, Real(1), Real(1e-3)); + + // check assign/copy + ProfileColumn nm2(nm); + BOOST_CHECK(nm == nm2); + Real* freq_ptr2 = nm2.freqs_begin(); + for (uint i = 0; i < 20; ++i) { + BOOST_CHECK_EQUAL(freq_ptr[i], freq_ptr2[i]); + } + pc = nm; + BOOST_CHECK(nm == pc); +} + +BOOST_AUTO_TEST_CASE(profile_handle) +{ + // generate dummy + ProfileHandle prof; + BOOST_CHECK_EQUAL(prof.size(), size_t(0)); + BOOST_CHECK_EQUAL(prof.GetSequence(), ""); + prof.SetSequence("RRSPP"); + ProfileColumn pc0 = ProfileColumn::BLOSUMNullModel(); + prof.SetNullModel(pc0); + ProfileColumn pc1, pc2, pc3, pc4; + pc1.SetFreq('H', 0.74); + pc1.SetFreq('E', 0.01); + pc1.SetFreq('L', 0.25); + pc2.SetFreq('L', 1); + pc3.SetFreq('Y', 0.5); + pc3.SetFreq('E', 0.5); + pc4.SetFreq('A', 0.5); + pc4.SetFreq('H', 0.5); + prof.push_back(pc0); + prof.push_back(pc1); + prof.push_back(pc2); + prof.push_back(pc3); + prof.push_back(pc4); + + // check components + BOOST_CHECK_EQUAL(prof.size(), size_t(5)); + BOOST_CHECK(prof.GetNullModel() == pc0); + BOOST_CHECK_EQUAL(prof.GetSequence(), "RRSPP"); + + // check columns with different methods of getting the columns + BOOST_CHECK(prof[0] == pc0); + BOOST_CHECK(*(prof.columns_begin()+1) == pc1); + BOOST_CHECK(prof.GetColumns()[2] == pc2); + BOOST_CHECK(prof.at(3) == pc3); + BOOST_CHECK(*(prof.columns_end()-1) == pc4); + + // check entropies + Real entropy1 = -0.74*log(0.74) - 0.01*log(0.01) - 0.25*log(0.25); + Real entropy2 = 0; + Real entropy3 = -log(0.5); + Real avg_entropy = (pc0.GetEntropy() + entropy1 + entropy2 + 2*entropy3)/5; + BOOST_CHECK_CLOSE(prof[1].GetEntropy(), entropy1, Real(1e-3)); + BOOST_CHECK_CLOSE(prof[2].GetEntropy(), entropy2, Real(1e-3)); + BOOST_CHECK_CLOSE(prof[3].GetEntropy(), entropy3, Real(1e-3)); + BOOST_CHECK_CLOSE(prof.GetAverageEntropy(), avg_entropy, Real(1e-3)); + + // extract parts + ProfileHandlePtr pp = prof.Extract(1, 4); + avg_entropy = (entropy1 + entropy2 + entropy3)/3; + BOOST_CHECK_EQUAL(pp->size(), size_t(3)); + BOOST_CHECK_EQUAL(pp->GetSequence(), "RSP"); + BOOST_CHECK_CLOSE(pp->GetAverageEntropy(), avg_entropy, Real(1e-3)); + + pp = prof.Extract(0, 3); + avg_entropy = (pc0.GetEntropy() + entropy1 + entropy2)/3; + BOOST_CHECK_EQUAL(pp->size(), size_t(3)); + BOOST_CHECK_EQUAL(pp->GetSequence(), "RRS"); + BOOST_CHECK_CLOSE(pp->GetAverageEntropy(), avg_entropy, Real(1e-3)); + + pp = prof.Extract(3, 5); + BOOST_CHECK_EQUAL(pp->size(), size_t(2)); + BOOST_CHECK_EQUAL(pp->GetSequence(), "PP"); + BOOST_CHECK_CLOSE(pp->GetAverageEntropy(), entropy3, Real(1e-3)); +} + + +BOOST_AUTO_TEST_SUITE_END(); -- GitLab