diff --git a/modules/db/CMakeLists.txt b/modules/db/CMakeLists.txt index 6955f55f9b8d0237224a2d4aab25aa3b5310003d..4468b05a8c86e042baeebb9b56f19468d9f2e4b5 100644 --- a/modules/db/CMakeLists.txt +++ b/modules/db/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(src) -add_subdirectory(pymod) \ No newline at end of file +add_subdirectory(pymod) +add_subdirectory(tests) diff --git a/modules/db/doc/db.rst b/modules/db/doc/db.rst new file mode 100644 index 0000000000000000000000000000000000000000..2333b5aa91875af5724461decb213d18462545ab --- /dev/null +++ b/modules/db/doc/db.rst @@ -0,0 +1,306 @@ +Linear Database +=============================================================================== + +.. currentmodule:: ost.db + + +Many applications require to load lots of structures. Expecially on distributed +file systems, io becomes a problem. +OST provides a linear database to dump position data, e.g. CA positions, or +character data, e.g. sequences, to allow fast retrieval. +The actual data container behave like linear data arrays and the idea is to +use an indexer to keep track of where to find data for a certain entry. + +.. class:: LinearIndexer() + + The idea of the :class:`LinearIndexer` is to keep track of locations + of data assuming a linear memory layout. The level of entries in the indexer + are assemblies that can contain an arbitrary number of chains with varying + length. Whenever a new assembly is added, a range enclosing all residues of + that assembly is defined that is subsequent to the range of the previously + added assembly. It is then not only possible to access the range of the full + assembly, but also the range of single chains. + Whenever an assembly with *n* residues is deleted, the ranges of all + assemblies that have been added later on are reduced by *n*. + + .. method:: Load(filename) + + Loads indexer from file + + :param filename: Path to file to be loaded + :type filename: :class:`str` + :returns: The loaded indexer + :rtype: :class:`LinearIndexer` + :raises: :exc:`ost.Error` if *filename* cannot be + opened + + + .. method:: Save(filename) + + Saves indexer to file + + :param filename: Path to file where the indexer is stored + :type filename: :class:`str` + :raises: :exc:`ost.Error` if *filename* cannot be + created + + + .. method:: AddAssembly(name, chain_names, chain_lenths) + + Adds a new assembly to the indexer. The range assigned to that assembly is + subsequent to the previously added assembly. + + :param name: Name of the added assembly + :param chain_names: Names of all chains of the added assembly + :param chain_lengths: The according lengths of the chains + :type name: :class:`str` + :type chain_names: :class:`list` of :class:`str` + :type chain_lengths: :class:`list` of :class:`int` + + :raises: :exc:`ost.Error` if lengths of + *chain_names* and *chain_lengths* is inconsistent + + + .. method:: RemoveAssembly(name) + + Removes an assembly from the indexer. Assuming that assembly contains + a total of *n* residues, all ranges of the subsequent assemblies are + reduced by *n*. + + :param name: Name of the assembly to be removed + :type param: :class:`str` + + :raises: :exc:`ost.Error` if *name* is not present + + + .. method:: GetAssemblies() + + :returns: The names of all added assemblies + :rtype: :class:`list` of :class:`str` + :raises: :exc:`ost.Error` if *name* is not present + + + .. method:: GetChainNames(name) + + :param name: Name of assembly from which you want the chain names + :type name: :class:`str` + + :returns: The chain names of the specified assembly + :rtype: :class:`list` of :class:`str` + + :raises: :exc:`ost.Error` if *name* is not present + + + .. method:: GetChainLengths(name) + + :param name: Name of assembly from which you want the chain lengths + :type name: :class:`str` + + :returns: The chain lengths of the specified assembly + :rtype: :class:`list` of :class:`int` + + :raises: :exc:`ost.Error` if *name* is not present + + + .. method:: GetDataRange(name) + + Get the range for a full assembly + + :param name: Name of the assembly from which you want the range + :type name: :class:`str` + :returns: Two values defining the range as [from, to[ + :rtype: :class:`tuple` of :class:`int` + + :raises: :exc:`ost.Error` if *name* is not present + + + .. method:: GetDataRange(name, chain_name) + + Get the range for a chain of an assembly + + :param name: Name of the assembly from which you want the range + :param chain_name: Name of the chain from which you want the range + :type name: :class:`str` + :type chain_name: :class:`str` + :returns: Two values defining the range as [from, to[ + :rtype: :class:`tuple` of :class:`int` + + :raises: :exc:`ost.Error` if *name* is not present + or the according assembly has no chain with specified + chain name + + + .. method:: GetNumResidues() + + :returns: The total number of residues in all added assemblies + :rtype: :class:`int` + + + +.. class:: LinearCharacterContainer() + + The :class:`LinearCharacterContainer` stores characters in a linear memory + layout that can represent sequences such as SEQRES or ATOMSEQ. + It can be accessed using range parameters and the idea is to keep it in sync + with a :class:`LinearIndexer`. + + .. method:: Load(filename) + + Loads container from file + + :param filename: Path to file to be loaded + :type filename: :class:`str` + :returns: The loaded container + :rtype: :class:`LinearCharacterContainer` + :raises: :exc:`ost.Error` if *filename* cannot be + opened + + + .. method:: Save(filename) + + Saves container to file + + :param filename: Path to file where the container is stored + :type filename: :class:`str` + :raises: :exc:`ost.Error` if *filename* cannot be + created + + .. method:: AddCharacters(characters) + + Adds *characters* at the end of the internal data. Call this function with + appropriate data whenever you add an assembly to the associated + :class:`LinearIndexer` + + :param characters: Characters to be added + :type characters: :class:`str` + + + .. method:: ClearRange(range) + + Removes all characters specified by *range* in form [*from*, *to* [ + from the internal data. + The internal data layout is linear, all characters starting from *to* + are shifted to the location defined by *from*. + Call this function with appropriate range whenever you remove an assembly + from the associated :class:`LinearIndexer` + + :param range: Range to be deleted in form [from, to[ + :type range: :class:`tuple` of :class:`int` + + :raises: :exc:`ost.Error` if *range* does not + specify a valid range + + + .. method:: GetCharacter(idx) + + :returns: The character at the specified location + :rtype: :class:`str` + :raises: :exc:`ost.Error` if *idx* does not + specify a valid position + + + .. method:: GetCharacters(range) + + :returns: The characters from the specified range + :rtype: :class:`str` + :raises: :exc:`ost.Error` if *range* does not + specify a valid range + + .. method:: GetNumElements() + + :returns: The number of stored characters + :rypte: :class:`int` + + + +.. class:: LinearPositionContainer() + + The :class:`LinearPositionContainer` stores positions in a linear memory + layout. + It can be accessed using range parameters and the idea is to keep it in sync + with a :class:`LinearIndexer`. In order to save some memory, a lossy + compression is applied that results in a limited accuracy of two digits. + if the absolute value of your added position is very large (> ~10000), + the accuracy is further lowered to one digit. This is all handled internally. + + .. method:: Load(filename) + + Loads container from file + + :param filename: Path to file to be loaded + :type filename: :class:`str` + :returns: The loaded container + :rtype: :class:`LinearPositionContainer` + :raises: :exc:`ost.Error` if *filename* cannot be + opened + + + .. method:: Save(filename) + + Saves container to file + + :param filename: Path to file where the container is stored + :type filename: :class:`str` + :raises: :exc:`ost.Error` if *filename* cannot be + created + + + .. method:: AddPositions(positions) + + Adds *positions* at the end of the internal data. Call this function with + appropriate data whenever you add an assembly to the associated + :class:`LinearIndexer` + + :param positions: Positions to be added + :type positions: :class:`ost.geom.Vec3List` + + + .. method:: ClearRange(range) + + Removes all positions specified by *range* in form [*from*, *to* [ + from the internal data. + The internal data layout is linear, all positions starting from *to* + are shifted to the location defined by *from*. + Call this function with appropriate range whenever you remove an assembly + from the associated :class:`LinearIndexer` + + :param range: Range to be deleted in form [from, to[ + :type range: :class:`tuple` of :class:`int` + + :raises: :exc:`ost.Error` if *range* does not + specify a valid range + + + .. method:: GetPosition(idx, pos) + + Extracts a position at specified location. For efficiency reasons, + the function requires the position to be passed as reference. + + :param idx: Specifies location + :param pos: Will be altered to the desired position + :type idx: :class:`int` + :type pos: :class:`ost.geom.Vec3` + + :raises: :exc:`ost.Error` if *idx* does not + specify a valid position + + + .. method:: GetPositions(range, positions) + + Extracts positions at specified range. For efficiency reasons, the function + requires the positions to be passed as reference. + + :param range: Range in form [from,to[ that defines positions + to be extracted + :param positions: Will be altered to the desired positions + :type range: :class:`tuple` of :class:`int` + :type positions: :class:`ost.geom.Vec3List` + + :raises: :exc:`ost.Error` if *range* does not + specify a valid range + + .. method:: GetNumElements() + + :returns: The number of stored positions + :rypte: :class:`int` + diff --git a/modules/db/pymod/CMakeLists.txt b/modules/db/pymod/CMakeLists.txt index d946237d3095ab263c2079a2ed67ea665b058199..88c8213b12107efdfb1e9f5f655ee46373a6968d 100644 --- a/modules/db/pymod/CMakeLists.txt +++ b/modules/db/pymod/CMakeLists.txt @@ -1,5 +1,6 @@ set(OST_DB_PYMOD_SOURCES - "" +export_linear_db.cc +wrap_db.cc ) pymod(NAME db CPP ${OST_DB_PYMOD_SOURCES} PY __init__.py) \ No newline at end of file diff --git a/modules/db/pymod/__init__.py b/modules/db/pymod/__init__.py index 2ce41f6fd62e717440ebff41ff00dea34d9d1abc..d443def87175ede548bcd9ff0140f3068e391c36 100644 --- a/modules/db/pymod/__init__.py +++ b/modules/db/pymod/__init__.py @@ -16,4 +16,4 @@ # along with this library; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #------------------------------------------------------------------------------ -#nothing yet to import +from _ost_db import * diff --git a/modules/db/pymod/export_linear_db.cc b/modules/db/pymod/export_linear_db.cc new file mode 100644 index 0000000000000000000000000000000000000000..314f49ae6d001bdc9c4d1258544d719ad8f0513c --- /dev/null +++ b/modules/db/pymod/export_linear_db.cc @@ -0,0 +1,208 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <boost/python.hpp> +#include <ost/db/linear_indexer.hh> +#include <ost/db/binary_container.hh> +#include <ost/db/extract_data_helper.hh> + +using namespace boost::python; +using namespace ost::db; + + +namespace{ + +template<typename T> +void ListToVec(const list& l, std::vector<T>& v) { + uint num_elements = len(l); + v.resize(num_elements); + for(uint i = 0; i < num_elements; ++i) { + v[i] = extract<T>(l[i]); + } +} + + +template<typename T> +void VecToList(const std::vector<T>& v, list& l) { + l = list(); + for(typename std::vector<T>::const_iterator i = v.begin(); + i != v.end(); ++i) { + l.append(*i); + } +} + + +void WrapAddAssembly(LinearIndexerPtr indexer, + const String& name, + const list& chain_names, + const list& chain_lengths) { + + std::vector<String> v_chain_names; + std::vector<uint> v_chain_lengths; + ListToVec(chain_names, v_chain_names); + ListToVec(chain_lengths, v_chain_lengths); + indexer->AddAssembly(name, v_chain_names, v_chain_lengths); +} + + +list WrapGetAssemblies(LinearIndexerPtr indexer) { + + std::vector<String> entry_names = indexer->GetAssemblies(); + list return_list; + VecToList(entry_names, return_list); + return return_list; +} + + +list WrapGetChainNames(LinearIndexerPtr indexer, const String& name) { + + std::vector<String> chain_names = indexer->GetChainNames(name); + list return_list; + VecToList(chain_names, return_list); + return return_list; +} + + +list WrapGetChainLengths(LinearIndexerPtr indexer, const String& name) { + + std::vector<uint> chain_lengths = indexer->GetChainLengths(name); + list return_list; + VecToList(chain_lengths, return_list); + return return_list; +} + + +tuple WrapGetDataRangeAssembly(LinearIndexerPtr indexer, + const String& name) { + + std::pair<uint64_t, uint64_t> range = indexer->GetDataRange(name); + tuple return_tuple = make_tuple(range.first, range.second); + return return_tuple; +} + + +tuple WrapGetDataRange(LinearIndexerPtr indexer, + const String& name, + const String& chain_name) { + + std::pair<uint64_t, uint64_t> range = indexer->GetDataRange(name, chain_name); + tuple return_tuple = make_tuple(range.first, range.second); + return return_tuple; +} + +void WrapClearCharacterRange(LinearCharacterContainerPtr container, + tuple range) { + + std::pair<uint64_t, uint64_t> p_range; + p_range.first = extract<uint64_t>(range[0]); + p_range.second = extract<uint64_t>(range[1]); + + container->ClearRange(p_range); +} + +String WrapGetCharacters(LinearCharacterContainerPtr container, + tuple& range) { + + std::pair<uint64_t, uint64_t> p_range; + p_range.first = extract<uint64_t>(range[0]); + p_range.second = extract<uint64_t>(range[1]); + String return_string; + container->GetCharacters(p_range, return_string); + return return_string; +} + +void WrapClearPositionRange(LinearPositionContainerPtr container, + tuple& range) { + + std::pair<uint64_t, uint64_t> p_range; + p_range.first = extract<uint64_t>(range[0]); + p_range.second = extract<uint64_t>(range[1]); + container->ClearRange(p_range); +} + +void WrapGetPositions(LinearPositionContainerPtr container, + tuple& range, + geom::Vec3List& positions) { + + std::pair<uint64_t, uint64_t> p_range; + p_range.first = extract<uint64_t>(range[0]); + p_range.second = extract<uint64_t>(range[1]); + container->GetPositions(p_range, positions); +} + +} + + +void export_linear_db() { + + class_<LinearIndexer, LinearIndexerPtr>("LinearIndexer", init<>()) + .def("Load", &LinearIndexer::Load, arg("filename")).staticmethod("Load") + .def("Save", &LinearIndexer::Save, arg("filename")) + .def("AddAssembly", &WrapAddAssembly, (arg("name"), + arg("chain_names"), + arg("chain_lengths"))) + .def("RemoveAssembly", &LinearIndexer::RemoveAssembly, (arg("name"))) + .def("GetAssemblies", &WrapGetAssemblies) + .def("GetChainNames", &WrapGetChainNames,arg("name")) + .def("GetChainLengths", &WrapGetChainLengths,arg("name")) + .def("GetDataRange", &WrapGetDataRangeAssembly,arg("name")) + .def("GetDataRange", &WrapGetDataRange,(arg("name"), + arg("chain_name"))) + .def("GetNumResidues", &LinearIndexer::GetNumResidues) + .def("__eq__", &LinearIndexer::operator==) + .def("__ne__", &LinearIndexer::operator!=) + ; + + + class_<LinearCharacterContainer, + LinearCharacterContainerPtr>("LinearCharacterContainer", init<>()) + .def("Load", &LinearCharacterContainer::Load, arg("filename")).staticmethod("Load") + .def("Save", &LinearCharacterContainer::Save, arg("filename")) + .def("ClearRange", &WrapClearCharacterRange, arg("range")) + .def("AddCharacters", &LinearCharacterContainer::AddCharacters, arg("characters")) + .def("GetCharacter", &LinearCharacterContainer::GetCharacter, arg("idx")) + .def("GetCharacters", &WrapGetCharacters, arg("range")) + .def("GetNumElements", &LinearCharacterContainer::GetNumElements) + .def("__eq__", &LinearCharacterContainer::operator==) + .def("__ne__", &LinearCharacterContainer::operator!=) + ; + + + class_<LinearPositionContainer, + LinearPositionContainerPtr>("LinearPositionContainer", init<>()) + .def("Load", &LinearPositionContainer::Load, arg("filename")).staticmethod("Load") + .def("Save", &LinearPositionContainer::Save, arg("filename")) + .def("ClearRange", &WrapClearPositionRange, arg("range")) + .def("AddPositions", &LinearPositionContainer::AddPositions, arg("positions")) + .def("GetPosition", &LinearPositionContainer::GetPosition, arg("idx"), arg("pos")) + .def("GetPositions", &WrapGetPositions, (arg("range"),arg("positions"))) + .def("GetNumElements", &LinearPositionContainer::GetNumElements) + .def("__eq__", &LinearPositionContainer::operator==) + .def("__ne__", &LinearPositionContainer::operator!=) + ; + + def("ExtractValidPositions", &ExtractValidPositions, (arg("entry_name"), + arg("chain_name"), + arg("indexer"), + arg("atomseq_container"), + arg("position_container"), + arg("seq"), arg("positions"))); +} + diff --git a/modules/db/pymod/wrap_db.cc b/modules/db/pymod/wrap_db.cc new file mode 100644 index 0000000000000000000000000000000000000000..883c6a33eb374e82acdd4882afe3fa00a86066be --- /dev/null +++ b/modules/db/pymod/wrap_db.cc @@ -0,0 +1,28 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <boost/python.hpp> + +void export_linear_db(); + +BOOST_PYTHON_MODULE(_ost_db) +{ + export_linear_db(); +} diff --git a/modules/db/src/CMakeLists.txt b/modules/db/src/CMakeLists.txt index dd75c8c843d256579e6a308490ba36df6bd2709b..77bc7829f9afafbfdc2228c8da42fff204e92046 100644 --- a/modules/db/src/CMakeLists.txt +++ b/modules/db/src/CMakeLists.txt @@ -2,14 +2,21 @@ set(OST_DB_HEADERS sqlite_wrap.hh sqlite_conv.hh module_config.hh +linear_indexer.hh +paged_array.hh +binary_container.hh +extract_data_helper.hh ) set(OST_DB_SOURCES sqlite_wrap.cc +linear_indexer.cc +binary_container.cc +extract_data_helper.cc ) module(NAME db SOURCES ${OST_DB_SOURCES} HEADERS ${OST_DB_HEADERS} - DEPENDS_ON ost_base sqlite3) + DEPENDS_ON ost_base ost_geom ost_seq sqlite3) if(WIN32) set_target_properties(ost_db PROPERTIES LINK_FLAGS "/DEF:${CMAKE_CURRENT_SOURCE_DIR}/sqlite3.def") add_definitions(/DSQLITE_ENABLE_COLUMN_METADATA) diff --git a/modules/db/src/binary_container.cc b/modules/db/src/binary_container.cc new file mode 100644 index 0000000000000000000000000000000000000000..a82723c52e67121acb08675a3313429886115d3c --- /dev/null +++ b/modules/db/src/binary_container.cc @@ -0,0 +1,235 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <ost/db/binary_container.hh> +#include <ost/message.hh> + + +namespace ost{ namespace db{ + +LinearCharacterContainerPtr LinearCharacterContainer::Load(const String& fn) { + + std::ifstream in_stream(fn.c_str(), std::ios::binary); + if (!in_stream) { + throw ost::Error("Could not open " + fn); + } + LinearCharacterContainerPtr loaded_container(new LinearCharacterContainer); + loaded_container->data_.Read(in_stream); + loaded_container->num_elements_ = loaded_container->data_.size(); + return loaded_container; +} + +void LinearCharacterContainer::Save(const String& filename) const { + + std::ofstream out_stream(filename.c_str(), std::ios::binary); + if (!out_stream) { + throw ost::Error("Could not open " + filename); + } + + data_.Write(out_stream); +} + +void LinearCharacterContainer::AddCharacters(const String& characters) { + for(uint i = 0; i < characters.size(); ++i) { + data_.push_back(characters[i]); + } + num_elements_ = data_.size(); +} + +void LinearCharacterContainer::ClearRange(std::pair<uint64_t, uint64_t> range) { + + if(range.second <= range.first) { + throw ost::Error("Cannot clear invalid range in " + "LinearCharacterContainer!"); + } + + if(range.first >= num_elements_ || range.second > num_elements_) { + throw ost::Error("Cannot clear invalid range in " + "LinearCharacterContainer!"); + } + + data_.ClearRange(range.first, range.second); + num_elements_ = data_.size(); +} + +char LinearCharacterContainer::GetCharacter(uint64_t idx) const { + + if(idx >= num_elements_) { + throw ost::Error("Cannot access invalid position in " + "LinearCharacterContainer!"); + } + + return data_[idx]; +} + +void LinearCharacterContainer::GetCharacters(std::pair<uint64_t, uint64_t> range, + String& s) const { + + if(range.second <= range.first) { + throw ost::Error("Cannot access invalid position in " + "LinearCharacterContainer!"); + } + + if(range.first >= num_elements_ || range.second > num_elements_) { + throw ost::Error("Cannot access invalid position in " + "LinearCharacterContainer!"); + } + + uint64_t size = range.second - range.first; + s = String(size, '-'); + for(uint64_t i = 0; i < size; ++i) { + s[i] = data_[range.first + i]; + } +} + + +LinearPositionContainerPtr LinearPositionContainer::Load(const String& fn) { + + std::ifstream in_stream(fn.c_str(), std::ios::binary); + if (!in_stream) { + throw ost::Error("Could not open " + fn); + } + LinearPositionContainerPtr loaded_container(new LinearPositionContainer); + loaded_container->data_.Read(in_stream); + loaded_container->num_elements_ = loaded_container->data_.size(); + return loaded_container; +} + +void LinearPositionContainer::Save(const String& filename) const { + + std::ofstream out_stream(filename.c_str(), std::ios::binary); + if (!out_stream) { + throw ost::Error("Could not open " + filename); + } + + data_.Write(out_stream); +} + + +void LinearPositionContainer::AddPositions(const geom::Vec3List& positions) { + + for(uint i = 0; i < positions.size(); ++i) { + + int64_t data = 0; + bool low_accuracy = false; + + int64_t int_x_pos = std::floor(positions[i][0] * Real(100) + Real(0.5)); + int64_t int_y_pos = std::floor(positions[i][1] * Real(100) + Real(0.5)); + int64_t int_z_pos = std::floor(positions[i][2] * Real(100) + Real(0.5)); + + if(int_x_pos < CUSTOM_INT_MIN || int_x_pos > CUSTOM_INT_MAX || + int_y_pos < CUSTOM_INT_MIN || int_y_pos > CUSTOM_INT_MAX || + int_z_pos < CUSTOM_INT_MIN || int_z_pos > CUSTOM_INT_MAX) { + + // let's try to bring it into the allowed range using a factor of + // ten (results in only one digit of accuracy) + int_x_pos = std::floor(positions[i][0] * Real(10) + Real(0.5)); + int_y_pos = std::floor(positions[i][1] * Real(10) + Real(0.5)); + int_z_pos = std::floor(positions[i][2] * Real(10) + Real(0.5)); + + if(int_x_pos < CUSTOM_INT_MIN || int_x_pos > CUSTOM_INT_MAX || + int_y_pos < CUSTOM_INT_MIN || int_y_pos > CUSTOM_INT_MAX || + int_z_pos < CUSTOM_INT_MIN || int_z_pos > CUSTOM_INT_MAX) { + // we don't go lower in accuracy, let's just throw an error + throw ost::Error("Out of bound position observed!"); + } + + low_accuracy = true; + } + + int_x_pos += CUSTOM_INT_MAX; + int_y_pos += CUSTOM_INT_MAX; + int_z_pos += CUSTOM_INT_MAX; + + data += (int_x_pos << X_POS_SHIFT); + data += (int_y_pos << Y_POS_SHIFT); + data += (int_z_pos << Z_POS_SHIFT); + + // as it is constructed, data is guaranteed to be positive and we only use + // 63 bits. + // Let's misuse the sign bit as a flag, whether we have a low accuracy + // representation + if(low_accuracy) { + data += (static_cast<int64_t>(1) << 63); + } + + data_.push_back(data); + } + + num_elements_ = data_.size(); +} + + +void LinearPositionContainer::ClearRange(std::pair<uint64_t, uint64_t> range) { + + if(range.second <= range.first) { + throw ost::Error("Cannot clear invalid range in " + "LinearPositionContainer!"); + } + + if(range.first >= num_elements_ || range.second > num_elements_) { + throw ost::Error("Cannot clear invalid range in " + "LinearPositionContainer!"); + } + + data_.ClearRange(range.first, range.second); + num_elements_ = data_.size(); +} + + +void LinearPositionContainer::GetPosition(uint64_t idx, geom::Vec3& pos) const { + + if(idx >= num_elements_) { + throw ost::Error("Cannot access invalid position in " + "LinearPositionContainer!"); + } + + int64_t data = data_[idx]; + Real factor = (data < 0) ? 0.1 : 0.01; + pos[0] = factor * Real(static_cast<int>(((data & X_POS_MASK) >> X_POS_SHIFT)) - + CUSTOM_INT_MAX); + pos[1] = factor * Real(static_cast<int>(((data & Y_POS_MASK) >> Y_POS_SHIFT)) - + CUSTOM_INT_MAX); + pos[2] = factor * Real(static_cast<int>(((data & Z_POS_MASK) >> Z_POS_SHIFT)) - + CUSTOM_INT_MAX); +} + +void LinearPositionContainer::GetPositions(std::pair<uint64_t, uint64_t> range, + geom::Vec3List& positions) const { + + if(range.second <= range.first) { + throw ost::Error("Cannot access invalid position in " + "LinearPositionContainer!"); + } + + if(range.first >= num_elements_ || range.second > num_elements_) { + throw ost::Error("Cannot access invalid position in " + "LinearPositionContainer!"); + } + + int num_positions = range.second - range.first; + positions.resize(num_positions); + + for(int i = 0; i < num_positions; ++i) { + this->GetPosition(range.first + i, positions[i]); + } +} + +}} //ns diff --git a/modules/db/src/binary_container.hh b/modules/db/src/binary_container.hh new file mode 100644 index 0000000000000000000000000000000000000000..3648513d43129fa0b8fd16789710c465ca5a917b --- /dev/null +++ b/modules/db/src/binary_container.hh @@ -0,0 +1,138 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#ifndef OST_DB_LINEAR_BINARY_CONTAINER_HH +#define OST_DB_LINEAR_BINARY_CONTAINER_HH + +#include <vector> +#include <boost/shared_ptr.hpp> +#include <ost/base.hh> +#include <ost/stdint.hh> +#include <ost/geom/vec3.hh> +#include <ost/db/paged_array.hh> + + +namespace ost { namespace db { + + +class LinearCharacterContainer; +class LinearPositionContainer; +typedef boost::shared_ptr<LinearCharacterContainer> LinearCharacterContainerPtr; +typedef boost::shared_ptr<LinearPositionContainer> LinearPositionContainerPtr; + + +class LinearCharacterContainer { + +public: + + LinearCharacterContainer(): num_elements_(0) { } + + static LinearCharacterContainerPtr Load(const std::string& filename); + + void Save(const std::string& filename) const; + + void AddCharacters(const std::string& characters); + + void ClearRange(std::pair<uint64_t, uint64_t> range); + + char GetCharacter(uint64_t idx) const; + + void GetCharacters(std::pair<uint64_t, uint64_t> range, String& s) const; + + uint64_t GetNumElements() const { return num_elements_; } + + bool operator==(const LinearCharacterContainer& other) const { + return data_ == other.data_; // no check for num_elements_ + // they are coupled anyway + } + + bool operator!=(const LinearCharacterContainer& other) const { + return !(*this == other); + } + +private: + + PagedArray<char, 1048576> data_; + // we track number of entries manually for fast checking of indices + // the .size() function of data_ is too slow + uint64_t num_elements_; +}; + + + + +class LinearPositionContainer { + +public: + + LinearPositionContainer(): num_elements_(0) { } + + static LinearPositionContainerPtr Load(const std::string& filename); + + void Save(const std::string& filename) const; + + void AddPositions(const geom::Vec3List& positions); + + void ClearRange(std::pair<uint64_t, uint64_t> range); + + void GetPosition(uint64_t idx, geom::Vec3& pos) const; + + void GetPositions(std::pair<uint64_t, uint64_t> range, + geom::Vec3List& positions) const; + + uint64_t GetNumElements() const { return num_elements_; } + + bool operator==(const LinearPositionContainer& other) const { + return data_ == other.data_; // no check for num_elements_ + // they are coupled anyway + } + + bool operator!=(const LinearPositionContainer& other) const { + return !(*this == other); + } + +private: + + // 21 bits starting at index 1 are set + static const uint64_t X_POS_MASK = 9223367638808264704; + static const int X_POS_SHIFT = 42; + + // 21 bits starting at index 22 are set + static const uint64_t Y_POS_MASK = 4398044413952; + static const int Y_POS_SHIFT = 21; + + // 21 bits starting at index 43 (the last 21 bits) are set + static const uint64_t Z_POS_MASK = 2097151; + static const int Z_POS_SHIFT = 0; + + // range that can be represented as 21 bit integer + static const int CUSTOM_INT_MIN = -1048576; + static const int CUSTOM_INT_MAX = 1048576; + + PagedArray<uint64_t, 1048576> data_; + // we track number of entries manually for fast checking of indices + // the .size() function of data_ is too slow + uint64_t num_elements_; +}; + + +}} //ns + +#endif diff --git a/modules/db/src/extract_data_helper.cc b/modules/db/src/extract_data_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..fba11bbf0f9c81b20f66495e6b4af8bb840e39e1 --- /dev/null +++ b/modules/db/src/extract_data_helper.cc @@ -0,0 +1,51 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +#include <ost/db/extract_data_helper.hh> + +namespace ost { namespace db { + +void ExtractValidPositions(const String& entry_name, const String& chain_name, + LinearIndexer& indexer, + LinearCharacterContainer& atomseq_container, + LinearPositionContainer& position_container, + ost::seq::SequenceHandle& seq, + geom::Vec3List& positions) { + + std::pair<uint64_t, uint64_t> data_range = indexer.GetDataRange(entry_name, + chain_name); + String atomseq; + atomseq_container.GetCharacters(data_range, atomseq); + geom::Vec3List extracted_positions; + position_container.GetPositions(data_range, extracted_positions); + + positions.clear(); + std::vector<char> valid_seq; + for(uint i = 0; i < atomseq.size(); ++i) { + if(atomseq[i] != '-') { + positions.push_back(extracted_positions[i]); + valid_seq.push_back(atomseq[i]); + } + } + + seq = ost::seq::CreateSequence("extracted_seq", String(valid_seq.begin(), + valid_seq.end())); +} + +}} //ns diff --git a/modules/db/src/extract_data_helper.hh b/modules/db/src/extract_data_helper.hh new file mode 100644 index 0000000000000000000000000000000000000000..844f56bf3c3b3bb721d5099550d134f66bd03775 --- /dev/null +++ b/modules/db/src/extract_data_helper.hh @@ -0,0 +1,39 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +#ifndef OST_DB_EXTRACT_DATA_HELPER_HH +#define OST_DB_EXTRACT_DATA_HELPER_HH + +#include <ost/geom/vec3.hh> +#include <ost/seq/sequence_handle.hh> +#include <ost/db/linear_indexer.hh> +#include <ost/db/binary_container.hh> + +namespace ost { namespace db { + +void ExtractValidPositions(const String& entry_name, const String& chain_name, + LinearIndexer& indexer, + LinearCharacterContainer& atom_seqcontainer, + LinearPositionContainer& position_container, + ost::seq::SequenceHandle& seq, + geom::Vec3List& positions); + +}} //ns + +#endif \ No newline at end of file diff --git a/modules/db/src/linear_indexer.cc b/modules/db/src/linear_indexer.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0a586582525222d2fb4ff8e1cbcc56413d4f4dd --- /dev/null +++ b/modules/db/src/linear_indexer.cc @@ -0,0 +1,281 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <ost/db/linear_indexer.hh> +#include <ost/message.hh> +#include <numeric> + +namespace ost { namespace db { + +void AssemblyInfo::Read(std::ifstream& in_stream) { + uint16_t num; + in_stream.read(reinterpret_cast<char*>(&num), sizeof(uint16_t)); + chain_names.resize(num); + chain_lengths.resize(num); + in_stream.read(reinterpret_cast<char*>(&data_start), sizeof(uint64_t)); + + for(uint i = 0; i < num; ++i) { + uint16_t size; + in_stream.read(reinterpret_cast<char*>(&size), sizeof(uint16_t)); + chain_names[i].resize(size); + in_stream.read(&chain_names[i][0], size); + uint32_t chain_length; + in_stream.read(reinterpret_cast<char*>(&chain_length), sizeof(uint32_t)); + chain_lengths[i] = chain_length; + } +} + + +void AssemblyInfo::Write(std::ofstream& out_stream) const { + uint16_t num = chain_names.size(); + out_stream.write(reinterpret_cast<const char*>(&num), sizeof(uint16_t)); + out_stream.write(reinterpret_cast<const char*>(&data_start), sizeof(uint64_t)); + + for(uint i = 0; i < num; ++i) { + uint16_t size = chain_names[i].size(); + out_stream.write(reinterpret_cast<const char*>(&size), sizeof(uint16_t)); + out_stream.write(&chain_names[i][0], size); + uint32_t chain_length = chain_lengths[i]; + out_stream.write(reinterpret_cast<const char*>(&chain_length), sizeof(uint32_t)); + } +} + +LinearIndexerPtr LinearIndexer::Load(const String& filename) { + + std::ifstream in_stream(filename.c_str(), std::ios::binary); + if (!in_stream) { + throw ost::Error("Could not open " + filename); + } + + LinearIndexerPtr loaded_db(new LinearIndexer); + + in_stream.read(reinterpret_cast<char*>(&loaded_db->current_num_residues_), + sizeof(uint64_t)); + + uint32_t num_assemblies; + in_stream.read(reinterpret_cast<char*>(&num_assemblies), sizeof(uint32_t)); + + loaded_db->assemblies_.resize(num_assemblies); + for(uint i = 0; i < num_assemblies; ++i) { + loaded_db->assemblies_[i].Read(in_stream); + } + + for(uint i = 0; i < num_assemblies; ++i) { + uint16_t size; + in_stream.read(reinterpret_cast<char*>(&size), sizeof(uint16_t)); + String assembly_name(size, 'X'); + in_stream.read(&assembly_name[0], size); + uint32_t idx; + in_stream.read(reinterpret_cast<char*>(&idx), sizeof(uint32_t)); + loaded_db->idx_mapper_[assembly_name] = idx; + } + + return loaded_db; +} + + +void LinearIndexer::Save(const String& filename) const { + + std::ofstream out_stream(filename.c_str(), std::ios::binary); + if (!out_stream) { + throw ost::Error("Could not open " + filename); + } + + out_stream.write(reinterpret_cast<const char*>(¤t_num_residues_), + sizeof(uint64_t)); + + uint32_t num_assemblies = assemblies_.size(); + out_stream.write(reinterpret_cast<const char*>(&num_assemblies), + sizeof(uint32_t)); + + for(uint i = 0; i < num_assemblies; ++i) { + assemblies_[i].Write(out_stream); + } + + for(std::map<String, int>::const_iterator it = idx_mapper_.begin(); + it != idx_mapper_.end(); ++it) { + uint16_t size = it->first.size(); + out_stream.write(reinterpret_cast<const char*>(&size), sizeof(uint16_t)); + out_stream.write(&(it->first[0]), size); + uint32_t idx = it->second; + out_stream.write(reinterpret_cast<const char*>(&idx), sizeof(uint32_t)); + } +} + + +void LinearIndexer::AddAssembly(const String& name, + const std::vector<String>& chain_names, + const std::vector<uint>& chain_lengths) { + + if(chain_names.size() != chain_lengths.size()) { + throw ost::Error("Chain names and lengths must be consistent in size!"); + } + + std::map<String, int>::iterator it = idx_mapper_.find(name); + + if(it != idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is already present in the indexer!"); + } + + // add new assembly info + idx_mapper_[name] = assemblies_.size(); + assemblies_.push_back(AssemblyInfo()); + + AssemblyInfo& assembly = assemblies_.back(); + assembly.data_start = current_num_residues_; + assembly.chain_names = chain_names; + assembly.chain_lengths = chain_lengths; + current_num_residues_ += std::accumulate(chain_lengths.begin(), + chain_lengths.end(), 0); +} + + +void LinearIndexer::RemoveAssembly(const String& name) { + + std::map<String, int>::iterator idx_it = idx_mapper_.find(name); + + if(idx_it == idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is not present in the indexer!"); + } + + int assembly_idx = idx_it->second; + uint64_t num_removed_res = + std::accumulate(assemblies_[assembly_idx].chain_lengths.begin(), + assemblies_[assembly_idx].chain_lengths.end(), 0); + + // Let's remove that assembly + assemblies_.erase(assemblies_.begin() + assembly_idx); + idx_mapper_.erase(idx_it); + + // Reduce all data start indices coming after the removed assembly + for(uint i = assembly_idx; i < assemblies_.size(); ++i) { + assemblies_[i].data_start -= num_removed_res; + } + + // update the idx_mapper and numer of total residues + for(std::map<String, int>::iterator it = idx_mapper_.begin(); + it != idx_mapper_.end(); ++it) { + if(it->second > assembly_idx) { + it->second -= 1; + } + } + + current_num_residues_ -= num_removed_res; +} + + +std::vector<String> LinearIndexer::GetAssemblies() const { + + std::vector<String> return_vec; + for(std::map<String, int>::const_iterator it = idx_mapper_.begin(); + it != idx_mapper_.end(); ++it) { + return_vec.push_back(it->first); + } + + return return_vec; +} + + +std::vector<String> LinearIndexer::GetChainNames(const String& name) const { + + std::map<String, int>::const_iterator idx_it = idx_mapper_.find(name); + + if(idx_it == idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is not present in the indexer!"); + } + + const AssemblyInfo& assembly = assemblies_[idx_it->second]; + + return assembly.chain_names; +} + + +std::vector<uint> LinearIndexer::GetChainLengths(const String& name) const { + + std::map<String, int>::const_iterator idx_it = idx_mapper_.find(name); + + if(idx_it == idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is not present in the indexer!"); + } + + const AssemblyInfo& assembly = assemblies_[idx_it->second]; + + return assembly.chain_lengths; +} + + +std::pair<uint64_t, uint64_t> LinearIndexer::GetDataRange( + const String& name) const { + + std::map<String, int>::const_iterator idx_it = idx_mapper_.find(name); + + if(idx_it == idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is not present in the indexer!"); + } + + const AssemblyInfo& assembly = assemblies_[idx_it->second]; + + uint64_t num_res = std::accumulate(assembly.chain_lengths.begin(), + assembly.chain_lengths.end(), 0); + + return std::make_pair(assembly.data_start, assembly.data_start + num_res); +} + + +std::pair<uint64_t, uint64_t> LinearIndexer::GetDataRange( + const String& name, + const String& chain_name) const { + + std::map<String, int>::const_iterator idx_it = idx_mapper_.find(name); + + if(idx_it == idx_mapper_.end()) { + throw ost::Error("The assembly with name \"" + name + + "\" is not present in the indexer!"); + } + + const AssemblyInfo& assembly = assemblies_[idx_it->second]; + + int ch_idx = -1; + int summed_length = 0; + for(uint i = 0; i < assembly.chain_names.size(); ++i) { + if(assembly.chain_names[i] == chain_name) { + ch_idx = i; + break; + } + summed_length += assembly.chain_lengths[i]; + } + + if(ch_idx == -1) { + throw ost::Error("Could not find chain with name \"" + chain_name + + "\" in specified assembly!"); + } + + uint64_t start = assembly.data_start + summed_length; + uint64_t end = start + assembly.chain_lengths[ch_idx]; + + return std::make_pair(start, end); +} + +}} //ns diff --git a/modules/db/src/linear_indexer.hh b/modules/db/src/linear_indexer.hh new file mode 100644 index 0000000000000000000000000000000000000000..3507ae12d3769d960b59e8cdfda1c90639aa552c --- /dev/null +++ b/modules/db/src/linear_indexer.hh @@ -0,0 +1,108 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#ifndef OST_DB_LINEAR_INDEXER_HH +#define OST_DB_LINEAR_INDEXER_HH + +#include <fstream> +#include <map> +#include <vector> +#include <boost/shared_ptr.hpp> +#include <ost/base.hh> +#include <ost/stdint.hh> + + +namespace ost { namespace db { + +class LinearIndexer; +typedef boost::shared_ptr<LinearIndexer> LinearIndexerPtr; + +struct AssemblyInfo { + + void Read(std::ifstream& in_stream); + + void Write(std::ofstream& out_stream) const; + + bool operator==(const AssemblyInfo& other) const { + return (data_start == other.data_start && + chain_names == other.chain_names && + chain_lengths == other.chain_lengths); + } + + bool operator!=(const AssemblyInfo& other) const { + return !(*this==other); + } + + uint64_t data_start; + std::vector<String> chain_names; + std::vector<uint> chain_lengths; +}; + +class LinearIndexer { + +public: + + LinearIndexer(): current_num_residues_(0) { } + + static LinearIndexerPtr Load(const String& filename); + + void Save(const String& filename) const; + + void AddAssembly(const String& name, + const std::vector<String>& chain_names, + const std::vector<uint>& chain_lengths); + + void RemoveAssembly(const String& name); + + std::vector<String> GetAssemblies() const; + + std::vector<String> GetChainNames(const String& name) const; + + std::vector<uint> GetChainLengths(const String& name) const; + + std::pair<uint64_t, uint64_t> GetDataRange(const String& name) const; + + std::pair<uint64_t, uint64_t> GetDataRange(const String& name, + const String& chain_name) const; + + uint64_t GetNumResidues() const { return current_num_residues_; } + + bool operator==(const LinearIndexer& other) const { + return (assemblies_ == other.assemblies_ && + idx_mapper_ == other.idx_mapper_ && + current_num_residues_ == other.current_num_residues_); + + } + + bool operator!=(const LinearIndexer& other) const { + return !(*this == other); + } + +private: + + std::vector<AssemblyInfo> assemblies_; + std::map<String, int> idx_mapper_; + uint64_t current_num_residues_; +}; + + +}} // ns + +#endif diff --git a/modules/db/src/paged_array.hh b/modules/db/src/paged_array.hh new file mode 100644 index 0000000000000000000000000000000000000000..5512c7a9d758f0c017ad3623c9c42866a939cb59 --- /dev/null +++ b/modules/db/src/paged_array.hh @@ -0,0 +1,168 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2019 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#ifndef OST_DB_PAGED_ARRAY_HH +#define OST_DB_PAGED_ARRAY_HH + +/* + Author: Marco Biasini, Gabriel Studer + */ + +#include <vector> +#include <fstream> +#include <ost/stdint.hh> +#include <ost/message.hh> + +namespace ost{ namespace db{ + +template <typename T, uint64_t P> +class PagedArray { +private: + typedef std::vector<T> Page; +public: + typedef T value_type; + + PagedArray() {} + + T& operator[](uint64_t i) + { + return pages_[this->GetPage(i)][this->GetOffset(i)]; + } + + const T& operator[](uint64_t i) const + { + return pages_[this->GetPage(i)][this->GetOffset(i)]; + } + + void push_back(const T& t) + { + if (pages_.empty()) { + pages_.push_back(Page()); + pages_.back().reserve(P); + } + if (pages_.back().size()==P) { + pages_.push_back(Page()); + pages_.back().reserve(P); + } + pages_.back().push_back(t); + } + uint64_t size() const + { + if (pages_.empty()) { + return 0; + } else { + return static_cast<uint64_t>((pages_.size()-1)*P+pages_.back().size()); + } + } + + bool empty() const + { + return pages_.empty(); + } + + void clear() + { + pages_.clear(); + } + + // clears everything in [from, to[ and shifts every datapoint after the specified + // range at the location defined by from + void ClearRange(uint64_t from, uint64_t to) + { + if(to <= from) return; // invalid range + uint64_t current_size = this->size(); + if(from >= current_size) return; // nothing to delete + if(to > current_size) to = current_size; // from is in the valid range, but + // to is too large. Let's just kill + // [from, this->size()[ + + uint64_t num_elements_to_shift = current_size - to; + for(uint64_t i = 0; i < num_elements_to_shift; ++i) { + (*this)[from + i] = (*this)[to + i]; + } + + uint64_t num_elements_deleted = to - from; + uint64_t new_size = current_size - num_elements_deleted; + uint64_t new_last_element_idx = new_size - 1; + uint64_t new_last_page_idx = this->GetPage(new_last_element_idx); + pages_.resize(new_last_page_idx + 1); + uint64_t offset = this->GetOffset(new_last_element_idx); + pages_.back().resize(offset + 1); + } + + + void Write(std::ofstream& out_stream, bool treat_as_pod=true) const + { + if (treat_as_pod) { + uint64_t s=this->size(); + out_stream.write(reinterpret_cast<char*>(&s), sizeof(uint64_t)); + for (typename std::vector<Page>::const_iterator i=pages_.begin(), + e=pages_.end(); i!=e; ++i) { + out_stream.write(reinterpret_cast<const char*>(&i->front()), + i->size()*sizeof(T)); + } + } else { + throw ost::Error("Cannot write non POD paged array!"); + } + } + void Read(std::ifstream& in_stream, bool treat_as_pod=true) + { + if (treat_as_pod) { + uint64_t s; + in_stream.read(reinterpret_cast<char*>(&s), sizeof(uint64_t)); + uint64_t num_pages=s/P+1; + pages_.resize(num_pages); + for (uint64_t i=0; i<num_pages; ++i) { + uint64_t left=std::min(P, s-i*P); + Page& p=pages_[i]; + p.reserve(P); + p.resize(left); + in_stream.read(reinterpret_cast<char*>(&p.front()), + p.size()*sizeof(T)); + } + } else { + throw ost::Error("Cannot load non POD paged array!"); + } + } + + // portable serialization + // (cleanly element by element with fixed-width base-types) + template <typename DS> + void Serialize(DS& ds) { + // assumes that T is not a fundamental type! + ds & pages_; + } + + bool operator==(const PagedArray& rhs) const { + return pages_ == rhs.pages_; + } + bool operator!=(const PagedArray& rhs) const { + return !this->operator==(rhs); + } + +private: + uint64_t GetPage(uint64_t i) const { return i/P; } + uint64_t GetOffset(uint64_t i) const { return i % P; } + std::vector<Page> pages_; +}; + +}} // ns + +#endif diff --git a/modules/db/tests/CMakeLists.txt b/modules/db/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..17b8163969420ae39233c22756f42be205b4e2b4 --- /dev/null +++ b/modules/db/tests/CMakeLists.txt @@ -0,0 +1,7 @@ +set(OST_DB_UNIT_TESTS + test_linear_db.py +) + +ost_unittest(MODULE db + SOURCES "${OST_DB_UNIT_TESTS}") + diff --git a/modules/db/tests/test_linear_db.py b/modules/db/tests/test_linear_db.py new file mode 100644 index 0000000000000000000000000000000000000000..8350c991808f14779982af1cbd6f12a7c43e6226 --- /dev/null +++ b/modules/db/tests/test_linear_db.py @@ -0,0 +1,282 @@ +import unittest,os +from ost.db import LinearIndexer +from ost.db import LinearCharacterContainer +from ost.db import LinearPositionContainer +from ost import geom + + +def ComparePosList(l1, l2): + if len(l1)!= len(l2): + return False + eps = 0.0001 + for i in range(len(l1)): + if geom.Distance(l1[i],l2[i]) > eps: + return False + return True + + +class TestLinearDB(unittest.TestCase): + + def testLinearIndexer(self): + + indexer = LinearIndexer() + self.assertEqual(indexer.GetNumResidues(), 0) + + # check whether error is thrown when list parameters do not have + # same length + self.assertRaises(Exception, indexer.AddAssembly, "HELLOU", + ["AA", "B"], [20, 33, 1]) + + # correct it and add first assembly + indexer.AddAssembly("HELLOU", ["AA", "B", "CCC"], [20, 33, 1]) + + # check whether according entry has been added + self.assertEqual(indexer.GetAssemblies(), ["HELLOU"]) + self.assertEqual(indexer.GetNumResidues(), 54) + + # check whether error get thrown when the same assembly is added again + # and directly add another assembly + self.assertRaises(Exception, indexer.AddAssembly, "HELLOU", + ["AA", "B", "CCC"], [20, 33, 1]) + + indexer.AddAssembly("HELLOUYOU", ["X", "Y", "ZZZ"], [10, 26, 20]) + + # check whether according entry has been added. we need to sort the + # output since the GetAssemblies function gives no guarantee about ordering + entries = indexer.GetAssemblies() + entries.sort() + self.assertEqual(entries, ["HELLOU", "HELLOUYOU"]) + self.assertEqual(indexer.GetNumResidues(), 110) + + # check the GetDataRange functions and all their errors when + # inexistent data is accessed + self.assertRaises(Exception, indexer.GetDataRange, "HELL", "CCC") + self.assertRaises(Exception, indexer.GetDataRange, "HELLOU", "CC") + self.assertEqual(indexer.GetDataRange("HELLOU", "CCC"), (53,54)) + + self.assertRaises(Exception, indexer.GetDataRange, "HELL") + self.assertEqual(indexer.GetDataRange("HELLOU"), (0,54)) + + # add some more assemblies to test proper removal + indexer.AddAssembly("THIRDENTRY", ["X"], [5]) + indexer.AddAssembly("FOURTHENTRY", ["R","asdf"],[22, 10]) + + # check the errors to be thrown when the RemoveAssembly function is called + self.assertRaises(Exception, indexer.RemoveAssembly, "HELLOUYO") + + # before removing anything we check tons of ranges, we expect + # them to be ordered as they have been added to the indexer + self.assertEqual(indexer.GetDataRange("HELLOU"), (0, 54)) + self.assertEqual(indexer.GetDataRange("HELLOU", "B"), (20, 53)) + self.assertEqual(indexer.GetDataRange("HELLOUYOU"), (54, 110)) + self.assertEqual(indexer.GetDataRange("HELLOUYOU", "Y"), (64, 90)) + self.assertEqual(indexer.GetDataRange("THIRDENTRY"), (110, 115)) + self.assertEqual(indexer.GetDataRange("THIRDENTRY", "X"), (110, 115)) + self.assertEqual(indexer.GetDataRange("FOURTHENTRY"), (115, 147)) + self.assertEqual(indexer.GetDataRange("FOURTHENTRY", "asdf"), (137, 147)) + + self.assertEqual(indexer.GetNumResidues(), 147) + + # remove an assembly + indexer.RemoveAssembly("HELLOUYOU") + + # we expect, that the data ranges for assembly HELLOU stays the same, + # but for all others there should be a decrease of 10+26+20=56 + self.assertEqual(indexer.GetDataRange("HELLOU"), (0, 54)) + self.assertEqual(indexer.GetDataRange("HELLOU", "B"), (20, 53)) + self.assertEqual(indexer.GetDataRange("THIRDENTRY"), (110 - 56, 115 - 56)) + self.assertEqual(indexer.GetDataRange("THIRDENTRY", "X"), (110 - 56, 115 - 56)) + self.assertEqual(indexer.GetDataRange("FOURTHENTRY"), (115 - 56, 147 - 56)) + self.assertEqual(indexer.GetDataRange("FOURTHENTRY", "asdf"), (137 - 56, 147 - 56)) + self.assertEqual(indexer.GetNumResidues(), 147 - 56) + + # We additionally check, whether the thing is really gone + current_entries = indexer.GetAssemblies() + current_entries.sort() + self.assertEqual(current_entries, ["FOURTHENTRY", "HELLOU", "THIRDENTRY"]) + + # Let's finally test saving and loading + indexer.Save("test_indexer.dat") + loaded_indexer = LinearIndexer.Load("test_indexer.dat") + self.assertEqual(indexer, loaded_indexer) + + if os.path.exists("test_indexer.dat"): + os.remove("test_indexer.dat") + + + def testCharacterContainer(self): + + container = LinearCharacterContainer() + + self.assertEqual(container.GetNumElements(), 0) + + # define a few sequences to play with + s1 = "AAAAAAA" + s2 = "BBB" + s3 = "CCCCCCCCCC" + + # add some stuff + container.AddCharacters(s1) + container.AddCharacters(s2) + container.AddCharacters(s3) + + # check whether stuff has properly been added + self.assertEqual(container.GetNumElements(), len(s1) + len(s2) + len(s3)) + self.assertEqual(container.GetCharacters((0, container.GetNumElements())), + s1 + s2 + s3) + self.assertEqual(container.GetCharacters((0, len(s1))), "AAAAAAA") + self.assertEqual(container.GetCharacters((len(s1), len(s1) + len(s2))), "BBB") + self.assertEqual(container.GetCharacters((len(s1) + len(s2), + len(s1) + len(s2) + len(s3))), + "CCCCCCCCCC") + + + # check whether errors get thrown for invalid ranges + self.assertRaises(Exception, container.GetCharacters, (5,5)) + self.assertRaises(Exception, container.GetCharacters, (5,4)) + self.assertRaises(Exception, container.GetCharacters, (5,100)) + self.assertRaises(Exception, container.GetCharacters, (100,120)) + self.assertRaises(Exception, container.ClearRange, (5,5)) + self.assertRaises(Exception, container.ClearRange, (5,4)) + self.assertRaises(Exception, container.ClearRange, (5,100)) + self.assertRaises(Exception, container.ClearRange, (100,120)) + + # let's remove s2 and then check whether the content is as expected + container.ClearRange((len(s1),len(s1) + len(s2))) + self.assertEqual(container.GetNumElements(), len(s1) + len(s3)) + self.assertEqual(container.GetCharacters((0, len(s1))), "AAAAAAA") + self.assertEqual(container.GetCharacters((len(s1), len(s1) + len(s3))), "CCCCCCCCCC") + + # let's check the single character extraction function + full_expected_string = s1 + s3 + for i in range(container.GetNumElements()): + self.assertEqual(full_expected_string[i], container.GetCharacter(i)) + + # save / load and check for equality + container.Save("test_character_container.dat") + loaded_container = LinearCharacterContainer.Load("test_character_container.dat") + self.assertEqual(container, loaded_container) + + if os.path.exists("test_character_container.dat"): + os.remove("test_character_container.dat") + + + def testPositionContainer(self): + + container = LinearPositionContainer() + + self.assertEqual(container.GetNumElements(), 0) + + # define a few position lists to play with + pos_1 = geom.Vec3List() + pos_2 = geom.Vec3List() + pos_3 = geom.Vec3List() + + pos_1.append(geom.Vec3(-10.234, -45.333, -45.338)) + pos_1.append(geom.Vec3(10.234, 45.333, 45.338)) + pos_1.append(geom.Vec3(10.234, 45.333, 45.338)) + pos_1.append(geom.Vec3(10.234, 45.333, 45.338)) + pos_1.append(geom.Vec3(10.234, 45.333, 45.338)) + pos_1.append(geom.Vec3(10.234, 45.333, 45.338)) + + pos_2.append(geom.Vec3(50000.34, 32.43, 33.2222)) + pos_2.append(geom.Vec3(-50000.34, -32.43, -33.2222)) + pos_2.append(geom.Vec3(50000.34, 32.43, 33.2222)) + + pos_3.append(geom.Vec3(87.909, 65.222, 1000.555)) + pos_3.append(geom.Vec3(87.909, 65.222, 1000.555)) + + # get the rounded target values (the accuracy with which they) + # are stored in the container) + pos_1_target = geom.Vec3(10.23, 45.33, 45.34) + pos_2_target = geom.Vec3(50000.3, 32.4, 33.2) # reduced accuracy! + pos_3_target = geom.Vec3(87.91, 65.22, 1000.56) + + pos_1_target_list = geom.Vec3List() + pos_2_target_list = geom.Vec3List() + pos_3_target_list = geom.Vec3List() + + for i in range(6): + pos_1_target_list.append(pos_1_target) + pos_1_target_list[0] = -pos_1_target_list[0] + + for i in range(3): + pos_2_target_list.append(pos_2_target) + pos_2_target_list[1] = -pos_2_target_list[1] + + pos_3_target_list.append(pos_3_target) + pos_3_target_list.append(pos_3_target) + + # add it + container.AddPositions(pos_1) + container.AddPositions(pos_2) + container.AddPositions(pos_3) + + extracted_pos_list = geom.Vec3List() + extracted_pos = geom.Vec3() + + # check whether stuff has properly been added + self.assertEqual(container.GetNumElements(), + len(pos_1) + len(pos_2) + len(pos_3)) + + container.GetPositions((0,len(pos_1)), extracted_pos_list) + + self.assertTrue(ComparePosList(pos_1_target_list, extracted_pos_list)) + + container.GetPositions((len(pos_1),len(pos_1)+len(pos_2)), extracted_pos_list) + self.assertTrue(ComparePosList(pos_2_target_list, extracted_pos_list)) + + container.GetPositions((len(pos_1)+len(pos_2), + len(pos_1)+len(pos_2)+len(pos_3)), extracted_pos_list) + self.assertTrue(ComparePosList(pos_3_target_list, extracted_pos_list)) + + # check whether errors get thrown for invalid ranges + dummy_list = geom.Vec3List() + dummy_pos = geom.Vec3() + self.assertRaises(Exception, container.GetPositions, (5,5), dummy_list) + self.assertRaises(Exception, container.GetPositions, (5,4), dummy_list) + self.assertRaises(Exception, container.GetPositions, (5,100), dummy_list) + self.assertRaises(Exception, container.GetPositions, (100,120), dummy_list) + self.assertRaises(Exception, container.GetPosition, 100, dummy_pos) + self.assertRaises(Exception, container.ClearRange, (5,5)) + self.assertRaises(Exception, container.ClearRange, (5,4)) + self.assertRaises(Exception, container.ClearRange, (5,100)) + self.assertRaises(Exception, container.ClearRange, (100,120)) + + # lets remove pos_2_list and check, whether the result is as expected + container.ClearRange((len(pos_1), len(pos_1)+len(pos_2))) + + self.assertEqual(container.GetNumElements(), len(pos_1) + len(pos_3)) + + container.GetPositions((0,len(pos_1)), extracted_pos_list) + self.assertTrue(ComparePosList(pos_1_target_list, extracted_pos_list)) + + container.GetPositions((len(pos_1), len(pos_1) + len(pos_3)), extracted_pos_list) + self.assertTrue(ComparePosList(pos_3_target_list, extracted_pos_list)) + + # check the single position getter + pos_13_target_list = geom.Vec3List() + for item in pos_1_target_list: + pos_13_target_list.append(item) + for item in pos_3_target_list: + pos_13_target_list.append(item) + + eps = 0.00001 + for i in range(len(pos_13_target_list)): + container.GetPosition(i, extracted_pos) + self.assertTrue(geom.Distance(extracted_pos, pos_13_target_list[i]) < eps) + + # save / load and check for equality + container.Save("test_position_container.dat") + loaded_container = LinearPositionContainer.Load("test_position_container.dat") + self.assertEqual(container, loaded_container) + + if os.path.exists("test_position_container.dat"): + os.remove("test_position_container.dat") + + +if __name__ == "__main__": + + from ost import testutils + testutils.RunTests() +