diff --git a/modules/io/pymod/CMakeLists.txt b/modules/io/pymod/CMakeLists.txt index 6763a0189c4d779f8d2777576ba576d886201af4..d547f7d7a487a6b28a31478bb1d7ec1e95b68acb 100644 --- a/modules/io/pymod/CMakeLists.txt +++ b/modules/io/pymod/CMakeLists.txt @@ -2,6 +2,7 @@ set(OST_IO_PYMOD_SOURCES wrap_io.cc export_pdb_io.cc export_mmcif_io.cc + export_omf_io.cc ) if (ENABLE_IMG) diff --git a/modules/io/pymod/export_omf_io.cc b/modules/io/pymod/export_omf_io.cc new file mode 100644 index 0000000000000000000000000000000000000000..480095319e8b2f1e229a3b8ee3de07bbbbeb5e91 --- /dev/null +++ b/modules/io/pymod/export_omf_io.cc @@ -0,0 +1,53 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2021 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#include <boost/python.hpp> +using namespace boost::python; + +#include <ost/io/mol/omf.hh> + +using namespace ost; +using namespace ost::io; + +namespace{ + + PyObject* wrap_to_bytes(OMFPtr omf) { + String str = omf->ToString(); + return PyBytes_FromStringAndSize(str.c_str(), str.size()); + } + + OMFPtr wrap_from_bytes(boost::python::object obj) { + String str(PyBytes_AsString(obj.ptr()), PyBytes_Size(obj.ptr())); + return OMF::FromString(str); + } + +} + +void export_omf_io() { + class_<OMF, OMFPtr>("OMF",no_init) + .def("FromEntity", &OMF::FromEntity).staticmethod("FromEntity") + .def("FromMMCIF", &OMF::FromMMCIF).staticmethod("FromMMCIF") + .def("FromFile", &OMF::FromFile).staticmethod("FromFile") + .def("FromBytes", &wrap_from_bytes).staticmethod("FromBytes") + .def("ToFile", &OMF::ToFile) + .def("ToBytes", &wrap_to_bytes) + .def("GetAU", &OMF::GetAU) + .def("GetAUChain", &OMF::GetAUChain) + .def("GetBU", &OMF::GetBU) + ; +} diff --git a/modules/io/pymod/wrap_io.cc b/modules/io/pymod/wrap_io.cc index 1a56254bec76d365caaab073923d2bb7f6fa79eb..c714fed34c83b94da070651d35e4d14d0b9ada02 100644 --- a/modules/io/pymod/wrap_io.cc +++ b/modules/io/pymod/wrap_io.cc @@ -69,6 +69,7 @@ ost::mol::alg::StereoChemicalProps (*read_props_b)(bool check) = &ReadStereoChem void export_pdb_io(); void export_mmcif_io(); +void export_omf_io(); #if OST_IMG_ENABLED void export_map_io(); #endif @@ -134,6 +135,7 @@ BOOST_PYTHON_MODULE(_ost_io) export_pdb_io(); export_mmcif_io(); + export_omf_io(); #if OST_IMG_ENABLED export_map_io(); #endif diff --git a/modules/io/src/mol/CMakeLists.txt b/modules/io/src/mol/CMakeLists.txt index c88d99db8690af8efafc44a896f1cd017d03ff19..1581c9519b10e9b790f63cbbb20c51ce2abbd527 100644 --- a/modules/io/src/mol/CMakeLists.txt +++ b/modules/io/src/mol/CMakeLists.txt @@ -21,6 +21,7 @@ mmcif_reader.cc mmcif_info.cc pdb_str.cc stereochemical_params_reader.cc +omf.cc PARENT_SCOPE ) @@ -49,5 +50,6 @@ load_surface.hh surface_io_msms_handler.hh pdb_str.hh stereochemical_params_reader.hh +omf.hh PARENT_SCOPE ) diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh index b97949105b4818777d0650d8ba59f45e8f05e8ab..916954323af8c2cbee20eaec252a80b70117c081 100644 --- a/modules/io/src/mol/mmcif_info.hh +++ b/modules/io/src/mol/mmcif_info.hh @@ -304,7 +304,7 @@ public: /// \brief Get the list of intervals of chains /// /// \return pair-intervals - const std::vector<std::pair<int, int> >& GetChainIntervalList() + const std::vector<std::pair<int, int> >& GetChainIntervalList() const { return tr_chains_; } @@ -317,7 +317,7 @@ public: /// \brief Get the list of intervals of operations /// /// \return pair-intervals - const std::vector<std::pair<int, int> >& GetOperationsIntervalList() + const std::vector<std::pair<int, int> >& GetOperationsIntervalList() const { return tr_operations_; } @@ -325,7 +325,7 @@ public: /// \brief Get the list of operations /// /// \return vector of vectors of iterators. - const std::vector<std::vector<MMCifInfoTransOpPtr> >& GetOperations() + const std::vector<std::vector<MMCifInfoTransOpPtr> >& GetOperations() const { return operations_; } diff --git a/modules/io/src/mol/omf.cc b/modules/io/src/mol/omf.cc new file mode 100644 index 0000000000000000000000000000000000000000..35b96f71ba15a4d1c2d3fe8040ab26c13c38804f --- /dev/null +++ b/modules/io/src/mol/omf.cc @@ -0,0 +1,973 @@ +#include <ost/mol/atom_handle.hh> +#include <ost/mol/residue_handle.hh> +#include <ost/mol/chain_handle.hh> +#include <ost/mol/xcs_editor.hh> + +#include "omf.hh" + + +namespace{ + + // some hash function we need for an unordered_map + // stolen from https://stackoverflow.com/questions/32685540/why-cant-i-compile-an-unordered-map-with-a-pair-as-key + struct pair_hash { + template <class T1, class T2> + std::size_t operator () (const std::pair<T1,T2> &p) const { + auto h1 = std::hash<T1>{}(p.first); + auto h2 = std::hash<T2>{}(p.second); + // Mainly for demonstration purposes, i.e. works but is overly simple + // In the real world, use sth. like boost.hash_combine + return h1 ^ h2; + } + }; + + // some helpers + void RealToIntVec(const std::vector<Real>& real_vec, + std::vector<int>& int_vec, Real factor) { + int_vec.resize(real_vec.size()); + for(uint i = 0; i < real_vec.size(); ++i) { + int_vec[i] = factor*real_vec[i]; + } + } + + void IntToRealVec(const std::vector<int>& int_vec, + std::vector<Real>& real_vec, Real factor) { + real_vec.resize(int_vec.size()); + for(uint i = 0; i < int_vec.size(); ++i) { + real_vec[i] = factor*int_vec[i]; + } + } + + // generates as many chain names as you want (potentially multiple characters) + struct ChainNameGenerator{ + ChainNameGenerator() { + chain_names = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz"; + n_chain_names = chain_names.size(); + indices.push_back(-1); + } + + String Get() { + int idx = indices.size() - 1; + indices[idx] += 1; + bool more_digits = false; + while(idx >= 0) { + if(indices[idx] >= n_chain_names) { + indices[idx] = 0; + if(idx>0) { + indices[idx-1] += 1; + --idx; + } else { + more_digits = true; + break; + } + } else { + break; + } + } + if(more_digits) { + indices.insert(indices.begin(), 0); + } + String ch_name(indices.size(), 'X'); + for(uint i = 0; i < indices.size(); ++i) { + ch_name[i] = chain_names[indices[i]]; + } + return ch_name; + } + + void Reset() { + indices.clear(); + indices.push_back(-1); + } + + String chain_names; + int n_chain_names; + std::vector<int> indices; + }; + + // delta/runlength encodings/decodings + void DeltaEncoding(const std::vector<int>& in, std::vector<int>& out) { + out.clear(); + if(in.empty()) { + return; + } + out.reserve(in.size()); + out.push_back(in[0]); + for(uint i = 1; i < in.size(); ++i) { + out.push_back(in[i] - in[i-1]); + } + } + + void DeltaDecoding(const std::vector<int>& in, std::vector<int>& out) { + out.clear(); + if(in.empty()) { + return; + } + out.reserve(in.size()); + out.push_back(in[0]); + for(uint i = 1; i < in.size(); ++i) { + out.push_back(out.back() + in[i]); + } + } + + void RunLengthEncoding(const std::vector<int>& in, std::vector<int>& out) { + out.clear(); + if(in.empty()) { + return; + } + int current_item = in[0]; + int run_length = 1; + for(uint i = 1; i < in.size(); ++i) { + if(in[i] == current_item) { + ++run_length; + } else { + out.push_back(current_item); + out.push_back(run_length); + current_item = in[i]; + run_length = 1; + } + } + out.push_back(current_item); + out.push_back(run_length); + } + + void RunLengthDecoding(const std::vector<int>& in, std::vector<int>& out) { + out.clear(); + if(in.empty()) { + return; + } + int n_runs = in.size() / 2; + for(int run_idx = 0; run_idx < n_runs; ++run_idx) { + int value = in[run_idx*2]; + int run_length = in[run_idx*2+1]; + out.insert(out.end(), run_length, value); + } + } + + // dump and load strings + void Load(std::istream& stream, String& str) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + str.resize(size); + stream.read(&str[0], size); + } + + void Dump(std::ostream& stream, const String& str) { + uint32_t size = str.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + stream.write(&str[0], size); + } + + // dump and load chars + void Load(std::istream& stream, char& ch) { + stream.read(&ch, sizeof(char)); + } + + void Dump(std::ostream& stream, char ch) { + stream.write(&ch, sizeof(char)); + } + + // dump and load maps with string as key and ChainDataPtr as value + void Load(std::istream& stream, + std::map<String, ost::io::ChainDataPtr>& map) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + map.clear(); + for(uint i = 0; i < size; ++i) { + ost::io::ChainDataPtr p(new ost::io::ChainData); + p->FromStream(stream); + map[p->ch_name] = p; + } + } + + void Dump(std::ostream& stream, + const std::map<String, ost::io::ChainDataPtr>& map) { + uint32_t size = map.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + for(auto it = map.begin(); it != map.end(); ++it) { + // we don't dump the key (chain name), that's an attribute of the + // chain itself anyway + it->second->ToStream(stream); + } + } + + // dump and load vectors with various types of integers + template<typename T> + void LoadIntVec(std::istream& stream, std::vector<T>& vec) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + vec.resize(size); + stream.read(reinterpret_cast<char*>(&vec[0]), size*sizeof(T)); + } + + template<typename T> + void DumpIntVec(std::ostream& stream, const std::vector<T>& vec) { + uint32_t size = vec.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + stream.write(reinterpret_cast<const char*>(&vec[0]), size*sizeof(T)); + } + + // dump and load vectors with int that has adaptive memory model + void Load(std::istream& stream, std::vector<int>& vec) { + int8_t encoding; + stream.read(reinterpret_cast<char*>(&encoding), sizeof(int8_t)); + if(encoding == 8) { + std::vector<int8_t> int8_vec; + std::vector<int> outliers; + LoadIntVec(stream, int8_vec); + LoadIntVec(stream, outliers); + vec.assign(int8_vec.begin(), int8_vec.end()); + int n_outliers = outliers.size() / 2; + for(int i = 0; i < n_outliers; ++i) { + vec[outliers[2*i]] = outliers[2*i+1]; + } + } else if(encoding == 16) { + std::vector<int16_t> int16_vec; + std::vector<int> outliers; + LoadIntVec(stream, int16_vec); + LoadIntVec(stream, outliers); + vec.assign(int16_vec.begin(), int16_vec.end()); + int n_outliers = outliers.size() / 2; + for(int i = 0; i < n_outliers; ++i) { + vec[outliers[2*i]] = outliers[2*i+1]; + } + } else if(encoding == 32) { + LoadIntVec(stream, vec); + } else { + throw ost::Error("Encountered unknown encoding when loading int vec." ); + } + } + + void Dump(std::ostream& stream, const std::vector<int>& vec) { + std::vector<int> int8_outliers; + std::vector<int> int16_outliers; + for(uint i = 0; i < vec.size(); ++i) { + if(vec[i] < std::numeric_limits<int8_t>::min() || + vec[i] > std::numeric_limits<int8_t>::max()) { + int8_outliers.push_back(i); + if(vec[i] < std::numeric_limits<int16_t>::min() || + vec[i] > std::numeric_limits<int16_t>::max()) { + int16_outliers.push_back(i); + } + } + } + + int n_bytes_8 = 4 + 1 * vec.size(); // the data vec + n_bytes_8 += (4 + 2*4*int8_outliers.size()); // overhead to store outliers + int n_bytes_16 = 4 + 2 * vec.size(); // the data vec + n_bytes_16 += (4 + 2*4*int16_outliers.size()); // overhead to store outliers + int n_bytes_32 = 4 + 4 * vec.size(); // the data vec, no overhead + + if(n_bytes_8 < n_bytes_16 && n_bytes_8 < n_bytes_32) { + int8_t encoding = 8; + stream.write(reinterpret_cast<char*>(&encoding), sizeof(int8_t)); + std::vector<int8_t> int8_vec(vec.begin(), vec.end()); + std::vector<int> outliers; + for(auto it = int8_outliers.begin(); it != int8_outliers.end(); ++it) { + outliers.push_back(*it); + outliers.push_back(vec[*it]); + } + DumpIntVec(stream, int8_vec); + DumpIntVec(stream, outliers); + } else if(n_bytes_16 < n_bytes_32) { + int8_t encoding = 16; + stream.write(reinterpret_cast<char*>(&encoding), sizeof(int8_t)); + std::vector<int16_t> int16_vec(vec.begin(), vec.end()); + std::vector<int> outliers; + for(auto it = int16_outliers.begin(); it != int16_outliers.end(); ++it) { + outliers.push_back(*it); + outliers.push_back(vec[*it]); + } + DumpIntVec(stream, int16_vec); + DumpIntVec(stream, outliers); + } else { + int8_t encoding = 32; + stream.write(reinterpret_cast<char*>(&encoding), sizeof(int8_t)); + DumpIntVec(stream, vec); + } + } + + // dump and load vectors with strings + void Load(std::istream& stream, std::vector<String>& vec) { + std::vector<uint8_t> string_sizes; + String str; + LoadIntVec(stream, string_sizes); + Load(stream, str); + vec.resize(string_sizes.size()); + int idx = 0; + for(uint i = 0; i < string_sizes.size(); ++i) { + vec[i] = str.substr(idx, string_sizes[i]); + idx += string_sizes[i]; + } + } + + void Dump(std::ostream& stream, const std::vector<String>& vec) { + String total_string; + std::vector<uint8_t> string_sizes; + for(auto it = vec.begin(); it != vec.end(); ++it) { + if(it->size() > std::numeric_limits<uint8_t>::max()) { + std::stringstream ss; + ss << "Max string size that can be encoded is "; + ss << std::numeric_limits<uint8_t>::max() << " cannot encode "<< *it; + } + string_sizes.push_back(it->size()); + total_string += *it; + } + DumpIntVec(stream, string_sizes); + Dump(stream, total_string); + } + + // dump and load vectors with ResidueDefinition + void Load(std::istream& stream, std::vector<ost::io::ResidueDefinition>& vec) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + vec.resize(size); + for(uint i = 0; i < size; ++i) { + vec[i].FromStream(stream); + } + } + + void Dump(std::ostream& stream, const std::vector<ost::io::ResidueDefinition>& vec) { + uint32_t size = vec.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + for(uint i = 0; i < size; ++i) { + vec[i].ToStream(stream); + } + } + + // dump and load vectors with BioUnitDefinition + void Load(std::istream& stream, std::vector<ost::io::BioUnitDefinition>& vec) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + vec.resize(size); + for(uint i = 0; i < size; ++i) { + vec[i].FromStream(stream); + } + } + + void Dump(std::ostream& stream, const std::vector<ost::io::BioUnitDefinition>& vec) { + uint32_t size = vec.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + for(uint i = 0; i < size; ++i) { + vec[i].ToStream(stream); + } + } + + // dump and load vectors with Mat4 + void Load(std::istream& stream, std::vector<geom::Mat4>& vec) { + uint32_t size; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + vec.resize(size); + for(uint i = 0; i < size; ++i) { + stream.read(reinterpret_cast<char*>(vec[i].Data()),16*sizeof(Real)); + } + } + + void Dump(std::ostream& stream, const std::vector<geom::Mat4>& vec) { + uint32_t size = vec.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + for(uint i = 0; i < size; ++i) { + stream.write(reinterpret_cast<const char*>(vec[i].Data()), 16*sizeof(Real)); + } + } + + // knowledge based dump and load functions, i.e. awesome compressions + void LoadIsHetatm(std::istream& stream, std::vector<bool>& vec) { + std::vector<int> run_length_encoded; + Load(stream, run_length_encoded); + std::vector<int> int_vec; + RunLengthDecoding(run_length_encoded, int_vec); + vec.assign(int_vec.begin(), int_vec.end()); + } + + void DumpIsHetatm(std::ostream& stream, const std::vector<bool>& vec) { + std::vector<int> int_vec(vec.begin(), vec.end()); + std::vector<int> run_length_encoded; + RunLengthEncoding(int_vec, run_length_encoded); + Dump(stream, run_length_encoded); + } + + void LoadBondOrders(std::istream& stream, std::vector<int>& vec) { + std::vector<int> run_length_encoded; + Load(stream, run_length_encoded); + RunLengthDecoding(run_length_encoded, vec); + } + + void DumpBondOrders(std::ostream& stream, const std::vector<int>& vec) { + std::vector<int> run_length_encoded; + RunLengthEncoding(vec, run_length_encoded); + Dump(stream, run_length_encoded); + } + + void LoadPosVec(std::istream& stream, std::vector<Real>& vec) { + std::vector<int> delta_encoded; + Load(stream, delta_encoded); + std::vector<int> int_vec; + DeltaDecoding(delta_encoded, int_vec); + IntToRealVec(int_vec, vec, 0.001); + } + + void LoadPositions(std::istream& stream, geom::Vec3List& positions) { + std::vector<Real> x_pos; + std::vector<Real> y_pos; + std::vector<Real> z_pos; + LoadPosVec(stream, x_pos); + LoadPosVec(stream, y_pos); + LoadPosVec(stream, z_pos); + positions.resize(x_pos.size()); + for(uint i = 0; i < positions.size(); ++i) { + positions[i] = geom::Vec3(x_pos[i], y_pos[i], z_pos[i]); + } + } + + void DumpPosVec(std::ostream& stream, const std::vector<Real>& vec) { + std::vector<int> int_vec; + RealToIntVec(vec, int_vec, 1000); + std::vector<int> delta_compressed; + DeltaEncoding(int_vec, delta_compressed); + Dump(stream, delta_compressed); + } + + void DumpPositions(std::ostream& stream, const geom::Vec3List& positions) { + std::vector<Real> x_pos(positions.size()); + std::vector<Real> y_pos(positions.size()); + std::vector<Real> z_pos(positions.size()); + for(uint i = 0; i < positions.size(); ++i) { + x_pos[i] = positions[i][0]; + y_pos[i] = positions[i][1]; + z_pos[i] = positions[i][2]; + } + DumpPosVec(stream, x_pos); + DumpPosVec(stream, y_pos); + DumpPosVec(stream, z_pos); + } + + void LoadBFactors(std::istream& stream, std::vector<Real>& bfactors) { + std::vector<int> delta_encoded; + Load(stream, delta_encoded); + std::vector<int> int_vec; + DeltaDecoding(delta_encoded, int_vec); + IntToRealVec(int_vec, bfactors, 0.01); + } + + void DumpBFactors(std::ostream& stream, const std::vector<Real>& bfactors) { + std::vector<int> int_vec; + RealToIntVec(bfactors, int_vec, 100); + std::vector<int> delta_encoded; + DeltaEncoding(int_vec, delta_encoded); + Dump(stream, delta_encoded); + } + + void LoadOccupancies(std::istream& stream, std::vector<Real>& occupancies) { + std::vector<int> run_length_encoded; + Load(stream, run_length_encoded); + std::vector<int> int_vec; + RunLengthDecoding(run_length_encoded, int_vec); + IntToRealVec(int_vec, occupancies, 0.01); + } + + void DumpOccupancies(std::ostream& stream, const std::vector<Real>& occupancies) { + std::vector<int> int_vec; + RealToIntVec(occupancies, int_vec, 100.0); + std::vector<int> run_length_encoded; + RunLengthEncoding(int_vec, run_length_encoded); + Dump(stream, run_length_encoded); + } + + void LoadInsertionCodes(std::istream& stream, std::vector<char>& ins_codes) { + std::vector<int> run_length_encoded; + Load(stream, run_length_encoded); + std::vector<int> int_vec; + RunLengthDecoding(run_length_encoded, int_vec); + ins_codes.assign(int_vec.begin(), int_vec.end()); + } + + void DumpInsertionCodes(std::ostream& stream, const std::vector<char>& ins_codes) { + std::vector<int> int_vec(ins_codes.begin(), ins_codes.end()); + std::vector<int> run_length_encoded; + RunLengthEncoding(int_vec, run_length_encoded); + Dump(stream, run_length_encoded); + } + + void LoadRnums(std::istream& stream, std::vector<int>& rnums) { + std::vector<int> run_length_encoded; + Load(stream, run_length_encoded); + std::vector<int> delta_encoded; + RunLengthDecoding(run_length_encoded, delta_encoded); + DeltaDecoding(delta_encoded, rnums); + } + + void DumpRnums(std::ostream& stream, const std::vector<int>& rnums) { + std::vector<int> delta_encoded; + DeltaEncoding(rnums, delta_encoded); + std::vector<int> run_length_encoded; + RunLengthEncoding(delta_encoded, run_length_encoded); + Dump(stream, run_length_encoded); + } + + void LoadResDefIndices(std::istream& stream, std::vector<int>& indices) { + Load(stream, indices); + } + + void DumpResDefIndices(std::ostream& stream, const std::vector<int>& indices) { + Dump(stream, indices); + } +} + + +namespace ost { namespace io { + +ResidueDefinition::ResidueDefinition(const ost::mol::ResidueHandle& res) { + name = res.GetName(); + olc = res.GetOneLetterCode(); + chem_type = char(res.GetChemType()); + chem_class = char(res.GetChemClass()); + ost::mol::AtomHandleList at_list = res.GetAtomList(); + for(auto it = at_list.begin(); it != at_list.end(); ++it) { + anames.push_back(it->GetName()); + } + + // sort atom names and store other info accordingly + // sorting is required to compare definitions from different residues + std::sort(anames.begin(), anames.end()); + std::unordered_map<long, int> hash_idx_mapper; + for(auto it = anames.begin(); it != anames.end(); ++it) { + ost::mol::AtomHandle at = res.FindAtom(*it); + hash_idx_mapper[at.GetHashCode()] = elements.size(); + elements.push_back(at.GetElement()); + is_hetatm.push_back(at.IsHetAtom()); + } + + // in principle we want to store all unique bonds, but nevertheless + // use a map here to track the bond orders at the same time + std::unordered_map<std::pair<int, int>, int, pair_hash> bond_order_map; + for(auto at_it = at_list.begin(); at_it != at_list.end(); ++at_it) { + ost::mol::BondHandleList bond_list = at_it->GetBondList(); + for(auto bond_it = bond_list.begin(); bond_it != bond_list.end(); ++bond_it) { + // The atom represented by at_it is either first OR second atom of that + // bond. So we check whether BOTH are in hash_idx_mapper to exclude bonds + // to other residues. + long h1 = bond_it->GetFirst().GetHashCode(); + long h2 = bond_it->GetSecond().GetHashCode(); + if(hash_idx_mapper.find(h1) != hash_idx_mapper.end() && + hash_idx_mapper.find(h2) != hash_idx_mapper.end()) { + int i1 = hash_idx_mapper[h1]; + int i2 = hash_idx_mapper[h2]; + if(i1 < i2) { + bond_order_map[std::make_pair(i1, i2)] = bond_it->GetBondOrder(); + } else { + bond_order_map[std::make_pair(i2, i1)] = bond_it->GetBondOrder(); + } + } + } + } + + // super stupid vec intended for sorting... sorry for that + std::vector<std::pair<std::pair<int, int>, int > > tmp; + for(auto it = bond_order_map.begin(); it != bond_order_map.end(); ++it) { + tmp.push_back(std::make_pair(std::make_pair(it->first.first, + it->first.second), it->second)); + } + + std::sort(tmp.begin(), tmp.end()); + for(auto it = tmp.begin(); it != tmp.end(); ++it) { + bonds.push_back(it->first.first); + bonds.push_back(it->first.second); + bond_orders.push_back(it->second); + } +} + +void ResidueDefinition::FromStream(std::istream& stream) { + Load(stream, name); + Load(stream, olc); + Load(stream, chem_type); + Load(stream, chem_class); + Load(stream, anames); + Load(stream, elements); + LoadIsHetatm(stream, is_hetatm); + Load(stream, bonds); + LoadBondOrders(stream, bond_orders); +} + +void ResidueDefinition::ToStream(std::ostream& stream) const{ + Dump(stream, name); + Dump(stream, olc); + Dump(stream, chem_type); + Dump(stream, chem_class); + Dump(stream, anames); + Dump(stream, elements); + DumpIsHetatm(stream, is_hetatm); + Dump(stream, bonds); + DumpBondOrders(stream, bond_orders); +} + +BioUnitDefinition::BioUnitDefinition(const ost::io::MMCifInfoBioUnit& bu) { + + au_chains = bu.GetChainList(); + + const std::vector<std::pair<int, int> >& bu_ch_intvl = bu.GetChainIntervalList(); + for(auto it = bu_ch_intvl.begin(); it != bu_ch_intvl.end(); ++it) { + chain_intvl.push_back(it->first); + chain_intvl.push_back(it->second); + } + + const std::vector<std::vector<MMCifInfoTransOpPtr> >& bu_op_list = bu.GetOperations(); + for(auto i = bu_op_list.begin(); i != bu_op_list.end(); ++i) { + std::vector<geom::Mat4> mat_list; + for(auto j = i->begin(); j != i->end(); ++j) { + geom::Mat4 m; + m.PasteRotation((*j)->GetMatrix()); + m.PasteTranslation((*j)->GetVector()); + mat_list.push_back(m); + } + operations.push_back(mat_list); + } + + const std::vector<std::pair<int, int> >& bu_op_intvl = bu.GetOperationsIntervalList(); + for(auto it = bu_op_intvl.begin(); it != bu_op_intvl.end(); ++it) { + op_intvl.push_back(it->first); + op_intvl.push_back(it->second); + } +} + +void BioUnitDefinition::ToStream(std::ostream& stream) const { + Dump(stream, au_chains); + Dump(stream, chain_intvl); + uint32_t size = operations.size(); + stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + for(auto it = operations.begin(); it != operations.end(); ++it) { + Dump(stream, *it); + } + Dump(stream, op_intvl); +} + +void BioUnitDefinition::FromStream(std::istream& stream) { + Load(stream, au_chains); + Load(stream, chain_intvl); + uint32_t size = 0; + stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t)); + operations.resize(size); + for(uint i = 0; i < size; ++i) { + Load(stream, operations[i]); + } + Load(stream, op_intvl); +} + +ChainData::ChainData(const ost::mol::ChainHandle& chain, + const std::unordered_map<ResidueDefinition, + int, ResidueDefinitionHash>& res_def_map) { + + ch_name = chain.GetName(); + + // some mappers to deal with the bonds later on + std::unordered_map<long, int> residue_idx_mapper; + std::unordered_map<long, int> atom_idx_mapper; + + // process residues + ost::mol::ResidueHandleList res_list = chain.GetResidueList(); + for(auto res_it = res_list.begin(); res_it != res_list.end(); ++res_it){ + ResidueDefinition def(*res_it); + res_def_indices.push_back(res_def_map.at(def)); + rnums.push_back(res_it->GetNumber().GetNum()); + insertion_codes.push_back(res_it->GetNumber().GetInsCode()); + sec_structures.push_back(char(res_it->GetSecStructure())); + + // process atoms of that residue + for(auto a_it = def.anames.begin(); a_it != def.anames.end(); ++a_it) { + ost::mol::AtomHandle at = res_it->FindAtom(*a_it); + residue_idx_mapper[at.GetHashCode()] = rnums.size() - 1; + atom_idx_mapper[at.GetHashCode()] = positions.size(); + occupancies.push_back(at.GetOccupancy()); + bfactors.push_back(at.GetBFactor()); + positions.push_back(at.GetPos()); + } + } + + // process bonds + // in principle we want to store all unique bonds, but nevertheless + // use a map here to track the bond orders at the same time + std::unordered_map<std::pair<int, int>, int, pair_hash> bond_order_map; + + // the bonds itself can only be extracted from the overall entity + ost::mol::BondHandleList bond_list = chain.GetEntity().GetBondList(); + for(auto bond_it = bond_list.begin(); bond_it != bond_list.end(); ++bond_it) { + // The atom represented by at_it is either first OR second atom of that + // bond. So we check whether BOTH are in hash_idx_mapper to exclude bonds + // to other chains. + long h1 = bond_it->GetFirst().GetHashCode(); + long h2 = bond_it->GetSecond().GetHashCode(); + if(residue_idx_mapper.find(h1) != residue_idx_mapper.end() && + residue_idx_mapper.find(h2) != residue_idx_mapper.end()) { + // they're both from the same chain, let's check whether they are + // from different residues + if(residue_idx_mapper[h1] != residue_idx_mapper[h2]) { + // that's a relevant bond to add! + int at1 = atom_idx_mapper[h1]; + int at2 = atom_idx_mapper[h2]; + if(at1 < at2) { + bond_order_map[std::make_pair(at1, at2)] = bond_it->GetBondOrder(); + } else { + bond_order_map[std::make_pair(at2, at1)] = bond_it->GetBondOrder(); + } + } + } + } + + // super stupid vec intended for sorting... sorry for that + std::vector<std::pair<std::pair<int, int>, int > > tmp; + for(auto it = bond_order_map.begin(); it != bond_order_map.end(); ++it) { + tmp.push_back(std::make_pair(std::make_pair(it->first.first, + it->first.second), it->second)); + } + + std::sort(tmp.begin(), tmp.end()); + for(auto it = tmp.begin(); it != tmp.end(); ++it) { + bonds.push_back(it->first.first); + bonds.push_back(it->first.second); + bond_orders.push_back(it->second); + } +} + +void ChainData::ToStream(std::ostream& stream) const { + Dump(stream, ch_name); + DumpResDefIndices(stream, res_def_indices); + DumpRnums(stream, rnums); + DumpInsertionCodes(stream, insertion_codes); + DumpOccupancies(stream, occupancies); + DumpBFactors(stream, bfactors); + DumpPositions(stream, positions); + Dump(stream, bonds); + DumpBondOrders(stream, bond_orders); + DumpIntVec(stream, sec_structures); +} + +void ChainData::FromStream(std::istream& stream) { + Load(stream, ch_name); + LoadResDefIndices(stream, res_def_indices); + LoadRnums(stream, rnums); + LoadInsertionCodes(stream, insertion_codes); + LoadOccupancies(stream, occupancies); + LoadBFactors(stream, bfactors); + LoadPositions(stream, positions); + Load(stream, bonds); + LoadBondOrders(stream, bond_orders); + LoadIntVec(stream, sec_structures); +} + +OMFPtr OMF::FromEntity(const ost::mol::EntityHandle& ent) { + + OMFPtr omf(new OMF); + + // generate unordered map to assign indices to each unique residue definition + std::unordered_map<ResidueDefinition, int, ResidueDefinitionHash> res_def_map; + ost::mol::ResidueHandleList res_list = ent.GetResidueList(); + int idx = 0; + for(auto it = res_list.begin(); it != res_list.end(); ++it) { + ResidueDefinition def(*it); + auto map_it = res_def_map.find(def); + if(map_it == res_def_map.end()) { + // we didn't see that one before + res_def_map[def] = idx; + ++idx; + omf->residue_definitions_.push_back(def); + } + } + + ost::mol::ChainHandleList chain_list = ent.GetChainList(); + for(auto ch_it = chain_list.begin(); ch_it != chain_list.end(); ++ch_it) { + omf->chain_data_[ch_it->GetName()] = + ChainDataPtr(new ChainData(*ch_it, res_def_map)); + } + + return omf; +} + +OMFPtr OMF::FromMMCIF(const ost::mol::EntityHandle& ent, + const MMCifInfo& info) { + + OMFPtr p = OMF::FromEntity(ent); + const std::vector<MMCifInfoBioUnit>& biounits = info.GetBioUnits(); + for(auto it = biounits.begin(); it != biounits.end(); ++it) { + p->biounit_definitions_.push_back(BioUnitDefinition(*it)); + } + return p; +} + +OMFPtr OMF::FromFile(const String& fn) { + std::ifstream in_stream(fn.c_str(), std::ios::binary); + if (!in_stream) { + throw ost::Error("Could not open " + fn); + } + OMFPtr loaded_omf(new OMF); + loaded_omf->FromStream(in_stream); + return loaded_omf; +} + +void OMF::ToFile(const String& fn) const { + std::ofstream out_stream(fn.c_str(), std::ios::binary); + if (!out_stream) { + throw ost::Error("Could not open " + fn); + } + this->ToStream(out_stream); +} + +OMFPtr OMF::FromString(const String& s) { + std::istringstream in_stream(s); + OMFPtr loaded_omf(new OMF); + loaded_omf->FromStream(in_stream); + return loaded_omf; +} + +String OMF::ToString() const { + std::ostringstream out_stream; + this->ToStream(out_stream); + return out_stream.str(); +} + +ost::mol::EntityHandle OMF::GetAU() const{ + ost::mol::EntityHandle ent = mol::CreateEntity(); + ost::mol::XCSEditor ed = ent.EditXCS(mol::BUFFERED_EDIT); + + for(auto it = chain_data_.begin(); it!=chain_data_.end(); ++it) { + ost::mol::ChainHandle ch = ed.InsertChain(it->first); + this->FillChain(ch, ed, it->second); + } + return ent; +} + +ost::mol::EntityHandle OMF::GetAUChain(const String& name) const{ + + if(chain_data_.find(name) == chain_data_.end()) { + throw ost::Error("No chain of name " + name); + } + ost::mol::EntityHandle ent = mol::CreateEntity(); + ost::mol::XCSEditor ed = ent.EditXCS(mol::BUFFERED_EDIT); + ost::mol::ChainHandle ch = ed.InsertChain(name); + this->FillChain(ch, ed, chain_data_.at(name)); + return ent; +} + +ost::mol::EntityHandle OMF::GetBU(int bu_idx) const{ + if(bu_idx < 0 || bu_idx >= static_cast<int>(biounit_definitions_.size())) { + throw ost::Error("Invalid biounit idx"); + } + + const BioUnitDefinition& bu = biounit_definitions_[bu_idx]; + ost::mol::EntityHandle ent = mol::CreateEntity(); + ost::mol::XCSEditor ed = ent.EditXCS(mol::BUFFERED_EDIT); + + std::vector<String> au_chain_names; + std::vector<geom::Mat4> transforms; + + // The code below is pure magic and heavily inspired by + // the biounit buildup in modules/io/pymod/__init__.py + int n_intervals = bu.chain_intvl.size() / 2; + for(int intvl_idx = 0; intvl_idx < n_intervals; ++intvl_idx) { + std::vector<geom::Mat4> rts; + int op_start = bu.op_intvl[2*intvl_idx]; + int op_end = bu.op_intvl[2*intvl_idx+1]; + int n_intv_ops = op_end - op_start; + if(n_intv_ops) { + for(auto it = bu.operations[op_start].begin(); + it != bu.operations[op_start].end(); ++it) { + rts.push_back(*it); + } + ++op_start; + while(op_start < op_end) { + std::vector<geom::Mat4> tmp_rts; + for(auto i = bu.operations[op_start].begin(); + i != bu.operations[op_start].end(); ++i) { + for(auto j = rts.begin(); j != rts.end(); ++j) { + tmp_rts.push_back((*j)*(*i)); + } + } + rts = tmp_rts; + ++op_start; + } + } + for(int ch_idx = bu.chain_intvl[2*intvl_idx]; + ch_idx < bu.chain_intvl[2*intvl_idx+1]; ++ch_idx) { + for(auto it = rts.begin(); it != rts.end(); ++it) { + au_chain_names.push_back(bu.au_chains[ch_idx]); + transforms.push_back(*it); + } + } + } + + ChainNameGenerator gen; + for(uint bu_ch_idx = 0; bu_ch_idx < au_chain_names.size(); ++bu_ch_idx) { + String bu_ch_name = gen.Get(); + ost::mol::ChainHandle added_chain = ed.InsertChain(bu_ch_name); + this->FillChain(added_chain, ed, chain_data_.at(au_chain_names[bu_ch_idx]), + transforms[bu_ch_idx]); + } + + return ent; +} + +void OMF::ToStream(std::ostream& stream) const { + Dump(stream, residue_definitions_); + Dump(stream, biounit_definitions_); + Dump(stream, chain_data_); +} + +void OMF::FromStream(std::istream& stream) { + Load(stream, residue_definitions_); + Load(stream, biounit_definitions_); + Load(stream, chain_data_); +} + +void OMF::FillChain(ost::mol::ChainHandle& chain, ost::mol::XCSEditor& ed, + const ChainDataPtr data, geom::Mat4 t) const { + + geom::Vec3List* positions = &data->positions; + geom::Vec3List transformed_positions; // only filled if non-identity transform + if(t != geom::Mat4()) { + transformed_positions.resize(positions->size()); + Real a,b,c; + for (uint i = 0; i < transformed_positions.size(); ++i) { + const geom::Vec3& p = data->positions[i]; + a = t(0,0)*p[0]+t(0,1)*p[1]+t(0,2)*p[2]+t(0,3); + b = t(1,0)*p[0]+t(1,1)*p[1]+t(1,2)*p[2]+t(1,3); + c = t(2,0)*p[0]+t(2,1)*p[1]+t(2,2)*p[2]+t(2,3); + transformed_positions[i][0] = a; + transformed_positions[i][1] = b; + transformed_positions[i][2] = c; + } + positions = &transformed_positions; // bend around + } + + int at_idx = 0; + for(uint res_idx = 0; res_idx < data->res_def_indices.size(); ++res_idx) { + const ResidueDefinition& res_def = + residue_definitions_[data->res_def_indices[res_idx]]; + ost::mol::ResNum num = ost::mol::ResNum(data->rnums[res_idx], + data->insertion_codes[res_idx]); + ost::mol::ResidueHandle res = ed.AppendResidue(chain, res_def.name, num); + res.SetOneLetterCode(res_def.olc); + res.SetChemType(ost::mol::ChemType(res_def.chem_type)); + res.SetChemClass(ost::mol::ChemClass(res_def.chem_class)); + res.SetSecStructure(ost::mol::SecStructure(data->sec_structures[res_idx])); + + for(uint i = 0; i < res_def.anames.size(); ++i) { + ed.InsertAtom(res, res_def.anames[i], (*positions)[at_idx], + res_def.elements[i], data->occupancies[at_idx], + data->bfactors[at_idx], res_def.is_hetatm[i]); + ++at_idx; + } + + ost::mol::AtomHandleList added_atoms = res.GetAtomList(); + for(uint bond_idx = 0; bond_idx < res_def.bond_orders.size(); ++bond_idx) { + ed.Connect(added_atoms[res_def.bonds[2*bond_idx]], + added_atoms[res_def.bonds[2*bond_idx+1]], + res_def.bond_orders[bond_idx]); + } + } + ost::mol::AtomHandleList added_atoms = chain.GetAtomList(); + for(uint bond_idx = 0; bond_idx < data->bond_orders.size(); ++bond_idx) { + ed.Connect(added_atoms[data->bonds[2*bond_idx]], + added_atoms[data->bonds[2*bond_idx+1]], + data->bond_orders[bond_idx]); + } +} + +}} //ns diff --git a/modules/io/src/mol/omf.hh b/modules/io/src/mol/omf.hh new file mode 100644 index 0000000000000000000000000000000000000000..a8fb1de304d45684b59d36a041790b8d8836afae --- /dev/null +++ b/modules/io/src/mol/omf.hh @@ -0,0 +1,179 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2020 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_IO_OMF_HH +#define OST_IO_OMF_HH + +#include <unordered_map> +#include <unordered_set> +#include <fstream> +#include <boost/iostreams/filtering_stream.hpp> +#include <ost/mol/entity_handle.hh> +#include <ost/geom/mat4.hh> +#include <ost/io/io_exception.hh> +#include <ost/io/mmcif_info.hh> + +namespace ost { namespace io { + +class ChainData; +class BioUnitData; +class OMF; +typedef boost::shared_ptr<OMF> OMFPtr; +typedef boost::shared_ptr<ChainData> ChainDataPtr; +typedef boost::shared_ptr<BioUnitData> BioUnitDataPtr; + +struct ResidueDefinition { + + ResidueDefinition() { }; + + ResidueDefinition(const ost::mol::ResidueHandle& res); + + bool operator==(const ResidueDefinition& other) const { + return (name == other.name && + olc == other.olc && + chem_type == other.chem_type && + chem_class == other.chem_class && + anames == other.anames && + elements == other.elements && + is_hetatm == other.is_hetatm && + bonds == other.bonds && + bond_orders == other.bond_orders); + } + + bool operator!=(const ResidueDefinition& other) const { + return !(*this == other); + } + + void ToStream(std::ostream& stream) const; + + void FromStream(std::istream& stream); + + String name; + char olc; + char chem_type; + char chem_class; + std::vector<String> anames; + std::vector<String> elements; + std::vector<bool> is_hetatm; + std::vector<int> bonds; + std::vector<int> bond_orders; +}; + + +// define hash function, so we can use ResidueDefinition as key in an unordered +// map. The used hash function is overly simple and gives a hash collision +// whenever we have two residues of same name but different atom composition. +// That's hopefully rare... +struct ResidueDefinitionHash { + std::size_t operator()(const ResidueDefinition& r) const { + return std::hash<String>()(r.name); + } +}; + + +struct BioUnitDefinition { + BioUnitDefinition() { } + + BioUnitDefinition(const ost::io::MMCifInfoBioUnit& bu); + + void ToStream(std::ostream& stream) const; + + void FromStream(std::istream& stream); + + std::vector<String> au_chains; + std::vector<int> chain_intvl; + std::vector<std::vector<geom::Mat4> > operations; + std::vector<int> op_intvl; +}; + + +struct ChainData { + + ChainData() { } + + ChainData(const ost::mol::ChainHandle& chain, + const std::unordered_map<ResidueDefinition, + int, ResidueDefinitionHash>& res_def_map); + + void ToStream(std::ostream& stream) const; + + void FromStream(std::istream& stream); + + // chain features + String ch_name; + + // residue features + std::vector<int> res_def_indices; + std::vector<int> rnums; + std::vector<char> insertion_codes; + std::vector<char> sec_structures; + + // atom features + std::vector<Real> occupancies; + std::vector<Real> bfactors; + geom::Vec3List positions; + + // bond features - only for bonds that are inter-residue + // e.g. peptide bonds + std::vector<int> bonds; + std::vector<int> bond_orders; +}; + +class OMF { + +public: + + static OMFPtr FromEntity(const ost::mol::EntityHandle& ent); + + static OMFPtr FromMMCIF(const ost::mol::EntityHandle& ent, + const MMCifInfo& info); + + static OMFPtr FromFile(const String& fn); + + static OMFPtr FromString(const String& s); + + void ToFile(const String& fn) const; + + String ToString() const; + + ost::mol::EntityHandle GetAU() const; + + ost::mol::EntityHandle GetAUChain(const String& name) const; + + ost::mol::EntityHandle GetBU(int bu_idx) const; + +private: + // only construct with static functions + OMF() { } + + void ToStream(std::ostream& stream) const; + + void FromStream(std::istream& stream); + + void FillChain(ost::mol::ChainHandle& chain, ost::mol::XCSEditor& ed, + const ChainDataPtr data, + geom::Mat4 transform = geom::Mat4()) const; + + std::vector<ResidueDefinition> residue_definitions_; + std::vector<BioUnitDefinition> biounit_definitions_; + std::map<String, ChainDataPtr> chain_data_; +}; + +}} //ns + +#endif