From 18f2f1aa1b5d65acf70a1013fb399d4eac87f2dd Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Fri, 6 Jan 2023 14:57:19 +0100 Subject: [PATCH] Basic AFDB functions used by some experimental code --- modelling/pymod/CMakeLists.txt | 1 + modelling/pymod/export_afdb.cc | 42 +++++++ modelling/pymod/wrap_modelling.cc | 2 + modelling/src/CMakeLists.txt | 2 + modelling/src/afdb.cc | 192 ++++++++++++++++++++++++++++++ modelling/src/afdb.hh | 33 +++++ 6 files changed, 272 insertions(+) create mode 100644 modelling/pymod/export_afdb.cc create mode 100644 modelling/src/afdb.cc create mode 100644 modelling/src/afdb.hh diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt index c58f17c9..22cc8f33 100644 --- a/modelling/pymod/CMakeLists.txt +++ b/modelling/pymod/CMakeLists.txt @@ -11,6 +11,7 @@ set(MODELLING_CPP export_scoring_weights.cc export_sidechain_reconstructor.cc export_motif_finder.cc + export_afdb.cc wrap_modelling.cc ) diff --git a/modelling/pymod/export_afdb.cc b/modelling/pymod/export_afdb.cc new file mode 100644 index 00000000..b4d4eb08 --- /dev/null +++ b/modelling/pymod/export_afdb.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and +// Biozentrum - University of Basel +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include <boost/python.hpp> +#include <promod3/modelling/afdb.hh> + +using namespace ost; +using namespace boost::python; +using namespace promod3::modelling; + +namespace { + void WrapSeqToPentamerIndices(const String& sequence, bool unique, + boost::python::list& l) { + std::vector<int> indices; + SeqToPentamerIndices(sequence, unique, indices); + for(size_t i = 0; i < indices.size(); ++i) { + l.append(indices[i]); + } + } +} + +void export_afdb() +{ + def("SeqToPentamerIndices", &WrapSeqToPentamerIndices, (arg("sequence"), + arg("unique"), + arg("result_list"))); + def("CreateAFDBIdx", &CreateAFDBIdx, (arg("uniprot_ac"), arg("fragment"), + arg("version"))); +} diff --git a/modelling/pymod/wrap_modelling.cc b/modelling/pymod/wrap_modelling.cc index 39b82e4f..21e00850 100644 --- a/modelling/pymod/wrap_modelling.cc +++ b/modelling/pymod/wrap_modelling.cc @@ -28,6 +28,7 @@ void export_score_container(); void export_scoring_weights(); void export_SidechainReconstructor(); void export_motif_finder(); +void export_afdb(); BOOST_PYTHON_MODULE(_modelling) { @@ -43,4 +44,5 @@ BOOST_PYTHON_MODULE(_modelling) export_scoring_weights(); export_SidechainReconstructor(); export_motif_finder(); + export_afdb(); } diff --git a/modelling/src/CMakeLists.txt b/modelling/src/CMakeLists.txt index 440e65e0..0a618209 100644 --- a/modelling/src/CMakeLists.txt +++ b/modelling/src/CMakeLists.txt @@ -19,6 +19,7 @@ set(MODELLING_SOURCES sidechain_reconstructor.cc sidechain_env_listener.cc motif_finder.cc + afdb.cc ) set(MODELLING_HEADERS @@ -43,6 +44,7 @@ set(MODELLING_HEADERS sidechain_env_listener.hh motif_finder.hh robin_hood.h + afdb.hh ) module(NAME modelling diff --git a/modelling/src/afdb.cc b/modelling/src/afdb.cc new file mode 100644 index 00000000..d41564c8 --- /dev/null +++ b/modelling/src/afdb.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and +// Biozentrum - University of Basel +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <algorithm> + +#include <ost/log.hh> + +#include <promod3/modelling/afdb.hh> + +namespace{ + inline int CharToIdx(char ch) { + switch(ch){ + case 'A': return 0; + case 'C': return 1; + case 'D': return 2; + case 'E': return 3; + case 'F': return 4; + case 'G': return 5; + case 'H': return 6; + case 'I': return 7; + case 'K': return 8; + case 'L': return 9; + case 'M': return 10; + case 'N': return 11; + case 'P': return 12; + case 'Q': return 13; + case 'R': return 14; + case 'S': return 15; + case 'T': return 16; + case 'V': return 17; + case 'W': return 18; + case 'Y': return 19; + } + std::stringstream ss; + ss << "nonstandard olc observed: " << ch; + throw ost::Error(ss.str()); + } + + inline uint64_t AlphaToIdx(char ch) { + if(ch == ' ') { + return 0; + } else { + return static_cast<uint64_t>(ch - 'A' + 1); + } + } + + inline uint64_t NumericToIdx(char ch) { + if(ch == ' ') { + return 0; + } else { + return static_cast<uint64_t>(ch - '0' + 1); + } + } + + inline uint64_t AlphaNumericToIdx(char ch) { + if(ch == ' ') { + return 0; + } else if(ch >= '0' && ch <= '9') { + return static_cast<uint64_t>(ch-'0' + 1); + } else { + return static_cast<uint64_t>(ch-'A' + 1 + 10); + } + } + + inline bool CheckAlpha(char ch, bool allow_whitespace) { + return (ch>='A' and ch<='Z') || (allow_whitespace && ch == ' '); + } + + inline bool CheckNumeric(char ch, bool allow_whitespace) { + return (ch>='0' and ch<='9') || (allow_whitespace && ch == ' '); + } + + inline bool CheckAlphaNumeric(char ch, bool allow_whitespace) { + return CheckAlpha(ch, allow_whitespace) || + CheckNumeric(ch, allow_whitespace); + } +} + +namespace promod3 { namespace modelling { + +int PentamerToIdx(const char* ptr) { + return CharToIdx(ptr[0])*160000 + CharToIdx(ptr[1])*8000 + + CharToIdx(ptr[2])*400 + CharToIdx(ptr[3])*20 + CharToIdx(ptr[4]); +} + +void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices) { + int N = seq.size() - 4; + indices.resize(N); + for(int i = 0; i < N; ++i) { + indices[i] = PentamerToIdx(&seq[i]); + } + if(unique) { + auto last = std::unique(indices.begin(), indices.end()); + indices.erase(last, indices.end()); + } +} + +uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) { + + // check if uniprot AC has expected size of 6 or 10 + // https://www.uniprot.org/help/accession_numbers + size_t ac_size = uniprot_ac.size(); + if(ac_size != 6 && ac_size != 10) { + std::stringstream ss; + ss << "Expect uniprot AC to be of size 6 or 10, got: " << uniprot_ac; + throw ost::Error(ss.str()); + } + + if(!CheckAlpha(uniprot_ac[0], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckNumeric(uniprot_ac[1], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckAlphaNumeric(uniprot_ac[2], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckAlphaNumeric(uniprot_ac[3], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckAlphaNumeric(uniprot_ac[4], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckNumeric(uniprot_ac[5], false)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(ac_size > 6) { + if(!CheckAlpha(uniprot_ac[6], true)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckAlphaNumeric(uniprot_ac[7], true)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckAlphaNumeric(uniprot_ac[8], true)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + + if(!CheckNumeric(uniprot_ac[9], true)) { + throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + } + } + + if(fragment < 0 || fragment > 127) { + std::stringstream ss; + ss << "Expect fragment to be in range [0, 127], got: " << fragment; + } + if(version < 0 || version > 31) { + std::stringstream ss; + ss << "Expect version to be in range [0, 31], got: " << version; + } + + uint64_t idx = 0; + idx += AlphaToIdx(uniprot_ac[0]); + idx += NumericToIdx(uniprot_ac[1]) << 5; + idx += AlphaNumericToIdx(uniprot_ac[2]) << 9; + idx += AlphaNumericToIdx(uniprot_ac[3]) << 15; + idx += AlphaNumericToIdx(uniprot_ac[4]) << 21; + idx += NumericToIdx(uniprot_ac[5]) << 27; + if(ac_size > 6) { + idx += AlphaToIdx(uniprot_ac[6]) << 31; + idx += AlphaNumericToIdx(uniprot_ac[7]) << 36; + idx += AlphaNumericToIdx(uniprot_ac[8]) << 42; + idx += NumericToIdx(uniprot_ac[9]) << 48; + } + idx += static_cast<uint64_t>(fragment) << 52; + idx += static_cast<uint64_t>(version) << 59; + return idx; +} + + + +}} //ns diff --git a/modelling/src/afdb.hh b/modelling/src/afdb.hh new file mode 100644 index 00000000..e48f792b --- /dev/null +++ b/modelling/src/afdb.hh @@ -0,0 +1,33 @@ +// Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and +// Biozentrum - University of Basel +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PM3_MODELLING_AFDB_HH +#define PM3_MODELLING_AFDB_HH + +#include <ost/mol/mol.hh> +#include <promod3/core/message.hh> + +namespace promod3 { namespace modelling { + +int PentamerToIdx(const char* ptr); + +void SeqToPentamerIndices(const String& seq, bool unique, + std::vector<int>& indices); + +uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version); + +}} //ns + +#endif -- GitLab