From 18f2f1aa1b5d65acf70a1013fb399d4eac87f2dd Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Fri, 6 Jan 2023 14:57:19 +0100
Subject: [PATCH] Basic AFDB functions used by some experimental code

---
 modelling/pymod/CMakeLists.txt    |   1 +
 modelling/pymod/export_afdb.cc    |  42 +++++++
 modelling/pymod/wrap_modelling.cc |   2 +
 modelling/src/CMakeLists.txt      |   2 +
 modelling/src/afdb.cc             | 192 ++++++++++++++++++++++++++++++
 modelling/src/afdb.hh             |  33 +++++
 6 files changed, 272 insertions(+)
 create mode 100644 modelling/pymod/export_afdb.cc
 create mode 100644 modelling/src/afdb.cc
 create mode 100644 modelling/src/afdb.hh

diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt
index c58f17c9..22cc8f33 100644
--- a/modelling/pymod/CMakeLists.txt
+++ b/modelling/pymod/CMakeLists.txt
@@ -11,6 +11,7 @@ set(MODELLING_CPP
   export_scoring_weights.cc
   export_sidechain_reconstructor.cc
   export_motif_finder.cc
+  export_afdb.cc
   wrap_modelling.cc
 )
 
diff --git a/modelling/pymod/export_afdb.cc b/modelling/pymod/export_afdb.cc
new file mode 100644
index 00000000..b4d4eb08
--- /dev/null
+++ b/modelling/pymod/export_afdb.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
+//                          Biozentrum - University of Basel
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//   http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <boost/python.hpp>
+#include <promod3/modelling/afdb.hh>
+
+using namespace ost;
+using namespace boost::python;
+using namespace promod3::modelling;
+
+namespace {
+  // Python wrapper around SeqToPentamerIndices: appends the computed
+  // indices to the list passed in as *l*.
+  void WrapSeqToPentamerIndices(const String& sequence, bool unique,
+                                boost::python::list& l) {
+    std::vector<int> pentamer_indices;
+    SeqToPentamerIndices(sequence, unique, pentamer_indices);
+    for(int pentamer_idx : pentamer_indices) { l.append(pentamer_idx); }
+  }
+}
+
+// Exposes the AFDB helper functions to Python. Invoked from the
+// BOOST_PYTHON_MODULE(_modelling) definition in wrap_modelling.cc.
+void export_afdb() {
+  def("SeqToPentamerIndices", &WrapSeqToPentamerIndices,
+      (arg("sequence"), arg("unique"), arg("result_list")));
+  def("CreateAFDBIdx", &CreateAFDBIdx,
+      (arg("uniprot_ac"), arg("fragment"), arg("version")));
+}
diff --git a/modelling/pymod/wrap_modelling.cc b/modelling/pymod/wrap_modelling.cc
index 39b82e4f..21e00850 100644
--- a/modelling/pymod/wrap_modelling.cc
+++ b/modelling/pymod/wrap_modelling.cc
@@ -28,6 +28,7 @@ void export_score_container();
 void export_scoring_weights();
 void export_SidechainReconstructor();
 void export_motif_finder();
+void export_afdb();
 
 BOOST_PYTHON_MODULE(_modelling)
 {
@@ -43,4 +44,5 @@ BOOST_PYTHON_MODULE(_modelling)
   export_scoring_weights();
   export_SidechainReconstructor();
   export_motif_finder();
+  export_afdb();
 }
diff --git a/modelling/src/CMakeLists.txt b/modelling/src/CMakeLists.txt
index 440e65e0..0a618209 100644
--- a/modelling/src/CMakeLists.txt
+++ b/modelling/src/CMakeLists.txt
@@ -19,6 +19,7 @@ set(MODELLING_SOURCES
   sidechain_reconstructor.cc
   sidechain_env_listener.cc
   motif_finder.cc
+  afdb.cc
 )
 
 set(MODELLING_HEADERS
@@ -43,6 +44,7 @@ set(MODELLING_HEADERS
   sidechain_env_listener.hh
   motif_finder.hh
   robin_hood.h
+  afdb.hh
 )
 
 module(NAME modelling
diff --git a/modelling/src/afdb.cc b/modelling/src/afdb.cc
new file mode 100644
index 00000000..d41564c8
--- /dev/null
+++ b/modelling/src/afdb.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
+//                          Biozentrum - University of Basel
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//   http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include <ost/log.hh>
+
+#include <promod3/modelling/afdb.hh>
+
+namespace{
+  // Translates the one letter code of a standard amino acid into an index
+  // in [0, 19]; indices follow the alphabetical order of the one letter
+  // codes ('A' -> 0, 'C' -> 1, ..., 'Y' -> 19).
+  // Throws ost::Error for every other character.
+  inline int CharToIdx(char ch) {
+    // lookup table for 'A' to 'Z', -1 marks letters that are no standard
+    // one letter code (B, J, O, U, X, Z)
+    static const int kOlcIdx[26] = {
+       0, -1,  1,  2,  3,  // A B C D E
+       4,  5,  6,  7, -1,  // F G H I J
+       8,  9, 10, 11, -1,  // K L M N O
+      12, 13, 14, 15, 16,  // P Q R S T
+      -1, 17, 18, -1, 19,  // U V W X Y
+      -1                   // Z
+    };
+
+    if(ch >= 'A' && ch <= 'Z') {
+      const int olc_idx = kOlcIdx[ch - 'A'];
+      if(olc_idx >= 0) {
+        return olc_idx;
+      }
+    }
+
+    std::stringstream ss;
+    ss << "nonstandard olc observed: " << ch;
+    throw ost::Error(ss.str());
+  }
+
+  // Maps 'A'..'Z' to 1..26 and a blank to 0. The input is not validated,
+  // any non-blank character simply yields ch - 'A' + 1.
+  inline uint64_t AlphaToIdx(char ch) {
+    if(ch == ' ') return 0;
+    const int letter_pos = ch - 'A';
+    return static_cast<uint64_t>(letter_pos + 1);
+  }
+
+  // Maps '0'..'9' to 1..10 and a blank to 0. The input is not validated,
+  // any non-blank character simply yields ch - '0' + 1.
+  inline uint64_t NumericToIdx(char ch) {
+    const bool is_blank = (ch == ' ');
+    const int code = is_blank ? 0 : (ch - '0' + 1);
+    return static_cast<uint64_t>(code);
+  }
+
+  // Maps a blank to 0, '0'..'9' to 1..10 and 'A'..'Z' to 11..36. The input
+  // is not validated, anything that is neither blank nor digit yields
+  // ch - 'A' + 11.
+  inline uint64_t AlphaNumericToIdx(char ch) {
+    if(ch == ' ') return 0;
+    const bool is_digit = ('0' <= ch && ch <= '9');
+    const int code = is_digit ? (ch - '0' + 1) : (ch - 'A' + 1 + 10);
+    return static_cast<uint64_t>(code);
+  }
+
+  // Character type checks for single characters of a uniprot AC. With
+  // *allow_whitespace*, a blank (' ') is accepted in addition.
+  inline bool CheckAlpha(char ch, bool allow_whitespace) {
+    return (allow_whitespace && ch == ' ') || ('A' <= ch && ch <= 'Z');
+  }
+  inline bool CheckNumeric(char ch, bool allow_whitespace) {
+    return (allow_whitespace && ch == ' ') || ('0' <= ch && ch <= '9');
+  }
+  inline bool CheckAlphaNumeric(char ch, bool allow_whitespace) {
+    const bool is_alpha = CheckAlpha(ch, allow_whitespace);
+    return is_alpha || CheckNumeric(ch, allow_whitespace);
+  }
+}
+
+namespace promod3 { namespace modelling {
+
+int PentamerToIdx(const char* ptr) {  // ptr[0..4] read as a base-20 number
+  int idx = 0;
+  for(int i = 0; i < 5; ++i) { idx = 20*idx + CharToIdx(ptr[i]); }
+  return idx;
+}
+// Stores the index of the pentamer starting at each position of *seq* in
+// *indices* (empty if seq has less than 5 characters). If *unique* is set,
+// the result is sorted and contains every index only once.
+void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices) {
+  const size_t n = seq.size() < 5 ? 0 : seq.size() - 4;
+  indices.resize(n);
+  for(size_t i = 0; i < n; ++i) { indices[i] = PentamerToIdx(&seq[i]); }
+  if(!unique) return;
+  std::sort(indices.begin(), indices.end());  // unique only drops neighbours
+  indices.erase(std::unique(indices.begin(), indices.end()), indices.end());
+}
+
+// Encodes a uniprot AC, a fragment number and a version in one 64 bit index.
+//
+// Bit layout, starting at the least significant bit:
+//   - AC characters 0 to 5: 5, 4, 6, 6, 6 and 4 bits
+//   - AC characters 6 to 9: 5, 6, 6 and 4 bits (all zero for ACs of size 6)
+//   - fragment: 7 bits starting at bit 52
+//   - version: 5 bits starting at bit 59
+//
+// Throws ost::Error if *uniprot_ac* is not of size 6 or 10, if one of its
+// characters is not of the expected type, if *fragment* is not in [0, 127]
+// or if *version* is not in [0, 31].
+uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
+  // check if uniprot AC has expected size of 6 or 10
+  // https://www.uniprot.org/help/accession_numbers
+  const size_t ac_size = uniprot_ac.size();
+  if(ac_size != 6 && ac_size != 10) {
+    std::stringstream ss;
+    ss << "Expect uniprot AC to be of size 6 or 10, got: " << uniprot_ac;
+    throw ost::Error(ss.str());
+  }
+
+  // expected type of each character: A = capital alphabetic, N = numeric,
+  // X = A or N. Whitespace is additionally accepted from idx 6 onwards.
+  static const char kCharTypes[] = "ANXXXNAXXN";
+  for(size_t i = 0; i < ac_size; ++i) {
+    const bool allow_whitespace = (i >= 6);
+    const char* expected = "";
+    bool valid = false;
+    switch(kCharTypes[i]) {
+      case 'A':
+        valid = CheckAlpha(uniprot_ac[i], allow_whitespace);
+        expected = "capital alphabetic";
+        break;
+      case 'N':
+        valid = CheckNumeric(uniprot_ac[i], allow_whitespace);
+        expected = "numeric";
+        break;
+      default:
+        valid = CheckAlphaNumeric(uniprot_ac[i], allow_whitespace);
+        expected = "capital alphabetic or numeric";
+        break;
+    }
+    if(!valid) {
+      std::stringstream ss;
+      ss << "Exp " << expected << (allow_whitespace ? " or whitespace" : "")
+         << " character at idx " << i << " of uniprot AC";
+      throw ost::Error(ss.str());
+    }
+  }
+
+  if(fragment < 0 || fragment > 127) {
+    std::stringstream ss;
+    ss << "Expect fragment to be in range [0, 127], got: " << fragment;
+    throw ost::Error(ss.str());
+  }
+  if(version < 0 || version > 31) {
+    std::stringstream ss;
+    ss << "Expect version to be in range [0, 31], got: " << version;
+    throw ost::Error(ss.str());
+  }
+
+  // every value was checked to fit its bit field, so the fields can't overlap
+  uint64_t idx = 0;
+  idx += AlphaToIdx(uniprot_ac[0]);
+  idx += NumericToIdx(uniprot_ac[1]) << 5;
+  idx += AlphaNumericToIdx(uniprot_ac[2]) << 9;
+  idx += AlphaNumericToIdx(uniprot_ac[3]) << 15;
+  idx += AlphaNumericToIdx(uniprot_ac[4]) << 21;
+  idx += NumericToIdx(uniprot_ac[5]) << 27;
+  if(ac_size > 6) {
+    idx += AlphaToIdx(uniprot_ac[6]) << 31;
+    idx += AlphaNumericToIdx(uniprot_ac[7]) << 36;
+    idx += AlphaNumericToIdx(uniprot_ac[8]) << 42;
+    idx += NumericToIdx(uniprot_ac[9]) << 48;
+  }
+  idx += static_cast<uint64_t>(fragment) << 52;
+  idx += static_cast<uint64_t>(version) << 59;
+  return idx;
+}
+
+
+
+}} //ns
diff --git a/modelling/src/afdb.hh b/modelling/src/afdb.hh
new file mode 100644
index 00000000..e48f792b
--- /dev/null
+++ b/modelling/src/afdb.hh
@@ -0,0 +1,33 @@
+// Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
+//                          Biozentrum - University of Basel
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//   http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PM3_MODELLING_AFDB_HH
+#define PM3_MODELLING_AFDB_HH
+
+#include <ost/mol/mol.hh>
+#include <promod3/core/message.hh>
+
+namespace promod3 { namespace modelling {
+
+int PentamerToIdx(const char* ptr);  // index of the pentamer at ptr[0..4]
+// Stores the index of every pentamer in *seq* in *indices*.
+void SeqToPentamerIndices(const String& seq, bool unique,
+                          std::vector<int>& indices);
+// Encodes uniprot AC, fragment and version in a single 64 bit integer.
+uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version);
+
+}} //ns
+
+#endif
-- 
GitLab