From 65530e32ac7044943e412e866266cbf6648e9d49 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Thu, 6 Feb 2020 18:01:34 +0100
Subject: [PATCH] algorithm that calculates an HMM-HMM alignment score the
 hhblits way

---
 modules/seq/alg/src/hmm_score.cc | 214 +++++++++++++++++++++++++++++++
 modules/seq/alg/src/hmm_score.hh |  35 +++++
 2 files changed, 249 insertions(+)
 create mode 100644 modules/seq/alg/src/hmm_score.cc
 create mode 100644 modules/seq/alg/src/hmm_score.hh

diff --git a/modules/seq/alg/src/hmm_score.cc b/modules/seq/alg/src/hmm_score.cc
new file mode 100644
index 000000000..03dc372a9
--- /dev/null
+++ b/modules/seq/alg/src/hmm_score.cc
@@ -0,0 +1,214 @@
+#include <ost/seq/alg/hmm_score.hh>
+#include <numeric>
+
+namespace{
+
+void SetupSequence(const ost::seq::ProfileHandle& profile,
+	               const ost::seq::AlignmentHandle& aln,
+	               int s_idx, String& seq, int& offset) {
+
+  ost::seq::ConstSequenceHandle s_handle = aln.GetSequence(s_idx);
+  seq = s_handle.GetString();
+  String s_gapless = s_handle.GetGaplessString();
+  offset = s_handle.GetOffset();
+
+  if(offset < 0) {
+    throw ost::Error("aln must not contain sequences with negative offsets!");
+  }
+
+  // check whether sequences in aln match with profile sequences
+  if(offset + s_gapless.size() > profile.size() ||
+     profile.GetSequence().substr(offset, s_gapless.size()) != s_gapless) {
+    std::stringstream ss;
+    ss << "aln sequence: \"" << s_gapless << "\" (sequence offset: ";
+    ss << offset << ") does not match sequence in profile: ";
+    ss << profile.GetSequence();
+    throw ost::Error(ss.str());
+  }
+}
+
+
+Real InsertionTransitionScore(const ost::seq::ProfileHandle& prof_0, 
+                              const ost::seq::ProfileHandle& prof_1,
+                              int before_0, int before_1,
+                              int after_0, int after_1) {
+
+  // option 1:
+  // s0 switched to insertion state after the beginning of the gap
+  // and s1 stayed in match state
+  ////////////////////////////////////////////////////////////////
+
+  Real ins_score = 0.0;
+
+  // s0
+  ins_score += 
+  std::log2(prof_0[before_0].GetTransProb(ost::seq::HMM_M2I)) * 0.6;
+  int l = after_1 - before_1 - 1;
+  ins_score += 
+  (l - 1) * std::log2(prof_0[before_0].GetTransProb(ost::seq::HMM_I2I)) * 0.6;    
+  ins_score += std::log2(prof_0[before_0].GetTransProb(ost::seq::HMM_I2M));
+
+  // s1
+  for(int i = before_1; i < after_1; ++i) {
+    ins_score += std::log2(prof_1[i].GetTransProb(ost::seq::HMM_M2M));
+  }
+
+  // option 2:
+  // s0 has one single M->M transition
+  // and s1 switched into deletion state at the beginning of the gap
+  //////////////////////////////////////////////////////////////////
+
+  Real del_score = 0.0;
+
+  // s0
+  del_score += std::log2(prof_0[before_0].GetTransProb(ost::seq::HMM_M2M));
+
+  // s1
+  del_score += 
+  std::log2(prof_1[before_1].GetTransProb(ost::seq::HMM_M2D)) * 0.6;
+  for(int i = before_1 + 1; i < after_1 - 1; ++i ) {
+  	del_score += std::log2(prof_1[i].GetTransProb(ost::seq::HMM_D2D)) * 0.6;
+  }
+  del_score += std::log2(prof_1[after_1-1].GetTransProb(ost::seq::HMM_D2M));
+
+  return std::max(ins_score, del_score);
+}
+
+} // ns
+
+namespace ost{ namespace seq{ namespace alg{
+
+Real HMMScore(const ost::seq::ProfileHandle& prof_0, 
+	          const ost::seq::ProfileHandle& prof_1,
+	          const ost::seq::AlignmentHandle& aln,
+	          int seq_0_idx, int seq_1_idx) {
+
+  String s_0;
+  int s_0_o;
+  SetupSequence(prof_0, aln, seq_0_idx, s_0, s_0_o);
+
+  String s_1;
+  int s_1_o;
+  SetupSequence(prof_1, aln, seq_1_idx, s_1, s_1_o);
+
+  // hhblits uses the average background frequencies by default
+  std::vector<Real> null_freq(20, 0.0);
+  const Real* prof_0_null_freq = prof_0.GetNullModel().freqs_begin(); 
+  const Real* prof_1_null_freq = prof_1.GetNullModel().freqs_begin(); 
+  for(int i = 0; i < 20; ++i) {
+    null_freq[i] = (prof_0_null_freq[i] + prof_1_null_freq[i]) / 2;
+  }
+
+  Real transition_score = 0.0;
+  std::vector<Real> matching_scores(s_0.size(), 0.0); 
+  int n_matches = 0;
+
+  int col_0 = 0;
+  int col_1 = 0;
+  int state = 0; // 0: match
+                 // 1: insertion s0
+                 // 2: insertion s1
+  Real offset = -0.03; // described in 2005 hmm-hmm comparison paper by soding
+
+  // sum up column scores and all MM->MM transition scores
+  for(uint idx = 0; idx < s_0.size(); ++idx) {
+    if(s_0[idx] == '-' && s_1[idx] == '-') {
+      throw Error("Cannot have gap at same location in both sequences!");
+    }
+    if(s_0[idx] != '-' && s_1[idx] != '-') {
+      // score of matching hmm columns
+      Real summed_col_score = 0.0;
+      const Real* prof_0_freq = prof_0[col_0+s_0_o].freqs_begin();
+      const Real* prof_1_freq = prof_1[col_1+s_1_o].freqs_begin();
+      for (uint i = 0; i < 20; ++i) {
+        summed_col_score += (prof_0_freq[i] * prof_1_freq[i] / null_freq[i]);
+      }
+      matching_scores[idx] = std::log2(summed_col_score);
+      ++n_matches;
+
+      if(state == 0 && col_0>0 && col_1>0){
+        // MM->MM
+        transition_score += 
+        std::log2(prof_0[col_0+s_0_o-1].GetTransProb(ost::seq::HMM_M2M));
+        transition_score += 
+        std::log2(prof_1[col_1+s_1_o-1].GetTransProb(ost::seq::HMM_M2M));
+      } 
+      state = 0;
+      ++col_0;
+      ++col_1;
+    } else if(s_0[idx] == '-') {
+      state = 1;
+      ++col_1;
+    } else if(s_1[idx] == '-') {
+      state = 2;
+      ++col_0;
+    }
+  }
+
+  Real correl_score = 0.0;
+  for(int d = 1; d <= 4; ++d) {
+    for(size_t l = 0; l < matching_scores.size() - d; ++l) {
+      correl_score += matching_scores[l] * matching_scores[l+d];
+    }
+  }
+
+  // Deal with transition probabilities from indels
+  col_0 = 0;
+  col_1 = 0;
+  state = 0;
+
+  int ins_0_before_col_0 = -1;
+  int ins_0_before_col_1 = -1;
+  int ins_1_before_col_0 = -1;
+  int ins_1_before_col_1 = -1;
+
+  for(uint idx = 0; idx < s_0.size(); ++idx) {
+
+    if(s_0[idx] != '-' && s_1[idx] != '-') {
+      if(state == 1 && ins_0_before_col_0 >= 0) {
+        // an insertion in s0 ends here
+        transition_score += InsertionTransitionScore(prof_0, prof_1,  
+                                                     ins_0_before_col_0+s_0_o, 
+                                                     ins_0_before_col_1+s_1_o, 
+                                                     col_0+s_0_o, col_1+s_1_o);
+      }
+      if(state == 2 && ins_1_before_col_1 >= 0) {
+        // an insertion in s0 ends here
+        transition_score += InsertionTransitionScore(prof_1, prof_0, 
+                                                     ins_1_before_col_1+s_1_o, 
+                                                     ins_1_before_col_0+s_0_o, 
+                                                     col_1+s_1_o, col_0+s_0_o);
+      }
+      ++col_0;
+      ++col_1;
+      state = 0;
+    } else if(s_0[idx] == '-') {
+      if(state == 0) {
+        // start new insertion
+        ins_0_before_col_0 = col_0-1;
+        ins_0_before_col_1 = col_1-1;
+      } 
+      state = 1;
+      ++col_1;
+    } else if(s_1[idx] == '-') {
+      if(state == 0) {
+        // start new insertion
+        ins_1_before_col_0 = col_0-1;
+        ins_1_before_col_1 = col_1-1;
+      } 
+      state = 2;
+      ++col_0;
+    }
+  }
+
+  Real score = std::accumulate(matching_scores.begin(), 
+                               matching_scores.end(), 0.0);
+
+  score += n_matches * offset;
+  score += 0.1 * correl_score;
+  score += transition_score;
+
+  return score;
+}
+
+}}} // ns
diff --git a/modules/seq/alg/src/hmm_score.hh b/modules/seq/alg/src/hmm_score.hh
new file mode 100644
index 000000000..c84f74d47
--- /dev/null
+++ b/modules/seq/alg/src/hmm_score.hh
@@ -0,0 +1,35 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2020 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+
+#ifndef OST_SEQ_ALG_HMM_SCORE_HH
+#define OST_SEQ_ALG_HMM_SCORE_HH
+
+#include <ost/seq/profile_handle.hh>
+#include <ost/seq/alignment_handle.hh>
+
+namespace ost{ namespace seq{ namespace alg{
+
+Real HMMScore(const ost::seq::ProfileHandle& profile_0, 
+	          const ost::seq::ProfileHandle& profile_1,
+	          const ost::seq::AlignmentHandle& aln,
+	          int seq_0_idx, int seq_1_idx);
+
+}}} // ns
+
+#endif
\ No newline at end of file
-- 
GitLab