From 85fb9022d16d80de370c5c2043c619bc7a597fae Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Tue, 10 May 2022 10:53:29 +0200
Subject: [PATCH] add MATCH and NUC44 substitution matrices

both can be used for nucleotide alignments
---
 modules/seq/alg/doc/seqalg.rst                | 12 +++++
 modules/seq/alg/pymod/mat.py                  |  5 +-
 modules/seq/alg/pymod/wrap_seq_alg.cc         |  2 +
 modules/seq/alg/src/data/NUC.4.4_edited       | 28 ++++++++++
 modules/seq/alg/src/data/generate_nuc_data.py | 44 ++++++++++++++++
 modules/seq/alg/src/subst_weight_matrix.cc    | 52 +++++++++++++++++++
 modules/seq/alg/src/subst_weight_matrix.hh    |  4 +-
 7 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 modules/seq/alg/src/data/NUC.4.4_edited
 create mode 100644 modules/seq/alg/src/data/generate_nuc_data.py

diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst
index b4061e23d..a092012d9 100644
--- a/modules/seq/alg/doc/seqalg.rst
+++ b/modules/seq/alg/doc/seqalg.rst
@@ -274,6 +274,18 @@ different levels of sequence identity:
 - BLOSUM80
 - BLOSUM100
 
+Two naive substitution matrices:
+
+- IDENTITY: Matches have score of 1, all other are 0
+- MATCH: Matches have score of 1, all other are -1
+
+Nucleotide substitution matrices:
+
+- NUC44: Nucleotide substitution matrix used in blastn that can deal with IUPAC
+  ambiguity codes. ATTENTION: has been edited to explicitely encode T/U
+  equivalence, i.e. you can just do `m.GetWeight('G', 'U')` instead of first
+  translating 'U' to 'T'. 
+
 
 .. _contact-prediction:
 
diff --git a/modules/seq/alg/pymod/mat.py b/modules/seq/alg/pymod/mat.py
index 57fa10c17..f00ec2237 100644
--- a/modules/seq/alg/pymod/mat.py
+++ b/modules/seq/alg/pymod/mat.py
@@ -10,5 +10,8 @@ BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62)
 BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80)
 BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100)
 IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY)
+MATCH = _InitMatrix(SubstWeightMatrix.Preset.MATCH)
+NUC44 = _InitMatrix(SubstWeightMatrix.Preset.NUC44)
 
-__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY']
+__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY', 'MATCH',
+         'NUC44']
diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc
index d0577fa5c..508a8cc5a 100644
--- a/modules/seq/alg/pymod/wrap_seq_alg.cc
+++ b/modules/seq/alg/pymod/wrap_seq_alg.cc
@@ -223,6 +223,8 @@ void export_contact_prediction()
     .value("BLOSUM80", SubstWeightMatrix::BLOSUM80)
     .value("BLOSUM100", SubstWeightMatrix::BLOSUM100)
     .value("IDENTITY", SubstWeightMatrix::IDENTITY)
+    .value("MATCH", SubstWeightMatrix::MATCH)
+    .value("NUC44", SubstWeightMatrix::NUC44)
   ;
 }
 
diff --git a/modules/seq/alg/src/data/NUC.4.4_edited b/modules/seq/alg/src/data/NUC.4.4_edited
new file mode 100644
index 000000000..4c89f7719
--- /dev/null
+++ b/modules/seq/alg/src/data/NUC.4.4_edited
@@ -0,0 +1,28 @@
+#
+# This matrix was created by Todd Lowe   12/10/92
+#
+# Uses ambiguous nucleotide codes, probabilities rounded to
+#  nearest integer
+#
+# Lowest score = -4, Highest score = 5
+#
+# EDIT GABRIEL STUDER (May 10 2022):
+# Add column/row for U to explicitely express T/U equivalence
+#
+    A   T   U   G   C   S   W   R   Y   K   M   B   V   H   D   N
+A   5  -4  -4  -4  -4  -4   1   1  -4  -4   1  -4  -1  -1  -1  -2
+T  -4   5   5  -4  -4  -4   1  -4   1   1  -4  -1  -4  -1  -1  -2
+U  -4   5   5  -4  -4  -4   1  -4   1   1  -4  -1  -4  -1  -1  -2
+G  -4  -4  -4   5  -4   1  -4   1  -4   1  -4  -1  -1  -4  -1  -2
+C  -4  -4  -4  -4   5   1  -4  -4   1  -4   1  -1  -1  -1  -4  -2
+S  -4  -4  -4   1   1  -1  -4  -2  -2  -2  -2  -1  -1  -3  -3  -1
+W   1   1   1  -4  -4  -4  -1  -2  -2  -2  -2  -3  -3  -1  -1  -1
+R   1  -4  -4   1  -4  -2  -2  -1  -4  -2  -2  -3  -1  -3  -1  -1
+Y  -4   1   1  -4   1  -2  -2  -4  -1  -2  -2  -1  -3  -1  -3  -1
+K  -4   1   1   1  -4  -2  -2  -2  -2  -1  -4  -1  -3  -3  -1  -1
+M   1  -4  -4  -4   1  -2  -2  -2  -2  -4  -1  -3  -1  -1  -3  -1
+B  -4  -1  -1  -1  -1  -1  -3  -3  -1  -1  -3  -1  -2  -2  -2  -1
+V  -1  -4  -4  -1  -1  -1  -3  -1  -3  -3  -1  -2  -1  -2  -2  -1
+H  -1  -1  -1  -4  -1  -3  -1  -3  -1  -3  -1  -2  -2  -1  -2  -1  
+D  -1  -1  -1  -1  -4  -3  -1  -1  -3  -1  -3  -2  -2  -2  -1  -1
+N  -2  -2  -2  -2  -2  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
diff --git a/modules/seq/alg/src/data/generate_nuc_data.py b/modules/seq/alg/src/data/generate_nuc_data.py
new file mode 100644
index 000000000..e8e9b5183
--- /dev/null
+++ b/modules/seq/alg/src/data/generate_nuc_data.py
@@ -0,0 +1,44 @@
+# code generation for subst_weight_matrix.cc
+# loads NUC substitution matrix from file provided by NCBI at:
+# ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/
+
+def Generate(filename, matrix_name):
+
+    with open(filename) as f:
+        data = f.readlines()
+
+    scores = dict()
+    olcs = None    
+    for line in data:
+        if line.startswith('#'):
+            continue
+        if olcs is None:
+            if " A " in line and " C " in line and " G " and " T " in line:
+                # very high likelihood that this line contains the olcs
+                olcs = line.strip().split()
+            continue
+        split_line = line.strip().split()
+        if split_line[0] in olcs:
+            olc = split_line[0]
+            for score_idx, score in enumerate(split_line[1:]):
+                scores[(olc, olcs[score_idx])] = score
+
+    ost_olcs = ''.join(olcs)
+    print("short %s[%i][%i]={"%(matrix_name, len(ost_olcs), len(ost_olcs)))
+    for a in ost_olcs:
+        score_string = "" 
+        for b in ost_olcs:
+            score = scores[(a,b)]
+            if len(score_string) == 0:
+                entry = [' ', ' ']
+            else:
+                entry = [',', ' ', ' ', ' ']
+            for i in range(len(score)):
+                entry[-1-i] = score[-1-i]
+            score_string += ''.join(entry)
+        print("  {%s},"%(score_string))
+    print("};")
+    print()
+
+Generate("NUC.4.4_edited", "RAW_NUC44_DATA")
+
diff --git a/modules/seq/alg/src/subst_weight_matrix.cc b/modules/seq/alg/src/subst_weight_matrix.cc
index 454dbeddb..6d22a0b9c 100644
--- a/modules/seq/alg/src/subst_weight_matrix.cc
+++ b/modules/seq/alg/src/subst_weight_matrix.cc
@@ -130,6 +130,25 @@ short RAW_BLOSUM100_DATA[23][23]={
   {-2,  0, -8,  0,  7, -7, -5, -1, -7,  0, -6, -4, -2, -4,  5, -1, -2, -3, -5, -7, -2, -6,  6},
 };
 
+short RAW_NUC44_DATA[16][16]={
+  { 5, -4, -4, -4, -4, -4,  1,  1, -4, -4,  1, -4, -1, -1, -1, -2},
+  {-4,  5,  5, -4, -4, -4,  1, -4,  1,  1, -4, -1, -4, -1, -1, -2},
+  {-4,  5,  5, -4, -4, -4,  1, -4,  1,  1, -4, -1, -4, -1, -1, -2},
+  {-4, -4, -4,  5, -4,  1, -4,  1, -4,  1, -4, -1, -1, -4, -1, -2},
+  {-4, -4, -4, -4,  5,  1, -4, -4,  1, -4,  1, -1, -1, -1, -4, -2},
+  {-4, -4, -4,  1,  1, -1, -4, -2, -2, -2, -2, -1, -1, -3, -3, -1},
+  { 1,  1,  1, -4, -4, -4, -1, -2, -2, -2, -2, -3, -3, -1, -1, -1},
+  { 1, -4, -4,  1, -4, -2, -2, -1, -4, -2, -2, -3, -1, -3, -1, -1},
+  {-4,  1,  1, -4,  1, -2, -2, -4, -1, -2, -2, -1, -3, -1, -3, -1},
+  {-4,  1,  1,  1, -4, -2, -2, -2, -2, -1, -4, -1, -3, -3, -1, -1},
+  { 1, -4, -4, -4,  1, -2, -2, -2, -2, -4, -1, -3, -1, -1, -3, -1},
+  {-4, -1, -1, -1, -1, -1, -3, -3, -1, -1, -3, -1, -2, -2, -2, -1},
+  {-1, -4, -4, -1, -1, -1, -3, -1, -3, -3, -1, -2, -1, -2, -2, -1},
+  {-1, -1, -1, -4, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1},
+  {-1, -1, -1, -1, -4, -3, -1, -1, -3, -1, -3, -2, -2, -2, -1, -1},
+  {-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+};
+
 void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
   char chars[23] = {'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q',
                     'R','S','T','V','W','X','Y','Z'};
@@ -140,6 +159,17 @@ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
   }
 }
 
+void FillNucData(ost::seq::alg::SubstWeightMatrix* subst,
+                 short (&data)[16][16]) {
+  char chars[16] = {'A','T','U','G','C','S','W','R','Y','K','M','B','V','H',
+                    'D','N'};
+  for(uint i = 0; i < 16; ++i){
+    for(uint j = 0; j < 16; ++j){
+      subst->SetWeight(chars[i],chars[j],data[i][j]);
+    }
+  }
+}
+
 void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
   char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
                     'P','Q','R','S','T','U','V','W','X','Y','Z'};
@@ -148,6 +178,20 @@ void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
   }
 }
 
+void FillMatch(ost::seq::alg::SubstWeightMatrix* subst) {
+  char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
+                    'P','Q','R','S','T','U','V','W','X','Y','Z'};
+  for(uint i = 0; i < 26; ++i) {
+    for(uint j = 0; j < 26; ++j) {
+      if(i == j){
+        subst->SetWeight(chars[i], chars[j], 1.0);
+      } else {
+        subst->SetWeight(chars[i], chars[j], -1.0);
+      }
+    }
+  }
+}
+
 }
 
 namespace ost { namespace seq { namespace alg {
@@ -179,6 +223,14 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p)
       FillIdentity(this);
       break;
     }
+    case MATCH:{
+      FillMatch(this);
+      break;
+    }
+    case NUC44:{
+      FillNucData(this,RAW_NUC44_DATA);
+      break;
+    }
   }
 }
 
diff --git a/modules/seq/alg/src/subst_weight_matrix.hh b/modules/seq/alg/src/subst_weight_matrix.hh
index fe057be1b..50636399e 100644
--- a/modules/seq/alg/src/subst_weight_matrix.hh
+++ b/modules/seq/alg/src/subst_weight_matrix.hh
@@ -44,7 +44,9 @@ public:
               BLOSUM62 = 1,
               BLOSUM80 = 2,
               BLOSUM100 = 3,
-              IDENTITY = 4};
+              IDENTITY = 4,
+              MATCH = 5,
+              NUC44 = 6};
   /// \brief Initialize substitution matrix with zero.
   /// 
   /// In order to get a useful  substitution weight matrix, use SetWeight(). 
-- 
GitLab