From 5c643b732d195286fc4218cc1d39c6c4bcf6ea9a Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Fri, 21 Jun 2024 10:20:58 +0200
Subject: [PATCH] substitution matrices: Add string identifier and remove MATCH
 and IDENTITY presets

The reason for removing MATCH and IDENTITY is to only have matrices
which are also available in the parasail library. This simplifies the
usage of parasail as a drop-in replacement. To be honest: I doubt that
anyone used these matrices.
---
 modules/seq/alg/doc/seqalg.rst             | 61 ++++++++++++++++++----
 modules/seq/alg/pymod/mat.py               |  5 +-
 modules/seq/alg/pymod/wrap_seq_alg.cc      |  5 +-
 modules/seq/alg/src/subst_weight_matrix.cc | 36 ++-----------
 modules/seq/alg/src/subst_weight_matrix.hh | 10 ++--
 5 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst
index c0f9b885b..31274af12 100644
--- a/modules/seq/alg/doc/seqalg.rst
+++ b/modules/seq/alg/doc/seqalg.rst
@@ -261,24 +261,59 @@ Algorithms for Alignments
 Substitution Weight Matrices and BLOSUM Matrices
 --------------------------------------------------------------------------------
 
-.. autoclass:: SubstWeightMatrix
-   :members:
+.. class:: SubstWeightMatrix
+
+  Substitution weights for alignment algorithms
+
+  .. method:: GetWeight(olc_one, olc_two)
+
+    Get :class:`int` weight for pair of characters
+
+    :param olc_one: first character
+    :type olc_one: :class:`str`
+    :param olc_two: second character
+    :type olc_two: :class:`str`
+
+  .. method:: SetWeight(olc_one, olc_two, weight)
+
+    Set :class:`int` weight for pair of characters
+
+    :param olc_one: first character
+    :type olc_one: :class:`str`
+    :param olc_two: second character
+    :type olc_two: :class:`str`
+    :param weight: the weight
+    :type weight: :class:`int`
+
+  .. method:: GetMinWeight()
+
+    Returns the minimum weight of the matrix
+
+  .. method:: GetMaxWeight()
+
+    Returns the maximum weight of the matrix
+
+  .. method:: GetName()
+
+    Getter for name (empty string if not set)
+
+  .. method:: SetName(name)
+
+    Setter for name
+
+    :param name: Name to be set
+    :type name: :class:`str`
 
 .. _blosum:
 
-Four preset BLOSUM (BLOcks SUbstitution Matrix) matrices are available at 
-different levels of sequence identity:
+Four preset BLOSUM (BLOcks SUbstitution Matrix) matrices are available
+at different levels of sequence identity:
 
 - BLOSUM45
 - BLOSUM62
 - BLOSUM80
 - BLOSUM100
 
-Two naive substitution matrices:
-
-- IDENTITY: Matches have score of 1, all other are 0
-- MATCH: Matches have score of 1, all other are -1
-
 Nucleotide substitution matrices:
 
 - NUC44: Nucleotide substitution matrix used in blastn that can deal with IUPAC
@@ -286,6 +321,14 @@ Nucleotide substitution matrices:
   equivalence, i.e. you can just do `m.GetWeight('G', 'U')` instead of first
   translating 'U' to 'T'. 
 
+They can be directly accessed upon importing the sequence module:
+
+.. code-block:: python
+
+  from ost import seq
+  mat = seq.alg.BLOSUM62
+  print(mat.GetWeight('A', 'A'))
+
 
 .. _contact-prediction:
 
diff --git a/modules/seq/alg/pymod/mat.py b/modules/seq/alg/pymod/mat.py
index f00ec2237..873d2b42a 100644
--- a/modules/seq/alg/pymod/mat.py
+++ b/modules/seq/alg/pymod/mat.py
@@ -9,9 +9,6 @@ BLOSUM45 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM45)
 BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62)
 BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80)
 BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100)
-IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY)
-MATCH = _InitMatrix(SubstWeightMatrix.Preset.MATCH)
 NUC44 = _InitMatrix(SubstWeightMatrix.Preset.NUC44)
 
-__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY', 'MATCH',
-         'NUC44']
+__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100','NUC44']
diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc
index 508a8cc5a..cc57693e8 100644
--- a/modules/seq/alg/pymod/wrap_seq_alg.cc
+++ b/modules/seq/alg/pymod/wrap_seq_alg.cc
@@ -215,6 +215,9 @@ void export_contact_prediction()
                       .def("GetMinWeight", &SubstWeightMatrix::GetMinWeight)
                       .def("GetMaxWeight", &SubstWeightMatrix::GetMaxWeight)
                       .def("AssignPreset", &SubstWeightMatrix::AssignPreset)
+                      .def("SetName", &SubstWeightMatrix::SetName)
+                      .def("GetName", &SubstWeightMatrix::GetName,
+                        return_value_policy<copy_const_reference>())
   ;
 
   enum_<SubstWeightMatrix::Preset>("Preset")
@@ -222,8 +225,6 @@ void export_contact_prediction()
     .value("BLOSUM62", SubstWeightMatrix::BLOSUM62)
     .value("BLOSUM80", SubstWeightMatrix::BLOSUM80)
     .value("BLOSUM100", SubstWeightMatrix::BLOSUM100)
-    .value("IDENTITY", SubstWeightMatrix::IDENTITY)
-    .value("MATCH", SubstWeightMatrix::MATCH)
     .value("NUC44", SubstWeightMatrix::NUC44)
   ;
 }
diff --git a/modules/seq/alg/src/subst_weight_matrix.cc b/modules/seq/alg/src/subst_weight_matrix.cc
index 6d22a0b9c..3c7ee7225 100644
--- a/modules/seq/alg/src/subst_weight_matrix.cc
+++ b/modules/seq/alg/src/subst_weight_matrix.cc
@@ -170,28 +170,6 @@ void FillNucData(ost::seq::alg::SubstWeightMatrix* subst,
   }
 }
 
-void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
-  char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
-                    'P','Q','R','S','T','U','V','W','X','Y','Z'};
-  for(uint i = 0; i < 26; ++i) {
-    subst->SetWeight(chars[i], chars[i], 1.0);
-  }
-}
-
-void FillMatch(ost::seq::alg::SubstWeightMatrix* subst) {
-  char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
-                    'P','Q','R','S','T','U','V','W','X','Y','Z'};
-  for(uint i = 0; i < 26; ++i) {
-    for(uint j = 0; j < 26; ++j) {
-      if(i == j){
-        subst->SetWeight(chars[i], chars[j], 1.0);
-      } else {
-        subst->SetWeight(chars[i], chars[j], -1.0);
-      }
-    }
-  }
-}
-
 }
 
 namespace ost { namespace seq { namespace alg {
@@ -205,34 +183,30 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p)
   switch(p){
     case BLOSUM45:{
       FillData(this,RAW_BLOSUM45_DATA);
+      this->SetName("blosum45");
       break;
     }
     case BLOSUM62:{
       FillData(this,RAW_BLOSUM62_DATA);
+      this->SetName("blosum62");
       break;
     }
     case BLOSUM80:{
       FillData(this,RAW_BLOSUM80_DATA);
+      this->SetName("blosum80");
       break;
     }
     case BLOSUM100:{
       FillData(this,RAW_BLOSUM100_DATA);
-      break;
-    }
-    case IDENTITY:{
-      FillIdentity(this);
-      break;
-    }
-    case MATCH:{
-      FillMatch(this);
+      this->SetName("blosum100");
       break;
     }
     case NUC44:{
       FillNucData(this,RAW_NUC44_DATA);
+      this->SetName("nuc44");
       break;
     }
   }
 }
 
-
 }}}
diff --git a/modules/seq/alg/src/subst_weight_matrix.hh b/modules/seq/alg/src/subst_weight_matrix.hh
index 50636399e..a0e1e5862 100644
--- a/modules/seq/alg/src/subst_weight_matrix.hh
+++ b/modules/seq/alg/src/subst_weight_matrix.hh
@@ -22,6 +22,7 @@
 #include <ctype.h>
 #include <string.h>
 #include <boost/shared_ptr.hpp>
+#include <ost/base.hh>
 #include <ost/config.hh>
 #include <ost/seq/alg/module_config.hh>
 
@@ -44,9 +45,7 @@ public:
               BLOSUM62 = 1,
               BLOSUM80 = 2,
               BLOSUM100 = 3,
-              IDENTITY = 4,
-              MATCH = 5,
-              NUC44 = 6};
+              NUC44 = 4};
   /// \brief Initialize substitution matrix with zero.
   /// 
   /// In order to get a useful  substitution weight matrix, use SetWeight(). 
@@ -92,6 +91,10 @@ public:
     }
   }
 
+  /// \brief Store \p name as the string identifier of this matrix.
+  void SetName(const String& name) { name_ = name; }
+  /// \brief Return the string identifier of this matrix as last set by SetName().
+  const String& GetName() const { return name_; }
 private:
   int Index(char aa_one, char aa_two) const {
     return (toupper(aa_one)-'A')*ALPHABET_SIZE+(toupper(aa_two)-'A');
@@ -104,6 +107,7 @@ private:
   WeightType weights_[ALPHABET_SIZE*ALPHABET_SIZE];
   WeightType max_weight_;
   WeightType min_weight_;
+  String name_;
 };
 
 }}}
-- 
GitLab