From 85fb9022d16d80de370c5c2043c619bc7a597fae Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Tue, 10 May 2022 10:53:29 +0200 Subject: [PATCH] add MATCH and NUC44 substitution matrices both can be used for nucleotide alignments --- modules/seq/alg/doc/seqalg.rst | 12 +++++ modules/seq/alg/pymod/mat.py | 5 +- modules/seq/alg/pymod/wrap_seq_alg.cc | 2 + modules/seq/alg/src/data/NUC.4.4_edited | 28 ++++++++++ modules/seq/alg/src/data/generate_nuc_data.py | 44 ++++++++++++++++ modules/seq/alg/src/subst_weight_matrix.cc | 52 +++++++++++++++++++ modules/seq/alg/src/subst_weight_matrix.hh | 4 +- 7 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 modules/seq/alg/src/data/NUC.4.4_edited create mode 100644 modules/seq/alg/src/data/generate_nuc_data.py diff --git a/modules/seq/alg/doc/seqalg.rst b/modules/seq/alg/doc/seqalg.rst index b4061e23d..a092012d9 100644 --- a/modules/seq/alg/doc/seqalg.rst +++ b/modules/seq/alg/doc/seqalg.rst @@ -274,6 +274,18 @@ different levels of sequence identity: - BLOSUM80 - BLOSUM100 +Two naive substitution matrices: + +- IDENTITY: Matches have score of 1, all other are 0 +- MATCH: Matches have score of 1, all other are -1 + +Nucleotide substitution matrices: + +- NUC44: Nucleotide substitution matrix used in blastn that can deal with IUPAC + ambiguity codes. ATTENTION: has been edited to explicitely encode T/U + equivalence, i.e. you can just do `m.GetWeight('G', 'U')` instead of first + translating 'U' to 'T'. + .. _contact-prediction: diff --git a/modules/seq/alg/pymod/mat.py b/modules/seq/alg/pymod/mat.py index 57fa10c17..f00ec2237 100644 --- a/modules/seq/alg/pymod/mat.py +++ b/modules/seq/alg/pymod/mat.py @@ -10,5 +10,8 @@ BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62) BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80) BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100) IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY) +MATCH = _InitMatrix(SubstWeightMatrix.Preset.MATCH) +NUC44 = _InitMatrix(SubstWeightMatrix.Preset.NUC44) -__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY'] +__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY', 'MATCH', + 'NUC44'] diff --git a/modules/seq/alg/pymod/wrap_seq_alg.cc b/modules/seq/alg/pymod/wrap_seq_alg.cc index d0577fa5c..508a8cc5a 100644 --- a/modules/seq/alg/pymod/wrap_seq_alg.cc +++ b/modules/seq/alg/pymod/wrap_seq_alg.cc @@ -223,6 +223,8 @@ void export_contact_prediction() .value("BLOSUM80", SubstWeightMatrix::BLOSUM80) .value("BLOSUM100", SubstWeightMatrix::BLOSUM100) .value("IDENTITY", SubstWeightMatrix::IDENTITY) + .value("MATCH", SubstWeightMatrix::MATCH) + .value("NUC44", SubstWeightMatrix::NUC44) ; } diff --git a/modules/seq/alg/src/data/NUC.4.4_edited b/modules/seq/alg/src/data/NUC.4.4_edited new file mode 100644 index 000000000..4c89f7719 --- /dev/null +++ b/modules/seq/alg/src/data/NUC.4.4_edited @@ -0,0 +1,28 @@ +# +# This matrix was created by Todd Lowe 12/10/92 +# +# Uses ambiguous nucleotide codes, probabilities rounded to +# nearest integer +# +# Lowest score = -4, Highest score = 5 +# +# EDIT GABRIEL STUDER (May 10 2022): +# Add column/row for U to explicitely express T/U equivalence +# + A T U G C S W R Y K M B V H D N +A 5 -4 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2 +T -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 +U -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 +G -4 -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2 +C -4 -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2 +S -4 -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1 +W 1 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1 +R 1 -4 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1 +Y -4 1 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1 +K -4 1 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1 +M 1 -4 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1 +B -4 -1 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1 +V -1 -4 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1 +H -1 -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1 +D -1 -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1 +N -2 -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/modules/seq/alg/src/data/generate_nuc_data.py b/modules/seq/alg/src/data/generate_nuc_data.py new file mode 100644 index 000000000..e8e9b5183 --- /dev/null +++ b/modules/seq/alg/src/data/generate_nuc_data.py @@ -0,0 +1,44 @@ +# code generation for subst_weight_matrix.cc +# loads NUC substitution matrix from file provided by NCBI at: +# ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/ + +def Generate(filename, matrix_name): + + with open(filename) as f: + data = f.readlines() + + scores = dict() + olcs = None + for line in data: + if line.startswith('#'): + continue + if olcs is None: + if " A " in line and " C " in line and " G " and " T " in line: + # very high likelihood that this line contains the olcs + olcs = line.strip().split() + continue + split_line = line.strip().split() + if split_line[0] in olcs: + olc = split_line[0] + for score_idx, score in enumerate(split_line[1:]): + scores[(olc, olcs[score_idx])] = score + + ost_olcs = ''.join(olcs) + print("short %s[%i][%i]={"%(matrix_name, len(ost_olcs), len(ost_olcs))) + for a in ost_olcs: + score_string = "" + for b in ost_olcs: + score = scores[(a,b)] + if len(score_string) == 0: + entry = [' ', ' '] + else: + entry = [',', ' ', ' ', ' '] + for i in range(len(score)): + entry[-1-i] = score[-1-i] + score_string += ''.join(entry) + print(" {%s},"%(score_string)) + print("};") + print() + +Generate("NUC.4.4_edited", "RAW_NUC44_DATA") + diff --git a/modules/seq/alg/src/subst_weight_matrix.cc b/modules/seq/alg/src/subst_weight_matrix.cc index 454dbeddb..6d22a0b9c 100644 --- a/modules/seq/alg/src/subst_weight_matrix.cc +++ b/modules/seq/alg/src/subst_weight_matrix.cc @@ -130,6 +130,25 @@ short RAW_BLOSUM100_DATA[23][23]={ {-2, 0, -8, 0, 7, -7, -5, -1, -7, 0, -6, -4, -2, -4, 5, -1, -2, -3, -5, -7, -2, -6, 6}, }; +short RAW_NUC44_DATA[16][16]={ + { 5, -4, -4, -4, -4, -4, 1, 1, -4, -4, 1, -4, -1, -1, -1, -2}, + {-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2}, + {-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2}, + {-4, -4, -4, 5, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2}, + {-4, -4, -4, -4, 5, 1, -4, -4, 1, -4, 1, -1, -1, -1, -4, -2}, + {-4, -4, -4, 1, 1, -1, -4, -2, -2, -2, -2, -1, -1, -3, -3, -1}, + { 1, 1, 1, -4, -4, -4, -1, -2, -2, -2, -2, -3, -3, -1, -1, -1}, + { 1, -4, -4, 1, -4, -2, -2, -1, -4, -2, -2, -3, -1, -3, -1, -1}, + {-4, 1, 1, -4, 1, -2, -2, -4, -1, -2, -2, -1, -3, -1, -3, -1}, + {-4, 1, 1, 1, -4, -2, -2, -2, -2, -1, -4, -1, -3, -3, -1, -1}, + { 1, -4, -4, -4, 1, -2, -2, -2, -2, -4, -1, -3, -1, -1, -3, -1}, + {-4, -1, -1, -1, -1, -1, -3, -3, -1, -1, -3, -1, -2, -2, -2, -1}, + {-1, -4, -4, -1, -1, -1, -3, -1, -3, -3, -1, -2, -1, -2, -2, -1}, + {-1, -1, -1, -4, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1}, + {-1, -1, -1, -1, -4, -3, -1, -1, -3, -1, -3, -2, -2, -2, -1, -1}, + {-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, +}; + void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){ char chars[23] = {'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q', 'R','S','T','V','W','X','Y','Z'}; @@ -140,6 +159,17 @@ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){ } } +void FillNucData(ost::seq::alg::SubstWeightMatrix* subst, + short (&data)[16][16]) { + char chars[16] = {'A','T','U','G','C','S','W','R','Y','K','M','B','V','H', + 'D','N'}; + for(uint i = 0; i < 16; ++i){ + for(uint j = 0; j < 16; ++j){ + subst->SetWeight(chars[i],chars[j],data[i][j]); + } + } +} + void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) { char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 'P','Q','R','S','T','U','V','W','X','Y','Z'}; @@ -148,6 +178,20 @@ void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) { } } +void FillMatch(ost::seq::alg::SubstWeightMatrix* subst) { + char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', + 'P','Q','R','S','T','U','V','W','X','Y','Z'}; + for(uint i = 0; i < 26; ++i) { + for(uint j = 0; j < 26; ++j) { + if(i == j){ + subst->SetWeight(chars[i], chars[j], 1.0); + } else { + subst->SetWeight(chars[i], chars[j], -1.0); + } + } + } +} + } namespace ost { namespace seq { namespace alg { @@ -179,6 +223,14 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p) FillIdentity(this); break; } + case MATCH:{ + FillMatch(this); + break; + } + case NUC44:{ + FillNucData(this,RAW_NUC44_DATA); + break; + } } } diff --git a/modules/seq/alg/src/subst_weight_matrix.hh b/modules/seq/alg/src/subst_weight_matrix.hh index fe057be1b..50636399e 100644 --- a/modules/seq/alg/src/subst_weight_matrix.hh +++ b/modules/seq/alg/src/subst_weight_matrix.hh @@ -44,7 +44,9 @@ public: BLOSUM62 = 1, BLOSUM80 = 2, BLOSUM100 = 3, - IDENTITY = 4}; + IDENTITY = 4, + MATCH = 5, + NUC44 = 6}; /// \brief Initialize substitution matrix with zero. /// /// In order to get a useful substitution weight matrix, use SetWeight(). -- GitLab