Skip to content
Snippets Groups Projects
Commit 85fb9022 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

add MATCH and NUC44 substitution matrices

both can be used for nucleotide alignments
parent 2330eadd
Branches
Tags
No related merge requests found
...@@ -274,6 +274,18 @@ different levels of sequence identity: ...@@ -274,6 +274,18 @@ different levels of sequence identity:
- BLOSUM80 - BLOSUM80
- BLOSUM100 - BLOSUM100
Two naive substitution matrices:
- IDENTITY: Matches have score of 1, all other are 0
- MATCH: Matches have score of 1, all other are -1
Nucleotide substitution matrices:
- NUC44: Nucleotide substitution matrix used in blastn that can deal with IUPAC
ambiguity codes. ATTENTION: has been edited to explicitely encode T/U
equivalence, i.e. you can just do `m.GetWeight('G', 'U')` instead of first
translating 'U' to 'T'.
.. _contact-prediction: .. _contact-prediction:
......
...@@ -10,5 +10,8 @@ BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62) ...@@ -10,5 +10,8 @@ BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62)
BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80) BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80)
BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100) BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100)
IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY) IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY)
MATCH = _InitMatrix(SubstWeightMatrix.Preset.MATCH)
NUC44 = _InitMatrix(SubstWeightMatrix.Preset.NUC44)
__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY'] __all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY', 'MATCH',
'NUC44']
...@@ -223,6 +223,8 @@ void export_contact_prediction() ...@@ -223,6 +223,8 @@ void export_contact_prediction()
.value("BLOSUM80", SubstWeightMatrix::BLOSUM80) .value("BLOSUM80", SubstWeightMatrix::BLOSUM80)
.value("BLOSUM100", SubstWeightMatrix::BLOSUM100) .value("BLOSUM100", SubstWeightMatrix::BLOSUM100)
.value("IDENTITY", SubstWeightMatrix::IDENTITY) .value("IDENTITY", SubstWeightMatrix::IDENTITY)
.value("MATCH", SubstWeightMatrix::MATCH)
.value("NUC44", SubstWeightMatrix::NUC44)
; ;
} }
......
#
# This matrix was created by Todd Lowe 12/10/92
#
# Uses ambiguous nucleotide codes, probabilities rounded to
# nearest integer
#
# Lowest score = -4, Highest score = 5
#
# EDIT GABRIEL STUDER (May 10 2022):
# Add column/row for U to explicitely express T/U equivalence
#
A T U G C S W R Y K M B V H D N
A 5 -4 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2
T -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2
U -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2
G -4 -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2
C -4 -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2
S -4 -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1
W 1 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1
R 1 -4 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1
Y -4 1 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1
K -4 1 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1
M 1 -4 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1
B -4 -1 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1
V -1 -4 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1
H -1 -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1
D -1 -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1
N -2 -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# code generation for subst_weight_matrix.cc
# loads NUC substitution matrix from file provided by NCBI at:
# ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/
def Generate(filename, matrix_name):
with open(filename) as f:
data = f.readlines()
scores = dict()
olcs = None
for line in data:
if line.startswith('#'):
continue
if olcs is None:
if " A " in line and " C " in line and " G " and " T " in line:
# very high likelihood that this line contains the olcs
olcs = line.strip().split()
continue
split_line = line.strip().split()
if split_line[0] in olcs:
olc = split_line[0]
for score_idx, score in enumerate(split_line[1:]):
scores[(olc, olcs[score_idx])] = score
ost_olcs = ''.join(olcs)
print("short %s[%i][%i]={"%(matrix_name, len(ost_olcs), len(ost_olcs)))
for a in ost_olcs:
score_string = ""
for b in ost_olcs:
score = scores[(a,b)]
if len(score_string) == 0:
entry = [' ', ' ']
else:
entry = [',', ' ', ' ', ' ']
for i in range(len(score)):
entry[-1-i] = score[-1-i]
score_string += ''.join(entry)
print(" {%s},"%(score_string))
print("};")
print()
Generate("NUC.4.4_edited", "RAW_NUC44_DATA")
...@@ -130,6 +130,25 @@ short RAW_BLOSUM100_DATA[23][23]={ ...@@ -130,6 +130,25 @@ short RAW_BLOSUM100_DATA[23][23]={
{-2, 0, -8, 0, 7, -7, -5, -1, -7, 0, -6, -4, -2, -4, 5, -1, -2, -3, -5, -7, -2, -6, 6}, {-2, 0, -8, 0, 7, -7, -5, -1, -7, 0, -6, -4, -2, -4, 5, -1, -2, -3, -5, -7, -2, -6, 6},
}; };
short RAW_NUC44_DATA[16][16]={
{ 5, -4, -4, -4, -4, -4, 1, 1, -4, -4, 1, -4, -1, -1, -1, -2},
{-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2},
{-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2},
{-4, -4, -4, 5, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2},
{-4, -4, -4, -4, 5, 1, -4, -4, 1, -4, 1, -1, -1, -1, -4, -2},
{-4, -4, -4, 1, 1, -1, -4, -2, -2, -2, -2, -1, -1, -3, -3, -1},
{ 1, 1, 1, -4, -4, -4, -1, -2, -2, -2, -2, -3, -3, -1, -1, -1},
{ 1, -4, -4, 1, -4, -2, -2, -1, -4, -2, -2, -3, -1, -3, -1, -1},
{-4, 1, 1, -4, 1, -2, -2, -4, -1, -2, -2, -1, -3, -1, -3, -1},
{-4, 1, 1, 1, -4, -2, -2, -2, -2, -1, -4, -1, -3, -3, -1, -1},
{ 1, -4, -4, -4, 1, -2, -2, -2, -2, -4, -1, -3, -1, -1, -3, -1},
{-4, -1, -1, -1, -1, -1, -3, -3, -1, -1, -3, -1, -2, -2, -2, -1},
{-1, -4, -4, -1, -1, -1, -3, -1, -3, -3, -1, -2, -1, -2, -2, -1},
{-1, -1, -1, -4, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1},
{-1, -1, -1, -1, -4, -3, -1, -1, -3, -1, -3, -2, -2, -2, -1, -1},
{-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
};
void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
char chars[23] = {'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q', char chars[23] = {'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q',
'R','S','T','V','W','X','Y','Z'}; 'R','S','T','V','W','X','Y','Z'};
...@@ -140,6 +159,17 @@ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){ ...@@ -140,6 +159,17 @@ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
} }
} }
void FillNucData(ost::seq::alg::SubstWeightMatrix* subst,
short (&data)[16][16]) {
char chars[16] = {'A','T','U','G','C','S','W','R','Y','K','M','B','V','H',
'D','N'};
for(uint i = 0; i < 16; ++i){
for(uint j = 0; j < 16; ++j){
subst->SetWeight(chars[i],chars[j],data[i][j]);
}
}
}
void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) { void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'}; 'P','Q','R','S','T','U','V','W','X','Y','Z'};
...@@ -148,6 +178,20 @@ void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) { ...@@ -148,6 +178,20 @@ void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
} }
} }
void FillMatch(ost::seq::alg::SubstWeightMatrix* subst) {
char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'};
for(uint i = 0; i < 26; ++i) {
for(uint j = 0; j < 26; ++j) {
if(i == j){
subst->SetWeight(chars[i], chars[j], 1.0);
} else {
subst->SetWeight(chars[i], chars[j], -1.0);
}
}
}
}
} }
namespace ost { namespace seq { namespace alg { namespace ost { namespace seq { namespace alg {
...@@ -179,6 +223,14 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p) ...@@ -179,6 +223,14 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p)
FillIdentity(this); FillIdentity(this);
break; break;
} }
case MATCH:{
FillMatch(this);
break;
}
case NUC44:{
FillNucData(this,RAW_NUC44_DATA);
break;
}
} }
} }
......
...@@ -44,7 +44,9 @@ public: ...@@ -44,7 +44,9 @@ public:
BLOSUM62 = 1, BLOSUM62 = 1,
BLOSUM80 = 2, BLOSUM80 = 2,
BLOSUM100 = 3, BLOSUM100 = 3,
IDENTITY = 4}; IDENTITY = 4,
MATCH = 5,
NUC44 = 6};
/// \brief Initialize substitution matrix with zero. /// \brief Initialize substitution matrix with zero.
/// ///
/// In order to get a useful substitution weight matrix, use SetWeight(). /// In order to get a useful substitution weight matrix, use SetWeight().
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment