Skip to content
Snippets Groups Projects
Commit 85fb9022 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

add MATCH and NUC44 substitution matrices

both can be used for nucleotide alignments
parent 2330eadd
No related branches found
No related tags found
No related merge requests found
......@@ -274,6 +274,18 @@ different levels of sequence identity:
- BLOSUM80
- BLOSUM100
Two naive substitution matrices:
- IDENTITY: Matches have score of 1, all other are 0
- MATCH: Matches have score of 1, all other are -1
Nucleotide substitution matrices:
- NUC44: Nucleotide substitution matrix used in blastn that can deal with IUPAC
ambiguity codes. ATTENTION: has been edited to explicitely encode T/U
equivalence, i.e. you can just do `m.GetWeight('G', 'U')` instead of first
translating 'U' to 'T'.
.. _contact-prediction:
......
......@@ -10,5 +10,8 @@ BLOSUM62 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM62)
BLOSUM80 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM80)
BLOSUM100 = _InitMatrix(SubstWeightMatrix.Preset.BLOSUM100)
IDENTITY = _InitMatrix(SubstWeightMatrix.Preset.IDENTITY)
MATCH = _InitMatrix(SubstWeightMatrix.Preset.MATCH)
NUC44 = _InitMatrix(SubstWeightMatrix.Preset.NUC44)
__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY']
__all__=['BLOSUM45','BLOSUM62','BLOSUM80','BLOSUM100', 'IDENTITY', 'MATCH',
'NUC44']
......@@ -223,6 +223,8 @@ void export_contact_prediction()
.value("BLOSUM80", SubstWeightMatrix::BLOSUM80)
.value("BLOSUM100", SubstWeightMatrix::BLOSUM100)
.value("IDENTITY", SubstWeightMatrix::IDENTITY)
.value("MATCH", SubstWeightMatrix::MATCH)
.value("NUC44", SubstWeightMatrix::NUC44)
;
}
......
#
# This matrix was created by Todd Lowe 12/10/92
#
# Uses ambiguous nucleotide codes, probabilities rounded to
# nearest integer
#
# Lowest score = -4, Highest score = 5
#
# EDIT GABRIEL STUDER (May 10 2022):
# Add column/row for U to explicitely express T/U equivalence
#
A T U G C S W R Y K M B V H D N
A 5 -4 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2
T -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2
U -4 5 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2
G -4 -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2
C -4 -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2
S -4 -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1
W 1 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1
R 1 -4 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1
Y -4 1 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1
K -4 1 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1
M 1 -4 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1
B -4 -1 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1
V -1 -4 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1
H -1 -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1
D -1 -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1
N -2 -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# code generation for subst_weight_matrix.cc
# loads NUC substitution matrix from file provided by NCBI at:
# ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/
def Generate(filename, matrix_name):
with open(filename) as f:
data = f.readlines()
scores = dict()
olcs = None
for line in data:
if line.startswith('#'):
continue
if olcs is None:
if " A " in line and " C " in line and " G " and " T " in line:
# very high likelihood that this line contains the olcs
olcs = line.strip().split()
continue
split_line = line.strip().split()
if split_line[0] in olcs:
olc = split_line[0]
for score_idx, score in enumerate(split_line[1:]):
scores[(olc, olcs[score_idx])] = score
ost_olcs = ''.join(olcs)
print("short %s[%i][%i]={"%(matrix_name, len(ost_olcs), len(ost_olcs)))
for a in ost_olcs:
score_string = ""
for b in ost_olcs:
score = scores[(a,b)]
if len(score_string) == 0:
entry = [' ', ' ']
else:
entry = [',', ' ', ' ', ' ']
for i in range(len(score)):
entry[-1-i] = score[-1-i]
score_string += ''.join(entry)
print(" {%s},"%(score_string))
print("};")
print()
Generate("NUC.4.4_edited", "RAW_NUC44_DATA")
......@@ -130,6 +130,25 @@ short RAW_BLOSUM100_DATA[23][23]={
{-2, 0, -8, 0, 7, -7, -5, -1, -7, 0, -6, -4, -2, -4, 5, -1, -2, -3, -5, -7, -2, -6, 6},
};
short RAW_NUC44_DATA[16][16]={
{ 5, -4, -4, -4, -4, -4, 1, 1, -4, -4, 1, -4, -1, -1, -1, -2},
{-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2},
{-4, 5, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2},
{-4, -4, -4, 5, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2},
{-4, -4, -4, -4, 5, 1, -4, -4, 1, -4, 1, -1, -1, -1, -4, -2},
{-4, -4, -4, 1, 1, -1, -4, -2, -2, -2, -2, -1, -1, -3, -3, -1},
{ 1, 1, 1, -4, -4, -4, -1, -2, -2, -2, -2, -3, -3, -1, -1, -1},
{ 1, -4, -4, 1, -4, -2, -2, -1, -4, -2, -2, -3, -1, -3, -1, -1},
{-4, 1, 1, -4, 1, -2, -2, -4, -1, -2, -2, -1, -3, -1, -3, -1},
{-4, 1, 1, 1, -4, -2, -2, -2, -2, -1, -4, -1, -3, -3, -1, -1},
{ 1, -4, -4, -4, 1, -2, -2, -2, -2, -4, -1, -3, -1, -1, -3, -1},
{-4, -1, -1, -1, -1, -1, -3, -3, -1, -1, -3, -1, -2, -2, -2, -1},
{-1, -4, -4, -1, -1, -1, -3, -1, -3, -3, -1, -2, -1, -2, -2, -1},
{-1, -1, -1, -4, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1},
{-1, -1, -1, -1, -4, -3, -1, -1, -3, -1, -3, -2, -2, -2, -1, -1},
{-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
};
void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
char chars[23] = {'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q',
'R','S','T','V','W','X','Y','Z'};
......@@ -140,6 +159,17 @@ void FillData(ost::seq::alg::SubstWeightMatrix* subst, short (&data)[23][23]){
}
}
void FillNucData(ost::seq::alg::SubstWeightMatrix* subst,
short (&data)[16][16]) {
char chars[16] = {'A','T','U','G','C','S','W','R','Y','K','M','B','V','H',
'D','N'};
for(uint i = 0; i < 16; ++i){
for(uint j = 0; j < 16; ++j){
subst->SetWeight(chars[i],chars[j],data[i][j]);
}
}
}
void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'};
......@@ -148,6 +178,20 @@ void FillIdentity(ost::seq::alg::SubstWeightMatrix* subst) {
}
}
void FillMatch(ost::seq::alg::SubstWeightMatrix* subst) {
char chars[26] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'};
for(uint i = 0; i < 26; ++i) {
for(uint j = 0; j < 26; ++j) {
if(i == j){
subst->SetWeight(chars[i], chars[j], 1.0);
} else {
subst->SetWeight(chars[i], chars[j], -1.0);
}
}
}
}
}
namespace ost { namespace seq { namespace alg {
......@@ -179,6 +223,14 @@ void SubstWeightMatrix::AssignPreset(SubstWeightMatrix::Preset p)
FillIdentity(this);
break;
}
case MATCH:{
FillMatch(this);
break;
}
case NUC44:{
FillNucData(this,RAW_NUC44_DATA);
break;
}
}
}
......
......@@ -44,7 +44,9 @@ public:
BLOSUM62 = 1,
BLOSUM80 = 2,
BLOSUM100 = 3,
IDENTITY = 4};
IDENTITY = 4,
MATCH = 5,
NUC44 = 6};
/// \brief Initialize substitution matrix with zero.
///
/// In order to get a useful substitution weight matrix, use SetWeight().
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment