Skip to content
Snippets Groups Projects
Commit b54d2d09 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

make blosum data reproducible

Reason for that effort was the detection of an error in one of the entries
in BLOSUM100 (a missing minus, see diff).
To guarantee reproducibility (and check all matrices for errors),
the blosum matrix files from NCBI at
ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/ have been downloaded and
the generate_blosum_data.py script spits out the required content
in subst_weight_matrix.cc.
parent 5ae2daca
No related branches found
No related tags found
No related merge requests found
# Matrix made by matblas from blosum100_3.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 100
# Entropy = 1.4516, Expected = -1.0948
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 8 -3 -4 -5 -2 -2 -3 -1 -4 -4 -4 -2 -3 -5 -2 1 -1 -6 -5 -2 -4 -2 -2 -10
R -3 10 -2 -5 -8 0 -2 -6 -1 -7 -6 3 -4 -6 -5 -3 -3 -7 -5 -6 -4 -1 -3 -10
N -4 -2 11 1 -5 -1 -2 -2 0 -7 -7 -1 -5 -7 -5 0 -1 -8 -5 -7 5 -2 -3 -10
D -5 -5 1 10 -8 -2 2 -4 -3 -8 -8 -3 -8 -8 -5 -2 -4 -10 -7 -8 6 0 -4 -10
C -2 -8 -5 -8 14 -7 -9 -7 -8 -3 -5 -8 -4 -4 -8 -3 -3 -7 -6 -3 -7 -8 -5 -10
Q -2 0 -1 -2 -7 11 2 -5 1 -6 -5 2 -2 -6 -4 -2 -3 -5 -4 -5 -2 5 -2 -10
E -3 -2 -2 2 -9 2 10 -6 -2 -7 -7 0 -5 -8 -4 -2 -3 -8 -7 -5 0 7 -3 -10
G -1 -6 -2 -4 -7 -5 -6 9 -6 -9 -8 -5 -7 -8 -6 -2 -5 -7 -8 -8 -3 -5 -4 -10
H -4 -1 0 -3 -8 1 -2 -6 13 -7 -6 -3 -5 -4 -5 -3 -4 -5 1 -7 -2 -1 -4 -10
I -4 -7 -7 -8 -3 -6 -7 -9 -7 8 2 -6 1 -2 -7 -5 -3 -6 -4 4 -8 -7 -3 -10
L -4 -6 -7 -8 -5 -5 -7 -8 -6 2 8 -6 3 0 -7 -6 -4 -5 -4 0 -8 -6 -3 -10
K -2 3 -1 -3 -8 2 0 -5 -3 -6 -6 10 -4 -6 -3 -2 -3 -8 -5 -5 -2 0 -3 -10
M -3 -4 -5 -8 -4 -2 -5 -7 -5 1 3 -4 12 -1 -5 -4 -2 -4 -5 0 -7 -4 -3 -10
F -5 -6 -7 -8 -4 -6 -8 -8 -4 -2 0 -6 -1 11 -7 -5 -5 0 4 -3 -7 -7 -4 -10
P -2 -5 -5 -5 -8 -4 -4 -6 -5 -7 -7 -3 -5 -7 12 -3 -4 -8 -7 -6 -5 -4 -4 -10
S 1 -3 0 -2 -3 -2 -2 -2 -3 -5 -6 -2 -4 -5 -3 9 2 -7 -5 -4 -1 -2 -2 -10
T -1 -3 -1 -4 -3 -3 -3 -5 -4 -3 -4 -3 -2 -5 -4 2 9 -7 -5 -1 -2 -3 -2 -10
W -6 -7 -8 -10 -7 -5 -8 -7 -5 -6 -5 -8 -4 0 -8 -7 -7 17 2 -5 -9 -7 -6 -10
Y -5 -5 -5 -7 -6 -4 -7 -8 1 -4 -4 -5 -5 4 -7 -5 -5 2 12 -5 -6 -6 -4 -10
V -2 -6 -7 -8 -3 -5 -5 -8 -7 4 0 -5 0 -3 -6 -4 -1 -5 -5 8 -7 -5 -3 -10
B -4 -4 5 6 -7 -2 0 -3 -2 -8 -8 -2 -7 -7 -5 -1 -2 -9 -6 -7 6 0 -4 -10
Z -2 -1 -2 0 -8 5 7 -5 -1 -7 -6 0 -4 -7 -4 -2 -3 -7 -6 -5 0 6 -2 -10
X -2 -3 -3 -4 -5 -2 -3 -4 -4 -3 -3 -3 -3 -4 -4 -2 -2 -6 -4 -3 -4 -2 -3 -10
* -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 1
# Matrix made by matblas from blosum45.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 45
# Entropy = 0.3795, Expected = -0.2789
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0 -5
R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1 -5
N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1 -5
D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1 -5
C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2 -5
Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1 -5
E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1 -5
G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -5
H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1 -5
I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1 -5
L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1 -5
K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1 -5
M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1 -5
F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1 -5
P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 -1 -1 -5
S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0 -5
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0 -5
W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2 -5
Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1 -5
V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1 -5
B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1 -5
Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1 -5
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1 -5
* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
# Matrix made by matblas from blosum80_3.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 80
# Entropy = 0.9868, Expected = -0.7442
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -8
R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -8
N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -8
D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -8
C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -8
Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -8
E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -8
G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -8
H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -8
I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -8
L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -8
K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -8
M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -8
F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -8
P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -8
S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -8
T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -8
W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -8
Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -8
V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -8
B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -8
Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -8
X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -8
* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1
# code generation for subst_weight_matrix.cc
# loads BLOSUM substitution matrices from files provided by NCBI at:
# ftp://ftp.ncbi.nlm.nih.gov/blast/matrices/
def Generate(filename, matrix_name):
with open(filename) as f:
data = f.readlines()
scores = dict()
olcs = None
for line in data:
if line.startswith('#'):
continue
if olcs is None:
if " A " in line and " R " in line and " N " in line:
# very high likelihood that this line contains the olcs
olcs = line.strip().split()
continue
split_line = line.strip().split()
if split_line[0] in olcs:
olc = split_line[0]
for score_idx, score in enumerate(split_line[1:]):
scores[(olc, olcs[score_idx])] = score
ost_olcs = "ABCDEFGHIKLMNPQRSTVWXYZ"
print("short %s[23][23]={"%(matrix_name))
for a in ost_olcs:
score_string = ""
for b in ost_olcs:
score = scores[(a,b)]
if len(score_string) == 0:
entry = [' ', ' ']
else:
entry = [',', ' ', ' ', ' ']
for i in range(len(score)):
entry[-1-i] = score[-1-i]
score_string += ''.join(entry)
print(" {%s},"%(score_string))
print("};")
print()
Generate("BLOSUM45", "RAW_BLOSUM45_DATA")
Generate("BLOSUM62", "RAW_BLOSUM62_DATA")
Generate("BLOSUM80", "RAW_BLOSUM80_DATA")
Generate("BLOSUM100", "RAW_BLOSUM100_DATA")
......@@ -23,6 +23,9 @@
namespace{
// The data for the following matrices can be reproduced by running generate_blosum_data.py
// in OST_SOURCE/modules/seq/alg/src/data
short RAW_BLOSUM45_DATA[23][23]={
{ 5, -1, -1, -2, -1, -2, 0, -2, -1, -1, -1, -1, -1, -1, -1, -2, 1, 0, 0, -2, 0, -2, -1},
{-1, 4, -2, 5, 1, -3, -1, 0, -3, 0, -3, -2, 4, -2, 0, -1, 0, 0, -3, -4, -1, -2, 2},
......@@ -48,7 +51,7 @@ short RAW_BLOSUM45_DATA[23][23]={
{-2, -2, -3, -2, -2, 3, -3, 2, 0, -1, 0, 0, -2, -3, -1, -1, -2, -1, -1, 3, -1, 8, -2},
{-1, 2, -3, 1, 4, -3, -2, 0, -3, 1, -2, -1, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 4},
};
short RAW_BLOSUM62_DATA[23][23]={
{ 4, -2, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, 0, -2, -1},
{-2, 4, -3, 4, 1, -3, -1, 0, -3, 0, -4, -3, 3, -2, 0, -1, 0, -1, -3, -4, -1, -3, 1},
......@@ -121,7 +124,7 @@ short RAW_BLOSUM100_DATA[23][23]={
{ 1, -1, -3, -2, -2, -5, -2, -3, -5, -2, -6, -4, 0, -3, -2, -3, 9, 2, -4, -7, -2, -5, -2},
{-1, -2, -3, -4, -3, -5, -5, -4, -3, -3, -4, -2, -1, -4, -3, -3, 2, 9, -1, -7, -2, -5, -3},
{-2, -7, -3, -8, -5, -3, -8, -7, 4, -5, 0, 0, -7, -6, -5, -6, -4, -1, 8, -5, -3, -5, -5},
{-6, -9, -7, 10, -8, 0, -7, -5, -6, -8, -5, -4, -8, -8, -5, -7, -7, -7, -5, 17, -6, 2, -7},
{-6, -9, -7,-10, -8, 0, -7, -5, -6, -8, -5, -4, -8, -8, -5, -7, -7, -7, -5, 17, -6, 2, -7},
{-2, -4, -5, -4, -3, -4, -4, -4, -3, -3, -3, -3, -3, -4, -2, -3, -2, -2, -3, -6, -3, -4, -2},
{-5, -6, -6, -7, -7, 4, -8, 1, -4, -5, -4, -5, -5, -7, -4, -5, -5, -5, -5, 2, -4, 12, -6},
{-2, 0, -8, 0, 7, -7, -5, -1, -7, 0, -6, -4, -2, -4, 5, -1, -2, -3, -5, -7, -2, -6, 6},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment