Commit 886477e6 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

loop modelling performance benchmark

parent b0a19b07
Scripts and data to reproduce the loop modelling accuracy benchmark.
The whole benchmark is a three step process:
1. Fetch Benchmark Set
2. Create Non-Redundant Databases
3. Modelling / Evaluation
Fetch Benchmark Set
-------------------
- Run `pm download_structures.py` - This reads the loop definitions in
fread_data.txt and directly fetches the according files from RCSB.
PDB files are dumped in the **structure** directory and the sequence
for each chain is dumped in **sequences**.
- To run ProMod3 as we do in SWISS-MODEL, you also need sequence profiles in
hhm format. You need for every file **xyz.fasta** in **sequences** an
equivalent named **xyz.hhm** in **profiles**. Either you run HHblits by
yourself or you just go for the files we're providing to you.
Create Non-Redundant Databases
------------------------------
- Run `pm create_non_redundant_structure_db.py` - Loads the default StructureDB
and FragDB used for loop modelling and creates non-redundant versions of it.
Entries with high sequence identity to any of the chains where we want
to model a loop are removed. This takes some time...
Modelling / Evaluation
-----------------------------------
- Run `pm model_loops.py` - This reads the loop definitions in fread_data.txt
and remodels the defined loops by using the prepared data.
The average and median RMSD values (based on N, CA, C and O atoms) are
directly printed at the end of the script. For each loop length you get:
- native RMSD values: Raw RMSD of the loop as it is remodelled towards the
experimental coordinates.
- superposed RMSD values: RMSD of the extracted loop upon minimum RMSD
superposition onto the experimental coordinates.
- minimized RMSD values: The whole structure with the remodelled loop is
energy minimized as you would process a model in the last step of the
default homology modelling pipeline in ProMod3. After that, RMSD values are
estimated the same way as the native RMSD values.
Table 1 in the ProMod3 manuscript reports the per-length averages of the
minimized RMSD values.
import os
from ost import io
from ost import seq
from promod3 import loop
# load the sequence of all chains that are part of the loop modelling benchmark
target_sequences = list()
with open('fread_data.txt', 'r') as fh:
fread_data = fh.readlines()
for item in fread_data:
pdb_id = item.split()[0]
chain_id = item.split()[1]
seq_path = os.path.join('sequences', pdb_id + '_' + chain_id + '.fasta')
target_sequences.append(io.LoadSequence(seq_path))
# iterate over entries in the default StructureDB and keep the nonredundant ones
structure_db = loop.LoadStructureDB()
indices_to_keep = list()
for idx in range(structure_db.GetNumCoords()):
print('processing ',idx)
coord_info = structure_db.GetCoordInfo(idx)
frag_info = loop.FragmentInfo(idx, 0, coord_info.size)
s = seq.CreateSequence('db_seq', structure_db.GetSequence(frag_info))
skip = False
for ts in target_sequences:
aln = seq.alg.SemiGlobalAlign(ts, s, seq.alg.BLOSUM62)[0]
s_id = seq.alg.SequenceIdentity(aln)
if s_id > 90.0:
# High sequence identity can be caused through a supper crappy
# alignment where we mostly have gaps. Lets estimate the fraction
# of aligned residues for the shorter sequence
s0 = str(aln.GetSequence(0))
s1 = str(aln.GetSequence(1))
n_aligned = sum([int(a!='-' and b!='-') for a,b in zip(s0, s1)])
frac = float(n_aligned) / min(sum([int(c!='-') for c in s0]),
sum([int(c!='-') for c in s1]))
# set arbitrary threshold, most residues are expected to be aligned
if frac > 0.8:
skip = True
break
if skip:
print('skip entry with pdb id', coord_info.id)
else:
indices_to_keep.append(idx)
# generate the StructureDB only containing the nonredundant entries as well as
# the accoring FragDB
non_redundant_db = structure_db.GetSubDB(indices_to_keep)
non_redundant_db.PrintStatistics()
non_redundant_db.SavePortable('nonredundant_structure_db_portable.dat')
max_pairwise_rmsd = 1.0
dist_bin_size = 1.0
angle_bin_size = 20
fragdb = loop.FragDB(dist_bin_size, angle_bin_size)
for i in range(3,15):
print('start to add fragments of length ', i)
fragdb.AddFragments(i, max_pairwise_rmsd, non_redundant_db)
fragdb.PrintStatistics()
fragdb.SavePortable('nonredundant_frag_db_portable.dat')
import os
from ost import io, mol, seq
import promod3
# sometimes the modelling fails, because the structure contains residues with
# olc 'U' / '?' lets simply replace them by alanine if all backbone atoms are
# present.
def Clean(prot):
ed = prot.EditXCS()
for r in prot.residues:
if r.one_letter_code == 'U' or r.one_letter_code == '?':
# check for presence of the backbone atoms
n = r.FindAtom('N')
ca = r.FindAtom('CA')
c = r.FindAtom('C')
o = r.FindAtom('O')
# if everything is there we keep it as alanine and kick away
# all non alanine atoms...
if n.IsValid() and ca.IsValid() and c.IsValid() and o.IsValid():
ed.RenameResidue(r,'ALA')
r.SetOneLetterCode('A')
for a in r.atoms:
aname = a.GetName()
if aname not in ['N', 'CA', 'C', 'O', 'CB']:
ed.DeleteAtom(a)
else:
ed.DeleteResidue(r)
structure_out_dir = 'structures'
sequence_out_dir = 'sequences'
# make sure we have a compound library
promod3.SetCompoundsChemlib()
if not os.path.exists(structure_out_dir):
os.makedirs(structure_out_dir)
if not os.path.exists(sequence_out_dir):
os.makedirs(sequence_out_dir)
with open('fread_data.txt') as fh:
testset_data = fh.readlines()
for item in testset_data:
pdb_id = item.split()[0]
chain_id = item.split()[1]
print('processing', pdb_id, chain_id)
out_path = os.path.join(structure_out_dir, pdb_id + '.pdb')
prot = io.LoadPDB(pdb_id, remote=True).Select('peptide=true')
prot = mol.CreateEntityFromView(prot, True)
Clean(prot)
io.SavePDB(prot, out_path)
for ch in prot.chains:
s = ''.join([r.one_letter_code for r in ch.residues])
s = seq.CreateSequence(ch.GetName(), s)
s_out = os.path.join(sequence_out_dir,
'_'.join([pdb_id, ch.GetName()]) + '.fasta')
io.SaveSequence(s, s_out)
1fnl A 95 4
1oxx K 134 4
1m2j A 11 4
3cls D 135 4
2elc A 168 4
2fvy A 41 4
1mg5 A 39 4
2d81 A 272 4
2z0j A 117 4
1wap N 48 4
1ht6 A 258 4
2axw B 86 4
1kjv A 250 4
2ozl A 185 4
1w3o A 63 4
2fxf A 120 4
1kqf A 915 4
2uxy A 181 4
2i47 A 330 4
2ho3 C 261 4
1wst A 285 4
2gf9 A 143 4
3b4u B 80 4
1j0h A 118 4
1dun A 45 4
2h6l A 34 4
1ru4 A 292 4
1fba A 219 4
3cms A 21 4
1ndb B 469 4
2apj B 51 5
2gsd A 174 5
2bdr A 86 5
2qed A 51 5
3cms A 33 5
2i3h B 125 5
2hqj A 25 5
1t61 A 184 5
2fcr A 7 5
1ra0 A 338 5
1ys1 X 283 5
1px5 B 112 5
2ntp B 175 5
2oss A 76 5
2apj D 217 5
2bbk J 267 5
1x46 A 21 5
1ra0 A 391 5
2i7d A 41 5
2h14 A 310 5
2ntp A 118 5
1b8p A 123 5
2ozl D 95 5
1oyg A 319 5
2aee A 64 5
2on5 H 35 5
1zkp A 34 5
1khb A 185 5
2pbp A 93 5
2b49 A 793 5
2i5v O 91 6
2fzw A 336 6
1t6g D 156 6
1uwc A 205 6
2bjk A 406 6
2d51 A 25 6
1wkr A 179 6
2ps2 C 245 6
2o2p B 793 6
1iua A 17 6
1j31 D 223 6
2vb1 A 37 6
2ejn B 72 6
1mv8 D 378 6
3clm A 65 6
2p6c B 11 6
2r2a B 179 6
2os0 A 7 6
1r45 C 118 6
2otu E 97 6
1wza A 250 6
2vba C 297 6
2c43 A 302 6
1w32 A 115 6
1w9h A 354 6
2yvt A 40 6
1svi A 115 6
1k92 A 269 6
2v27 A 255 6
2cn3 B 87 6
2nqt A 48 7
1wa3 E 145 7
2ifq B 69 7
2r8o B 601 7
2gke A 118 7
1v5d A 134 7
2g8s B 155 7
1kwf A 169 7
1ys1 X 111 7
3cms A 184 7
1brf A 22 7
1z2n X 126 7
2rb9 D 154 7
2cvd B 102 7
2dm6 A 293 7
2qjj C 237 7
3cms A 143 7
2fma A 140 7
1wta A 171 7
1f5v A 221 7
1wkq A 20 7
1oxx K 264 7
2cyg A 147 7
2qjw A 127 7
2e11 B 11 7
1mml A 138 7
2qg6 A 99 7
1r6x A 195 7
1pn2 C 21 7
2h6e A 24 7
1hn0 A 580 8
2z3h A 21 8
1sw0 A 170 8
1zz1 C 309 8
2yqu A 430 8
3biq A 134 8
1nwp A 84 8
3bwu D 24 8
2pw4 A 147 8
1rkq B 146 8
2bem B 149 8
1rh9 A 360 8
1juh A 20 8
1qfm A 639 8
2dsn B 314 8
1djt B 38 8
2oj6 D 432 8
2gag A 572 8
1hdh B 13 8
2h3b B 300 8
1enf A 178 8
1pk9 C 206 8
2b1x A 310 8
1taf A 32 8
1gwm A 54 8
1ui0 A 162 8
1vl2 C 114 8
1tjy A 91 8
2vo9 C 125 8
2iw1 A 277 8
2bbk J 288 9
2uw1 B 50 9
1ox0 A 89 9
2b82 B 113 9
1qvz B 49 9
1k7j A 174 9
1y0p A 156 9
1fnl A 73 9
2bv4 A 95 9
2h8g A 139 9
1iv8 A 43 9
1un3 A 14 9
2far A 594 9
2prd A 75 9
2jba A 40 9
1kjv B 12 9
2gdq B 337 9
1px0 D 80 9
2vef A 112 9
1mka B 114 9
1ezg A 18 9
1cqx B 210 9
2z98 A 93 9
2dsn A 339 9
1kqf A 528 9
1woq B 134 9
2cb5 A 90 9
1h1n A 130 9
2ob5 A 130 9
2jhf A 92 9
1dy2 A 91 10
2h61 B 40 10
2uzi H 99 10
2ft6 A 9 10
1rxq D 85 10
2dr1 B 39 10
1ndb A 216 10
1v0z D 169 10
1bqc A 27 10
1r7a A 470 10
1m6s A 119 10
2c1s B 54 10
2q8r H 9 10
2jda A 16 10
1zk7 A 307 10
1xv2 A 141 10
1ooh B 56 10
2fdv B 112 10
1b6g A 13 10
2rdq A 230 10
2q4g Y 148 10
2cst B 123 10
1p1j A 518 10
3clm A 297 10
1nvr A 246 10
1yoc B 109 10
1dbf C 78 10
1q7e A 259 10
2i51 B 44 10
1u5d B 118 10
1umg A 332 11
2c1v A 214 11
1ei5 A 418 11
2a6v B 188 11
2e5f A 126 11
1f9y A 132 11
2aq8 A 149 11
1nb9 A 90 11
1ju2 B 373 11
2rik A 187 11
1h03 Q 67 11
2ayd A 301 11
2cki A 160 11
2gb4 A 31 11
2z0j F 7 11
1r6x A 257 11
2qp8 B 419 11
2rgm A 322 11
2gak B 179 11
1qwl A 407 11
1pa2 A 220 11
2fr5 C 27 11
2sic E 33 11
2ot4 B 250 11
1jig C 80 11
1jhd A 370 11
2pez A 92 11
2iwk A 395 11
2car B 141 11
1ug6 A 174 11
1p0i A 224 12
1qw9 B 213 12
1alv B 180 12
2bw8 B 111 12
2pet A 124 12
3bc8 A 421 12
2ywi B 172 12
2dvm D 71 12
2g0w A 77 12
1vbu B 647 12
1qtw A 146 12
1q7f B 771 12
1kqf A 678 12
2rde A 190 12
1jix A 221 12
3b8x A 184 12
1uay A 117 12
1ynp A 15 12
1ypb I 53 12
1udh A 124 12
1l8n A 444 12
1t2w C 121 12
1to4 B 120 12
1v1h C 345 12
1lop A 86 12
3bzw C 202 12
2q99 A 124 12
1oz2 A 231 12
1yph C 69 12
1u60 D 89 12
1dw0 B 31 13
2rb7 B 324 13
1aj8 A 302 13
1sff B 240 13
1ec7 C 160 13
1vrm A 276 13
1c9o A 33 13
2ddt B 91 13
3b8z A 321 13
1rx0 A 208 13
1j31 C 40 13
1ht6 A 108 13
1jv1 A 109 13
2rfm A 88 13
2gou A 235 13
1vgm B 140 13
2p0a A 232 13
1sau A 35 13
2rij A 149 13
1q8f C 66 13
1b8a A 269 13
2inu C 347 13
1f60 A 345 13
2j7p A 86 13
1muc A 317 13
2rc3 C 98 13
1lam A 9 13
1zed A 158 13
1c2a A 26 13
1v0z C 386 13
3cls D 190 14
1qjw B 302 14
1jhd A 295 14
2dcf A 203 14
1y2k A 351 14
1ooy A 320 14
2cd7 A 55 14
1vmf C 68 14
1ock B 63 14
1jp4 A 153 14
2fhf A 882 14
2nt0 A 343 14
1x25 B 32 14
2d0w A 98 14
2bjk B 388 14
2bc0 A 432 14
1t5o B 153 14
1cnv A 188 14
2nw8 B 34 14
1hxh C 87 14
2gb4 A 182 14
1yqz A 415 14
1onw A 134 14
2deb A 394 14
1n2z A 161 14
1v7z F 221 14
1llf A 437 14
3cls C 14 14
2qul A 213 14
1o91 C 622 14
1y4w A 109 15
2faf A 535 15
2gz1 B 94 15
2z9w A 236 15
1qwo A 178 15
2j6g A 207 15
2nuw B 251 15
2cdu A 313 15
2eab B 596 15
2ab0 B 113 15
1mg4 A 133 15
1jnr C 213 15
2nac B 323 15
1zl0 B 198 15
1r6x A 161 15
2o0r A 199 15
1v73 A 93 15
2eo5 A 47 15
2qxx B 43 15
1p5x A 157 15
1lqa B 49 15
1gci A 74 15
2yzh A 118 15
1mpg A 143 15
1ccw D 172 15
1e4c P 59 15
1blx B 30 15
1llf A 388 15
2cch A 233 15
2dep A 298 15
2rb9 D 188 16
1k7c A 7 16
3cjy A 129 16
1q7e A 126 16
2uuy A 184 16
2dvt B 233 16
2zdr A 329 16
2ntp B 141 16
1brt A 57 16
2pgz C 158 16
1pjc A 153 16
1ucs A 41 16
1p5x A 70 16
2rfg B 258 16
1rdq E 273 16
1dxe B 112 16
2aax A 746 16
2v8h D 205 16
2h3b A 231 16
1a12 C 92 16
2rhw A 70 16
2r37 A 169 16
2eab A 615 16
2gou A 23 16
2j3x A 110 16
2olm A 94 16
2q62 D 71 16
1cxp C 310 16
2phj A 36 16
1nba B 95 16
2c0h A 278 17
2qmj A 200 17
1sh7 B 180 17
1aj2 A 140 17
1odz A 79 17
3b8i E 118 17
2oh5 A 91 17
2gju B 138 17
1g9g A 158 17
1wx1 A 116 17
1jfx A 164 17
1y9z B 157 17
1ys7 A 87 17
1owl A 159 17
1t64 A 264 17
1y12 B 10 17
1ydy B 131 17
2pyw B 65 17
1dbx A 97 17
2q8k A 88 17
1f61 A 99 17
1kap P 202 17
2c6u A 163 17
2bjf A 258 17
1nff A 192 17
2fhf A 900 17
2far A 814 17
1ds1 A 124 17
2vba B 218 17
1ups A 37 17
1vlb A 682 18
2qew A 196 18
1wdp A 224 18
1u6r B 62 18
2g6f X 14 18
2q3u B 121 18
2ow6 A 870 18
2v9t B 286 18
1va4 C 119 18
2f2b A 34 18
1jo8 A 6 18
1ug6 A 37 18
2pzm A 121 18
2j1g C 214 18
1xg0 A 16 18
2a28 C 6 18
2ptr A 287 18
3bi1 A 674 18
2bjq A 15 18
2nt0 A 186 18
1ek6 B 190 18
2rkv A 188 18
2ivf A 801 18
1umd C 267 18
1xd3 C 77 18
1oq1 D 84 18
2z6o A 103 18
1cpq A 55 18
2b1x C 395 18
2z8g A 435 18
1hh8 A 168 19
1m2x C 220 19
1ek6 A 137 19
1r03 A 77 19
1c7s A 479 19
1xlq B 30 19
1nc5 A 193 19
1vlg F 66 19
1pfv A 517 19
2ord A 254 19
1pa2 A 112 19
1brt A 123 19
1k7h A 96 19
1w7c A 617 19
2q3u A 120 19
1oi2 B 143 19
1zcj A 279 19
1hdh A 136 19
2aq2 B 87 19
2gmn A 200 19
1a58 A 45 19