-
Studer Gabriel authoredStuder Gabriel authored
model_loops.py 6.17 KiB
import os, time
import numpy as np
from ost import mol, geom, io, seq
from promod3 import modelling, loop
def N_CA_C_O_RMSD(bb_one, bb_two):
if len(bb_one) != len(bb_two):
raise RuntimeError('Inconsistent sizes when calculating RMSD')
msd = 0.0
for i in range(len(bb_one)):
d_one = geom.Distance(bb_one.GetN(i), bb_two.GetN(i))
d_two = geom.Distance(bb_one.GetCA(i), bb_two.GetCA(i))
d_three = geom.Distance(bb_one.GetC(i), bb_two.GetC(i))
d_four = geom.Distance(bb_one.GetO(i), bb_two.GetO(i))
msd += d_one * d_one
msd += d_two * d_two
msd += d_three * d_three
msd += d_four * d_four
return np.sqrt(msd / (4 * len(bb_one)))
# all important overall parameters come here!
data = open('fread_data.txt','r').readlines()
min_loop_length = 4
max_loop_length = 20
structure_db = loop.StructureDB.LoadPortable('nonredundant_structure_db_portable.dat')
frag_db = loop.FragDB.LoadPortable('nonredundant_frag_db_portable.dat')
torsion_sampler = loop.LoadTorsionSamplerCoil()
rmsd_values = {i: list() for i in range(min_loop_length, max_loop_length + 1)}
superposed_rmsd_values = {i: list() for i in range(min_loop_length, max_loop_length + 1)}
rmsd_values_emin = {i: list() for i in range(min_loop_length, max_loop_length + 1)}
start_time = time.time()
for item in data:
pdb_id = item.split()[0]
chain_name = item.split()[1]
start_rnum = int(item.split()[2])
loop_length = int(item.split()[3])
if loop_length < min_loop_length or loop_length > max_loop_length:
continue
end_rnum = start_rnum + loop_length - 1
print('process:', pdb_id, chain_name, start_rnum, end_rnum, loop_length)
prot = io.LoadPDB(os.path.join('structures',pdb_id+'.pdb'))
mol.alg.AssignSecStruct(prot)
chain = prot.FindChain(chain_name)
if not chain.IsValid():
raise RuntimeError('Required chain not found')
# determine chain index the hacky way
chain_idx = -1
for i, c in enumerate(prot.chains):
if c == chain:
chain_idx = i
break
loop_residues = list()
for num in range(start_rnum, end_rnum + 1):
r = chain.FindResidue(mol.ResNum(num))
if r.IsValid():
loop_residues.append(r.handle)
else:
raise RuntimeError('Could not find all residues of specified loop')
loop_seq = ''.join([r.one_letter_code for r in loop_residues])
native_bb_list = loop.BackboneList(loop_seq, loop_residues)
# renumber residues, so they start with one
# the residue numbers of the residues in loop_residues
# will also be altered... this allows an easy matching with the
# residues in the rawmodel built later on...
ed = prot.handle.EditXCS()
for c in prot.handle.chains:
ed.RenumberChain(c, 1, False)
# we can now reset the start and end rnum
start_rnum = loop_residues[0].GetNumber()
end_rnum = loop_residues[-1].GetNumber()
nstem_rnum = start_rnum - 1
cstem_rnum = end_rnum + 1
# setup mhandle
sequence_list = list()
profile_list = list()
for c in prot.chains:
s_path = os.path.join('sequences', pdb_id + '_' + c.GetName() + '.fasta')
sequence_list.append(io.LoadSequence(s_path))
p_path = os.path.join('profiles', pdb_id + '_' + c.GetName() + '.hhm')
profile_list.append(io.LoadSequenceProfile(p_path))
aln_list = seq.AlignmentList()
for s,c in zip(sequence_list, prot.chains):
aln = seq.CreateAlignment()
aln.AddSequence(s)
aln.AddSequence(s)
view = prot.Select('cname='+c.GetName())
aln.AttachView(1, view)
aln_list.append(aln)
mhandle = modelling.BuildRawModel(aln_list)
modelling.SetSequenceProfiles(mhandle, profile_list)
# let's manually add a gap
gap = modelling.StructuralGap(mhandle.model.chains[chain_idx].FindResidue(nstem_rnum),
mhandle.model.chains[chain_idx].FindResidue(cstem_rnum),
loop_seq)
mhandle.gaps.append(gap)
modelling.ReorderGaps(mhandle) # gaps are expected to be in order
# run default loop modelling pipeline
modelling.CloseGaps(mhandle, fragment_db = frag_db,
structure_db = structure_db,
torsion_sampler = torsion_sampler)
remodelled_loop_residues = list()
for num in range(start_rnum.GetNum(), end_rnum.GetNum() + 1):
r = mhandle.model.chains[chain_idx].FindResidue(mol.ResNum(num))
remodelled_loop_residues.append(r)
remodelled_bb_list = loop.BackboneList(loop_seq, remodelled_loop_residues)
rmsd = N_CA_C_O_RMSD(remodelled_bb_list, native_bb_list)
superposed_rmsd = remodelled_bb_list.RMSD(native_bb_list, True)
rmsd_values[loop_length].append(rmsd)
superposed_rmsd_values[loop_length].append(superposed_rmsd)
# model all sidechains and do an energy minimization...
modelling.BuildSidechains(mhandle)
modelling.MinimizeModelEnergy(mhandle)
remodelled_loop_residues = list()
for num in range(start_rnum.GetNum(), end_rnum.GetNum() + 1):
r = mhandle.model.chains[chain_idx].FindResidue(mol.ResNum(num))
remodelled_loop_residues.append(r)
remodelled_bb_list = loop.BackboneList(loop_seq, remodelled_loop_residues)
rmsd_emin = N_CA_C_O_RMSD(remodelled_bb_list, native_bb_list)
superposed_rmsd_emin = remodelled_bb_list.RMSD(native_bb_list,True)
rmsd_values_emin[loop_length].append(rmsd_emin)
print("time:", time.time() - start_time)
print('native RMSD values')
for loop_length in rmsd_values.keys():
print(loop_length, 'mean: ', np.mean(rmsd_values[loop_length]))
print(loop_length, 'median: ', np.median(rmsd_values[loop_length]))
print('superposed RMSD values:')
for loop_length in superposed_rmsd_values.keys():
print(loop_length, 'mean: ', np.mean(superposed_rmsd_values[loop_length]))
print(loop_length, 'median: ', np.median(superposed_rmsd_values[loop_length]))
# Thats the number that are reported in the manuscript:
print('minimized RMSD values:')
for loop_length in rmsd_values_emin.keys():
print(loop_length, 'mean: ', np.mean(rmsd_values_emin[loop_length]))
print(loop_length, 'median: ', np.median(rmsd_values_emin[loop_length]))