import os, time import numpy as np from ost import mol, geom, io, seq from promod3 import modelling, loop def N_CA_C_O_RMSD(bb_one, bb_two): if len(bb_one) != len(bb_two): raise RuntimeError('Inconsistent sizes when calculating RMSD') msd = 0.0 for i in range(len(bb_one)): d_one = geom.Distance(bb_one.GetN(i), bb_two.GetN(i)) d_two = geom.Distance(bb_one.GetCA(i), bb_two.GetCA(i)) d_three = geom.Distance(bb_one.GetC(i), bb_two.GetC(i)) d_four = geom.Distance(bb_one.GetO(i), bb_two.GetO(i)) msd += d_one * d_one msd += d_two * d_two msd += d_three * d_three msd += d_four * d_four return np.sqrt(msd / (4 * len(bb_one))) # all important overall parameters come here! data = open('fread_data.txt','r').readlines() min_loop_length = 4 max_loop_length = 20 structure_db = loop.StructureDB.LoadPortable('nonredundant_structure_db_portable.dat') frag_db = loop.FragDB.LoadPortable('nonredundant_frag_db_portable.dat') torsion_sampler = loop.LoadTorsionSamplerCoil() rmsd_values = {i: list() for i in range(min_loop_length, max_loop_length + 1)} superposed_rmsd_values = {i: list() for i in range(min_loop_length, max_loop_length + 1)} rmsd_values_emin = {i: list() for i in range(min_loop_length, max_loop_length + 1)} start_time = time.time() for item in data: pdb_id = item.split()[0] chain_name = item.split()[1] start_rnum = int(item.split()[2]) loop_length = int(item.split()[3]) if loop_length < min_loop_length or loop_length > max_loop_length: continue end_rnum = start_rnum + loop_length - 1 print('process:', pdb_id, chain_name, start_rnum, end_rnum, loop_length) prot = io.LoadPDB(os.path.join('structures',pdb_id+'.pdb')) mol.alg.AssignSecStruct(prot) chain = prot.FindChain(chain_name) if not chain.IsValid(): raise RuntimeError('Required chain not found') # determine chain index the hacky way chain_idx = -1 for i, c in enumerate(prot.chains): if c == chain: chain_idx = i break loop_residues = list() for num in range(start_rnum, end_rnum + 1): r = chain.FindResidue(mol.ResNum(num)) if r.IsValid(): loop_residues.append(r.handle) else: raise RuntimeError('Could not find all residues of specified loop') loop_seq = ''.join([r.one_letter_code for r in loop_residues]) native_bb_list = loop.BackboneList(loop_seq, loop_residues) # renumber residues, so they start with one # the residue numbers of the residues in loop_residues # will also be altered... this allows an easy matching with the # residues in the rawmodel built later on... ed = prot.handle.EditXCS() for c in prot.handle.chains: ed.RenumberChain(c, 1, False) # we can now reset the start and end rnum start_rnum = loop_residues[0].GetNumber() end_rnum = loop_residues[-1].GetNumber() nstem_rnum = start_rnum - 1 cstem_rnum = end_rnum + 1 # setup mhandle sequence_list = list() profile_list = list() for c in prot.chains: s_path = os.path.join('sequences', pdb_id + '_' + c.GetName() + '.fasta') sequence_list.append(io.LoadSequence(s_path)) p_path = os.path.join('profiles', pdb_id + '_' + c.GetName() + '.hhm') profile_list.append(io.LoadSequenceProfile(p_path)) aln_list = seq.AlignmentList() for s,c in zip(sequence_list, prot.chains): aln = seq.CreateAlignment() aln.AddSequence(s) aln.AddSequence(s) view = prot.Select('cname='+c.GetName()) aln.AttachView(1, view) aln_list.append(aln) mhandle = modelling.BuildRawModel(aln_list) modelling.SetSequenceProfiles(mhandle, profile_list) # let's manually add a gap gap = modelling.StructuralGap(mhandle.model.chains[chain_idx].FindResidue(nstem_rnum), mhandle.model.chains[chain_idx].FindResidue(cstem_rnum), loop_seq) mhandle.gaps.append(gap) modelling.ReorderGaps(mhandle) # gaps are expected to be in order # run default loop modelling pipeline modelling.CloseGaps(mhandle, fragment_db = frag_db, structure_db = structure_db, torsion_sampler = torsion_sampler) remodelled_loop_residues = list() for num in range(start_rnum.GetNum(), end_rnum.GetNum() + 1): r = mhandle.model.chains[chain_idx].FindResidue(mol.ResNum(num)) remodelled_loop_residues.append(r) remodelled_bb_list = loop.BackboneList(loop_seq, remodelled_loop_residues) rmsd = N_CA_C_O_RMSD(remodelled_bb_list, native_bb_list) superposed_rmsd = remodelled_bb_list.RMSD(native_bb_list, True) rmsd_values[loop_length].append(rmsd) superposed_rmsd_values[loop_length].append(superposed_rmsd) # model all sidechains and do an energy minimization... modelling.BuildSidechains(mhandle) modelling.MinimizeModelEnergy(mhandle) remodelled_loop_residues = list() for num in range(start_rnum.GetNum(), end_rnum.GetNum() + 1): r = mhandle.model.chains[chain_idx].FindResidue(mol.ResNum(num)) remodelled_loop_residues.append(r) remodelled_bb_list = loop.BackboneList(loop_seq, remodelled_loop_residues) rmsd_emin = N_CA_C_O_RMSD(remodelled_bb_list, native_bb_list) superposed_rmsd_emin = remodelled_bb_list.RMSD(native_bb_list,True) rmsd_values_emin[loop_length].append(rmsd_emin) print("time:", time.time() - start_time) print('native RMSD values') for loop_length in rmsd_values.keys(): print(loop_length, 'mean: ', np.mean(rmsd_values[loop_length])) print(loop_length, 'median: ', np.median(rmsd_values[loop_length])) print('superposed RMSD values:') for loop_length in superposed_rmsd_values.keys(): print(loop_length, 'mean: ', np.mean(superposed_rmsd_values[loop_length])) print(loop_length, 'median: ', np.median(superposed_rmsd_values[loop_length])) # Thats the number that are reported in the manuscript: print('minimized RMSD values:') for loop_length in rmsd_values_emin.keys(): print(loop_length, 'mean: ', np.mean(rmsd_values_emin[loop_length])) print(loop_length, 'median: ', np.median(rmsd_values_emin[loop_length]))