Commit 0a51a477 authored by Gerardo Tauriello

SCHWED-1555: added scripts to generate big table of loop scorings.

parent 587014e9
@@ -2,8 +2,37 @@ This folder contains scripts that were used to train weights for the scoring
functions used in loop selection in the pipeline. You need a compiled and
working version of ProMod3 to run these scripts. Scripts are provided as-is!
Execute the scripts in this order (within this folder):
----------------------
Steps to be performed:
----------------------
1) setup testset and dbs by executing scripts in this order within this folder:
$ pm generate_bft_testset.py
$ pm generate_bft_dbs.py
-> by default we get 50000 loops (5000 for each loop length in [3,12])
2) generate big table with scores for each loop candidate for each loop
- This takes a lot of time and is best done on a cluster (we assume bc2)
- Test script on a small example
$ pm generate_bft_chunks.py range 0 10
-> should generate file "loop_data_range_00000_00010.json", which you can delete
- Adapt "PROMOD3_ROOT" in "run_array_job.sh" to point to PM3 stage. Then submit:
$ qsub run_array_job.sh
-> this executes 500 jobs, each doing 100 loops with generate_bft_chunks.py
-> the script automatically aborts and dumps partial results if memory usage or
runtime exceed the given thresholds ("max_time" and "max_mem" in the Python script)
-> adapt those values if you customize the max. memory and runtime in run_array_job.sh
-> output is stored as "loop_data_range_X_Y.json" for loops in [X,Y[
- check that each run_out_X.stderr is empty by executing:
$ cat *.stderr
- check for unfinished chunks (i.e. missing loops) by executing:
$ grep "ABORT" *.stdout
- if loops in [X,Y[ are missing (see the helper sketch after these steps to spot
such gaps), manually execute (or submit):
$ pm generate_bft_chunks.py range X Y
- combine all "loop_data_range_X_Y.json" by executing:
$ pm generate_bft.py
-> this should produce a fairly big numpy array dumped as loop_bft.dat
3) TODO: optimize weights...
See the descriptions in each script on what files are being generated.
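The checks in step 2 can also be scripted. The following is a small optional
Python sketch (not part of this commit) that scans the chunk files and reports
which loops are not yet covered; the expected loop count and the directory
holding the chunk files are assumptions to adjust as needed.

import os, re

num_expected = 50000   # total number of loops (assumed; matches the default above)
chunk_dir = "."        # directory holding the loop_data_range_X_Y.json files (assumed)

covered = set()
pattern = re.compile(r"loop_data_range_(\d+)_(\d+)\.json$")
for file_name in sorted(os.listdir(chunk_dir)):
    match = pattern.match(file_name)
    if match:
        # each file covers loops in [X,Y[
        covered.update(range(int(match.group(1)), int(match.group(2))))
missing = sorted(set(range(num_expected)) - covered)
if missing:
    print("MISSING LOOPS: %s" % missing)
else:
    print("ALL %d LOOPS COVERED" % num_expected)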
"""
GOAL: combine chunks into a single BFT
IN:
- loop_data_range_X_Y.json from generate_bft_chunks.py
OUT:
- loop_bft.dat: big table pickled as a 2D numpy array (load with numpy.load)
- loop_infos.json: info for each loop:
-> "first_indices": indexing into bft: bft rows in range(first_indices[i],
first_indices[i+1]) belong to loop i
-> "loop_lengths": loop_lengths[i] is length of loop i
-> "fragment_indices": fragment_indices[i] is loop index into fragments.json
"""
import json, os, numpy, time
###############################################################################
# SETUP
###############################################################################
# full paths to IN and OUT files
in_path = "out_frm" # all json files in that path with correct keys are ok
out_loop_bft = "loop_bft.dat"
out_loop_infos = "loop_infos.json"
###############################################################################
# MAIN
###############################################################################
# collect some files
exp_keys = sorted(["frag_start", "frag_end", "loop_datas"])
file_list = os.listdir(in_path)
# only consider json files
json_files = [f for f in file_list if os.path.splitext(f)[1] == ".json"]
# collect data:
# - BFT -> numpy matrices for each loop
# - fragment_indices -> fragment index (into fragments.json) for each loop
# - loop_lengths -> loop length for each loop
# - loop_data_keys -> sorted keys in lc_data (must be same for each loop!)
bft_list = list()
fragment_indices = list()
loop_lengths = list()
loop_data_keys = None
for file_name in sorted(json_files):
# try to load json...if we fail, we ignore...
start_time = time.time()
try:
with open(os.path.join(in_path, file_name), "r") as json_file:
json_obj = json.load(json_file)
except Exception as ex:
continue
# we expect dict
if type(json_obj) == dict:
# check for keys (take care for unicode mess)
cur_keys = sorted([str(key) for key in json_obj.keys()])
if cur_keys == exp_keys:
# extract data
frag_start = json_obj["frag_start"]
frag_end = json_obj["frag_end"]
loop_datas = json_obj["loop_datas"]
N_loops = frag_end - frag_start
assert(len(loop_datas) == N_loops)
# get loops
num_lc_range = 0
for i_loop in range(N_loops):
loop_data = loop_datas[i_loop]
lc_data = loop_data["lc_data"]
if loop_data_keys is None:
loop_data_keys = sorted(lc_data.keys())
else:
assert(loop_data_keys == sorted(lc_data.keys()))
num_keys = len(loop_data_keys)
assert(num_keys > 0)
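# number of loop candidates for this loop (each lc_data column has one entry
# per candidate)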
num_lc = len(lc_data[loop_data_keys[0]])
# get BFT for this loop
cur_bft = numpy.empty((num_lc, num_keys), dtype=numpy.float32)
for i_lc in range(num_lc):
for i_key in range(num_keys):
cur_bft[i_lc, i_key] = lc_data[loop_data_keys[i_key]][i_lc]
# keep relevant data
bft_list.append(cur_bft)
fragment_indices.append(frag_start + i_loop)
loop_lengths.append(loop_data["loop_length"])
num_lc_range += num_lc
# report
print "ADDED %d LC for LOOPS in [%d,%d[ in %g s" \
% (num_lc_range, frag_start, frag_end, time.time() - start_time)
# check covered ranges
max_frag_idx = max(fragment_indices)
unique_frag_idx = set(fragment_indices)
if len(unique_frag_idx) != max_frag_idx+1:
missing_indices = sorted(set(range(max_frag_idx+1)) - unique_frag_idx)
print "MISSING LOOPS FOR", missing_indices
if len(unique_frag_idx) != len(fragment_indices):
for frag_idx in unique_frag_idx:
if fragment_indices.count(frag_idx) > 1:
print "DUPLICATE LOOPS FOR", frag_idx
# get indexing into bft: range(first_indices[i], first_indices[i+1]) for loop i
num_loops = len(bft_list)
assert(len(fragment_indices) == num_loops)
assert(len(loop_lengths) == num_loops)
first_indices = list()
total_num_lc = 0
for i in range(num_loops):
first_indices.append(total_num_lc)
total_num_lc += bft_list[i].shape[0]
# last one must be full size
first_indices.append(total_num_lc)
assert(len(first_indices) == num_loops + 1)
# BUILD BFT
num_keys = len(loop_data_keys)
print "BUILDING BFT with %d rows and %d cols" % (total_num_lc, num_keys)
bft = numpy.concatenate(bft_list)
assert(bft.shape[0] == total_num_lc)
assert(bft.shape[1] == num_keys)
# dump data
bft.dump(out_loop_bft)
with open(out_loop_infos, "w") as json_file:
json_obj = {"first_indices": first_indices, "loop_lengths": loop_lengths,
"fragment_indices": fragment_indices}
json.dump(json_obj, json_file)
# some stats
for loop_length in set(loop_lengths):
num_lc_ll = 0
num_loops_ll = 0
for i in range(num_loops):
if loop_lengths[i] == loop_length:
num_lc = first_indices[i+1] - first_indices[i]
num_lc_ll += num_lc
num_loops_ll += 1
print "LL", loop_length, "LC", num_lc_ll, "AVG-LC", \
float(num_lc_ll) / num_loops_ll
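For reference, here is a minimal sketch (not part of the committed scripts)
showing how the two files written above could be read back; loop_idx = 0 is
just an illustrative choice.

import json, numpy

bft = numpy.load("loop_bft.dat")   # pickled 2D array (newer numpy may need allow_pickle=True)
with open("loop_infos.json", "r") as json_file:
    loop_infos = json.load(json_file)

loop_idx = 0                        # example: inspect the first loop
first_indices = loop_infos["first_indices"]
rows = bft[first_indices[loop_idx]:first_indices[loop_idx+1], :]  # one row per loop candidate
print("loop %d: length %d, %d candidates, %d score columns"
      % (loop_idx, loop_infos["loop_lengths"][loop_idx],
         rows.shape[0], rows.shape[1]))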
"""
GOAL: extract a chunk of the BFT
IN:
- fragments.json from generate_bft_testset.py
- sub_structure_db.dat from generate_bft_dbs.py
- sub_frag_db.dat from generate_bft_dbs.py
OUT:
- loop_data_range_X_Y.json: BFT for loops in [X,Y[. Content:
-> "frag_start" = X, "frag_end" = Y
-> "loop_datas" = list of loop_data (len = Y-X). One entry per loop with:
-> "chain_idx", "start_idx" and "loop_length" defining fragment info
-> "lc_data" = dict with a named column for each computed score
plus "ca_rmsd" for CA RMSD to reference loop
-> each loop has same column names
-> each column has 1 entry per loop candidate
"""
usage_string = """
USAGE: pm generate_bft_chunks.py MODE OPTIONS
-> MODE = 'chunk': do a chunk -> OPTIONS = 'chunk_id chunk_size'
chunk_id must be > 0 & if args not given, do all
-> MODE = 'range': do a range -> OPTIONS = 'range_start range_end'
if range_start not given: range_start = 0
if range_end not given: range_end = range_start + 1
"""
import math, json, os, sys, resource, time
from ost import io, seq
from promod3 import core, loop, modelling, scoring, sidechain
###############################################################################
# SETUP
###############################################################################
# full paths to IN and OUT files
in_path = "."
in_fragments = os.path.join(in_path, "fragments.json")
in_structure_db = os.path.join(in_path, "sub_structure_db.dat")
in_frag_db = os.path.join(in_path, "sub_frag_db.dat")
out_format = "loop_data_range_%05d_%05d.json"
# setup all atom relaxation (def. params: 100, 0.01)
aa_relax_steps = 50
aa_relax_stop_criterion = 0.01
# choose FF lookup for AA relaxation
ff_lookup = loop.ForcefieldLookup.GetDefault()
# choose torsion sampler for ApplyCCD
torsion_sampler = loop.LoadTorsionSamplerCoil()
# abort (and dump) once estimated runtime after next loop is above this
max_time = 23.8 # for 24h queue
# abort (and dump) once memory consumption is above this
max_mem = 7500
# shall we reconstruct sidechain of full chain first?
# -> our input is BB only and by def. we reconstruct env. sc. for each lc.
# -> if set to True, we only reconstruct sc. of lc. and not of env. (30% faster)
reconstruct_full_chain_sc = False
###############################################################################
# HELPERS
###############################################################################
def LoadScorer(key):
# get default scorers
if key == "cb_packing":
return scoring.LoadCBPackingScorer()
elif key == "cbeta":
return scoring.LoadCBetaScorer()
elif key == "reduced":
return scoring.LoadReducedScorer()
elif key == "clash":
return scoring.ClashScorer()
elif key == "hbond":
return scoring.LoadHBondScorer()
elif key == "torsion":
return scoring.LoadTorsionScorer()
elif key == "aa_interaction":
return scoring.LoadAllAtomInteractionScorer()
elif key == "aa_packing":
return scoring.LoadAllAtomPackingScorer()
elif key == "aa_clash":
return scoring.AllAtomClashScorer()
else:
raise RuntimeError("Unknown scorer " + key)
def GetNoRelaxKey(key):
return key + "_no_relax"
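# Flag each loop candidate whose stem-pair orientation (distance and the four
# angles) deviates from the target stems by at most the given thresholds.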
def WithinBoundsScore(loop_candidates, max_dist_diff, max_ang_dist):
# prepare result list (1 bool for each candidate)
result = []
# get stem orient.
n_stem_c = core.StemCoords(n_stem)
c_stem_c = core.StemCoords(c_stem)
stem_o = core.StemPairOrientation(n_stem_c, c_stem_c)
bb_n_stem = core.StemCoords()
bb_c_stem = core.StemCoords()
for lc in loop_candidates:
bb_n_stem.n_coord = lc.GetN(0)
bb_n_stem.ca_coord = lc.GetCA(0)
bb_n_stem.c_coord = lc.GetC(0)
bb_c_stem.n_coord = lc.GetN(loop_length-1)
bb_c_stem.ca_coord = lc.GetCA(loop_length-1)
bb_c_stem.c_coord = lc.GetC(loop_length-1)
bb_o = core.StemPairOrientation(bb_n_stem, bb_c_stem)
result.append( abs(bb_o.distance - stem_o.distance) <= max_dist_diff
and abs(bb_o.angle_one - stem_o.angle_one) <= max_ang_dist
and abs(bb_o.angle_two - stem_o.angle_two) <= max_ang_dist
and abs(bb_o.angle_three - stem_o.angle_three) <= max_ang_dist
and abs(bb_o.angle_four - stem_o.angle_four) <= max_ang_dist)
return result
def GetLoopData(lc):
# get scores before CCD
within_bounds = WithinBoundsScore(lc, dist_bin_size/2, ang_bin_size/2)
seq_prof_score = lc.CalculateSequenceProfileScores(structure_db, prof,
start_idx)
str_prof_score = lc.CalculateStructureProfileScores(structure_db, prof,
start_idx)
stem_rmsd = lc.CalculateStemRMSDs(n_stem, c_stem)
# only keep ones that converge in CCD
orig_indices = lc.ApplyCCD(n_stem, c_stem, torsion_sampler)
print "-", len(lc), "LC for LOOP", chain_idx, start_idx, loop_length, len(prof)
# fill lc_data columns -> dict with all keys in lc_data_keys
lc_data = dict()
# get CA RMSD for each candidate
lc_data["ca_rmsd"] = list()
for lc_bb_list in lc:
lc_data["ca_rmsd"].append(lc_bb_list.CARMSD(ref_bb_list))
# get BB scores
start_res_num = start_idx + 1
for key in bb_scorers:
lc_data[key] = lc.CalculateScores(bb_scorer, key, start_res_num)
##########################################################
# AA SCORING
##########################################################
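# Per-candidate workflow: place the candidate in the sidechain env., rebuild
# its sidechains, score all AA scorers without relaxation, relax the loop and
# score again, then restore the original environments for the next candidate.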
# keep backup of loop
old_pos = aa_sc_env.GetAllAtomPositions().Copy()
old_loop = old_pos.Extract(start_idx, start_idx + loop_length)
# setup empty lists for AA results
lc_data["relax_pot_e"] = list()
for key in aa_scorers:
lc_data[GetNoRelaxKey(key)] = list()
lc_data[key] = list()
# do work for all
for lc_bb_list in lc:
# reconstruct sidechain
aa_sc_env.SetEnvironment(lc_bb_list, start_res_num)
sc_result = sc_rec.Reconstruct(start_res_num, loop_length)
# fix env. for scoring (always contains full loop too!)
aa_score_env.SetEnvironment(sc_result.env_pos)
for key in aa_scorers:
score = aa_scorer[key].CalculateScore(start_res_num, loop_length)
lc_data[GetNoRelaxKey(key)].append(score)
# repeat with relaxation
mm_sys = loop.MmSystemCreator(ff_lookup)
relaxer = modelling.AllAtomRelaxer(sc_result, mm_sys)
pot_e = relaxer.Run(sc_result, aa_relax_steps, aa_relax_stop_criterion)
lc_data["relax_pot_e"].append(pot_e)
aa_score_env.SetEnvironment(sc_result.env_pos)
for key in aa_scorers:
score = aa_scorer[key].CalculateScore(start_res_num, loop_length)
lc_data[key].append(score)
# reset score env.
sc_result.env_pos.all_pos = old_pos.Extract(sc_result.env_pos.res_indices)
aa_score_env.SetEnvironment(sc_result.env_pos)
# reset sc env
aa_sc_env.SetEnvironment(old_loop, start_res_num)
##########################################################
# get relevant pre-CCD scores
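# ApplyCCD above removed candidates that did not converge, so reindex the
# pre-CCD scores via orig_indices to stay aligned with the surviving candidates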
lc_data["whithin_bounds"] = [whithin_bounds[i] for i in orig_indices]
lc_data["seq_prof_score"] = [seq_prof_score[i] for i in orig_indices]
lc_data["str_prof_score"] = [str_prof_score[i] for i in orig_indices]
lc_data["stem_rmsd"] = [stem_rmsd[i] for i in orig_indices]
# get clustering
for key in cluster_max_dist:
clusters = lc.GetClusters(cluster_max_dist[key])
cluster_indices = [0] * len(lc)
for cluster_idx, cluster in enumerate(clusters):
for idx in cluster:
cluster_indices[idx] = cluster_idx
lc_data[key] = cluster_indices
# check lc_data columns (same length, all keys there)
assert(all(len(lc_data[key]) == len(lc) for key in lc_data))
assert(all(key in lc_data for key in lc_data_keys))
# fill loop_data
loop_data = dict()
loop_data["chain_idx"] = chain_idx
loop_data["start_idx"] = start_idx
loop_data["loop_length"] = loop_length
loop_data["lc_data"] = lc_data
return loop_data
###############################################################################
# MAIN
###############################################################################
# keep track of starting time for possible runtime abort
start_time = time.time()
# for better error handling: unbuffer stdout
unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0)
sys.stdout = unbuffered
# check command line args
if len(sys.argv) < 2 or not sys.argv[1].lower() in ["chunk", "range"]:
print usage_string
sys.exit(1)
# timing
core.StaticRuntimeProfiler.Start('PYTHON::OVERHEAD')
# get fragments
core.StaticRuntimeProfiler.Start('PYTHON::JSON')
with open(in_fragments, "r") as json_file:
json_obj = json.load(json_file)
fragments = [loop.FragmentInfo(f[0], f[1], f[2]) for f in json_obj]
core.StaticRuntimeProfiler.Stop()
# get command line args
run_mode = sys.argv[1].lower()
if run_mode == "chunk":
if len(sys.argv) < 4:
chunk_id = 1
chunk_size = len(fragments)
else:
chunk_id = int(sys.argv[2])
chunk_size = int(sys.argv[3])
assert(chunk_id > 0)
frag_start = (chunk_id-1)*chunk_size
frag_end = chunk_id*chunk_size
else:
if len(sys.argv) > 2:
frag_start = int(sys.argv[2])
else:
frag_start = 0
if len(sys.argv) > 3:
frag_end = int(sys.argv[3])
else:
frag_end = frag_start+1
# get dbs
big_structure_db = loop.LoadStructureDB()
# big_frag_db = loop.LoadFragDB()
sub_structure_db = loop.StructureDB.Load(in_structure_db)
sub_frag_db = loop.FragDB.Load(in_frag_db)
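# FragDB bin sizes; half of each is used as threshold in WithinBoundsScore
# (the angular bin size is reported in degrees and converted to radians below)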
dist_bin_size = sub_frag_db.GetDistBinSize()
ang_bin_size = sub_frag_db.GetAngularBinSize() * math.pi / 180.0
# dbs to use for loop candidates
structure_db = sub_structure_db
frag_db = sub_frag_db
# define score keys
bb_scorers = ["cb_packing", "cbeta", "reduced", "clash", "hbond", "torsion"]
aa_scorers = ["aa_interaction", "aa_packing", "aa_clash"]
# define all columns we want in loop_data for each loop
cluster_max_dist = {"cluster_idx_2": 2.0,
"cluster_idx_3": 3.0,
"cluster_idx_4": 4.0}
lc_data_keys = ["ca_rmsd", "within_bounds", "seq_prof_score"] \
+ ["str_prof_score", "stem_rmsd"] \
+ bb_scorers + [GetNoRelaxKey(key) for key in aa_scorers] \
+ ["relax_pot_e"] + aa_scorers \
+ sorted(cluster_max_dist.keys())
# choose fragments to analyze
print "LOOPS in [%d, %d[" % (frag_start, frag_end)
print '-> mem', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
loop_datas = list()
max_frag_time = 0
for fragment in fragments[frag_start:frag_end]:
frag_start_time = time.time()
# extract info
chain_idx = fragment.chain_index
loop_length = fragment.length
start_idx = fragment.offset
# get entity and profile out of full DB
coord_info = big_structure_db.GetCoordInfo(chain_idx)
frag_info = loop.FragmentInfo(chain_idx, 0, coord_info.size)
seqres = big_structure_db.GetSequence(frag_info)
bb_list = big_structure_db.GetBackboneList(frag_info, seqres)
prof = big_structure_db.GetSequenceProfile(frag_info)
ent = bb_list.ToEntity()
# io.SavePDB(ent, "test.pdb")
# do some loop closing with sub dbs
assert(seqres == ''.join([r.one_letter_code for r in ent.residues]))
loop_seq = seqres[start_idx:start_idx+loop_length]
n_stem = ent.residues[start_idx]
c_stem = ent.residues[start_idx+loop_length-1]
# get loop candidates
loop_candidates = modelling.LoopCandidates.FillFromDatabase(\
n_stem, c_stem, loop_seq, frag_db, structure_db, True)
# get scorers
bb_scorer = scoring.BackboneOverallScorer()
for key in bb_scorers:
bb_scorer[key] = LoadScorer(key)
aa_scorer = scoring.AllAtomOverallScorer()
for key in aa_scorers:
aa_scorer[key] = LoadScorer(key)
# get/set environments
bb_score_env = scoring.BackboneScoreEnv(seqres)
bb_score_env.SetInitialEnvironment(ent)
bb_scorer.AttachEnvironment(bb_score_env)
aa_score_env = loop.AllAtomEnv(seqres)
aa_score_env.SetInitialEnvironment(ent)
aa_scorer.AttachEnvironment(aa_score_env)
# get sidechain reconstructor
aa_sc_env = loop.AllAtomEnv(seqres)
aa_sc_env.SetInitialEnvironment(ent)
sc_rec = sidechain.SidechainReconstructor(graph_max_complexity=10000000)
sc_rec.AttachEnvironment(aa_sc_env)
# precompute sidechains for chain with reference loop in it?
if reconstruct_full_chain_sc:
sc_result = sc_rec.Reconstruct(1, coord_info.size)
aa_sc_env.SetEnvironment(sc_result.env_pos)
# DO IT
ref_bb_list = bb_list.Extract(start_idx, start_idx+loop_length)
loop_datas.append(GetLoopData(loop_candidates))
# timing
frag_time = time.time() - frag_start_time
max_frag_time = max(max_frag_time, frag_time)
el_time = time.time() - start_time
cur_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
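# pessimistic estimate (in hours) of the total runtime after processing one
# more fragment: elapsed time plus twice the slowest fragment seen so far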
est_next_time = (el_time + 2*max_frag_time)/3600
print '-> N %d / est_next_time %g / frag_time %g / mem %d' \
% (len(loop_datas), est_next_time, frag_time, cur_mem)
# check if we should abort and dump
if est_next_time > max_time or cur_mem > max_mem:
print "ABORTING", len(loop_datas)
break
# timing
core.StaticRuntimeProfiler.Stop()
core.StaticRuntimeProfiler.PrintSummary(12)
# dump it
frag_end = frag_start + len(loop_datas)
out_filename = out_format % (frag_start, frag_end)
with open(out_filename, "w") as json_file:
json_obj = {"frag_start": frag_start, "frag_end": frag_end,
"loop_datas": loop_datas}
json.dump(json_obj, json_file)