Commit 0a51a477 authored by Gerardo Tauriello

SCHWED-1555: added scripts to generate big table of loop scorings.

parent 587014e9
@@ -2,8 +2,37 @@ This folder contains scripts that were used to train weights for the scoring
functions used in loop selection in the pipeline. You need a compiled and
working version of ProMod3 to run these scripts. Scripts are provided as-is!
Execute the scripts in this order (within this folder):
----------------------
Steps to be performed:
----------------------
1) setup testset and dbs by executing scripts in this order within this folder:
$ pm generate_bft_testset.py
$ pm generate_bft_dbs.py
-> by default we get 50000 loops (5000 for each loop length in [3,12])
2) generate big table with scores for each loop candidate for each loop
- This takes a lot of time and is best done on a cluster (we assume bc2)
- Test script on a small example
$ pm generate_bft_chunks.py range 0 10
-> should generate file "loop_data_range_00000_00010.json", which you can delete
- Adapt "PROMOD3_ROOT" in "run_array_job.sh" to point to PM3 stage. Then submit:
$ qsub run_array_job.sh
-> this executes 500 jobs, each doing 100 loops with generate_bft_chunks.py
-> the script automatically aborts and dumps partial results if memory usage or
runtime exceed the given thresholds ("max_time" and "max_mem" in the Python script)
-> adapt those values if you customize the max. memory and runtime in run_array_job.sh
-> output is stored as "loop_data_range_X_Y.json" for loops in [X,Y[
- check that each run_out_X.stderr is empty by executing:
$ cat *.stderr
- check for unfinished chunks (i.e. missing loops) by executing:
$ grep "ABORT" *.stdout
- if loops in [X,Y[ are missing (see the helper sketch after these steps to spot
such gaps), manually execute (or submit):
$ pm generate_bft_chunks.py range X Y
- combine all "loop_data_range_X_Y.json" by executing:
$ pm generate_bft.py
-> this should produce a fairly big numpy array dumped as loop_bft.dat
3) TODO: optimize weights...
See the descriptions in each script on what files are being generated.
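The checks in step 2 can also be scripted. The following is a small optional
Python sketch (not part of this commit) that scans the chunk files and reports
which loops are not yet covered; the expected loop count and the directory
holding the chunk files are assumptions to adjust as needed.

import os, re

num_expected = 50000   # total number of loops (assumed; matches the default above)
chunk_dir = "."        # directory holding the loop_data_range_X_Y.json files (assumed)

covered = set()
pattern = re.compile(r"loop_data_range_(\d+)_(\d+)\.json$")
for file_name in sorted(os.listdir(chunk_dir)):
    match = pattern.match(file_name)
    if match:
        # each file covers loops in [X,Y[
        covered.update(range(int(match.group(1)), int(match.group(2))))
missing = sorted(set(range(num_expected)) - covered)
if missing:
    print("MISSING LOOPS: %s" % missing)
else:
    print("ALL %d LOOPS COVERED" % num_expected)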
"""
GOAL: combine chunks into a single BFT
IN:
- loop_data_range_X_Y.json from generate_bft_chunks.py
OUT:
- loop_bft.dat: big table pickled as a 2D numpy array (load with numpy.load)
- loop_infos.json: info for each loop:
-> "first_indices": indexing into bft: bft rows in range(first_indices[i],
first_indices[i+1]) belong to loop i
-> "loop_lengths": loop_lengths[i] is length of loop i
-> "fragment_indices": fragment_indices[i] is loop index into fragments.json
"""
import json, os, numpy, time
###############################################################################
# SETUP
###############################################################################
# full paths to IN and OUT files
in_path = "out_frm" # all json files in that path with correct keys are ok
out_loop_bft = "loop_bft.dat"
out_loop_infos = "loop_infos.json"
###############################################################################
# MAIN
###############################################################################
# collect some files
exp_keys = sorted(["frag_start", "frag_end", "loop_datas"])
file_list = os.listdir(in_path)
# only consider json files
json_files = [f for f in file_list if os.path.splitext(f)[1] == ".json"]
# collect data:
# - BFT -> numpy matrices for each loop
# - fragment_indices -> fragment index (into fragments.json) for each loop
# - loop_lengths -> loop length for each loop
# - loop_data_keys -> sorted keys in lc_data (must be same for each loop!)
bft_list = list()
fragment_indices = list()
loop_lengths = list()
loop_data_keys = None
for file_name in sorted(json_files):
# try to load json...if we fail, we ignore...
start_time = time.time()
try:
with open(os.path.join(in_path, file_name), "r") as json_file:
json_obj = json.load(json_file)
except Exception as ex:
continue
# we expect dict
if type(json_obj) == dict:
# check for keys (take care for unicode mess)
cur_keys = sorted([str(key) for key in json_obj.keys()])
if cur_keys == exp_keys:
# extract data
frag_start = json_obj["frag_start"]
frag_end = json_obj["frag_end"]
loop_datas = json_obj["loop_datas"]
N_loops = frag_end - frag_start
assert(len(loop_datas) == N_loops)
# get loops
num_lc_range = 0
for i_loop in range(N_loops):
loop_data = loop_datas[i_loop]
lc_data = loop_data["lc_data"]
if loop_data_keys is None:
loop_data_keys = sorted(lc_data.keys())
else:
assert(loop_data_keys == sorted(lc_data.keys()))
num_keys = len(loop_data_keys)
assert(num_keys > 0)
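# number of loop candidates for this loop (each lc_data column has one entry
# per candidate)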
num_lc = len(lc_data[loop_data_keys[0]])
# get BFT for this loop
cur_bft = numpy.empty((num_lc, num_keys), dtype=numpy.float32)
for i_lc in range(num_lc):
for i_key in range(num_keys):
cur_bft[i_lc, i_key] = lc_data[loop_data_keys[i_key]][i_lc]
# keep relevant data
bft_list.append(cur_bft)
fragment_indices.append(frag_start + i_loop)
loop_lengths.append(loop_data["loop_length"])
num_lc_range += num_lc
# report
print "ADDED %d LC for LOOPS in [%d,%d[ in %g s" \
% (num_lc_range, frag_start, frag_end, time.time() - start_time)
# check covered ranges
max_frag_idx = max(fragment_indices)
unique_frag_idx = set(fragment_indices)
if len(unique_frag_idx) != max_frag_idx+1:
missing_indices = sorted(set(range(max_frag_idx+1)) - unique_frag_idx)
print "MISSING LOOPS FOR", missing_indices
if len(unique_frag_idx) != len(fragment_indices):
for frag_idx in unique_frag_idx:
if fragment_indices.count(frag_idx) > 1:
print "DUPLICATE LOOPS FOR", frag_idx
# get indexing into bft: range(first_indices[i], first_indices[i+1]) for loop i
num_loops = len(bft_list)
assert(len(fragment_indices) == num_loops)
assert(len(loop_lengths) == num_loops)
first_indices = list()
total_num_lc = 0
for i in range(num_loops):
first_indices.append(total_num_lc)
total_num_lc += bft_list[i].shape[0]
# last one must be full size
first_indices.append(total_num_lc)
assert(len(first_indices) == num_loops + 1)
# BUILD BFT
num_keys = len(loop_data_keys)
print "BUILDING BFT with %d rows and %d cols" % (total_num_lc, num_keys)
bft = numpy.concatenate(bft_list)
assert(bft.shape[0] == total_num_lc)
assert(bft.shape[1] == num_keys)
# dump data
bft.dump(out_loop_bft)
with open(out_loop_infos, "w") as json_file:
json_obj = {"first_indices": first_indices, "loop_lengths": loop_lengths,
"fragment_indices": fragment_indices}
json.dump(json_obj, json_file)
# some stats
for loop_length in set(loop_lengths):
num_lc_ll = 0
num_loops_ll = 0
for i in range(num_loops):
if loop_lengths[i] == loop_length:
num_lc = first_indices[i+1] - first_indices[i]
num_lc_ll += num_lc
num_loops_ll += 1
print "LL", loop_length, "LC", num_lc_ll, "AVG-LC", \
float(num_lc_ll) / num_loops_ll
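For reference, here is a minimal sketch (not part of the committed scripts)
showing how the two files written above could be read back; loop_idx = 0 is
just an illustrative choice.

import json, numpy

bft = numpy.load("loop_bft.dat")   # pickled 2D array (newer numpy may need allow_pickle=True)
with open("loop_infos.json", "r") as json_file:
    loop_infos = json.load(json_file)

loop_idx = 0                        # example: inspect the first loop
first_indices = loop_infos["first_indices"]
rows = bft[first_indices[loop_idx]:first_indices[loop_idx+1], :]  # one row per loop candidate
print("loop %d: length %d, %d candidates, %d score columns"
      % (loop_idx, loop_infos["loop_lengths"][loop_idx],
         rows.shape[0], rows.shape[1]))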
"""
GOAL: extract a chunk of the BFT
IN:
- fragments.json from generate_bft_testset.py
- sub_structure_db.dat from generate_bft_dbs.py
- sub_frag_db.dat from generate_bft_dbs.py
OUT:
- loop_data_range_X_Y.json: BFT for loops in [X,Y[. Content:
-> "frag_start" = X, "frag_end" = Y
-> "loop_datas" = list of loop_data (len = Y-X). One entry per loop with:
-> "chain_idx", "start_idx" and "loop_length" defining fragment info
-> "lc_data" = dict with a named column for each computed score
plus "ca_rmsd" for CA RMSD to reference loop
-> each loop has same column names
-> each column has 1 entry per loop candidate
"""
usage_string = """
USAGE: pm generate_bft_chunks.py MODE OPTIONS
-> MODE = 'chunk': do a chunk -> OPTIONS = 'chunk_id chunk_size'
chunk_id must be > 0 & if args not given, do all
-> MODE = 'range': do a range -> OPTIONS = 'range_start range_end'
if range_start not given: range_start = 0
if range_end not given: range_end = range_start + 1
"""
import math, json, os, sys, resource, time
from ost import io, seq
from promod3 import core, loop, modelling, scoring, sidechain
###############################################################################
# SETUP
###############################################################################
# full paths to IN and OUT files
in_path = "."
in_fragments = os.path.join(in_path, "fragments.json")
in_structure_db = os.path.join(in_path, "sub_structure_db.dat")
in_frag_db = os.path.join(in_path, "sub_frag_db.dat")
out_format = "loop_data_range_%05d_%05d.json"
# setup all atom relaxation (def. params: 100, 0.01)
aa_relax_steps = 50
aa_relax_stop_criterion = 0.01
# choose FF lookup for AA relaxation
ff_lookup = loop.ForcefieldLookup.GetDefault()
# choose torsion sampler for ApplyCCD
torsion_sampler = loop.LoadTorsionSamplerCoil()
# abort (and dump) once estimated runtime after next loop is above this
max_time = 23.8 # for 24h queue
# abort (and dump) once memory consumption is above this
max_mem = 7500
# shall we reconstruct sidechain of full chain first?
# -> our input is BB only and by def. we reconstruct env. sc. for each lc.
# -> if set to True, we only reconstruct sc. of lc. and not of env. (30% faster)
reconstruct_full_chain_sc = False
###############################################################################
# HELPERS
###############################################################################
def LoadScorer(key):
# get default scorers
if key == "cb_packing":
return scoring.LoadCBPackingScorer()
elif key == "cbeta":
return scoring.LoadCBetaScorer()
elif key == "reduced":
return scoring.LoadReducedScorer()
elif key == "clash":
return scoring.ClashScorer()
elif key == "hbond":
return scoring.LoadHBondScorer()
elif key == "torsion":
return scoring.LoadTorsionScorer()
elif key == "aa_interaction":
return scoring.LoadAllAtomInteractionScorer()
elif key == "aa_packing":
return scoring.LoadAllAtomPackingScorer()
elif key == "aa_clash":
return scoring.AllAtomClashScorer()
else:
raise RuntimeError("Unknown scorer " + key)
def GetNoRelaxKey(key):
return key + "_no_relax"
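# Flag each loop candidate whose stem-pair orientation (distance and the four
# angles) deviates from the target stems by at most the given thresholds.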
def WithinBoundsScore(loop_candidates, max_dist_diff, max_ang_dist):
# prepare result list (1 bool for each candidate)
result = []
# get stem orient.
n_stem_c = core.StemCoords(n_stem)
c_stem_c = core.StemCoords(c_stem)
stem_o = core.StemPairOrientation(n_stem_c, c_stem_c)
bb_n_stem = core.StemCoords()
bb_c_stem = core.StemCoords()
for lc in loop_candidates:
bb_n_stem.n_coord = lc.GetN(0)
bb_n_stem.ca_coord = lc.GetCA(0)
bb_n_stem.c_coord = lc.GetC(0)
bb_c_stem.n_coord = lc.GetN(loop_length-1)
bb_c_stem.ca_coord = lc.GetCA(loop_length-1)
bb_c_stem.c_coord = lc.GetC(loop_length-1)
bb_o = core.StemPairOrientation(bb_n_stem, bb_c_stem)
result.append( abs(bb_o.distance - stem_o.distance) <= max_dist_diff
and abs(bb_o.angle_one - stem_o.angle_one) <= max_ang_dist
and abs(bb_o.angle_two - stem_o.angle_two) <= max_ang_dist
and abs(bb_o.angle_three - stem_o.angle_three) <= max_ang_dist
and abs(bb_o.angle_four - stem_o.angle_four) <= max_ang_dist)
return result
def GetLoopData(lc):
# get scores before CCD
within_bounds = WithinBoundsScore(lc, dist_bin_size/2, ang_bin_size/2)
seq_prof_score = lc.CalculateSequenceProfileScores(structure_db, prof,
start_idx)
str_prof_score = lc.CalculateStructureProfileScores(structure_db, prof,
start_idx)
stem_rmsd = lc.CalculateStemRMSDs(n_stem, c_stem)
# only keep ones that converge in CCD
orig_indices = lc.ApplyCCD(n_stem, c_stem, torsion_sampler)
print "-", len(lc), "LC for LOOP", chain_idx, start_idx, loop_length, len(prof)
# fill lc_data columns -> dict with all keys in lc_data_keys
lc_data = dict()
# get CA RMSD for each candidate
lc_data["ca_rmsd"] = list()
for lc_bb_list in lc:
lc_data["ca_rmsd"].append(lc_bb_list.CARMSD(ref_bb_list))
# get BB scores
start_res_num = start_idx + 1
for key in bb_scorers:
lc_data[key] = lc.CalculateScores(bb_scorer, key, start_res_num)
##########################################################
# AA SCORING
##########################################################
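# Per-candidate workflow: place the candidate in the sidechain env., rebuild
# its sidechains, score all AA scorers without relaxation, relax the loop and
# score again, then restore the original environments for the next candidate.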
# keep backup of loop
old_pos = aa_sc_env.GetAllAtomPositions().Copy()
old_loop = old_pos.Extract(start_idx, start_idx + loop_length)
# setup empty lists for AA results
lc_data["relax_pot_e"] = list()
for key in aa_scorers:
lc_data[GetNoRelaxKey(key)] = list()
lc_data[key] = list()
# do work for all
for lc_bb_list in lc:
# reconstruct sidechain
aa_sc_env.SetEnvironment(lc_bb_list, start_res_num)
sc_result = sc_rec.Reconstruct(start_res_num, loop_length)
# fix env. for scoring (always contains full loop too!)
aa_score_env.SetEnvironment(sc_result.env_pos)
for key in aa_scorers:
score = aa_scorer[key].CalculateScore(start_res_num, loop_length)
lc_data[GetNoRelaxKey(key)].append(score)
# repeat with relaxation
mm_sys = loop.MmSystemCreator(ff_lookup)
relaxer = modelling.AllAtomRelaxer(sc_result, mm_sys)
pot_e = relaxer.Run(sc_result, aa_relax_steps, aa_relax_stop_criterion)
lc_data["relax_pot_e"].append(pot_e)
aa_score_env.SetEnvironment(sc_result.env_pos)
for key in aa_scorers:
score = aa_scorer[key].CalculateScore(start_res_num, loop_length)
lc_data[key].append(score)
# reset score env.
sc_result.env_pos.all_pos = old_pos.Extract(sc_result.env_pos.res_indices)
aa_score_env.SetEnvironment(sc_result.env_pos)
# reset sc env
aa_sc_env.SetEnvironment(old_loop, start_res_num)
##########################################################
# get relevant pre-CCD scores
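# ApplyCCD above removed candidates that did not converge, so reindex the
# pre-CCD scores via orig_indices to stay aligned with the surviving candidates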
lc_data["whithin_bounds"] = [whithin_bounds[i] for i in orig_indices]
lc_data["seq_prof_score"] = [seq_prof_score[i] for i in orig_indices]
lc_data["str_prof_score"] = [str_prof_score[i] for i in orig_indices]
lc_data["stem_rmsd"] = [stem_rmsd[i] for i in orig_indices]
# get clustering
for key in cluster_max_dist:
clusters = lc.GetClusters(cluster_max_dist[key])
cluster_indices = [0] * len(lc)
for cluster_idx, cluster in enumerate(clusters):
for idx in cluster:
cluster_indices[idx] = cluster_idx
lc_data[key] = cluster_indices
# check lc_data columns (same length, all keys there)
assert(all(len(lc_data[key]) == len(lc) for key in lc_data))
assert(all(key in lc_data for key in lc_data_keys))
# fill loop_data
loop_data = dict()
loop_data["chain_idx"] = chain_idx
loop_data["start_idx"] = start_idx
loop_data["loop_length"] = loop_length
loop_data["lc_data"] = lc_data
return loop_data
###############################################################################
# MAIN
###############################################################################
# keep track of starting time for possible runtime abort
start_time = time.time()
# for better error handling: unbuffer stdout
unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0)
sys.stdout = unbuffered
# check command line args
if len(sys.argv) < 2 or not sys.argv[1].lower() in ["chunk", "range"]:
print usage_string
sys.exit(1)
# timing
core.StaticRuntimeProfiler.Start('PYTHON::OVERHEAD')
# get fragments
core.StaticRuntimeProfiler.Start('PYTHON::JSON')
with open(in_fragments, "r") as json_file:
json_obj = json.load(json_file)
fragments = [loop.FragmentInfo(f[0], f[1], f[2]) for f in json_obj]
core.StaticRuntimeProfiler.Stop()
# get command line args
run_mode = sys.argv[1].lower()
if run_mode == "chunk":
if len(sys.argv) < 4:
chunk_id = 1
chunk_size = len(fragments)
else:
chunk_id = int(sys.argv[2])
chunk_size = int(sys.argv[3])
assert(chunk_id > 0)
frag_start = (chunk_id-1)*chunk_size
frag_end = chunk_id*chunk_size
else:
if len(sys.argv) > 2:
frag_start = int(sys.argv[2])
else:
frag_start = 0
if len(sys.argv) > 3:
frag_end = int(sys.argv[3])
else:
frag_end = frag_start+1
# get dbs
big_structure_db = loop.LoadStructureDB()
# big_frag_db = loop.LoadFragDB()
sub_structure_db = loop.StructureDB.Load(in_structure_db)
sub_frag_db = loop.FragDB.Load(in_frag_db)
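# FragDB bin sizes; half of each is used as threshold in WithinBoundsScore
# (the angular bin size is reported in degrees and converted to radians below)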
dist_bin_size = sub_frag_db.GetDistBinSize()
ang_bin_size = sub_frag_db.GetAngularBinSize() * math.pi / 180.0
# dbs to use for loop candidates
structure_db = sub_structure_db
frag_db = sub_frag_db
# define score keys
bb_scorers = ["cb_packing", "cbeta", "reduced", "clash", "hbond", "torsion"]
aa_scorers = ["aa_interaction", "aa_packing", "aa_clash"]
# define all columns we want in loop_data for each loop
cluster_max_dist = {"cluster_idx_2": 2.0,
"cluster_idx_3": 3.0,
"cluster_idx_4": 4.0}
lc_data_keys = ["ca_rmsd", "within_bounds", "seq_prof_score"] \
+ ["str_prof_score", "stem_rmsd"] \
+ bb_scorers + [GetNoRelaxKey(key) for key in aa_scorers] \
+ ["relax_pot_e"] + aa_scorers \
+ sorted(cluster_max_dist.keys())
# choose fragments to analyze
print "LOOPS in [%d, %d[" % (frag_start, frag_end)
print '-> mem', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
loop_datas = list()
max_frag_time = 0
for fragment in fragments[frag_start:frag_end]:
frag_start_time = time.time()
# extract info
chain_idx = fragment.chain_index
loop_length = fragment.length
start_idx = fragment.offset
# get entity and profile out of full DB
coord_info = big_structure_db.GetCoordInfo(chain_idx)
frag_info = loop.FragmentInfo(chain_idx, 0, coord_info.size)
seqres = big_structure_db.GetSequence(frag_info)
bb_list = big_structure_db.GetBackboneList(frag_info, seqres)
prof = big_structure_db.GetSequenceProfile(frag_info)
ent = bb_list.ToEntity()
# io.SavePDB(ent, "test.pdb")
# do some loop closing with sub dbs
assert(seqres == ''.join([r.one_letter_code for r in ent.residues]))
loop_seq = seqres[start_idx:start_idx+loop_length]
n_stem = ent.residues[start_idx]
c_stem = ent.residues[start_idx+loop_length-1]
# get loop candidates
loop_candidates = modelling.LoopCandidates.FillFromDatabase(\
n_stem, c_stem, loop_seq, frag_db, structure_db, True)
# get scorers
bb_scorer = scoring.BackboneOverallScorer()
for key in bb_scorers:
bb_scorer[key] = LoadScorer(key)
aa_scorer = scoring.AllAtomOverallScorer()
for key in aa_scorers:
aa_scorer[key] = LoadScorer(key)
# get/set environments
bb_score_env = scoring.BackboneScoreEnv(seqres)
bb_score_env.SetInitialEnvironment(ent)
bb_scorer.AttachEnvironment(bb_score_env)
aa_score_env = loop.AllAtomEnv(seqres)
aa_score_env.SetInitialEnvironment(ent)
aa_scorer.AttachEnvironment(aa_score_env)
# get sidechain reconstructor
aa_sc_env = loop.AllAtomEnv(seqres)
aa_sc_env.SetInitialEnvironment(ent)
sc_rec = sidechain.SidechainReconstructor(graph_max_complexity=10000000)
sc_rec.AttachEnvironment(aa_sc_env)
# precompute sidechains for chain with reference loop in it?
if reconstruct_full_chain_sc:
sc_result = sc_rec.Reconstruct(1, coord_info.size)
aa_sc_env.SetEnvironment(sc_result.env_pos)
# DO IT
ref_bb_list = bb_list.Extract(start_idx, start_idx+loop_length)
loop_datas.append(GetLoopData(loop_candidates))
# timing
frag_time = time.time() - frag_start_time
max_frag_time = max(max_frag_time, frag_time)
el_time = time.time() - start_time
cur_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
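# pessimistic estimate (in hours) of the total runtime after processing one
# more fragment: elapsed time plus twice the slowest fragment seen so far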
est_next_time = (el_time + 2*max_frag_time)/3600
print '-> N %d / est_next_time %g / frag_time %g / mem %d' \
% (len(loop_datas), est_next_time, frag_time, cur_mem)
# check if we should abort and dump
if est_next_time > max_time or cur_mem > max_mem:
print "ABORTING", len(loop_datas)
break
# timing
core.StaticRuntimeProfiler.Stop()
core.StaticRuntimeProfiler.PrintSummary(12)
# dump it
frag_end = frag_start + len(loop_datas)
out_filename = out_format % (frag_start, frag_end)
with open(out_filename, "w") as json_file:
json_obj = {"frag_start": frag_start, "frag_end": frag_end,
"loop_datas": loop_datas}
json.dump(json_obj, json_file)