Clean up BFT output by removing cases with 0 or 1 candidates

13d8e357 · Gerardo Tauriello · c15c6610 · 13d8e357 · 13d8e357
Commit 13d8e357 authored Nov 29, 2016 by Gerardo Tauriello
--- a/extras/scoring_weight_training/README
+++ b/extras/scoring_weight_training/README
@@ -31,7 +31,8 @@ $ grep "ABORT" *.stdout"
 $ pm generate_bft_chunks.py range X Y
 - combine all "loop_data_range_X_Y.json" by executing:
 $ pm generate_bft.py
-> this should produce a fairly big numpy array dumped as loop_bft.dat
+-> this should produce a fairly big numpy array dumped as loop_bft.dat and a
+   json file loop_infos.json containing information to access the data

 3) TODO: optimize weights...


--- a/extras/scoring_weight_training/generate_bft.py
+++ b/extras/scoring_weight_training/generate_bft.py
@@ -5,6 +5,7 @@ IN:
 OUT:
 - loop_bft.dat: big table pickled from numpy.matrix (load with numpy.load)
 - loop_infos.json: info for each loop:
+  -> "loop_data_keys": keys identifying each column in the numpy table
  -> "first_indices": indexing into bft: bft rows in range(first_indices[i],
                      first_indices[i+1]) belong to loop i
  -> "loop_lengths": loop_lengths[i] is length of loop i
@@ -17,7 +18,7 @@ import json, os, numpy, time
 # SETUP
 ###############################################################################
 # full paths to IN and OUT files
-in_path = "out_frm"  # all json files in that path with correct keys are ok
+in_path = "."  # all json files in that path with correct keys are ok
 out_loop_bft = "loop_bft.dat"
 out_loop_infos = "loop_infos.json"

@@ -94,17 +95,35 @@ if len(unique_frag_idx) != len(fragment_indices):
    if fragment_indices.count(frag_idx) > 1:
      print "DUPLICATE LOOPS FOR", frag_idx

-# get indexing into bft: range(first_indices[i], first_indices[i+1]) for loop i
+# consistency check
 num_loops = len(bft_list)
 assert(len(fragment_indices) == num_loops)
 assert(len(loop_lengths) == num_loops)
+
+# get indexing into bft: range(first_indices[i], first_indices[i+1]) for loop i
+# -> while doing so, we also remove loops with fewer than 2 loop candidates
 first_indices = list()
 total_num_lc = 0
+to_remove = []
 for i in range(num_loops):
+  num_lc = bft_list[i].shape[0]
+  if num_lc < 2:
+    to_remove.append(i)
+  else:
    first_indices.append(total_num_lc)
-  total_num_lc += bft_list[i].shape[0]
+    total_num_lc += num_lc
 # last one must be full size
+print "REMOVED", num_loops - len(first_indices), "LOOPS"
 first_indices.append(total_num_lc)
+# clean up other lists
+for i in reversed(to_remove):
+  del bft_list[i]
+  del fragment_indices[i]
+  del loop_lengths[i]
+# consistency check
+num_loops = len(bft_list)
+assert(len(fragment_indices) == num_loops)
+assert(len(loop_lengths) == num_loops)
 assert(len(first_indices) == num_loops + 1)

 # BUILD BFT
@@ -132,5 +151,5 @@ for loop_length in set(loop_lengths):
      num_lc = first_indices[i+1] - first_indices[i]
      num_lc_ll += num_lc
      num_loops_ll += 1
-  print "LL", loop_length, "LC", num_lc_ll, "AVG-LC", \
-        float(num_lc_ll) / num_loops_ll
+  print "LL", loop_length, "LC", num_lc_ll, "LOOPS", num_loops_ll, \
+        "AVG-LC", float(num_lc_ll) / num_loops_ll