From 9e87b54599445d08f08658dc41867142eb9454bc Mon Sep 17 00:00:00 2001
From: ticlla <monicaroxana.ticllaccenhua@unibas.ch>
Date: Thu, 19 Sep 2019 20:06:15 -0500
Subject: [PATCH] update tacos.distance.hamming_pool to allow child process inherit data_array

---
 README.md         |  4 ----
 tacos/cli.py      |  8 ++++++--
 tacos/distance.py | 29 ++++++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 34c8f78..c23171f 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,6 @@ Run the application:
 $ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
-or
-```
-$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
-```
 
 To run the tests:
 ```
diff --git a/tacos/cli.py b/tacos/cli.py
index 19df2cc..09cca4a 100644
--- a/tacos/cli.py
+++ b/tacos/cli.py
@@ -19,14 +19,17 @@ def tacos():
 def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
     '''Computes the genetic distance between each genome pair in INPUT.
 
+    \b
     Distances are measured as the number of substitutions needed to convert
     genome g_j to genome g_i (i.e Hamming distance). For each pair positions with
     missing values are ignored.
 
+    \b
     INPUT is the path to the file with variable positions (rows) and genomes (columns) of the following format:
     +-----------+--------+-----------+----------+-----------+-----+
     |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
     +-----------+--------+-----------+----------+-----------+-----+
+    \b
     Where:
     - Position: int
         Corresponds to a genomic position
@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
         The allele for the corresponding position in genome GENOME_1
         Values are any of IUPAC's encoding:
         ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
-
-    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
+    \b
+    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
+    are encoded as missing values.
     '''
     INPUT_file = input_
     genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
diff --git a/tacos/distance.py b/tacos/distance.py
index 8b19ea6..2e42b9b 100644
--- a/tacos/distance.py
+++ b/tacos/distance.py
@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fal
     else:
         return(out_dist)
 
+def _hamming_pool_init_worker(data_array):
+    global data_array_in_worker
+    data_array_in_worker = data_array
+
+def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1, :],
+                                                     data_array_in_worker[g2, :],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1, g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return (pair_distances)
 
 def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
     """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fals
     genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))
 
     # Here we copy the data_array to a shared memory array
-    # Using the shared memory unlock array gained 5 fold more speed
     data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+    global data_array_shm_b
     data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
     data_array_shm_b.shape = data_array.shape
     data_array_shm_b[:] = data_array
-
     # Start a multiprocessing pool
     genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
-    with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    #with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    with closing(get_context("spawn").Pool(processes=n_jobs,
+                                           initializer=_hamming_pool_init_worker,
+                                           initargs=(data_array_shm_b,))) as pool:
         print('starting pool with {} processes'.format(n_jobs))
-        get_distance_partial = partial(hamming_pairs,
-                                       data_array=data_array_shm_b,
+        #get_distance_partial = partial(hamming_pairs,
+        #                               data_array=data_array_shm_b,
+        #                               missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+        get_distance_partial = partial(_hamming_pool_helper,
                                        missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
         distance_mp = pool.imap(get_distance_partial,
                                 tqdm.tqdm(genome_pairs_chunks),
                                 chunksize=1)
--
GitLab
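The change above follows the standard multiprocessing recipe for sharing a large read-only array with pool workers: allocate it once in shared memory, hand it to each child through the Pool's initializer/initargs, and let the per-chunk task function read it from a module-level global instead of receiving it pickled with every call. The sketch below is a minimal, self-contained illustration of that recipe, not the tacos code: the names (_init_worker, _count_mismatches), the toy int8 data, and the plain mismatch count standing in for genetic_hamming are all assumptions. It also passes the raw shared buffer plus the shape to the initializer and rebuilds the NumPy view inside each worker, which keeps the underlying buffer shared rather than copied.

import ctypes
import itertools
import multiprocessing as mp

import numpy as np


def _init_worker(shared_buf, shape):
    # Runs once per child process: rebuild a NumPy view of the shared buffer
    # and publish it as a module-level global for the task function to use.
    global _data
    _data = np.frombuffer(shared_buf, dtype=ctypes.c_int8).reshape(shape)


def _count_mismatches(index_pairs):
    # Toy stand-in for the per-chunk helper: count mismatching positions
    # for each (i, j) pair of rows in the shared array.
    return [int(np.sum(_data[i, :] != _data[j, :])) for i, j in index_pairs]


if __name__ == '__main__':
    rng = np.random.default_rng(0)
    data = rng.integers(0, 4, size=(6, 50), dtype=np.int8)  # 6 toy "genomes"

    # Copy the data into a shared, lock-free buffer (mirrors mp.Array(..., lock=False)).
    shared_buf = mp.Array(ctypes.c_int8, data.size, lock=False)
    np.frombuffer(shared_buf, dtype=ctypes.c_int8)[:] = data.ravel()

    pairs = list(itertools.combinations(range(data.shape[0]), 2))
    n_jobs = 2
    chunks = np.array_split(pairs, n_jobs)

    # The initializer hands the shared buffer to every worker exactly once.
    with mp.get_context('spawn').Pool(processes=n_jobs,
                                      initializer=_init_worker,
                                      initargs=(shared_buf, data.shape)) as pool:
        per_chunk = pool.map(_count_mismatches, chunks)

    distances = [d for chunk in per_chunk for d in chunk]
    print(distances)  # one mismatch count per genome pair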