Update tacos.distance.hamming_pool to allow child processes to inherit data_array

9e87b545 · Ticlla Ccenhua Monica Roxana · cf39c997 · 9e87b545 · 9e87b545 · 9e87b545
Commit 9e87b545 authored 5 years ago by Ticlla Ccenhua Monica Roxana
--- a/README.md
+++ b/README.md
@@ -25,10 +25,6 @@ Run the application:
 $ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
-or
-```
-$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
-```

 To run the tests:
 ```

--- a/tacos/cli.py
+++ b/tacos/cli.py
@@ -19,14 +19,17 @@ def tacos():
 def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
    '''Computes the genetic distance between each genome pair in INPUT.

+    \b
    Distances are measured as the number of substitutions needed to convert genome g_j to
    genome g_i (i.e Hamming distance). For each pair positions with missing values are ignored.

+    \b
    INPUT is the path to the file with variable positions (rows) and genomes (columns) of
    the following format:
        +-----------+--------+-----------+----------+-----------+-----+
        |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
        +-----------+--------+-----------+----------+-----------+-----+
+    \b
    Where:
        - Position: int
            Corresponds to a genomic position
@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
            The allele for the corresponding position in genome GENOME_1
            Values are any of IUPAC's encoding:
                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
-
-    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
+    \b
+    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
+    are encoded as missing values.
    '''
    INPUT_file = input_
    genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')

--- a/tacos/distance.py
+++ b/tacos/distance.py
@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fal
        else:
            return(out_dist)

+def _hamming_pool_init_worker(data_array):
+    global data_array_in_worker
+    data_array_in_worker = data_array
+
+def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1, :],
+                                                     data_array_in_worker[g2, :],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1, g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return (pair_distances)

 def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
    """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fals
        genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))

        # Here we copy the data_array to a shared memory array
-        # Using the shared memory array gained 5 fold more speed
+        # Using the unlocked (lock=False) shared memory array gave a 5-fold speedup
        data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+        global data_array_shm_b
        data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
        data_array_shm_b.shape = data_array.shape
        data_array_shm_b[:] = data_array

-
        # Start a multiprocessing pool
        genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
-        with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+        #with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+        with closing(get_context("spawn").Pool(processes=n_jobs,
+                                               initializer=_hamming_pool_init_worker,
+                                               initargs=(data_array_shm_b,))) as pool:
            print('starting pool with {} processes'.format(n_jobs))
-            get_distance_partial = partial(hamming_pairs,
-                                           data_array=data_array_shm_b,
+            #get_distance_partial = partial(hamming_pairs,
+            #                               data_array=data_array_shm_b,
+            #                               missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+            get_distance_partial = partial(_hamming_pool_helper,
                                           missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)

            distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1)