Skip to content
Snippets Groups Projects
Commit 9e87b545 authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

update tacos.distance.hamming_pool to allow child process inherit data_array

parent cf39c997
No related branches found
No related tags found
No related merge requests found
......@@ -25,10 +25,6 @@ Run the application:
$ conda activate <path/tacos_env> # if not already activated
$ tacos --help
```
or
```
$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
```
To run the tests:
```
......
......@@ -19,14 +19,17 @@ def tacos():
def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
'''Computes the genetic distance between each genome pair in INPUT.
\b
Distances are measured as the number of substitutions needed to convert genome g_j to
genome g_i (i.e Hamming distance). For each pair positions with missing values are ignored.
\b
INPUT is the path to the file with variable positions (rows) and genomes (columns) of
the following format:
+-----------+--------+-----------+----------+-----------+-----+
|Position | Gene |Reference |GENOME_1 |GENOME_2 | ... |
+-----------+--------+-----------+----------+-----------+-----+
\b
Where:
- Position: int
Corresponds to a genomic position
......@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
The allele for the corresponding position in genome GENOME_1
Values are any of IUPAC's encoding:
['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
\b
Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
are encoded as missing values.
'''
INPUT_file = input_
genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
......
......@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fal
else:
return(out_dist)
def _hamming_pool_init_worker(data_array):
global data_array_in_worker
data_array_in_worker = data_array
def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
try:
pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1, :],
data_array_in_worker[g2, :],
missing, gap, ignore_gap, adjust)
for g1, g2 in tqdm.tqdm(genome_pairs)])
except IndexError:
print("Provided index is out of range in data_array.")
else:
return (pair_distances)
def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
"""Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
......@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fals
genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))
# Here we copy the data_array to a shared memory array
# Using the shared memory array gained 5 fold more speed
# Using the shared memory unlock array gained 5 fold more speed
data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
global data_array_shm_b
data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
data_array_shm_b.shape = data_array.shape
data_array_shm_b[:] = data_array
# Start a multiprocessing pool
genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
#with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
with closing(get_context("spawn").Pool(processes=n_jobs,
initializer=_hamming_pool_init_worker,
initargs=(data_array_shm_b,))) as pool:
print('starting pool with {} processes'.format(n_jobs))
get_distance_partial = partial(hamming_pairs,
data_array=data_array_shm_b,
#get_distance_partial = partial(hamming_pairs,
# data_array=data_array_shm_b,
# missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
get_distance_partial = partial(_hamming_pool_helper,
missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment