From 9e87b54599445d08f08658dc41867142eb9454bc Mon Sep 17 00:00:00 2001
From: ticlla <monicaroxana.ticllaccenhua@unibas.ch>
Date: Thu, 19 Sep 2019 20:06:15 -0500
Subject: [PATCH] update tacos.distance.hamming_pool to allow child process inherit data_array

---
 README.md         |  4 ----
 tacos/cli.py      |  8 ++++++--
 tacos/distance.py | 29 ++++++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 34c8f78..c23171f 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,6 @@ Run the application:
 $ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
-or
-```
-$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
-```
 
 To run the tests:
 ```
diff --git a/tacos/cli.py b/tacos/cli.py
index 19df2cc..09cca4a 100644
--- a/tacos/cli.py
+++ b/tacos/cli.py
@@ -19,14 +19,17 @@ def tacos():
 def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
     '''Computes the genetic distance between each genome pair in INPUT.
 
+    \b
     Distances are measured as the number of substitutions needed to convert
     genome g_j to genome g_i (i.e Hamming distance). For each pair positions with
     missing values are ignored.
 
+    \b
     INPUT is the path to the file with variable positions (rows) and genomes (columns) of the following format:
     +-----------+--------+-----------+----------+-----------+-----+
     |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
     +-----------+--------+-----------+----------+-----------+-----+
+    \b
     Where:
     - Position: int
         Corresponds to a genomic position
@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
         The allele for the corresponding position in genome GENOME_1
         Values are any of IUPAC's encoding:
         ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
-
-    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
+    \b
+    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
+    are encoded as missing values.
     '''
     INPUT_file = input_
     genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
diff --git a/tacos/distance.py b/tacos/distance.py
index 8b19ea6..2e42b9b 100644
--- a/tacos/distance.py
+++ b/tacos/distance.py
@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fal
     else:
         return(out_dist)
 
+def _hamming_pool_init_worker(data_array):
+    global data_array_in_worker
+    data_array_in_worker = data_array
+
+def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1, :],
+                                                     data_array_in_worker[g2, :],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1, g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return (pair_distances)
 
 def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
     """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fals
     genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))
 
     # Here we copy the data_array to a shared memory array
-    # Using the shared memory unlock array gained 5 fold more speed
     data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+    global data_array_shm_b
     data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
     data_array_shm_b.shape = data_array.shape
     data_array_shm_b[:] = data_array
-
     # Start a multiprocessing pool
     genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
-    with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    #with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    with closing(get_context("spawn").Pool(processes=n_jobs,
+                                           initializer=_hamming_pool_init_worker,
+                                           initargs=(data_array_shm_b,))) as pool:
         print('starting pool with {} processes'.format(n_jobs))
-        get_distance_partial = partial(hamming_pairs,
-                                       data_array=data_array_shm_b,
+        #get_distance_partial = partial(hamming_pairs,
+        #                               data_array=data_array_shm_b,
+        #                               missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+        get_distance_partial = partial(_hamming_pool_helper,
                                        missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
         distance_mp = pool.imap(get_distance_partial,
                                 tqdm.tqdm(genome_pairs_chunks),
                                 chunksize=1)
--
GitLab
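The change above follows the standard multiprocessing recipe for sharing a large read-only array with pool workers: allocate it once in shared memory, hand it to each child through the Pool's initializer/initargs, and let the per-chunk task function read it from a module-level global instead of receiving it pickled with every call. The sketch below is a minimal, self-contained illustration of that recipe, not the tacos code: the names (_init_worker, _count_mismatches), the toy int8 data, and the plain mismatch count standing in for genetic_hamming are all assumptions. It also passes the raw shared buffer plus the shape to the initializer and rebuilds the NumPy view inside each worker, which keeps the underlying buffer shared rather than copied.

import ctypes
import itertools
import multiprocessing as mp

import numpy as np


def _init_worker(shared_buf, shape):
    # Runs once per child process: rebuild a NumPy view of the shared buffer
    # and publish it as a module-level global for the task function to use.
    global _data
    _data = np.frombuffer(shared_buf, dtype=ctypes.c_int8).reshape(shape)


def _count_mismatches(index_pairs):
    # Toy stand-in for the per-chunk helper: count mismatching positions
    # for each (i, j) pair of rows in the shared array.
    return [int(np.sum(_data[i, :] != _data[j, :])) for i, j in index_pairs]


if __name__ == '__main__':
    rng = np.random.default_rng(0)
    data = rng.integers(0, 4, size=(6, 50), dtype=np.int8)  # 6 toy "genomes"

    # Copy the data into a shared, lock-free buffer (mirrors mp.Array(..., lock=False)).
    shared_buf = mp.Array(ctypes.c_int8, data.size, lock=False)
    np.frombuffer(shared_buf, dtype=ctypes.c_int8)[:] = data.ravel()

    pairs = list(itertools.combinations(range(data.shape[0]), 2))
    n_jobs = 2
    chunks = np.array_split(pairs, n_jobs)

    # The initializer hands the shared buffer to every worker exactly once.
    with mp.get_context('spawn').Pool(processes=n_jobs,
                                      initializer=_init_worker,
                                      initargs=(shared_buf, data.shape)) as pool:
        per_chunk = pool.map(_count_mismatches, chunks)

    distances = [d for chunk in per_chunk for d in chunk]
    print(distances)  # one mismatch count per genome pair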