diff --git a/README.md b/README.md
index 34c8f78b6f888556f3004e84141919f4b05fec56..c23171f9e9d176e1589750b71ea9eb5f8e8a3c17 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,6 @@ Run the application:
 $ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
-or
-```
-$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
-```
 
 To run the tests:
 ```
diff --git a/tacos/cli.py b/tacos/cli.py
index 19df2cc603c5de2903b8543483e7d366eff07a53..09cca4a994a058efa0feadcbdaba8930331e690d 100644
--- a/tacos/cli.py
+++ b/tacos/cli.py
@@ -19,14 +19,17 @@ def tacos():
 def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
     '''Computes the genetic distance between each genome pair in INPUT.
 
+    \b
     Distances are measured as the number of substitutions needed to convert genome g_j to genome g_i
     (i.e Hamming distance). For each pair positions with missing values are ignored.
 
+    \b
     INPUT is the path to the file with variable positions (rows) and genomes (columns)
     of the following format:
     +-----------+--------+-----------+----------+-----------+-----+
     |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
     +-----------+--------+-----------+----------+-----------+-----+
+    \b
     Where:
     - Position: int
         Corresponds to a genomic position
@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
         The allele for the corresponding position in genome GENOME_1
         Values are any of IUPAC's encoding:
         ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
-
-    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values.
+    \b
+    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
+    are encoded as missing values.
     '''
     INPUT_file = input_
     genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
diff --git a/tacos/distance.py b/tacos/distance.py
index 8b19ea6272fb63de7ef85efbc96760ff13f41579..2e42b9b95db585af1a7c83dd5f2584e985a9167c 100644
--- a/tacos/distance.py
+++ b/tacos/distance.py
@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fal
     else:
         return(out_dist)
 
+def _hamming_pool_init_worker(data_array):
+    global data_array_in_worker
+    data_array_in_worker = data_array
+
+def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1, :],
+                                                     data_array_in_worker[g2, :],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1, g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return (pair_distances)
 
 def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
     """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=Fals
     genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2))
 
     # Here we copy the data_array to a shared memory array
-    # Using the shared memory array gained 5 fold more speed
     data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+    global data_array_shm_b
     data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
     data_array_shm_b.shape = data_array.shape
     data_array_shm_b[:] = data_array
 
-    # Start a multiprocessing pool
     genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
-    with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    #with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    with closing(get_context("spawn").Pool(processes=n_jobs,
+                                           initializer=_hamming_pool_init_worker,
+                                           initargs=(data_array_shm_b,))) as pool:
         print('starting pool with {} processes'.format(n_jobs))
-        get_distance_partial = partial(hamming_pairs,
-                                       data_array=data_array_shm_b,
+        #get_distance_partial = partial(hamming_pairs,
+        #                               data_array=data_array_shm_b,
+        #                               missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+        get_distance_partial = partial(_hamming_pool_helper,
                                        missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
         distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1)
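The docstring reformatted above defines the distance as a Hamming count that skips positions where either genome has a missing value. As a rough illustration only (the `toy_hamming` function and the missing-value set below are made up for this note and are not the repo's `genetic_hamming`), two toy genomes with one ambiguous base would compare like this:

```
# Minimal sketch of the distance described in the cli.py docstring:
# count substitutions, ignoring positions with missing values.
import numpy as np

MISSING = {'N', 'X', '-'}   # assumed missing/ambiguous codes for this toy example

def toy_hamming(g1, g2, missing=MISSING):
    """Substitution count over positions where both genomes have observed alleles."""
    g1, g2 = np.asarray(g1), np.asarray(g2)
    observed = ~(np.isin(g1, list(missing)) | np.isin(g2, list(missing)))
    return int(np.sum(g1[observed] != g2[observed]))

# Positions 2 and 4 differ; position 3 is skipped because the second genome has 'N' there.
print(toy_hamming(['A', 'G', 'C', 'T'], ['A', 'T', 'N', 'G']))   # -> 2
```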
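The `hamming_pool` change switches from binding the data array into `partial(...)` to installing it in each worker through the pool's `initializer`. The standalone sketch below shows that pattern with illustrative names (not the repo's API); note that under the "spawn" start method the `initargs` payload is pickled once per worker, i.e. copied rather than shared.

```
# Sketch of the initializer/initargs pattern: workers receive the data once at
# start-up and keep it in a module-level global, so each task only ships index pairs.
import itertools
import numpy as np
from multiprocessing import get_context

_worker_data = None  # set in every worker by the initializer

def _init_worker(data):
    global _worker_data
    _worker_data = data   # with "spawn", this is a per-worker copy of the array

def _chunk_distances(pairs):
    # toy per-pair distance: number of differing positions between rows i and j
    return [int(np.sum(_worker_data[i] != _worker_data[j])) for i, j in pairs]

if __name__ == "__main__":
    data = np.random.randint(0, 4, size=(6, 100), dtype=np.int8)  # 6 toy genomes
    pairs = list(itertools.combinations(range(data.shape[0]), 2))
    chunks = np.array_split(pairs, 2)
    with get_context("spawn").Pool(processes=2, initializer=_init_worker,
                                   initargs=(data,)) as pool:
        distances = [d for chunk in pool.imap(_chunk_distances, chunks) for d in chunk]
    print(distances)
```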