diff --git a/tacos.py b/tacos.py index 463a43e265e0e07021346ba675b75e7b636b97d0..415088656b1a0005147f73dfdddb0710fe0a4111 100644 --- a/tacos.py +++ b/tacos.py @@ -4,6 +4,7 @@ import pandas as pd import itertools import multiprocessing as mp import igraph +from functools import partial def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False): """Return the Hamming distance between two given sequence of characters. @@ -31,7 +32,7 @@ def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False) seq2_keep = (seq2 != missing) to_keep = (seq1_keep & seq2_keep) except ValueError: - print("seq1 and seq2 must have equal lengths") + print("seq1 and seq2 must have equal lengths.") else: # drop positions with missing values and # compute Hamming distance @@ -43,10 +44,19 @@ def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False) if adjust: seq1_effective_len = np.sum(seq1_keep) seq2_effective_len = np.sum(seq2_keep) - pair_dist = len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len)) + pair_dist = np.around(len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len)), decimals=2) return (pair_dist) +def get_distance_helper(genome_pairs, data_df, missing='X', gap='-', ignore_gap=False, adjust=False): + try: + pair_distances = [(g1, g2, my_hamming(data_df.get(g1).values, + data_df.get(g2).values, + missing, gap, ignore_gap, adjust)) for g1,g2 in genome_pairs] + except ValueError: + print("data_df must be a Pandas DataFrame.") + else: + return(pair_distances) @click.group() def tacos(): @@ -65,9 +75,11 @@ def genetic_distance(): @click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values') @click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values') @click.option('--ncores', 'ncores', default=1, show_default=True, help='specify number of processes for parallel computing') +@click.option('--type', default='0', show_default=True, type=click.Choice(['0','1','2']), + help='either 1 (writes the lower triangle of the matrix), or 0 (writes the upper triangle) or 2 (writes both parts)') @click.argument('INPUT_', required=True, type=click.Path(exists=True)) @click.argument('OUTPUT', required=True, type=click.Path()) -def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output): +def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, type, input_, output): '''Computes the genetic distance between each genome pair in INPUT. Distances are measured as the number of substitutions needed to convert genome g_j to genome g_i (i.e Hamming distance). @@ -80,15 +92,14 @@ def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output if ncores>1: click.echo('Starting pool with {} processes ...'.format(ncores)) - pool = mp.Pool(processes=4) - distance_mp = [(g1,g2,pool.apply_async(my_hamming, - args=(positions_genomes.get(g1).values, - positions_genomes.get(g2).values, - missing, gap,ignore_gap, adjust))) - for g1,g2 in genome_pairs] + pool = mp.Pool(ncores) + get_distance_partial = partial(get_distance_helper, + data_df=positions_genomes, + missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust) + distance_mp = pool.map(get_distance_partial, np.array_split(genome_pairs, ncores)) pool.close() pool.join() - pair_distances = [(g1,g2, p.get()) for g1,g2,p in distance_mp] + pair_distances = [pair_dist for item in distance_mp for pair_dist in item] del distance_mp else: pair_distances = [(g1, g2, my_hamming(positions_genomes.get(g1).values, @@ -102,7 +113,7 @@ def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output del pair_distances genomes_names = genomes_graph.vs['name'] - genomes_adj=genomes_graph.get_adjacency(type=2,attribute='weight') + genomes_adj=genomes_graph.get_adjacency(type=np.int(type), attribute='weight') del genomes_graph with click.open_file(output, 'w') as f: f.write('\t'.join(['']+genomes_names)) @@ -110,10 +121,7 @@ def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output for row_ix in range(genomes_adj.shape[0]): f.write('\t'.join(map(str,[genomes_names[row_ix]]+genomes_adj[row_ix]))) f.write('\n') - #genomes_graph.write_adjacency(output, sep='\t', attribute='weight') - #with click.open_file(output, 'w') as f: - # f.write('\n'.join('%s %s %f' % item for item in pair_distances)) click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes)) if __name__ == '__main__':