Something went wrong on our end
-
Ticlla Ccenhua Monica Roxana authoredTiclla Ccenhua Monica Roxana authored
tacos.py 5.24 KiB
import click
import numpy as np
import pandas as pd
import itertools
import multiprocessing as mp
import igraph
def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False):
"""Return the Hamming distance between two given sequence of characters.
Sequences must have equal length.
Args:
seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
missing: Character representing missing value in the sequence.
adjust: Boolean, default is False for no adjusment accounting for missing values.
Returns:
Int, absolute sequence difference after excluding positions with missing values.
"""
# Raises exception if vectors do not have equal length
try:
# get positions excluding missing values
if ignore_gap:
seq1_keep = ~(np.in1d(seq1,[missing,gap]))
seq2_keep = ~(np.in1d(seq2,[missing,gap]))
#seq1_keep = (seq1 != missing) & (seq1 != gap)
#seq2_keep = (seq2 != missing) & (seq2 != gap)
else:
seq1_keep = (seq1 != missing)
seq2_keep = (seq2 != missing)
to_keep = (seq1_keep & seq2_keep)
except ValueError:
print("seq1 and seq2 must have equal lengths")
else:
# drop positions with missing values and
# compute Hamming distance
# np.sum() instead of sum() improves speed considerably!
new_seq1 = seq1[to_keep]
new_seq2 = seq2[to_keep]
pair_dist = np.sum(new_seq1 != new_seq2)
# Adjust distance to account for missing values
if adjust:
seq1_effective_len = np.sum(seq1_keep)
seq2_effective_len = np.sum(seq2_keep)
pair_dist = len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len))
return (pair_dist)
@click.group()
def tacos():
pass
@tacos.command()
def genetic_distance():
'''Computes the genetic distance between genomes gi and gj, measured by the Hamming distance
(i.e number of substitutions needed to convert gj into gi)
'''
click.echo('compute_distances cmd1')
@tacos.command()
@click.option('-na','--missing', 'missing', default='X', show_default=True, help='Character for missing value')
@click.option('--gap', 'gap', default='-', show_default=True, help='Character to flag a position as indel')
@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
@click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values')
@click.option('--ncores', 'ncores', default=1, show_default=True, help='specify number of processes for parallel computing')
@click.argument('INPUT_', required=True, type=click.Path(exists=True))
@click.argument('OUTPUT', required=True, type=click.Path())
def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output):
'''Computes the genetic distance between each genome pair in INPUT. Distances are measured as
the number of substitutions needed to convert genome g_j to genome g_i (i.e Hamming distance).
INPUT is the path to the file with variable positions (rows) and genomes (columns)
'''
INPUT_file = input_
positions_genomes = pd.read_csv(INPUT_file, sep='\t', index_col=False)
nr_genomes = len(positions_genomes.columns[3:])
genome_pairs = list(itertools.combinations(positions_genomes.columns[3:],2))
if ncores>1:
click.echo('Starting pool with {} processes ...'.format(ncores))
pool = mp.Pool(processes=4)
distance_mp = [(g1,g2,pool.apply_async(my_hamming,
args=(positions_genomes.get(g1).values,
positions_genomes.get(g2).values,
missing, gap,ignore_gap, adjust)))
for g1,g2 in genome_pairs]
pool.close()
pool.join()
pair_distances = [(g1,g2, p.get()) for g1,g2,p in distance_mp]
del distance_mp
else:
pair_distances = [(g1, g2, my_hamming(positions_genomes.get(g1).values,
positions_genomes.get(g2).values,
missing, gap, ignore_gap, adjust))
for g1,g2 in genome_pairs]
del positions_genomes
genomes_graph=igraph.Graph.TupleList(pair_distances, weights=True)
del pair_distances
genomes_names = genomes_graph.vs['name']
genomes_adj=genomes_graph.get_adjacency(type=2,attribute='weight')
del genomes_graph
with click.open_file(output, 'w') as f:
f.write('\t'.join(['']+genomes_names))
f.write('\n')
for row_ix in range(genomes_adj.shape[0]):
f.write('\t'.join(map(str,[genomes_names[row_ix]]+genomes_adj[row_ix])))
f.write('\n')
#genomes_graph.write_adjacency(output, sep='\t', attribute='weight')
#with click.open_file(output, 'w') as f:
# f.write('\n'.join('%s %s %f' % item for item in pair_distances))
click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes))
if __name__ == '__main__':
tacos()