Skip to content
Snippets Groups Projects
tacos.py 5.24 KiB
import click
import numpy as np
import pandas as pd
import itertools
import multiprocessing as mp
import igraph

def my_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False):
    """Return the Hamming distance between two given sequence of characters.
    Sequences must have equal length.

    Args:
        seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
        seq1: numpy.ndarray (faster) or pandas.Series with sequence of characters.
        missing: Character representing missing value in the sequence.
        adjust: Boolean, default is False for no adjusment accounting for missing values.
    Returns:
        Int, absolute sequence difference after excluding positions with missing values.

    """
    # Raises exception if vectors do not have equal length
    try:
        # get positions excluding missing values
        if ignore_gap:
            seq1_keep = ~(np.in1d(seq1,[missing,gap]))
            seq2_keep = ~(np.in1d(seq2,[missing,gap]))
            #seq1_keep = (seq1 != missing) & (seq1 != gap)
            #seq2_keep = (seq2 != missing) & (seq2 != gap)
        else:
            seq1_keep = (seq1 != missing)
            seq2_keep = (seq2 != missing)
        to_keep = (seq1_keep & seq2_keep)
    except ValueError:
        print("seq1 and seq2 must have equal lengths")
    else:
        # drop positions with missing values and
        # compute Hamming distance
        # np.sum() instead of sum() improves speed considerably!
        new_seq1 = seq1[to_keep]
        new_seq2 = seq2[to_keep]
        pair_dist = np.sum(new_seq1 != new_seq2)
        # Adjust distance to account for missing values
        if adjust:
            seq1_effective_len = np.sum(seq1_keep)
            seq2_effective_len = np.sum(seq2_keep)
            pair_dist = len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len))

        return (pair_dist)


@click.group()
def tacos():
    pass

@tacos.command()
def genetic_distance():
    '''Computes the genetic distance between genomes gi and gj, measured by the Hamming distance
    (i.e number of substitutions needed to convert gj into gi)
    '''
    click.echo('compute_distances cmd1')

@tacos.command()
@click.option('-na','--missing', 'missing', default='X', show_default=True, help='Character for missing value')
@click.option('--gap', 'gap', default='-', show_default=True, help='Character to flag a position as indel')
@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
@click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values')
@click.option('--ncores', 'ncores', default=1, show_default=True, help='specify number of processes for parallel computing')
@click.argument('INPUT_', required=True, type=click.Path(exists=True))
@click.argument('OUTPUT', required=True, type=click.Path())
def get_distance_matrix(missing, gap, ignore_gap, adjust, ncores, input_, output):
    '''Computes the genetic distance between each genome pair in INPUT. Distances are measured as
    the number of substitutions needed to convert genome g_j to genome g_i (i.e Hamming distance).

    INPUT is the path to the file with variable positions (rows) and genomes (columns)
    '''
    INPUT_file = input_
    positions_genomes = pd.read_csv(INPUT_file, sep='\t', index_col=False)
    nr_genomes = len(positions_genomes.columns[3:])
    genome_pairs = list(itertools.combinations(positions_genomes.columns[3:],2))

    if ncores>1:
        click.echo('Starting pool with {} processes ...'.format(ncores))
        pool = mp.Pool(processes=4)
        distance_mp = [(g1,g2,pool.apply_async(my_hamming,
                                                args=(positions_genomes.get(g1).values,
                                                      positions_genomes.get(g2).values,
                                                      missing, gap,ignore_gap, adjust)))
                       for g1,g2 in genome_pairs]
        pool.close()
        pool.join()
        pair_distances = [(g1,g2, p.get()) for g1,g2,p in distance_mp]
        del distance_mp
    else:
        pair_distances = [(g1, g2, my_hamming(positions_genomes.get(g1).values,
                                              positions_genomes.get(g2).values,
                                              missing, gap, ignore_gap, adjust))
                          for g1,g2 in genome_pairs]

    del positions_genomes

    genomes_graph=igraph.Graph.TupleList(pair_distances, weights=True)
    del pair_distances

    genomes_names = genomes_graph.vs['name']
    genomes_adj=genomes_graph.get_adjacency(type=2,attribute='weight')
    del genomes_graph
    with click.open_file(output, 'w') as f:
        f.write('\t'.join(['']+genomes_names))
        f.write('\n')
        for row_ix in range(genomes_adj.shape[0]):
            f.write('\t'.join(map(str,[genomes_names[row_ix]]+genomes_adj[row_ix])))
            f.write('\n')
    #genomes_graph.write_adjacency(output, sep='\t', attribute='weight')

    #with click.open_file(output, 'w') as f:
    #    f.write('\n'.join('%s %s %f' % item for item in pair_distances))
    click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes))

if __name__ == '__main__':
    tacos()