diff --git a/README.md b/README.md index 88e2f7d987f31c417dc9691d7d547b24c4def939..34c8f78b6f888556f3004e84141919f4b05fec56 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,33 @@ # TACOS -TACOS is a diverse Set Of Commands And Tools (SOCAT, but TACOS sounds much better!). +TACOS is a diverse Set Of Commands And Tools (SOCAT, but TACOS sounds much better!) to handle genomic-derived data. ## Basic setup Create virtual environment ``` $ conda update conda -$ conda create --prefix=<path/tacos_env> -c intel intelpython3_core python=3.5 icu=58 pandas +$ conda create --prefix=<path/tacos_env> -c intel intelpython3_core python=3.6.* icu=58 pandas scipy numpy joblib matplotlib $ conda activate <path/tacos_env> -$ conda install -c conda-forge python-igraph +$ conda install -c conda-forge python-igraph tqdm pycairo +$ conda install -c bioconda gffutils ``` Install Tacos: ``` +$ conda activate <path/tacos_env> # if not already activated $ git clone https://git.scicore.unibas.ch/TBRU/tacos.git $ cd tacos -$ pip install . +$ pip install -e . ``` Run the application: ``` -$ conda activate <path/tacos_env> +$ conda activate <path/tacos_env> # if not already activated $ tacos --help ``` or ``` -$ <path/tacos_env>/bin/python -m tacos --help +$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment ``` To run the tests: diff --git a/setup.py b/setup.py index 5446bf48a29be4d244edc6bee1df6477b6a08ea1..2e5f3acded37d467d5f3da0b0c9cec2a1990cd7e 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,23 @@ -from setuptools import setup +from setuptools import setup, find_packages setup( name='tacos', author='Monica R. Ticlla', - version='0.1', - py_modules=['tacos'], + author_email='mticlla@gmail.com', + version='0.2', + packages=find_packages(include=['tacos']), install_requires=[ 'Click', 'numpy', 'pandas', - 'python-igraph' + 'python-igraph', + 'tqdm', + 'scipy', + 'joblib', ], entry_points=''' [console_scripts] - tacos=tacos:tacos + tacos=tacos.cli:tacos ''', + test_suite='tests', ) \ No newline at end of file diff --git a/tacos/__init__.py b/tacos/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..af130df4a5b5bff2c350a3f9c83f25a488304de3 --- /dev/null +++ b/tacos/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +"""Top-level package for pathways.""" + +__author__ = """Monica R. Ticlla Ccenhua""" +__email__ = 'mticlla@gmail.com' +__version__ = '0.1.0' diff --git a/tacos/alignment_utils.py b/tacos/alignment_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d491532c8faabc71f186c3b8aa81bc9d116cef1 --- /dev/null +++ b/tacos/alignment_utils.py @@ -0,0 +1,77 @@ +""" + tacos.alignment_utils + + Library for reading or re-formatting a genomic alignment table. + + :copyright: (c) 2019 by Monica R. Ticlla + :license: MIT, see LICENSE for more details. + + This module provides multiple functions for reading a genomic alignment table of the following format: + +-----------+--------+-----------+----------+-----------+-----+ + |Position | Gene |Reference |GENOME_1 |GENOME_2 | ... | + +-----------+--------+-----------+----------+-----------+-----+ + Where: + - Position: int + Corresponds to a genomic position + - Gene: string + Name of Gene + - Reference: character + The allele for the corresponding position in the reference genome + - GENOME_1: character + The allele for the corresponding position in genome GENOME_1 + + Available functions: + - import_alignment: Reads a genomic alignment table into a numeric table. +""" +import numpy as np +import csv + +def import_alignment(data_file, sep='\t'): + """Reads a genomic alignment table into a numeric table. + + This function assigns numeric codes to each nucleotide (including IUPAC's ambiguous nucleotides) and returns a + positions-by-genomes table. Numeric codes are assigned as in the following dictionary: + {'A':1, 'G':2, 'C':3, 'T':4, ['Y','R','W','S','K','M','D','V','H','B','X','N']:-1, and ['.','-']:0} + + Args: + data_file: str + A valid string path to a genomic alignment table. + sep: str, default '\t' + Delimiter to use. + Returns: + An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions) + and two lists. The first list maps the row indices of the numpy.ndarray to the corresponding genomic positions. + The second list maps the column indices of the numpy.ndarray to the corresponding genomes. In the array, + allele Values (e.g. 'A', 'T', 'C', 'G') were converted to numerical values (e.g. 1,2,3,4). + + """ + # IUPAC's ambiguous nucleotides + nucleotides_letters = ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + # Numeric codes for each nucleotide + # Ambiguous nucleotides are coded as missing values and receive a -1 value + # nucleotide_numeric = list(range(1,15)) + [-1,-1,0,0] + nucleotide_numeric = [1, 2, 3, 4] + [-1] * 12 + [0, 0] + # Dictionary to map nucleotides letters to corresponding numeric equivalents + iupac_nucleotide_code_mapper = dict(zip(nucleotides_letters, nucleotide_numeric)) + + records_header = list() + # positions_gene = dict() + # positions_ref_allele = dict() + positions = [] + positions_genomes_alleles = [] + alignment_table_reader = csv.reader(open(data_file), delimiter=sep) + + for ix, row in enumerate(alignment_table_reader): + if (ix == 0): + records_header = row + else: + position = row[0] + # positions_gene[position] = row[1] + # positions_ref_allele[position] = row[2] + positions.append(position) + alleles, inv_ix = np.unique(row[3:], return_inverse=True) + alleles_numeric = np.array([iupac_nucleotide_code_mapper[key] for key in alleles], dtype='int8') + positions_genomes_alleles.append(alleles_numeric[inv_ix]) + genomes = records_header[3:] + positions_genomes_alleles = np.asarray(positions_genomes_alleles, dtype='int8') + return (genomes, positions, positions_genomes_alleles.T) diff --git a/tacos/cli.py b/tacos/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..19df2cc603c5de2903b8543483e7d366eff07a53 --- /dev/null +++ b/tacos/cli.py @@ -0,0 +1,75 @@ +import click +import numpy as np +import itertools +from tacos.alignment_utils import import_alignment +from tacos.distance import hamming_pool + +@click.group() +def tacos(): + pass + +@tacos.command() +@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values') +@click.option('--adjust_na', 'adjust', is_flag=True, help='enable adjustment of distance for missing values') +@click.option('--n_jobs', 'n_jobs', default=1, show_default=True, help='specify number of processes for parallel computing') +@click.option('--type', default='0', show_default=True, type=click.Choice(['0','1','2']), + help='either 1 (writes the lower triangle of the matrix), or 0 (writes the upper triangle) or 2 (writes both parts)') +@click.argument('INPUT_', required=True, type=click.Path(exists=True)) +@click.argument('OUTPUT', required=True, type=click.Path()) +def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output): + '''Computes the genetic distance between each genome pair in INPUT. + + Distances are measured as the number of substitutions needed to convert genome g_j to + genome g_i (i.e Hamming distance). For each pair positions with missing values are ignored. + + INPUT is the path to the file with variable positions (rows) and genomes (columns) of + the following format: + +-----------+--------+-----------+----------+-----------+-----+ + |Position | Gene |Reference |GENOME_1 |GENOME_2 | ... | + +-----------+--------+-----------+----------+-----------+-----+ + Where: + - Position: int + Corresponds to a genomic position + - Gene: string + Name of Gene + - Reference: str + The allele for the corresponding position in the reference genome. + Values are any of IUPAC's encoding: + ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + - GENOME_1: str + The allele for the corresponding position in genome GENOME_1 + Values are any of IUPAC's encoding: + ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + + Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values. + ''' + INPUT_file = input_ + genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t') + nr_genomes = len(genomes) + genome_pairs = list(itertools.combinations(genomes, 2)) + + distance_matrix = hamming_pool(data_array=genomes_positions, + missing=-1, + gap=0, + ignore_gap=ignore_gap, + adjust=adjust, + form=2, + n_jobs=n_jobs) + if type == '0': + distance_matrix = np.triu(distance_matrix) + elif type == '1': + distance_matrix = np.tril(distance_matrix) + else: + pass + + with click.open_file(output, 'w') as f: + f.write('\t'.join(['']+genomes)) + f.write('\n') + for row_ix in range(nr_genomes): + f.write(genomes[row_ix]+'\t') + f.write('\t'.join(map(str,distance_matrix[row_ix,:]))) + f.write('\n') + + click.echo('Finished! Computed {} pairwise distances for {} genomes.'.format(len(genome_pairs), nr_genomes)) +if __name__ == '__main__': + tacos() \ No newline at end of file diff --git a/tacos/distance.py b/tacos/distance.py new file mode 100644 index 0000000000000000000000000000000000000000..8b19ea6272fb63de7ef85efbc96760ff13f41579 --- /dev/null +++ b/tacos/distance.py @@ -0,0 +1,247 @@ +""" + tacos.distance + + Library for computing genomic pairwise distance as nucleotide substitutions. + + :copyright: (c) 2019 by Monica R. Ticlla + :license: MIT, see LICENSE for more details. + + This module provides multiple functions to ... + + Available functions: + - ... + +""" +import numpy as np +from scipy.spatial.distance import pdist, squareform +import itertools +import multiprocessing as mp +from multiprocessing import get_context +from contextlib import closing +from functools import partial +from joblib import Parallel, delayed +import ctypes +import tqdm + +def genetic_hamming(seq1, seq2, missing='X', gap='-', ignore_gap=False, adjust=False): + """Return the Hamming distance between two given sequence of characters or two discrete numerical vectors. + Sequences must have equal length. + + This function computes the genetic distance between two sequences of nucleotides represented as characters (e.g A, + T, C, G) or as discrete numerical values (e.g 1, 2, 3, 4) + + Args: + seq1: numpy.ndarray (faster), pandas.Series + Sequence of characters or discrete numerical vectors. + seq1: numpy.ndarray (faster), pandas.Series + Sequence of characters or discrete numerical vectors. + missing: Character or value (int) representing missing value in the sequence. + gap: Character or value (int) representing gap/deletion value in the sequence. + ignore_gap: boolean, default False + Flag to consider gaps as missing values instead of coding them as a 5th character + adjust: boolean, default False + Flag specifying adjustment of distance to account for missing values. + Returns: + Int, Float + Absolute sequence difference between seq1 and seq2, after excluding positions with missing values. + + """ + # Raises exception if vectors do not have equal length + try: + # get positions excluding missing values + if ignore_gap: + seq1_keep = ~(np.in1d(seq1,[missing,gap])) + seq2_keep = ~(np.in1d(seq2,[missing,gap])) + #seq1_keep = (seq1 != missing) & (seq1 != gap) + #seq2_keep = (seq2 != missing) & (seq2 != gap) + else: + seq1_keep = (seq1 != missing) + seq2_keep = (seq2 != missing) + to_keep = (seq1_keep & seq2_keep) + except ValueError: + print("seq1 and seq2 must have equal lengths.") + else: + # drop positions with missing values and + # compute Hamming distance + # np.sum() instead of sum() improves speed considerably! + new_seq1 = seq1[to_keep] + new_seq2 = seq2[to_keep] + pair_dist = np.sum(new_seq1 != new_seq2) + # Adjust distance to account for missing values + if adjust: + seq1_effective_len = np.sum(seq1_keep) + seq2_effective_len = np.sum(seq2_keep) + pair_dist = np.around(len(seq1) * (2 * (pair_dist) / (seq1_effective_len + seq2_effective_len)), decimals=2) + #else: + #pair_dist = np.sum(new_seq1 != new_seq2) + + return (pair_dist) + +def hamming_pairs(genome_pairs, data_array, missing='X', gap='-', ignore_gap=False, adjust=False): + """ + Args: + data_array: numpy.ndarray + An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions). + Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended + to provide a ndarray of numerical vectors. + """ + try: + pair_distances = np.asarray([genetic_hamming(data_array[g1,:], data_array[g2,:], + missing, gap, ignore_gap, adjust) + for g1,g2 in tqdm.tqdm(genome_pairs)]) + except IndexError: + print("Provided index is out of range in data_array.") + else: + return(pair_distances) + +def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1): + """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in + n-dimensional space (genomic positions). + + This function uses Scipy's pdist function to apply tacos.distance.genetic_hamming to every pair of genomic sequences in + data_array. It runs fast but it's not using multiprocessing. + + Args: + data_array: numpy.ndarray + An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions). + Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended + to provide a numpy.ndarray of numerical vectors. + missing: str, int, default 'X' + Value representing missing value in the genomic sequence. + gap: str, int, default '-' + Value representing gap/deletion value in the sequence. + ignore_gap: boolean, default False + Flag to consider gaps as missing values instead of coding them as a 5th character + adjust: boolean, default False + Flag specifying adjustment of genetic Hamming distance to account for missing values. + form:int, default 1 + Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or + square-form distance matrix (2) + + Returns: + numpy.ndarray + A condensed distance matrix (vector-form) or a redundant square-form distance matrix. + + """ + try: + out_dist = pdist(data_array, genetic_hamming, missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust) + except ValueError as e: + print(e) + else: + if form != 1: + return(squareform(out_dist)) + else: + return(out_dist) + + +def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1): + """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in + n-dimensional space (genomic positions). + + This function uses multiprocessing.Pool to apply tacos.distance.genetic_hamming to every pair of genomic sequences in + data_array. + + Args: + data_array: numpy.ndarray + An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions). + Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended + to provide a numpy.ndarray of numerical vectors. + missing: str, int, default 'X' + Value representing missing value in the genomic sequence. + gap: str, int, default '-' + Value representing gap/deletion value in the sequence. + ignore_gap: boolean, default False + Flag to consider gaps as missing values instead of coding them as a 5th character + adjust: boolean, default False + Flag specifying adjustment of genetic Hamming distance to account for missing values. + form:int, default 1 + Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or + square-form distance matrix (2). + n_jobs: int + The number of concurrent processes. + + Returns: + numpy.ndarray + A condensed distance matrix (vector-form) or a redundant square-form distance matrix. + """ + if n_jobs==1: + return(hamming_pdist(data_array=data_array, missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust, + form=form)) + else: + nr_genomes = data_array.shape[0] + genome_ix_pairs = list(itertools.combinations(range(0,nr_genomes),2)) + + # Here we copy the data_array to a shared memory array + # Using the shared memory array gained 5 fold more speed + data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False) + data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8) + data_array_shm_b.shape = data_array.shape + data_array_shm_b[:] = data_array + + + # Start a multiprocessing pool + genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs) + with closing(get_context("spawn").Pool(processes=n_jobs)) as pool: + print('starting pool with {} processes'.format(n_jobs)) + get_distance_partial = partial(hamming_pairs, + data_array=data_array_shm_b, + missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust) + + distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1) + pool.close() + pool.join() + + pair_distances = [pair_dist for item in distance_mp for pair_dist in item] + if form == 1: + return(pair_distances) + else: + return (squareform(pair_distances)) + +def hamming_joblib(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=2): + """Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in + n-dimensional space (genomic positions). + + This function uses joblib.Parallel to apply tacos.distance.genetic_hamming to every pair of genomic sequences in + data_array. + + Args: + data_array: numpy.ndarray + An m by n array of m original genomic sequences (i.e observations) in an n-dimensional space (genomic positions). + Values can either be characters ('A', 'T', 'C', 'G') or numerical values (1,2,3,4). It's strongly recommended + to provide a numpy.ndarray of numerical vectors. + missing: str, int, default 'X' + Value representing missing value in the genomic sequence. + gap: str, int, default '-' + Value representing gap/deletion value in the sequence. + ignore_gap: boolean, default False + Flag to consider gaps as missing values instead of coding them as a 5th character + adjust: boolean, default False + Flag specifying adjustment of genetic Hamming distance to account for missing values. + form:int, default 1 + Specify the format of the resulting distance matrix as either condensed (1, a one-dimensional vector) or + square-form distance matrix (2). + n_jobs: int + The number of parallel jobs. + + Returns: + numpy.ndarray + A condensed distance matrix (vector-form) or a redundant square-form distance matrix. + + """ + try: + nr_genomes = data_array.shape[0] + genome_ix_pairs = list(itertools.combinations(range(0, nr_genomes), 2)) + out_dist = np.asarray(Parallel(n_jobs=n_jobs, prefer='threads')(delayed(genetic_hamming)(data_array[g1_ix, :], + data_array[g2_ix,:], + missing = missing, + gap = gap, + ignore_gap=ignore_gap, + adjust=adjust) + for g1_ix, g2_ix in tqdm.tqdm(genome_ix_pairs))) + except ValueError as e: + print(e) + else: + if form != 1: + return(squareform(out_dist)) + else: + return(out_dist) \ No newline at end of file