From 2b7f473920986548591ce2dac913b10053b8dd54 Mon Sep 17 00:00:00 2001 From: ticlla <monicaroxana.ticllaccenhua@unibas.ch> Date: Fri, 20 Sep 2019 18:05:38 -0500 Subject: [PATCH] add function to convert alignment table into snp-genome incidence matrix --- tacos/alignment_utils.py | 69 ++++++++++++++++++++++++++++++++++++++++ tacos/cli.py | 48 ++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/tacos/alignment_utils.py b/tacos/alignment_utils.py index 4d49153..18b02d0 100644 --- a/tacos/alignment_utils.py +++ b/tacos/alignment_utils.py @@ -20,6 +20,8 @@ - GENOME_1: character The allele for the corresponding position in genome GENOME_1 + We assume alleles are coded following IUPAC's nucleotide convention. + Available functions: - import_alignment: Reads a genomic alignment table into a numeric table. """ @@ -75,3 +77,70 @@ def import_alignment(data_file, sep='\t'): genomes = records_header[3:] positions_genomes_alleles = np.asarray(positions_genomes_alleles, dtype='int8') return (genomes, positions, positions_genomes_alleles.T) + + +def as_snp_incidence(data_file, sep='\t', ignore_gap=False): + """Reads a genomic alignment table into a SNP-Genome incidence matrix + + This function reads an alignment table and converts it to a SNP-Genome incidence matrix, + where rows correspond to SNPs and columns correspond to genomes. Values in each cell are + either 0 or 1. + + Args: + data_file: str + A valid string path to a genomic alignment table. This table has the following format: + +-----------+--------+-----------+----------+-----------+-----+ + |Position | Gene |Reference |GENOME_1 |GENOME_2 | ... | + +-----------+--------+-----------+----------+-----------+-----+ + Where: + - Position: int + Corresponds to a genomic position + - Gene: string + Name of Gene + - Reference: character + The allele for the corresponding position in the reference genome + - GENOME_1: character + The allele for the corresponding position in genome GENOME_1 + + We assume alleles are coded following IUPAC's nucleotide convention. + sep: str, default '\t' + Delimiter to use. + ignore_gap: bool, default False + By default gaps (i.e '.', '-') are considered a 5th character. If set to True, gaps are encoded as + missing values and therefore ignored or not considered as an allele. + Returns: numpy.ndarray + A SNP by Genome incidence matrix and two lists. The first list maps the row indices of the numpy.ndarray + to the corresponding SNP names. SNP names have this format S_<position>_<reference allele>_<alternative allele> + (e.g S_63_A_G). The second list maps the column indices of the numpy.ndarray to the corresponding genome names. + """ + # IUPAC's anbiguous nucleotides are ignored + default_characters_to_ignore = ['Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N'] + # IUPAC's characters for gaps/deletions + gap_characters = ['.', '-'] + + if ignore_gap: + default_characters_to_ignore.extend(gap_characters) + + genomes = [] + snps = list() + snps_genomes = list() + + try: + with open(data_file) as my_file: + alignment_table_reader = csv.reader(my_file, delimiter=sep) + for ix, row in enumerate(alignment_table_reader): + if (ix == 0): + genomes = row[3:] + else: + position = row[0] + ref_allele = row[2] + alt_alleles = np.array(row[3:]) + to_ignore = np.in1d(alt_alleles, default_characters_to_ignore + [ref_allele]) + uniq_alt_alleles = np.unique(alt_alleles[~to_ignore]) + if uniq_alt_alleles.size > 0: + for allele in uniq_alt_alleles: + snps.append('S_{}_{}_{}'.format(position, ref_allele, allele)) + snps_genomes.append(np.array(alt_alleles == allele, dtype=np.ubyte)) + return (snps, genomes, np.asarray(snps_genomes, dtype=np.ubyte)) + except OSError as e: + print('e') diff --git a/tacos/cli.py b/tacos/cli.py index 09cca4a..96d076e 100644 --- a/tacos/cli.py +++ b/tacos/cli.py @@ -2,11 +2,58 @@ import click import numpy as np import itertools from tacos.alignment_utils import import_alignment +from tacos.alignment_utils import as_snp_incidence from tacos.distance import hamming_pool @click.group() def tacos(): pass +@tacos.command() +@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values') +@click.option('-d','--separator', 'sep', default='\t', help='field separator [default: TAB]') +@click.argument('INPUT_', required=True, type=click.Path(exists=True)) +@click.argument('OUTPUT', required=True, type=click.Path()) +def snp_incidence_matrix(ignore_gap, sep, input_, output): + """Reads a genomic alignment table into a SNP-Genome incidence matrix. + + \b + This function reads an alignment table and converts it to a SNP-Genome incidence matrix, + where rows correspond to SNPs and columns correspond to genomes. + Values in each cell are either 0 or 1. + + \b + INPUT is the path to the file of a genomic alignment table. This table has the + following format: + +-----------+--------+-----------+----------+-----------+-----+ + |Position | Gene |Reference |GENOME_1 |GENOME_2 | ... | + +-----------+--------+-----------+----------+-----------+-----+ + \b + Where: + - Position: int + Corresponds to a genomic position + - Gene: string + Name of Gene + - Reference: str + The allele for the corresponding position in the reference genome. + Values are any of IUPAC's encoding: + ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + - GENOME_1: str + The allele for the corresponding position in genome GENOME_1 + Values are any of IUPAC's encoding: + ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + We assume alleles are coded following IUPAC's nucleotide convention. + """ + INPUT_file = input_ + snps, genomes, incidence_matrix = as_snp_incidence(INPUT_file, sep=sep, ignore_gap=ignore_gap) + nr_snps = len(snps) + with click.open_file(output, 'w') as f: + f.write('\t'.join(['']+genomes)) + f.write('\n') + for row_ix in range(nr_snps): + f.write(snps[row_ix]+'\t') + f.write('\t'.join(map(str,incidence_matrix[row_ix,:]))) + f.write('\n') + click.echo('Finished! Created {} by {} SNP-Genome incidence matrix.'.format(nr_snps, len(genomes))) @tacos.command() @click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values') @@ -43,6 +90,7 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output): The allele for the corresponding position in genome GENOME_1 Values are any of IUPAC's encoding: ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-'] + We assume alleles are coded following IUPAC's nucleotide convention. \b Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N') are encoded as missing values. -- GitLab