add function to convert alignment table into snp-genome incidence matrix

2b7f4739 · Ticlla Ccenhua Monica Roxana · 9e87b545 · 2b7f4739 · 2b7f4739
Commit 2b7f4739 authored 5 years ago by Ticlla Ccenhua Monica Roxana
--- a/tacos/alignment_utils.py
+++ b/tacos/alignment_utils.py
@@ -20,6 +20,8 @@
        - GENOME_1: character
            The allele for the corresponding position in genome GENOME_1

+        We assume alleles are coded following IUPAC's nucleotide convention.
+
    Available functions:
    - import_alignment: Reads a genomic alignment table into a numeric table.
 """
@@ -75,3 +77,70 @@ def import_alignment(data_file, sep='\t'):
    genomes = records_header[3:]
    positions_genomes_alleles = np.asarray(positions_genomes_alleles, dtype='int8')
    return (genomes, positions, positions_genomes_alleles.T)
+
+
+def as_snp_incidence(data_file, sep='\t', ignore_gap=False):
+    """Reads a genomic alignment table into a SNP-Genome incidence matrix
+
+    This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
+    where rows correspond to SNPs and columns correspond to genomes. Values in each cell are
+    either 0 or 1.
+
+    Args:
+        data_file: str
+            A valid string path to a genomic alignment table. This table has the following format:
+            +-----------+--------+-----------+----------+-----------+-----+
+            |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
+            +-----------+--------+-----------+----------+-----------+-----+
+            Where:
+            - Position: int
+                Corresponds to a genomic position
+            - Gene: string
+                Name of Gene
+            - Reference: character
+                The allele for the corresponding position in the reference genome
+            - GENOME_1: character
+                The allele for the corresponding position in genome GENOME_1
+
+            We assume alleles are coded following IUPAC's nucleotide convention.
+        sep: str, default '\t'
+            Delimiter to use.
+        ignore_gap: bool, default False
+            By default gaps (i.e '.', '-') are considered a 5th character. If set to True, gaps are encoded as
+            missing values and therefore ignored or not considered as an allele.
+    Returns: numpy.ndarray
+         A SNP by Genome incidence matrix and two lists. The first list maps the row indices of the numpy.ndarray
+         to the corresponding SNP names. SNP names have this format S_<position>_<reference allele>_<alternative allele>
+         (e.g S_63_A_G). The second list maps the column indices of the numpy.ndarray to the corresponding genome names.
+    """
+    # IUPAC's anbiguous nucleotides are ignored
+    default_characters_to_ignore = ['Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N']
+    # IUPAC's characters for gaps/deletions
+    gap_characters = ['.', '-']
+
+    if ignore_gap:
+        default_characters_to_ignore.extend(gap_characters)
+
+    genomes = []
+    snps = list()
+    snps_genomes = list()
+
+    try:
+        with open(data_file) as my_file:
+            alignment_table_reader = csv.reader(my_file, delimiter=sep)
+            for ix, row in enumerate(alignment_table_reader):
+                if (ix == 0):
+                    genomes = row[3:]
+                else:
+                    position = row[0]
+                    ref_allele = row[2]
+                    alt_alleles = np.array(row[3:])
+                    to_ignore = np.in1d(alt_alleles, default_characters_to_ignore + [ref_allele])
+                    uniq_alt_alleles = np.unique(alt_alleles[~to_ignore])
+                    if uniq_alt_alleles.size > 0:
+                        for allele in uniq_alt_alleles:
+                            snps.append('S_{}_{}_{}'.format(position, ref_allele, allele))
+                            snps_genomes.append(np.array(alt_alleles == allele, dtype=np.ubyte))
+        return (snps, genomes, np.asarray(snps_genomes, dtype=np.ubyte))
+    except OSError as e:
+        print('e')
--- a/tacos/cli.py
+++ b/tacos/cli.py
@@ -2,11 +2,58 @@ import click
 import numpy as np
 import itertools
 from tacos.alignment_utils import import_alignment
+from tacos.alignment_utils import as_snp_incidence
 from tacos.distance import hamming_pool

 @click.group()
 def tacos():
    pass
+@tacos.command()
+@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
+@click.option('-d','--separator', 'sep', default='\t', help='field separator [default: TAB]')
+@click.argument('INPUT_', required=True, type=click.Path(exists=True))
+@click.argument('OUTPUT', required=True, type=click.Path())
+def snp_incidence_matrix(ignore_gap, sep, input_, output):
+    """Reads a genomic alignment table into a SNP-Genome incidence matrix.
+
+    \b
+    This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
+    where rows correspond to SNPs and columns correspond to genomes.
+    Values in each cell are either 0 or 1.
+
+    \b
+    INPUT is the path to the file of a genomic alignment table. This table has the
+    following format:
+        +-----------+--------+-----------+----------+-----------+-----+
+        |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
+        +-----------+--------+-----------+----------+-----------+-----+
+    \b
+    Where:
+        - Position: int
+            Corresponds to a genomic position
+        - Gene: string
+            Name of Gene
+        - Reference: str
+            The allele for the corresponding position in the reference genome.
+            Values are any of IUPAC's encoding:
+                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+        - GENOME_1: str
+            The allele for the corresponding position in genome GENOME_1
+            Values are any of IUPAC's encoding:
+                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+        We assume alleles are coded following IUPAC's nucleotide convention.
+    """
+    INPUT_file = input_
+    snps, genomes, incidence_matrix = as_snp_incidence(INPUT_file, sep=sep, ignore_gap=ignore_gap)
+    nr_snps = len(snps)
+    with click.open_file(output, 'w') as f:
+        f.write('\t'.join(['']+genomes))
+        f.write('\n')
+        for row_ix in range(nr_snps):
+            f.write(snps[row_ix]+'\t')
+            f.write('\t'.join(map(str,incidence_matrix[row_ix,:])))
+            f.write('\n')
+    click.echo('Finished! Created  {} by {} SNP-Genome incidence matrix.'.format(nr_snps, len(genomes)))

 @tacos.command()
 @click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
@@ -43,6 +90,7 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
            The allele for the corresponding position in genome GENOME_1
            Values are any of IUPAC's encoding:
                ['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
+        We assume alleles are coded following IUPAC's nucleotide convention.
    \b
    Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
    are encoded as missing values.