Skip to content
Snippets Groups Projects
Commit 2b7f4739 authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

add function to convert alignment table into snp-genome incidence matrix

parent 9e87b545
No related branches found
No related tags found
No related merge requests found
......@@ -20,6 +20,8 @@
- GENOME_1: character
The allele for the corresponding position in genome GENOME_1
We assume alleles are coded following IUPAC's nucleotide convention.
Available functions:
- import_alignment: Reads a genomic alignment table into a numeric table.
"""
......@@ -75,3 +77,70 @@ def import_alignment(data_file, sep='\t'):
genomes = records_header[3:]
positions_genomes_alleles = np.asarray(positions_genomes_alleles, dtype='int8')
return (genomes, positions, positions_genomes_alleles.T)
def as_snp_incidence(data_file, sep='\t', ignore_gap=False):
"""Reads a genomic alignment table into a SNP-Genome incidence matrix
This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
where rows correspond to SNPs and columns correspond to genomes. Values in each cell are
either 0 or 1.
Args:
data_file: str
A valid string path to a genomic alignment table. This table has the following format:
+-----------+--------+-----------+----------+-----------+-----+
|Position | Gene |Reference |GENOME_1 |GENOME_2 | ... |
+-----------+--------+-----------+----------+-----------+-----+
Where:
- Position: int
Corresponds to a genomic position
- Gene: string
Name of Gene
- Reference: character
The allele for the corresponding position in the reference genome
- GENOME_1: character
The allele for the corresponding position in genome GENOME_1
We assume alleles are coded following IUPAC's nucleotide convention.
sep: str, default '\t'
Delimiter to use.
ignore_gap: bool, default False
By default gaps (i.e '.', '-') are considered a 5th character. If set to True, gaps are encoded as
missing values and therefore ignored or not considered as an allele.
Returns: numpy.ndarray
A SNP by Genome incidence matrix and two lists. The first list maps the row indices of the numpy.ndarray
to the corresponding SNP names. SNP names have this format S_<position>_<reference allele>_<alternative allele>
(e.g S_63_A_G). The second list maps the column indices of the numpy.ndarray to the corresponding genome names.
"""
# IUPAC's anbiguous nucleotides are ignored
default_characters_to_ignore = ['Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N']
# IUPAC's characters for gaps/deletions
gap_characters = ['.', '-']
if ignore_gap:
default_characters_to_ignore.extend(gap_characters)
genomes = []
snps = list()
snps_genomes = list()
try:
with open(data_file) as my_file:
alignment_table_reader = csv.reader(my_file, delimiter=sep)
for ix, row in enumerate(alignment_table_reader):
if (ix == 0):
genomes = row[3:]
else:
position = row[0]
ref_allele = row[2]
alt_alleles = np.array(row[3:])
to_ignore = np.in1d(alt_alleles, default_characters_to_ignore + [ref_allele])
uniq_alt_alleles = np.unique(alt_alleles[~to_ignore])
if uniq_alt_alleles.size > 0:
for allele in uniq_alt_alleles:
snps.append('S_{}_{}_{}'.format(position, ref_allele, allele))
snps_genomes.append(np.array(alt_alleles == allele, dtype=np.ubyte))
return (snps, genomes, np.asarray(snps_genomes, dtype=np.ubyte))
except OSError as e:
print('e')
......@@ -2,11 +2,58 @@ import click
import numpy as np
import itertools
from tacos.alignment_utils import import_alignment
from tacos.alignment_utils import as_snp_incidence
from tacos.distance import hamming_pool
@click.group()
def tacos():
pass
@tacos.command()
@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
@click.option('-d','--separator', 'sep', default='\t', help='field separator [default: TAB]')
@click.argument('INPUT_', required=True, type=click.Path(exists=True))
@click.argument('OUTPUT', required=True, type=click.Path())
def snp_incidence_matrix(ignore_gap, sep, input_, output):
"""Reads a genomic alignment table into a SNP-Genome incidence matrix.
\b
This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
where rows correspond to SNPs and columns correspond to genomes.
Values in each cell are either 0 or 1.
\b
INPUT is the path to the file of a genomic alignment table. This table has the
following format:
+-----------+--------+-----------+----------+-----------+-----+
|Position | Gene |Reference |GENOME_1 |GENOME_2 | ... |
+-----------+--------+-----------+----------+-----------+-----+
\b
Where:
- Position: int
Corresponds to a genomic position
- Gene: string
Name of Gene
- Reference: str
The allele for the corresponding position in the reference genome.
Values are any of IUPAC's encoding:
['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
- GENOME_1: str
The allele for the corresponding position in genome GENOME_1
Values are any of IUPAC's encoding:
['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
We assume alleles are coded following IUPAC's nucleotide convention.
"""
INPUT_file = input_
snps, genomes, incidence_matrix = as_snp_incidence(INPUT_file, sep=sep, ignore_gap=ignore_gap)
nr_snps = len(snps)
with click.open_file(output, 'w') as f:
f.write('\t'.join(['']+genomes))
f.write('\n')
for row_ix in range(nr_snps):
f.write(snps[row_ix]+'\t')
f.write('\t'.join(map(str,incidence_matrix[row_ix,:])))
f.write('\n')
click.echo('Finished! Created {} by {} SNP-Genome incidence matrix.'.format(nr_snps, len(genomes)))
@tacos.command()
@click.option('--ignore_gap', 'ignore_gap', is_flag=True, help='enable coding gaps(i.e indels) as missing values')
......@@ -43,6 +90,7 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
The allele for the corresponding position in genome GENOME_1
Values are any of IUPAC's encoding:
['A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N', '.', '-']
We assume alleles are coded following IUPAC's nucleotide convention.
\b
Ambiguous nucleotides ('Y', 'R', 'W', 'S', 'K', 'M', 'D', 'V', 'H', 'B', 'X', 'N')
are encoded as missing values.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment