Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TACOS
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container Registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
TBRU
TACOS
Commits
2b7f4739
Commit
2b7f4739
authored
5 years ago
by
Ticlla Ccenhua Monica Roxana
Browse files
Options
Downloads
Patches
Plain Diff
add function to convert alignment table into snp-genome incidence matrix
parent
9e87b545
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
tacos/alignment_utils.py
+69
-0
69 additions, 0 deletions
tacos/alignment_utils.py
tacos/cli.py
+48
-0
48 additions, 0 deletions
tacos/cli.py
with
117 additions
and
0 deletions
tacos/alignment_utils.py
+
69
−
0
View file @
2b7f4739
...
...
@@ -20,6 +20,8 @@
- GENOME_1: character
The allele for the corresponding position in genome GENOME_1
We assume alleles are coded following IUPAC
'
s nucleotide convention.
Available functions:
- import_alignment: Reads a genomic alignment table into a numeric table.
"""
...
...
@@ -75,3 +77,70 @@ def import_alignment(data_file, sep='\t'):
genomes
=
records_header
[
3
:]
positions_genomes_alleles
=
np
.
asarray
(
positions_genomes_alleles
,
dtype
=
'
int8
'
)
return
(
genomes
,
positions
,
positions_genomes_alleles
.
T
)
def
as_snp_incidence
(
data_file
,
sep
=
'
\t
'
,
ignore_gap
=
False
):
"""
Reads a genomic alignment table into a SNP-Genome incidence matrix
This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
where rows correspond to SNPs and columns correspond to genomes. Values in each cell are
either 0 or 1.
Args:
data_file: str
A valid string path to a genomic alignment table. This table has the following format:
+-----------+--------+-----------+----------+-----------+-----+
|Position | Gene |Reference |GENOME_1 |GENOME_2 | ... |
+-----------+--------+-----------+----------+-----------+-----+
Where:
- Position: int
Corresponds to a genomic position
- Gene: string
Name of Gene
- Reference: character
The allele for the corresponding position in the reference genome
- GENOME_1: character
The allele for the corresponding position in genome GENOME_1
We assume alleles are coded following IUPAC
'
s nucleotide convention.
sep: str, default
'
\t
'
Delimiter to use.
ignore_gap: bool, default False
By default gaps (i.e
'
.
'
,
'
-
'
) are considered a 5th character. If set to True, gaps are encoded as
missing values and therefore ignored or not considered as an allele.
Returns: numpy.ndarray
A SNP by Genome incidence matrix and two lists. The first list maps the row indices of the numpy.ndarray
to the corresponding SNP names. SNP names have this format S_<position>_<reference allele>_<alternative allele>
(e.g S_63_A_G). The second list maps the column indices of the numpy.ndarray to the corresponding genome names.
"""
# IUPAC's anbiguous nucleotides are ignored
default_characters_to_ignore
=
[
'
Y
'
,
'
R
'
,
'
W
'
,
'
S
'
,
'
K
'
,
'
M
'
,
'
D
'
,
'
V
'
,
'
H
'
,
'
B
'
,
'
X
'
,
'
N
'
]
# IUPAC's characters for gaps/deletions
gap_characters
=
[
'
.
'
,
'
-
'
]
if
ignore_gap
:
default_characters_to_ignore
.
extend
(
gap_characters
)
genomes
=
[]
snps
=
list
()
snps_genomes
=
list
()
try
:
with
open
(
data_file
)
as
my_file
:
alignment_table_reader
=
csv
.
reader
(
my_file
,
delimiter
=
sep
)
for
ix
,
row
in
enumerate
(
alignment_table_reader
):
if
(
ix
==
0
):
genomes
=
row
[
3
:]
else
:
position
=
row
[
0
]
ref_allele
=
row
[
2
]
alt_alleles
=
np
.
array
(
row
[
3
:])
to_ignore
=
np
.
in1d
(
alt_alleles
,
default_characters_to_ignore
+
[
ref_allele
])
uniq_alt_alleles
=
np
.
unique
(
alt_alleles
[
~
to_ignore
])
if
uniq_alt_alleles
.
size
>
0
:
for
allele
in
uniq_alt_alleles
:
snps
.
append
(
'
S_{}_{}_{}
'
.
format
(
position
,
ref_allele
,
allele
))
snps_genomes
.
append
(
np
.
array
(
alt_alleles
==
allele
,
dtype
=
np
.
ubyte
))
return
(
snps
,
genomes
,
np
.
asarray
(
snps_genomes
,
dtype
=
np
.
ubyte
))
except
OSError
as
e
:
print
(
'
e
'
)
This diff is collapsed.
Click to expand it.
tacos/cli.py
+
48
−
0
View file @
2b7f4739
...
...
@@ -2,11 +2,58 @@ import click
import
numpy
as
np
import
itertools
from
tacos.alignment_utils
import
import_alignment
from
tacos.alignment_utils
import
as_snp_incidence
from
tacos.distance
import
hamming_pool
@click.group
()
def
tacos
():
pass
@tacos.command
()
@click.option
(
'
--ignore_gap
'
,
'
ignore_gap
'
,
is_flag
=
True
,
help
=
'
enable coding gaps(i.e indels) as missing values
'
)
@click.option
(
'
-d
'
,
'
--separator
'
,
'
sep
'
,
default
=
'
\t
'
,
help
=
'
field separator [default: TAB]
'
)
@click.argument
(
'
INPUT_
'
,
required
=
True
,
type
=
click
.
Path
(
exists
=
True
))
@click.argument
(
'
OUTPUT
'
,
required
=
True
,
type
=
click
.
Path
())
def
snp_incidence_matrix
(
ignore_gap
,
sep
,
input_
,
output
):
"""
Reads a genomic alignment table into a SNP-Genome incidence matrix.
\b
This function reads an alignment table and converts it to a SNP-Genome incidence matrix,
where rows correspond to SNPs and columns correspond to genomes.
Values in each cell are either 0 or 1.
\b
INPUT is the path to the file of a genomic alignment table. This table has the
following format:
+-----------+--------+-----------+----------+-----------+-----+
|Position | Gene |Reference |GENOME_1 |GENOME_2 | ... |
+-----------+--------+-----------+----------+-----------+-----+
\b
Where:
- Position: int
Corresponds to a genomic position
- Gene: string
Name of Gene
- Reference: str
The allele for the corresponding position in the reference genome.
Values are any of IUPAC
'
s encoding:
[
'
A
'
,
'
G
'
,
'
C
'
,
'
T
'
,
'
Y
'
,
'
R
'
,
'
W
'
,
'
S
'
,
'
K
'
,
'
M
'
,
'
D
'
,
'
V
'
,
'
H
'
,
'
B
'
,
'
X
'
,
'
N
'
,
'
.
'
,
'
-
'
]
- GENOME_1: str
The allele for the corresponding position in genome GENOME_1
Values are any of IUPAC
'
s encoding:
[
'
A
'
,
'
G
'
,
'
C
'
,
'
T
'
,
'
Y
'
,
'
R
'
,
'
W
'
,
'
S
'
,
'
K
'
,
'
M
'
,
'
D
'
,
'
V
'
,
'
H
'
,
'
B
'
,
'
X
'
,
'
N
'
,
'
.
'
,
'
-
'
]
We assume alleles are coded following IUPAC
'
s nucleotide convention.
"""
INPUT_file
=
input_
snps
,
genomes
,
incidence_matrix
=
as_snp_incidence
(
INPUT_file
,
sep
=
sep
,
ignore_gap
=
ignore_gap
)
nr_snps
=
len
(
snps
)
with
click
.
open_file
(
output
,
'
w
'
)
as
f
:
f
.
write
(
'
\t
'
.
join
([
''
]
+
genomes
))
f
.
write
(
'
\n
'
)
for
row_ix
in
range
(
nr_snps
):
f
.
write
(
snps
[
row_ix
]
+
'
\t
'
)
f
.
write
(
'
\t
'
.
join
(
map
(
str
,
incidence_matrix
[
row_ix
,:])))
f
.
write
(
'
\n
'
)
click
.
echo
(
'
Finished! Created {} by {} SNP-Genome incidence matrix.
'
.
format
(
nr_snps
,
len
(
genomes
)))
@tacos.command
()
@click.option
(
'
--ignore_gap
'
,
'
ignore_gap
'
,
is_flag
=
True
,
help
=
'
enable coding gaps(i.e indels) as missing values
'
)
...
...
@@ -43,6 +90,7 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
The allele for the corresponding position in genome GENOME_1
Values are any of IUPAC
'
s encoding:
[
'
A
'
,
'
G
'
,
'
C
'
,
'
T
'
,
'
Y
'
,
'
R
'
,
'
W
'
,
'
S
'
,
'
K
'
,
'
M
'
,
'
D
'
,
'
V
'
,
'
H
'
,
'
B
'
,
'
X
'
,
'
N
'
,
'
.
'
,
'
-
'
]
We assume alleles are coded following IUPAC
'
s nucleotide convention.
\b
Ambiguous nucleotides (
'
Y
'
,
'
R
'
,
'
W
'
,
'
S
'
,
'
K
'
,
'
M
'
,
'
D
'
,
'
V
'
,
'
H
'
,
'
B
'
,
'
X
'
,
'
N
'
)
are encoded as missing values.
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment