Commit 9e87b545
Authored 5 years ago by Ticlla Ccenhua Monica Roxana

update tacos.distance.hamming_pool to allow child process inherit data_array

Parent: cf39c997
Showing 3 changed files, with 30 additions and 11 deletions:

  README.md          +0  −4
  tacos/cli.py       +6  −2
  tacos/distance.py  +24 −5
README.md  (+0 −4)

@@ -25,10 +25,6 @@ Run the application:
 $ conda activate <path/tacos_env> # if not already activated
 $ tacos --help
 ```
-or
-```
-$ <path/tacos_env>/bin/python -m tacos --help # no need to activate the environment
-```
 To run the tests:
 ```
tacos/cli.py  (+6 −2)

@@ -19,14 +19,17 @@ def tacos():
 def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
     '''
     Computes the genetic distance between each genome pair in INPUT.
     \b
     Distances are measured as the number of substitutions needed to convert genome g_j to
     genome g_i (i.e Hamming distance). For each pair positions with missing values are ignored.
     \b
     INPUT is the path to the file with variable positions (rows) and genomes (columns) of
     the following format:
     +-----------+--------+-----------+----------+-----------+-----+
     |Position   | Gene   |Reference  |GENOME_1  |GENOME_2   | ... |
     +-----------+--------+-----------+----------+-----------+-----+
     \b
     Where:
     - Position: int
         Corresponds to a genomic position
     ...

@@ -40,8 +43,9 @@ def distance_matrix(ignore_gap, adjust, n_jobs, type, input_, output):
         The allele for the corresponding position in genome GENOME_1
         Values are any of IUPAC's encoding:
         ['A','G','C','T','Y','R','W','S','K','M','D','V','H','B','X','N','.','-']
-        Ambiguous nucleotides ('Y','R','W','S','K','M','D','V','H','B','X','N') are encoded as missing values.
+        \b
+        Ambiguous nucleotides ('Y','R','W','S','K','M','D','V','H','B','X','N')
+        are encoded as missing values.
     '''
     INPUT_file = input_
     genomes, positions, genomes_positions = import_alignment(INPUT_file, sep='\t')
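For context, the distance described in the docstring above is a plain Hamming count restricted to positions where neither genome carries a missing value. A minimal, self-contained sketch of that idea follows; the function name and missing-value codes are assumptions for illustration, not the project's genetic_hamming implementation, and the separate ignore_gap handling is omitted.

```python
import numpy as np

def pairwise_hamming(seq_a, seq_b, missing_codes=('X', 'N')):
    # Illustrative only: count substitutions between two aligned sequences,
    # skipping positions where either sequence carries a missing-value code.
    a = np.asarray(list(seq_a))
    b = np.asarray(list(seq_b))
    valid = ~np.isin(a, missing_codes) & ~np.isin(b, missing_codes)
    return int(np.sum(a[valid] != b[valid]))

# The 'X' position is skipped; only the final A/C mismatch is counted.
print(pairwise_hamming("ACGTA", "AXGTC"))  # -> 1
```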
tacos/distance.py  (+24 −5)

@@ -133,6 +133,20 @@ def hamming_pdist(data_array, missing='X', gap='-', ignore_gap=False, adjust=False):
     else:
         return(out_dist)

+def _hamming_pool_init_worker(data_array):
+    global data_array_in_worker
+    data_array_in_worker = data_array
+
+def _hamming_pool_helper(genome_pairs, missing='X', gap='-', ignore_gap=False, adjust=False):
+    try:
+        pair_distances = np.asarray([genetic_hamming(data_array_in_worker[g1,:],
+                                                     data_array_in_worker[g2,:],
+                                                     missing, gap, ignore_gap, adjust)
+                                     for g1, g2 in tqdm.tqdm(genome_pairs)])
+    except IndexError:
+        print("Provided index is out of range in data_array.")
+    else:
+        return(pair_distances)

 def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
     """
     Computes pairwise genetic Hamming distance between every pair of genomic sequences (i.e observations) in
     ...
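The two helpers added above are the point of this commit: _hamming_pool_init_worker runs once in each child process and stores the array in a module-level global, so workers read the data they inherited at start-up instead of receiving it pickled with every task, and _hamming_pool_helper looks the rows up through that global. A stripped-down sketch of the same initializer pattern, with hypothetical names rather than TACOS code:

```python
import multiprocessing as mp
import numpy as np

_shared = None  # module-level slot the initializer fills in each worker

def _init_worker(arr):
    # Runs once per child process; the array is sent a single time via
    # initargs instead of travelling with every task.
    global _shared
    _shared = arr

def _row_sum(i):
    # Workers read the inherited array through the module-level global.
    return int(_shared[i].sum())

if __name__ == "__main__":
    data = np.arange(12).reshape(4, 3)
    with mp.get_context("spawn").Pool(2, initializer=_init_worker, initargs=(data,)) as pool:
        print(pool.map(_row_sum, range(4)))  # -> [3, 12, 21, 30]
```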
@@ -172,19 +186,24 @@ def hamming_pool(data_array, missing='X', gap='-', ignore_gap=False, adjust=False, form=1, n_jobs=1):
     genome_ix_pairs = list(itertools.combinations(range(0, nr_genomes), 2))
     # Here we copy the data_array to a shared memory array
-    # Using the shared memory array gained 5 fold more speed
+    # Using the shared memory unlock array gained 5 fold more speed
     data_array_shm = mp.Array(ctypes.c_int8, data_array.size, lock=False)
+    global data_array_shm_b
     data_array_shm_b = np.frombuffer(data_array_shm, dtype=ctypes.c_int8)
     data_array_shm_b.shape = data_array.shape
     data_array_shm_b[:] = data_array
     # Start a multiprocessing pool
     genome_pairs_chunks = np.array_split(genome_ix_pairs, n_jobs)
-    with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    #with closing(get_context("spawn").Pool(processes=n_jobs)) as pool:
+    with closing(get_context("spawn").Pool(processes=n_jobs,
+                                           initializer=_hamming_pool_init_worker,
+                                           initargs=(data_array_shm_b,))) as pool:
         print('starting pool with {} processes'.format(n_jobs))
-        get_distance_partial = partial(hamming_pairs,
-                                       data_array=data_array_shm_b,
-                                       missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+        #get_distance_partial = partial(hamming_pairs,
+        #                               data_array=data_array_shm_b,
+        #                               missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
+        get_distance_partial = partial(_hamming_pool_helper, missing=missing, gap=gap, ignore_gap=ignore_gap, adjust=adjust)
         distance_mp = pool.imap(get_distance_partial, tqdm.tqdm(genome_pairs_chunks), chunksize=1)
     ...
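The shared-memory lines in the hunk above allocate a lock-free ctypes array, wrap it in a NumPy view with np.frombuffer, restore the original shape, and copy data_array into it once. A self-contained sketch of just that wrapping step, with toy data and illustrative names only:

```python
import ctypes
import multiprocessing as mp
import numpy as np

# Mirrors the data_array_shm / data_array_shm_b lines above (illustrative only).
data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int8)

shm = mp.Array(ctypes.c_int8, data.size, lock=False)  # raw shared buffer, no lock
view = np.frombuffer(shm, dtype=ctypes.c_int8)        # zero-copy NumPy view of the buffer
view.shape = data.shape                               # restore the 2-D layout
view[:] = data                                        # single copy into shared memory

print(view.sum())  # -> 21; reads and writes go straight to the shared buffer
```

In hamming_pool it is this NumPy view (data_array_shm_b) that is handed to the pool initializer through initargs, which is how each worker ends up with data_array_in_worker set.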