Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
Terminal fragment selector
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
zavolan_group
tools
Terminal fragment selector
Commits
551c2056
Commit
551c2056
authored
2 years ago
by
Hugo Madge Leon
Browse files
Options
Downloads
Patches
Plain Diff
tanya's changes
parent
f6ebce88
No related branches found
No related tags found
1 merge request
!46
Merged Tanya's files + final lint check
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
term_frag_sel/cli.py
+8
-8
8 additions, 8 deletions
term_frag_sel/cli.py
term_frag_sel/fragmentation.py
+42
-8
42 additions, 8 deletions
term_frag_sel/fragmentation.py
tests/test_frag.py
+39
-5
39 additions, 5 deletions
tests/test_frag.py
tests/test_utils.py
+0
-6
0 additions, 6 deletions
tests/test_utils.py
with
89 additions
and
27 deletions
term_frag_sel/cli.py
+
8
−
8
View file @
551c2056
...
@@ -35,8 +35,8 @@ def main(args: argparse.Namespace):
...
@@ -35,8 +35,8 @@ def main(args: argparse.Namespace):
fasta_dict
=
fasta_parse
[
split
:
splits
[
i
+
1
]]
fasta_dict
=
fasta_parse
[
split
:
splits
[
i
+
1
]]
term_frags
=
fragmentation
(
fasta_dict
,
seq_counts
,
term_frags
=
fragmentation
(
fasta_dict
,
seq_counts
,
args
.
mean
,
args
.
std
,
args
.
mean
,
args
.
std
,
args
.
A
_prob
,
args
.
T
_prob
,
args
.
a
_prob
,
args
.
t
_prob
,
args
.
G
_prob
,
args
.
C
_prob
)
args
.
g
_prob
,
args
.
c
_prob
)
logger
.
info
(
"
Writing batch %s sequences to %s...
"
,
i
,
args
.
output
)
logger
.
info
(
"
Writing batch %s sequences to %s...
"
,
i
,
args
.
output
)
with
open
(
args
.
output
,
'
a
'
,
encoding
=
"
utf-8
"
)
as
out_file
:
with
open
(
args
.
output
,
'
a
'
,
encoding
=
"
utf-8
"
)
as
out_file
:
...
@@ -97,21 +97,21 @@ def parse_arguments() -> argparse.Namespace:
...
@@ -97,21 +97,21 @@ def parse_arguments() -> argparse.Namespace:
help
=
"
output file path
"
)
help
=
"
output file path
"
)
parser
.
add_argument
(
'
--mean
'
,
required
=
False
,
default
=
300
,
parser
.
add_argument
(
'
--mean
'
,
required
=
False
,
default
=
300
,
type
=
check_positive
,
type
=
check_positive
,
help
=
"
Mean fragment length (default:
1
0)
"
)
help
=
"
Mean fragment length (default:
30
0)
"
)
parser
.
add_argument
(
'
--std
'
,
required
=
False
,
default
=
60
,
parser
.
add_argument
(
'
--std
'
,
required
=
False
,
default
=
60
,
type
=
check_positive
,
type
=
check_positive
,
help
=
"
Standard deviation fragment length
\
help
=
"
Standard deviation fragment length
\
(defafult:
1
)
"
)
(defafult:
60
)
"
)
parser
.
add_argument
(
'
-a
'
,
'
--
A
_prob
'
,
required
=
False
,
default
=
0.22
,
parser
.
add_argument
(
'
-a
'
,
'
--
a
_prob
'
,
required
=
False
,
default
=
0.22
,
type
=
check_prob
,
type
=
check_prob
,
help
=
"
Probability cut happens after nucleotide A
"
)
help
=
"
Probability cut happens after nucleotide A
"
)
parser
.
add_argument
(
'
-t
'
,
'
--
T
_prob
'
,
required
=
False
,
default
=
0.25
,
parser
.
add_argument
(
'
-t
'
,
'
--
t
_prob
'
,
required
=
False
,
default
=
0.25
,
type
=
check_prob
,
type
=
check_prob
,
help
=
"
Probability cut happens after nucleotide T
"
)
help
=
"
Probability cut happens after nucleotide T
"
)
parser
.
add_argument
(
'
-g
'
,
'
--
G
_prob
'
,
required
=
False
,
default
=
0.25
,
parser
.
add_argument
(
'
-g
'
,
'
--
g
_prob
'
,
required
=
False
,
default
=
0.25
,
type
=
check_prob
,
type
=
check_prob
,
help
=
"
Probability cut happens after nucleotide G
"
)
help
=
"
Probability cut happens after nucleotide G
"
)
parser
.
add_argument
(
'
-c
'
,
'
--
C
_prob
'
,
required
=
False
,
default
=
0.28
,
parser
.
add_argument
(
'
-c
'
,
'
--
c
_prob
'
,
required
=
False
,
default
=
0.28
,
type
=
check_prob
,
type
=
check_prob
,
help
=
"
Probability cut happens after nucleotide C
"
)
help
=
"
Probability cut happens after nucleotide C
"
)
parser
.
add_argument
(
'
-s
'
,
'
--size
'
,
required
=
False
,
default
=
10000
,
parser
.
add_argument
(
'
-s
'
,
'
--size
'
,
required
=
False
,
default
=
10000
,
...
...
This diff is collapsed.
Click to expand it.
term_frag_sel/fragmentation.py
+
42
−
8
View file @
551c2056
...
@@ -4,6 +4,38 @@ import re
...
@@ -4,6 +4,38 @@ import re
import
numpy
as
np
import
numpy
as
np
import
pandas
as
pd
# type: ignore
import
pandas
as
pd
# type: ignore
def
get_cut_number
(
seq_len
:
int
,
mean
:
float
)
->
int
:
"""
Get the number of cuts for a particular sequence.
Args:
seq_len (int): length of sequence/number of nucleotides in the sequence
mean (float): mean fragment length
Returns:
int: number of cuts
"""
cuts_distribution
=
[]
# distribution of cut numbers (n_cuts)
# 1000 iterations should not be too computationally inefficient given the nature of the code
for
_
in
range
(
1000
):
n_cuts
=
0
len_sum
=
0
while
True
:
len_sum
+=
np
.
random
.
exponential
(
scale
=
mean
)
if
len_sum
<
seq_len
:
n_cuts
+=
1
else
:
cuts_distribution
.
append
(
n_cuts
)
break
cuts_distribution
.
sort
()
cut_counts
=
{
x
:
cuts_distribution
.
count
(
x
)
for
x
in
cuts_distribution
}
cut_probs
=
[
x
/
1000
for
x
in
cut_counts
.
values
()]
# randomly ick no. of cut from cut distribution based on probability of cut numbers
n_cuts
=
np
.
random
.
choice
(
list
(
cut_counts
.
keys
()),
p
=
cut_probs
)
return
n_cuts
def
fragmentation
(
fasta
:
dict
,
seq_counts
:
pd
.
DataFrame
,
def
fragmentation
(
fasta
:
dict
,
seq_counts
:
pd
.
DataFrame
,
mean_length
:
int
=
100
,
std
:
int
=
10
,
mean_length
:
int
=
100
,
std
:
int
=
10
,
...
@@ -32,7 +64,10 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
...
@@ -32,7 +64,10 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
for
seq_id
,
seq
in
fasta
.
items
():
for
seq_id
,
seq
in
fasta
.
items
():
counts
=
seq_counts
[
seq_counts
[
"
seqID
"
]
==
seq_id
][
"
count
"
]
counts
=
seq_counts
[
seq_counts
[
"
seqID
"
]
==
seq_id
][
"
count
"
]
for
_
in
range
(
counts
):
for
_
in
range
(
counts
):
n_cuts
=
int
(
len
(
seq
)
/
mean_length
)
seq_len
=
len
(
seq
)
# pick no. of cuts from gauss fragment length distribution
n_cuts
=
get_cut_number
(
seq_len
,
mean_length
)
# non-uniformly random DNA fragmentation implementation based on
# non-uniformly random DNA fragmentation implementation based on
# https://www.nature.com/articles/srep04532#Sec1
# https://www.nature.com/articles/srep04532#Sec1
...
@@ -50,13 +85,12 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
...
@@ -50,13 +85,12 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
cuts
.
sort
()
cuts
.
sort
()
cuts
.
insert
(
0
,
0
)
cuts
.
insert
(
0
,
0
)
term_frag
=
""
term_frag
=
""
for
i
,
val
in
enumerate
(
cuts
):
if
i
==
len
(
cuts
)
-
1
:
# check if 3' fragment is in the correct size range
fragment
=
seq
[
val
+
1
:
cuts
[
-
1
]]
fragment
=
seq
[
cuts
[
-
1
]:
len
(
seq
)]
else
:
if
mean_length
-
std
<=
len
(
fragment
)
<=
mean_length
+
std
:
fragment
=
seq
[
val
:
cuts
[
i
+
1
]]
term_frag
=
fragment
if
mean_length
-
std
<=
len
(
fragment
)
<=
mean_length
+
std
:
term_frag
=
fragment
if
term_frag
!=
""
:
if
term_frag
!=
""
:
term_frags
.
append
(
term_frag
)
term_frags
.
append
(
term_frag
)
...
...
This diff is collapsed.
Click to expand it.
tests/test_frag.py
+
39
−
5
View file @
551c2056
"""
Test utils.py functions.
"""
"""
Test utils.py functions.
"""
import
random
import
pytest
import
pytest
from
Bio
import
SeqIO
from
Bio
import
SeqIO
from
term_frag_sel.fragmentation
import
fragmentation
# type: ignore
from
term_frag_sel.fragmentation
import
fragmentation
,
get_cut_number
# type: ignore
with
open
(
"
tests/test_files/test.fasta
"
,
"
r
"
,
encoding
=
"
utf-8
"
)
as
handle
:
with
open
(
"
tests/test_files/test.fasta
"
,
"
r
"
,
encoding
=
"
utf-8
"
)
as
handle
:
fasta
=
SeqIO
.
parse
(
handle
,
"
fasta
"
)
fasta
=
SeqIO
.
parse
(
handle
,
"
fasta
"
)
# have to create the counts file
seq_counts
=
{}
for
entry
in
fasta
:
name
,
sequence
=
entry
.
id
,
str
(
entry
.
seq
)
seq_counts
[
name
]
=
random
.
randint
(
50
,
150
)
MU
=
100
MU
=
100
STD
=
10
STD
=
10
...
@@ -16,8 +20,38 @@ C_PROB = 0.28
...
@@ -16,8 +20,38 @@ C_PROB = 0.28
T_PROB
=
0.25
T_PROB
=
0.25
G_PROB
=
0.25
G_PROB
=
0.25
def
test_get_cut_number
():
"""
Test get_cut_number function.
"""
assert
get_cut_number
(
100
,
20
,
4
)
<=
15
and
get_cut_number
(
100
,
20
,
4
)
>=
0
def
test_frag
():
with
pytest
.
raises
(
ValueError
):
"""
Test fragmentation function.
"""
get_cut_number
(
seq_len
=
'
a
'
,
mean
=
4
,
std
=
2.3
)
# error if seq_len not int
with
pytest
.
raises
(
ValueError
):
get_cut_number
(
seq_len
=
10
,
mean
=
'
a
'
,
std
=
2.3
)
# error if mean not int
with
pytest
.
raises
(
ValueError
):
get_cut_number
(
seq_len
=
10
,
mean
=
3
,
std
=
'
a
'
)
# error if std not float
with
pytest
.
raises
(
TypeError
):
with
pytest
.
raises
(
TypeError
):
fragmentation
()
get_cut_number
(
seq_len
=
10
,
mean
=
3
)
# error if there is a missing argument
def
test_fragmentation
():
"""
Test fragmentation function.
"""
assert
isinstance
(
fragmentation
(
fasta
,
seq_counts
),
list
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
'
a
'
,
std
=
10
,
a_prob
=
0.22
,
t_prob
=
0.25
,
g_prob
=
0.25
,
c_prob
=
0.28
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
100
,
std
=
'
a
'
,
a_prob
=
0.22
,
t_prob
=
0.25
,
g_prob
=
0.25
,
c_prob
=
0.28
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
100
,
std
=
10
,
a_prob
=
'
a
'
,
t_prob
=
0.25
,
g_prob
=
0.25
,
c_prob
=
0.28
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
100
,
std
=
10
,
a_prob
=
0.22
,
t_prob
=
'
a
'
,
g_prob
=
0.25
,
c_prob
=
0.28
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
100
,
std
=
10
,
a_prob
=
0.22
,
t_prob
=
0.25
,
g_prob
=
'
a
'
,
c_prob
=
0.28
)
with
pytest
.
raises
(
ValueError
):
fragmentation
(
fasta
,
seq_counts
,
mean_length
=
100
,
std
=
10
,
a_prob
=
0.22
,
t_prob
=
0.25
,
g_prob
=
0.25
,
c_prob
=
'
a
'
)
This diff is collapsed.
Click to expand it.
tests/test_utils.py
+
0
−
6
View file @
551c2056
"""
Test utils.py functions.
"""
"""
Test utils.py functions.
"""
<<<<<<<
HEAD
import
pytest
from
=======
import
argparse
import
argparse
import
pytest
import
pytest
...
@@ -40,4 +35,3 @@ def test_prob():
...
@@ -40,4 +35,3 @@ def test_prob():
check_prob
(
"
string
"
)
check_prob
(
"
string
"
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
check_prob
(
""
)
check_prob
(
""
)
>>>>>>>
hugo_new
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment