Skip to content
Snippets Groups Projects
Commit bd397e57 authored by sunhollyjolly's avatar sunhollyjolly
Browse files

add README

parent dfdafafe
No related branches found
No related tags found
3 merge requests!52Last,!51Sunho final fix,!50Sunho fixed
import re
import numpy as np
import pandas as pd
def fasta_process(fasta_file):
with open(fasta_file, "r") as f:
lines = f.readlines()
ident_pattern = re.compile('>(\S+)')
seq_pattern = re.compile('^(\S+)$')
genes = {}
for line in lines:
if ident_pattern.search(line):
seq_id = (ident_pattern.search(line)).group(1)
elif seq_id in genes.keys():
genes[seq_id] += (seq_pattern.search(line)).group(1)
else:
genes[seq_id] = (seq_pattern.search(line)).group(1)
return genes
def fragmentation(fasta_file, counts_file, mean_length, std):
fasta = fasta_process(fasta_file)
seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"])
nucs = ['A','T','G','C']
mononuc_freqs = [0.22, 0.25, 0.23, 0.30]
term_frags = []
for seq_id, seq in fasta.items():
counts = seq_counts[seq_counts["seqID"] == seq_id]["count"]
for _ in range(counts):
n_cuts = int(len(seq)/mean_length)
# non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1
# assume fragmentation by sonication for NGS workflow
cuts = []
cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs)
for nuc in cut_nucs:
nuc_pos = [x.start() for x in re.finditer(nuc, seq)]
pos = np.random.choice(nuc_pos)
while pos in cuts:
pos = np.random.choice(nuc_pos)
cuts.append(pos)
cuts.sort()
cuts.insert(0,0)
term_frag = ""
for i, val in enumerate(cuts):
if i == len(cuts)-1:
fragment = seq[val+1:cuts[-1]]
else:
fragment = seq[val:cuts[i+1]]
if mean_length-std <= len(fragment) <= mean_length+std:
term_frag = fragment
if term_frag == "":
continue
else:
term_frags.append(term_frag)
return term_frags
"""Fragment sequences."""
import re
import numpy as np
import pandas as pd # type: ignore
def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
mean_length: int = 100, std: int = 10,
a_prob: float = 0.22, t_prob: float = 0.25,
g_prob: float = 0.25, c_prob: float = 0.28
) -> list:
"""Fragment cDNA sequences and select terminal fragment.
Args:
fasta_file (dict): dictionary of {transcript IDs: sequences}
counts_file (pd.DataFrame): dataframe with sequence counts and IDs
mean_length (int): mean length of desired fragments
std (int): standard deviation of desired fragment lengths
a_prob (float): probability of nucleotide A
t_prob (float): probability of nucleotide T
g_prob (float): probability of nucleotide G
c_prob (float): probability of nucleotide C
Returns:
list: list of selected terminal fragments
"""
# calculated using https://www.nature.com/articles/srep04532#MOESM1
nuc_probs = {'A': a_prob, 'T': t_prob, 'G': g_prob, 'C': c_prob}
term_frags = []
for seq_id, seq in fasta.items():
counts = seq_counts[seq_counts["seqID"] == seq_id]["count"]
for _ in range(counts):
n_cuts = int(len(seq)/mean_length)
# non-uniformly random DNA fragmentation implementation based on
# https://www.nature.com/articles/srep04532#Sec1
# assume fragmentation by sonication for NGS workflow
cuts = []
cut_nucs = np.random.choice(list(nuc_probs.keys()),
n_cuts, p=list(nuc_probs.values()))
for nuc in cut_nucs:
nuc_pos = [x.start() for x in re.finditer(nuc, seq)]
pos = np.random.choice(nuc_pos)
while pos in cuts:
pos = np.random.choice(nuc_pos)
cuts.append(pos)
cuts.sort()
cuts.insert(0, 0)
term_frag = ""
for i, val in enumerate(cuts):
if i == len(cuts)-1:
fragment = seq[val+1:cuts[-1]]
else:
fragment = seq[val:cuts[i+1]]
if mean_length-std <= len(fragment) <= mean_length+std:
term_frag = fragment
if term_frag != "":
term_frags.append(term_frag)
return term_frags
"""Utility functions for CLI arguments."""
import argparse
def check_positive(value: str) -> int:
"""Check input value is a positive integer.
Args:
value (str): command line parameter
Raises:
argparse.ArgumentTypeError: received a negative integer
argparse.ArgumentTypeError: received a non-integer value
Returns:
integer: integer version of input value
"""
try:
ivalue = round(float(value))
if ivalue <= 0:
raise argparse.ArgumentTypeError(f"""Expected positive integer,
got negative integer: {value}""")
except ValueError as exc:
raise argparse.ArgumentTypeError(f"""Expected positive integer,
got: {value}""") from exc
else:
return ivalue
def check_prob(value: str) -> float:
"""Check probability value is within ]0,1] range.
Args:
value (str): command line parameter
Raises:
argparse.ArgumentTypeError: received a value outside valid range
Returns:
float: float version of input value
"""
pvalue = float(value)
if pvalue <= 0 or pvalue > 1:
raise argparse.ArgumentTypeError("""Expected a positive float between
0 and 1, but got {value}""")
return pvalue
"""Test cli.py functions."""
import pytest
# from Bio import SeqIO
from term_frag_sel.cli import file_validation # type: ignore
FASTA_FILE = "tests/test_files/test.fasta"
def test_file():
"""Test check_positive function."""
with pytest.raises(FileNotFoundError):
file_validation("", "", ",")
This diff is collapsed.
>1 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 481 bp
tgagcactcggtgccaagggcggggatacacagatggttggctgatacaaccgggactta
aattccctagactagatctgtgttggaacgcctctctacg
>2 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 495 bp
ctgaatcaggtgtaggttctttttacgtcgtttaaggagctacacggtatcttgttttca
gttaaggtgccacacccccgggtggatcatccgtcagctt
>3 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 193 bp
acttcagtactggaaggatctaggaaccattaatgcgagtgtggtgacgccagacgaccc
ccggtgttctgccaccttctttggataggagaaccgtcac
>4 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 625 bp
acgtctggagcgtgggttgacccctgtacatggttctttccggatccttaacgtgccgat
acaactcaaaggtaactgtgcttaccacttccgaagctac
>5 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 845 bp
agagcgtacggcgcgcatcgtataccctacgagggcggcgtgtggaggaacgctgggctg
acactgtagaagattagatacacttgtccctaaaattaac
>6 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 703 bp
tgcagtcgatgtgctattcgttttaggcagtctacgcgcttagtaactcccacggccata
gacttatctcagacatggaccatgtcgatatcggacgccg
>7 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 243 bp
actctttagaatgggtttcactaatagtacgtgcatacaatttcgtcagaaagggcgctt
gctaagggacacggatcaatgatgaccagacttatggtgt
>8 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 863 bp
attggcccggtccaggacagagccttatattgctactggtatgagaaccgttctgacgta
aacttgatggctttacgcctgcacgggcttcatacacaca
>9 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 494 bp
aagcgaaactcctagaacttcccatcaggcaatcgtgtcccacgaagcacggatactacg
ggcactagttgaatggggggtttttttcgtaggtcgtaat
>10 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 86 bp
atcctagcgccaaagatttactgttatggggtcgacgaacactagccgataatgccgtcc
tgggatctctagcctagtattatgcgTCTTCGGAGCAGGG
>11 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 360 bp
cgcctgagggtcctaaatctgacgtatgatcgaagagattggaaggtcccggcgggtcac
cccacgttgcgatcatggccaaggccatggtttgctcaaa
>12 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 140 bp
gaattcctggggatttactcacccccgaggcggacaagatttccagctggatcaccgagg
gttacttaatcccttcgatgctttcaaaggccctaatcag
>13 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 832 bp
aatactctcgttgaagcgtcggacagtaaagtgagagatttcggcccacggtagtcggac
attctcagtggggagcgaagagttgcgcttagagccgacg
>14 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 296 bp
atcggggtgcgaaatcccctgagctggttgactacatacgtaaccacgttccgtgcgtca
tctaagcgtatcggctcatactggtggtaactagacttgg
>15 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 515 bp
accttcaatttgttcgcccgggacaagtagaaattactgtaaactaaacttaacctattc
cttgttaaagtccgcaccaagtgtactgtaagaatggtcg
>16 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 820 bp
ccggctcaatcctgtagaaccgcgtacaacacacccaagctataccgcacacggcgcctt
agcaaccactgcttatctgcgtattatacctttacaatca
>17 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 791 bp
attgttagggcctgtccggaaaagatcaacggaagatattcaccagcacctatgctgact
cacgtagttcccgacgttcagtcccctccaacgtggaagg
>18 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 328 bp
accgattacaggcagtcggccttgtccgctcgtatatccagggatgttccaccgaaagtg
ggagtgtggcacttattggtaaaaggcatttttacgaacg
>19 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp
ggagtggaaaattctgtagtccgttggcggcgaccgcaaaccagaataatatggtcacgt
taggccctcgggccccttcatatgtacggagtcattgaat
>20 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 440 bp
atcttaaacagcccaatcggctcgccgaccaatttcccgcttcacagtacgcggaagaat
ctgcagatagaagtcagccctctcacgtcaataggaatgc
>21 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 840 bp
cataactcgtgagtggccctgtacaagtcattgcatcacaatccttgcaatttgctcctt
tggccaagcgtacaagaccccggacccatacgctcccggc
>22 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 234 bp
caccgcgaaagtgactcagttttcccggtcttatcacggtcgttgtcgtccagattccgg
ttgttaactgcgggagctataacacttattccttactgcg
>23 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 917 bp
atcaagtgattacctggtaacccgccgctcttgcagtgttcaccctttgtgtcgtcttag
tgtttgtacacgttaaggaaaagcgttagcttaaccatta
>24 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 676 bp
cacacggcatcgcaaagcgagctatccagagatgatacatgtggttgaaggtgattgcgt
caacatgggggttgctcagtttggttggtcaatcaacggt
>25 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 870 bp
ctgatcaccaatagcttgcgcttaacacacgcgccttacaattatatgacgcccttgcca
atgacagatagagccattaatcgtggaaaccaggcattta
>26 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 751 bp
gggtgcgttatggggactaaagactgttactaccggtactccgccttatagagccgtcac
gtattaatcagctatcaacagatactatcgtcacagccct
>27 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 574 bp
gtactgcaccttgcactgctatctacaatgccgagggtcgccctagtgctttgcatgttt
ggcctctacctacgagtctacgcgggcgtttttaagcaag
>28 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 169 bp
agctccctaaacaacacccgcgtaaaaccttcagttatggtgccgactaaccctgtggat
gtcttagcgctctcgttccgatgggtgctgatactagtaa
>29 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 408 bp
tgcagtgatgcatcgataagaccgcatagttacctccttacaggtgacgctaggctaatt
gggagtgctggcacttgtgccctacagtcaagcgctcacg
>30 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 52 bp
caaagcgattcgggttaacgcacttaagagttcgacgtaggttagtcccctcCTTTTAGG
TGTCTAACTAGAGAGGCGCCCTATTGGGCTCAGGATGACG
>31 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 581 bp
tgctctgacgtgtaagcgccttcgataacgtctttgcagcgccccacaaagtaaggaccg
gtctaacagggcttccgaatcaatagactgatagtaatgg
>32 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp
gcggaactacctctctaagaccgcacaacaagtgtagtagatgaagatcacgcagagtgc
tcggcactgcatttttatacgtcgaatcagaaacgaggtt
>33 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 297 bp
gccctcgcgccagcttacttttagaaaacatcgaccggtaagagatacctgggtgagctg
ggcttcacgacatgttcttaaatcaatactctaaatctgc
>34 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 573 bp
gcctaggggtcttgaccacagggagtacgagcattgatcattggagcaggtggctaatat
tgatagtggttagaccaccggcgcatcatcgtacgagcgc
>35 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 559 bp
gaaaaagtcgccccattcagttacaatcgtcttcagaagccagctcggttggggctatct
gcggggtaatgcaacagggggctaccagacggtaaaccag
>36 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 187 bp
ctaagtccttatctatgatgcatctttcgttactgcgacaatatccgagacgagcagagt
tacacgccgaggtgtaaacgaatacgattgctatatgcaa
>37 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 549 bp
ttcatatggggatttggaatcgggtttgtgcggaatatgcccacgagactgcttatgtca
acgagacgacccattgtcacgttgtaaggccaccaataac
>38 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 916 bp
gtggcctaccataaatcaatttgggttaacgctctttgatctacgcactatgttgattca
cttaccccttgtcaccgggcagaagagagccagtttaggt
>39 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 848 bp
accccggtcgctttggccggtcgtagccctaatcaattctgttcgtatcactaaagtaac
ggtttgaaatcctttgcaaacttgatctgggtatatgaac
>40 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 289 bp
agagcaaaagaaagtctgctccgcgtgacacacttgctcgttgtagtaactgcacgcgcc
gtctactcgacagggaccccccgtcggttcctctctatag
>41 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 642 bp
ctggggaatgaccgtaccgatctaattccccgtcgaaaaacttatgacgcgcagttgtcc
ttatgcttgagacatgaatccttgccccatattggcgatc
>42 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 993 bp
gcaagaagccaaaaaccttgcaggaggtcatttaagtttacccgcgcataagcagagacg
gacctctctgagatctcgcaccgcgcgcccggccggcact
>43 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 473 bp
ggttgtccaggcgcgagcaagtagctgactcgctaatcttaacgagtattgcttaggact
tccaaatactccaagacgtcaatacgctttatctttgtga
>44 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 272 bp
gtagaacttgttccccatggacaatgctagttccgttaatgccaggtattcatgtgccaa
gcgcctgcctggggaatacgagcctctctacaaacttacg
>45 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 860 bp
aaaagcatcactctaacgacgctaccgtctgaatagatcaagattgctatcggttcgacc
ttgatcgcatgtgaacccgcccaaaaacccgtctcgacaa
>46 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 884 bp
aagcctctacaggctctgcggtttggctttacttaacggtgagtcaggaaaacattactg
ctacgttcaccgtgttcagagatagagagtacattaggga
>47 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 888 bp
gcgccttgaagaggcgaggtctaaaggcaaaaatttagatccgccctatgagacggccga
cgcggagaattccctaaccactattgtcctctgcatcgat
>48 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 588 bp
catcaagatgggttacgtaggaccgagattcagtctctgggttagagccgacagcggggc
cgctacatagtacacggcgaggaatgcggggttgggctga
>49 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 626 bp
taacctcagtctcgttcccccctcggtagttcggacccttattcgcttatctcacattca
tcactgtagaccaaggaccgggcatacttgcggatatcta
>50 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 214 bp
taactgtcggtcactgctcatcccgactagttcggctcactagacttactcgcggaagcg
agaagtaggacgtcgtgtaatactccaacgtcgttacgca
"""Test utils.py functions."""
import pytest
from Bio import SeqIO
from term_frag_sel.fragmentation import fragmentation # type: ignore
with open("tests/test_files/test.fasta", "r", encoding="utf-8") as handle:
fasta = SeqIO.parse(handle, "fasta")
# have to create the counts file
MU = 100
STD = 10
A_PROB = 0.22
C_PROB = 0.28
T_PROB = 0.25
G_PROB = 0.25
def test_frag():
"""Test fragmentation function."""
with pytest.raises(TypeError):
fragmentation()
"""Test utils.py functions."""
<<<<<<< HEAD
import pytest
from
=======
import argparse
import pytest
from term_frag_sel.utils import check_positive, check_prob # type: ignore
def test_positive():
"""Test check_positive function."""
assert check_positive("1") == 1
assert check_positive("100") == 100
assert check_positive("2.2") == 2
assert check_positive("2.6") == 3
with pytest.raises(argparse.ArgumentTypeError):
check_positive("0")
with pytest.raises(argparse.ArgumentTypeError):
check_positive("-1")
with pytest.raises(argparse.ArgumentTypeError):
check_positive("string")
with pytest.raises(argparse.ArgumentTypeError):
check_positive("")
def test_prob():
"""Test check_prob function."""
assert check_prob("0.1") == 0.1
assert check_prob("1") == 1.0
with pytest.raises(argparse.ArgumentTypeError):
check_prob("0")
with pytest.raises(argparse.ArgumentTypeError):
check_prob("10")
with pytest.raises(argparse.ArgumentTypeError):
check_prob("-1")
with pytest.raises(ValueError):
check_prob("string")
with pytest.raises(ValueError):
check_prob("")
>>>>>>> hugo_new
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment