diff --git a/.gitignore b/.gitignore index 67cda8e87a9a2ed336794254c9a0039a2fbdcc35..20d1246b9039e9f35190e7656ec1a278c205281b 100644 --- a/.gitignore +++ b/.gitignore @@ -60,4 +60,5 @@ __pycache__/ *_cache *egg-info/ .coverage -build/ \ No newline at end of file +build/ +*/play.py \ No newline at end of file diff --git a/term_frag_sel/cli.py b/term_frag_sel/cli.py index a9b8e957a60f3a64c33e2e15cd10de951b887850..30c2535c219bf40e94ef441f075d99f5ab628884 100644 --- a/term_frag_sel/cli.py +++ b/term_frag_sel/cli.py @@ -10,6 +10,13 @@ import pandas as pd # type: ignore from term_frag_sel.fragmentation import fragmentation from term_frag_sel.utils import check_positive, check_prob +logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s \ + (module "%(module)s")', + level=logging.INFO, +) +logger = logging.getLogger("main") + def main(args: argparse.Namespace): """Use CLI arguments to fragment sequences and output text file \ @@ -18,6 +25,9 @@ def main(args: argparse.Namespace): Args: args (parser): list of arguments from CLI. """ + if not isinstance(args, argparse.Namespace): + raise TypeError("Input should be argparse.Namespace") + # Create or wipe output file with open(args.output, "w", encoding="utf-8") as _: pass @@ -26,17 +36,16 @@ def main(args: argparse.Namespace): fasta, seq_counts = file_validation(args.fasta, args.counts, args.sep) logger.info("Fragmentation of %s...", args.fasta) - fasta_parse = {} - for record in fasta: - fasta_parse[record.id] = record.seq - splits = np.arange(0, len(list(fasta_parse))+args.size, args.size) + splits = np.arange(0, len(list(fasta))+args.size, args.size) + nuc_probs = {'A': args.a_prob, 'T': args.t_prob, + 'G': args.g_prob, 'C': args.c_prob} for i, split in enumerate(splits): - fasta_dict = fasta_parse[split:splits[i+1]] + fasta_dict = fasta[split:splits[i+1]] term_frags = fragmentation(fasta_dict, seq_counts, + nuc_probs, args.mean, args.std, - args.A_prob, args.T_prob, - args.G_prob, args.C_prob) + ) logger.info("Writing batch %s sequences to %s...", i, args.output) with open(args.output, 'a', encoding="utf-8") as out_file: @@ -55,24 +64,32 @@ def file_validation(fasta_file: str, sep (str): Separator for counts file. Returns: - tuple: fasta and sequence counts variables + tuple: fasta dict and sequence counts pd.DataFrame """ + fasta_dict = {} with open(fasta_file, "r", encoding="utf-8") as handle: - fasta = SeqIO.parse(handle, "fasta") - if not any(fasta): - raise ValueError("Input FASTA file is either empty or \ - incorrect file type.") + fasta_sequences = SeqIO.parse(handle, "fasta") + + if not any(fasta_sequences): + raise ValueError("Input FASTA file is either empty or \ + incorrect file type.") + + for record in fasta_sequences: + fasta_dict[record.id] = str(record.seq).upper() count_path = Path(counts_file) if not count_path.is_file(): - logger.exception("Input counts file does not exist or isn't a file.") + raise FileNotFoundError("Input counts file does not exist or \ + isn't a file.") + + if sep == ",": + seq_counts = pd.read_csv(counts_file, names=["seqID", "count"]) + seq_counts = seq_counts.astype({"seqID": str}) else: - if sep == ",": - seq_counts = pd.read_csv(counts_file, names=["seqID", "count"]) - else: - seq_counts = pd.read_table(counts_file, names=["seqID", "count"]) + seq_counts = pd.read_table(counts_file, names=["seqID", "count"]) + seq_counts = seq_counts.astype({"seqID": str}) - return fasta, seq_counts + return fasta_dict, seq_counts def parse_arguments() -> argparse.Namespace: @@ -97,21 +114,21 @@ def parse_arguments() -> argparse.Namespace: help="output file path") parser.add_argument('--mean', required=False, default=300, type=check_positive, - help="Mean fragment length (default: 10)") + help="Mean fragment length (default: 300)") parser.add_argument('--std', required=False, default=60, type=check_positive, help="Standard deviation fragment length \ - (defafult: 1)") - parser.add_argument('-a', '--A_prob', required=False, default=0.22, + (defafult: 60)") + parser.add_argument('-a', '--a_prob', required=False, default=0.22, type=check_prob, help="Probability cut happens after nucleotide A") - parser.add_argument('-t', '--T_prob', required=False, default=0.25, + parser.add_argument('-t', '--t_prob', required=False, default=0.25, type=check_prob, help="Probability cut happens after nucleotide T") - parser.add_argument('-g', '--G_prob', required=False, default=0.25, + parser.add_argument('-g', '--g_prob', required=False, default=0.25, type=check_prob, help="Probability cut happens after nucleotide G") - parser.add_argument('-c', '--C_prob', required=False, default=0.28, + parser.add_argument('-c', '--c_prob', required=False, default=0.28, type=check_prob, help="Probability cut happens after nucleotide C") parser.add_argument('-s', '--size', required=False, default=10000, diff --git a/term_frag_sel/fragmentation.py b/term_frag_sel/fragmentation.py index 302fe6f41a8602688af4abcf8681ef830e91ba48..ef347cab82a652ede9f1ea3d2cd57f27920bb0cd 100644 --- a/term_frag_sel/fragmentation.py +++ b/term_frag_sel/fragmentation.py @@ -5,41 +5,81 @@ import numpy as np import pandas as pd # type: ignore +def get_cut_number(seq_len: int, mean: float) -> int: + """Get the number of cuts for a particular sequence. + + Args: + seq_len (int): length of sequence/number of nucleotides in the sequence + mean (float): mean fragment length + + Returns: + int: number of cuts + """ + if not isinstance(seq_len, int): + raise ValueError(f"Sequence length must be numeric, \ + not {type(seq_len)}") + + if not isinstance(mean, int): + raise ValueError(f"Mean must be numeric, not {type(mean)}") + + cuts_distribution = [] # distribution of cut numbers (n_cuts) + + # 1000 iterations should not be too computationally inefficient + # given the nature of the code + for _ in range(1000): + n_cuts = 0 + len_sum = 0 + while True: + len_sum += np.random.exponential(scale=mean) + if len_sum < seq_len: + n_cuts += 1 + else: + cuts_distribution.append(n_cuts) + break + + cuts_distribution.sort() + cut_counts = {x: cuts_distribution.count(x) for x in cuts_distribution} + cut_probs = [x/1000 for x in cut_counts.values()] + + # randomly ick no. of cut from cut distribution based on probability + # of cut numbers + n_cuts = np.random.choice(list(cut_counts.keys()), p=cut_probs) + + return n_cuts + + def fragmentation(fasta: dict, seq_counts: pd.DataFrame, - mean_length: int = 100, std: int = 10, - a_prob: float = 0.22, t_prob: float = 0.25, - g_prob: float = 0.25, c_prob: float = 0.28 + nuc_probs: dict, + mu_length: int = 100, std: int = 10, ) -> list: """Fragment cDNA sequences and select terminal fragment. Args: fasta_file (dict): dictionary of {transcript IDs: sequences} counts_file (pd.DataFrame): dataframe with sequence counts and IDs - mean_length (int): mean length of desired fragments + nuc_probs (dict): probability of cut occuring a certain nucleotide. \ + Ordered as A, T, G, C. E.g: {'A': 0.22, 'T': 0.25, \ + 'G': 0.25, 'C': 0.28}. + mu_length (int): mean length of desired fragments std (int): standard deviation of desired fragment lengths - a_prob (float): probability of nucleotide A - t_prob (float): probability of nucleotide T - g_prob (float): probability of nucleotide G - c_prob (float): probability of nucleotide C Returns: list: list of selected terminal fragments """ # calculated using https://www.nature.com/articles/srep04532#MOESM1 - nuc_probs = {'A': a_prob, 'T': t_prob, 'G': g_prob, 'C': c_prob} - term_frags = [] for seq_id, seq in fasta.items(): counts = seq_counts[seq_counts["seqID"] == seq_id]["count"] - for _ in range(counts): - n_cuts = int(len(seq)/mean_length) - + for _ in range(counts.iloc[0]): + # pick no. of cuts from gauss fragment length distribution # non-uniformly random DNA fragmentation implementation based on # https://www.nature.com/articles/srep04532#Sec1 # assume fragmentation by sonication for NGS workflow cuts = [] cut_nucs = np.random.choice(list(nuc_probs.keys()), - n_cuts, p=list(nuc_probs.values())) + size=get_cut_number(len(seq), + mu_length), + p=list(nuc_probs.values())) for nuc in cut_nucs: nuc_pos = [x.start() for x in re.finditer(nuc, seq)] pos = np.random.choice(nuc_pos) @@ -50,13 +90,11 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame, cuts.sort() cuts.insert(0, 0) term_frag = "" - for i, val in enumerate(cuts): - if i == len(cuts)-1: - fragment = seq[val+1:cuts[-1]] - else: - fragment = seq[val:cuts[i+1]] - if mean_length-std <= len(fragment) <= mean_length+std: - term_frag = fragment + + # check if 3' fragment is in the correct size range + if mu_length-std <= len(seq[cuts[-1]:len(seq)]) <= mu_length+std: + term_frag = seq[cuts[-1]:len(seq)] + if term_frag != "": term_frags.append(term_frag) diff --git a/tests/test_cli.py b/tests/test_cli.py index 53a94c3f5011ee77266ceb8ee8c96951d3d1ad82..4ffe56aa4c4a9dfc43f195c305dc4ada3e89536d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,13 +1,31 @@ """Test cli.py functions.""" import pytest -# from Bio import SeqIO -from term_frag_sel.cli import file_validation # type: ignore +from term_frag_sel.cli import file_validation, main # type: ignore FASTA_FILE = "tests/test_files/test.fasta" +CSV_FILE = "tests/test_files/test.csv" +TAB_FILE = "tests/test_files/test_tab.csv" + +_, csv_counts = file_validation(FASTA_FILE, CSV_FILE, ",") +_, tab_counts = file_validation(FASTA_FILE, TAB_FILE, "\t") def test_file(): - """Test check_positive function.""" + """Test file_validation function.""" + assert isinstance(file_validation(FASTA_FILE, CSV_FILE, ","), tuple) + assert isinstance(file_validation(FASTA_FILE, TAB_FILE, "\t"), tuple) + assert csv_counts.equals(tab_counts) + + with pytest.raises(FileNotFoundError): + file_validation(FASTA_FILE, "", ",") with pytest.raises(FileNotFoundError): - file_validation("", "", ",") + file_validation("", CSV_FILE, ",") + with pytest.raises(ValueError): + file_validation(CSV_FILE, CSV_FILE, ",") + + +def test_main(): + """Test main() function.""" + with pytest.raises(TypeError): + main("") diff --git a/tests/test_files/test.csv b/tests/test_files/test.csv new file mode 100644 index 0000000000000000000000000000000000000000..d6ffadaa26aed64c3281302fc419a59031c57d71 --- /dev/null +++ b/tests/test_files/test.csv @@ -0,0 +1,50 @@ +0,1,99 +1,2,130 +2,3,144 +3,4,52 +4,5,98 +5,6,85 +6,7,119 +7,8,143 +8,9,112 +9,10,87 +10,11,53 +11,12,50 +12,13,129 +13,14,112 +14,15,56 +15,16,82 +16,17,130 +17,18,98 +18,19,87 +19,20,75 +20,21,140 +21,22,141 +22,23,74 +23,24,54 +24,25,56 +25,26,56 +26,27,56 +27,28,99 +28,29,101 +29,30,101 +30,31,62 +31,32,96 +32,33,131 +33,34,117 +34,35,53 +35,36,81 +36,37,114 +37,38,106 +38,39,67 +39,40,121 +40,41,134 +41,42,105 +42,43,91 +43,44,90 +44,45,145 +45,46,59 +46,47,84 +47,48,62 +48,49,50 +49,50,86 diff --git a/tests/test_files/test_tab.csv b/tests/test_files/test_tab.csv new file mode 100644 index 0000000000000000000000000000000000000000..ece6e9e305df43d46df7f6f4f7a240585e279e4f --- /dev/null +++ b/tests/test_files/test_tab.csv @@ -0,0 +1,50 @@ +0 1 99 +1 2 130 +2 3 144 +3 4 52 +4 5 98 +5 6 85 +6 7 119 +7 8 143 +8 9 112 +9 10 87 +10 11 53 +11 12 50 +12 13 129 +13 14 112 +14 15 56 +15 16 82 +16 17 130 +17 18 98 +18 19 87 +19 20 75 +20 21 140 +21 22 141 +22 23 74 +23 24 54 +24 25 56 +25 26 56 +26 27 56 +27 28 99 +28 29 101 +29 30 101 +30 31 62 +31 32 96 +32 33 131 +33 34 117 +34 35 53 +35 36 81 +36 37 114 +37 38 106 +38 39 67 +39 40 121 +40 41 134 +41 42 105 +42 43 91 +43 44 90 +44 45 145 +45 46 59 +46 47 84 +47 48 62 +48 49 50 +49 50 86 diff --git a/tests/test_frag.py b/tests/test_frag.py index d97b176054f801a93796832ffc54c324b4076046..d6eaccd78626a6129fe4f1c4a05a9e66da43328e 100644 --- a/tests/test_frag.py +++ b/tests/test_frag.py @@ -1,23 +1,54 @@ """Test utils.py functions.""" +import pandas as pd import pytest from Bio import SeqIO -from term_frag_sel.fragmentation import fragmentation # type: ignore +from term_frag_sel.fragmentation import fragmentation, get_cut_number -with open("tests/test_files/test.fasta", "r", encoding="utf-8") as handle: - fasta = SeqIO.parse(handle, "fasta") +FASTA_FILE = "tests/test_files/test.fasta" +CSV_FILE = "tests/test_files/test.csv" -# have to create the counts file +fasta_dict = {} +with open(FASTA_FILE, "r", encoding="utf-8") as handle: + fasta_sequences = SeqIO.parse(handle, "fasta") + for record in fasta_sequences: + fasta_dict[record.id] = str(record.seq).upper() -MU = 100 -STD = 10 -A_PROB = 0.22 -C_PROB = 0.28 -T_PROB = 0.25 -G_PROB = 0.25 +seq_counts = pd.read_csv(CSV_FILE, names=["seqID", "count"]) +seq_counts = seq_counts.astype({"seqID": str}) +NUC = {'A': 0.22, 'T': 0.25, 'G': 0.25, 'C': 0.28} -def test_frag(): + +def test_get_cut_number(): + """Test get_cut_number function.""" + assert get_cut_number(100, 20) <= 15 and get_cut_number(100, 20) >= 0 + + with pytest.raises(ValueError): + get_cut_number(seq_len='a', mean=4) + with pytest.raises(ValueError): + get_cut_number(seq_len=10, mean='a') + + +def test_fragmentation(): """Test fragmentation function.""" - with pytest.raises(TypeError): - fragmentation() + assert isinstance(fragmentation(fasta_dict, seq_counts, + NUC), list) + + # no need to check string mean or std since it's checked at CLI + with pytest.raises(ValueError): + nuc_probs = {'A': 'a', 'T': 0.25, + 'G': 0.25, 'C': 0.28} + fragmentation(fasta_dict, seq_counts, nuc_probs) + with pytest.raises(ValueError): + nuc_probs = {'A': 0.22, 'T': 'a', + 'G': 0.25, 'C': 0.28} + fragmentation(fasta_dict, seq_counts, nuc_probs) + with pytest.raises(ValueError): + nuc_probs = {'A': 0.22, 'T': 0.25, + 'G': 'a', 'C': 0.28} + fragmentation(fasta_dict, seq_counts, nuc_probs) + with pytest.raises(ValueError): + nuc_probs = {'A': 0.22, 'T': 0.25, + 'G': 0.25, 'C': 'a'} + fragmentation(fasta_dict, seq_counts, nuc_probs) diff --git a/tests/test_utils.py b/tests/test_utils.py index 36084a66eb91d452b0bfd525b3e8b91c786f2b81..c7d7be2d3fb43f5511322d1bdfe2d4107ecc38ae 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,4 @@ """Test utils.py functions.""" -<<<<<<< HEAD -import pytest - -from -======= import argparse import pytest @@ -40,4 +35,3 @@ def test_prob(): check_prob("string") with pytest.raises(ValueError): check_prob("") ->>>>>>> hugo_new