add README

bd397e57 · sunhollyjolly · dfdafafe · bd397e57 · bd397e57 · bd397e57
Commit bd397e57 authored 2 years ago by sunhollyjolly
--- a/frag_package/fragmentation_v2.py
+++ b/frag_package/fragmentation_v2.py
-import re
-
-import numpy as np
-import pandas as pd
-
-
-def fasta_process(fasta_file):
-    with open(fasta_file, "r") as f:
-        lines = f.readlines()
-
-        ident_pattern = re.compile('>(\S+)')
-        seq_pattern = re.compile('^(\S+)$')
-
-        genes = {}
-        for line in lines:
-            if ident_pattern.search(line):
-                seq_id = (ident_pattern.search(line)).group(1)
-            elif seq_id in genes.keys():
-                genes[seq_id] += (seq_pattern.search(line)).group(1)
-            else:
-                genes[seq_id] = (seq_pattern.search(line)).group(1)
-    return genes
-
-def fragmentation(fasta_file, counts_file, mean_length, std):
-    fasta = fasta_process(fasta_file)
-    seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"])
-
-    nucs = ['A','T','G','C']
-    mononuc_freqs = [0.22, 0.25, 0.23, 0.30]
-
-    term_frags = [] 
-    for seq_id, seq in fasta.items():
-        counts = seq_counts[seq_counts["seqID"] == seq_id]["count"]
-        for _ in range(counts): 
-            n_cuts = int(len(seq)/mean_length)
-            
-            # non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1
-            # assume fragmentation by sonication for NGS workflow
-            cuts = []
-            cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs) 
-            for nuc in cut_nucs:
-                nuc_pos = [x.start() for x in re.finditer(nuc, seq)]
-                pos = np.random.choice(nuc_pos)
-                while pos in cuts:
-                    pos = np.random.choice(nuc_pos)
-                cuts.append(pos)
-
-            cuts.sort() 
-            cuts.insert(0,0)
-            term_frag = ""
-            for i, val in enumerate(cuts):
-                if i == len(cuts)-1:
-                    fragment = seq[val+1:cuts[-1]]
-                else:
-                    fragment = seq[val:cuts[i+1]]
-                if mean_length-std <= len(fragment) <= mean_length+std:
-                    term_frag = fragment
-            if term_frag == "":
-                continue
-            else:
-                term_frags.append(term_frag)
-    return term_frags
-
+"""Fragment sequences."""
+import re
+
+import numpy as np
+import pandas as pd  # type: ignore
+
+
+def fragmentation(fasta: dict, seq_counts: pd.DataFrame,
+                  mean_length: int = 100, std: int = 10,
+                  a_prob: float = 0.22, t_prob: float = 0.25,
+                  g_prob: float = 0.25, c_prob: float = 0.28
+                  ) -> list:
+    """Fragment cDNA sequences and select terminal fragment.
+
+    Args:
+        fasta_file (dict): dictionary of {transcript IDs: sequences}
+        counts_file (pd.DataFrame): dataframe with sequence counts and IDs
+        mean_length (int): mean length of desired fragments
+        std (int): standard deviation of desired fragment lengths
+        a_prob (float): probability of nucleotide A
+        t_prob (float): probability of nucleotide T
+        g_prob (float): probability of nucleotide G
+        c_prob (float): probability of nucleotide C
+
+    Returns:
+        list: list of selected terminal fragments
+    """
+    # calculated using https://www.nature.com/articles/srep04532#MOESM1
+    nuc_probs = {'A': a_prob, 'T': t_prob, 'G': g_prob, 'C': c_prob}
+
+    term_frags = []
+    for seq_id, seq in fasta.items():
+        counts = seq_counts[seq_counts["seqID"] == seq_id]["count"]
+        for _ in range(counts):
+            n_cuts = int(len(seq)/mean_length)
+
+            # non-uniformly random DNA fragmentation implementation based on
+            # https://www.nature.com/articles/srep04532#Sec1
+            # assume fragmentation by sonication for NGS workflow
+            cuts = []
+            cut_nucs = np.random.choice(list(nuc_probs.keys()),
+                                        n_cuts, p=list(nuc_probs.values()))
+            for nuc in cut_nucs:
+                nuc_pos = [x.start() for x in re.finditer(nuc, seq)]
+                pos = np.random.choice(nuc_pos)
+                while pos in cuts:
+                    pos = np.random.choice(nuc_pos)
+                cuts.append(pos)
+
+            cuts.sort()
+            cuts.insert(0, 0)
+            term_frag = ""
+            for i, val in enumerate(cuts):
+                if i == len(cuts)-1:
+                    fragment = seq[val+1:cuts[-1]]
+                else:
+                    fragment = seq[val:cuts[i+1]]
+                if mean_length-std <= len(fragment) <= mean_length+std:
+                    term_frag = fragment
+            if term_frag != "":
+                term_frags.append(term_frag)
+
+    return term_frags
--- a/term_frag_sel/utils.py
+++ b/term_frag_sel/utils.py
+"""Utility functions for CLI arguments."""
+import argparse
+
+
+def check_positive(value: str) -> int:
+    """Check input value is a positive integer.
+
+    Args:
+        value (str): command line parameter
+
+    Raises:
+        argparse.ArgumentTypeError: received a negative integer
+        argparse.ArgumentTypeError: received a non-integer value
+
+    Returns:
+        integer: integer version of input value
+    """
+    try:
+        ivalue = round(float(value))
+        if ivalue <= 0:
+            raise argparse.ArgumentTypeError(f"""Expected positive integer,
+                                             got negative integer: {value}""")
+    except ValueError as exc:
+        raise argparse.ArgumentTypeError(f"""Expected positive integer,
+                                         got: {value}""") from exc
+    else:
+        return ivalue
+
+
+def check_prob(value: str) -> float:
+    """Check probability value is within ]0,1] range.
+
+    Args:
+        value (str): command line parameter
+
+    Raises:
+        argparse.ArgumentTypeError: received a value outside valid range
+
+    Returns:
+        float: float version of input value
+    """
+    pvalue = float(value)
+    if pvalue <= 0 or pvalue > 1:
+        raise argparse.ArgumentTypeError("""Expected a positive float between
+                                         0 and 1, but got {value}""")
+    return pvalue
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
+"""Test cli.py functions."""
+import pytest
+# from Bio import SeqIO
+
+from term_frag_sel.cli import file_validation  # type: ignore
+
+FASTA_FILE = "tests/test_files/test.fasta"
+
+
+def test_file():
+    """Test check_positive function."""
+    with pytest.raises(FileNotFoundError):
+        file_validation("", "", ",")
--- a/tests/test_files/50_seqs_50_1000_bp.fasta
+++ b/tests/test_files/50_seqs_50_1000_bp.fasta
--- a/tests/test_files/test.fasta
+++ b/tests/test_files/test.fasta
+>1 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 481 bp
+tgagcactcggtgccaagggcggggatacacagatggttggctgatacaaccgggactta
+aattccctagactagatctgtgttggaacgcctctctacg
+>2 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 495 bp
+ctgaatcaggtgtaggttctttttacgtcgtttaaggagctacacggtatcttgttttca
+gttaaggtgccacacccccgggtggatcatccgtcagctt
+>3 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 193 bp
+acttcagtactggaaggatctaggaaccattaatgcgagtgtggtgacgccagacgaccc
+ccggtgttctgccaccttctttggataggagaaccgtcac
+>4 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 625 bp
+acgtctggagcgtgggttgacccctgtacatggttctttccggatccttaacgtgccgat
+acaactcaaaggtaactgtgcttaccacttccgaagctac
+>5 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 845 bp
+agagcgtacggcgcgcatcgtataccctacgagggcggcgtgtggaggaacgctgggctg
+acactgtagaagattagatacacttgtccctaaaattaac
+>6 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 703 bp
+tgcagtcgatgtgctattcgttttaggcagtctacgcgcttagtaactcccacggccata
+gacttatctcagacatggaccatgtcgatatcggacgccg
+>7 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 243 bp
+actctttagaatgggtttcactaatagtacgtgcatacaatttcgtcagaaagggcgctt
+gctaagggacacggatcaatgatgaccagacttatggtgt
+>8 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 863 bp
+attggcccggtccaggacagagccttatattgctactggtatgagaaccgttctgacgta
+aacttgatggctttacgcctgcacgggcttcatacacaca
+>9 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 494 bp
+aagcgaaactcctagaacttcccatcaggcaatcgtgtcccacgaagcacggatactacg
+ggcactagttgaatggggggtttttttcgtaggtcgtaat
+>10 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 86 bp
+atcctagcgccaaagatttactgttatggggtcgacgaacactagccgataatgccgtcc
+tgggatctctagcctagtattatgcgTCTTCGGAGCAGGG
+>11 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 360 bp
+cgcctgagggtcctaaatctgacgtatgatcgaagagattggaaggtcccggcgggtcac
+cccacgttgcgatcatggccaaggccatggtttgctcaaa
+>12 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 140 bp
+gaattcctggggatttactcacccccgaggcggacaagatttccagctggatcaccgagg
+gttacttaatcccttcgatgctttcaaaggccctaatcag
+>13 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 832 bp
+aatactctcgttgaagcgtcggacagtaaagtgagagatttcggcccacggtagtcggac
+attctcagtggggagcgaagagttgcgcttagagccgacg
+>14 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 296 bp
+atcggggtgcgaaatcccctgagctggttgactacatacgtaaccacgttccgtgcgtca
+tctaagcgtatcggctcatactggtggtaactagacttgg
+>15 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 515 bp
+accttcaatttgttcgcccgggacaagtagaaattactgtaaactaaacttaacctattc
+cttgttaaagtccgcaccaagtgtactgtaagaatggtcg
+>16 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 820 bp
+ccggctcaatcctgtagaaccgcgtacaacacacccaagctataccgcacacggcgcctt
+agcaaccactgcttatctgcgtattatacctttacaatca
+>17 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 791 bp
+attgttagggcctgtccggaaaagatcaacggaagatattcaccagcacctatgctgact
+cacgtagttcccgacgttcagtcccctccaacgtggaagg
+>18 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 328 bp
+accgattacaggcagtcggccttgtccgctcgtatatccagggatgttccaccgaaagtg
+ggagtgtggcacttattggtaaaaggcatttttacgaacg
+>19 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp
+ggagtggaaaattctgtagtccgttggcggcgaccgcaaaccagaataatatggtcacgt
+taggccctcgggccccttcatatgtacggagtcattgaat
+>20 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 440 bp
+atcttaaacagcccaatcggctcgccgaccaatttcccgcttcacagtacgcggaagaat
+ctgcagatagaagtcagccctctcacgtcaataggaatgc
+>21 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 840 bp
+cataactcgtgagtggccctgtacaagtcattgcatcacaatccttgcaatttgctcctt
+tggccaagcgtacaagaccccggacccatacgctcccggc
+>22 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 234 bp
+caccgcgaaagtgactcagttttcccggtcttatcacggtcgttgtcgtccagattccgg
+ttgttaactgcgggagctataacacttattccttactgcg
+>23 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 917 bp
+atcaagtgattacctggtaacccgccgctcttgcagtgttcaccctttgtgtcgtcttag
+tgtttgtacacgttaaggaaaagcgttagcttaaccatta
+>24 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 676 bp
+cacacggcatcgcaaagcgagctatccagagatgatacatgtggttgaaggtgattgcgt
+caacatgggggttgctcagtttggttggtcaatcaacggt
+>25 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 870 bp
+ctgatcaccaatagcttgcgcttaacacacgcgccttacaattatatgacgcccttgcca
+atgacagatagagccattaatcgtggaaaccaggcattta
+>26 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 751 bp
+gggtgcgttatggggactaaagactgttactaccggtactccgccttatagagccgtcac
+gtattaatcagctatcaacagatactatcgtcacagccct
+>27 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 574 bp
+gtactgcaccttgcactgctatctacaatgccgagggtcgccctagtgctttgcatgttt
+ggcctctacctacgagtctacgcgggcgtttttaagcaag
+>28 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 169 bp
+agctccctaaacaacacccgcgtaaaaccttcagttatggtgccgactaaccctgtggat
+gtcttagcgctctcgttccgatgggtgctgatactagtaa
+>29 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 408 bp
+tgcagtgatgcatcgataagaccgcatagttacctccttacaggtgacgctaggctaatt
+gggagtgctggcacttgtgccctacagtcaagcgctcacg
+>30 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 52 bp
+caaagcgattcgggttaacgcacttaagagttcgacgtaggttagtcccctcCTTTTAGG
+TGTCTAACTAGAGAGGCGCCCTATTGGGCTCAGGATGACG
+>31 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 581 bp
+tgctctgacgtgtaagcgccttcgataacgtctttgcagcgccccacaaagtaaggaccg
+gtctaacagggcttccgaatcaatagactgatagtaatgg
+>32 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp
+gcggaactacctctctaagaccgcacaacaagtgtagtagatgaagatcacgcagagtgc
+tcggcactgcatttttatacgtcgaatcagaaacgaggtt
+>33 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 297 bp
+gccctcgcgccagcttacttttagaaaacatcgaccggtaagagatacctgggtgagctg
+ggcttcacgacatgttcttaaatcaatactctaaatctgc
+>34 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 573 bp
+gcctaggggtcttgaccacagggagtacgagcattgatcattggagcaggtggctaatat
+tgatagtggttagaccaccggcgcatcatcgtacgagcgc
+>35 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 559 bp
+gaaaaagtcgccccattcagttacaatcgtcttcagaagccagctcggttggggctatct
+gcggggtaatgcaacagggggctaccagacggtaaaccag
+>36 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 187 bp
+ctaagtccttatctatgatgcatctttcgttactgcgacaatatccgagacgagcagagt
+tacacgccgaggtgtaaacgaatacgattgctatatgcaa
+>37 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 549 bp
+ttcatatggggatttggaatcgggtttgtgcggaatatgcccacgagactgcttatgtca
+acgagacgacccattgtcacgttgtaaggccaccaataac
+>38 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 916 bp
+gtggcctaccataaatcaatttgggttaacgctctttgatctacgcactatgttgattca
+cttaccccttgtcaccgggcagaagagagccagtttaggt
+>39 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 848 bp
+accccggtcgctttggccggtcgtagccctaatcaattctgttcgtatcactaaagtaac
+ggtttgaaatcctttgcaaacttgatctgggtatatgaac
+>40 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 289 bp
+agagcaaaagaaagtctgctccgcgtgacacacttgctcgttgtagtaactgcacgcgcc
+gtctactcgacagggaccccccgtcggttcctctctatag
+>41 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 642 bp
+ctggggaatgaccgtaccgatctaattccccgtcgaaaaacttatgacgcgcagttgtcc
+ttatgcttgagacatgaatccttgccccatattggcgatc
+>42 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 993 bp
+gcaagaagccaaaaaccttgcaggaggtcatttaagtttacccgcgcataagcagagacg
+gacctctctgagatctcgcaccgcgcgcccggccggcact
+>43 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 473 bp
+ggttgtccaggcgcgagcaagtagctgactcgctaatcttaacgagtattgcttaggact
+tccaaatactccaagacgtcaatacgctttatctttgtga
+>44 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 272 bp
+gtagaacttgttccccatggacaatgctagttccgttaatgccaggtattcatgtgccaa
+gcgcctgcctggggaatacgagcctctctacaaacttacg
+>45 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 860 bp
+aaaagcatcactctaacgacgctaccgtctgaatagatcaagattgctatcggttcgacc
+ttgatcgcatgtgaacccgcccaaaaacccgtctcgacaa
+>46 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 884 bp
+aagcctctacaggctctgcggtttggctttacttaacggtgagtcaggaaaacattactg
+ctacgttcaccgtgttcagagatagagagtacattaggga
+>47 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 888 bp
+gcgccttgaagaggcgaggtctaaaggcaaaaatttagatccgccctatgagacggccga
+cgcggagaattccctaaccactattgtcctctgcatcgat
+>48 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 588 bp
+catcaagatgggttacgtaggaccgagattcagtctctgggttagagccgacagcggggc
+cgctacatagtacacggcgaggaatgcggggttgggctga
+>49 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 626 bp
+taacctcagtctcgttcccccctcggtagttcggacccttattcgcttatctcacattca
+tcactgtagaccaaggaccgggcatacttgcggatatcta
+>50 |A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 214 bp
+taactgtcggtcactgctcatcccgactagttcggctcactagacttactcgcggaagcg
+agaagtaggacgtcgtgtaatactccaacgtcgttacgca
--- a/tests/test_frag.py
+++ b/tests/test_frag.py
+"""Test utils.py functions."""
+import pytest
+from Bio import SeqIO
+
+from term_frag_sel.fragmentation import fragmentation  # type: ignore
+
+with open("tests/test_files/test.fasta", "r", encoding="utf-8") as handle:
+    fasta = SeqIO.parse(handle, "fasta")
+
+# have to create the counts file
+
+MU = 100
+STD = 10
+A_PROB = 0.22
+C_PROB = 0.28
+T_PROB = 0.25
+G_PROB = 0.25
+
+
+def test_frag():
+    """Test fragmentation function."""
+    with pytest.raises(TypeError):
+        fragmentation()
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
+"""Test utils.py functions."""
+<<<<<<< HEAD
+import pytest
+
+from 
+=======
+import argparse
+import pytest
+
+from term_frag_sel.utils import check_positive, check_prob  # type: ignore
+
+
+def test_positive():
+    """Test check_positive function."""
+    assert check_positive("1") == 1
+    assert check_positive("100") == 100
+    assert check_positive("2.2") == 2
+    assert check_positive("2.6") == 3
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_positive("0")
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_positive("-1")
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_positive("string")
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_positive("")
+
+
+def test_prob():
+    """Test check_prob function."""
+    assert check_prob("0.1") == 0.1
+    assert check_prob("1") == 1.0
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_prob("0")
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_prob("10")
+    with pytest.raises(argparse.ArgumentTypeError):
+        check_prob("-1")
+    with pytest.raises(ValueError):
+        check_prob("string")
+    with pytest.raises(ValueError):
+        check_prob("")
+>>>>>>> hugo_new