fixes to imports

bdd0ee5d · Hugo Madge Leon · 551c2056 · bdd0ee5d · bdd0ee5d · bdd0ee5d
Commit bdd0ee5d authored 2 years ago by Hugo Madge Leon
--- a/.gitignore
+++ b/.gitignore
@@ -60,4 +60,5 @@ __pycache__/
 *_cache
 *egg-info/
 .coverage
-build/
\ No newline at end of file
+build/
+*play.py
\ No newline at end of file
--- a/term_frag_sel/cli.py
+++ b/term_frag_sel/cli.py
@@ -26,13 +26,10 @@ def main(args: argparse.Namespace):
    fasta, seq_counts = file_validation(args.fasta, args.counts, args.sep)

    logger.info("Fragmentation of %s...", args.fasta)
-    fasta_parse = {}
-    for record in fasta:
-        fasta_parse[record.id] = record.seq
-    splits = np.arange(0, len(list(fasta_parse))+args.size, args.size)
+    splits = np.arange(0, len(list(fasta))+args.size, args.size)

    for i, split in enumerate(splits):
-        fasta_dict = fasta_parse[split:splits[i+1]]
+        fasta_dict = fasta[split:splits[i+1]]
        term_frags = fragmentation(fasta_dict, seq_counts,
                                   args.mean, args.std,
                                   args.a_prob, args.t_prob,
@@ -55,14 +52,17 @@ def file_validation(fasta_file: str,
        sep (str): Separator for counts file.

    Returns:
-        tuple: fasta and sequence counts variables
+        tuple: fasta dict and sequence counts pd.DataFrame
    """
-    with open(fasta_file, "r", encoding="utf-8") as handle:
-        fasta = SeqIO.parse(handle, "fasta")
-    if not any(fasta):
+    fasta_sequences = SeqIO.parse(open(fasta_file, "r", encoding="utf-8"),'fasta')
+    if not any(fasta_sequences):
        raise ValueError("Input FASTA file is either empty or \
            incorrect file type.")

+    fasta_dict = {}
+    for record in fasta_sequences:
+        fasta_dict[record.id] = str(record.seq).upper()
+
    count_path = Path(counts_file)
    if not count_path.is_file():
        logger.exception("Input counts file does not exist or isn't a file.")
@@ -72,7 +72,7 @@ def file_validation(fasta_file: str,
        else:
            seq_counts = pd.read_table(counts_file, names=["seqID", "count"])

-    return fasta, seq_counts
+    return fasta_dict, seq_counts


 def parse_arguments() -> argparse.Namespace:

--- a/tests/test_cli.py
+++ b/tests/test_cli.py
 """Test cli.py functions."""
 import pytest
+import pandas as pd
 # from Bio import SeqIO

 from term_frag_sel.cli import file_validation  # type: ignore

 FASTA_FILE = "tests/test_files/test.fasta"
-
+CSV_FILE = "tests/test_files/test.csv"

 def test_file():
-    """Test check_positive function."""
+    """Test file_validation function."""
+    assert isinstance(file_validation(FASTA_FILE, CSV_FILE, ","), tuple[dict, pd.DataFrame])
    with pytest.raises(FileNotFoundError):
        file_validation("", "", ",")
--- a/tests/test_files/test.csv
+++ b/tests/test_files/test.csv
+,seqID,count
+0,1,120
+1,2,60
+2,3,76
+3,4,141
+4,5,107
+5,6,59
+6,7,121
+7,8,72
+8,9,114
+9,10,126
+10,11,77
+11,12,130
+12,13,146
+13,14,69
+14,15,62
+15,16,93
+16,17,103
+17,18,150
+18,19,140
+19,20,56
+20,21,103
+21,22,115
+22,23,113
+23,24,53
+24,25,95
+25,26,112
+26,27,102
+27,28,53
+28,29,139
+29,30,143
+30,31,150
+31,32,126
+32,33,50
+33,34,94
+34,35,81
+35,36,116
+36,37,51
+37,38,110
+38,39,98
+39,40,60
+40,41,57
+41,42,73
+42,43,143
+43,44,116
+44,45,98
+45,46,139
+46,47,89
+47,48,63
+48,49,68
+49,50,84
--- a/tests/test_frag.py
+++ b/tests/test_frag.py
 """Test utils.py functions."""
-import random
+import pandas as pd
 import pytest
 from Bio import SeqIO

 from term_frag_sel.fragmentation import fragmentation, get_cut_number  # type: ignore

-with open("tests/test_files/test.fasta", "r", encoding="utf-8") as handle:
-    fasta = SeqIO.parse(handle, "fasta")
+FASTA_FILE = "tests/test_files/test.fasta"
+CSV_FILE = "tests/test_files/test.csv"

-seq_counts = {}
-for entry in fasta:
-    name, sequence = entry.id, str(entry.seq)
-    seq_counts[name] = random.randint(50,150)
+fasta_sequences = SeqIO.parse(open(FASTA_FILE, "r", encoding="utf-8"),'fasta')
+fasta_dict = {}
+for record in fasta_sequences:
+    fasta_dict[record.id] = str(record.seq).upper()
+
+seq_counts = pd.read_csv(CSV_FILE, names=["seqID", "count"])

 MU = 100
 STD = 10
@@ -35,23 +37,23 @@ def test_get_cut_number():

 def test_fragmentation():
    """Test fragmentation function."""
-    assert isinstance(fragmentation(fasta, seq_counts), list)
+    assert isinstance(fragmentation(fasta_dict, seq_counts), list)

    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 'a', std = 10,
+        fragmentation(fasta_dict, seq_counts, mean_length = 'a', std = 10,
                      a_prob = 0.22, t_prob = 0.25,g_prob = 0.25, c_prob = 0.28)
    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 100, std = 'a',
+        fragmentation(fasta_dict, seq_counts, mean_length = 100, std = 'a',
                      a_prob = 0.22, t_prob = 0.25,g_prob = 0.25, c_prob = 0.28)
    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 100, std = 10,
-                      a_prob = 'a', t_prob = 0.25,g_prob = 0.25, c_prob = 0.28)
+        fragmentation(fasta_dict, seq_counts, mean_length = 100, std = 10,
+                      a_prob = 'a', t_prob = 0.25, g_prob = 0.25, c_prob = 0.28)
    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 100, std = 10,
-                      a_prob = 0.22, t_prob = 'a',g_prob = 0.25, c_prob = 0.28)
+        fragmentation(fasta_dict, seq_counts, mean_length = 100, std = 10,
+                      a_prob = 0.22, t_prob = 'a', g_prob = 0.25, c_prob = 0.28)
    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 100, std = 10,
-                      a_prob = 0.22, t_prob = 0.25,g_prob = 'a', c_prob = 0.28)
+        fragmentation(fasta_dict, seq_counts, mean_length = 100, std = 10,
+                      a_prob = 0.22, t_prob = 0.25, g_prob = 'a', c_prob = 0.28)
    with pytest.raises(ValueError):
-        fragmentation(fasta, seq_counts, mean_length = 100, std = 10,
-                      a_prob = 0.22, t_prob = 0.25,g_prob = 0.25, c_prob = 'a')
+        fragmentation(fasta_dict, seq_counts, mean_length = 100, std = 10,
+                      a_prob = 0.22, t_prob = 0.25, g_prob = 0.25, c_prob = 'a')