add: description line for reads

12ef1d8e · Kathleen Moriarty · b9d94593 · 12ef1d8e
Commit 12ef1d8e authored 3 years ago by Kathleen Moriarty
--- a/src/read_sequencing.py
+++ b/src/read_sequencing.py
@@ -30,71 +30,72 @@ def read_sequencing(
        read_len: integer of identical read length
    """
    # Import classes
-    from random import choices, randrange
+    from random import choices
-    import numpy as np
    from typing import List
    # Read data from terminal fragment file
-    # Store fragments in a list
+    # Store fragment descriptions in a list
-    f = open(frag_file_name, "r")
+    frag_desc = [] # type: List[str]
-    frag_line = f.readline()
-    frag_list = []  # type: List[str]
-    frag_str = ""
-    while frag_line != "":
-        # To stop when the end of file is reached
-        if frag_line.startswith('>'):
-            # Determine if this is the first fragment in the file
-            # Ignore the description line (starting with >) of the first fragment
-            if not (len(frag_list) == 0 and frag_str == ""):
-                # Not the first fragment. Append to list.
-                frag_list.append(frag_str)
-            frag_str = ""
-        else:
-            frag_str = frag_str + frag_line.rstrip("\n")
-        frag_line = f.readline()
-    frag_list.append(frag_str)
-    # print(frag_list)
-    f.close()
-    # Initiate list to store reads from modified fragments
+    with open(frag_file_name, 'r') as f:
-    fasta_list = list()
+        frag_line = f.readline()
+        frag_list = []  # type: List[str]
+        frag_str = ""
+        while frag_line != "":
+            # To stop when the end of file is reached
+            if frag_line.startswith('>'):
+                # Determine if this is the first fragment in the file
+                # Ignore the description line (starting with >) of the first fragment
+                if not (len(frag_list) == 0 and frag_str == ""):
+                    # Not the first fragment. Append to list.
+                    frag_list.append(frag_str)
+                frag_str = ""
+                # Store description line for output file
+                frag_desc.append(frag_line)
+            else:
+                frag_str = frag_str + frag_line.rstrip("\n")
+            # Read next line
+            frag_line = f.readline()
+        frag_list.append(frag_str)
-    # store list of random nucleotides from which to sample when read length is too short
+    # Store list of random nucleotides from which to sample when read length is too short
    nucleotides = ['A', 'C', 'G', 'T']
    # Calculate sum of all lengths to determine the relative abundance for that fragment
    sum_frags = sum(map(len, frag_list))
-    # Loop through fasta fragments that start with 5'
+    # Open the file to save the reads
-    for frag in frag_list:
+    with open(output_file_name, 'w') as fw:
-        # Determine number of reads to create from this fragment
+        # Loop through fasta fragments that start with 5'
-        # This might not always provide an exact number of reads that were asked
+        for frag in frag_list:
-        # TODO resolve this issue
+            # Determine number of reads to create from this fragment
-        num_frag_reads = round((len(frag)/sum_frags) * num_reads)
+            # This might not always provide an exact number of reads that were asked
+            # TODO resolve this issue
-        for i in range(0, num_frag_reads):
+            num_frag_reads = round((len(frag)/sum_frags) * num_reads)
-            # If the read length is less than the required length given by the parameter, then add random nucleotides
+            for i in range(0, num_frag_reads):
-            if len(frag) < read_len:
+                # If the read length is less than the required length given by the parameter,
+                # then add random nucleotides
-                # Calculate number of random nucleotides to add to the end of the read
+                if len(frag) < read_len:
-                diff = read_len - read_len
+                    # Calculate number of random nucleotides to add to the end of the read
-                # Select random nucleotides from list of possible
+                    diff = read_len - read_len
-                rand_samp = choices(nucleotides, k=diff)
+                    # Select random nucleotides from list of possible
-                # Add the random list to the read and save
+                    rand_samp = choices(nucleotides, k=diff)
-                tmp_read = frag[0:(len(frag)-1)] + ''.join(rand_samp)
-            else:
+                    # Add the random list to the read and save
-                # Save subset of fragment as read
+                    tmp_read = frag[0:len(frag)] + ''.join(rand_samp)
-                tmp_read = frag[0:(read_len-1)]
+                else:
+                    # Save subset of fragment as read
-            # append read to list
+                    tmp_read = frag[0:read_len]
-            fasta_list.append(tmp_read)
+                # Write read to file and original fragment description
-    # Save list to file
+                fw.write(frag_desc[frag_list.index(frag)])
-    np.savetxt(output_file_name,
+                fw.write(tmp_read + "\n")
-               fasta_list,
-               delimiter=",",
+read_sequencing(frag_file_name = "../tests/resources/test_terminal_fragments.txt",
-               fmt='%s')
+                output_file_name = "reads.txt",
+                num_reads=90,
+                read_len=10)
\ No newline at end of file