first version of read_sequencing

e3aeefd5 · U-STI\morika · Kathleen Moriarty · 557be05b · e3aeefd5
Commit e3aeefd5 authored 3 years ago by $U-STI\morika's avatar$ U-STI\morika Committed by Kathleen Moriarty 3 years ago
--- a/src/read_sequencing.py
+++ b/src/read_sequencing.py
+""" Read Sequencing
+Author: Kathleen Moriarty
+Class: Programming for Life Sciences
+"""
+def read_sequencing(frag_file_name, output_file_name, num_reads, read_len, num_seq_cyc):
+    """Reads a fasta-formatted file of terminal fragments and simulates reads of these fragments.
+    Simulate the sequencing of reads on the template of terminal
+    fragments. Reads are copies of fixed length starting
+    from the 5' end of fragments. If the desired read length
+    is larger than the fragment length, sequencing would in
+    principle proceed into the 3' adaptor and then would perhaps
+    yield random bases. For simplicity, here we assume that random
+    nucleotides are introduced in this case.
+    :param output_file_name: file name where to store the output
+    :param num_seq_cyc: integer of number of cycles
+    :param read_len: integer of identical read length
+    :param num_reads: number of total reads to simulate
+    :param frag_file_name: input file of terminal fragments
+    Output:
+    Fasta-formatted file ("../output/reads.csv") of reads of identical length, representing 5’
+    ends of the terminal fragments
+    """
+    # Import classes
+    from random import choices, randrange
+    import numpy as np
+    # Read data from terminal fragment file
+    # Store fragments in a list
+    f = open(frag_file_name, "r")
+    frag_line = f.readline()
+    frag_list = []
+    frag_str = ""
+    while frag_line != "":
+        # To stop when the end of file is reached
+        if frag_line.startswith('>'):
+            if not (len(frag_list) == 0 and frag_str == ""):
+                frag_list.append(frag_str)
+            frag_str = ""
+        else:
+            frag_str = frag_str + frag_line.rstrip("\n")
+        frag_line = f.readline()
+    frag_list.append(frag_str)
+    # print(frag_list)
+    f.close()
+    # Initiate list to store reads from modified fragments
+    fasta_list = list()
+    # store list of random nucleotides from which to sample when read length is too short
+    nucleotides = ['A', 'C', 'G', 'T']
+    # Calculate sum of all lengths to determine the relative abundance for that fragment
+    sum_frags = sum(map(len, frag_list))
+    # Repeat the read process for given number of cycles
+    for j in range(1, num_seq_cyc):
+        # Loop through fasta fragments that start with 5'
+        for frag in frag_list:
+            # Determine number of reads to create from this fragment
+            # This might not always provide an exact number of reads that were asked
+            # TODO resolve this issue
+            num_frag_reads = round((len(frag)/sum_frags) * num_reads)
+            for i in range(1, num_frag_reads):
+                # Obtain random first position for the read on the fragment
+                rand_start = randrange(0, len(frag))
+                # Calculate the difference of start position and length of read
+                diff_start_end = len(frag)-rand_start
+                # If length of read is greater than difference of start to end, then add random nucleotides
+                if diff_start_end < read_len:
+                    # Calculate number of random nucleotides to add to the end of the read
+                    diff = read_len - diff_start_end
+                    # Select random nucleotides from list of possible
+                    rand_samp = choices(nucleotides, k=diff)
+                    # Add the random list to the read and save
+                    tmp_read = frag[rand_start:len(frag)] + ''.join(rand_samp)
+                else:
+                    # Save subset of fragment as read
+                    tmp_read = frag[rand_start:(rand_start + read_len)]
+                # append read to list
+                fasta_list.append(tmp_read)
+    # Save list to file
+    np.savetxt(output_file_name,
+               fasta_list,
+               delimiter=",",
+               fmt='%s')