diff --git a/src/read_sequencing.py b/src/read_sequencing.py new file mode 100644 index 0000000000000000000000000000000000000000..aaec0e9f48c077f7cc5e3c3e2e2cc75f331c9562 --- /dev/null +++ b/src/read_sequencing.py @@ -0,0 +1,103 @@ +""" Read Sequencing +Author: Kathleen Moriarty +Class: Programming for Life Sciences +""" + + +def read_sequencing(frag_file_name, output_file_name, num_reads, read_len, num_seq_cyc): + """Reads a fasta-formatted file of terminal fragments and simulates reads of these fragments. + + Simulate the sequencing of reads on the template of terminal + fragments. Reads are copies of fixed length starting + from the 5' end of fragments. If the desired read length + is larger than the fragment length, sequencing would in + principle proceed into the 3' adaptor and then would perhaps + yield random bases. For simplicity, here we assume that random + nucleotides are introduced in this case. + + :param output_file_name: file name where to store the output + :param num_seq_cyc: integer of number of cycles + :param read_len: integer of identical read length + :param num_reads: number of total reads to simulate + :param frag_file_name: input file of terminal fragments + + Output: + Fasta-formatted file ("../output/reads.csv") of reads of identical length, representing 5’ + ends of the terminal fragments + """ + + # Import classes + from random import choices, randrange + import numpy as np + + # Read data from terminal fragment file + # Store fragments in a list + + f = open(frag_file_name, "r") + frag_line = f.readline() + frag_list = [] + frag_str = "" + while frag_line != "": + # To stop when the end of file is reached + if frag_line.startswith('>'): + if not (len(frag_list) == 0 and frag_str == ""): + frag_list.append(frag_str) + frag_str = "" + else: + frag_str = frag_str + frag_line.rstrip("\n") + frag_line = f.readline() + frag_list.append(frag_str) + # print(frag_list) + f.close() + + # Initiate list to store reads from modified fragments + fasta_list = list() + + # store list of random nucleotides from which to sample when read length is too short + nucleotides = ['A', 'C', 'G', 'T'] + + # Calculate sum of all lengths to determine the relative abundance for that fragment + sum_frags = sum(map(len, frag_list)) + + # Repeat the read process for given number of cycles + for j in range(1, num_seq_cyc): + + # Loop through fasta fragments that start with 5' + for frag in frag_list: + + # Determine number of reads to create from this fragment + # This might not always provide an exact number of reads that were asked + # TODO resolve this issue + num_frag_reads = round((len(frag)/sum_frags) * num_reads) + + for i in range(1, num_frag_reads): + + # Obtain random first position for the read on the fragment + rand_start = randrange(0, len(frag)) + + # Calculate the difference of start position and length of read + diff_start_end = len(frag)-rand_start + + # If length of read is greater than difference of start to end, then add random nucleotides + if diff_start_end < read_len: + + # Calculate number of random nucleotides to add to the end of the read + diff = read_len - diff_start_end + + # Select random nucleotides from list of possible + rand_samp = choices(nucleotides, k=diff) + + # Add the random list to the read and save + tmp_read = frag[rand_start:len(frag)] + ''.join(rand_samp) + else: + # Save subset of fragment as read + tmp_read = frag[rand_start:(rand_start + read_len)] + + # append read to list + fasta_list.append(tmp_read) + + # Save list to file + np.savetxt(output_file_name, + fasta_list, + delimiter=",", + fmt='%s')