diff --git a/src/read_sequencing.py b/src/read_sequencing.py index 6c47005b5f51e6727a6e5bd4ce00d3804c596bbd..e7e6581f3ffefb7bedbe13625cf7bbbdfdc35e6e 100644 --- a/src/read_sequencing.py +++ b/src/read_sequencing.py @@ -30,71 +30,72 @@ def read_sequencing( read_len: integer of identical read length """ # Import classes - from random import choices, randrange - import numpy as np + from random import choices from typing import List # Read data from terminal fragment file - # Store fragments in a list - f = open(frag_file_name, "r") - frag_line = f.readline() - frag_list = [] # type: List[str] - frag_str = "" - while frag_line != "": - # To stop when the end of file is reached - if frag_line.startswith('>'): - # Determine if this is the first fragment in the file - # Ignore the description line (starting with >) of the first fragment - if not (len(frag_list) == 0 and frag_str == ""): - # Not the first fragment. Append to list. - frag_list.append(frag_str) - frag_str = "" - else: - frag_str = frag_str + frag_line.rstrip("\n") - frag_line = f.readline() - frag_list.append(frag_str) - # print(frag_list) - f.close() + # Store fragment descriptions in a list + frag_desc = [] # type: List[str] - # Initiate list to store reads from modified fragments - fasta_list = list() + with open(frag_file_name, 'r') as f: + frag_line = f.readline() + frag_list = [] # type: List[str] + frag_str = "" + while frag_line != "": + # To stop when the end of file is reached + if frag_line.startswith('>'): + # Determine if this is the first fragment in the file + # Ignore the description line (starting with >) of the first fragment + if not (len(frag_list) == 0 and frag_str == ""): + # Not the first fragment. Append to list. + frag_list.append(frag_str) + frag_str = "" + # Store description line for output file + frag_desc.append(frag_line) + else: + frag_str = frag_str + frag_line.rstrip("\n") + # Read next line + frag_line = f.readline() + frag_list.append(frag_str) - # store list of random nucleotides from which to sample when read length is too short + # Store list of random nucleotides from which to sample when read length is too short nucleotides = ['A', 'C', 'G', 'T'] # Calculate sum of all lengths to determine the relative abundance for that fragment sum_frags = sum(map(len, frag_list)) - # Loop through fasta fragments that start with 5' - for frag in frag_list: - - # Determine number of reads to create from this fragment - # This might not always provide an exact number of reads that were asked - # TODO resolve this issue - num_frag_reads = round((len(frag)/sum_frags) * num_reads) - - for i in range(0, num_frag_reads): - - # If the read length is less than the required length given by the parameter, then add random nucleotides - if len(frag) < read_len: - - # Calculate number of random nucleotides to add to the end of the read - diff = read_len - read_len - - # Select random nucleotides from list of possible - rand_samp = choices(nucleotides, k=diff) - - # Add the random list to the read and save - tmp_read = frag[0:(len(frag)-1)] + ''.join(rand_samp) - else: - # Save subset of fragment as read - tmp_read = frag[0:(read_len-1)] - - # append read to list - fasta_list.append(tmp_read) - - # Save list to file - np.savetxt(output_file_name, - fasta_list, - delimiter=",", - fmt='%s') + # Open the file to save the reads + with open(output_file_name, 'w') as fw: + + # Loop through fasta fragments that start with 5' + for frag in frag_list: + # Determine number of reads to create from this fragment + # This might not always provide an exact number of reads that were asked + # TODO resolve this issue + num_frag_reads = round((len(frag)/sum_frags) * num_reads) + + for i in range(0, num_frag_reads): + # If the read length is less than the required length given by the parameter, + # then add random nucleotides + if len(frag) < read_len: + + # Calculate number of random nucleotides to add to the end of the read + diff = read_len - read_len + + # Select random nucleotides from list of possible + rand_samp = choices(nucleotides, k=diff) + + # Add the random list to the read and save + tmp_read = frag[0:len(frag)] + ''.join(rand_samp) + else: + # Save subset of fragment as read + tmp_read = frag[0:read_len] + + # Write read to file and original fragment description + fw.write(frag_desc[frag_list.index(frag)]) + fw.write(tmp_read + "\n") + +read_sequencing(frag_file_name = "../tests/resources/test_terminal_fragments.txt", + output_file_name = "reads.txt", + num_reads=90, + read_len=10) \ No newline at end of file