diff --git a/src/read_sequencing.py b/src/read_sequencing.py index 29ba72ee860ef5c11251a830907aeddcd4f0e38e..085316d2b9e62aa32ab7afbc82e9cb923422fcbf 100644 --- a/src/read_sequencing.py +++ b/src/read_sequencing.py @@ -10,7 +10,6 @@ def read_sequencing( output_file_name, num_reads, read_len, - num_seq_cyc, ) -> None: """Reads a fasta-formatted file of terminal fragments and simulates reads. @@ -29,7 +28,6 @@ def read_sequencing( output_file_name (string): file name where to store the output num_reads: number of total reads to simulate read_len: integer of identical read length - num_seq_cyc: integer of number of cycles """ # Import classes from random import choices, randrange @@ -38,7 +36,6 @@ def read_sequencing( # Read data from terminal fragment file # Store fragments in a list - f = open(frag_file_name, "r") frag_line = f.readline() frag_list = [] # type: List[str] @@ -68,42 +65,39 @@ def read_sequencing( # Calculate sum of all lengths to determine the relative abundance for that fragment sum_frags = sum(map(len, frag_list)) - # Repeat the read process for given number of cycles - for j in range(0, num_seq_cyc): - - # Loop through fasta fragments that start with 5' - for frag in frag_list: + # Loop through fasta fragments that start with 5' + for frag in frag_list: - # Determine number of reads to create from this fragment - # This might not always provide an exact number of reads that were asked - # TODO resolve this issue - num_frag_reads = round((len(frag)/sum_frags) * num_reads) + # Determine number of reads to create from this fragment + # This might not always provide an exact number of reads that were asked + # TODO resolve this issue + num_frag_reads = round((len(frag)/sum_frags) * num_reads) - for i in range(0, num_frag_reads): + for i in range(0, num_frag_reads): - # Obtain random first position for the read on the fragment - rand_start = randrange(0, len(frag)) + # Obtain random first position for the read on the fragment + rand_start = randrange(0, len(frag)) - # Calculate the difference of start position and length of read - diff_start_end = len(frag)-rand_start + # Calculate the difference of start position and length of read + diff_start_end = len(frag)-rand_start - # If length of read is greater than difference of start to end, then add random nucleotides - if diff_start_end < read_len: + # If length of read is greater than difference of start to end, then add random nucleotides + if diff_start_end < read_len: - # Calculate number of random nucleotides to add to the end of the read - diff = read_len - diff_start_end + # Calculate number of random nucleotides to add to the end of the read + diff = read_len - diff_start_end - # Select random nucleotides from list of possible - rand_samp = choices(nucleotides, k=diff) + # Select random nucleotides from list of possible + rand_samp = choices(nucleotides, k=diff) - # Add the random list to the read and save - tmp_read = frag[rand_start:len(frag)] + ''.join(rand_samp) - else: - # Save subset of fragment as read - tmp_read = frag[rand_start:(rand_start + read_len)] + # Add the random list to the read and save + tmp_read = frag[rand_start:len(frag)] + ''.join(rand_samp) + else: + # Save subset of fragment as read + tmp_read = frag[rand_start:(rand_start + read_len)] - # append read to list - fasta_list.append(tmp_read) + # append read to list + fasta_list.append(tmp_read) # Save list to file np.savetxt(output_file_name, diff --git a/tests/test_read_sequence.py b/tests/test_read_sequence.py index 2af0a2db26ebdd18eeb78261b267fb337e909b48..00caf1f9bf8162cdf24922b0296b74d442465bc9 100644 --- a/tests/test_read_sequence.py +++ b/tests/test_read_sequence.py @@ -5,13 +5,12 @@ from src.read_sequencing import read_sequencing def test_read_sequencing(tmpdir): - """Tests the output, input file name and separator.""" + """Tests the correct number of reads were generated.""" read_sequencing( frag_file_name='./tests/resources/test_terminal_fragments.txt', num_reads=80, read_len=10, - num_seq_cyc=5, output_file_name=tmpdir / 'reads.txt' ) df_out = pd.read_table(tmpdir / 'reads.txt', header=None) - assert df_out.shape[0] == 80 * 5 + assert df_out.shape[0] == 80