Skip to content
Snippets Groups Projects
Commit e3aeefd5 authored by U-STI\morika's avatar U-STI\morika Committed by Kathleen Moriarty
Browse files

first version of read_sequencing

parent 557be05b
No related branches found
No related tags found
1 merge request!17Issue 7
""" Read Sequencing
Author: Kathleen Moriarty
Class: Programming for Life Sciences
"""
def read_sequencing(frag_file_name, output_file_name, num_reads, read_len, num_seq_cyc):
"""Reads a fasta-formatted file of terminal fragments and simulates reads of these fragments.
Simulate the sequencing of reads on the template of terminal
fragments. Reads are copies of fixed length starting
from the 5' end of fragments. If the desired read length
is larger than the fragment length, sequencing would in
principle proceed into the 3' adaptor and then would perhaps
yield random bases. For simplicity, here we assume that random
nucleotides are introduced in this case.
:param output_file_name: file name where to store the output
:param num_seq_cyc: integer of number of cycles
:param read_len: integer of identical read length
:param num_reads: number of total reads to simulate
:param frag_file_name: input file of terminal fragments
Output:
Fasta-formatted file ("../output/reads.csv") of reads of identical length, representing 5’
ends of the terminal fragments
"""
# Import classes
from random import choices, randrange
import numpy as np
# Read data from terminal fragment file
# Store fragments in a list
f = open(frag_file_name, "r")
frag_line = f.readline()
frag_list = []
frag_str = ""
while frag_line != "":
# To stop when the end of file is reached
if frag_line.startswith('>'):
if not (len(frag_list) == 0 and frag_str == ""):
frag_list.append(frag_str)
frag_str = ""
else:
frag_str = frag_str + frag_line.rstrip("\n")
frag_line = f.readline()
frag_list.append(frag_str)
# print(frag_list)
f.close()
# Initiate list to store reads from modified fragments
fasta_list = list()
# store list of random nucleotides from which to sample when read length is too short
nucleotides = ['A', 'C', 'G', 'T']
# Calculate sum of all lengths to determine the relative abundance for that fragment
sum_frags = sum(map(len, frag_list))
# Repeat the read process for given number of cycles
for j in range(1, num_seq_cyc):
# Loop through fasta fragments that start with 5'
for frag in frag_list:
# Determine number of reads to create from this fragment
# This might not always provide an exact number of reads that were asked
# TODO resolve this issue
num_frag_reads = round((len(frag)/sum_frags) * num_reads)
for i in range(1, num_frag_reads):
# Obtain random first position for the read on the fragment
rand_start = randrange(0, len(frag))
# Calculate the difference of start position and length of read
diff_start_end = len(frag)-rand_start
# If length of read is greater than difference of start to end, then add random nucleotides
if diff_start_end < read_len:
# Calculate number of random nucleotides to add to the end of the read
diff = read_len - diff_start_end
# Select random nucleotides from list of possible
rand_samp = choices(nucleotides, k=diff)
# Add the random list to the read and save
tmp_read = frag[rand_start:len(frag)] + ''.join(rand_samp)
else:
# Save subset of fragment as read
tmp_read = frag[rand_start:(rand_start + read_len)]
# append read to list
fasta_list.append(tmp_read)
# Save list to file
np.savetxt(output_file_name,
fasta_list,
delimiter=",",
fmt='%s')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment