Skip to content
Snippets Groups Projects

adding cDNA generation script

Closed Suvarnan Selliah requested to merge suvi into main
1 file
+ 239
0
Compare changes
  • Side-by-side
  • Inline
+ 239
0
 
#!/usr/bin/env python3
 
#Author: Suvarnan Selliah
 
 
class GeneratecDNA:
 
 
def generatecDNA(fasta, gtf, cp_nr, my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv", placeholder = "ph-file.csv"):
 
 
cDNA_transcript_id = 1
 
#READING INPUT FILES / PART I
 
#open files
 
with open(fasta, 'r') as fa, open(gtf, 'r') as gt, open(cp_nr, 'r') as cp:
 
 
#read fasta-file
 
for myFastaline in fa:
 
 
#search for transcript id and transcript sequence in fasta-file
 
fasta_id = ""
 
fasta_seq = ""
 
fasta_id_found = False
 
fasta_seq_found = False
 
currentFastaString = myFastaline
 
if fasta_id_found == False:
 
position_of_start = currentFastaString.find('>')
 
if position_of_start != 0:
 
continue
 
elif position_of_start == 0:
 
fasta_id = myFastaline
 
fasta_id = fasta_id.replace(">", "")
 
fasta_id_found = True
 
continue
 
else:
 
print("FASTA: Start position in fasta file not found")
 
break
 
if fasta_id_found == True and fasta_seq_found == False:
 
while fasta_seq_found == False:
 
currentFastaString = fa.readline()
 
zero_position = currentFastaString[0]
 
if zero_position == ";":
 
continue
 
elif zero_position == ">":
 
print("FASTA: No Sequence after headline")
 
break
 
else:
 
fasta_seq = currentFastaString
 
fasta_seq_found = True
 
 
#starting to work with gtf-file
 
#defining variables for gtf-file
 
gtf_seqname = ''
 
gtf_start = 0
 
gtf_end = 0
 
gtf_score = 0.0
 
gtf_info_found = False
 
gtf_entries = 0
 
gtf_list_of_lines = []
 
gtf_prob_list = []
 
 
#defining variables for csv-file
 
csv_trans_id = ""
 
csv_gene_id = ""
 
csv_count = 0
 
csv_info_found = False
 
 
if fasta_id_found == True and fasta_seq_found == True:
 
# search for transcript id from fasta-file in gtf-file
 
for myGTFline in gt:
 
currentGTFString = myGTFline
 
gtf_list = currentGTFString.split('\t')
 
gtf_seqname = gtf_list[0]
 
if gtf_seqname == fasta_id:
 
gtf_entries += 1
 
gtf_start = gtf_list[3]
 
gtf_end = gtf_list[4]
 
gtf_score = gtf_list[5]
 
gtf_temp_list = []
 
gtf_temp_list.append(gtf_start)
 
gtf_temp_list.append(gtf_end)
 
gtf_temp_list.append(gtf_score)
 
gtf_list_of_lines.append(gtf_temp_list)
 
gtf_prob_list.append(gtf_score)
 
else:
 
continue
 
if gtf_entries != 0:
 
gtf_info_found = True
 
fasta_id_found = False
 
fasta_seq_found = False
 
assert gtf_info_found, "Sequence ID from fasta-file not found in gtf-file"
 
 
#search copy number of transcript in 3. file/csv-file
 
for myCSVline in cp:
 
currentCSVString = myCSVline
 
csv_list = currentCSVString.split(',')
 
csv_trans_id = csv_list[0]
 
if csv_trans_id == fasta_id:
 
csv_gene_id = csv_list[1]
 
csv_count = csv_list[2]
 
csv_info_found = True
 
gtf_info_found = False
 
break
 
else:
 
continue
 
assert csv_info_found, "Data (TranscriptID,GeneID,Count) from csv-file/3.file not found"
 
 
#COMPUTATION & OUTPUT / PART II
 
#set score to 0 for primimg sites close to the end of sequence
 
csv_info_found = False
 
seq_len = len(fasta_seq)
 
seq_len -= 22
 
gtf_list_of_lines_len = len(gtf_list_of_lines)
 
for i in range(gtf_list_of_lines_len):
 
if gtf_list_of_lines[i][1] > seq_len:
 
gtf_list_of_lines[i][2] = 0.0
 
gtf_prob_list[i] = 0.0
 
 
#assign priming sites (according to score) to copy number
 
sum_of_score = 0.0
 
for i in gtf_prob_list:
 
sum_of_score += i
 
one_score = 100/sum_of_score
 
norm_list = []
 
gtf_prob_list_len = len(gtf_prob_list)
 
for i in range(gtf_prob_list_len):
 
norm_list.append((one_score * gtf_prob_list[i]))
 
one_norm = csv_count / 100
 
distr_RNA_to_prim_sites = []
 
norm_list_len = len(norm_list)
 
for i in range(norm_list_len):
 
distr_RNA_to_prim_sites.append((one_norm * norm_list[i]))
 
total_RNA_number = 0
 
distr_RNA_to_prim_sites_len = len(distr_RNA_to_prim_sites)
 
for i in range(distr_RNA_to_prim_sites_len):
 
total_RNA_number += distr_RNA_to_prim_sites[i]
 
new_distr_RNA_to_prim_sites = []
 
if total_RNA_number != csv_count:
 
for i in range(distr_RNA_to_prim_sites_len):
 
new_distr_RNA_to_prim_sites.append(int(distr_RNA_to_prim_sites[i]))
 
new_distr_RNA_to_prim_sites_len = len(new_distr_RNA_to_prim_sites)
 
counter = 0
 
while total_RNA_number != csv_count:
 
new_distr_RNA_to_prim_sites[counter] = round(distr_RNA_to_prim_sites[counter])
 
counter += 1
 
assert counter <= new_distr_RNA_to_prim_sites_len, "Calculated RNA transcripts (assigned to priming sites) are more than initial count"
 
 
#order the priming sites
 
prim_sites_ordered = []
 
for i in range(gtf_list_of_lines_len):
 
prim_sites_ordered.append(gtf_list_of_lines[i][0])
 
prim_sites_ordered.sort()
 
 
#searching for 2 priming sites
 
prim_sites_ordered_len = len(prim_sites_ordered)
 
ph_1 = 0
 
ph_2 = 0
 
for i in range(0, (prim_sites_ordered_len-1), 2):
 
if prim_sites_ordered[i] == 0.0:
 
continue
 
else:
 
search_for_1 = prim_sites_ordered[i]
 
search_for_2 = prim_sites_ordered[i+1]
 
for j in range(gtf_list_of_lines_len):
 
if gtf_list_of_lines[j][0] == search_for_1:
 
ph_1 = j
 
if gtf_list_of_lines[j][0] == search_for_2:
 
ph_2 = j
 
#making cDNA and comparing in library
 
start_1 = gtf_list_of_lines[ph_1][0]
 
start_2 = gtf_list_of_lines[ph_2][0]
 
start_1 -= 1
 
start_2 -= 1
 
trans_between_prim_sites = fasta_seq[start_1:start_2]
 
cDNA = ''
 
for element in range(0, len(trans_between_prim_sites)):
 
if trans_between_prim_sites[element] == 'A':
 
cDNA[element] = 'T'
 
elif trans_between_prim_sites[element] == 'U':
 
cDNA[element] = 'A'
 
elif trans_between_prim_sites[element] == 'G':
 
cDNA[element] = 'C'
 
elif trans_between_prim_sites[element] == 'C':
 
cDNA[element] = 'G'
 
else:
 
assert False, "cDNA synthesis failed, position is not A,U,G or C in transcript"
 
# open output files
 
if i == 0:
 
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
 
myfasta.write(">" + string(cDNA_transcript_id))
 
myfasta.write("\n")
 
myfasta.write(cDNA)
 
mycsv.write(",".join([cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
 
else:
 
found_cDNA_id = ''
 
found_cDNA_id_bool = False
 
with open(my_output_fasta, 'r') as myfasta, open(my_output_csv, 'r') as mycsv, open(placeholder, 'w') as phf:
 
for myline in myfasta:
 
pos = myline.find('>')
 
if pos != 0:
 
continue
 
if pos == 0:
 
fid = myline
 
fid = fid.replace(">", "")
 
fbool = False
 
while fbool == False:
 
myline = myfasta.readline()
 
zer = myline[0]
 
if zer == ";":
 
continue
 
elif zer == ">":
 
assert False, "Error in searching cDNA in output fasta-file"
 
else:
 
fseq = myline
 
fbool = True
 
if fseq ==cDNA:
 
found_cDNA_id = fid
 
found_cDNA_id_bool = True
 
break
 
if found_cDNA_id_bool == True:
 
for myline_csv_out in mycsv:
 
csvlist = myline_csv_out.split(',')
 
csvcDNAid = csvlist[0]
 
if csvcDNAid == found_cDNA_id:
 
csvgeneid = csvlist[1]
 
csvcDNAcount = csvlist[2]
 
csvcDNAcount += new_distr_RNA_to_prim_sites[ph_1]
 
phf.write(",".join([csvcDNAid, csvgeneid, csvcDNAcount]))
 
else:
 
phf.write(myline_csv_out)
 
if found_cDNA_id_bool == True:
 
with open(my_output_csv, 'w') as mycsv, open(placeholder, 'r') as phf:
 
for myplaceholder in phf:
 
mycsv.write(myplaceholder)
 
else:
 
cDNA_transcript_id += 1
 
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
 
myfasta.write(">" + string(cDNA_transcript_id))
 
myfasta.write("\n")
 
myfasta.write(cDNA)
 
mycsv.write(",".join(
 
[cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
 
return my_output_fasta, my_output_csv
 
\ No newline at end of file
Loading