Skip to content
Snippets Groups Projects

adding cDNA generation script

Closed Suvarnan Selliah requested to merge suvi into main
1 file
+ 239
0
Compare changes
  • Side-by-side
  • Inline
+ 239
0
#!/usr/bin/env python3
#Author: Suvarnan Selliah
class GeneratecDNA:
def generatecDNA(fasta, gtf, cp_nr, my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv", placeholder = "ph-file.csv"):
cDNA_transcript_id = 1
#READING INPUT FILES / PART I
#open files
with open(fasta, 'r') as fa, open(gtf, 'r') as gt, open(cp_nr, 'r') as cp:
#read fasta-file
for myFastaline in fa:
#search for transcript id and transcript sequence in fasta-file
fasta_id = ""
fasta_seq = ""
fasta_id_found = False
fasta_seq_found = False
currentFastaString = myFastaline
if fasta_id_found == False:
position_of_start = currentFastaString.find('>')
if position_of_start != 0:
continue
elif position_of_start == 0:
fasta_id = myFastaline
fasta_id = fasta_id.replace(">", "")
fasta_id_found = True
continue
else:
print("FASTA: Start position in fasta file not found")
break
if fasta_id_found == True and fasta_seq_found == False:
while fasta_seq_found == False:
currentFastaString = fa.readline()
zero_position = currentFastaString[0]
if zero_position == ";":
continue
elif zero_position == ">":
print("FASTA: No Sequence after headline")
break
else:
fasta_seq = currentFastaString
fasta_seq_found = True
#starting to work with gtf-file
#defining variables for gtf-file
gtf_seqname = ''
gtf_start = 0
gtf_end = 0
gtf_score = 0.0
gtf_info_found = False
gtf_entries = 0
gtf_list_of_lines = []
gtf_prob_list = []
#defining variables for csv-file
csv_trans_id = ""
csv_gene_id = ""
csv_count = 0
csv_info_found = False
if fasta_id_found == True and fasta_seq_found == True:
# search for transcript id from fasta-file in gtf-file
for myGTFline in gt:
currentGTFString = myGTFline
gtf_list = currentGTFString.split('\t')
gtf_seqname = gtf_list[0]
if gtf_seqname == fasta_id:
gtf_entries += 1
gtf_start = gtf_list[3]
gtf_end = gtf_list[4]
gtf_score = gtf_list[5]
gtf_temp_list = []
gtf_temp_list.append(gtf_start)
gtf_temp_list.append(gtf_end)
gtf_temp_list.append(gtf_score)
gtf_list_of_lines.append(gtf_temp_list)
gtf_prob_list.append(gtf_score)
else:
continue
if gtf_entries != 0:
gtf_info_found = True
fasta_id_found = False
fasta_seq_found = False
assert gtf_info_found, "Sequence ID from fasta-file not found in gtf-file"
#search copy number of transcript in 3. file/csv-file
for myCSVline in cp:
currentCSVString = myCSVline
csv_list = currentCSVString.split(',')
csv_trans_id = csv_list[0]
if csv_trans_id == fasta_id:
csv_gene_id = csv_list[1]
csv_count = csv_list[2]
csv_info_found = True
gtf_info_found = False
break
else:
continue
assert csv_info_found, "Data (TranscriptID,GeneID,Count) from csv-file/3.file not found"
#COMPUTATION & OUTPUT / PART II
#set score to 0 for primimg sites close to the end of sequence
csv_info_found = False
seq_len = len(fasta_seq)
seq_len -= 22
gtf_list_of_lines_len = len(gtf_list_of_lines)
for i in range(gtf_list_of_lines_len):
if gtf_list_of_lines[i][1] > seq_len:
gtf_list_of_lines[i][2] = 0.0
gtf_prob_list[i] = 0.0
#assign priming sites (according to score) to copy number
sum_of_score = 0.0
for i in gtf_prob_list:
sum_of_score += i
one_score = 100/sum_of_score
norm_list = []
gtf_prob_list_len = len(gtf_prob_list)
for i in range(gtf_prob_list_len):
norm_list.append((one_score * gtf_prob_list[i]))
one_norm = csv_count / 100
distr_RNA_to_prim_sites = []
norm_list_len = len(norm_list)
for i in range(norm_list_len):
distr_RNA_to_prim_sites.append((one_norm * norm_list[i]))
total_RNA_number = 0
distr_RNA_to_prim_sites_len = len(distr_RNA_to_prim_sites)
for i in range(distr_RNA_to_prim_sites_len):
total_RNA_number += distr_RNA_to_prim_sites[i]
new_distr_RNA_to_prim_sites = []
if total_RNA_number != csv_count:
for i in range(distr_RNA_to_prim_sites_len):
new_distr_RNA_to_prim_sites.append(int(distr_RNA_to_prim_sites[i]))
new_distr_RNA_to_prim_sites_len = len(new_distr_RNA_to_prim_sites)
counter = 0
while total_RNA_number != csv_count:
new_distr_RNA_to_prim_sites[counter] = round(distr_RNA_to_prim_sites[counter])
counter += 1
assert counter <= new_distr_RNA_to_prim_sites_len, "Calculated RNA transcripts (assigned to priming sites) are more than initial count"
#order the priming sites
prim_sites_ordered = []
for i in range(gtf_list_of_lines_len):
prim_sites_ordered.append(gtf_list_of_lines[i][0])
prim_sites_ordered.sort()
#searching for 2 priming sites
prim_sites_ordered_len = len(prim_sites_ordered)
ph_1 = 0
ph_2 = 0
for i in range(0, (prim_sites_ordered_len-1), 2):
if prim_sites_ordered[i] == 0.0:
continue
else:
search_for_1 = prim_sites_ordered[i]
search_for_2 = prim_sites_ordered[i+1]
for j in range(gtf_list_of_lines_len):
if gtf_list_of_lines[j][0] == search_for_1:
ph_1 = j
if gtf_list_of_lines[j][0] == search_for_2:
ph_2 = j
#making cDNA and comparing in library
start_1 = gtf_list_of_lines[ph_1][0]
start_2 = gtf_list_of_lines[ph_2][0]
start_1 -= 1
start_2 -= 1
trans_between_prim_sites = fasta_seq[start_1:start_2]
cDNA = ''
for element in range(0, len(trans_between_prim_sites)):
if trans_between_prim_sites[element] == 'A':
cDNA[element] = 'T'
elif trans_between_prim_sites[element] == 'U':
cDNA[element] = 'A'
elif trans_between_prim_sites[element] == 'G':
cDNA[element] = 'C'
elif trans_between_prim_sites[element] == 'C':
cDNA[element] = 'G'
else:
assert False, "cDNA synthesis failed, position is not A,U,G or C in transcript"
# open output files
if i == 0:
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
myfasta.write(">" + string(cDNA_transcript_id))
myfasta.write("\n")
myfasta.write(cDNA)
mycsv.write(",".join([cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
else:
found_cDNA_id = ''
found_cDNA_id_bool = False
with open(my_output_fasta, 'r') as myfasta, open(my_output_csv, 'r') as mycsv, open(placeholder, 'w') as phf:
for myline in myfasta:
pos = myline.find('>')
if pos != 0:
continue
if pos == 0:
fid = myline
fid = fid.replace(">", "")
fbool = False
while fbool == False:
myline = myfasta.readline()
zer = myline[0]
if zer == ";":
continue
elif zer == ">":
assert False, "Error in searching cDNA in output fasta-file"
else:
fseq = myline
fbool = True
if fseq ==cDNA:
found_cDNA_id = fid
found_cDNA_id_bool = True
break
if found_cDNA_id_bool == True:
for myline_csv_out in mycsv:
csvlist = myline_csv_out.split(',')
csvcDNAid = csvlist[0]
if csvcDNAid == found_cDNA_id:
csvgeneid = csvlist[1]
csvcDNAcount = csvlist[2]
csvcDNAcount += new_distr_RNA_to_prim_sites[ph_1]
phf.write(",".join([csvcDNAid, csvgeneid, csvcDNAcount]))
else:
phf.write(myline_csv_out)
if found_cDNA_id_bool == True:
with open(my_output_csv, 'w') as mycsv, open(placeholder, 'r') as phf:
for myplaceholder in phf:
mycsv.write(myplaceholder)
else:
cDNA_transcript_id += 1
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
myfasta.write(">" + string(cDNA_transcript_id))
myfasta.write("\n")
myfasta.write(cDNA)
mycsv.write(",".join(
[cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
return my_output_fasta, my_output_csv
\ No newline at end of file
Loading