Skip to content
Snippets Groups Projects
Commit 1036b432 authored by Suvarnan Selliah's avatar Suvarnan Selliah
Browse files

adding cDNA generation script

parent 95d07fbd
No related branches found
No related tags found
1 merge request!14adding cDNA generation script
Pipeline #13638 failed
This commit is part of merge request !14. Comments created here will be created in the context of that merge request.
#!/usr/bin/env python3
#Author: Suvarnan Selliah
class GeneratecDNA:
def generatecDNA(fasta, gtf, cp_nr, my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv", placeholder = "ph-file.csv"):
cDNA_transcript_id = 1
#READING INPUT FILES / PART I
#open files
with open(fasta, 'r') as fa, open(gtf, 'r') as gt, open(cp_nr, 'r') as cp:
#read fasta-file
for myFastaline in fa:
#search for transcript id and transcript sequence in fasta-file
fasta_id = ""
fasta_seq = ""
fasta_id_found = False
fasta_seq_found = False
currentFastaString = myFastaline
if fasta_id_found == False:
position_of_start = currentFastaString.find('>')
if position_of_start != 0:
continue
elif position_of_start == 0:
fasta_id = myFastaline
fasta_id = fasta_id.replace(">", "")
fasta_id_found = True
continue
else:
print("FASTA: Start position in fasta file not found")
break
if fasta_id_found == True and fasta_seq_found == False:
while fasta_seq_found == False:
currentFastaString = fa.readline()
zero_position = currentFastaString[0]
if zero_position == ";":
continue
elif zero_position == ">":
print("FASTA: No Sequence after headline")
break
else:
fasta_seq = currentFastaString
fasta_seq_found = True
#starting to work with gtf-file
#defining variables for gtf-file
gtf_seqname = ''
gtf_start = 0
gtf_end = 0
gtf_score = 0.0
gtf_info_found = False
gtf_entries = 0
gtf_list_of_lines = []
gtf_prob_list = []
#defining variables for csv-file
csv_trans_id = ""
csv_gene_id = ""
csv_count = 0
csv_info_found = False
if fasta_id_found == True and fasta_seq_found == True:
# search for transcript id from fasta-file in gtf-file
for myGTFline in gt:
currentGTFString = myGTFline
gtf_list = currentGTFString.split('\t')
gtf_seqname = gtf_list[0]
if gtf_seqname == fasta_id:
gtf_entries += 1
gtf_start = gtf_list[3]
gtf_end = gtf_list[4]
gtf_score = gtf_list[5]
gtf_temp_list = []
gtf_temp_list.append(gtf_start)
gtf_temp_list.append(gtf_end)
gtf_temp_list.append(gtf_score)
gtf_list_of_lines.append(gtf_temp_list)
gtf_prob_list.append(gtf_score)
else:
continue
if gtf_entries != 0:
gtf_info_found = True
fasta_id_found = False
fasta_seq_found = False
assert gtf_info_found, "Sequence ID from fasta-file not found in gtf-file"
#search copy number of transcript in 3. file/csv-file
for myCSVline in cp:
currentCSVString = myCSVline
csv_list = currentCSVString.split(',')
csv_trans_id = csv_list[0]
if csv_trans_id == fasta_id:
csv_gene_id = csv_list[1]
csv_count = csv_list[2]
csv_info_found = True
gtf_info_found = False
break
else:
continue
assert csv_info_found, "Data (TranscriptID,GeneID,Count) from csv-file/3.file not found"
#COMPUTATION & OUTPUT / PART II
#set score to 0 for primimg sites close to the end of sequence
csv_info_found = False
seq_len = len(fasta_seq)
seq_len -= 22
gtf_list_of_lines_len = len(gtf_list_of_lines)
for i in range(gtf_list_of_lines_len):
if gtf_list_of_lines[i][1] > seq_len:
gtf_list_of_lines[i][2] = 0.0
gtf_prob_list[i] = 0.0
#assign priming sites (according to score) to copy number
sum_of_score = 0.0
for i in gtf_prob_list:
sum_of_score += i
one_score = 100/sum_of_score
norm_list = []
gtf_prob_list_len = len(gtf_prob_list)
for i in range(gtf_prob_list_len):
norm_list.append((one_score * gtf_prob_list[i]))
one_norm = csv_count / 100
distr_RNA_to_prim_sites = []
norm_list_len = len(norm_list)
for i in range(norm_list_len):
distr_RNA_to_prim_sites.append((one_norm * norm_list[i]))
total_RNA_number = 0
distr_RNA_to_prim_sites_len = len(distr_RNA_to_prim_sites)
for i in range(distr_RNA_to_prim_sites_len):
total_RNA_number += distr_RNA_to_prim_sites[i]
new_distr_RNA_to_prim_sites = []
if total_RNA_number != csv_count:
for i in range(distr_RNA_to_prim_sites_len):
new_distr_RNA_to_prim_sites.append(int(distr_RNA_to_prim_sites[i]))
new_distr_RNA_to_prim_sites_len = len(new_distr_RNA_to_prim_sites)
counter = 0
while total_RNA_number != csv_count:
new_distr_RNA_to_prim_sites[counter] = round(distr_RNA_to_prim_sites[counter])
counter += 1
assert counter <= new_distr_RNA_to_prim_sites_len, "Calculated RNA transcripts (assigned to priming sites) are more than initial count"
#order the priming sites
prim_sites_ordered = []
for i in range(gtf_list_of_lines_len):
prim_sites_ordered.append(gtf_list_of_lines[i][0])
prim_sites_ordered.sort()
#searching for 2 priming sites
prim_sites_ordered_len = len(prim_sites_ordered)
ph_1 = 0
ph_2 = 0
for i in range(0, (prim_sites_ordered_len-1), 2):
if prim_sites_ordered[i] == 0.0:
continue
else:
search_for_1 = prim_sites_ordered[i]
search_for_2 = prim_sites_ordered[i+1]
for j in range(gtf_list_of_lines_len):
if gtf_list_of_lines[j][0] == search_for_1:
ph_1 = j
if gtf_list_of_lines[j][0] == search_for_2:
ph_2 = j
#making cDNA and comparing in library
start_1 = gtf_list_of_lines[ph_1][0]
start_2 = gtf_list_of_lines[ph_2][0]
start_1 -= 1
start_2 -= 1
trans_between_prim_sites = fasta_seq[start_1:start_2]
cDNA = ''
for element in range(0, len(trans_between_prim_sites)):
if trans_between_prim_sites[element] == 'A':
cDNA[element] = 'T'
elif trans_between_prim_sites[element] == 'U':
cDNA[element] = 'A'
elif trans_between_prim_sites[element] == 'G':
cDNA[element] = 'C'
elif trans_between_prim_sites[element] == 'C':
cDNA[element] = 'G'
else:
assert False, "cDNA synthesis failed, position is not A,U,G or C in transcript"
# open output files
if i == 0:
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
myfasta.write(">" + string(cDNA_transcript_id))
myfasta.write("\n")
myfasta.write(cDNA)
mycsv.write(",".join([cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
else:
found_cDNA_id = ''
found_cDNA_id_bool = False
with open(my_output_fasta, 'r') as myfasta, open(my_output_csv, 'r') as mycsv, open(placeholder, 'w') as phf:
for myline in myfasta:
pos = myline.find('>')
if pos != 0:
continue
if pos == 0:
fid = myline
fid = fid.replace(">", "")
fbool = False
while fbool == False:
myline = myfasta.readline()
zer = myline[0]
if zer == ";":
continue
elif zer == ">":
assert False, "Error in searching cDNA in output fasta-file"
else:
fseq = myline
fbool = True
if fseq ==cDNA:
found_cDNA_id = fid
found_cDNA_id_bool = True
break
if found_cDNA_id_bool == True:
for myline_csv_out in mycsv:
csvlist = myline_csv_out.split(',')
csvcDNAid = csvlist[0]
if csvcDNAid == found_cDNA_id:
csvgeneid = csvlist[1]
csvcDNAcount = csvlist[2]
csvcDNAcount += new_distr_RNA_to_prim_sites[ph_1]
phf.write(",".join([csvcDNAid, csvgeneid, csvcDNAcount]))
else:
phf.write(myline_csv_out)
if found_cDNA_id_bool == True:
with open(my_output_csv, 'w') as mycsv, open(placeholder, 'r') as phf:
for myplaceholder in phf:
mycsv.write(myplaceholder)
else:
cDNA_transcript_id += 1
with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv:
myfasta.write(">" + string(cDNA_transcript_id))
myfasta.write("\n")
myfasta.write(cDNA)
mycsv.write(",".join(
[cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]]))
return my_output_fasta, my_output_csv
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment