diff --git a/src/generate_cdna.py b/src/generate_cdna.py new file mode 100644 index 0000000000000000000000000000000000000000..373c1951f5654ff3ad0f9fa912b16327f619926d --- /dev/null +++ b/src/generate_cdna.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +#Author: Suvarnan Selliah + +class GeneratecDNA: + + def generatecDNA(fasta, gtf, cp_nr, my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv", placeholder = "ph-file.csv"): + + cDNA_transcript_id = 1 + #READING INPUT FILES / PART I + #open files + with open(fasta, 'r') as fa, open(gtf, 'r') as gt, open(cp_nr, 'r') as cp: + + #read fasta-file + for myFastaline in fa: + + #search for transcript id and transcript sequence in fasta-file + fasta_id = "" + fasta_seq = "" + fasta_id_found = False + fasta_seq_found = False + currentFastaString = myFastaline + if fasta_id_found == False: + position_of_start = currentFastaString.find('>') + if position_of_start != 0: + continue + elif position_of_start == 0: + fasta_id = myFastaline + fasta_id = fasta_id.replace(">", "") + fasta_id_found = True + continue + else: + print("FASTA: Start position in fasta file not found") + break + if fasta_id_found == True and fasta_seq_found == False: + while fasta_seq_found == False: + currentFastaString = fa.readline() + zero_position = currentFastaString[0] + if zero_position == ";": + continue + elif zero_position == ">": + print("FASTA: No Sequence after headline") + break + else: + fasta_seq = currentFastaString + fasta_seq_found = True + + #starting to work with gtf-file + #defining variables for gtf-file + gtf_seqname = '' + gtf_start = 0 + gtf_end = 0 + gtf_score = 0.0 + gtf_info_found = False + gtf_entries = 0 + gtf_list_of_lines = [] + gtf_prob_list = [] + + #defining variables for csv-file + csv_trans_id = "" + csv_gene_id = "" + csv_count = 0 + csv_info_found = False + + if fasta_id_found == True and fasta_seq_found == True: + # search for transcript id from fasta-file in gtf-file + for myGTFline in gt: + currentGTFString = myGTFline + gtf_list = currentGTFString.split('\t') + gtf_seqname = gtf_list[0] + if gtf_seqname == fasta_id: + gtf_entries += 1 + gtf_start = gtf_list[3] + gtf_end = gtf_list[4] + gtf_score = gtf_list[5] + gtf_temp_list = [] + gtf_temp_list.append(gtf_start) + gtf_temp_list.append(gtf_end) + gtf_temp_list.append(gtf_score) + gtf_list_of_lines.append(gtf_temp_list) + gtf_prob_list.append(gtf_score) + else: + continue + if gtf_entries != 0: + gtf_info_found = True + fasta_id_found = False + fasta_seq_found = False + assert gtf_info_found, "Sequence ID from fasta-file not found in gtf-file" + + #search copy number of transcript in 3. file/csv-file + for myCSVline in cp: + currentCSVString = myCSVline + csv_list = currentCSVString.split(',') + csv_trans_id = csv_list[0] + if csv_trans_id == fasta_id: + csv_gene_id = csv_list[1] + csv_count = csv_list[2] + csv_info_found = True + gtf_info_found = False + break + else: + continue + assert csv_info_found, "Data (TranscriptID,GeneID,Count) from csv-file/3.file not found" + + #COMPUTATION & OUTPUT / PART II + #set score to 0 for primimg sites close to the end of sequence + csv_info_found = False + seq_len = len(fasta_seq) + seq_len -= 22 + gtf_list_of_lines_len = len(gtf_list_of_lines) + for i in range(gtf_list_of_lines_len): + if gtf_list_of_lines[i][1] > seq_len: + gtf_list_of_lines[i][2] = 0.0 + gtf_prob_list[i] = 0.0 + + #assign priming sites (according to score) to copy number + sum_of_score = 0.0 + for i in gtf_prob_list: + sum_of_score += i + one_score = 100/sum_of_score + norm_list = [] + gtf_prob_list_len = len(gtf_prob_list) + for i in range(gtf_prob_list_len): + norm_list.append((one_score * gtf_prob_list[i])) + one_norm = csv_count / 100 + distr_RNA_to_prim_sites = [] + norm_list_len = len(norm_list) + for i in range(norm_list_len): + distr_RNA_to_prim_sites.append((one_norm * norm_list[i])) + total_RNA_number = 0 + distr_RNA_to_prim_sites_len = len(distr_RNA_to_prim_sites) + for i in range(distr_RNA_to_prim_sites_len): + total_RNA_number += distr_RNA_to_prim_sites[i] + new_distr_RNA_to_prim_sites = [] + if total_RNA_number != csv_count: + for i in range(distr_RNA_to_prim_sites_len): + new_distr_RNA_to_prim_sites.append(int(distr_RNA_to_prim_sites[i])) + new_distr_RNA_to_prim_sites_len = len(new_distr_RNA_to_prim_sites) + counter = 0 + while total_RNA_number != csv_count: + new_distr_RNA_to_prim_sites[counter] = round(distr_RNA_to_prim_sites[counter]) + counter += 1 + assert counter <= new_distr_RNA_to_prim_sites_len, "Calculated RNA transcripts (assigned to priming sites) are more than initial count" + + #order the priming sites + prim_sites_ordered = [] + for i in range(gtf_list_of_lines_len): + prim_sites_ordered.append(gtf_list_of_lines[i][0]) + prim_sites_ordered.sort() + + #searching for 2 priming sites + prim_sites_ordered_len = len(prim_sites_ordered) + ph_1 = 0 + ph_2 = 0 + for i in range(0, (prim_sites_ordered_len-1), 2): + if prim_sites_ordered[i] == 0.0: + continue + else: + search_for_1 = prim_sites_ordered[i] + search_for_2 = prim_sites_ordered[i+1] + for j in range(gtf_list_of_lines_len): + if gtf_list_of_lines[j][0] == search_for_1: + ph_1 = j + if gtf_list_of_lines[j][0] == search_for_2: + ph_2 = j + #making cDNA and comparing in library + start_1 = gtf_list_of_lines[ph_1][0] + start_2 = gtf_list_of_lines[ph_2][0] + start_1 -= 1 + start_2 -= 1 + trans_between_prim_sites = fasta_seq[start_1:start_2] + cDNA = '' + for element in range(0, len(trans_between_prim_sites)): + if trans_between_prim_sites[element] == 'A': + cDNA[element] = 'T' + elif trans_between_prim_sites[element] == 'U': + cDNA[element] = 'A' + elif trans_between_prim_sites[element] == 'G': + cDNA[element] = 'C' + elif trans_between_prim_sites[element] == 'C': + cDNA[element] = 'G' + else: + assert False, "cDNA synthesis failed, position is not A,U,G or C in transcript" + # open output files + if i == 0: + with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv: + myfasta.write(">" + string(cDNA_transcript_id)) + myfasta.write("\n") + myfasta.write(cDNA) + mycsv.write(",".join([cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]])) + else: + found_cDNA_id = '' + found_cDNA_id_bool = False + with open(my_output_fasta, 'r') as myfasta, open(my_output_csv, 'r') as mycsv, open(placeholder, 'w') as phf: + for myline in myfasta: + pos = myline.find('>') + if pos != 0: + continue + if pos == 0: + fid = myline + fid = fid.replace(">", "") + fbool = False + while fbool == False: + myline = myfasta.readline() + zer = myline[0] + if zer == ";": + continue + elif zer == ">": + assert False, "Error in searching cDNA in output fasta-file" + else: + fseq = myline + fbool = True + if fseq ==cDNA: + found_cDNA_id = fid + found_cDNA_id_bool = True + break + if found_cDNA_id_bool == True: + for myline_csv_out in mycsv: + csvlist = myline_csv_out.split(',') + csvcDNAid = csvlist[0] + if csvcDNAid == found_cDNA_id: + csvgeneid = csvlist[1] + csvcDNAcount = csvlist[2] + csvcDNAcount += new_distr_RNA_to_prim_sites[ph_1] + phf.write(",".join([csvcDNAid, csvgeneid, csvcDNAcount])) + else: + phf.write(myline_csv_out) + if found_cDNA_id_bool == True: + with open(my_output_csv, 'w') as mycsv, open(placeholder, 'r') as phf: + for myplaceholder in phf: + mycsv.write(myplaceholder) + else: + cDNA_transcript_id += 1 + with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv: + myfasta.write(">" + string(cDNA_transcript_id)) + myfasta.write("\n") + myfasta.write(cDNA) + mycsv.write(",".join( + [cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]])) + return my_output_fasta, my_output_csv \ No newline at end of file