diff --git a/src/primingprob/priming_prob.py b/src/primingprob/priming_prob.py index 8088b646f85110b8c6136897c4958ba6b9d2044b..fbb93d4a0e33dc654b221784211d388a0db03a13 100644 --- a/src/primingprob/priming_prob.py +++ b/src/primingprob/priming_prob.py @@ -1,154 +1,152 @@ -"""Imports.""" +"""Calculates the probability of priming and write the gff file.""" import numpy as np import scipy.constants # type: ignore -class Probability: - """Calculates the probability of priming and write the gff file.""" +def inter_para(input_path): + """Open the RIblast output file and read only the parameter lines. - def inter_para(input_path): - """Open the RIblast output file and read only the parameter lines. + Args: + Path to Energies.txt file - Args: - Path to Energies.txt file + Returns: + my_list (list): Contains the paramter lines from Energies.txt + """ + Energies = open(input_path, "r") # ouput of RIblast - Returns: - my_list (list): Contains the paramter lines from Energies.txt - """ - Energies = open(input_path, "r") # ouput of RIblast + mylist = [] # all lines of Energies starting with an ID-number - mylist = [] # all lines of Energies starting with an ID-number + for myline in Energies: # Read lines containing needed data + if myline[0].isdigit(): + mylist.append(myline) + elif myline[0].isdigit() is False: + continue - for myline in Energies: # Read lines containing needed data - if myline[0].isdigit(): - mylist.append(myline) - elif myline[0].isdigit() is False: - continue + Energies.close() - Energies.close() + return(mylist) - return(mylist) - def inter_prob(data_list, fasta_path, output_path): - """Calculate the prob. and write the gff file. +def inter_prob(data_list, fasta_path, output_path): + """Calculate the prob. and write the gff file. - Args: - data_list (list): Contains all parameters of Energies.txt - fasta_path (path): Path to fasta file - output_path (path): Path to output file + Args: + data_list (list): Contains all parameters of Energies.txt + fasta_path (path): Path to fasta file + output_path (path): Path to output file - Returns: - gff (file): Gff file contains all the output information - """ - # count interactions per script through fasta-ID - fastafile = open(fasta_path, "r") - id_list = [] # contains list of transcipt IDs + Returns: + gff (file): Gff file contains all the output information + """ + # count interactions per script through fasta-ID + fastafile = open(fasta_path, "r") + id_list = [] # contains list of transcipt IDs - for mylinecounter in fastafile: - if mylinecounter.startswith(">"): - a = mylinecounter - a = mylinecounter.replace(">", "") - b = a.split() - c = b[0] - id_list.append(c) - elif mylinecounter.startswith(">") is False: - continue + for mylinecounter in fastafile: + if mylinecounter.startswith(">"): + a = mylinecounter + a = mylinecounter.replace(">", "") + b = a.split() + c = b[0] + id_list.append(c) + elif mylinecounter.startswith(">") is False: + continue - counter = 0 - counter_list = [] # list of number of interactions per transcript + counter = 0 + counter_list = [] # list of number of interactions per transcript - for cc in range(0, len(id_list)): - for dd in range(0, len(data_list)): - if id_list[cc] in data_list[dd]: - counter = counter + 1 + for cc in range(0, len(id_list)): + for dd in range(0, len(data_list)): + if id_list[cc] in data_list[dd]: + counter = counter + 1 - counter_list.append(counter) - counter = 0 + counter_list.append(counter) + counter = 0 - para_list = [] + para_list = [] - for i in range(0, len(data_list)): - x = data_list[i].split(",") - para_list.append(x) - # splitting each list item by the "," this results in a 2D list + for i in range(0, len(data_list)): + x = data_list[i].split(",") + para_list.append(x) + # splitting each list item by the "," this results in a 2D list - for j in range(0, len(para_list)): - del para_list[j][1:-2] - # only keeps the ID-numer, the interaction - # energy, and interaction site of both sequences. (still a 2D-list) + for j in range(0, len(para_list)): + del para_list[j][1:-2] + # only keeps the ID-numer, the interaction + # energy, and interaction site of both sequences. (still a 2D-list) - start_end_index = 1 # index of start and end in the list + start_end_index = 1 # index of start and end in the list - for d in range(0, len(para_list)): # Optimize location output - a = para_list[d][2].split(":") - a[start_end_index] = a[start_end_index].replace(") ", "") - a[start_end_index] = a[start_end_index].replace("\n", "") - a[start_end_index] = a[start_end_index].replace("-", " ") - a[start_end_index] = a[start_end_index].split(" ") - para_list[d][2] = a[start_end_index] + for d in range(0, len(para_list)): # Optimize location output + a = para_list[d][2].split(":") + a[start_end_index] = a[start_end_index].replace(") ", "") + a[start_end_index] = a[start_end_index].replace("\n", "") + a[start_end_index] = a[start_end_index].replace("-", " ") + a[start_end_index] = a[start_end_index].split(" ") + para_list[d][2] = a[start_end_index] - for k in range(0, len(para_list)): # type-conversion of ID and E - for w in range(0, 2): # "2" because first two elements in each sublist are ID and E - para_list[k][w] = float(para_list[k][w]) + for k in range(0, len(para_list)): # type-conversion of ID and E + for w in range(0, 2): # "2" because first two elements in each sublist are ID and E + para_list[k][w] = float(para_list[k][w]) - joule = 4184 # 1kcal = 4184 joule - inter_energy = 1 # index of inter. energy in list + joule = 4184 # 1kcal = 4184 joule + inter_energy = 1 # index of inter. energy in list - for z in range(0, len(para_list)): # from kcal/mol to Joule/mol - para_list[z][inter_energy] = para_list[z][inter_energy] * joule + for z in range(0, len(para_list)): # from kcal/mol to Joule/mol + para_list[z][inter_energy] = para_list[z][inter_energy] * joule - T = 300.15 # Roomtemperature (27 degree celsius) in Kelvin - KT = scipy.constants.R * T # calculating gas constant R * T + T = 300.15 # Roomtemperature (27 degree celsius) in Kelvin + KT = scipy.constants.R * T # calculating gas constant R * T - for u in range(0, len(para_list)): # calculating -E / RT - para_list[u][inter_energy] = (-(para_list[u][inter_energy])/KT) + for u in range(0, len(para_list)): # calculating -E / RT + para_list[u][inter_energy] = (-(para_list[u][inter_energy])/KT) - prob_list = [] # List containing all the prob. + prob_list = [] # List containing all the prob. - for h in range(0, len(para_list)): # calculating the e^(-E/kT) - probab = np.exp(para_list[h][inter_energy]) - prob_list.append(probab) - para_list[h][inter_energy] = probab + for h in range(0, len(para_list)): # calculating the e^(-E/kT) + probab = np.exp(para_list[h][inter_energy]) + prob_list.append(probab) + para_list[h][inter_energy] = probab - probability_sum = 0 # variable to calculate sum of probabilities per transcipt - sum_list = [] # List containing all the sums + probability_sum = 0 # variable to calculate sum of probabilities per transcipt + sum_list = [] # List containing all the sums - prob_list2 = prob_list.copy() + prob_list2 = prob_list.copy() - for jj in range(0, len(counter_list)): - for ii in range(0, counter_list[jj]): - probability_sum = probability_sum + prob_list[ii] - sum_list.append(probability_sum) - probability_sum = 0 # setting back to 0, for next interactions - del prob_list[0:counter_list[jj]] + for jj in range(0, len(counter_list)): + for ii in range(0, counter_list[jj]): + probability_sum = probability_sum + prob_list[ii] + sum_list.append(probability_sum) + probability_sum = 0 # setting back to 0, for next interactions + del prob_list[0:counter_list[jj]] - real_prob = [] # contains all the normalized probabilities + real_prob = [] # contains all the normalized probabilities - for jj in range(0, len(sum_list)): - for ii in range(0, counter_list[jj]): - prob_list2[ii] = prob_list2[ii]/sum_list[jj] - real_prob.append(prob_list2[ii]) - del prob_list2[0:counter_list[jj]] # Normalized probabilities + for jj in range(0, len(sum_list)): + for ii in range(0, counter_list[jj]): + prob_list2[ii] = prob_list2[ii]/sum_list[jj] + real_prob.append(prob_list2[ii]) + del prob_list2[0:counter_list[jj]] # Normalized probabilities - for vv in range(0, len(para_list)): # inserting the normalized values in para_list - para_list[vv][1] = real_prob[vv] + for vv in range(0, len(para_list)): # inserting the normalized values in para_list + para_list[vv][1] = real_prob[vv] - final_list = [] # List containing all the final paramters to print + final_list = [] # List containing all the final paramters to print - for bb in range(0, len(sum_list)): # Insert ID in paralist - for ss in range(0, counter_list[bb]): - para_list[ss][0] = id_list[bb] - final_list.append(para_list[ss]) - del para_list[0:counter_list[bb]] + for bb in range(0, len(sum_list)): # Insert ID in paralist + for ss in range(0, counter_list[bb]): + para_list[ss][0] = id_list[bb] + final_list.append(para_list[ss]) + del para_list[0:counter_list[bb]] - gff = open(output_path, "w+") # gff file + gff = open(output_path, "w+") # gff file - for ll in range(0, len(final_list)): # writing gff file - gff.write(str(final_list[ll][0]) + # transcript ID - "\tRIblast\tPriming Site\t" + - str(final_list[ll][2][1])+"\t" + # start - str(final_list[ll][2][0])+"\t" + # end - str(final_list[ll][1])+"\t.\t.\t.\n") # probability + for ll in range(0, len(final_list)): # writing gff file + gff.write(str(final_list[ll][0]) + # Transcript ID + "\tRIblast\tPriming Site\t" + + str(final_list[ll][2][1])+"\t" + # Start + str(final_list[ll][2][0])+"\t" + # End + str(final_list[ll][1])+"\t.\t.\t.\n") # probability - gff.close - return final_list + gff.close + return final_list