Skip to content
Snippets Groups Projects
priming_prob.py 5.70 KiB
"""Imports."""
import numpy as np
import scipy.constants


class Probability:
    """Calculates the probability of priming and write the gff file."""

    def inter_para(input_path):
        """Open the RIblast output file and read only the parameter lines.

        Args:
            Path to Energies.txt file

        Returns:
            my_list (list): Contains all the paramter lines from Energies.txt
        """
        Energies = open(input_path, "r")  # ouput of RIblast

        mylist = []  # all lines of Energies starting with an ID-number

        for myline in Energies:  # Read lines containing needed data
            if myline[0].isdigit():
                mylist.append(myline)
            elif myline[0].isdigit() is False:
                continue

        Energies.close()

        return(mylist)

    def inter_prob(data_list, fasta_path, output_path):
        """Calculate the prob. and write the gff file.

        Args:
            data_list (list): Contains all parameters of Energies.txt
            fasta_path (path): Path to fasta file
            output_path (path): Path to output file

        Returns:
            gff (file): Gff file contains all the output information
        """
        # count interactions per script through fasta-ID
        fastafile = open(fasta_path, "r")
        id_list = []  # contains list of transcipt IDs

        for mylinecounter in fastafile:
            if mylinecounter.startswith(">"):
                a = mylinecounter
                a = mylinecounter.replace(">", "")
                b = a.split()
                c = b[0]
                id_list.append(c)
            elif mylinecounter.startswith(">") is False:
                continue

        counter = 0
        counter_list = []  # list of number of interactions per transcript

        for cc in range(0, len(id_list)):
            for dd in range(0, len(data_list)):
                if id_list[cc] in data_list[dd]:
                    counter = counter + 1

            counter_list.append(counter)
            counter = 0

        para_list = []

        for i in range(0, len(data_list)):
            x = data_list[i].split(",")
            para_list.append(x)
        # splitting each list item by the "," this results in a 2D list

        for j in range(0, len(para_list)):
            del para_list[j][1:-2]
        # only keeps the ID-numer, the interaction
        # energy, and interaction site of both sequences. (still a 2D-list)

        start_end_index = 1  # index of start and end in the list

        for d in range(0, len(para_list)):  # Optimize location output
            a = para_list[d][2].split(":")
            a[start_end_index] = a[start_end_index].replace(") ", "")
            a[start_end_index] = a[start_end_index].replace("\n", "")
            a[start_end_index] = a[start_end_index].replace("-", " ")
            a[start_end_index] = a[start_end_index].split(" ")
            para_list[d][2] = a[start_end_index]

        for k in range(0, len(para_list)):  # type-conversion of ID and E
            for w in range(0, 2):  # "2" because the first two elements in each sublist are ID and E
                para_list[k][w] = float(para_list[k][w])

        joule = 4184  # 1kcal = 4184 joule
        inter_energy = 1  # index of inter. energy in list

        for z in range(0, len(para_list)):  # from kcal/mol to Joule/mol
            para_list[z][inter_energy] = para_list[z][inter_energy] * joule

        T = 300.15  # Roomtemperature (27 degree celsius) in Kelvin
        kT = scipy.constants.R * T  # calculating gas constant R * T

        for u in range(0, len(para_list)):  # calculating -E / RT
            para_list[u][inter_energy] = (-(para_list[u][inter_energy])/kT)

        prob_list = []  # List containing all the prob.

        for h in range(0, len(para_list)):  # calculating the e^(-E/kT)
            probab = np.exp(para_list[h][inter_energy])
            prob_list.append(probab)
            para_list[h][inter_energy] = probab

        probability_sum = 0  # variable to calculate sum of probabilities per transcipt
        sum_list = []  # List containing all the sums

        prob_list2 = prob_list.copy()

        for jj in range(0, len(counter_list)):
            for ii in range(0, counter_list[jj]):
                probability_sum = probability_sum + prob_list[ii]
            sum_list.append(probability_sum)
            probability_sum = 0  # setting back to 0, for next interactions
            del prob_list[0:counter_list[jj]]

        real_prob = []  # contains all the normalized probabilities

        for jj in range(0, len(sum_list)):
            for ii in range(0, counter_list[jj]):
                prob_list2[ii] = prob_list2[ii]/sum_list[jj]
                real_prob.append(prob_list2[ii])
            del prob_list2[0:counter_list[jj]]  # Normalized probabilities

        for vv in range(0, len(para_list)):  # inserting the normalized values in para_list
            para_list[vv][1] = real_prob[vv]

        final_list = []  # List containing all the final paramters to print

        for bb in range(0, len(sum_list)):  # Insert ID in paralist
            for ss in range(0, counter_list[bb]):
                para_list[ss][0] = id_list[bb]
                final_list.append(para_list[ss])
            del para_list[0:counter_list[bb]]

        gff = open(output_path, "w+")  # gff file

        for ll in range(0, len(final_list)):  # writing gff file
            gff.write(str(final_list[ll][0]) +  # transcript ID
                      "\tRIblast\tPriming Site\t" +
                      str(final_list[ll][2][1])+"\t" +  # start
                      str(final_list[ll][2][0])+"\t" +  # end
                      str(final_list[ll][1])+"\t.\t.\t.\n")  # probability

        gff.close
        return final_list