Skip to content
Snippets Groups Projects
priming_prob.py 5.23 KiB
"""Calculates the probability of priming and write the gff file."""
import numpy as np
import scipy.constants  # type: ignore


def inter_para(input_path):
    """Open the RIblast output file and read only the parameter lines.

    Args:
    Path to Energies.txt file

    Returns:
    my_list (list): Contains the paramter lines from Energies.txt
    """
    Energies = open(input_path, "r")  # ouput of RIblast

    mylist = []  # all lines of Energies starting with an ID-number

    for myline in Energies:  # Read lines containing needed data
        if myline[0].isdigit():
            mylist.append(myline)
        elif myline[0].isdigit() is False:
            continue

    Energies.close()

    return(mylist)


def inter_prob(data_list, fasta_path, output_path):
    """Calculate the prob. and write the gff file.

     Args:
        data_list (list): Contains all parameters of Energies.txt
        fasta_path (path): Path to fasta file
        output_path (path): Path to output file

    Returns:
        gff (file): Gff file contains all the output information
    """
    # count interactions per script through fasta-ID
    fastafile = open(fasta_path, "r")
    id_list = []  # contains list of transcipt IDs

    for mylinecounter in fastafile:
        if mylinecounter.startswith(">"):
            a = mylinecounter
            a = mylinecounter.replace(">", "")
            b = a.split()
            c = b[0]
            id_list.append(c)
        elif mylinecounter.startswith(">") is False:
            continue

    counter = 0
    counter_list = []  # list of number of interactions per transcript

    for cc in range(0, len(id_list)):
        for dd in range(0, len(data_list)):
            if id_list[cc] in data_list[dd]:
                counter = counter + 1

        counter_list.append(counter)
        counter = 0

    para_list = []

    for i in range(0, len(data_list)):
        x = data_list[i].split(",")
        para_list.append(x)
    # splitting each list item by the "," this results in a 2D list

    for j in range(0, len(para_list)):
        del para_list[j][1:-2]
    # only keeps the ID-numer, the interaction
    # energy, and interaction site of both sequences. (still a 2D-list)

    start_end_index = 1  # index of start and end in the list

    for d in range(0, len(para_list)):  # Optimize location output
        a = para_list[d][2].split(":")
        a[start_end_index] = a[start_end_index].replace(") ", "")
        a[start_end_index] = a[start_end_index].replace("\n", "")
        a[start_end_index] = a[start_end_index].replace("-", " ")
        a[start_end_index] = a[start_end_index].split(" ")
        para_list[d][2] = a[start_end_index]

    for k in range(0, len(para_list)):  # type-conversion of ID and E
        for w in range(0, 2):  # "2" because first two elements in each sublist are ID and E
            para_list[k][w] = float(para_list[k][w])

    joule = 4184  # 1kcal = 4184 joule
    inter_energy = 1  # index of inter. energy in list

    for z in range(0, len(para_list)):  # from kcal/mol to Joule/mol
        para_list[z][inter_energy] = para_list[z][inter_energy] * joule

    T = 300.15  # Roomtemperature (27 degree celsius) in Kelvin
    KT = scipy.constants.R * T  # calculating gas constant R * T

    for u in range(0, len(para_list)):  # calculating -E / RT
        para_list[u][inter_energy] = (-(para_list[u][inter_energy])/KT)

    prob_list = []  # List containing all the prob.

    for h in range(0, len(para_list)):  # calculating the e^(-E/kT)
        probab = np.exp(para_list[h][inter_energy])
        prob_list.append(probab)
        para_list[h][inter_energy] = probab

    probability_sum = 0  # variable to calculate sum of probabilities per transcipt
    sum_list = []  # List containing all the sums

    prob_list2 = prob_list.copy()

    for jj in range(0, len(counter_list)):
        for ii in range(0, counter_list[jj]):
            probability_sum = probability_sum + prob_list[ii]
        sum_list.append(probability_sum)
        probability_sum = 0  # setting back to 0, for next interactions
        del prob_list[0:counter_list[jj]]

    real_prob = []  # contains all the normalized probabilities

    for jj in range(0, len(sum_list)):
        for ii in range(0, counter_list[jj]):
            prob_list2[ii] = prob_list2[ii]/sum_list[jj]
            real_prob.append(prob_list2[ii])
        del prob_list2[0:counter_list[jj]]  # Normalized probabilities

    for vv in range(0, len(para_list)):  # inserting the normalized values in para_list
        para_list[vv][1] = real_prob[vv]

    final_list = []  # List containing all the final paramters to print

    for bb in range(0, len(sum_list)):  # Insert ID in paralist
        for ss in range(0, counter_list[bb]):
            para_list[ss][0] = id_list[bb]
            final_list.append(para_list[ss])
        del para_list[0:counter_list[bb]]

    gff = open(output_path, "w+")  # gff file

    for ll in range(0, len(final_list)):  # writing gff file
        gff.write(str(final_list[ll][0]) +  # Transcript ID
                  "\tRIblast\tPriming Site\t" +
                  str(final_list[ll][2][1])+"\t" +  # Start
                  str(final_list[ll][2][0])+"\t" +  # End
                  str(final_list[ll][1])+"\t.\t.\t.\n")  # probability

    gff.close
    return final_list