Melvin Alappat · 0329e263 · ec7f686c · abb1e512 · 4c591824 · 81e851d5
--- a/src/PrimingProb_Final.py 0 → 100644

+ 173

− 0
+++ b/src/PrimingProb_Final.py 0 → 100644

+ 173

− 0
+"""Imports."""
+import numpy as np
+import scipy.constants
+import argparse
+from pathlib import Path
+class Probability:
+    """Calculates the probability of priming and write the gff file."""
+    #  adding parser
+    parser = argparse.ArgumentParser(
+        description="Fasta-file input",
+        add_help=False,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    # add arguments
+    parser.add_argument(
+        'input_file',
+        type=lambda p: Path(p).absolute(),
+        metavar="PATH",
+        help="path to fasta-file",
+    )
+    args = parser.parse_args()
+    def InterPara(path):
+        """Open the RIblast output file and read only the parameter lines.
+        Args:
+            Path to Fasta-file
+        Returns:
+            my_list (list): Contains all the paramter lines from RIblast
+        """
+        # myfile = open(sys.argv[1], "r")  # ouput of RIblast
+        myfile = open(path, "r")
+        mylist = []  # all lines of Energies starting with an ID-number
+        for myline in myfile:  # Read lines containing needed data
+            if myline[0].isdigit():
+                mylist.append(myline)
+            else:
+                continue
+        myfile.close()
+        return(mylist)
+    data = InterPara(args.input_file)
+    def InterProb(data_list):
+        """Calculate the prob. and make the gff file.
+        Args:
+            data_list (list): Contains all parameters of RIblast
+        Returns:
+            gff (file): Gff file contains all the output information
+        """
+        # count interactions per script through fasta ID (first line of fasta)
+        mycounter = open("../inputs/transcript.fasta", "r")
+        mycounter_list = []
+        for mylinecounter in mycounter:
+            if mylinecounter.startswith(">"):
+                a = mylinecounter
+                a = mylinecounter.replace(">", "")
+                b = a.replace("\n", "")
+                mycounter_list.append(b)
+            else:
+                continue
+        counter = 0
+        counter_list = []
+        for cc in range(0, len(mycounter_list)):
+            for dd in range(0, len(data_list)):
+                if mycounter_list[cc] in data_list[dd]:
+                    counter = counter + 1
+                else:
+                    continue
+            counter_list.append(counter)
+            counter = 0
+        para_list = []
+        for i in range(0, len(data_list)):
+            x = data_list[i].split(",")
+            para_list.append(x)
+        # splitting each list item by the "," this results in a 2-D list
+        for j in range(0, len(para_list)):
+            del para_list[j][1:-2]
+        # only keeps the ID-numer, the interaction
+        # energy, and interaction site of both sequences. (still a 2D-list)
+        for d in range(0, len(para_list)):  # Optimize location output
+            a = para_list[d][2].split(":")
+            a[1] = a[1].replace(") ", "")
+            a[1] = a[1].replace("\n", "")
+            a[1] = a[1].replace("-", " ")
+            a[1] = a[1].split(" ")
+            para_list[d][2] = a[1]
+        for k in range(0, len(para_list)):  # type-conversion of ID and E
+            for w in range(0, 2):
+                para_list[k][w] = float(para_list[k][w])
+        for z in range(0, len(para_list)):  # from kcal/mol to Joule/mol
+            para_list[z][1] = para_list[z][1] * 4184
+        kT = scipy.constants.R * 300.15  # calculating gas constant R * T
+        for u in range(0, len(para_list)):  # calculating -E / RT
+            para_list[u][1] = (-(para_list[u][1])/kT)
+        prob_list = []  # List containing all the prob.
+        for h in range(0, len(para_list)):  # calculating the e^(-E/kT)
+            probab = np.exp(para_list[h][1])
+            prob_list.append(probab)
+            para_list[h][1] = probab
+        count_sum = 0
+        sum_list = []
+        prob_list2 = prob_list.copy()
+        for jj in range(0, len(counter_list)):
+            for ii in range(0, counter_list[jj]):
+                count_sum = count_sum + prob_list[ii]
+            sum_list.append(count_sum)
+            count_sum = 0
+            del prob_list[0:counter_list[jj]]
+        real_prob = []
+        for jj in range(0, len(sum_list)):
+            for ii in range(0, counter_list[jj]):
+                prob_list2[ii] = prob_list2[ii]/sum_list[jj]
+                real_prob.append(prob_list2[ii])
+            del prob_list2[0:counter_list[jj]]  # Normalized probabilities
+        # real_prob contains all the linearized probabilities
+        for vv in range(0, len(para_list)):
+            para_list[vv][1] = real_prob[vv]
+        final_list = []
+        for bb in range(0, len(sum_list)):  # Insert ID in paralist
+            for ss in range(0, counter_list[bb]):
+                para_list[ss][0] = mycounter_list[bb]
+                final_list.append(para_list[ss])
+            del para_list[0:counter_list[bb]]
+        gff = open("../inputs/Potential_Priming_sites.txt", "w+")  # gff file
+        for ll in range(0, len(final_list)):
+            gff.write(str(final_list[ll][0]) +
+                      "\tRIblast\ttranscript\t" +
+                      str(final_list[ll][2][1])+"\t" +
+                      str(final_list[ll][2][0])+"\t" +
+                      str(final_list[ll][1])+"\t.\t.\t.\n")
+        gff.close
+        return gff
+    InterProb(data)