From 3d8c725ab1e5e3ad7258883bfc9df277821714e4 Mon Sep 17 00:00:00 2001
From: LauraU123 <laura.urbanska@stud.unibas.ch>
Date: Wed, 9 Nov 2022 10:18:15 +0100
Subject: [PATCH] added representative transcript and exon length filter

---
 scripts/Exon_length_filter.py       | 141 ++++++++++++++++++++++++++++
 scripts/representativeTranscript.py |  84 +++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 scripts/Exon_length_filter.py
 create mode 100644 scripts/representativeTranscript.py

diff --git a/scripts/Exon_length_filter.py b/scripts/Exon_length_filter.py
new file mode 100644
index 0000000..42979a9
--- /dev/null
+++ b/scripts/Exon_length_filter.py
@@ -0,0 +1,141 @@
+#### Exon length filter #####
+
+### Called Packages ###
+import re
+import os
+
+import transkript_extractor as te
+### Functions ###
+
+def exon_length_calculator(entry): 
+    """This funtion finds the start and end cordinates of the exon and uses them to calculate its lenght"""
+    try:
+        find_exon_coordinates = re.compile("\t\d{1,15}\t")
+        #this difines the pattern of the coordinates 
+        try_find_start_coordinates = find_exon_coordinates.search(entry)
+        #this line findes the start coordinares based on the pattern 
+        start_coordinates = int(try_find_start_coordinates[0].replace("\t",""))
+        #this line removes the \t at the end and the start of the pattern and 
+        #turn the string of the coordinates into intergers  
+        final_index_start_coordinates = entry.find(try_find_start_coordinates[0])+len(try_find_start_coordinates[0])-1
+        #this line determines the indes of the final digit of the start coordinates    
+        sub_entry = entry[final_index_start_coordinates:]
+        #this lineused the index determin above a starting point for a new sub entry
+        try_find_end_coordinates = find_exon_coordinates.search(sub_entry)
+        end_coordinates = int(try_find_end_coordinates[0].replace("\t",""))
+        #these two lines find the end coordinates and turn tham int an int 
+        exon_lenght = end_coordinates-start_coordinates
+        #this line claculates the transcript length 
+    except:
+        print("\n\nIn the following enty only one or no valid coordinates could be found:\n",entry,"the value will be set to NA")
+        exon_lenght = "NA"
+    return(exon_lenght)
+
+def exon_fider(entry):
+    """This funtion determines if a given entry belongs to an exon"""
+    exon_test = entry.find("\texon\t")
+    #This line look for the entry exon in the file
+    if exon_test == -1: 
+        try_exon_test = False
+    else:
+        try_exon_test = True
+    #The block above evaluates the results of the search for the wort exon
+    return(try_exon_test)
+
+def __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID):
+    """This funtion encapsulates an opperation that has to be carried out at several point ind the exon_length_filter funktion and servers to make that funktion more modular"""
+    if current_exon_length > longest_transcript: 
+        #This condition updates the most promesing for
+        #beeing the representative transcript
+        longest_transcript = current_exon_length
+        longest_transcript_ID = old_transcript_ID
+    current_exon_length = 0
+    return(current_exon_length,longest_transcript,longest_transcript_ID)
+
+def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}):
+    """This funtion selects only the transcripts for a dictionar that have the longest total mRNA"""
+    print("Representative trascripts are filterd based on exon length please wait...")
+    source_pathway_name,deposit_pathway_name = te.__do_pathways_exist__(source_pathway_name,deposit_pathway_name)
+    with open(source_pathway_name+"\\"+file_name+".gtf", 'r') as f:
+        
+        old_gen = str()
+        old_transcript_ID = str()
+        representative_transcript = dict()
+        representative_trasnscript_not_found = True
+        longest_transcript_ID = str()
+        current_exon_length = 0
+        longest_transcript = 0 
+        
+        for entry in f: 
+            
+            try:
+                corrent_gen = te.gene_ID_finder(entry)
+            except:
+                corrent_gen = old_gen
+            #The block above test if there is a gen name in the entry
+            if corrent_gen != old_gen:    
+                representative_trasnscript_not_found = True
+            #The block above determines if the Gen name is new and set the test
+            #representative_trasnscript_not_found back to true which is used to 
+            #make the program faster if there is just one transcript for a given
+            #gen in the dict
+            if representative_trasnscript_not_found and corrent_gen in gen_dict:
+                #print(corrent_gen)
+                #The conditon prvents serges if a representative transcript has
+                #all ready been chosen
+                if corrent_gen != old_gen:
+                    current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID)
+                    representative_transcript[old_gen] = longest_transcript_ID
+                    try:
+                        del gen_dict[old_gen]
+                        old_gen = corrent_gen 
+                    except:
+                        old_gen = corrent_gen
+                    longest_transcript = 0
+                    #The block above adds the transcript of the last gen that 
+                    #had the longest exons into the representative transcripts dict
+                    try: 
+                        #This try / except block test if the gen is in the input dictionary
+                        transcript_IDs = gen_dict[corrent_gen]
+                        if len(gen_dict[corrent_gen]) == 1:
+                            #This conditions is a short cut for Genes that 
+                            #allready have a representative transcript
+                            representative_transcript=gen_dict[corrent_gen[0]]
+                            representative_trasnscript_not_found = False
+                            continue
+                    except:
+                        continue
+                    
+                try: 
+                    current_transcript_ID = te.transcript_ID_finder(entry)                     
+                except: 
+                    continue
+                #The block above searches for a trnascript ID in the  current enty
+
+                if current_transcript_ID in transcript_IDs:
+                    #This condition test if the Transcript is one of the 
+                    #candidates for representative transcripts
+                    if current_transcript_ID != old_transcript_ID:
+                        #This condition if the enty still belongs to the 
+                        #previous transcript and is triggers if that is not the case
+                        current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID)
+                        try:
+                            transcript_IDs.remove(old_transcript_ID)
+                            old_transcript_ID = current_transcript_ID
+                        except:
+                            old_transcript_ID = current_transcript_ID
+                    if exon_fider(entry): 
+                        exon_length = exon_length_calculator(entry)
+                        current_exon_length += exon_length
+                    else: 
+                        continue 
+        current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID)
+        representative_transcript[old_gen] = longest_transcript_ID
+    del representative_transcript[str()]
+    print("Representative transcripts collected\n")
+    return(representative_transcript)
+
+if __name__ == "__main__":
+    exon_length_filter()
+    
+#This line allows the file to be executed on its own also from 
\ No newline at end of file
diff --git a/scripts/representativeTranscript.py b/scripts/representativeTranscript.py
new file mode 100644
index 0000000..f5649f2
--- /dev/null
+++ b/scripts/representativeTranscript.py
@@ -0,0 +1,84 @@
+
+import pandas as pd
+import re 
+
+'''
+This part of the code find a representative transcript  for each ID given in the csv input file. 
+It works with the output of the function "transcript selector".
+It return a csv file containing the names of the representative transcript and their levels.
+
+'''
+
+#import csv file [ID, Expression_level] and create a df
+def import_csv_to_df(csv_file) :
+    df_csv = pd.read_csv(csv_file, names=["ID", "Expression_level"])
+    return df_csv
+
+
+#import modified gtf file and create a df
+def import_gtfSelection_to_df(gtf_modified_file):
+
+    #create a df from the tab separated file input
+    df_input =pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n', 
+names =["Gene_mixed", "Transcript", "Support_level", "Length", "Line_index", "Na"] )
+
+    #Create a new column with only gene name from Gene_mixed column
+    df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True)
+
+    #Create a new column with only transcript number from Gene_mixed column
+    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True)
+
+    #Create a new df with relevant column and without NA
+    df_clean = df_input.loc[:, ["Gene","Transcript_number","Transcript","Support_level", "Length", "Line_index"]]
+    df_clean["Gene"] = df_clean["Gene"].fillna(method='ffill')
+    df_clean = df_clean.dropna(axis=0)
+    return df_clean
+
+
+
+# Return a df containing representative transcripts and their expression level from genes mentioned in the csv file 
+
+def representative_from_gene(df_gtfSelection, df_csv): 
+    #check wich genes are shared by df_gtfSelection and df_csv and create a reduced df
+    df_shared = df_gtfSelection[(df_gtfSelection["Gene"].isin(df_csv["ID"]))]# /!\ return an empty list if I do the same with "Transcript" column. 
+
+    #pick only the transcripts with the highest support level (best is = 1 )
+    df_filtered = df_shared[df_shared["Support_level"]==df_shared["Support_level"].min()]
+
+    #pick the transcript with the greatest length if multiple transcript have the same support level
+    df_filtered2 = df_filtered.sort_index().groupby(df_filtered["Gene"]).max()
+
+    #combine expression level with transcript with highest support level and greatest length 
+    df_filtered2["Expression_level"] = df_filtered2.Gene.map(df_csv.set_index("ID")['Expression_level'])
+
+    #create a reduced df with only the representative transcript and its support level 
+    df_final = df_filtered2[["Transcript","Expression_level"]]
+    print("This is your representative transcripts : \n \n {}".format(df_final))
+
+    return df_final 
+
+
+#Missing conditions :
+    # If multiple transcript form the same gene, add their expression 
+    # If given transcript name, return the most representative transcript 
+
+
+
+# Output csv file containing representative transcript levels and id
+def write_csv_Transcript_Expression_level(df_representative):
+    df_final = df_representative[["Transcript","Expression_level"]]
+    with open("transcripts.csv", "w") as fileout : 
+        fileout.write(df_representative.to_csv(columns=["Transcript","Expression_level"], index=False))
+
+### put your inputs here ! ###
+
+ID_levels_csv = "scRNA\input.csv" # put your csv file here
+gtf_file = "scRNA/test_inter_mediat_file.txt" # put your gtf selected file here 
+
+df_gtf = import_gtfSelection_to_df(gtf_file)
+df_csv_input = import_csv_to_df(ID_levels_csv)
+
+representative_transcript = representative_from_gene(df_gtf, df_csv_input)
+
+write_csv_Transcript_Expression_level(representative_transcript) # create a csv file named "transcript.csv"
+
-- 
GitLab