From 37ce5e830d83b06cb923869529da78a6b4ac79e5 Mon Sep 17 00:00:00 2001
From: LauraU123 <laura.urbanska@stud.unibas.ch>
Date: Wed, 9 Nov 2022 10:36:05 +0100
Subject: [PATCH] added representative script

---
 scripts/representative_v3.py | 63 ++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 scripts/representative_v3.py

diff --git a/scripts/representative_v3.py b/scripts/representative_v3.py
new file mode 100644
index 0000000..387dc62
--- /dev/null
+++ b/scripts/representative_v3.py
@@ -0,0 +1,63 @@
+
+import pandas as pd
+import re 
+import itertools 
+
+'''
+This code takes a modified GTF file as input and returns a dictionary of the transcripts with the best support level for each gene in the input
+
+'''
+
+
+
+## Import the modified GTF file and create a DataFrame ##
+
+def import_gtfSelection_to_df(gtf_modified_file):
+    # Read the tab-separated file and return a DataFrame with the Gene, Transcript and Support_level columns, rows containing NA removed.
+    # Load the tab-separated input, naming its five columns explicitly.
+    df_input =pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n', 
+names =["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] )
+    # NOTE(review): Series.replace(" ", "") only swaps cells that are exactly " "; stripping embedded spaces would need .str.replace - confirm intent.
+    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
+
+    # Gene name: the first uppercase letter followed by any word characters found in Gene_mixed (NaN when nothing matches).
+    df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True)
+
+    # Transcript number: a single leading digit of Gene_mixed; this column is not part of the returned frame.
+    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True)
+
+    # Keep the three relevant columns, forward-fill Gene into rows where no gene name was extracted, then drop rows that still contain NA.
+    df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]]
+    df_clean["Gene"] = df_clean["Gene"].fillna(method='ffill')
+    df_clean = df_clean.dropna(axis=0)
+    return df_clean
+
+
+
+## Returns a dictionary mapping each gene of the input df to the list of its transcripts ##
+
+def representative_transcripts_inDict(df_gtfSelection): 
+    # Build and return a dict mapping each gene to the list of its transcripts.
+    # df_gtfSelection: DataFrame with Gene, Transcript and Support_level columns.
+    # Index the frame on both the Gene and Transcript columns.
+    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
+    # Per-gene minimum support level, broadcast to every row (a Series; it is only printed below and never used to filter rows).
+    df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min")
+    print("\n=== This is your 10 first representative transcripts : === \n \n {}".format(df_min.head(10)))
+    # Move Transcript back to a regular column and drop the support level.
+    df_final = df_multIndex.reset_index(level="Transcript")
+    df_final = df_final.drop(columns=["Support_level"])
+    # NOTE(review): no filtering on df_min happens, so the dict lists every transcript of a gene, not only the best-supported ones - confirm intent.
+    # Group the transcripts by gene and convert to {gene: [transcripts]}.
+    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
+    return dict_representative_transcripts  
+
+
+
+### add your inputs here ! ###
+# These statements sit at module level, so they run whenever the file is executed or imported.
+gtf_file = "Homo_sapiens.GRCh38.107_intermediat_file.txt" # path of the tab-separated input file 
+# Parse the input file into the cleaned Gene / Transcript / Support_level DataFrame.
+df_gtf = import_gtfSelection_to_df(gtf_file)
+# Build the gene -> list-of-transcripts dictionary from that DataFrame.
+dictionary_of_representative_transcripts = representative_transcripts_inDict(df_gtf)
-- 
GitLab