From 5ef1e0d2d3d837310ead314f962d2fe37973325c Mon Sep 17 00:00:00 2001
From: Hugo Gillet <hugo.gillet@stud.unibas.ch>
Date: Mon, 5 Dec 2022 22:10:50 +0000
Subject: [PATCH] Update representative.py

---
 scripts/representative.py | 66 ++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/scripts/representative.py b/scripts/representative.py
index fcc7bf8..047a959 100644
--- a/scripts/representative.py
+++ b/scripts/representative.py
@@ -1,15 +1,13 @@
-
+### Made by Hugo Gillet ###
 import pandas as pd
-import os 
+import os
 
-'''
+"""
 This part of the code take as input a gtf modified file 
 and return a dictionary of transcripts with best
 support level for each gene of the input
 
-'''
-
-
+"""
 
 
 def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
@@ -28,20 +26,24 @@ def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
     """
     pass
     if not type(gtf_modified_file) is str:
-      raise TypeError("Only str path is allowed")
-    df_input = pd.read_csv(gtf_modified_file, sep = '\t', lineterminator = '\n', 
-names = ["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] )
+        raise TypeError("Only str path is allowed")
+    df_input = pd.read_csv(
+        gtf_modified_file,
+        sep="\t",
+        lineterminator="\n",
+        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
+    )
     df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
-    df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True)
-    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True)
-    df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]]
-    df_clean["Gene"] = df_clean["Gene"].fillna(method = 'ffill')
-    df_clean = df_clean.dropna(axis = 0)
+    df_input["Gene"] = df_input["Gene_mixed"].str.extract(r"([A-Z]\w{0,})", expand=True)
+    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract(
+        r"(^\d)", expand=True
+    )
+    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]]
+    df_clean["Gene"] = df_clean["Gene"].ffill()  # rows without a gene match reuse the last one seen
+    df_clean = df_clean.dropna(axis=0)
     return df_clean
 
 
-
-
 def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
     """Return a dict containing for each gene transcripts 
         with highest confidence level
@@ -56,22 +58,22 @@ def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> pd.DataF
         Raises:
             TypeError : Only pandas DataFrame is allowed
     """
-    pass 
+    pass
 
     if not type(df_gtfSelection) is pd.DataFrame:
         raise TypeError("Only pandas DataFrame is allowed")
-  
-    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
-    #highest support level = 1 , worst = 5, NA = 100
-    df_min = df_multIndex[df_multIndex["Support_level"] == df_multIndex["Support_level"].min()]
-    df_final = df_min.reset_index(level = "Transcript")
-    df_final = df_final.drop(columns = ["Support_level"])
-    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
-    return dict_representative_transcripts  
-
-
-
-def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str,str]: 
+    df_min = df_gtfSelection[  # keep rows whose support level equals their gene's minimum
+        df_gtfSelection["Support_level"]
+        == df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
+    ]
+    df_final = df_min.drop(columns=["Support_level"])
+    dict_representative_transcripts = (
+        df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
+    )
+    return dict_representative_transcripts
+
+
+def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str, str]:
     """Combine functions import_gtfSelection_to_df() 
         and representative_transcripts_inDict()
 
@@ -86,12 +88,12 @@ def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str,str]:
 
           
     """
-    pass 
+    pass
     df_gtf = import_gtfSelection_to_df(intermediate_file)
     dict_reprTrans = representative_transcripts_inDict(df_gtf)
     return dict_reprTrans
 
 
 
-if __name__ == "__main__":  
-    find_repr_by_SupportLevel()
+if __name__ == "__main__":
+    find_repr_by_SupportLevel()  # FIXME(review): required intermediate_file argument is missing (TypeError)
-- 
GitLab