From 441f364b5a192bcdc7ad9b5c0536c48c7a723462 Mon Sep 17 00:00:00 2001
From: LauraU123 <laura.urbanska@stud.unibas.ch>
Date: Wed, 16 Nov 2022 10:32:01 +0100
Subject: [PATCH] added script and updated representative transcript script

---
 .../match_reprTranscript_expressionLevel.py   | 200 ++++++++++++++++++
 scripts/representative_v3.py                  |  63 ------
 scripts/representative_v4.py                  |  96 +++++++++
 3 files changed, 296 insertions(+), 63 deletions(-)
 create mode 100644 scripts/match_reprTranscript_expressionLevel.py
 delete mode 100644 scripts/representative_v3.py
 create mode 100644 scripts/representative_v4.py

diff --git a/scripts/match_reprTranscript_expressionLevel.py b/scripts/match_reprTranscript_expressionLevel.py
new file mode 100644
index 0000000..2dfca50
--- /dev/null
+++ b/scripts/match_reprTranscript_expressionLevel.py
@@ -0,0 +1,200 @@
+
+import pandas as pd
+import json
+import re
+import representative_v4 as repr  # sibling script representative_v4.py; the module name was misspelled
+import os
+
+
+def dict_reprTrans_to_df(dict_reprTrans: dict):
+    """Convert a dictionary of genes and their representative transcript into a dataframe
+
+        The version suffix of every transcript ID (".1", ".12", ...) is removed.
+
+        Args:
+            dict_reprTrans (dict) : genes mapped to their representative
+            transcript, e.g. {'GeneA': 'transcriptA.1', ...}
+
+        Returns:
+            Pandas dataframe having "Gene" and "reprTrans" as columns
+
+        Raises:
+            None explicitly
+    """
+    df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"])
+    df_reprTrans = df_reprTrans.reset_index(level=0)
+    df_reprTrans.columns = ["Gene", 'reprTrans']
+    # "\.[1-9]" removed a single digit only, so "X.10" became "X0";
+    # "\.\d+" strips the complete version number
+    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(r'\.\d+', '', regex=True)
+    return df_reprTrans
+
+
+def txt_to_dict(dict_txt: str):
+    """Read a text file holding a dict literal and return it as a dictionary
+
+        Args:
+            dict_txt (str) : path to a txt file containing a dict
+            structured as {'Gene':['transcriptA', 'transcriptB'], ...}
+
+        Returns:
+            dict : dictionary structured as {'Gene':['transcriptA', 'transcriptB'], ...}
+
+        Raises:
+            OSError : the file cannot be opened
+            json.JSONDecodeError : the content is not valid JSON after the quote conversion
+    """
+    # the context manager closes the file handle even when reading fails
+    with open(dict_txt, "r") as txt_file:
+        content = txt_file.read()
+    # json only accepts double-quoted strings, so the single quotes are converted first
+    return json.loads(content.replace("'", '"'))
+
+
+
+def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
+    """Reduce the gtf selection dataframe to genes and unversioned transcript IDs
+
+        Args:
+            df_gtfSelection (pd.DataFrame): Pandas dataframe having Gene,
+            Transcript and Support_level as columns, as returned by
+            import_gtfSelection_to_df()
+
+        Returns:
+            df_gene (pd.DataFrame): Pandas dataframe having Gene and
+            Transcript as columns, the version suffix of the IDs removed
+
+        Raises:
+            None
+    """
+    df_gene = df_gtfSelection.set_index(["Gene"])
+    df_gene = df_gene.drop(columns=["Support_level"])
+    # "\.[0-9]" only removed one digit, so "X.10" became "X0"; strip the whole number
+    df_gene['Transcript'] = df_gene['Transcript'].str.replace(r"\.\d+", "", regex=True)
+    df_gene = df_gene.reset_index(level=0)
+    return df_gene
+
+
+def tsv_or_csv_to_df(input_txt:str) :
+    """Load a tab or comma separated expression file as a pandas dataframe
+
+        Args:
+            input_txt (str): path to the csv or tsv file listing
+            transcripts and their expression level
+
+        Returns:
+            Pandas dataframe having "Transcript" and "Expression_level"
+            as columns
+
+        Raises:
+            None
+    """
+    # a single regex separator accepts both tabs and commas
+    column_names = ["Transcript", "Expression_level"]
+    return pd.read_csv(input_txt, sep=r"[\t,]", lineterminator='\n',
+                       names=column_names, engine="python")
+
+
+def exprLevel_byGene(df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame) -> pd.DataFrame:
+    """Sum the expression level of all transcripts belonging to the same gene
+
+        Args:
+            df_exprTrasncript (pd.DataFrame): transcripts ("Transcript") and their "Expression_level"
+            df_output_gtf_selection (pd.DataFrame): genes ("Gene") and their transcripts ("Transcript")
+
+        Returns:
+            Pandas dataframe indexed by Gene with the summed Expression_level as only column
+
+        Raises:
+            None
+    """
+    df_merged = pd.merge(df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript")
+    # select the column before summing: sum("Expression_level") passed the name as
+    # the numeric_only flag instead of choosing the column to aggregate
+    return df_merged.groupby("Gene")[["Expression_level"]].sum()
+
+def match_byGene(df_reprTranscript:str, df_expressionLevel_byGene:str) -> pd.DataFrame:
+    """Attach the gene expression level to each representative transcript
+
+        Args:
+            df_reprTranscript : pandas Dataframe containing genes ("Gene")
+            and their representative transcript ("reprTrans")
+            df_expressionLevel_byGene : pandas Dataframe containing
+            genes ("Gene") and their "Expression_level"
+
+        Returns:
+            Pandas dataframe having "reprTrans" and "Expression_level" as
+            columns, restricted to rows without missing values
+
+        Raises:
+            None
+    """
+    # an outer join keeps unmatched genes as rows holding NaN ...
+    all_genes = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene")
+    # ... which are discarded here together with any other incomplete row
+    complete_rows = all_genes.dropna(axis=0)
+    return complete_rows.loc[:, ["reprTrans", "Expression_level"]]
+
+def output_tsv(dataframe: pd.DataFrame) -> None:
+    """Write a pandas dataframe to a tsv file in the current working directory
+
+        Args:
+            dataframe (pd.DataFrame): Pandas dataframe containing
+            representative transcripts and their expression level
+
+        Returns:
+            None, the result of DataFrame.to_csv() when a path is given; the
+            file "ReprTrans_ExpressionLevel.tsv" is written without index
+            and without header
+
+        Raises:
+            None
+    """
+    # os.path.join uses the separator of the running platform; the hard-coded "\"
+    # only worked on Windows and "\R" is an invalid string escape
+    tsv_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
+    return dataframe.to_csv(tsv_path, sep="\t", index=False, header=False)
+
+### functions to run this part of the program
+
+def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str):
+    """Replace the transcripts of an expression level csv/tsv file
+       with the representative transcript of their gene
+
+        Args:
+            exprTrans (str): csv or tsv file containing transcripts
+            and their expression level
+            dict_reprTrans (dict) : dict of genes and their
+            representative transcript
+            intermediate_file (str) : txt file containing genes, transcripts
+            and their support level, read with import_gtfSelection_to_df()
+
+        Returns:
+            The value returned by output_tsv(), which writes the tsv file of
+            representative transcripts and their expression level
+
+        Raises:
+            None
+    """
+    gene_transcripts = transcripts_by_gene_inDf(repr.import_gtfSelection_to_df(intermediate_file))
+    expression = tsv_or_csv_to_df(exprTrans)
+    representatives = dict_reprTrans_to_df(dict_reprTrans)
+    # expression summed per gene, then paired with the representative transcript of that gene
+    gene_expression = exprLevel_byGene(expression, gene_transcripts)
+    matched = match_byGene(representatives, gene_expression)
+    return output_tsv(matched)
+
+
+# run the program (only when executed as a script, never on import)
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) != 4:
+        sys.exit("usage: match_reprTranscript_expressionLevel.py DICT_TXT INTERMEDIATE_FILE EXPRESSION_FILE")
+    # txt file of a {gene: reprTrans} dict, intermediate file of the transcript extractor, csv/tsv of expression levels
+    dict_txt, input_intermediate_file, input_expr = sys.argv[1:]
+    dict_reprTrans = txt_to_dict(dict_txt)
+    match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file)
+    print("this is the function :\n\n {}".format(match_final))
+ 
\ No newline at end of file
diff --git a/scripts/representative_v3.py b/scripts/representative_v3.py
deleted file mode 100644
index 387dc62..0000000
--- a/scripts/representative_v3.py
+++ /dev/null
@@ -1,63 +0,0 @@
-
-import pandas as pd
-import re 
-import itertools 
-
-'''
-This code take as input a gtf file and returns a dictionary of transcripts with best support level of each gene of the input
-
-'''
-
-
-
-##import modified gtf file and create a df##
-
-def import_gtfSelection_to_df(gtf_modified_file):
-
-    #create a df from the tab separated file input
-    df_input =pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n', 
-names =["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] )
-
-    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
-
-    #Create a new column with only gene name from Gene_mixed column
-    df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True)
-
-    #Create a new column with only transcript number from Gene_mixed column
-    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True)
-
-    #Create a new df with relevant column and without NA
-    df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]]
-    df_clean["Gene"] = df_clean["Gene"].fillna(method='ffill')
-    df_clean = df_clean.dropna(axis=0)
-    return df_clean
-
-
-
-##Returns a df containing representative transcripts and their expression level from genes mentioned in the csv file##
-
-def representative_transcripts_inDict(df_gtfSelection): 
-   
-
-    #create a df indexed on booth Gene and Transcript columns 
-    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
-    #create a df with only the transcripts with the highest support level (best is = 1 )
-    df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min")
-    print("\n=== This is your 10 first representative transcripts : === \n \n {}".format(df_min.head(10)))
-    #create a df without transcript levels
-    df_final = df_multIndex.reset_index(level="Transcript")
-    df_final = df_final.drop(columns=["Support_level"])
-    
-    #create a dict with only Gene and representative transcripts
-    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
-    return dict_representative_transcripts  
-
-
-
-### add your inputs here ! ###
-
-gtf_file = "Homo_sapiens.GRCh38.107_intermediat_file.txt" # add the gtf input file here 
-
-df_gtf = import_gtfSelection_to_df(gtf_file)
-
-dictionary_of_representative_transcripts = representative_transcripts_inDict(df_gtf)
diff --git a/scripts/representative_v4.py b/scripts/representative_v4.py
new file mode 100644
index 0000000..c940686
--- /dev/null
+++ b/scripts/representative_v4.py
@@ -0,0 +1,96 @@
+
+import pandas as pd
+
+'''
+This part of the code takes a modified gtf file as input
+and returns a dictionary of the transcripts with the best
+support level for each gene of the input
+
+'''
+
+
+
+
+def import_gtfSelection_to_df(gtf_modified_file: str):
+    """Import the intermediate file derived from a gtf and create a df
+
+        Args:
+            gtf_modified_file (str) : path to the tab separated intermediate file
+
+        Returns:
+            Pandas dataframe having Gene, Transcript
+            and Support_level as columns, without rows containing NA
+
+        Raises:
+            TypeError : Only str path is allowed
+    """
+    # isinstance is the idiomatic type check and also accepts str subclasses
+    if not isinstance(gtf_modified_file, str):
+        raise TypeError("Only str path is allowed")
+    df_input = pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n',
+                           names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"])
+    # cells that are exactly one space become empty strings
+    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
+    # raw string so "\w" reaches the regex engine instead of being an invalid str escape
+    df_input["Gene"] = df_input["Gene_mixed"].str.extract(r'([A-Z]\w{0,})', expand=True)
+    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]]
+    # rows without a gene name take the gene of the closest row above
+    df_clean["Gene"] = df_clean["Gene"].ffill()
+    return df_clean.dropna(axis=0)
+
+
+
+
+def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> dict:
+    """Return a dict containing for each gene the transcripts
+        with the best (lowest) support level
+
+        Args:
+            df_gtfSelection (pd.DataFrame): Pandas dataframe having Gene,
+            Transcript and Support_level as columns
+
+        Returns:
+            Dict {'Gene':['transcriptA', 'transcriptB'], ...}; a gene maps to
+            several transcripts when they tie on the best support level
+
+        Raises:
+            TypeError : Only pandas DataFrame is allowed
+    """
+    if not isinstance(df_gtfSelection, pd.DataFrame):
+        raise TypeError("Only pandas DataFrame is allowed")
+
+    #highest support level = 1 , worst = 5, NA = 100
+    # best level of each gene, broadcast back onto every transcript row of that gene
+    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
+    # keep only the transcripts reaching that level; the previous version
+    # computed the minimum but never filtered, so every transcript was returned
+    df_best = df_gtfSelection[df_gtfSelection["Support_level"] == best_level]
+    dict_representative_transcripts = df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
+    return dict_representative_transcripts
+
+
+
+def find_repr_by_SupportLevel(intermediate_file:str):
+    """Find the representative transcripts of the genes listed
+        in an intermediate file
+
+        Chains import_gtfSelection_to_df() and
+        representative_transcripts_inDict()
+
+        Args:
+            intermediate_file : path to the intermediate file
+
+        Returns:
+            Dict {'Gene':['transcriptA', 'transcriptB'], ...}
+
+        Raises:
+            TypeError : raised by import_gtfSelection_to_df() when the
+            path is not a str
+    """
+    # load the file as a dataframe and hand it straight to the selection step
+    return representative_transcripts_inDict(import_gtfSelection_to_df(intermediate_file))
+
+
+if __name__ == "__main__":
+    import sys  # the path of the intermediate file is the first command line argument
+    print(find_repr_by_SupportLevel(sys.argv[1]))
-- 
GitLab