diff --git a/scripts/match_reprtranscript_expressionlevel.py b/scripts/match_reprtranscript_expressionlevel.py index 17d8bcffcf4da1eee4b987182194fafd830446f0..a4fc8dfde20e70ceeaf03de4aac3983fb6c39993 100644 --- a/scripts/match_reprtranscript_expressionlevel.py +++ b/scripts/match_reprtranscript_expressionlevel.py @@ -1,4 +1,3 @@ - import pandas as pd import json import re @@ -6,7 +5,7 @@ import representative as repr import os -def dict_reprTrans_to_df(dict_reprTrans: dict): +def dict_reprTrans_to_df(dict_reprTrans: dict[str,str])-> pd.Dataframe: """Convert a dictionary of genes and their representative transcript into a dataframe @@ -17,11 +16,18 @@ def dict_reprTrans_to_df(dict_reprTrans: dict): Pandas dataframe having Gene and transcript as columns Raises: - /!\ None, I wasn't able to make a TypeError with dict - : Only dict made of key string and value string is allowed + Only dict are allowed + Key should be strings + Value should be strings """ pass + if not type(dict_reprTrans) is dict : + raise TypeError("Only dict are allowed") + if type(list(dict_reprTrans.keys())[0]) is not str : + raise TypeError("Key should be strings") + if type(list(dict_reprTrans.values())[0]) is not str : + raise TypeError("Values should be strings") df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"]) df_reprTrans = df_reprTrans.reset_index(level=0) @@ -30,7 +36,7 @@ def dict_reprTrans_to_df(dict_reprTrans: dict): return df_reprTrans -def txt_to_dict(dict_txt: str): +def txt_to_dict(dict_txt: str) -> dict: """Convert a txt file into a dictionary Args: @@ -52,13 +58,14 @@ def txt_to_dict(dict_txt: str): -def transcripts_by_gene_inDf(df_gtfSelection: str) -> pd.DataFrame: +def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame: """Convert multiindex dataframe from function into a simple dataframe Args: - df_gtfSelection (str): Pandas multiindex dataframe having Gene, + df_gtfSelection : Pandas multiindex dataframe having Gene, transcript as indexs and support level as columns. Come from the function import_gtfSelection_to_df() + from representative.py script. Returns: df_gene (str): Pandas dataframe having Gene and @@ -75,7 +82,7 @@ def transcripts_by_gene_inDf(df_gtfSelection: str) -> pd.DataFrame: return df_gene -def tsv_or_csv_to_df(input_txt:str) : +def tsv_or_csv_to_df(input_txt:str) -> pd.DataFrame : """Convert tsv or csv file into a pandas dataframe Args: @@ -95,12 +102,15 @@ def tsv_or_csv_to_df(input_txt:str) : return df_input -def exprLevel_byGene(df_exprTrasncript:str, df_output_gtf_selection:str) -> pd.DataFrame : - """Find matching transcripts bewteen the 2 args +def exprLevel_byGene(df_exprTrasncript:pd.DataFrame, df_output_gtf_selection:pd.DataFrame) -> pd.DataFrame : + """find the gene of each transcipt given by the expression level csv/tsv file, + and summ expression level of all transcipts from the same gene. Args: - df_exprTranscript (str): pandas Dataframe containing transcript and their expression level - df_output_gtf_selection (str) : pandas Dataframe containing genes and transcripts + df_exprTranscript : pandas Dataframe containing transcript and their expression level, + generated by "tsv_or_csv_to_df" function + df_output_gtf_selection : pandas Dataframe containing genes and transcripts, + generated by "transcripts_by_gene_inDf" function Returns: Pandas dataframe having gene and sum of its transcript expression level @@ -113,14 +123,16 @@ def exprLevel_byGene(df_exprTrasncript:str, df_output_gtf_selection:str) -> pd.D df_sum = df_merged.groupby("Gene").sum("Expression_level") # sum transcripts comming from the same gene return df_sum -def match_byGene(df_reprTranscript:str, df_expressionLevel_byGene:str) -> pd.DataFrame: +def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene:pd.DataFrame) -> pd.DataFrame: """Find matching genes bewteen the 2 args Args: - df_reprTranscript (str): pandas Dataframe containing genes - and their representative transcript - df_expressionLevel_byGene (str) : pandas Dataframe containing - genes and their expression level + df_reprTranscript : pandas Dataframe containing genes + and their representative transcript, generated by + "dict_reprTrans_to_df()" + df_expressionLevel_byGene : pandas Dataframe containing + genes and their expression level generated by + "transcript_by_gene_inDf()" Returns: Pandas dataframe having representative trasncripts @@ -135,7 +147,7 @@ def match_byGene(df_reprTranscript:str, df_expressionLevel_byGene:str) -> pd.Dat df_clean = df_clean.loc[:, ["reprTrans","Expression_level"]] return df_clean -def output_tsv(dataframe:str)-> pd.DataFrame : +def output_tsv(dataframe: pd.DataFrame)-> pd.DataFrame : """Convert pandas dataframe into a tsv file Args: @@ -195,6 +207,6 @@ def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, int #match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file) #print("this is the function :\n\n {}".format(match_final)) -#if __name__ == "__main__": -# match_reprTranscript_expressionLevel() +if __name__ == "__main__": + match_reprTranscript_expressionLevel()