### Made by Hugo Gillet ###
import argparse
import json
import os
import re

import pandas as pd

import representative as repr

# Matches a dot followed by one or more digits (e.g. the ".12" in "ENST0001.12").
# Shared by every function that strips such suffixes so they stay consistent.
_VERSION_SUFFIX_PATTERN = r"\.[0-9]+"

# Name of the file written by output_tsv() in the current working directory.
_OUTPUT_FILENAME = "ReprTrans_ExpressionLevel.tsv"


def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame:
    """Convert a gene -> representative transcript dictionary into a dataframe.

    Every ``.<digits>`` suffix is removed from the transcript names.

    Args:
        dict_reprTrans (dict): {'Gene': 'transcript', ...}; keys and values
            must all be strings. An empty dict yields an empty dataframe.

    Returns:
        Pandas dataframe with the columns "Gene" and "reprTrans".

    Raises:
        TypeError: if the argument is not a dict ("Only dict are allowed"),
            a key is not a string ("Key should be strings") or a value is
            not a string ("Values should be strings").
    """
    if not isinstance(dict_reprTrans, dict):
        raise TypeError("Only dict are allowed")
    # Validate every entry, not just the first one, so that an empty dict is
    # accepted and a bad entry further down cannot slip through.
    if not all(isinstance(gene, str) for gene in dict_reprTrans):
        raise TypeError("Key should be strings")
    if not all(isinstance(trans, str) for trans in dict_reprTrans.values()):
        raise TypeError("Values should be strings")

    return pd.DataFrame(
        {
            "Gene": list(dict_reprTrans),
            "reprTrans": [
                re.sub(_VERSION_SUFFIX_PATTERN, "", trans)
                for trans in dict_reprTrans.values()
            ],
        }
    )


def txt_to_dict(dict_txt: str) -> dict:
    """Load a dictionary stored as text in a file.

    Single quotes in the file are replaced by double quotes before the text
    is parsed as JSON, so a Python-style dict literal using single quotes
    can be read.

    Args:
        dict_txt (str): path to a txt file holding a dict, e.g.
            {'Gene': 'transcript', ...}

    Returns:
        The parsed dictionary.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the text is not valid JSON after the quote
            replacement.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(dict_txt, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text.replace("'", '"'))


def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
    """Reduce the gtf selection dataframe to its gene and transcript columns.

    Args:
        df_gtfSelection: Pandas dataframe providing the columns "Gene",
            "Transcript" and "Support_level". Intended to come from
            import_gtfSelection_to_df() of the representative.py script.

    Returns:
        Pandas dataframe with "Gene" as first column followed by
        "Transcript", where every ``.<digits>`` suffix has been removed from
        the transcript names. The input dataframe is not modified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df_gene = df_gtfSelection.set_index(["Gene"])
    df_gene = df_gene.drop(columns=["Support_level"])
    df_gene["Transcript"] = df_gene["Transcript"].str.replace(
        _VERSION_SUFFIX_PATTERN, "", regex=True
    )
    # Moving "Gene" back out of the index makes it the first column.
    return df_gene.reset_index(level=0)


def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
    """Read a header-less tsv or csv file into a pandas dataframe.

    Args:
        input_txt (str): path to a csv or tsv file containing transcripts
            and their expression level; fields may be separated by a tab or
            by a comma.

    Returns:
        Pandas dataframe with the columns "Transcript" and
        "Expression_level".

    Raises:
        None
    """
    return pd.read_csv(
        input_txt,
        sep=r"[\t,]",  # regex separator: accepts both tsv and csv
        lineterminator="\n",
        names=["Transcript", "Expression_level"],
        engine="python",  # regex separators need the python engine
    )


def exprLevel_byGene(
    df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame
) -> pd.DataFrame:
    """Find the gene of each transcript and sum the expression level per gene.

    Args:
        df_exprTrasncript: pandas dataframe containing transcripts and their
            expression level, generated by the "tsv_or_csv_to_df" function.
        df_output_gtf_selection: pandas dataframe containing genes and
            transcripts, generated by the "transcripts_by_gene_inDf"
            function.

    Returns:
        Pandas dataframe indexed by "Gene" with a single column
        "Expression_level" holding the summed expression level of all
        transcripts of that gene. Transcripts present in only one of the
        two inputs are ignored (inner merge).

    Raises:
        None
    """
    df_merged = pd.merge(
        df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript"
    )
    # Select the column explicitly: the first positional argument of
    # GroupBy.sum() is ``numeric_only``, not a column name.
    return df_merged.groupby("Gene")[["Expression_level"]].sum()


def match_byGene(
    df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame
) -> pd.DataFrame:
    """Find the genes shared by the two arguments.

    Args:
        df_reprTranscript: pandas dataframe containing genes and their
            representative transcript, generated by
            "dict_reprTrans_to_df()".
        df_expressionLevel_byGene: pandas dataframe containing genes and
            their expression level, generated by "exprLevel_byGene()".

    Returns:
        Pandas dataframe with the columns "reprTrans" and
        "Expression_level"; rows with a missing value in any column after
        the merge are dropped.

    Raises:
        None
    """
    df_merged = pd.merge(
        df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene"
    )
    # Genes found in only one input carry NaN after the outer merge and are
    # removed here.
    df_clean = df_merged.dropna(axis=0)
    return df_clean.loc[:, ["reprTrans", "Expression_level"]]


def output_tsv(dataframe: pd.DataFrame) -> None:
    """Write a pandas dataframe to a tsv file in the working directory.

    The file is named "ReprTrans_ExpressionLevel.tsv" and is written without
    index and without header.

    Args:
        dataframe: Pandas dataframe containing representative transcripts
            and their expression level.

    Returns:
        None (the value returned by ``DataFrame.to_csv`` when a path is
        given).

    Raises:
        None
    """
    # os.path.join picks the right separator for the platform instead of a
    # hard-coded backslash.
    output_path = os.path.join(os.getcwd(), _OUTPUT_FILENAME)
    return dataframe.to_csv(output_path, sep="\t", index=False, header=False)


### functions to run this part of the program
def match_reprTranscript_expressionLevel(
    exprTrans: str, dict_reprTrans: dict, intermediate_file: str
):
    """Replace transcripts of an expression level file by representative ones.

    Combines the functions of this module: the expression levels are summed
    per gene and attached to the representative transcript of that gene,
    then written to "ReprTrans_ExpressionLevel.tsv" in the working directory.

    Args:
        exprTrans (str): csv or tsv file containing transcripts and their
            expression level.
        dict_reprTrans (dict): dict of genes and their representative
            transcript.
        intermediate_file (str): file read by
            representative.import_gtfSelection_to_df(); the resulting
            dataframe must provide the columns "Gene", "Transcript" and
            "Support_level".

    Returns:
        The value returned by output_tsv() (None); the result is the tsv
        file written to disk.

    Raises:
        TypeError: if dict_reprTrans is not a dict of strings.
    """
    df_intermediate = repr.import_gtfSelection_to_df(intermediate_file)
    df_geneTrans = transcripts_by_gene_inDf(df_intermediate)
    df_exprTrans = tsv_or_csv_to_df(exprTrans)
    df_reprTrans = dict_reprTrans_to_df(dict_reprTrans)
    df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_geneTrans)
    df_match = match_byGene(df_reprTrans, df_exprLevel_byGene)
    return output_tsv(df_match)


def _parse_args(argv=None) -> argparse.Namespace:
    """Parse the command line arguments used when run as a script."""
    parser = argparse.ArgumentParser(
        description=(
            "Replace the transcripts of an expression level csv/tsv file "
            "with representative transcripts."
        )
    )
    parser.add_argument(
        "exprTrans",
        help="csv or tsv file containing transcripts and their expression level",
    )
    parser.add_argument(
        "dict_reprTrans",
        help="txt file holding the dict of genes and representative transcripts",
    )
    parser.add_argument(
        "intermediate_file",
        help="file readable by representative.import_gtfSelection_to_df()",
    )
    return parser.parse_args(argv)


# run the program
if __name__ == "__main__":
    _args = _parse_args()
    match_reprTranscript_expressionLevel(
        _args.exprTrans,
        txt_to_dict(_args.dict_reprTrans),
        _args.intermediate_file,
    )