diff --git a/scripts/match_reprTranscript_expressionLevel.py b/scripts/match_reprTranscript_expressionLevel.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dfca50a3c3ee1458b7358fbd1e1d801751d727c
--- /dev/null
+++ b/scripts/match_reprTranscript_expressionLevel.py
@@ -0,0 +1,221 @@
+"""Match representative transcripts with gene-level expression values.
+
+Reads a transcript expression table (csv or tsv), sums the expression of
+all transcripts belonging to the same gene and writes a tsv file pairing
+each gene's representative transcript with that summed expression level.
+"""
+
+import argparse
+import json
+import os
+import re
+
+import pandas as pd
+
+import representative_v4
+
+
+def dict_reprTrans_to_df(dict_reprTrans: dict) -> pd.DataFrame:
+    """Convert a gene -> representative transcript dict into a dataframe.
+
+    Args:
+        dict_reprTrans (dict): one entry per gene whose value is the
+            representative transcript id, given as a string or as a
+            one-element list, e.g. {'GeneA': ['transcriptA.4'], ...}
+
+    Returns:
+        pd.DataFrame: columns "Gene" and "reprTrans"; the version suffix
+            (".4") is removed from the transcript ids
+
+    Raises:
+        ValueError: raised by pandas when a gene maps to more than one
+            transcript, because a single "reprTrans" column is created
+    """
+    df_reprTrans = pd.DataFrame.from_dict(
+        dict_reprTrans, orient="index", columns=["reprTrans"]
+    )
+    df_reprTrans = df_reprTrans.reset_index(level=0)
+    df_reprTrans.columns = ["Gene", "reprTrans"]
+    # strip the complete version suffix (".12"), not only its first digit
+    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
+        r"\.\d+", "", regex=True
+    )
+    return df_reprTrans
+
+
+def txt_to_dict(dict_txt: str) -> dict:
+    """Load a dictionary that was saved as text.
+
+    Args:
+        dict_txt (str): path to a txt file holding a dict written as
+            {'Gene': ['transcriptA', 'transcriptB'], ...}
+
+    Returns:
+        dict: the parsed dictionary
+
+    Raises:
+        json.JSONDecodeError: the text is not valid JSON once its single
+            quotes have been replaced by double quotes
+    """
+    # the context manager closes the file even if reading fails
+    with open(dict_txt, "r") as handle:
+        text = handle.read()
+    # json only accepts double-quoted strings
+    return json.loads(text.replace("'", '"'))
+
+
+def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
+    """Reduce the gtf selection to gene / transcript pairs.
+
+    Args:
+        df_gtfSelection (pd.DataFrame): dataframe with the columns "Gene",
+            "Transcript" and "Support_level", as returned by
+            import_gtfSelection_to_df()
+
+    Returns:
+        pd.DataFrame: the input without "Support_level" and with the
+            version suffix removed from the transcript ids
+
+    Raises:
+        KeyError: the "Support_level" or "Transcript" column is missing
+    """
+    df_gene = df_gtfSelection.drop(columns=["Support_level"])
+    df_gene = df_gene.reset_index(drop=True)
+    # strip the complete version suffix (".12"), not only its first digit
+    df_gene["Transcript"] = df_gene["Transcript"].str.replace(
+        r"\.\d+", "", regex=True
+    )
+    return df_gene
+
+
+def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
+    """Read a headerless tsv or csv file into a dataframe.
+
+    Args:
+        input_txt (str): csv or tsv file containing transcript expression
+            levels
+
+    Returns:
+        pd.DataFrame: columns "Transcript" and "Expression_level"
+    """
+    # a regular-expression separator (tab or comma) needs the python engine
+    df_input = pd.read_csv(
+        input_txt,
+        sep=r"[\t,]",
+        lineterminator="\n",
+        names=["Transcript", "Expression_level"],
+        engine="python",
+    )
+    return df_input
+
+
+def exprLevel_byGene(
+    df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame
+) -> pd.DataFrame:
+    """Sum the expression level of all transcripts of each gene.
+
+    Args:
+        df_exprTrasncript (pd.DataFrame): columns "Transcript" and
+            "Expression_level"
+        df_output_gtf_selection (pd.DataFrame): columns "Gene" and
+            "Transcript"
+
+    Returns:
+        pd.DataFrame: indexed by "Gene", with the summed
+            "Expression_level" of the transcripts found in both inputs
+    """
+    df_merged = pd.merge(
+        df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript"
+    )
+    # select the column explicitly: the first positional argument of sum()
+    # is numeric_only, not a column name
+    return df_merged.groupby("Gene")[["Expression_level"]].sum()
+
+
+def match_byGene(
+    df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame
+) -> pd.DataFrame:
+    """Attach the gene expression level to the representative transcripts.
+
+    Args:
+        df_reprTranscript (pd.DataFrame): columns "Gene" and "reprTrans"
+        df_expressionLevel_byGene (pd.DataFrame): "Expression_level" per
+            gene, with "Gene" as column or index level
+
+    Returns:
+        pd.DataFrame: columns "reprTrans" and "Expression_level" for the
+            genes found in both inputs
+    """
+    df_merged = pd.merge(
+        df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene"
+    )
+    # after the outer merge, genes missing from one input hold NaN
+    df_clean = df_merged.dropna(axis=0)
+    return df_clean.loc[:, ["reprTrans", "Expression_level"]]
+
+
+def output_tsv(dataframe: pd.DataFrame) -> str:
+    """Write the dataframe to a tsv file in the working directory.
+
+    Args:
+        dataframe (pd.DataFrame): representative transcripts and their
+            expression level
+
+    Returns:
+        str: path of the written file, ReprTrans_ExpressionLevel.tsv
+    """
+    # os.path.join uses the path separator of the running platform
+    output_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
+    dataframe.to_csv(output_path, sep="\t", index=False, header=False)
+    return output_path
+
+
+def match_reprTranscript_expressionLevel(
+    exprTrans: str, dict_reprTrans: dict, intermediate_file: str
+) -> str:
+    """Replace transcripts of an expression file by representative ones.
+
+    Args:
+        exprTrans (str): csv or tsv file containing transcripts and their
+            expression level
+        dict_reprTrans (dict): genes and their representative transcript
+        intermediate_file (str): txt file containing genes, transcripts
+            and their support level
+
+    Returns:
+        str: path of the tsv file of representative transcripts and
+            their expression level
+    """
+    df_intermediate = representative_v4.import_gtfSelection_to_df(intermediate_file)
+    df_geneTrans = transcripts_by_gene_inDf(df_intermediate)
+    df_exprTrans = tsv_or_csv_to_df(exprTrans)
+    df_reprTrans = dict_reprTrans_to_df(dict_reprTrans)
+    df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_geneTrans)
+    df_match = match_byGene(df_reprTrans, df_exprLevel_byGene)
+    return output_tsv(df_match)
+
+
+def main() -> None:
+    """Read the three input paths from the command line and run the match."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "dict_txt", help="txt file holding the dict of {gene: reprTrans}"
+    )
+    parser.add_argument(
+        "intermediate_file",
+        help="intermediate file generated by the transcript extractor",
+    )
+    parser.add_argument(
+        "expr", help="csv or tsv file containing the expression level"
+    )
+    args = parser.parse_args()
+
+    dict_reprTrans = txt_to_dict(args.dict_txt)
+    output_path = match_reprTranscript_expressionLevel(
+        args.expr, dict_reprTrans, args.intermediate_file
+    )
+    print("Representative transcripts written to {}".format(output_path))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/representative_v3.py b/scripts/representative_v3.py
deleted file mode 100644
index 387dc627205643b73ab5a0f8f28abd0e385e1961..0000000000000000000000000000000000000000
--- a/scripts/representative_v3.py
+++ /dev/null
@@ -1,63 +0,0 @@
-
-import pandas as pd
-import re
-import itertools
-
-'''
-This code take as input a gtf file and returns a dictionary of transcripts with best support level of each gene of the input
-
-'''
-
-
-
-##import modified gtf file and create a df##
-
-def import_gtfSelection_to_df(gtf_modified_file):
-
-    #create a df from the tab separated file input
-    df_input =pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n',
-names =["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] )
-
-    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
-
-    #Create a new column with only gene name from Gene_mixed column
-    df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True)
-
-    #Create a new column with only transcript number from Gene_mixed column
-    df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True)
-
-    #Create a new df with relevant column and without NA
-    df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]]
-    df_clean["Gene"] = df_clean["Gene"].fillna(method='ffill')
-    df_clean = df_clean.dropna(axis=0)
-    return df_clean
-
-
-
-##Returns a df containing representative transcripts and their expression level from genes mentioned in the csv file##
-
-def representative_transcripts_inDict(df_gtfSelection):
-
-
-    #create a df indexed on booth Gene and Transcript columns
-    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
-    #create a df with only the transcripts with the highest support level (best is = 1 )
-    df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min")
-    print("\n=== This is your 10 first representative transcripts : === \n \n {}".format(df_min.head(10)))
-    #create a df without transcript levels
-    df_final = df_multIndex.reset_index(level="Transcript")
-    df_final = df_final.drop(columns=["Support_level"])
-
-    #create a dict with only Gene and representative transcripts
-    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
-    return dict_representative_transcripts
-
-
-
-### add your inputs here ! ###
-
-gtf_file = "Homo_sapiens.GRCh38.107_intermediat_file.txt" # add the gtf input file here
-
-df_gtf = import_gtfSelection_to_df(gtf_file)
-
-dictionary_of_representative_transcripts = representative_transcripts_inDict(df_gtf)
diff --git a/scripts/representative_v4.py b/scripts/representative_v4.py
new file mode 100644
index 0000000000000000000000000000000000000000..c940686b2126121b33d12a858026338aff737706
--- /dev/null
+++ b/scripts/representative_v4.py
@@ -0,0 +1,95 @@
+"""Select the representative transcripts of each gene.
+
+Takes a modified gtf file (the intermediate file) as input and returns a
+dictionary holding, for each gene, the transcripts with the best support
+level.
+"""
+
+import argparse
+
+import pandas as pd
+
+
+def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
+    """Import the intermediate file derived from a gtf into a dataframe.
+
+    Args:
+        gtf_modified_file (str): path to the intermediate file
+
+    Returns:
+        pd.DataFrame: columns "Gene", "Transcript" and "Support_level",
+            without the rows that hold a missing value
+
+    Raises:
+        TypeError: gtf_modified_file is not a str
+    """
+    if not isinstance(gtf_modified_file, str):
+        raise TypeError("Only str path is allowed")
+    df_input = pd.read_csv(
+        gtf_modified_file,
+        sep="\t",
+        lineterminator="\n",
+        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
+    )
+    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
+    # raw string: "\w" is not a valid escape in a normal string literal
+    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
+        r"([A-Z]\w{0,})", expand=True
+    )
+    # copy so the assignment below never writes into a view of df_input
+    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
+    # rows without a gene name take the last gene name found above them
+    df_clean["Gene"] = df_clean["Gene"].ffill()
+    df_clean = df_clean.dropna(axis=0)
+    return df_clean
+
+
+def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> dict:
+    """Return, for each gene, the transcripts with the best support level.
+
+    Args:
+        df_gtfSelection (pd.DataFrame): columns "Gene", "Transcript" and
+            "Support_level"
+
+    Returns:
+        dict: {'Gene': ['transcriptA', 'transcriptB'], ...}
+
+    Raises:
+        TypeError: df_gtfSelection is not a pandas DataFrame
+    """
+    if not isinstance(df_gtfSelection, pd.DataFrame):
+        raise TypeError("Only pandas DataFrame is allowed")
+
+    # highest support level = 1 , worst = 5, NA = 100
+    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
+    # keep only the transcripts that reach the best level of their gene
+    df_best = df_gtfSelection[df_gtfSelection["Support_level"] == best_level]
+    return df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
+
+
+def find_repr_by_SupportLevel(intermediate_file: str) -> dict:
+    """Build the representative transcript dict from an intermediate file.
+
+    Combines import_gtfSelection_to_df() and
+    representative_transcripts_inDict().
+
+    Args:
+        intermediate_file (str): path to the intermediate file
+
+    Returns:
+        dict: {'Gene': ['transcriptA', 'transcriptB'], ...}
+    """
+    df_gtf = import_gtfSelection_to_df(intermediate_file)
+    return representative_transcripts_inDict(df_gtf)
+
+
+def main() -> None:
+    """Read the intermediate file path from the command line."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("intermediate_file", help="path to the intermediate file")
+    args = parser.parse_args()
+    print(find_repr_by_SupportLevel(args.intermediate_file))
+
+
+if __name__ == "__main__":
+    main()