### Made by Hugo Gillet ###
"""Select representative transcripts by support level.

This module takes a modified GTF file as input and returns a dictionary
holding, for each gene of the input, the transcripts with the best support
level.
"""
import os
import sys

import pandas as pd


def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """Load a modified GTF selection file into a clean DataFrame.

    The file is read as tab-separated text without a header row. The gene
    name is extracted from the first column, propagated downwards to the
    rows that follow it, and every row still missing a gene, a transcript
    or a support level is dropped.

    Args:
        gtf_modified_file: Path to the tab-separated input file.

    Returns:
        A DataFrame with the columns "Gene", "Transcript" and
        "Support_level", one row per retained transcript.

    Raises:
        TypeError: Only str path is allowed.
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")

    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )
    # Series.replace matches whole cell values, so this only blanks cells
    # that are exactly one space.
    # NOTE(review): if the intent was to strip spaces inside values, this
    # should be .str.replace - confirm against the upstream file format.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")

    # A gene name is the first run of word characters starting with an
    # uppercase letter; cells without one yield NaN.
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
        r"([A-Z]\w*)", expand=False
    )

    # Work on an explicit copy so the column assignment below never writes
    # through to (or warns about) the frame returned by read_csv.
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name inherit the closest gene name above them.
    df_clean["Gene"] = df_clean["Gene"].ffill()
    return df_clean.dropna(axis=0)


def representative_transcripts_inDict(
    df_gtfSelection: pd.DataFrame,
) -> dict[str, list[str]]:
    """Return, for each gene, the transcripts with the best support level.

    The best support level of a gene is the smallest "Support_level" value
    found among its rows; every transcript tied at that value is kept.

    Args:
        df_gtfSelection: DataFrame with the columns "Gene", "Transcript"
            and "Support_level".

    Returns:
        A dict mapping each gene to the list of its best-supported
        transcripts, in their order of appearance in the input.

    Raises:
        TypeError: Only pandas DataFrame is allowed.
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")

    # Per-row copy of the minimum support level of the row's own gene, so
    # the comparison below is done gene by gene and not file-wide.
    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform(
        "min"
    )
    df_best = df_gtfSelection.loc[df_gtfSelection["Support_level"] == best_level]
    return df_best.groupby("Gene")["Transcript"].apply(list).to_dict()


def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str, list[str]]:
    """Combine import_gtfSelection_to_df() and representative_transcripts_inDict().

    Args:
        intermediate_file: Path to the modified GTF selection file.

    Returns:
        A dict mapping each gene to the list of its best-supported
        transcripts.

    Raises:
        TypeError: Only str path is allowed.
    """
    df_gtf = import_gtfSelection_to_df(intermediate_file)
    return representative_transcripts_inDict(df_gtf)


if __name__ == "__main__":
    # The entry point needs the input path; calling it without an argument
    # can only fail, so take the path from the command line.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <gtf_modified_file>")
    print(find_repr_by_SupportLevel(sys.argv[1]))