Skip to content
Snippets Groups Projects
Commit 5ef1e0d2 authored by Hugo Gillet's avatar Hugo Gillet
Browse files

Update representative.py

parent e4ffc910
No related branches found
No related tags found
No related merge requests found
### Made by Hugo Gillet ###
import os
import sys

import pandas as pd
"""
This part of the code takes a modified GTF file as input
and returns a dictionary of the transcripts with the best
support level for each gene of the input.
"""
def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """Load a modified GTF selection file into a cleaned DataFrame.

    The file is read as header-less, tab-separated text whose five
    columns are named "Gene_mixed", "Transcript", "Support_level",
    "Na1" and "Na2". The gene name is taken from "Gene_mixed" (first
    uppercase letter followed by any word characters) and forward-filled
    onto the following rows that carry no gene name. Rows with a missing
    gene, transcript or support level are dropped.

    Args:
        gtf_modified_file (str) : Path to the tab-separated input file

    Returns:
        pd.DataFrame : Columns "Gene", "Transcript" and "Support_level"

    Raises:
        TypeError : Only str path is allowed
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")
    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )
    # Series.replace matches whole cell values: only cells that are exactly
    # one space are turned into empty strings here.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
    # Rows without a gene name inherit the last gene name seen above them.
    df_input["Gene"] = (
        df_input["Gene_mixed"].str.extract(r"([A-Z]\w{0,})", expand=False).ffill()
    )
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]]
    return df_clean.dropna(axis=0)
def representative_transcripts_inDict(
    df_gtfSelection: pd.DataFrame,
) -> dict[str, list[str]]:
    """Return a dict containing for each gene the transcripts
    with the highest confidence level.

    For every gene, only the rows whose "Support_level" equals the
    smallest "Support_level" of that gene are kept; ties are all kept.

    Args:
        df_gtfSelection (pd.DataFrame) : Must contain the columns
            "Gene", "Transcript" and "Support_level"

    Returns:
        dict : Gene name -> list of its best-supported transcripts

    Raises:
        TypeError : Only pandas DataFrame is allowed
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")
    # The string "min" selects pandas' own group-wise minimum; passing the
    # builtin min as a callable is deprecated.
    best_level_per_row = df_gtfSelection.groupby("Gene")["Support_level"].transform(
        "min"
    )
    df_min = df_gtfSelection[df_gtfSelection["Support_level"] == best_level_per_row]
    dict_representative_transcripts = (
        df_min.groupby("Gene")["Transcript"].apply(list).to_dict()
    )
    return dict_representative_transcripts
def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str, list[str]]:
    """Combine functions import_gtfSelection_to_df()
    and representative_transcripts_inDict().

    Args:
        intermediate_file (str) : Path passed unchanged to
            import_gtfSelection_to_df()

    Returns:
        dict : The result of representative_transcripts_inDict(), i.e.
            gene name -> list of transcripts

    Raises:
        TypeError : Propagated from import_gtfSelection_to_df() when
            intermediate_file is not a str
    """
    df_gtf = import_gtfSelection_to_df(intermediate_file)
    return representative_transcripts_inDict(df_gtf)
if __name__ == "__main__":
    # find_repr_by_SupportLevel() requires the input path; calling it
    # without an argument raises TypeError, so take the path from the
    # command line instead.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <intermediate_file>")
    print(find_repr_by_SupportLevel(sys.argv[1]))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment