Skip to content
Snippets Groups Projects
Commit 5ef1e0d2 authored by Hugo Gillet's avatar Hugo Gillet
Browse files

Update representative.py

parent e4ffc910
No related branches found
No related tags found
No related merge requests found
### Made by Hugo Gillet ###
import pandas as pd
import os
import os
'''
"""
This part of the code takes a modified GTF file as input
and returns a dictionary of the transcripts with the best
support level for each gene in the input
'''
"""
def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """Load a modified GTF selection file into a cleaned DataFrame.

    The file is read as tab-separated text without a header row; its five
    columns are labelled ``Gene_mixed``, ``Transcript``, ``Support_level``,
    ``Na1`` and ``Na2``. A gene name (an uppercase ASCII letter followed by
    word characters) is extracted from ``Gene_mixed``, forward-filled onto
    the rows that follow it, and every row that still has a missing value
    in ``Gene``, ``Transcript`` or ``Support_level`` is dropped.

    Args:
        gtf_modified_file: Path to the tab-separated input file.

    Returns:
        A DataFrame with the columns ``Gene``, ``Transcript`` and
        ``Support_level``.

    Raises:
        TypeError: Only str path is allowed.
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")

    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )

    # NOTE(review): Series.replace without regex only swaps cells whose whole
    # value is exactly " "; it does not strip spaces inside longer strings.
    # Kept as-is to preserve behaviour -- confirm the intent against the data.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")

    # Rows whose first field has no uppercase-led token yield NaN here and
    # inherit the gene name from the closest preceding row via ffill below.
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
        r"([A-Z]\w*)", expand=False
    )

    # Work on an explicit copy so the assignment below never targets a view.
    df_clean = df_input[["Gene", "Transcript", "Support_level"]].copy()
    df_clean["Gene"] = df_clean["Gene"].ffill()
    return df_clean.dropna(axis=0)
def representative_transcripts_inDict(
    df_gtfSelection: pd.DataFrame,
) -> dict[str, list[str]]:
    """Return, for each gene, the transcripts with the best support level.

    The best support level is the minimum ``Support_level`` found within
    each gene's own rows, so every gene present in the input contributes at
    least one transcript. Transcripts that tie on that minimum are all kept,
    in input row order.

    Args:
        df_gtfSelection: DataFrame with the columns ``Gene``,
            ``Transcript`` and ``Support_level``.

    Returns:
        A dict mapping each gene to the list of its transcripts whose
        support level equals that gene's minimum.

    Raises:
        TypeError: Only pandas DataFrame is allowed
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")

    # Lower value means better support (per the original comment:
    # highest support level = 1, worst = 5, NA = 100).
    best_per_gene = df_gtfSelection.groupby("Gene")["Support_level"].transform(
        "min"
    )
    df_best = df_gtfSelection[df_gtfSelection["Support_level"] == best_per_gene]
    return df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str, list[str]]:
    """Find the representative transcripts of every gene in a file.

    Combines import_gtfSelection_to_df() and
    representative_transcripts_inDict(): the file is loaded into a
    DataFrame, which is then reduced to a per-gene dict of transcripts.

    Args:
        intermediate_file: Path to the modified GTF file, passed unchanged
            to import_gtfSelection_to_df().

    Returns:
        The dict produced by representative_transcripts_inDict(), mapping
        each gene to a list of transcripts.
    """
    df_gtf = import_gtfSelection_to_df(intermediate_file)
    return representative_transcripts_inDict(df_gtf)
if __name__ == "__main__":
    # find_repr_by_SupportLevel() requires the input path; calling it with
    # no argument always raised TypeError, so take the path from the
    # command line instead.
    import sys

    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <gtf_modified_file>")
    print(find_repr_by_SupportLevel(sys.argv[1]))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment