Skip to content
Snippets Groups Projects
Commit 5ef1e0d2 authored by Hugo Gillet's avatar Hugo Gillet
Browse files

Update representative.py

parent e4ffc910
No related merge requests found
### Made by Hugo Gillet ###
import pandas as pd
import os
import os
"""
This part of the code takes a modified GTF file as input
and returns a dictionary of the transcripts with the best
support level for each gene of the input.
"""
def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """Import an intermediate (modified GTF) file into a cleaned DataFrame.

    The file is read as header-less, tab-separated text whose columns are
    named ``Gene_mixed``, ``Transcript``, ``Support_level``, ``Na1`` and
    ``Na2``. A gene name is extracted from ``Gene_mixed`` (an uppercase
    letter followed by word characters); rows where no gene name is found
    inherit the closest gene name above them (forward fill).

    Args:
        gtf_modified_file: Path to the tab-separated input file.

    Returns:
        A DataFrame with the columns ``Gene``, ``Transcript`` and
        ``Support_level``, without any row that still has a missing value.

    Raises:
        TypeError: Only str path is allowed
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")

    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )

    # NOTE(review): Series.replace swaps whole cell values, so only cells that
    # are exactly " " become "" here; it does not strip spaces inside longer
    # strings. Confirm whether .str.replace / .str.strip was intended.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")

    # Raw string: "\w" in a normal string literal is an invalid escape.
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
        r"([A-Z]\w{0,})", expand=True
    )

    # Work on an explicit copy so the column assignment below never targets
    # a view of df_input.
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name belong to the last gene seen above them.
    df_clean["Gene"] = df_clean["Gene"].ffill()
    # Drops every row that still has a missing value in any kept column.
    df_clean = df_clean.dropna(axis=0)
    return df_clean
def representative_transcripts_inDict(
    df_gtfSelection: pd.DataFrame,
) -> dict[str, list[str]]:
    """Return a dict containing, for each gene, the transcripts
    with the highest confidence level.

    The highest confidence level of a gene is the smallest value found in
    its ``Support_level`` column; every transcript of the gene that reaches
    this value is kept.

    Args:
        df_gtfSelection: DataFrame with the columns ``Gene``,
            ``Transcript`` and ``Support_level``.

    Returns:
        A dictionary mapping each gene to the list of its transcripts whose
        support level equals the minimum support level of that gene.

    Raises:
        TypeError : Only pandas DataFrame is allowed
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")

    # Per-gene minimum, aligned row by row with the input frame, so each gene
    # is compared with its own best level rather than with a global minimum.
    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
    df_best = df_gtfSelection[df_gtfSelection["Support_level"] == best_level]

    dict_representative_transcripts = (
        df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
    )
    return dict_representative_transcripts
def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str, list[str]]:
    """Combine functions import_gtfSelection_to_df()
    and representative_transcripts_inDict().

    Args:
        intermediate_file: Path handed unchanged to
            import_gtfSelection_to_df().

    Returns:
        The dictionary built by representative_transcripts_inDict() from the
        imported DataFrame (gene mapped to a list of transcripts).
    """
    df_gtf = import_gtfSelection_to_df(intermediate_file)
    dict_reprTrans = representative_transcripts_inDict(df_gtf)
    return dict_reprTrans
if __name__ == "__main__":
    import sys

    # find_repr_by_SupportLevel() has one required argument (the path of the
    # intermediate file); calling it with none raises TypeError, so the path
    # is taken from the command line.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <intermediate_file>")
    print(find_repr_by_SupportLevel(sys.argv[1]))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment