Skip to content
Snippets Groups Projects
Commit 9e16eb71 authored by Hugo Gillet
Browse files

Update match_reprtranscript_expressionlevel.py

parent 5ef1e0d2
Branches
No related tags found
No related merge requests found
### Made by Hugo Gillet ###
import pandas as pd import pandas as pd
import json import json
import re import re
...@@ -5,7 +6,7 @@ import representative as repr ...@@ -5,7 +6,7 @@ import representative as repr
import os import os
def dict_reprTrans_to_df(dict_reprTrans: dict[str,str])-> pd.DataFrame: def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame:
"""Convert a dictionary of genes and their representative transcript into a dataframe """Convert a dictionary of genes and their representative transcript into a dataframe
...@@ -22,17 +23,21 @@ def dict_reprTrans_to_df(dict_reprTrans: dict[str,str])-> pd.DataFrame: ...@@ -22,17 +23,21 @@ def dict_reprTrans_to_df(dict_reprTrans: dict[str,str])-> pd.DataFrame:
""" """
pass pass
if not type(dict_reprTrans) is dict : if not type(dict_reprTrans) is dict:
raise TypeError("Only dict are allowed") raise TypeError("Only dict are allowed")
if type(list(dict_reprTrans.keys())[0]) is not str : if type(list(dict_reprTrans.keys())[0]) is not str:
raise TypeError("Key should be strings") raise TypeError("Key should be strings")
if type(list(dict_reprTrans.values())[0]) is not str : if type(list(dict_reprTrans.values())[0]) is not str:
raise TypeError("Values should be strings") raise TypeError("Values should be strings")
df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"]) df_reprTrans = pd.DataFrame.from_dict(
dict_reprTrans, orient="index", columns=["reprTranscript"]
)
df_reprTrans = df_reprTrans.reset_index(level=0) df_reprTrans = df_reprTrans.reset_index(level=0)
df_reprTrans.columns = ["Gene", 'reprTrans'] df_reprTrans.columns = ["Gene", "reprTrans"]
df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(r'\.[1-9]', '', regex=True) df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
r"\.[1-9]", "", regex=True
)
return df_reprTrans return df_reprTrans
...@@ -51,13 +56,12 @@ def txt_to_dict(dict_txt: str) -> dict: ...@@ -51,13 +56,12 @@ def txt_to_dict(dict_txt: str) -> dict:
""" """
pass pass
input : str = open(dict_txt, "r").read() input: str = open(dict_txt, "r").read()
input : str = input.replace("\'", "\"") input: str = input.replace("'", '"')
dict = json.loads(input) dict = json.loads(input)
return dict return dict
def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame: def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
"""Convert multiindex dataframe from function into a simple dataframe """Convert multiindex dataframe from function into a simple dataframe
...@@ -77,12 +81,14 @@ def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame: ...@@ -77,12 +81,14 @@ def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
pass pass
df_gene = df_gtfSelection.set_index(["Gene"]) df_gene = df_gtfSelection.set_index(["Gene"])
df_gene = df_gene.drop(columns=["Support_level"]) df_gene = df_gene.drop(columns=["Support_level"])
df_gene['Transcript']=df_gene['Transcript'].str.replace(r"\.[0-9]","", regex=True) df_gene["Transcript"] = df_gene["Transcript"].str.replace(
r"\.[0-9]", "", regex=True
)
df_gene = df_gene.reset_index(level=0) df_gene = df_gene.reset_index(level=0)
return df_gene return df_gene
def tsv_or_csv_to_df(input_txt:str) -> pd.DataFrame : def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
"""Convert tsv or csv file into a pandas dataframe """Convert tsv or csv file into a pandas dataframe
Args: Args:
...@@ -96,13 +102,19 @@ def tsv_or_csv_to_df(input_txt:str) -> pd.DataFrame : ...@@ -96,13 +102,19 @@ def tsv_or_csv_to_df(input_txt:str) -> pd.DataFrame :
None None
""" """
pass pass
df_input =pd.read_csv(input_txt, sep=r"[\t,]", lineterminator='\n', df_input = pd.read_csv(
names=["Transcript", "Expression_level"], input_txt,
engine = "python") sep=r"[\t,]",
lineterminator="\n",
names=["Transcript", "Expression_level"],
engine="python",
)
return df_input return df_input
def exprLevel_byGene(df_exprTrasncript:pd.DataFrame, df_output_gtf_selection:pd.DataFrame) -> pd.DataFrame : def exprLevel_byGene(
df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame
) -> pd.DataFrame:
"""Find the gene of each transcript given by the expression level csv/tsv file, """Find the gene of each transcript given by the expression level csv/tsv file,
and sum the expression level of all transcripts from the same gene. and sum the expression level of all transcripts from the same gene.
...@@ -118,12 +130,19 @@ def exprLevel_byGene(df_exprTrasncript:pd.DataFrame, df_output_gtf_selection:pd. ...@@ -118,12 +130,19 @@ def exprLevel_byGene(df_exprTrasncript:pd.DataFrame, df_output_gtf_selection:pd.
Raises: Raises:
None None
""" """
pass pass
df_merged = pd.merge(df_output_gtf_selection, df_exprTrasncript , how="inner", on="Transcript") df_merged = pd.merge(
df_sum = df_merged.groupby("Gene").sum("Expression_level") # sum transcripts comming from the same gene df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript"
)
df_sum = df_merged.groupby("Gene").sum(
"Expression_level"
) # sum transcripts comming from the same gene
return df_sum return df_sum
def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene:pd.DataFrame) -> pd.DataFrame:
def match_byGene(
df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame
) -> pd.DataFrame:
"""Find matching genes between the 2 args """Find matching genes between the 2 args
Args: Args:
...@@ -141,13 +160,16 @@ def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene:pd.D ...@@ -141,13 +160,16 @@ def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene:pd.D
Raises: Raises:
None None
""" """
pass pass
df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene , how="outer", on="Gene") df_merged = pd.merge(
df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene"
)
df_clean = df_merged.dropna(axis=0) df_clean = df_merged.dropna(axis=0)
df_clean = df_clean.loc[:, ["reprTrans","Expression_level"]] df_clean = df_clean.loc[:, ["reprTrans", "Expression_level"]]
return df_clean return df_clean
def output_tsv(dataframe: pd.DataFrame)-> pd.DataFrame :
def output_tsv(dataframe: pd.DataFrame) -> pd.DataFrame:
"""Convert pandas dataframe into a tsv file """Convert pandas dataframe into a tsv file
Args: Args:
...@@ -161,15 +183,23 @@ def output_tsv(dataframe: pd.DataFrame)-> pd.DataFrame : ...@@ -161,15 +183,23 @@ def output_tsv(dataframe: pd.DataFrame)-> pd.DataFrame :
Raises: Raises:
None None
""" """
pass pass
csv_file = dataframe.to_csv(os.getcwd()+"\ReprTrans_ExpressionLevel.tsv", sep="\t", csv_file = dataframe.to_csv(
index=False, header=False) os.getcwd() + "\ReprTrans_ExpressionLevel.tsv",
sep="\t",
index=False,
header=False,
)
return csv_file return csv_file
### functions to run this part of the program ### functions to run this part of the program
def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str):
def match_reprTranscript_expressionLevel(
exprTrans: str, dict_reprTrans: dict, intermediate_file: str
):
"""Combine functions to replace transcripts from an expression level csv/tsv file """Combine functions to replace transcripts from an expression level csv/tsv file
with representative transcripts with representative transcripts
...@@ -199,14 +229,5 @@ def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, int ...@@ -199,14 +229,5 @@ def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, int
# run the program # run the program
#dict_txt = a #input a dict of {gene:reprTrans} in the form of a txt file if __name__ == "__main__":
#input_intermediate_file = b #input the intermediate file generated by transckript extractor
#input_expr = c #input a csv or tsv file containing the expr level
#dict_reprTrans = txt_to_dict(dict_txt)
#match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file)
#print("this is the function :\n\n {}".format(match_final))
if __name__ == "__main__":
match_reprTranscript_expressionLevel() match_reprTranscript_expressionLevel()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment