# match_reprtranscript_expressionlevel.py
# Authored by Hugo Gillet
import pandas as pd
import json
import re
import representative as repr
import os
def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame:
    """Convert a dictionary of genes and their representative transcript into a dataframe

    Args:
        dict_reprTrans (dict) : {'GeneA': 'transcriptA.1', 'GeneB': 'transcriptB.2', ...}
            one representative transcript ID (a string) per gene.

    Returns:
        Pandas dataframe having "Gene" and "reprTrans" as columns. Any
        ".<digits>" version suffix is removed from the transcript IDs.
        An empty dict gives an empty dataframe with the same two columns.

    Raises:
        TypeError: the argument is not a dict ("Only dict are allowed"),
            a key is not a string ("Key should be strings") or
            a value is not a string ("Values should be strings").
    """
    if not isinstance(dict_reprTrans, dict):
        raise TypeError("Only dict are allowed")
    # Validate every entry, not just the first one, so a bad item further
    # down the dict cannot slip through into the dataframe.
    if not all(isinstance(gene, str) for gene in dict_reprTrans):
        raise TypeError("Key should be strings")
    if not all(isinstance(trans, str) for trans in dict_reprTrans.values()):
        raise TypeError("Values should be strings")

    if not dict_reprTrans:
        return pd.DataFrame(columns=["Gene", "reprTrans"])

    df_reprTrans = pd.DataFrame(
        {
            "Gene": list(dict_reprTrans.keys()),
            "reprTrans": list(dict_reprTrans.values()),
        }
    )
    # Strip the whole version suffix: "[0-9]+" also covers multi-digit
    # versions such as ".10", which a single-digit pattern would mangle.
    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
        r"\.[0-9]+", "", regex=True
    )
    return df_reprTrans
def txt_to_dict(dict_txt: str) -> dict:
    """Convert a txt file into a dictionary

    Args:
        dict_txt (str) : path to a txt file holding a dict literal,
            e.g. {'GeneA': 'transcriptA', 'GeneB': 'transcriptB', ...}

    Returns:
        dict (dict) : the dictionary parsed from the file content

    Raises:
        OSError: the file cannot be opened
        json.JSONDecodeError: the content is not valid JSON once single
            quotes have been replaced by double quotes
    """
    # Context manager guarantees the file handle is closed after reading.
    with open(dict_txt, "r") as handle:
        raw_text = handle.read()
    # The file uses Python-style single quotes; JSON requires double quotes.
    json_text = raw_text.replace("'", '"')
    return json.loads(json_text)
def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
    """Reduce the gtf selection dataframe to a simple gene/transcript dataframe

    Args:
        df_gtfSelection : Pandas dataframe with at least the columns
            "Gene", "Transcript" and "Support_level".
            Presumably produced by import_gtfSelection_to_df() from the
            representative.py script (not visible here, to be confirmed).

    Returns:
        df_gene : Pandas dataframe with "Gene" as first column, followed by
            "Transcript" (version suffix removed) and any other remaining
            column; "Support_level" is dropped. The input is not modified.

    Raises:
        KeyError: one of the expected columns is missing
    """
    df_gene = df_gtfSelection.set_index(["Gene"]).drop(columns=["Support_level"])
    # "[0-9]+" removes the complete version suffix, including multi-digit
    # versions such as ".12" that a single-digit pattern would corrupt.
    df_gene["Transcript"] = df_gene["Transcript"].str.replace(
        r"\.[0-9]+", "", regex=True
    )
    # Bring "Gene" back as a regular (first) column.
    return df_gene.reset_index(level=0)
def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
    """Convert tsv or csv file into a pandas dataframe

    Args:
        input_txt (str): path to a header-less csv or tsv file containing
            transcripts and their expression level

    Returns:
        Pandas dataframe having "Transcript" and "Expression_level"
        as columns

    Raises:
        None raised explicitly; errors from pandas.read_csv propagate
    """
    # A regex separator (tab OR comma) requires the python engine.
    # `lineterminator` is deliberately not passed: pandas documents it as
    # valid only with the C parser, so it had no effect here.
    return pd.read_csv(
        input_txt,
        sep=r"[\t,]",
        names=["Transcript", "Expression_level"],
        engine="python",
    )
def exprLevel_byGene(df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame) -> pd.DataFrame:
    """Find the gene of each transcript given by the expression level csv/tsv file,
    and sum the expression level of all transcripts from the same gene.

    Args:
        df_exprTrasncript : pandas Dataframe containing "Transcript" and
            "Expression_level" columns, generated by "tsv_or_csv_to_df" function
        df_output_gtf_selection : pandas Dataframe containing "Gene" and
            "Transcript" columns, generated by "transcripts_by_gene_inDf" function

    Returns:
        Pandas dataframe indexed by "Gene" with a single "Expression_level"
        column holding the summed expression level of its transcripts.
        Transcripts present in only one of the two inputs are ignored.

    Raises:
        KeyError: a required column is missing
    """
    df_merged = pd.merge(
        df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript"
    )
    # Select the column explicitly: the first positional argument of
    # GroupBy.sum() is `numeric_only`, not a column name.
    return df_merged.groupby("Gene")[["Expression_level"]].sum()
def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame) -> pd.DataFrame:
    """Pair each representative transcript with the expression level of its gene

    Args:
        df_reprTranscript : pandas Dataframe with "Gene" and "reprTrans",
            as generated by "dict_reprTrans_to_df()"
        df_expressionLevel_byGene : pandas Dataframe with "Gene" and
            "Expression_level"

    Returns:
        Pandas dataframe with the columns "reprTrans" and
        "Expression_level"; rows containing a missing value after the
        merge are discarded

    Raises:
        None
    """
    wanted_columns = ["reprTrans", "Expression_level"]
    # Outer join on the gene, then discard every row with a missing value.
    joined = pd.merge(
        left=df_reprTranscript,
        right=df_expressionLevel_byGene,
        on="Gene",
        how="outer",
    )
    complete_rows = joined.dropna(axis=0)
    return complete_rows.loc[:, wanted_columns]
def output_tsv(dataframe: pd.DataFrame) -> None:
    """Write a pandas dataframe to a tsv file in the current working directory

    Args:
        dataframe : Pandas dataframe containing representative
            transcripts and their expression level

    Returns:
        None (the value returned by DataFrame.to_csv when a path is
        given). The file "ReprTrans_ExpressionLevel.tsv" is written
        tab-separated, without index and without header row.

    Raises:
        OSError: the file cannot be written
    """
    # os.path.join picks the right separator for the platform; a literal
    # backslash is only a path separator on Windows.
    output_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
    return dataframe.to_csv(output_path, sep="\t", index=False, header=False)
### functions to run this part of the program
def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str):
    """Replace the transcripts of an expression level csv/tsv file
    with the representative transcript of their gene

    Args:
        exprTrans (str): csv or tsv file containing transcripts
            and their expression level
        dict_reprTrans (dict) : dict of genes and their
            representative transcript
        intermediate_file (str) : file passed to
            repr.import_gtfSelection_to_df() to obtain the
            gene/transcript table

    Returns:
        Whatever output_tsv() returns after writing the tsv file of
        representative transcripts and their expression level

    Raises:
        None raised here; exceptions of the called functions propagate
    """
    # Gene -> transcripts table built from the intermediate file.
    gene_transcript_table = transcripts_by_gene_inDf(
        repr.import_gtfSelection_to_df(intermediate_file)
    )
    transcript_expression = tsv_or_csv_to_df(exprTrans)
    representative_table = dict_reprTrans_to_df(dict_reprTrans)
    # Sum the expression per gene, then attach it to the representative transcript.
    gene_expression = exprLevel_byGene(transcript_expression, gene_transcript_table)
    matched = match_byGene(representative_table, gene_expression)
    return output_tsv(matched)
# run the program
#dict_txt = a #input a dict of {gene:reprTrans} in the form of a txt file
#input_intermediate_file = b #input the intermediate file generated by transkript_extractor
#input_expr = c #input a csv or tsv file containing the expr level
#dict_reprTrans = txt_to_dict(dict_txt)
#match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file)
#print("this is the function :\n\n {}".format(match_final))
if __name__ == "__main__":
    # The entry point needs three inputs; read them from the command line
    # instead of calling it without arguments (which raises TypeError).
    import argparse

    parser = argparse.ArgumentParser(
        description="Replace transcripts of an expression level file "
        "with the representative transcript of their gene."
    )
    parser.add_argument(
        "dict_txt",
        help="txt file containing a dict of {gene: representative transcript}",
    )
    parser.add_argument(
        "intermediate_file",
        help="intermediate file containing genes and their transcripts",
    )
    parser.add_argument(
        "expr_file",
        help="csv or tsv file containing transcripts and their expression level",
    )
    cli_args = parser.parse_args()

    match_reprTranscript_expressionLevel(
        cli_args.expr_file,
        txt_to_dict(cli_args.dict_txt),
        cli_args.intermediate_file,
    )