# Skip to content
# Snippets Groups Projects
# match_reprtranscript_expressionlevel.py 7.26 KiB
import pandas as pd
import json
import re
import representative as repr
import os


def dict_reprTrans_to_df(dict_reprTrans: dict[str,str])-> pd.DataFrame:
    """Convert a dictionary of genes and their representative transcript into a dataframe.

        Args:
            dict_reprTrans (dict) : {'GeneA': 'transcriptA', 'GeneB': 'transcriptB', ...}
            Every key and every value must be a string. An empty dict is
            accepted and yields an empty dataframe.

        Returns:
            Pandas dataframe with the columns "Gene" and "reprTrans", one row
            per dictionary entry, in dictionary order. A trailing version
            suffix (a dot followed by digits, e.g. ".12") is removed from each
            transcript identifier.

        Raises:
            TypeError: if dict_reprTrans is not a dict ("Only dict are allowed"),
            if any key is not a string ("Key should be strings"),
            or if any value is not a string ("Values should be strings").
    """
    if not isinstance(dict_reprTrans, dict):
        raise TypeError("Only dict are allowed")
    # Validate every entry, not just the first one, so a bad entry further
    # down cannot slip into the dataframe.
    if not all(isinstance(gene, str) for gene in dict_reprTrans):
        raise TypeError("Key should be strings")
    if not all(isinstance(transcript, str) for transcript in dict_reprTrans.values()):
        raise TypeError("Values should be strings")

    # Strip the whole version suffix and only at the end of the identifier:
    # a single-digit, unanchored pattern would turn "X.10" into "X0".
    df_reprTrans = pd.DataFrame(
        {
            "Gene": list(dict_reprTrans.keys()),
            "reprTrans": [
                re.sub(r"\.\d+$", "", transcript)
                for transcript in dict_reprTrans.values()
            ],
        }
    )
    return df_reprTrans


def txt_to_dict(dict_txt: str) -> dict:
    """Read a text file holding a dictionary literal and return it as a dict.

        Args:
            dict_txt (str) : path to a txt file containing a dict,
            e.g. {'GeneA': 'transcriptA', 'GeneB': 'transcriptB'}

        Returns:
            The object decoded from the file content by json.loads
            (a dict for input shaped like the example above).

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the content is not valid JSON once
            single quotes have been replaced by double quotes.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(dict_txt, "r") as handle:
        raw_text = handle.read()

    # JSON only accepts double-quoted strings, so a Python-style literal
    # written with single quotes is converted before decoding.
    # NOTE(review): this also rewrites apostrophes inside values.
    json_text = raw_text.replace("'", "\"")
    return json.loads(json_text)



def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
    """Reduce the gtf selection dataframe to gene / transcript pairs.

        Args:
            df_gtfSelection : Pandas dataframe with at least the columns
            "Gene", "Transcript" and "Support_level".
            Presumably produced by import_gtfSelection_to_df()
            from the representative.py script (verify against that module).

        Returns:
            Pandas dataframe with "Gene" as first column, followed by the
            remaining columns except "Support_level". A trailing version
            suffix (a dot followed by digits, e.g. ".12") is removed from
            every value of "Transcript". The input dataframe is left unchanged.

        Raises:
            KeyError: if one of the required columns is missing.
    """
    df_gene = df_gtfSelection.drop(columns=["Support_level"]).set_index("Gene")
    # Anchor the pattern and accept several digits: a single-digit,
    # unanchored pattern would turn "X.12" into "X2".
    df_gene = df_gene.assign(
        Transcript=df_gene["Transcript"].str.replace(r"\.\d+$", "", regex=True)
    )
    # Moving "Gene" through the index puts it back as the first column.
    return df_gene.reset_index(level=0)


def tsv_or_csv_to_df(input_txt:str) -> pd.DataFrame :
    """Load a tab- or comma-separated expression file into a dataframe.

        Args:
            input_txt (str): csv or tsv file containing transcripts
            and their expression level

        Returns:
            Pandas dataframe with the columns "Transcript" and
            "Expression_level". The column names are supplied here, so the
            first line of the file is read as data, not as a header.

        Raises:
            None
    """
    column_names = ["Transcript", "Expression_level"]
    # A regular-expression separator lets one call handle both tsv and csv;
    # it requires the python parsing engine.
    return pd.read_csv(
        input_txt,
        sep=r"[\t,]",
        lineterminator='\n',
        names=column_names,
        engine="python",
    )


def exprLevel_byGene(df_exprTrasncript:pd.DataFrame, df_output_gtf_selection:pd.DataFrame) -> pd.DataFrame :
    """Find the gene of each transcript given by the expression level csv/tsv file,
       and sum the expression level of all transcripts from the same gene.

        Args:
            df_exprTrasncript : pandas Dataframe containing transcripts and their
            expression level (columns "Transcript" and "Expression_level"),
            generated by "tsv_or_csv_to_df" function
            df_output_gtf_selection : pandas Dataframe containing genes and transcripts
            (columns "Gene" and "Transcript"),
            generated by "transcripts_by_gene_inDf" function

        Returns:
            Pandas dataframe indexed by "Gene" with the single column
            "Expression_level" holding the summed expression level.
            Transcripts present in only one of the two inputs are ignored.

        Raises:
            KeyError: if a required column is missing.
    """
    df_merged = pd.merge(df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript")
    # Select the column explicitly: the first positional argument of sum() is
    # `numeric_only`, not a column name, so passing the name there would sum
    # every numeric column of the merged frame.
    df_sum = df_merged.groupby("Gene")[["Expression_level"]].sum(numeric_only=True)
    return df_sum

def match_byGene(df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene:pd.DataFrame) -> pd.DataFrame: 
    """Pair each representative transcript with the expression level of its gene.

        Args:
            df_reprTranscript : pandas Dataframe containing genes
            and their representative transcript, generated by
            "dict_reprTrans_to_df()"
            df_expressionLevel_byGene : pandas Dataframe containing
            genes and their expression level, generated by
            "exprLevel_byGene()"

        Returns:
            Pandas dataframe with the columns "reprTrans" and
            "Expression_level"; rows containing a missing value after
            the merge are dropped.

        Raises:
            None
    """
    # Outer merge on the gene, then discard every row with a missing value.
    combined = df_reprTranscript.merge(
        df_expressionLevel_byGene, how="outer", on="Gene"
    )
    complete_rows = combined.dropna(axis=0)
    wanted_columns = ["reprTrans", "Expression_level"]
    return complete_rows.loc[:, wanted_columns]

def output_tsv(dataframe: pd.DataFrame) -> None:
    """Write a pandas dataframe to a tsv file in the current working directory.

        Args:
            dataframe : Pandas dataframe containing
            representative transcripts and their expression level

        Returns:
            None. The file "ReprTrans_ExpressionLevel.tsv" is written to the
            current working directory, tab separated, without index
            and without header line.

        Raises:
            OSError: if the file cannot be written.
    """
    # os.path.join picks the separator of the running platform; a hard-coded
    # backslash only works on Windows and is an invalid escape sequence.
    output_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
    dataframe.to_csv(output_path, sep="\t", index=False, header=False)
    return None

### Function to run this part of the program

def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str): 
    """Run the whole pipeline: replace the transcripts of an expression level
       csv/tsv file with the representative transcript of their gene.

        Args:
            exprTrans (str): csv or tsv file containing transcripts
            and their expression level
            dict_reprTrans (dict) : dict of genes and their
            representative transcript
            intermediate_file (str) : file passed to
            import_gtfSelection_to_df() of the representative module;
            presumably the intermediate txt file with genes and
            transcripts written by the transcript extractor (confirm there)

        Returns:
            The return value of output_tsv(), which writes the tsv file of
            representative transcripts and their expression level

        Raises:
            None
    """
    # Gene / transcript pairs taken from the intermediate file.
    gene_transcripts = transcripts_by_gene_inDf(
        repr.import_gtfSelection_to_df(intermediate_file)
    )
    # Expression level per transcript, and representative transcript per gene.
    transcript_expression = tsv_or_csv_to_df(exprTrans)
    repr_by_gene = dict_reprTrans_to_df(dict_reprTrans)
    # Sum the expression per gene, attach it to the representative
    # transcript of that gene, then write the result.
    gene_expression = exprLevel_byGene(transcript_expression, gene_transcripts)
    matched = match_byGene(repr_by_gene, gene_expression)
    return output_tsv(matched)


# Run the program

#dict_txt = a #input a dict of {gene:reprTrans} in the form of a txt file
#input_intermediate_file = b #input the intermediate file generated by transckript extractor
#input_expr = c #input a csv or tsv file containing the expr level 

#dict_reprTrans = txt_to_dict(dict_txt)
#match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file)
#print("this is the function :\n\n {}".format(match_final))

if __name__ == "__main__":
    # The entry point needs three inputs; calling it without arguments
    # always raises TypeError, so they are read from the command line.
    import argparse

    parser = argparse.ArgumentParser(
        description="Replace transcripts of an expression level file "
                    "with the representative transcript of their gene."
    )
    parser.add_argument(
        "dict_txt",
        help="txt file containing a dict of {gene: representative transcript}",
    )
    parser.add_argument(
        "intermediate_file",
        help="intermediate file generated by the transcript extractor",
    )
    parser.add_argument(
        "expression_file",
        help="csv or tsv file containing transcripts and their expression level",
    )
    cli_args = parser.parse_args()

    match_reprTranscript_expressionLevel(
        cli_args.expression_file,
        txt_to_dict(cli_args.dict_txt),
        cli_args.intermediate_file,
    )