Skip to content
Snippets Groups Projects
Commit 441f364b authored by Laura Urbanska's avatar Laura Urbanska
Browse files

added script and updated representative transcript script

parent 1d5a1253
No related branches found
No related tags found
No related merge requests found
import pandas as pd
import json
import re
import rerpresentative_v4 as repr
import os
def dict_reprTrans_to_df(dict_reprTrans: dict) -> pd.DataFrame:
    """Convert a dictionary of genes and their representative transcript into a dataframe

    Args:
        dict_reprTrans (dict) : mapping of gene name to its representative
            transcript, e.g. {'GeneA': ['ENST0001.2'], ...}. The data is
            loaded into a single column, so each value must describe
            exactly one transcript.

    Returns:
        Pandas dataframe having "Gene" and "reprTrans" as columns; the
        version suffix (a dot followed by digits) is removed from every
        transcript ID.

    Raises:
        Nothing explicitly; errors raised by pandas for malformed input
        propagate unchanged.
    """
    df_reprTrans = pd.DataFrame.from_dict(
        dict_reprTrans, orient="index", columns=["reprTranscript"]
    )
    # The gene names are the index after from_dict(); turn them into a column.
    df_reprTrans = df_reprTrans.reset_index(level=0)
    df_reprTrans.columns = ["Gene", "reprTrans"]
    # Strip the complete version suffix. Matching a single digit only would
    # leave a stray digit behind for versions >= 10 (".12" -> "2").
    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
        r"\.\d+", "", regex=True
    )
    return df_reprTrans
def txt_to_dict(dict_txt: str) -> dict:
    """Convert a txt file into a dictionary

    Args:
        dict_txt (str) : path to a txt file holding a dict
            structured as {'Gene':['transcriptA', 'transcriptB'], ...}

    Returns:
        dict : dictionary structured as {'Gene':['transcriptA', 'transcriptB'], ...}

    Raises:
        OSError : if the file cannot be opened
        json.JSONDecodeError : if the content is not valid JSON once the
            single quotes have been replaced by double quotes
    """
    # A context manager guarantees the file handle is closed again.
    with open(dict_txt, "r") as handle:
        raw_text = handle.read()
    # The file uses Python-style single quotes; JSON only accepts double quotes.
    json_text = raw_text.replace("'", '"')
    return json.loads(json_text)
def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
    """Reduce the gtf selection dataframe to genes and version-less transcripts

    Args:
        df_gtfSelection (pd.DataFrame): Pandas dataframe having "Gene",
            "Transcript" and "Support_level" as columns, as produced by
            import_gtfSelection_to_df()

    Returns:
        df_gene (pd.DataFrame): Pandas dataframe having "Gene" and
            "Transcript" as columns; the version suffix (a dot followed
            by digits) is removed from every transcript ID

    Raises:
        KeyError : if one of the expected columns is missing
    """
    df_gene = df_gtfSelection.set_index(["Gene"])
    df_gene = df_gene.drop(columns=["Support_level"])
    # Strip the complete version suffix. Matching a single digit only would
    # leave a stray digit behind for versions >= 10 (".12" -> "2").
    df_gene["Transcript"] = df_gene["Transcript"].str.replace(
        r"\.\d+", "", regex=True
    )
    df_gene = df_gene.reset_index(level=0)
    return df_gene
def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
    """Convert tsv or csv file into a pandas dataframe

    Args:
        input_txt (str): csv or tsv file containing transcript expression level

    Returns:
        df_input (pd.DataFrame): Pandas dataframe having "Transcript" and
            "Expression_level" as columns. The file is read without a header
            row: the column names are supplied here.

    Raises:
        Nothing explicitly; errors raised by pandas propagate unchanged.
    """
    # A regular-expression separator (tab OR comma) requires the python
    # parsing engine. `lineterminator` is a C-parser-only option, so it is
    # not passed here.
    df_input = pd.read_csv(
        input_txt,
        sep=r"[\t,]",
        names=["Transcript", "Expression_level"],
        engine="python",
    )
    return df_input
def exprLevel_byGene(df_exprTrasncript: pd.DataFrame,
                     df_output_gtf_selection: pd.DataFrame) -> pd.DataFrame:
    """Sum the expression level of all transcripts belonging to the same gene

    Args:
        df_exprTrasncript (pd.DataFrame): pandas Dataframe containing the
            columns "Transcript" and "Expression_level"
        df_output_gtf_selection (pd.DataFrame) : pandas Dataframe containing
            the columns "Gene" and "Transcript"

    Returns:
        Pandas dataframe indexed by "Gene" with a single column
        "Expression_level" holding the summed expression level of the
        gene's transcripts. Transcripts present in only one of the two
        inputs are ignored (inner merge).

    Raises:
        KeyError : if one of the expected columns is missing
    """
    df_merged = pd.merge(df_output_gtf_selection, df_exprTrasncript,
                         how="inner", on="Transcript")
    # Select the column explicitly: the first positional argument of
    # GroupBy.sum() is `numeric_only`, not a column name.
    df_sum = df_merged.groupby("Gene")[["Expression_level"]].sum()
    return df_sum
def match_byGene(df_reprTranscript: pd.DataFrame,
                 df_expressionLevel_byGene: pd.DataFrame) -> pd.DataFrame:
    """Find matching genes between the 2 args

    Args:
        df_reprTranscript (pd.DataFrame): pandas Dataframe containing genes
            ("Gene") and their representative transcript ("reprTrans")
        df_expressionLevel_byGene (pd.DataFrame) : pandas Dataframe containing
            genes ("Gene", as a column or index level) and their
            expression level ("Expression_level")

    Returns:
        Pandas dataframe having the columns "reprTrans" and
        "Expression_level"; only rows without any missing value are kept.

    Raises:
        KeyError : if one of the expected columns is missing
    """
    df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene,
                         how="outer", on="Gene")
    # Genes found in only one input carry NaN in the other input's columns
    # after the outer merge, so dropna() removes them.
    df_clean = df_merged.dropna(axis=0)
    return df_clean.loc[:, ["reprTrans", "Expression_level"]]
def output_tsv(dataframe: pd.DataFrame) -> None:
    """Convert pandas dataframe into a tsv file

    Args:
        dataframe (pd.DataFrame): Pandas dataframe containing
            representative transcripts and their expression level

    Returns:
        None. The file "ReprTrans_ExpressionLevel.tsv" is written to the
        current working directory, tab separated, without index and
        without header row.

    Raises:
        OSError : if the file cannot be written
    """
    # os.path.join() picks the right separator for the platform; a
    # hard-coded backslash only yields a valid path on Windows.
    output_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
    # to_csv() returns None when it is given a path to write to.
    csv_file = dataframe.to_csv(output_path, sep="\t",
                                index=False, header=False)
    return csv_file
### function that runs this part of the program
def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str):
    """Replace the transcripts of an expression level file with representative transcripts

    Chains the helper functions of this module: the expression level of
    every transcript is summed per gene and then attached to the
    representative transcript of that gene.

    Args:
        exprTrans (str): csv or tsv file containing transcripts
            and their expression level
        dict_reprTrans (dict) : dict of genes and their
            representative transcript
        intermediate_file (str) : txt file containing genes, transcripts
            and their support level, read with
            repr.import_gtfSelection_to_df()

    Returns:
        The value returned by output_tsv(), which writes the tsv file of
        representative transcripts and their expression level

    Raises:
        None
    """
    # Gene -> transcript table derived from the intermediate file.
    gene_transcripts = transcripts_by_gene_inDf(
        repr.import_gtfSelection_to_df(intermediate_file)
    )
    # Transcript -> expression level table from the user's csv/tsv file.
    transcript_expression = tsv_or_csv_to_df(exprTrans)
    # Gene -> representative transcript table.
    representative = dict_reprTrans_to_df(dict_reprTrans)
    # Sum the expression per gene, then attach it to the representative transcript.
    expression_per_gene = exprLevel_byGene(transcript_expression, gene_transcripts)
    matched = match_byGene(representative, expression_per_gene)
    return output_tsv(matched)
# run the program
def main(argv=None):
    """Command-line entry point of this module

    Reads the three input paths from the command line instead of relying
    on undefined module-level names, so that importing this module has no
    side effects.

    Args:
        argv (list) : command-line arguments without the program name;
            None lets argparse read them from sys.argv

    Returns:
        The value returned by match_reprTranscript_expressionLevel()

    Raises:
        SystemExit : raised by argparse when the arguments are invalid
    """
    # Imported locally so the file-level import block does not need to change.
    import argparse

    parser = argparse.ArgumentParser(
        description="Replace transcripts of an expression level file "
                    "with representative transcripts"
    )
    parser.add_argument(
        "dict_txt",
        help="txt file holding a dict of {gene: reprTrans}")
    parser.add_argument(
        "input_intermediate_file",
        help="intermediate file generated by the transcript extractor")
    parser.add_argument(
        "input_expr",
        help="csv or tsv file containing the expression level")
    args = parser.parse_args(argv)

    dict_reprTrans = txt_to_dict(args.dict_txt)
    match_final = match_reprTranscript_expressionLevel(
        args.input_expr, dict_reprTrans, args.input_intermediate_file)
    print("this is the function :\n\n {}".format(match_final))
    return match_final


if __name__ == "__main__":
    main()
\ No newline at end of file
import pandas as pd
import re
import itertools
'''
This code takes a gtf file as input and returns a dictionary of the transcripts with the best support level for each gene of the input
'''
##import modified gtf file and create a df##
def import_gtfSelection_to_df(gtf_modified_file):
    """Import the modified gtf (intermediate) file and create a dataframe

    Args:
        gtf_modified_file : path to the tab separated intermediate file

    Returns:
        Pandas dataframe having "Gene", "Transcript" and "Support_level"
        as columns; rows with a missing value in any of them are dropped
    """
    # create a df from the tab separated file input
    df_input = pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n',
                           names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"])
    # NOTE(review): Series.replace() only swaps cells that are exactly " ";
    # it does not strip spaces inside longer strings - confirm the intent.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
    # Create a new column with only the gene name from the Gene_mixed column:
    # an upper-case letter followed by any number of word characters.
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(r'([A-Z]\w{0,})', expand=False)
    # Create a new df with the relevant columns and without NA
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name belong to the last gene seen above them.
    df_clean["Gene"] = df_clean["Gene"].ffill()
    df_clean = df_clean.dropna(axis=0)
    return df_clean
## Build a dict {gene: [transcripts]} from the dataframe created by import_gtfSelection_to_df ##
def representative_transcripts_inDict(df_gtfSelection):
    """Return, for each gene, the transcripts with the best support level

    Args:
        df_gtfSelection : Pandas dataframe having "Gene", "Transcript"
            and "Support_level" as columns

    Returns:
        Dict {'Gene':['transcriptA', 'transcriptB'], ...} holding, for
        every gene, only the transcripts whose support level equals the
        lowest (= best) support level of that gene
    """
    # create a df indexed on both Gene and Transcript columns
    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
    # lowest support level of each gene, repeated on every row (best is = 1)
    df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min")
    # NOTE(review): despite its wording, this message shows the best support
    # level per row, not the transcripts themselves.
    print("\n=== This is your 10 first representative transcripts : === \n \n {}".format(df_min.head(10)))
    # keep only the transcripts that reach the best support level of their
    # gene; without this filter every transcript would be returned
    df_best = df_multIndex[df_multIndex["Support_level"] == df_min]
    # create a df without transcript levels
    df_final = df_best.reset_index(level="Transcript")
    df_final = df_final.drop(columns=["Support_level"])
    # create a dict with only Gene and representative transcripts
    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
    return dict_representative_transcripts
### add your inputs here ! ###
# NOTE(review): these statements appear to sit at module level, so they would
# run (and read the file named below) whenever this module is imported - confirm.
gtf_file = "Homo_sapiens.GRCh38.107_intermediat_file.txt" # path of the intermediate file derived from the gtf; edit to use another input
# Dataframe with the Gene, Transcript and Support_level columns of that file.
df_gtf = import_gtfSelection_to_df(gtf_file)
# Dict {gene: [transcripts]} built from the dataframe above.
dictionary_of_representative_transcripts = representative_transcripts_inDict(df_gtf)
import pandas as pd
'''
This part of the code takes a modified gtf file as input
and returns a dictionary of the transcripts with the best
support level for each gene of the input
'''
def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """Import intermediate file from gtf and create a df

    Args:
        gtf_modified_file (str) : path to the tab separated intermediate file

    Returns:
        Pandas dataframe having "Gene", "Transcript" and "Support_level"
        as columns; rows with a missing value in any of them are dropped

    Raises:
        TypeError : Only str path is allowed
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")
    df_input = pd.read_csv(gtf_modified_file, sep='\t', lineterminator='\n',
                           names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"])
    # NOTE(review): Series.replace() only swaps cells that are exactly " ";
    # it does not strip spaces inside longer strings - confirm the intent.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
    # Gene name = an upper-case letter followed by any number of word characters.
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(r'([A-Z]\w{0,})', expand=False)
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name belong to the last gene seen above them.
    df_clean["Gene"] = df_clean["Gene"].ffill()
    df_clean = df_clean.dropna(axis=0)
    return df_clean
def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> dict:
    """Return a dict containing for each gene transcripts
    with highest confidence level

    Args:
        df_gtfSelection (pd.DataFrame): Pandas dataframe having "Gene",
            "Transcript" and "Support_level" as columns

    Returns:
        Dict {'Gene':['transcriptA', 'transcriptB'], ...} holding, for
        every gene, only the transcripts whose support level equals the
        lowest (= best) support level of that gene

    Raises:
        TypeError : Only pandas DataFrame is allowed
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")
    df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"])
    #highest support level = 1 , worst = 5, NA = 100
    df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min")
    # Keep only the transcripts that reach the best support level of their
    # gene; without this filter every transcript would be returned.
    df_best = df_multIndex[df_multIndex["Support_level"] == df_min]
    df_final = df_best.reset_index(level="Transcript")
    df_final = df_final.drop(columns=["Support_level"])
    dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict()
    return dict_representative_transcripts
def find_repr_by_SupportLevel(intermediate_file:str):
    """Build the gene -> transcripts dict straight from an intermediate file

    Feeds the dataframe produced by import_gtfSelection_to_df()
    into representative_transcripts_inDict().

    Args:
        intermediate_file : path to the intermediate file

    Returns:
        Dict {'Gene':['transcriptA', 'transcriptB'], ...}

    Raises:
        None
    """
    return representative_transcripts_inDict(
        import_gtfSelection_to_df(intermediate_file)
    )
if __name__ == "__main__":
    # find_repr_by_SupportLevel() needs the path of the intermediate file;
    # take it from the command line instead of calling it without arguments.
    # Imported locally so the file-level import block does not need to change.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: python {} <intermediate_file>".format(sys.argv[0]))
    print(find_repr_by_SupportLevel(sys.argv[1]))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment