diff --git a/scripts/transcript_sampler.ipynb b/scripts/transcript_sampler.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..175ba6d1bb075c96b0b243d76a73cc4540c9165a --- /dev/null +++ b/scripts/transcript_sampler.ipynb @@ -0,0 +1,574 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "import argparse\n", + "import pandas as pd\n", + "import numpy as np\n", + "from gtfparse import read_gtf" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "find representative transcripts" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "def attributs_converter(attributs):\n", + " \"\"\"\n", + " This funtion converts the \"unstrucktured\" ;-seperated part of he line into a list of identifyers and coresponding data the struckture of\n", + " which can be used ot find the data easyly e.g the index of the identifier transcrip_id + 1 will give the trasncript id of the current gene\n", + " Input: \n", + " attributs = str() #the unstrucktured part of the entry\n", + " Output:\n", + " attributs = list() # cleand list with the characterritsics discribed above\n", + " \"\"\"\n", + " attributs = attributs.replace(\"\\\"\",\"\")\n", + " attributs = attributs.replace(\";\",\"\")\n", + " attributs = attributs.replace(\"\\\\n\",\"\")\n", + " attributs =attributs.split(\" \")\n", + " \n", + " return(attributs)\n", + "\n", + "def find_in_attributs (attributs,look_for):\n", + " \"\"\"\n", + " This function finds a key word and used that to lokat the value of that key word e.g key = gene_id, value = 'ENSMUSG00002074970',\n", + " this works as they are next to each other in the attributs list. \n", + " Inputs:\n", + " sub_enty = list() \n", + " look_fore = str() #string of with the name of the key to look for\n", + " Output: \n", + " attributs[index] or NA = str() #NA is returned if the key was not found in the attributs\n", + " \"\"\"\n", + " try:\n", + " index = attributs.index(look_for)+1\n", + " return attributs[index]\n", + " except: \n", + " #print(\"No\",look_for,\"in the entry the return was set to NA\\n\",attributs)\n", + " return \"NA\"\n", + "\n", + "def _re_format(rep_trans_dict):\n", + " \"\"\"\n", + " This function is ment to reformat dictionary of the representatice transcripts into an dictionary with only one entry per key\n", + " Input:\n", + " rep_trans_dict = {gene_id : [transcript_id , transcript_support_level , transcript_length]}\n", + " Output: \n", + " rep_transcripts = {gene_id : transcript_id}\n", + " \"\"\"\n", + " rep_transcripts = dict()\n", + " for gene_id in rep_trans_dict: \n", + " rep_transcripts[gene_id] = rep_trans_dict[gene_id][0]\n", + " \n", + " return rep_transcripts\n", + " \n", + " \n", + "\n", + "def get_rep_trans(file_name = \"test\"):\n", + " \"\"\" \n", + " This is the main function of this script it selects one representative transcrip per gene based on a gtf annotation file. \n", + " It does so be two criteria: first the transcript support level and it there are several transcript \n", + " of one gene that have the same trasncript_support_level it chooses the one that corresponds to the longest mRNA.\n", + " Input: \n", + " file_name = str() # name of the annotation file with or without the .gtf part\n", + " Output: \n", + " rep_transcripts = {gene_id : transcript_id}\n", + " \"\"\"\n", + " \n", + " #setting defoult variables\n", + " rep_trans = dict()\n", + " cur_gID = str()\n", + " cur_best_trans = [str(),100,0] # [transcript_id , transcript_support_level , transcript_length]\n", + " pot_best_trans = False\n", + " cur_tID = str()\n", + " ignor_trans = False\n", + " \n", + " with open (file_name,\"r\") as f: \n", + " for line in f: \n", + " entry = line.split(\"\\t\")\n", + " \n", + " #removes expected but unneeded entrys\n", + " exp_unneed = [\"CDS\",\"stop_codon\",\"five_prime_utr\",\"three_prime_utr\",\"start_codon\",'Selenocysteine']\n", + " if len(entry) == 1 or entry[2] in exp_unneed:\n", + " continue\n", + " \n", + " #this function turns the less organized part of the entry into a reable list\n", + " attributs = attributs_converter(entry[8])\n", + " #looking for and processing exons entrys\n", + " if entry[2] == \"exon\": \n", + " \n", + " #dicide if to contiune or not\n", + " if ignor_trans: \n", + " continue\n", + " elif cur_gID != attributs[1]:\n", + " raise ValueError(\"ERROR exon from an unexpected Gen\")\n", + " continue\n", + " elif find_in_attributs (attributs,\"transcript_id\") != cur_tID:\n", + " raise ValueError(\"exon from an unexpected transcript\")\n", + " continue\n", + " \n", + " #adding the length of the exon to the appropriat list and chacking for changes in best transcript\n", + " if pot_best_trans: \n", + " pot_best_trans[2]+= int(entry[4])-int(entry[3])\n", + " if pot_best_trans[2] > cur_best_trans[2]: \n", + " cur_best_trans = pot_best_trans\n", + " pot_best_trans = False\n", + " else:\n", + " cur_best_trans[2]+= int(entry[4])-int(entry[3])\n", + "\n", + " \n", + " \n", + " #looking for and processing transcript entrys\n", + " elif entry[2] == \"transcript\":\n", + " \n", + " #varryfi that the gen is correct\n", + " if cur_gID != attributs[1]:\n", + " raise ValueError(\"ERROR transcript from an unexpected Gen\")\n", + " continue\n", + " \n", + " #finding the transcript id and the support level\n", + " cur_tID = find_in_attributs (attributs,\"transcript_id\") \n", + " t_supp_lvl = find_in_attributs (attributs,\"transcript_support_level\") \n", + " \n", + " #If there is no transcript support level or the level is given as NA it is nomed as 100. else the transcript support level is tunrn into int\n", + " if t_supp_lvl == \"NA\": \n", + " t_supp_lvl = 100\n", + " else:\n", + " try:\n", + " t_supp_lvl = int(t_supp_lvl)\n", + " except: \n", + " t_supp_lvl = 100\n", + " \n", + " \n", + " #decides if the transcript has potential to become the representative transcript\n", + " if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == \"\":\n", + " cur_best_trans = [cur_tID,t_supp_lvl,0]\n", + " pot_best_trans = False\n", + " ignor_trans = False\n", + " \n", + " elif t_supp_lvl == cur_best_trans[1]:\n", + " pot_best_trans = [cur_tID,t_supp_lvl,0] \n", + " else:\n", + " ignor_trans = True\n", + " \n", + " \n", + " #looking for and processing gene entrys\n", + " elif entry[2] == \"gene\":\n", + " \n", + " #updating rep_trans dict\n", + " if cur_gID not in rep_trans: \n", + " rep_trans[cur_gID] = cur_best_trans\n", + " else: \n", + " if rep_trans[cur_gID][1] > cur_best_trans[1]: \n", + " rep_trans[cur_gID] = cur_best_trans\n", + " elif rep_trans[cur_gID][1] == cur_best_trans[1] and rep_trans[cur_gID][2] < cur_best_trans[2]: \n", + " rep_trans[cur_gID] = cur_best_trans\n", + " \n", + " #updating cur_gID and resetting cur_best_trans\n", + " cur_gID = attributs[1]\n", + " cur_best_trans = [str(),100,0]\n", + " \n", + " #raises an error for unidentifyable entrys\n", + " else: \n", + " raise ValueError(\"This entry could not be identified\\n\",entry)\n", + " \n", + " #addding the final gene to the dictionary\n", + " if cur_gID not in rep_trans: \n", + " rep_trans[cur_gID] = cur_best_trans\n", + " else: \n", + " if rep_trans[cur_gID][1] > cur_best_trans[1]: \n", + " rep_trans[cur_gID] = cur_best_trans\n", + " elif rep_trans[cur_gID][1] == cur_best_trans[1] and rep_trans[cur_gID][2] < cur_best_trans[2]: \n", + " rep_trans[cur_gID] = cur_best_trans \n", + " \n", + " del rep_trans[\"\"]\n", + " rep_transcripts = _re_format(rep_trans)\n", + " return(rep_transcripts )\n", + "\n", + "\n", + "def _test(): \n", + " \"\"\"\n", + " This funtion is ment to be run for test\n", + " Output: \n", + " file with the dictionary generated based on the test file \n", + " \"\"\"\n", + " file_name = \"test.gtf\"\n", + " rt = get_rep_trans(file_name)\n", + " expected_result = {\"ENSG00000160072\":\"ENST00000472194\",\"ENSG00000234396\":\"ENST00000442483\",\n", + " \"ENSG00000225972\":\"ENST00000416931\",\"ENSG00000224315\":\"ENST00000428803\",\n", + " \"ENSG00000198744\":\"ENST00000416718\",\"ENSG00000279928\":\"ENST00000624431\",\n", + " \"ENSG00000228037\":\"ENST00000424215\",'ENSG00000142611':'ENST00000378391'}\n", + " if rt != expected_result: \n", + " print(\"The test fail due to not yieding the same results\")\n", + " print(\"The results the program got\\n\",rt)\n", + " print(\"The expected results\\n\",expected_result)\n", + " else: \n", + " print(\"The test was succses full\")" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "def gtf_file_writer (original_file, rep_transcript_dict, output_file): \n", + " \"\"\"\n", + " this function writes the output GTF file\n", + " \"\"\"\n", + " output = [] \n", + "\n", + " with open(original_file, 'r') as f:\n", + " for line in f: \n", + " entry = line.split(\"\\t\")\n", + " if line[0] != '#':\n", + " attributes = attributs_converter(entry[8])\n", + " type_ = entry[2]\n", + " else:\n", + " continue\n", + " if type_ == 'gene':\n", + " gene_id = find_in_attributs(attributes, 'gene_id')\n", + " output.append(line)\n", + " else:\n", + " transcript_id = find_in_attributs(attributes, 'transcript_id')\n", + " if rep_transcript_dict[gene_id] == transcript_id:\n", + " output.append(line)\n", + "\n", + " with open(output_file, 'w') as last_file:\n", + " for item in output : \n", + " last_file.write(item)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Match representative transcript and expression level file " + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "def gtf_to_df(gtf_file:str)-> pd.DataFrame: \n", + " \"\"\"\n", + " This function take a .gtf file and convert it into a \n", + " dataframe containing gene_id and their transcripts_id.\n", + " Args:\n", + " gtf_file (str) : path to the .gtf file\n", + "\n", + " Returns:\n", + " df_gtf (pd.DataFrame) : pandas dataframe containing columns\n", + " gene_id and their transcripts_id.\n", + " Raises : \n", + " None \n", + " \n", + " \"\"\"\n", + " df_gtf = read_gtf(gtf_file)\n", + " df_gtf = df_gtf.loc[df_gtf[\"feature\"]==\"transcript\"]\n", + " df_gtf = df_gtf[[\"gene_id\",\"transcript_id\"]]\n", + " df_gtf = df_gtf.rename(columns={\"gene_id\":\"Gene\",\"transcript_id\":\"Transcript\"})\n", + " return df_gtf" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame:\n", + "\n", + " \"\"\"Convert a dictionary of genes and their representative transcript into a dataframe \n", + "\n", + " Args:\n", + " dict_reprTrans (dict) : {'Gene':['transcriptA', 'transcriptB'], ...}\n", + "\n", + " Returns:\n", + " Pandas dataframe having Gene and transcript as columns\n", + " \n", + " Raises:\n", + " Only dict are allowed\n", + " Key should be strings\n", + " Value should be strings\n", + " \n", + " \"\"\"\n", + " pass\n", + " if not type(dict_reprTrans) is dict:\n", + " raise TypeError(\"Only dict are allowed\")\n", + " if type(list(dict_reprTrans.keys())[0]) is not str:\n", + " raise TypeError(\"Key should be strings\")\n", + " if type(list(dict_reprTrans.values())[0]) is not str:\n", + " raise TypeError(\"Values should be strings\")\n", + "\n", + " df_reprTrans = pd.DataFrame.from_dict(\n", + " dict_reprTrans, orient=\"index\", columns=[\"reprTranscript\"]\n", + " )\n", + " df_reprTrans = df_reprTrans.reset_index(level=0)\n", + " df_reprTrans.columns = [\"Gene\", \"reprTrans\"]\n", + " df_reprTrans[\"reprTrans\"] = df_reprTrans[\"reprTrans\"].str.replace(\n", + " r\"\\.[1-9]\", \"\", regex=True\n", + " )\n", + " return df_reprTrans\n", + "\n", + "\n", + "\n", + "def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:\n", + " \"\"\"Convert tsv or csv file into a pandas dataframe\n", + "\n", + " Args:\n", + " input_txt (str): csv or tsv file containing transcript expression level\n", + "\n", + " Returns:\n", + " df_gene (str): Pandas dataframe having transcript and expression level\n", + " as columns \n", + " \n", + " Raises:\n", + " None \n", + " \"\"\"\n", + " pass\n", + " df_input = pd.read_csv(\n", + " input_txt,\n", + " sep=r\"[\\t,]\",\n", + " lineterminator=\"\\n\",\n", + " names=[\"Transcript\", \"Expression_level\"],\n", + " engine=\"python\",\n", + " )\n", + " return df_input\n", + "\n", + "\n", + "def exprLevel_byGene(\n", + " df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame\n", + ") -> pd.DataFrame:\n", + " \"\"\"find the gene of each transcipt given by the expression level csv/tsv file,\n", + " and summ expression level of all transcipts from the same gene. \n", + "\n", + " Args:\n", + " df_exprTranscript : pandas Dataframe containing transcript and their expression level,\n", + " generated by \"tsv_or_csv_to_df\" function\n", + " df_output_gtf_selection : pandas Dataframe containing genes and transcripts,\n", + " generated by \"transcripts_by_gene_inDf\" function \n", + "\n", + " Returns:\n", + " Pandas dataframe having gene and sum of its transcript expression level\n", + " \n", + " Raises:\n", + " None \n", + " \"\"\"\n", + " pass\n", + " df_merged = pd.merge(\n", + " df_output_gtf_selection, df_exprTrasncript, how=\"inner\", on=\"Transcript\"\n", + " )\n", + " df_sum = df_merged.groupby(\"Gene\").sum(\n", + " \"Expression_level\"\n", + " ) \n", + " return df_sum\n", + "\n", + "\n", + "def match_byGene(\n", + " df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame\n", + ") -> pd.DataFrame:\n", + " \"\"\"Find matching genes bewteen the 2 args \n", + "\n", + " Args:\n", + " df_reprTranscript : pandas Dataframe containing genes \n", + " and their representative transcript, generated by\n", + " \"dict_reprTrans_to_df()\" \n", + " df_expressionLevel_byGene : pandas Dataframe containing \n", + " genes and their expression level generated by \n", + " \"transcript_by_gene_inDf()\"\n", + "\n", + " Returns:\n", + " Pandas dataframe having representative trasncripts \n", + " and their expression level\n", + " \n", + " Raises:\n", + " None \n", + " \"\"\"\n", + " pass\n", + " df_merged = pd.merge(\n", + " df_reprTranscript, df_expressionLevel_byGene, how=\"outer\", on=\"Gene\"\n", + " )\n", + " df_clean = df_merged.dropna(axis=0)\n", + " df_clean = df_clean.loc[:, [\"reprTrans\", \"Expression_level\"]]\n", + " return df_clean\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### functions to run this part of the programm\n", + "\n", + "\n", + "def match_reprTranscript_expressionLevel(\n", + " exprTrans: str, dict_reprTrans: dict, gtf_file: str,\n", + "):\n", + " \"\"\"Combine functions to replace transcripts from an expression level csv/tsv file \n", + " with representative transcripts \n", + "\n", + " Args:\n", + " exprTrans (str): csv or tsv file containing transcripts\n", + " and their expression level \n", + " dict_reprTrans (dict) : dict of genes and their \n", + " representative transcipt\n", + " intemediate_file (str) : txt file containing genes, transcript \n", + " and their expression level from the transkript_extractor function\n", + " output_path : path indicating were the tsv file should be written\n", + "\n", + " Returns:\n", + " tsv file of representative trasncripts and their expression level\n", + " \n", + " Raises:\n", + " None \n", + " \"\"\"\n", + " df_gene_transcript = gtf_to_df(gtf_file)\n", + " df_exprTrans = tsv_or_csv_to_df(exprTrans)\n", + " df_reprTrans = dict_reprTrans_to_df(dict_reprTrans)\n", + " df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_gene_transcript)\n", + " df_match = match_byGene(df_reprTrans, df_exprLevel_byGene)\n", + " df_match.rename(columns = {'reprTrans':'id', 'Expression_level':'level'}, inplace = True)\n", + " return df_match\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "poisson sampling" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "def transcript_sampling(total_transcript_number, df_repr, output_csv):\n", + " df = df_repr \n", + " levels = []\n", + " sums = df['level'].tolist()\n", + " total = sum(sums)\n", + " total_transcript_number=int(total_transcript_number)\n", + " normalized = total_transcript_number/total\n", + " for expression_level in df['level']:\n", + " poisson_sampled = np.random.poisson(expression_level*normalized)\n", + " levels.append(poisson_sampled)\n", + "\n", + " transcript_numbers = pd.DataFrame({'id': df['id'],'count': levels})\n", + " pd.DataFrame.to_csv(transcript_numbers, output_csv)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "execution" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "input_gtf = r\"C:\\Users\\frr665\\Python_M1_VSC\\transcript_sampler_15.01.23\\transcript-sampler\\input_files\\test.gtf\"\n", + "input_csv = r\"C:\\Users\\frr665\\Python_M1_VSC\\transcript_sampler_15.01.23\\transcript-sampler\\input_files\\expression.csv\"\n", + "number_to_sample = 100\n", + "output_gtf = r\"C:\\Users\\frr665\\Python_M1_VSC\\transcript_sampler_15.01.23\\transcript-sampler\\images\\output_gtf.gtf\"\n", + "output_csv = r\"C:\\Users\\frr665\\Python_M1_VSC\\transcript_sampler_15.01.23\\transcript-sampler\\images\\output_csv.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "def exe(input_gtf, input_csv, transcript_nr, output_gtf, output_csv, input_free = True):\n", + " dict_repr_trans = get_rep_trans(input_gtf)\n", + " df_repr = match_reprTranscript_expressionLevel(dict_reprTrans=dict_repr_trans, exprTrans=input_csv, gtf_file=input_gtf)\n", + " print(df_repr)\n", + " print(\"Finiding match between representative transcripts and expression level file\") \n", + " print(\"Poisson sampling of transcripts\")\n", + " transcript_sampling(transcript_nr, df_repr, output_csv)\n", + " print(\"output csv file ready\")\n", + " print(\"writing output gtf file\")\n", + " gtf_file_writer(input_gtf, dict_repr_trans, output_gtf)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level']\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id level\n", + "0 ENST00000472194 1.980250\n", + "1 ENST00000442483 0.838144\n", + "7 ENST00000378391 0.914558\n", + "Finiding match between representative transcripts and expression level file\n", + "Poisson sampling of transcripts\n", + "output csv file ready\n", + "writing output gtf file\n" + ] + } + ], + "source": [ + "exe(input_gtf=input_gtf, input_csv=input_csv, transcript_nr=number_to_sample, output_gtf=output_gtf, output_csv=output_csv)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1e2fb702d9886c77776a2bcd731cac0877c51b973772daa49239be006cec7b9a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}