diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 2e906cc871537e0816ae7fd4437ade59afa2986c..0000000000000000000000000000000000000000 --- a/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# ignore ALL .log files -*.log - -# ignore ALL files in ANY directory named temp -temp/ diff --git a/Inputs_files/mini_file.txt b/Inputs_files/mini_file.txt deleted file mode 100644 index ba6539a78d4752c68a2408395dd471a0f42f501e..0000000000000000000000000000000000000000 --- a/Inputs_files/mini_file.txt +++ /dev/null @@ -1,19 +0,0 @@ ->ATAD3B -1 ENST00000673477 NA 6 -2 ENST00000472194 1 43 -3 ENST00000378736 5 58 -4 ENST00000485748 2 63 -5 ENST00000474481 2 74 -6 ENST00000308647 1 80 -7 ENST00000442483 3 113 ->PRDM16 -1 ENST00000511072 5 138 -2 ENST00000607632 2 175 -3 ENST00000378391 1 178 -4 ENST00000514189 5 217 -5 ENST00000270722 1 254 -6 ENST00000512462 1 293 -7 ENST00000463591 5 310 -8 ENST00000509860 5 319 -9 ENST00000378389 5 348 -10 ENST00000606170 4 354 \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index edb874900e3f120cb97930894f9a0c54cbb340b1..0000000000000000000000000000000000000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2022 zavolan_group / tools - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 64a092fbd9ad6482a7c3cac4b6b0570aa7743a20..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Transcript Sampler - -This workflow takes as input: - - genome annotation gtf file - - expression levels of each gene - - csv file with transcript IDs and expression levels - - The output is a trancript sample gtf file and csv file containing transcript IDs and counts. - - diff --git a/images/.gitkeep b/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/images/screenshot_git_tutorial_1_hGillet.png b/images/screenshot_git_tutorial_1_hGillet.png deleted file mode 100644 index 68151e8b0b837c03b7fcac317aa6a989333244f5..0000000000000000000000000000000000000000 Binary files a/images/screenshot_git_tutorial_1_hGillet.png and /dev/null differ diff --git a/images/screenshot_git_tutorial_2_hGillet.png b/images/screenshot_git_tutorial_2_hGillet.png deleted file mode 100644 index ec1d38848ce1a364475038fb0a74b49e4f6cce07..0000000000000000000000000000000000000000 Binary files a/images/screenshot_git_tutorial_2_hGillet.png and /dev/null differ diff --git a/images/screenshot_markdown_tutorial_hGillet.png b/images/screenshot_markdown_tutorial_hGillet.png deleted file mode 100644 index a3ea90d1c9fa47190015f028d4b7fb09a0e0031b..0000000000000000000000000000000000000000 Binary files a/images/screenshot_markdown_tutorial_hGillet.png and 
/dev/null differ diff --git a/scripts/Excecution_file.py b/scripts/Excecution_file.py deleted file mode 100644 index 788525eb4438f485ed1e2a554be3f61ccc102170..0000000000000000000000000000000000000000 --- a/scripts/Excecution_file.py +++ /dev/null @@ -1,20 +0,0 @@ -### Imports ### -import os - -import transkript_extractor as te -import Exon_length_filter as elf -import representative_v4 as rtcl - -### Scipt ### -def exe(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = os.getcwd(),Input_free = True): - file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transkript(file_name,source_pathway_name,deposit_pathway_name,Input_free = Input_free) - inter_mediate_file_directory = os.path.join(deposit_pathway_name,file_name+"_intermediate_file.txt") - print("Transcripts are filterd based on transcipt score please wait...") - pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory) - print("Transcripts filtered\n") - elf.exon_length_filter(file_name,source_pathway_name,deposit_pathway_name,gen_dict= pre_filter_representative_transcripts_dict,Input_free = Input_free) - return(file_name,source_pathway_name,deposit_pathway_name) -### from consol #### -##D:\\Uni\\Sem 9\\Programing in the Life sciences\\Projekt\\Intermediat Files -if __name__ == "__main__": - exe() \ No newline at end of file diff --git a/scripts/Exon_length_filter.py b/scripts/Exon_length_filter.py deleted file mode 100644 index 162ff1f302b4ef144d9c34bde47b5b69c4ba72c0..0000000000000000000000000000000000000000 --- a/scripts/Exon_length_filter.py +++ /dev/null @@ -1,188 +0,0 @@ -#### Exon length filter ##### -"""Exon length filter -Version 1.1.0""" -### Called Packages ### -import re -import os - -import transkript_extractor as te -### Functions ### - -def exon_length_calculator(entry): - """This funtion finds the start and end cordinates of the exon and uses them to calculate its lenght""" - try: - find_exon_coordinates = 
re.compile("\t\d{1,15}\t") - #this difines the pattern of the coordinates - try_find_start_coordinates = find_exon_coordinates.search(entry) - #this line findes the start coordinares based on the pattern - start_coordinates = int(try_find_start_coordinates[0].replace("\t","")) - #this line removes the \t at the end and the start of the pattern and - #turn the string of the coordinates into intergers - final_index_start_coordinates = entry.find(try_find_start_coordinates[0])+len(try_find_start_coordinates[0])-1 - #this line determines the indes of the final digit of the start coordinates - sub_entry = entry[final_index_start_coordinates:] - #this lineused the index determin above a starting point for a new sub entry - try_find_end_coordinates = find_exon_coordinates.search(sub_entry) - end_coordinates = int(try_find_end_coordinates[0].replace("\t","")) - #these two lines find the end coordinates and turn tham int an int - exon_lenght = end_coordinates-start_coordinates - #this line claculates the transcript length - except: - print("\n\nIn the following enty only one or no valid coordinates could be found:\n",entry,"the value will be set to NA") - exon_lenght = "NA" - return(exon_lenght) - -def exon_fider(entry): - """This funtion determines if a given entry belongs to an exon - Expected inputs: - entry: str #any enty of a gtf file""" - exon_test = entry.find("\texon\t") - #This line look for the entry exon in the file - if exon_test == -1: - try_exon_test = False - else: - try_exon_test = True - #The block above evaluates the results of the search for the wort exon - return(try_exon_test) - -def __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID): - """This funtion encapsulates an opperation that has to be carried out at several point ind the exon_length_filter funktion and servers to make that funktion more modular""" - if current_exon_length > longest_transcript: - #This condition updates the most promesing 
for - #beeing the representative transcript - longest_transcript = current_exon_length - longest_transcript_ID = old_transcript_ID - current_exon_length = 0 - return(current_exon_length,longest_transcript,longest_transcript_ID) - -def _representative_transcript_csv (representative_transcript,file_name = "test",deposit_pathway_name =os.getcwd()): - with open(os.path.join(deposit_pathway_name,file_name+"_"+"representative_transcripts"+".csv"),"w") as rt: - for i in representative_transcript: - transcript = representative_transcript[i] - new_entry = str(i)+","+transcript+"\n" - rt.write(new_entry) - - - -def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}): - """This funtion selects only the transcripts for a dictionar that have the longest total mRNA""" - bar,start_time = te.bar_builder(length_multiplyer = 3) - total_genes = len(gen_dict) - gens_done = 0 - - with open(os.path.join(source_pathway_name,file_name+".gtf"), 'r') as f: - - old_gen = str() - old_transcript_ID = str() - representative_transcript = dict() - representative_trasnscript_not_found = True - longest_transcript_ID = str() - current_exon_length = 0 - longest_transcript = 0 - percentage_done = 0 - - for entry in f: - - try: - corrent_gen = te.gene_ID_finder(entry) - except: - corrent_gen = old_gen - #The block above test if there is a gen name in the entry - if corrent_gen != old_gen: - representative_trasnscript_not_found = True - - #The block above determines if the Gen name is new and set the test - #representative_trasnscript_not_found back to true which is used to - #make the program faster if there is just one transcript for a given - #gen in the dict 
- if representative_trasnscript_not_found and corrent_gen != str(): - #print(corrent_gen) - #The conditon prvents serges if a representative transcript has - #all ready been chosen - if corrent_gen != old_gen: - current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) - representative_transcript[old_gen] = longest_transcript_ID - try: - del gen_dict[old_gen] - old_gen = corrent_gen - gens_done += 1 - corrent_percentage_done = (gens_done/total_genes)*100 - if corrent_percentage_done > percentage_done+10: - bar,start_time = te.bar_builder(percentage=percentage_done+10,length_multiplyer = 3,start_time=start_time,bar =bar) - percentage_done = int(corrent_percentage_done) - - - except: - old_gen = corrent_gen - longest_transcript = 0 - #The block above adds the transcript of the last gen that - #had the longest exons into the representative transcripts dict - try: - #This try / except block test if the gen is in the input dictionary - transcript_IDs = gen_dict[corrent_gen] - if len(gen_dict[corrent_gen]) == 1: - #This conditions is a short cut for Genes that - #allready have a representative transcript - representative_transcript=gen_dict[corrent_gen[0]] - representative_trasnscript_not_found = False - continue - except: - continue - - try: - current_transcript_ID = te.transcript_ID_finder(entry) - except: - continue - #The block above searches for a trnascript ID in the current enty - - if current_transcript_ID in transcript_IDs: - #This condition test if the Transcript is one of the - #candidates for representative transcripts - if current_transcript_ID != old_transcript_ID: - #This condition if the enty still belongs to the - #previous transcript and is triggers if that is not the case - current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) 
- try: - transcript_IDs.remove(old_transcript_ID) - old_transcript_ID = current_transcript_ID - except: - old_transcript_ID = current_transcript_ID - if exon_fider(entry): - exon_length = exon_length_calculator(entry) - current_exon_length += exon_length - else: - continue - current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) - representative_transcript[old_gen] = longest_transcript_ID - del representative_transcript[str()] - te.bar_builder(100,length_multiplyer = 3,start_time=start_time,bar =bar) - return(representative_transcript) - -def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]},Input_free = False): - """This function filters a dictionary of genes and there transcripts by the length of there exons an selects the longes transcript for each gene ans saves tham in a "," seperated csv file. 
- Expected inputs: - file_name: str ; default = test #the name of the gft file you want to look at - source_pathway_name: str ; default = current work directory #path of the gtf file - deposit_pathway_name: str ; default = current work directory #path for saving the csv file - gen_dict:dict{key == gene ID:[transcript IDs that belong to that gene]} - Input_free: tuple ; default = False # this input should be set to True for automation""" - - print("Representative trascipts are filterd based on exon length please wait...") - source_pathway_name,deposit_pathway_name = te.__do_pathways_exist__(source_pathway_name,deposit_pathway_name) - if Input_free: - pre_existing_file = False - else: - search_profile = file_name+"_"+"representative_transcripts"+".csv" - pre_existing_file = te.__searche_for_preexisting_files(search_profile,deposit_pathway_name) - if pre_existing_file == False: - representative_transcript = _exon_length_filter(file_name,source_pathway_name,deposit_pathway_name,gen_dict) - _representative_transcript_csv(representative_transcript,file_name,deposit_pathway_name) - print("\nRepresentative transcripts collected") - - -if __name__ == "__main__": - help(exon_length_filter) - exon_length_filter() - - -#This line allows the file to be executed on its own also from \ No newline at end of file diff --git a/scripts/match_reprTranscript_expressionLevel.py b/scripts/match_reprTranscript_expressionLevel.py deleted file mode 100644 index 2dfca50a3c3ee1458b7358fbd1e1d801751d727c..0000000000000000000000000000000000000000 --- a/scripts/match_reprTranscript_expressionLevel.py +++ /dev/null @@ -1,200 +0,0 @@ - -import pandas as pd -import json -import re -import rerpresentative_v4 as repr -import os - - -def dict_reprTrans_to_df(dict_reprTrans: dict): - - """Convert a dictionary of genes and their representative transcript into a dataframe - - Args: - dict_reprTrans (dict) : {'Gene':['transcriptA', 'transcriptB'], ...} - - Returns: - Pandas dataframe having Gene and 
transcript as columns - - Raises: - /!\ None, I wasn't able to make a TypeError with dict - : Only dict made of key string and value string is allowed - - """ - pass - - df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"]) - df_reprTrans = df_reprTrans.reset_index(level=0) - df_reprTrans.columns = ["Gene", 'reprTrans'] - df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(r'\.[1-9]', '', regex=True) - return df_reprTrans - - -def txt_to_dict(dict_txt: str): - """Convert a txt file into a dictionary - - Args: - dict_txt (str) : pathe to a txt file of a dict - structured as {'Gene':['transcriptA', 'transcriptB'], ...} - - Returns: - dict (dict) : dictionary stuctured as {'Gene':['transcriptA', 'transcriptB'], ...} - - Raises: - None - """ - pass - - input : str = open(dict_txt, "r").read() - input : str = input.replace("\'", "\"") - dict = json.loads(input) - return dict - - - -def transcripts_by_gene_inDf(df_gtfSelection: str) -> pd.DataFrame: - """Convert multiindex dataframe from function into a simple dataframe - - Args: - df_gtfSelection (str): Pandas multiindex dataframe having Gene, - transcript as indexs and support level as columns. 
- Come from the function import_gtfSelection_to_df() - - Returns: - df_gene (str): Pandas dataframe having Gene and - transcript as columns - - Raises: - None - """ - pass - df_gene = df_gtfSelection.set_index(["Gene"]) - df_gene = df_gene.drop(columns=["Support_level"]) - df_gene['Transcript']=df_gene['Transcript'].str.replace(r"\.[0-9]","", regex=True) - df_gene = df_gene.reset_index(level=0) - return df_gene - - -def tsv_or_csv_to_df(input_txt:str) : - """Convert tsv or csv file into a pandas dataframe - - Args: - input_txt (str): csv or tsv file containing transcript expression level - - Returns: - df_gene (str): Pandas dataframe having transcript and expression level - as columns - - Raises: - None - """ - pass - df_input =pd.read_csv(input_txt, sep=r"[\t,]", lineterminator='\n', - names=["Transcript", "Expression_level"], - engine = "python") - return df_input - - -def exprLevel_byGene(df_exprTrasncript:str, df_output_gtf_selection:str) -> pd.DataFrame : - """Find matching transcripts bewteen the 2 args - - Args: - df_exprTranscript (str): pandas Dataframe containing transcript and their expression level - df_output_gtf_selection (str) : pandas Dataframe containing genes and transcripts - - Returns: - Pandas dataframe having gene and sum of its transcript expression level - - Raises: - None - """ - pass - df_merged = pd.merge(df_output_gtf_selection, df_exprTrasncript , how="inner", on="Transcript") - df_sum = df_merged.groupby("Gene").sum("Expression_level") # sum transcripts comming from the same gene - return df_sum - -def match_byGene(df_reprTranscript:str, df_expressionLevel_byGene:str) -> pd.DataFrame: - """Find matching genes bewteen the 2 args - - Args: - df_reprTranscript (str): pandas Dataframe containing genes - and their representative transcript - df_expressionLevel_byGene (str) : pandas Dataframe containing - genes and their expression level - - Returns: - Pandas dataframe having representative trasncripts - and their expression level - - 
Raises: - None - """ - pass - df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene , how="outer", on="Gene") - df_clean = df_merged.dropna(axis=0) - df_clean = df_clean.loc[:, ["reprTrans","Expression_level"]] - return df_clean - -def output_tsv(dataframe:str)-> pd.DataFrame : - """Convert pandas dataframe into a tsv file - - Args: - dataframe (str): Pandas dataframe containing - representative transcripts and their expression level - - Returns: - Tsv file containing representative transcripts - and their expression level in the same directory - - Raises: - None - """ - pass - - csv_file = dataframe.to_csv(os.getcwd()+"\ReprTrans_ExpressionLevel.tsv", sep="\t", - index=False, header=False) - return csv_file - -### functions to run this part of the programm - -def match_reprTranscript_expressionLevel(exprTrans:str, dict_reprTrans:dict, intermediate_file:str): - """Combine functions to replace transcripts from an expression level csv/tsv file - with representative transcripts - - Args: - exprTrans (str): csv or tsv file containing transcripts - and their expression level - dict_reprTrans (dict) : dict of genes and their - representative transcipt - intemediate_file (str) : txt file containing genes, transcript - and their expression level from the transkript_extractor function - - Returns: - tsv file of representative trasncripts and their expression level - - Raises: - None - """ - df_intermediate = repr.import_gtfSelection_to_df(intermediate_file) - df_geneTrans = transcripts_by_gene_inDf(df_intermediate) - df_exprTrans = tsv_or_csv_to_df(exprTrans) - df_reprTrans = dict_reprTrans_to_df(dict_reprTrans) - df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_geneTrans) - df_match = match_byGene(df_reprTrans, df_exprLevel_byGene) - output = output_tsv(df_match) - return output - - -# run the programm - -dict_txt = a #input a dict of {gene:reprTrans} in the form of a txt file -input_intermediate_file = b #input the intermediate file generated by 
transckript extractor -input_expr = c #input a csv or tsv file containing the expr level - -dict_reprTrans = txt_to_dict(dict_txt) -match_final = match_reprTranscript_expressionLevel(input_expr, dict_reprTrans, input_intermediate_file) -print("this is the function :\n\n {}".format(match_final)) - -if __name__ == "__main__": - match_reprTranscript_expressionLevel() - \ No newline at end of file diff --git a/scripts/poisson_sampling.py b/scripts/poisson_sampling.py deleted file mode 100644 index 60d043db32e0daed7e4cfdf685ed024942af5747..0000000000000000000000000000000000000000 --- a/scripts/poisson_sampling.py +++ /dev/null @@ -1,47 +0,0 @@ -import pandas as pd -import numpy as np -import argparse - - -''' -Sample transcript - -This part of the code does Poisson sampling proportionally to gene expression levels for each gene. - -input: total transcript number (int) - csv file with gene id and gene expression levels (columns named 'id' and 'level') - -output: csv file with gene id and count - gtf file with transcript samples -''' - - -def transcript_sampling(total_transcript_number, csv_file, output_csv): - df = pd.read_csv(csv_file, sep='\t', lineterminator='\n', names=["id", "level"]) - levels = [] - sums = df['level'].tolist() - total = sum(sums) - normalized = total_transcript_number/total - for expression_level in df['level']: - poisson_sampled = np.random.poisson(expression_level*normalized) - levels.append(poisson_sampled) - - transcript_numbers = pd.DataFrame({'id': df['id'],'count': levels}) - pd.DataFrame.to_csv(transcript_numbers, output_csv) - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Transcript Poisson sampler, csv output", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument("--expression_level", required=True, help="csv file with expression level") - parser.add_argument("--output_csv", required=True, help="output csv file") - parser.add_argument("--input_csv", required=True, 
help="input csv file") - parser.add_argument("--transcript_number", required=True, help="total number of transcripts to sample") - args = parser.parse_args() - - - transcript_sampling(args.transcript_number, args.input_csv, args.output_csv, args.transcript_number) - - diff --git a/scripts/representative_v4.py b/scripts/representative_v4.py deleted file mode 100644 index c940686b2126121b33d12a858026338aff737706..0000000000000000000000000000000000000000 --- a/scripts/representative_v4.py +++ /dev/null @@ -1,96 +0,0 @@ - -import pandas as pd - -''' -This part of the code take as input a gtf modified file -and return a dictionary of transcripts with best -support level for each gene of the input - -''' - - - - -def import_gtfSelection_to_df(gtf_modified_file: str): - """Import intermediate file from gtf and create a df - - Args: - gtf_modified_file (str) : path to the intermediate file - - Returns: - Pandas dataframe having Gene, transcript - and support level as columns - - Raises: - TypeError : Only str path is allowed - - """ - pass - if not type(gtf_modified_file) is str: - raise TypeError("Only str path is allowed") - df_input = pd.read_csv(gtf_modified_file, sep = '\t', lineterminator = '\n', -names = ["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] ) - df_input["Support_level"] = df_input["Support_level"].replace(" ", "") - df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True) - df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True) - df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]] - df_clean["Gene"] = df_clean["Gene"].fillna(method = 'ffill') - df_clean = df_clean.dropna(axis = 0) - return df_clean - - - - -def representative_transcripts_inDict(df_gtfSelection: str) -> pd.DataFrame: - """Return a dict containing for each gene transcripts - with highest confidence level - - Args: - df_gtfSelection (str): Pandas dataframe having Gene, - transcript and support level as 
columns - - Returns: - Dict {'Gene':['transcriptA', 'transcriptB'], ...} - - Raises: - TypeError : Only pandas DataFrame is allowed - """ - pass - - if not type(df_gtfSelection) is pd.DataFrame: - raise TypeError("Only pandas DataFrame is allowed") - - df_multIndex = df_gtfSelection.set_index(["Gene", "Transcript"]) - #highest support level = 1 , worst = 5, NA = 100 - df_min = df_multIndex.groupby(level=["Gene"])["Support_level"].transform("min") - df_final = df_min.reset_index(level = "Transcript") - df_final = df_final.drop(columns = ["Support_level"]) - dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict() - return dict_representative_transcripts - - - -def find_repr_by_SupportLevel(intermediate_file:str): - """Combine functions import_gtfSelection_to_df() - and representative_transcripts_inDict() - - Args: - intermediate_file : path to the intermediate file - - Returns: - Dict {'Gene':['transcriptA', 'transcriptB'], ...} - - Raises: - None - - - """ - pass - df_gtf = import_gtfSelection_to_df(intermediate_file) - dict_reprTrans = representative_transcripts_inDict(df_gtf) - return dict_reprTrans - - - -if __name__ == "__main__": - find_repr_by_SupportLevel() diff --git a/scripts/transkript_extractor.py b/scripts/transkript_extractor.py deleted file mode 100644 index 6bcd13bd151ae173cfbacfd57ebb2a247320c669..0000000000000000000000000000000000000000 --- a/scripts/transkript_extractor.py +++ /dev/null @@ -1,311 +0,0 @@ -#### Transcript extractor ##### -"""Transcript extractor -Version 1.1.0""" -### Called Packages ### -import re -import os -import time - -### Functions ### - - - -def __parameter_editor(file_name,source_pathway_name,deposit_pathway_name): - """This function allows for chaging the parameters after running the program""" - while True: - print("The program will run with the following parameters:\nFile name:\t\t",file_name,"\nSource pathway:\t",source_pathway_name,"\nDeposit pathway:\t",deposit_pathway_name,"\n") 
- parameter_conformation = input("To continue with these parameters input [continue or c] to change them input [edit]\n>") - if parameter_conformation == "continue"or parameter_conformation =="c": - break - elif parameter_conformation == "edit": - #edit the parameters - while True: - change_question = input("select the parameter you want to change [nfile/spath/dpath] or input [b] to go back\n>") - if change_question == "nfile": - #This condition allows the user to chenge the file name - file_name = input("Please input the new file name\n>") - break - elif change_question == "spath": - #This condition allows the user to change the source path - source_pathway_name = input("Please input the new source path\n>") - - does_source_pathway_exist = os.path.exists(source_pathway_name) - if does_source_pathway_exist: - break - else: - print("The new source pathway:",source_pathway_name,"does not exist\nThe source pathway was returned to default:",os.getcwd()) - source_pathway_name = os.getcwd() - elif change_question == "dpath": - #This condition allows the user to change output file location - deposit_pathway_name = input("Please input the new output file path name\n>") - does_deposit_pathway_exist = os.path.exists(deposit_pathway_name) - if does_deposit_pathway_exist: - break - else: - print("The new deposit pathway:",deposit_pathway_name,"does not existe\nThe deposit pathway was returnt to default:",source_pathway_name) - deposit_pathway_name = source_pathway_name - #The block above test if the new deposit pathway is valid - elif change_question == "b": - # This condition allows the user to return to the main loop - break - else: - #This condition covers all non valid inputs into the secund loop - print("The input",change_question,"is not valid. 
Please use one of the specified commands") - - else: - #This condition covers all non valid input for the main loop - print("The input",parameter_conformation,"is not valide please use one of the specified comands\n") - return(file_name,source_pathway_name,deposit_pathway_name) - - - - - - - -def __searche_for_preexisting_files(file_name,deposit_pathway_name = os.getcwd()): - """This function searches for preexisting files of the same name as the results file of the current program. It allows the user to choose to move on with the pre-existing file """ - File_of_same_name_found = False - generat_new_file = False - directory_content = os.listdir(deposit_pathway_name) - for file in directory_content: - if file == file_name: - while True: - File_found_input = input (file_name+" has allready been generated\nDo you want to generate a new one [y/n] \n>") - if File_found_input == "n": - File_of_same_name_found = True - break - elif File_found_input == "y": - generat_new_file = True - break - else: - print("Invalid input\nPlease press [y] if you want to generate a new file or [n] if you want to use the preexisting file") - break - else: - continue - if File_of_same_name_found: - print("No new file will be generated, the program can continue") - elif generat_new_file: - print("A new file will be generated please wait...\n") - else: - print("No pre-existing file of the relevant type has been found.\nA new file will be generated please wait...\n") - return(File_of_same_name_found) - -def bar_builder(percentage = 0,length_multiplyer = 2,start_time = time.time(),bar = str()): - """This function creates a loading bar that can load in 10% increments starting a 0% and ending at 100% - Expected inputs: - percentage: int between 0 and 100 in steps of 10; default = 0 #defines the current loading increment - length_multiplyer: int > 0 ; default = 2 #determiens the amount of symbols per loading increment - start_time: any int ; default= time.time() #for determening loading time - bar: 
str ; default = str()#input of the current bar status does not need to be defined if for the 0% increment - """ - if percentage == 100: - bar = bar.replace("-","#") - print("\r"+bar+"\t"+"100%\t\t"+str(int(time.time()-start_time))) - elif percentage > 0: - bar = bar.replace("-","#",length_multiplyer) - print("\r"+bar+"\t"+str(percentage)+"%", end='',flush=True) - elif percentage == 0: - bar = "["+"-"*length_multiplyer*10+"]" - print(bar+"\t", end='',flush=True) - return(bar,start_time) - -def __test_file_name(file_name,source_pathway_name = os.getcwd()): - """This function validates that the source file exists at the source path. It turns the file name input in a standardized format that can be used in the next steps""" - - directory_content = os.listdir(source_pathway_name) - - index_of_the_dot = file_name.rfind(".") - valide_source_file = False - validate_source_file = True - if index_of_the_dot ==-1: - file_name += ".gtf" - else: - source_file_typ = file_name[index_of_the_dot:] - not_a_file_type = re.compile(".\d{1,13}") - try_not_a_file_type = not_a_file_type.search(source_file_typ) - if source_file_typ == ".gtf": - file_name = file_name - elif try_not_a_file_type: - file_name += ".gtf" - else: - print("This program can not handle",source_file_typ,"files. 
def __do_pathways_exist__(source_pathway_name, deposit_pathway_name):
    """Validate that the source and deposit paths exist on disk.

    Falls back to the current working directory for a missing source path,
    and to the (possibly corrected) source path for a missing deposit path.

    Expected inputs:
        source_pathway_name: str   # path where the .gtf file is looked for
        deposit_pathway_name: str  # path where output files will be written
    Returns:
        (source_pathway_name, deposit_pathway_name): tuple[str, str]
    """
    does_source_pathway_exist = os.path.exists(source_pathway_name)
    does_deposit_pathway_exist = os.path.exists(deposit_pathway_name)
    if not does_source_pathway_exist:
        print("The source pathway:", source_pathway_name,
              "has not been found\nThe source pathway was set to the default")
        source_pathway_name = os.getcwd()
    if not does_deposit_pathway_exist:
        print("The deposit pathway:", deposit_pathway_name,
              "has not been found\nThe deposit pathway was set to the default")
        # Intentionally uses the *corrected* source path as the fallback.
        deposit_pathway_name = source_pathway_name
    return (source_pathway_name, deposit_pathway_name)


def gene_ID_finder(entry):
    """Extract the gene ID from a gtf line known to contain one.

    Expected inputs:
        entry: str  # a gtf line containing a `gene_id "..."` attribute
    Returns:
        gene_ID: str  # the ID with surrounding quotes stripped

    NOTE: raises TypeError if no quoted ID follows "gene_id" — callers
    are expected to pre-check `"gene_id" in entry` before calling.
    """
    index_gene_id = entry.find("gene_id")
    find_gene_id_name = re.compile(r'"\S{1,25}"')
    sub_entry = entry[index_gene_id:]
    try_find_gene_id_name = find_gene_id_name.search(sub_entry)
    gene_ID = try_find_gene_id_name[0].replace('"', "")
    return (gene_ID)


def transcript_ID_finder(entry):
    """Extract the transcript ID from a gtf line.

    Expected inputs:
        entry: str  # a gtf line containing a `transcript_id "..."` attribute
    Returns:
        transcript_ID: str  # the ID, or "" if no quoted ID is found
    """
    index_transcript_id = entry.find("transcript_id")
    find_transcript_id_name = re.compile(r'"\S{1,25}"')
    sub_entry = entry[index_transcript_id:]
    try_find_transcript_id_name = find_transcript_id_name.search(sub_entry)
    try:
        transcript_ID = try_find_transcript_id_name[0].replace('"', "")
    except TypeError:
        # search() returned None (no match) -> subscripting raised TypeError.
        # Was a bare `except:`; narrowed so real bugs are no longer swallowed.
        transcript_ID = ""
    return (transcript_ID)


def transcript_support_level_finder(entry):
    """Extract the transcript support level from a gtf transcript line.

    Expected input:
        entry: str  # a gtf line belonging to a transcript
    Returns:
        transcript_support_level: str | int
            # the level as a string (e.g. "1"), or the sentinel 100 when the
            # attribute is absent or its value is "NA"
    """
    transcript_support_level_start_ID = entry.find("transcript_support_level")
    sub_entry = entry[transcript_support_level_start_ID:]
    try:
        score_finder = re.compile(r"\W\w{1,16}\W{2}")
        try_score_finder = score_finder.search(sub_entry)
        Pre_score_1 = try_score_finder[0]
        Pre_score_2 = Pre_score_1.replace('"', "")
        Pre_score_2 = Pre_score_2.replace("(", "")
        transcript_support_level = Pre_score_2.replace(";", "")
        if "NA" in transcript_support_level:
            # "NA" is mapped to the worst-possible sentinel value 100.
            transcript_support_level = 100
    except TypeError:
        # No match found -> subscripting None raised TypeError; treat the
        # missing attribute the same as "NA". Was a bare `except:`.
        transcript_support_level = 100
    return (transcript_support_level)


def _transcript_extractor(file_name, source_pathway_name, deposit_pathway_name):
    """Extract per-gene transcript data from a gtf file.

    Writes transcript number, transcript ID and transcript support level for
    every transcript to a new file named <file_name>_intermediate_file.txt,
    grouped under ">gene_id" header lines.

    Expected input:
        file_name: str            # name of the gtf file WITHOUT the .gtf suffix
        source_pathway_name: str  # path of the gtf file
        deposit_pathway_name: str # path for saving the intermediate file
    """
    # First pass only counts lines so the progress bar can show percentages.
    with open(os.path.join(source_pathway_name, file_name + ".gtf"), "r") as f:
        total_entrys = len(f.readlines())
    with open(os.path.join(source_pathway_name, file_name + ".gtf"), "r") as f:
        current_entry = 0
        percentage_done = 0
        # bar_builder is defined elsewhere in this module — TODO confirm its
        # exact return contract (assumed: (bar, start_time)).
        bar, start_time = bar_builder(length_multiplyer=3)
        Old_gen_ID = str()  # stand-in: the first few gtf entries are not genes
        with open(os.path.join(deposit_pathway_name,
                               file_name + "_" + "intermediate_file" + ".txt"),
                  "w") as IMF:
            transcript_number = 0
            for entry in f:
                current_entry += 1
                current_percentage_done = 100 * current_entry / total_entrys
                if current_percentage_done > percentage_done + 10:
                    bar, start_time = bar_builder(
                        percentage=percentage_done + 10,
                        length_multiplyer=3,
                        start_time=start_time,
                        bar=bar)
                    percentage_done = int(current_percentage_done)
                if "gene_id" in entry:
                    Gen_ID = gene_ID_finder(entry)
                else:
                    Gen_ID = Old_gen_ID
                if Gen_ID != Old_gen_ID:
                    # New gene: write a fasta-style header and restart the
                    # per-gene transcript counter.
                    Gen_entry = ">" + Gen_ID + "\n"
                    IMF.write(Gen_entry)
                    transcript_number = 0
                    Old_gen_ID = Gen_ID
                if "\ttranscript\t" in entry:
                    transcript_number += 1
                    Transcript_ID = transcript_ID_finder(entry)
                    transcript_support_level = transcript_support_level_finder(entry)
                    # Trailing empty field is kept deliberately — downstream
                    # consumers expect this exact column layout.
                    New_entry = (str(transcript_number) + "\t"
                                 + str(Transcript_ID) + "\t"
                                 + str(transcript_support_level) + "\t" + "\t\n")
                    IMF.write(New_entry)
    bar_builder(100, length_multiplyer=3, start_time=start_time, bar=bar)
    print("The transcripts have been collected")


def extract_transkript(file_name="test", source_pathway_name=os.getcwd(),
                       deposit_pathway_name=False, Input_free=False):
    """Run the whole transcript extraction process for a file, with checks.

    Expected input:
        file_name: str            # default "test"; gtf file name to look at
        source_pathway_name: str  # default: cwd at import time; gtf file path
        deposit_pathway_name: str # default: source_pathway_name; output path
        Input_free: bool          # skip the interactive parameter editing
    Outputs:
        (file_name, source_pathway_name, deposit_pathway_name): tuple of str
    """
    if deposit_pathway_name is False:
        # Fix: was `== False`, which also (wrongly) matched 0.
        deposit_pathway_name = source_pathway_name
    if Input_free:
        validated_file_name = __test_file_name(file_name, source_pathway_name)
        file_name = validated_file_name[1]
        _transcript_extractor(file_name, source_pathway_name, deposit_pathway_name)
    else:
        # __parameter_editor / __searche_for_preexisting_files /
        # __test_file_name are defined elsewhere in this module.
        file_name, source_pathway_name, deposit_pathway_name = __parameter_editor(
            file_name, source_pathway_name, deposit_pathway_name)
        source_pathway_name, deposit_pathway_name = __do_pathways_exist__(
            source_pathway_name, deposit_pathway_name)
        validated_file_name = __test_file_name(file_name, source_pathway_name)
        file_name = validated_file_name[1]
        if validated_file_name[0]:
            if __searche_for_preexisting_files(
                    file_name + "_intermediate_file.txt", deposit_pathway_name):
                print("The transcripts has been collected\n")
            else:
                _transcript_extractor(file_name, source_pathway_name,
                                      deposit_pathway_name)
    return (file_name, source_pathway_name, deposit_pathway_name)


#### Dev part ####

if __name__ == "__main__":
    # Allows the file to be executed on its own.
    extract_transkript()


# ---------------------------------------------------------------------------
# Original file boundary: the code below belonged to scripts/writegtf.py.
# ---------------------------------------------------------------------------

import pandas as pd
import numpy as np
import argparse
import re


def transcript_ID_finder(entry):
    """Extract the transcript ID from a gtf line ("" if none found).

    Duplicate of the extractor-module helper above (this copy came from
    scripts/writegtf.py); behavior is identical.
    """
    index_transcript_id = entry.find("transcript_id")
    find_transcript_id_name = re.compile(r'"\S{1,25}"')
    sub_entry = entry[index_transcript_id:]
    try_find_transcript_id_name = find_transcript_id_name.search(sub_entry)
    try:
        transcript_ID = try_find_transcript_id_name[0].replace('"', "")
    except TypeError:
        transcript_ID = ""
    return (transcript_ID)


def gtf_file_writer(original_file, csv_file, output_file):
    """Write a gtf file restricted to the transcripts listed in a csv file.

    Takes the original gtf file and a csv file with an 'id' column of
    relevant transcript IDs, and produces a gtf file containing only the
    transcript entries whose ID appears in the csv.

    Expected input:
        original_file: str  # path to the genome-annotation gtf file
        csv_file: str       # path to the csv file (must have an 'id' column)
        output_file: str    # path of the gtf file to write
    """
    output = []
    df = pd.read_csv(csv_file)
    if 'id' not in df.columns:
        # Fix: original compared a whole Series to False (`df['id'] == False`),
        # which raises ValueError, and accessed df['id'] before the check.
        print('Error. \'id\' column needed in input csv file.')
        return
    # set() gives O(1) membership tests instead of O(n) per gtf line.
    listoftranscripts = set(df['id'].tolist())
    with open(original_file, 'r') as f:
        for entry in f:
            if "\ttranscript\t" in entry:
                transcript_id = transcript_ID_finder(entry)
                if transcript_id in listoftranscripts:
                    output.append(entry)
    with open(output_file, 'w') as last_file:
        # Fix: original passed the list itself to write(), a TypeError.
        last_file.writelines(output)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="gtf output file writer",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--annotation", required=True,
                        help="gtf file with genome annotation")
    parser.add_argument("--output_gtf", required=True, help="output gtf file")
    parser.add_argument("--input_csv", required=True, help="input csv file")
    args = parser.parse_args()

    gtf_file_writer(args.annotation, args.input_csv, args.output_gtf)