From 2c662762a2040f5e05a1dcba8f9798aa38209ffe Mon Sep 17 00:00:00 2001 From: LauraU123 <laura.urbanska@stud.unibas.ch> Date: Mon, 21 Nov 2022 15:12:56 +0100 Subject: [PATCH] updated script names and started updated execution file --- scripts/Excecution_file.py | 20 ---- ...length_filter.py => exon_length_filter.py} | 18 ++-- ...> match_reprtranscript_expressionlevel.py} | 0 scripts/new_exe_file.ipynb | 101 ++++++++++++++++++ ...representative_v4.py => representative.py} | 0 ...t_extractor.py => transcript_extractor.py} | 6 +- 6 files changed, 113 insertions(+), 32 deletions(-) delete mode 100644 scripts/Excecution_file.py rename scripts/{Exon_length_filter.py => exon_length_filter.py} (92%) rename scripts/{match_reprTranscript_expressionLevel.py => match_reprtranscript_expressionlevel.py} (100%) create mode 100644 scripts/new_exe_file.ipynb rename scripts/{representative_v4.py => representative.py} (100%) rename scripts/{transkript_extractor.py => transcript_extractor.py} (96%) diff --git a/scripts/Excecution_file.py b/scripts/Excecution_file.py deleted file mode 100644 index 788525e..0000000 --- a/scripts/Excecution_file.py +++ /dev/null @@ -1,20 +0,0 @@ -### Imports ### -import os - -import transkript_extractor as te -import Exon_length_filter as elf -import representative_v4 as rtcl - -### Scipt ### -def exe(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = os.getcwd(),Input_free = True): - file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transkript(file_name,source_pathway_name,deposit_pathway_name,Input_free = Input_free) - inter_mediate_file_directory = os.path.join(deposit_pathway_name,file_name+"_intermediate_file.txt") - print("Transcripts are filterd based on transcipt score please wait...") - pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory) - print("Transcripts filtered\n") - elf.exon_length_filter(file_name,source_pathway_name,deposit_pathway_name,gen_dict= pre_filter_representative_transcripts_dict,Input_free = Input_free) - return(file_name,source_pathway_name,deposit_pathway_name) -### from consol #### -##D:\\Uni\\Sem 9\\Programing in the Life sciences\\Projekt\\Intermediat Files -if __name__ == "__main__": - exe() \ No newline at end of file diff --git a/scripts/Exon_length_filter.py b/scripts/exon_length_filter.py similarity index 92% rename from scripts/Exon_length_filter.py rename to scripts/exon_length_filter.py index 162ff1f..db8b12c 100644 --- a/scripts/Exon_length_filter.py +++ b/scripts/exon_length_filter.py @@ -5,11 +5,11 @@ Version 1.1.0""" import re import os -import transkript_extractor as te +import transcript_extractor as te ### Functions ### def exon_length_calculator(entry): - """This funtion finds the start and end cordinates of the exon and uses them to calculate its lenght""" + """This function finds the start and end cordinates of the exon and uses them to calculate its length""" try: find_exon_coordinates = re.compile("\t\d{1,15}\t") #this difines the pattern of the coordinates @@ -25,12 +25,12 @@ def exon_length_calculator(entry): try_find_end_coordinates = find_exon_coordinates.search(sub_entry) end_coordinates = int(try_find_end_coordinates[0].replace("\t","")) #these two lines find the end coordinates and turn tham int an int - exon_lenght = end_coordinates-start_coordinates + exon_length = end_coordinates-start_coordinates #this line claculates the transcript length except: print("\n\nIn the following enty only one or no valid coordinates could be found:\n",entry,"the value will be set to NA") - exon_lenght = "NA" - return(exon_lenght) + exon_length = "NA" + return(exon_length) def exon_fider(entry): """This funtion determines if a given entry belongs to an exon @@ -46,7 +46,7 @@ def exon_fider(entry): return(try_exon_test) def __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID): - """This funtion encapsulates an opperation that has to be carried out at several point ind the exon_length_filter funktion and servers to make that funktion more modular""" + """This funtion encapsulates an operation that has to be carried out at several points in the exon_length_filter function and serves to make that function more modular""" if current_exon_length > longest_transcript: #This condition updates the most promesing for #beeing the representative transcript @@ -65,7 +65,7 @@ def _representative_transcript_csv (representative_transcript,file_name = "test" def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}): - """This funtion selects only the transcripts for a dictionar that have the longest total mRNA""" + """This funtion selects only the transcripts for a dictionary that have the longest total mRNA""" bar,start_time = te.bar_builder(length_multiplyer = 3) total_genes = len(gen_dict) gens_done = 0 @@ -133,7 +133,7 @@ def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),dep current_transcript_ID = te.transcript_ID_finder(entry) except: continue - #The block above searches for a trnascript ID in the current enty + #The block above searches for a transcript ID in the current entry if current_transcript_ID in transcript_IDs: #This condition test if the Transcript is one of the @@ -185,4 +185,4 @@ if __name__ == "__main__": exon_length_filter() -#This line allows the file to be executed on its own also from \ No newline at end of file +#This line allows the file to be executed on its own also from diff --git a/scripts/match_reprTranscript_expressionLevel.py b/scripts/match_reprtranscript_expressionlevel.py similarity index 100% rename from scripts/match_reprTranscript_expressionLevel.py rename to scripts/match_reprtranscript_expressionlevel.py diff --git a/scripts/new_exe_file.ipynb b/scripts/new_exe_file.ipynb new file mode 100644 index 0000000..c79b091 --- /dev/null +++ b/scripts/new_exe_file.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] --annotation ANNOTATION --expression_level\n", + " EXPRESSION_LEVEL --output_csv OUTPUT_CSV\n", + " --output_gtf OUTPUT_GTF --transcript_number\n", + " TRANSCRIPT_NUMBER\n", + "ipykernel_launcher.py: error: the following arguments are required: --annotation, --expression_level, --output_csv, --output_gtf, --transcript_number\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import argparse\n", + "import transcript_extractor as te\n", + "import exon_length_filter as elf\n", + "import representative as rtcl\n", + "import representative as rp\n", + "\n", + "if __name__ == '__main__':\n", + " parser = argparse.ArgumentParser(\n", + " description=\"transcript sampler\",\n", + " formatter_class=argparse.ArgumentDefaultsHelpFormatter\n", + " )\n", + " parser.add_argument(\"--annotation\", required=True, help=\"gtf file with genome annotation\")\n", + " #parser.add_argument(\"--expression_level\", required=True, help=\"csv file with expression level\")\n", + " parser.add_argument(\"--output_csv\", required=True, help=\"output csv file\")\n", + " parser.add_argument(\"--output_gtf\", required=True, help=\"output gtf file\")\n", + " parser.add_argument(\"--transcript_number\", required=True, help=\"total number of transcripts to sample\")\n", + " args = parser.parse_args()\n", + "\n", + "def exe(input_file, csv, gtf, transcript_nr):\n", + " file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transcript(input_file, deposit_pathway_name = True, Input_free = Input_free)\n", + " inter_mediate_file_directory = input_file +\"_intermediate_file.txt\"\n", + "\n", + " print(\"Transcripts are filtered based on transcript score. Please wait...\")\n", + "\n", + " pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory)\n", + "\n", + " print(\"Transcripts filtered\\n\")\n", + " elf.exon_length_filter(file_name,gen_dict= pre_filter_representative_transcripts_dict, Input_free = True)\n", + "\n", + "\n", + " #return(file_name,source_pathway_name,deposit_pathway_name)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.12 ('nextstrain')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "41a54f34eee8c9e478b3404dd74579d3248e5c82a4969468d7042e338229b1fe" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scripts/representative_v4.py b/scripts/representative.py similarity index 100% rename from scripts/representative_v4.py rename to scripts/representative.py diff --git a/scripts/transkript_extractor.py b/scripts/transcript_extractor.py similarity index 96% rename from scripts/transkript_extractor.py rename to scripts/transcript_extractor.py index 6bcd13b..3538339 100644 --- a/scripts/transkript_extractor.py +++ b/scripts/transcript_extractor.py @@ -11,7 +11,7 @@ import time def __parameter_editor(file_name,source_pathway_name,deposit_pathway_name): - """This function allows for chaging the parameters after running the program""" + """This function allows for changing the parameters after running the program""" while True: print("The program will run with the following parameters:\nFile name:\t\t",file_name,"\nSource pathway:\t",source_pathway_name,"\nDeposit pathway:\t",deposit_pathway_name,"\n") parameter_conformation = input("To continue with these parameters input [continue or c] to change them input [edit]\n>") @@ -273,7 +273,7 @@ def _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name): print("The transcripts have been collected") -def extract_transkript (file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = False,Input_free = False): +def extract_transcript(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = False,Input_free = False): """This it the overall exetutable funtion that will execute the transcript extraction process for a given file with all checks. Expected input: file_name: str ; default = test #the name of the gft file you want to look at @@ -305,7 +305,7 @@ def extract_transkript (file_name = "test",source_pathway_name = os.getcwd(),dep #### Dev part #### if __name__ == "__main__": - extract_transkript() + extract_transcript() #This line allows the file to be executed on its own also from -- GitLab