updated script names and started updated execution file

2c662762 · Laura Urbanska · 83f6b6f3 · 83f6b6f3 · 2c662762 · 2c662762
Commit 2c662762 authored 2 years ago by Laura Urbanska
--- a/scripts/Excecution_file.py
+++ b/scripts/Excecution_file.py
-### Imports ###
-import os
-import transkript_extractor as te
-import Exon_length_filter as elf
-import representative_v4 as rtcl
-### Scipt ###
-def exe(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = os.getcwd(),Input_free = True):
-    file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transkript(file_name,source_pathway_name,deposit_pathway_name,Input_free = Input_free)
-    inter_mediate_file_directory = os.path.join(deposit_pathway_name,file_name+"_intermediate_file.txt")
-    print("Transcripts are filterd based on transcipt score please wait...")
-    pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory)
-    print("Transcripts filtered\n")
-    elf.exon_length_filter(file_name,source_pathway_name,deposit_pathway_name,gen_dict= pre_filter_representative_transcripts_dict,Input_free = Input_free)
-    return(file_name,source_pathway_name,deposit_pathway_name)
-### from consol ####
-##D:\\Uni\\Sem 9\\Programing in the Life sciences\\Projekt\\Intermediat Files
-if __name__ == "__main__":
-    exe()
\ No newline at end of file
--- a/scripts/Exon_length_filter.py
+++ b/scripts/Exon_length_filter.py
@@ -5,11 +5,11 @@ Version 1.1.0"""
 import re
 import os
-import transkript_extractor as te
+import transcript_extractor as te
 ### Functions ###
 def exon_length_calculator(entry): 
-    """This funtion finds the start and end cordinates of the exon and uses them to calculate its lenght"""
+    """This function finds the start and end coordinates of the exon and uses them to calculate its length"""
    try:
        find_exon_coordinates = re.compile("\t\d{1,15}\t")
        #this difines the pattern of the coordinates 
@@ -25,12 +25,12 @@ def exon_length_calculator(entry):
        try_find_end_coordinates = find_exon_coordinates.search(sub_entry)
        end_coordinates = int(try_find_end_coordinates[0].replace("\t",""))
        #these two lines find the end coordinates and turn tham int an int 
-        exon_lenght = end_coordinates-start_coordinates
+        exon_length = end_coordinates-start_coordinates
        #this line claculates the transcript length 
    except:
        print("\n\nIn the following enty only one or no valid coordinates could be found:\n",entry,"the value will be set to NA")
-        exon_lenght = "NA"
+        exon_length = "NA"
-    return(exon_lenght)
+    return(exon_length)
 def exon_fider(entry):
    """This funtion determines if a given entry belongs to an exon
@@ -46,7 +46,7 @@ def exon_fider(entry):
    return(try_exon_test)
 def __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID):
-    """This funtion encapsulates an opperation that has to be carried out at several point ind the exon_length_filter funktion and servers to make that funktion more modular"""
+    """This function encapsulates an operation that has to be carried out at several points in the exon_length_filter function and serves to make that function more modular"""
    if current_exon_length > longest_transcript: 
        #This condition updates the most promesing for
        #beeing the representative transcript
@@ -65,7 +65,7 @@ def _representative_transcript_csv (representative_transcript,file_name = "test"
 def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}):
-    """This funtion selects only the transcripts for a dictionar that have the longest total mRNA"""  
+    """This function selects only the transcripts for a dictionary that have the longest total mRNA"""  
    bar,start_time = te.bar_builder(length_multiplyer = 3)
    total_genes = len(gen_dict)
    gens_done = 0
@@ -133,7 +133,7 @@ def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),dep
                    current_transcript_ID = te.transcript_ID_finder(entry)         
                except: 
                    continue
-                #The block above searches for a trnascript ID in the  current enty
+                #The block above searches for a transcript ID in the current entry
                if current_transcript_ID in transcript_IDs:
                    #This condition test if the Transcript is one of the 
@@ -185,4 +185,4 @@ if __name__ == "__main__":
    exon_length_filter()
 #This line allows the file to be executed on its own also from 
\ No newline at end of file
--- a/scripts/match_reprTranscript_expressionLevel.py
+++ b/scripts/match_reprTranscript_expressionLevel.py
--- a/scripts/new_exe_file.ipynb
+++ b/scripts/new_exe_file.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "usage: ipykernel_launcher.py [-h] --annotation ANNOTATION --expression_level\n",
+      "                             EXPRESSION_LEVEL --output_csv OUTPUT_CSV\n",
+      "                             --output_gtf OUTPUT_GTF --transcript_number\n",
+      "                             TRANSCRIPT_NUMBER\n",
+      "ipykernel_launcher.py: error: the following arguments are required: --annotation, --expression_level, --output_csv, --output_gtf, --transcript_number\n"
+     ]
+    },
+    {
+     "ename": "SystemExit",
+     "evalue": "2",
+     "output_type": "error",
+     "traceback": [
+      "An exception has occurred, use %tb to see the full traceback.\n",
+      "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "import argparse\n",
+    "import transcript_extractor as te\n",
+    "import exon_length_filter as elf\n",
+    "import representative as rtcl\n",
+    "import representative as rp\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    parser = argparse.ArgumentParser(\n",
+    "        description=\"transcript sampler\",\n",
+    "        formatter_class=argparse.ArgumentDefaultsHelpFormatter\n",
+    "    )\n",
+    "    parser.add_argument(\"--annotation\", required=True, help=\"gtf file with genome annotation\")\n",
+    "    #parser.add_argument(\"--expression_level\", required=True, help=\"csv file with expression level\")\n",
+    "    parser.add_argument(\"--output_csv\", required=True, help=\"output csv file\")\n",
+    "    parser.add_argument(\"--output_gtf\", required=True, help=\"output gtf file\")\n",
+    "    parser.add_argument(\"--transcript_number\", required=True, help=\"total number of transcripts to sample\")\n",
+    "    args = parser.parse_args()\n",
+    "\n",
+    "def exe(input_file, csv, gtf, transcript_nr):\n",
+    "    file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transcript(input_file, deposit_pathway_name = True, Input_free = Input_free)\n",
+    "    inter_mediate_file_directory = input_file +\"_intermediate_file.txt\"\n",
+    "\n",
+    "    print(\"Transcripts are filtered based on transcript score. Please wait...\")\n",
+    "\n",
+    "    pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory)\n",
+    "\n",
+    "    print(\"Transcripts filtered\\n\")\n",
+    "    elf.exon_length_filter(file_name,gen_dict= pre_filter_representative_transcripts_dict, Input_free = True)\n",
+    "\n",
+    "\n",
+    "    #return(file_name,source_pathway_name,deposit_pathway_name)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('nextstrain')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "41a54f34eee8c9e478b3404dd74579d3248e5c82a4969468d7042e338229b1fe"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+``` python
+import os
+import argparse
+import transcript_extractor as te
+import exon_length_filter as elf
+import representative as rtcl
+import representative as rp
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="transcript sampler",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--annotation", required=True, help="gtf file with genome annotation")
+    #parser.add_argument("--expression_level", required=True, help="csv file with expression level")
+    parser.add_argument("--output_csv", required=True, help="output csv file")
+    parser.add_argument("--output_gtf", required=True, help="output gtf file")
+    parser.add_argument("--transcript_number", required=True, help="total number of transcripts to sample")
+    args = parser.parse_args()
+def exe(input_file, csv, gtf, transcript_nr):
+    file_name,source_pathway_name_2,deposit_pathway_name_2 = te.extract_transcript(input_file, deposit_pathway_name = True, Input_free = Input_free)
+    inter_mediate_file_directory = input_file +"_intermediate_file.txt"
+    print("Transcripts are filtered based on transcript score. Please wait...")
+    pre_filter_representative_transcripts_dict = rtcl.find_repr_by_SupportLevel(inter_mediate_file_directory)
+    print("Transcripts filtered\n")
+    elf.exon_length_filter(file_name,gen_dict= pre_filter_representative_transcripts_dict, Input_free = True)
+    #return(file_name,source_pathway_name,deposit_pathway_name)
+```
+%% Output
+    usage: ipykernel_launcher.py [-h] --annotation ANNOTATION --expression_level
+                                 EXPRESSION_LEVEL --output_csv OUTPUT_CSV
+                                 --output_gtf OUTPUT_GTF --transcript_number
+                                 TRANSCRIPT_NUMBER
+    ipykernel_launcher.py: error: the following arguments are required: --annotation, --expression_level, --output_csv, --output_gtf, --transcript_number
+An exception has occurred, use %tb to see the full traceback.
+    SystemExit: 2
+%% Cell type:code id: tags:
+``` python
+```
--- a/scripts/representative_v4.py
+++ b/scripts/representative_v4.py
--- a/scripts/transkript_extractor.py
+++ b/scripts/transkript_extractor.py
@@ -11,7 +11,7 @@ import time
 def __parameter_editor(file_name,source_pathway_name,deposit_pathway_name):
-    """This function allows for chaging the parameters after running the program"""
+    """This function allows for changing the parameters after running the program"""
    while True:
        print("The program will run with the following parameters:\nFile name:\t\t",file_name,"\nSource pathway:\t",source_pathway_name,"\nDeposit pathway:\t",deposit_pathway_name,"\n")
        parameter_conformation = input("To continue with these parameters input [continue or c] to change them input [edit]\n>")
@@ -273,7 +273,7 @@ def _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name):
        print("The transcripts have been collected") 
-def extract_transkript (file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = False,Input_free = False): 
+def extract_transcript(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = False,Input_free = False): 
   """This it the overall exetutable funtion that will execute the transcript extraction process for a given file with all checks. 
    Expected input:
        file_name: str ; default = test #the name of the gft file you want to look at
@@ -305,7 +305,7 @@ def extract_transkript (file_name = "test",source_pathway_name = os.getcwd(),dep
 #### Dev part ####
 if __name__ == "__main__":
-    extract_transkript()
+    extract_transcript()
 #This line allows the file to be executed on its own also from