Skip to content
Snippets Groups Projects
Commit 67252a2b authored by Laura Urbanska's avatar Laura Urbanska
Browse files

updated transcript extractor and exon length filter

parent 132757a5
No related branches found
No related tags found
No related merge requests found
...@@ -3,8 +3,9 @@ ...@@ -3,8 +3,9 @@
### Called Packages ### ### Called Packages ###
import re import re
import os import os
import time
import transkript_extractor as te import transcript_extractor as te
### Functions ### ### Functions ###
def exon_length_calculator(entry): def exon_length_calculator(entry):
...@@ -52,10 +53,17 @@ def __longest_transcript_finder(current_exon_length,longest_transcript,longest_t ...@@ -52,10 +53,17 @@ def __longest_transcript_finder(current_exon_length,longest_transcript,longest_t
current_exon_length = 0 current_exon_length = 0
return(current_exon_length,longest_transcript,longest_transcript_ID) return(current_exon_length,longest_transcript,longest_transcript_ID)
def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}): def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}):
"""This funtion selects only the transcripts for a dictionar that have the longest total mRNA""" """This funtion selects only the transcripts for a dictionar that have the longest total mRNA"""
print("Representative trascripts are filterd based on exon length please wait...")
print("Representative trascipts are filterd based on exon length please wait...")
bar,start_time = te.bar_builder(length_multiplyer = 3)
source_pathway_name,deposit_pathway_name = te.__do_pathways_exist__(source_pathway_name,deposit_pathway_name) source_pathway_name,deposit_pathway_name = te.__do_pathways_exist__(source_pathway_name,deposit_pathway_name)
total_genes = len(gen_dict)
gens_done = 0
with open(source_pathway_name+"\\"+file_name+".gtf", 'r') as f: with open(source_pathway_name+"\\"+file_name+".gtf", 'r') as f:
old_gen = str() old_gen = str()
...@@ -65,6 +73,7 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo ...@@ -65,6 +73,7 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo
longest_transcript_ID = str() longest_transcript_ID = str()
current_exon_length = 0 current_exon_length = 0
longest_transcript = 0 longest_transcript = 0
percentage_done = 0
for entry in f: for entry in f:
...@@ -73,13 +82,14 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo ...@@ -73,13 +82,14 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo
except: except:
corrent_gen = old_gen corrent_gen = old_gen
#The block above tests whether there is a gene name in the entry #The block above tests whether there is a gene name in the entry
if corrent_gen != old_gen: if corrent_gen != old_gen:
representative_trasnscript_not_found = True representative_trasnscript_not_found = True
#The block above determines if the gene name is new and sets the flag #The block above determines if the gene name is new and sets the flag
#representative_trasnscript_not_found back to True, which is used to #representative_trasnscript_not_found back to True, which is used to
#make the program faster if there is just one transcript for a given #make the program faster if there is just one transcript for a given
#gene in the dict #gene in the dict
if representative_trasnscript_not_found and corrent_gen in gen_dict: if representative_trasnscript_not_found and corrent_gen != str():
#print(corrent_gen) #print(corrent_gen)
#The condition prevents searches if a representative transcript has #The condition prevents searches if a representative transcript has
#already been chosen #already been chosen
...@@ -88,7 +98,14 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo ...@@ -88,7 +98,14 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo
representative_transcript[old_gen] = longest_transcript_ID representative_transcript[old_gen] = longest_transcript_ID
try: try:
del gen_dict[old_gen] del gen_dict[old_gen]
old_gen = corrent_gen old_gen = corrent_gen
gens_done += 1
corrent_percentage_done = (gens_done/total_genes)*100
if corrent_percentage_done > percentage_done+10:
bar,start_time = te.bar_builder(percentage=percentage_done+10,length_multiplyer = 3,start_time=start_time,bar =bar)
percentage_done = int(corrent_percentage_done)
except: except:
old_gen = corrent_gen old_gen = corrent_gen
longest_transcript = 0 longest_transcript = 0
...@@ -107,7 +124,7 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo ...@@ -107,7 +124,7 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo
continue continue
try: try:
current_transcript_ID = te.transcript_ID_finder(entry) current_transcript_ID = te.transcript_ID_finder(entry)
except: except:
continue continue
#The block above searches for a transcript ID in the current entry #The block above searches for a transcript ID in the current entry
...@@ -132,10 +149,11 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo ...@@ -132,10 +149,11 @@ def exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),depo
current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID)
representative_transcript[old_gen] = longest_transcript_ID representative_transcript[old_gen] = longest_transcript_ID
del representative_transcript[str()] del representative_transcript[str()]
print("Representative transcripts collected\n") te.bar_builder(100,length_multiplyer = 3,start_time=start_time,bar =bar)
return(representative_transcript) return(representative_transcript)
if __name__ == "__main__": if __name__ == "__main__":
exon_length_filter() exon_length_filter()
#This line allows the file to be executed on its own also from
\ No newline at end of file #This line allows the file to be executed on its own also from
...@@ -12,7 +12,7 @@ def __searche_for_preexisting_files(file_name,deposit_pathway_name = os.getcwd() ...@@ -12,7 +12,7 @@ def __searche_for_preexisting_files(file_name,deposit_pathway_name = os.getcwd()
generat_new_file = False generat_new_file = False
directory_content = os.listdir(deposit_pathway_name) directory_content = os.listdir(deposit_pathway_name)
for file in directory_content: for file in directory_content:
Search_profile = file_name+"_intermediat_file.txt" Search_profile = file_name+"_intermediate_file.txt"
if file == Search_profile: if file == Search_profile:
while True: while True:
File_found_input = input ("An intermediate file has allready been generated from this file\nDo you want to generate a new one [y/n] \n>") File_found_input = input ("An intermediate file has allready been generated from this file\nDo you want to generate a new one [y/n] \n>")
...@@ -163,7 +163,7 @@ def _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name): ...@@ -163,7 +163,7 @@ def _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name):
transcript_number = 0 transcript_number = 0
for entry in f: for entry in f:
#this loop reads all lines in the source file one by one #this loop reads all lines in the source file one by one
Gen_finder = re.compile("gene_name") Gen_finder = re.compile("gene_id")
try_gen_finder = Gen_finder.search(entry) try_gen_finder = Gen_finder.search(entry)
#the lines above determine if there is a "gene_name" column #the lines above determine if there is a "gene_name" column
#in the current entry #in the current entry
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment