diff --git a/sequence_extractor/pre_bedtools.py b/sequence_extractor/pre_bedtools.py
index 13433b47c70e8a9215ebc1e4fc65952f223dc531..6aa5fe2e20fd2c9ef0b2531a1869eae8be83b471 100755
--- a/sequence_extractor/pre_bedtools.py
+++ b/sequence_extractor/pre_bedtools.py
@@ -1,19 +1,20 @@
import pandas as pd
-gtf = pd.read_table("Homo_sapiens.GRCh38.107.gtf.gz",skiprows=5,header=None)
+def exon_extraction_from_gtf(gtf_filename,output_filename):  # Extract the "exon" rows of the GTF table in gtf_filename and write them, tab-separated, to output_filename
+ gtf = pd.read_table(gtf_filename,skiprows=5,header=None)  # skips the first 5 lines and reads without a header row, so columns are numbered 0..N; presumably those 5 lines are the GTF header comments - TODO confirm for other GTF files
+ exons = gtf[gtf[2]=="exon"]  # keep only the rows whose column 2 equals "exon"
+ features = list(exons[8])  # column 8 holds the ';'-separated string that the IDs are parsed from below
+ transcript_id_list = []
+ gene_id_list = []
+ for x in range(len(features)):
+ newlist = features[x].split(";")
+ transcript_id_list.append(str(newlist[2])[16:-1])  # third ';' field with its first 16 characters and last character dropped; presumably strips ' transcript_id "' and the closing quote - verify the attribute order
+ gene_id_list.append(str(newlist[0])[9:-1])  # first ';' field with its first 9 characters and last character dropped; presumably strips 'gene_id "' and the closing quote
-exons = gtf[gtf[2]=="exon"]
-feat = list(exons[8])
-superlist = []
-idlist = []
-for x in range(len(feat)):
- newlist = feat[x].split(";")
- superlist.append(str(newlist[2])[16:-1])
- idlist.append(str(newlist[0])[9:-1])
+ bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":transcript_id_list,"score":exons[5],"strand":exons[6],"gene_id":gene_id_list}  # NOTE(review): "start" is copied unchanged from column 3; if the output must follow BED's 0-based start convention it may need a -1 offset - confirm
+ bed = pd.DataFrame(bed)
+ bed.to_csv(output_filename,sep="\t",index=False)  # tab-separated, column names written as the first line, row index omitted
-
-bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":superlist,"score":exons[5],"strand":exons[6],"gene_id":idlist}
-bed = pd.DataFrame(bed)
-bed.to_csv("bed_file.bed",sep="\t",index=False)
-bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None)
+ # Writes a test file containing the rows of a few manually selected gene_ids for now (the plan is to choose them randomly in the future)
+ bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None)  # always writes to the fixed path "test.bed", without a header line, regardless of output_filename