diff --git a/sequence_extractor/pre_bedtools.py b/sequence_extractor/pre_bedtools.py index fd7ff3d0698cb57984b01018c1855bf7249be43f..0f3a48d29f2ef171d2bcd544962cb27c25ea5970 100755 --- a/sequence_extractor/pre_bedtools.py +++ b/sequence_extractor/pre_bedtools.py @@ -1,22 +1,17 @@ import pandas as pd -def exon_extraction_from_gtf(gtf_filename,output_filename): - gtf = pd.read_table(gtf_filename,skiprows=5,header=None) - exons = gtf[gtf[2]=="exon"] - features = list(exons[8]) +gtf = pd.read_table("Homo_sapiens.GRCh38.107.gtf.gz",skiprows=5,header=None) - transcript_id_list = [] - gene_id_list = [] - for x in range(len(features)): - newlist = features[x].split(";") - transcript_id_list.append(str(newlist[2])[16:-1]) - gene_id_list.append(str(newlist[0])[9:-1]) - bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":transcript_id_list,"score":exons[5],"strand":exons[6],"gene_id":gene_id_list} - bed = pd.DataFrame(bed) - bed.to_csv(output_filename,sep="\t",index=False) +exons = gtf[gtf[2]=="exon"] +feat = list(exons[8]) +superlist = [] +idlist = [] +for x in range(len(feat)): + newlist = feat[x].split(";") + superlist.append(str(newlist[2])[16:-1]) + idlist.append(str(newlist[0])[9:-1]) -<<<<<<< HEAD bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":superlist,"score":exons[5],"strand":exons[6],"gene_id":idlist} class bed: @@ -52,7 +47,3 @@ class bed: bed = pd.DataFrame(bed) bed.to_csv("bed_file.bed",sep="\t",index=False) bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None) -======= - # This line is used to generate a test file from some of the manually selected gene_ids for now (Plans to make it choose randonly in future) - bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None) ->>>>>>> f35b4d0acdb49bce8ccda3f322fb4b5486737b75