Skip to content
Snippets Groups Projects
Commit 5d34c21b authored by Samuel Mondal
Browse files

creating a function from the code

parent 45881109
No related branches found
No related tags found
1 merge request: !16 "creating a function from the code"
import pandas as pd
# Load the GTF annotation as a headerless table, skipping the first 5 lines of the file.
# NOTE(review): this module-level read appears to predate exon_extraction_from_gtf below,
# which performs the same read on its `gtf_filename` argument — confirm it is still needed.
gtf = pd.read_table("Homo_sapiens.GRCh38.107.gtf.gz",skiprows=5,header=None)
def exon_extraction_from_gtf(
    gtf_filename,
    output_filename,
    skiprows=5,
    test_filename="test.bed",
    test_gene_ids=("ENSG00000160072", "ENSG00000142611", "ENSG00000232596"),
):
    """Extract the exon records of a GTF file into tab-separated tables.

    The GTF is read as a headerless table; rows whose third column equals
    ``"exon"`` are kept, and ``transcript_id`` / ``gene_id`` are pulled out
    of the attribute column (column 9) by key.

    Args:
        gtf_filename: Path of the GTF file to read.
        output_filename: Path of the table that receives every exon row,
            written with a header line.
        skiprows: Number of leading lines to skip before parsing.  The
            default of 5 is the value the original code hard-coded
            (presumably the Ensembl ``#!`` header block — confirm for
            other GTF sources).
        test_filename: Path of the header-less subset table, or ``None`` to
            skip writing it.
        test_gene_ids: Gene ids whose exons go into the subset table.

    Returns:
        None.  The results are written to ``output_filename`` and, unless
        disabled, ``test_filename``.
    """
    gtf = pd.read_table(gtf_filename, skiprows=skiprows, header=None)
    exons = gtf[gtf[2] == "exon"]

    # Look the ids up by attribute key rather than by position / fixed
    # character offsets, so a different attribute order or spacing cannot
    # silently produce truncated ids.  Rows lacking a key get NaN.
    attributes = exons[8].astype(str)
    transcript_ids = attributes.str.extract(
        r'\btranscript_id "([^"]+)"', expand=False
    )
    gene_ids = attributes.str.extract(r'\bgene_id "([^"]+)"', expand=False)

    # NOTE(review): start/end are copied from the GTF unchanged (GTF is
    # 1-based, BED is conventionally 0-based half-open) and the main output
    # carries a header line — confirm downstream consumers expect this.
    bed = pd.DataFrame(
        {
            "chr": exons[0],
            "start": exons[3],
            "end": exons[4],
            "transcript_id": transcript_ids,
            "score": exons[5],
            "strand": exons[6],
            "gene_id": gene_ids,
        }
    )
    bed.to_csv(output_filename, sep="\t", index=False)

    # This generates a test file from some manually selected gene_ids for now
    # (plans to make it choose randomly in the future).
    if test_filename is not None:
        subset = bed[bed["gene_id"].isin(list(test_gene_ids))]
        subset.to_csv(test_filename, sep="\t", index=False, header=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment