From 10c1f29f917db1d310fa681aa4ad46ed99dc1325 Mon Sep 17 00:00:00 2001
From: Gina <gina.unibas@gmail.com>
Date: Tue, 15 Nov 2022 19:59:44 +0100
Subject: [PATCH] adding class

---
 sequence_extractor/pre_bedtools.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/sequence_extractor/pre_bedtools.py b/sequence_extractor/pre_bedtools.py
index fd7ff3d..0f3a48d 100755
--- a/sequence_extractor/pre_bedtools.py
+++ b/sequence_extractor/pre_bedtools.py
@@ -1,22 +1,17 @@
 import pandas as pd
 
-def exon_extraction_from_gtf(gtf_filename,output_filename):
-    gtf = pd.read_table(gtf_filename,skiprows=5,header=None)
-    exons = gtf[gtf[2]=="exon"]
-    features = list(exons[8])
+gtf = pd.read_table("Homo_sapiens.GRCh38.107.gtf.gz",skiprows=5,header=None)
 
-    transcript_id_list = []
-    gene_id_list = []
-    for x in range(len(features)):
-        newlist = features[x].split(";")
-        transcript_id_list.append(str(newlist[2])[16:-1])
-        gene_id_list.append(str(newlist[0])[9:-1])
 
-    bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":transcript_id_list,"score":exons[5],"strand":exons[6],"gene_id":gene_id_list}
-    bed = pd.DataFrame(bed)
-    bed.to_csv(output_filename,sep="\t",index=False)
+exons = gtf[gtf[2]=="exon"]
+feat = list(exons[8])
+superlist = []
+idlist = []
+for x in range(len(feat)):
+    newlist = feat[x].split(";")
+    superlist.append(str(newlist[2])[16:-1])
+    idlist.append(str(newlist[0])[9:-1])
 
-<<<<<<< HEAD
 
 bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":superlist,"score":exons[5],"strand":exons[6],"gene_id":idlist}
 class bed:
@@ -52,7 +47,3 @@ class bed:
 bed = pd.DataFrame(bed)
 bed.to_csv("bed_file.bed",sep="\t",index=False)
 bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None)
-=======
-    # This line is used to generate a test file from some of the manually selected gene_ids for now (Plans to make it choose randonly in future)
-    bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None)
->>>>>>> f35b4d0acdb49bce8ccda3f322fb4b5486737b75
-- 
GitLab