Skip to content
Snippets Groups Projects
Commit f0569652 authored by Samuel Mondal's avatar Samuel Mondal
Browse files

edits to pre_bedtools

parent 98bc811b
No related branches found
No related tags found
1 merge request!51edits to pre_bedtools
import pandas as pd
from gtfparse import read_gtf
"""This script defines a BED from exon annotation in a GTF, to get sequences with transcript ID as header after usage in bedtools.
For each transcript, take exons only and sort exons by start position (reverse order for -ve strand)
Input: GTF file
Columns needed for BED: chr, start, end, transcript_id, score, strand, gene_id
...
:returns: BED file format
:rtype: dataframe
"""
gtf = read_gtf('../scrna-seq-simulation-main/inputs/ref_annotation.gtf')
gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id))
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id))
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv("bed_file.bed",sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes
import pandas as pd import pandas as pd
import argparse
from gtfparse import read_gtf
"""This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript.
Args:
GTF file
Returns:
BED file with the format: chr, start, end, transcript_id, score, strand, gene_id
"""
parser = argparse.ArgumentParser(
prog = 'pre_bedtools',
description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.')
parser.add_argument('--input_gtf_file',
help='ordered and processed gtf file')
parser.add_argument('--output_bed_file',
help='bed file with only exons with strandedness taken into account')
args = parser.parse_args()
gtf = read_gtf(args.input_gtf_file)
gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id))
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id))
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes
gtf = pd.read_table("Homo_sapiens.GRCh38.107.gtf.gz",skiprows=5,header=None)
exons = gtf[gtf[2]=="exon"]
feat = list(exons[8])
superlist = []
idlist = []
for x in range(len(feat)):
newlist = feat[x].split(";")
superlist.append(str(newlist[2])[16:-1])
idlist.append(str(newlist[0])[9:-1])
bed = {"chr":exons[0],"start":exons[3],"end":exons[4],"transcript_id":superlist,"score":exons[5],"strand":exons[6],"gene_id":idlist}
class bed:
def__init__(self, exons, chr, start, end, transcript_id, score, strand, gene_id):
self.exons = exons
self.chr = exons[0]
self.start = exons[3]
self.end = exons[4]
self.transcript_id = superlist
self.score = exons[5]
self.strand = exons[6]
self.gene_id = idList
"""Creates BED from GTF for bedtools.
This class defines a BED from exon annotation from a GTF, for use in bedtools to get sequences with transcript ID as header.
Parameters
----------
arg1 : GTF file.
Returns
-------
Class
A class which defines columns in standard BED format.
Raises
------
TypeError
ValueError: Not all columns found in GTF.
"""
bed = pd.DataFrame(bed)
bed.to_csv("bed_file.bed",sep="\t",index=False)
bed[(bed["gene_id"]=="ENSG00000160072")|(bed["gene_id"]== "ENSG00000142611")|(bed["gene_id"]=="ENSG00000232596")].to_csv("test.bed",sep="\t",index=False,header=None)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment