Skip to content
Snippets Groups Projects

changes made by black

Merged Samuel Mondal requested to merge new into master
4 files
+ 61
40
Compare changes
  • Side-by-side
  • Inline
Files
4
@@ -12,22 +12,36 @@ import pandas as pd
from gtfparse import read_gtf
# Columns kept for every exon row, in the order they are written out.
EXON_COLUMNS = [
    "seqname",
    "start",
    "end",
    "transcript_id",
    "score",
    "strand",
    "gene_id",
]


def build_parser():
    """Return the command-line parser for the ``pre_bedtools`` script.

    Both options are required: without an input path there is nothing to
    read, and ``DataFrame.to_csv(None)`` would return the table as a string
    instead of writing a file, silently discarding the result.
    """
    parser = argparse.ArgumentParser(
        prog="pre_bedtools",
        description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
    )
    parser.add_argument(
        "--input_gtf_file",
        required=True,
        help="ordered and processed gtf file",
    )
    parser.add_argument(
        "--output_bed_file",
        required=True,
        help="bed file with only exons with strandedness taken into account",
    )
    return parser


def order_exons_by_strand(gtf):
    """Return the exon rows of *gtf* ordered per transcript and strand.

    Args:
        gtf: pandas DataFrame holding at least a ``feature`` column and the
            columns named in ``EXON_COLUMNS``.

    Returns:
        A DataFrame restricted to ``EXON_COLUMNS``. Rows on the ``+`` strand
        come first, sorted by ``transcript_id`` then ``start`` ascending;
        rows on the ``-`` strand follow, sorted by the same keys descending,
        so the exons of a negative-strand transcript appear in flipped order.
        Rows whose strand is neither ``+`` nor ``-`` are dropped.
    """
    exons = gtf.loc[gtf["feature"] == "exon", EXON_COLUMNS]
    sort_keys = ["transcript_id", "start"]

    # The earlier ``.groupby("transcript_id").head(len(frame))`` step kept
    # every row in its already-sorted order, so sorting alone is sufficient.
    plus_strand = exons[exons["strand"] == "+"].sort_values(
        sort_keys, ascending=True
    )
    minus_strand = exons[exons["strand"] == "-"].sort_values(
        sort_keys, ascending=False
    )

    # pd.concat needs DataFrames; both operands are DataFrame slices.
    return pd.concat([plus_strand, minus_strand])


def main(argv=None):
    """Read the GTF file, order its exons and write them tab-separated.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv``.
    """
    args = build_parser().parse_args(argv)
    gtf = read_gtf(args.input_gtf_file)
    # NOTE(review): the output carries a header row and the start values
    # exactly as read from the GTF; confirm the downstream consumer expects
    # that rather than a header-less BED file.
    order_exons_by_strand(gtf).to_csv(
        args.output_bed_file, sep="\t", index=False
    )


if __name__ == "__main__":
    main()
Loading