diff --git a/gtf_processing/pre_bedtools.py b/gtf_processing/pre_bedtools.py index 20889478f604b5e409b2220c494a7e57176cb8de..cc69d5b3fae683a6b61ccf5e2ed21aef48c5b11d 100644 --- a/gtf_processing/pre_bedtools.py +++ b/gtf_processing/pre_bedtools.py @@ -12,22 +12,36 @@ import pandas as pd from gtfparse import read_gtf parser = argparse.ArgumentParser( - prog = 'pre_bedtools', - description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.') -parser.add_argument('--input_gtf_file', - help='ordered and processed gtf file') -parser.add_argument('--output_bed_file', - help='bed file with only exons with strandedness taken into account') + prog="pre_bedtools", + description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.", +) +parser.add_argument("--input_gtf_file", help="ordered and processed gtf file") +parser.add_argument( + "--output_bed_file", + help="bed file with only exons with strandedness taken into account", +) args = parser.parse_args() gtf = read_gtf(args.input_gtf_file) gtf_exons = gtf[gtf["feature"] == "exon"] -gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]] +gtf_exons = gtf_exons[ + ["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"] +] gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"] -gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id)) +gtf_df_neg = ( + gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False) + .groupby("transcript_id") + .head(len(gtf_df_neg.transcript_id)) +) gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"] -gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id)) - -pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes +gtf_df_pos = ( + gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True) + .groupby("transcript_id") + .head(len(gtf_df_pos.transcript_id)) +) + +pd.concat([gtf_df_pos, gtf_df_neg]).to_csv( + args.output_bed_file, sep="\t", index=False +) # gtf_df_pos and gtf_df_neg must be dataframes diff --git a/sequence_extractor/cli.py b/sequence_extractor/cli.py index 73062da3fb84f18f67391f490203612d65c4f0e2..b3bfeab5176ae3fd05fae2475b28e2f72488e5a8 100644 --- a/sequence_extractor/cli.py +++ b/sequence_extractor/cli.py @@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation from poly_a import poly_a_addition_to_fasta_list parser = argparse.ArgumentParser( - prog = 'transcript_sequence_extractor', - description = 'extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them') -parser.add_argument('--input_fasta_file', - help='fasta file obtained from bedtools') -parser.add_argument('--output_file_name', - help='Name of the output fasta file') + prog="transcript_sequence_extractor", + description="extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them", +) +parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools") +parser.add_argument("--output_file_name", help="Name of the output fasta file") args = parser.parse_args() + def main(): """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file. @@ -26,11 +26,12 @@ def main(): LOG.info("sequence_extractor begins") fasta_list = exon_concatenation(args.input_fasta_file) final_list = poly_a_addition_to_fasta_list(fasta_list) - with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out: - fasta_out.write('\n'.join('%s\n%s' % x for x in final_list)) + with open(args.output_file_name, "w", encoding="utf-8") as fasta_out: + fasta_out.write("\n".join("%s\n%s" % x for x in final_list)) LOG.info("sequence_extractor ends") -if ___name__ == 'main': + +if ___name__ == "main": logging.basicConfig( format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', level=logging.INFO, diff --git a/sequence_extractor/exon_concatenation.py b/sequence_extractor/exon_concatenation.py index d9f3266e90d058c0698c029aec2cf5811fceca2a..88377f27ee5ee0a01c63a503596b854066123ea2 100644 --- a/sequence_extractor/exon_concatenation.py +++ b/sequence_extractor/exon_concatenation.py @@ -1,4 +1,6 @@ """Script containing the function to concatenate exons and output the results in a list of tuples""" + + def exon_concatenation( post_bedtools_fasta: str, ) -> list: @@ -10,10 +12,10 @@ def exon_concatenation( Returns: A list containing transcript ID and concatenated exons in tuples. """ - with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta: + with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta: annotation = [] fasta_format_list = [] - for line1,line2 in zip(fasta,fasta): + for line1, line2 in zip(fasta, fasta): if len(annotation) == 0: annotation.append(line1[0:16]) read = line2[:-1] @@ -21,8 +23,8 @@ def exon_concatenation( if annotation[-1] == line1[0:16]: read += line2[:-1] elif annotation[-1] != line1[0:16]: - fasta_format_list.append((annotation[-1],read)) + fasta_format_list.append((annotation[-1], read)) annotation.append(line1[0:16]) read = line2[:-1] - fasta_format_list.append((annotation[-1],read)) + fasta_format_list.append((annotation[-1], read)) return fasta_format_list diff --git a/sequence_extractor/poly_a.py b/sequence_extractor/poly_a.py index 60b299788743a192f984a2d580ac139a42c35a7b..d1975628337d4b0e5bb6f5c90f55c6981ca34dde 100644 --- a/sequence_extractor/poly_a.py +++ b/sequence_extractor/poly_a.py @@ -1,31 +1,35 @@ """ This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon""" import numpy as np + # To do: Taking probabilities of nucleotides from user and raising error if sum != 1 def poly_a_generator( - exon: str, + exon: str, ) -> str: """Adds a PolyA tail to an exon sequence input into the function. - Args: - exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end. + Args: + exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end. + + Returns: + RNA with polyA tail added to its 3' end. + """ + list_of_nucleotides = ["A", "T", "G", "C"] + poly_a_string = "".join( + np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033]) + ) + return exon + poly_a_string - Returns: - RNA with polyA tail added to its 3' end. - """ - list_of_nucleotides = ['A','T','G','C'] - poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033])) - return exon+poly_a_string def poly_a_addition_to_fasta_list( - fasta_list: list, + fasta_list: list, ) -> list: """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends. - Args: - fasta_list: List contaning tuples of annotations and exons + Args: + fasta_list: List contaning tuples of annotations and exons - Returns: - A list like the initial list, this time with polyA tail added onto it. - """ - mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list] + Returns: + A list like the initial list, this time with polyA tail added onto it. + """ + mature_rna_list = [(i[0], poly_a_generator(i[1])) for i in fasta_list] return mature_rna_list