Skip to content
Snippets Groups Projects
Commit 6ebd862e authored by Samuel Mondal's avatar Samuel Mondal
Browse files

changes made by black

parent 3d754385
No related branches found
No related tags found
1 merge request: !62 changes made by black
...@@ -12,22 +12,36 @@ import pandas as pd ...@@ -12,22 +12,36 @@ import pandas as pd
from gtfparse import read_gtf from gtfparse import read_gtf
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog = 'pre_bedtools', prog="pre_bedtools",
description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.') description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
parser.add_argument('--input_gtf_file', )
help='ordered and processed gtf file') parser.add_argument("--input_gtf_file", help="ordered and processed gtf file")
parser.add_argument('--output_bed_file', parser.add_argument(
help='bed file with only exons with strandedness taken into account') "--output_bed_file",
help="bed file with only exons with strandedness taken into account",
)
args = parser.parse_args() args = parser.parse_args()
gtf = read_gtf(args.input_gtf_file) gtf = read_gtf(args.input_gtf_file)
gtf_exons = gtf[gtf["feature"] == "exon"] gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]] gtf_exons = gtf_exons[
["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]
]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"] gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id)) gtf_df_neg = (
gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
.groupby("transcript_id")
.head(len(gtf_df_neg.transcript_id))
)
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"] gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id)) gtf_df_pos = (
gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes .groupby("transcript_id")
.head(len(gtf_df_pos.transcript_id))
)
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
args.output_bed_file, sep="\t", index=False
) # gtf_df_pos and gtf_df_neg must be dataframes
...@@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation ...@@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation
from poly_a import poly_a_addition_to_fasta_list from poly_a import poly_a_addition_to_fasta_list
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog = 'transcript_sequence_extractor', prog="transcript_sequence_extractor",
description = 'extracts transcript sequences from genome sequence and outputs transcripts with PolyA tail added to them') description="extracts transcript sequences from genome sequence and outputs transcripts with PolyA tail added to them",
parser.add_argument('--input_fasta_file', )
help='fasta file obtained from bedtools') parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools")
parser.add_argument('--output_file_name', parser.add_argument("--output_file_name", help="Name of the output fasta file")
help='Name of the output fasta file')
args = parser.parse_args() args = parser.parse_args()
def main(): def main():
"""Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file. """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file.
...@@ -26,11 +26,12 @@ def main(): ...@@ -26,11 +26,12 @@ def main():
LOG.info("sequence_extractor begins") LOG.info("sequence_extractor begins")
fasta_list = exon_concatenation(args.input_fasta_file) fasta_list = exon_concatenation(args.input_fasta_file)
final_list = poly_a_addition_to_fasta_list(fasta_list) final_list = poly_a_addition_to_fasta_list(fasta_list)
with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out: with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
fasta_out.write('\n'.join('%s\n%s' % x for x in final_list)) fasta_out.write("\n".join("%s\n%s" % x for x in final_list))
LOG.info("sequence_extractor ends") LOG.info("sequence_extractor ends")
if ___name__ == 'main':
if ___name__ == "main":
logging.basicConfig( logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
level=logging.INFO, level=logging.INFO,
......
"""Script containing the function to concatenate exons and output the results in a list of tuples""" """Script containing the function to concatenate exons and output the results in a list of tuples"""
def exon_concatenation( def exon_concatenation(
post_bedtools_fasta: str, post_bedtools_fasta: str,
) -> list: ) -> list:
...@@ -10,10 +12,10 @@ def exon_concatenation( ...@@ -10,10 +12,10 @@ def exon_concatenation(
Returns: Returns:
A list containing transcript ID and concatenated exons in tuples. A list containing transcript ID and concatenated exons in tuples.
""" """
with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta: with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta:
annotation = [] annotation = []
fasta_format_list = [] fasta_format_list = []
for line1,line2 in zip(fasta,fasta): for line1, line2 in zip(fasta, fasta):
if len(annotation) == 0: if len(annotation) == 0:
annotation.append(line1[0:16]) annotation.append(line1[0:16])
read = line2[:-1] read = line2[:-1]
...@@ -21,8 +23,8 @@ def exon_concatenation( ...@@ -21,8 +23,8 @@ def exon_concatenation(
if annotation[-1] == line1[0:16]: if annotation[-1] == line1[0:16]:
read += line2[:-1] read += line2[:-1]
elif annotation[-1] != line1[0:16]: elif annotation[-1] != line1[0:16]:
fasta_format_list.append((annotation[-1],read)) fasta_format_list.append((annotation[-1], read))
annotation.append(line1[0:16]) annotation.append(line1[0:16])
read = line2[:-1] read = line2[:-1]
fasta_format_list.append((annotation[-1],read)) fasta_format_list.append((annotation[-1], read))
return fasta_format_list return fasta_format_list
""" This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon""" """ This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon"""
import numpy as np import numpy as np
# To do: Taking probabilities of nucleotides from user and raising error if sum != 1 # To do: Taking probabilities of nucleotides from user and raising error if sum != 1
def poly_a_generator( def poly_a_generator(
exon: str, exon: str,
) -> str: ) -> str:
"""Adds a PolyA tail to an exon sequence input into the function. """Adds a PolyA tail to an exon sequence input into the function.
Args: Args:
exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end. exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end.
Returns:
RNA with polyA tail added to its 3' end.
"""
list_of_nucleotides = ["A", "T", "G", "C"]
poly_a_string = "".join(
np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033])
)
return exon + poly_a_string
Returns:
RNA with polyA tail added to its 3' end.
"""
list_of_nucleotides = ['A','T','G','C']
poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033]))
return exon+poly_a_string
def poly_a_addition_to_fasta_list( def poly_a_addition_to_fasta_list(
fasta_list: list, fasta_list: list,
) -> list: ) -> list:
"""Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends. """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends.
Args: Args:
fasta_list: List containing tuples of annotations and exons fasta_list: List containing tuples of annotations and exons
Returns: Returns:
A list like the initial list, this time with polyA tail added onto it. A list like the initial list, this time with polyA tail added onto it.
""" """
mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list] mature_rna_list = [(i[0], poly_a_generator(i[1])) for i in fasta_list]
return mature_rna_list return mature_rna_list
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment