Skip to content
Snippets Groups Projects
Commit 6ebd862e authored by Samuel Mondal
Browse files

changes made by black

parent 3d754385
No related branches found
No related tags found
1 merge request: !62 "changes made by black"
...@@ -12,22 +12,36 @@ import pandas as pd ...@@ -12,22 +12,36 @@ import pandas as pd
from gtfparse import read_gtf from gtfparse import read_gtf
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog = 'pre_bedtools', prog="pre_bedtools",
description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.') description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
parser.add_argument('--input_gtf_file', )
help='ordered and processed gtf file') parser.add_argument("--input_gtf_file", help="ordered and processed gtf file")
parser.add_argument('--output_bed_file', parser.add_argument(
help='bed file with only exons with strandedness taken into account') "--output_bed_file",
help="bed file with only exons with strandedness taken into account",
)
args = parser.parse_args() args = parser.parse_args()
gtf = read_gtf(args.input_gtf_file) gtf = read_gtf(args.input_gtf_file)
gtf_exons = gtf[gtf["feature"] == "exon"] gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]] gtf_exons = gtf_exons[
["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]
]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"] gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id)) gtf_df_neg = (
gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
.groupby("transcript_id")
.head(len(gtf_df_neg.transcript_id))
)
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"] gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id)) gtf_df_pos = (
gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes .groupby("transcript_id")
.head(len(gtf_df_pos.transcript_id))
)
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
args.output_bed_file, sep="\t", index=False
) # gtf_df_pos and gtf_df_neg must be dataframes
...@@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation ...@@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation
from poly_a import poly_a_addition_to_fasta_list from poly_a import poly_a_addition_to_fasta_list
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog = 'transcript_sequence_extractor', prog="transcript_sequence_extractor",
description = 'extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them') description="extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them",
parser.add_argument('--input_fasta_file', )
help='fasta file obtained from bedtools') parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools")
parser.add_argument('--output_file_name', parser.add_argument("--output_file_name", help="Name of the output fasta file")
help='Name of the output fasta file')
args = parser.parse_args() args = parser.parse_args()
def main(): def main():
"""Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file. """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file.
...@@ -26,11 +26,12 @@ def main(): ...@@ -26,11 +26,12 @@ def main():
LOG.info("sequence_extractor begins") LOG.info("sequence_extractor begins")
fasta_list = exon_concatenation(args.input_fasta_file) fasta_list = exon_concatenation(args.input_fasta_file)
final_list = poly_a_addition_to_fasta_list(fasta_list) final_list = poly_a_addition_to_fasta_list(fasta_list)
with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out: with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
fasta_out.write('\n'.join('%s\n%s' % x for x in final_list)) fasta_out.write("\n".join("%s\n%s" % x for x in final_list))
LOG.info("sequence_extractor ends") LOG.info("sequence_extractor ends")
if ___name__ == 'main':
if ___name__ == "main":
logging.basicConfig( logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
level=logging.INFO, level=logging.INFO,
......
"""Script containing the function to concatenate exons and output the results in a list of tuples""" """Script containing the function to concatenate exons and output the results in a list of tuples"""
def exon_concatenation( def exon_concatenation(
post_bedtools_fasta: str, post_bedtools_fasta: str,
) -> list: ) -> list:
...@@ -10,7 +12,7 @@ def exon_concatenation( ...@@ -10,7 +12,7 @@ def exon_concatenation(
Returns: Returns:
A list containing transcript ID and concatenated exons in tuples. A list containing transcript ID and concatenated exons in tuples.
""" """
with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta: with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta:
annotation = [] annotation = []
fasta_format_list = [] fasta_format_list = []
for line1, line2 in zip(fasta, fasta): for line1, line2 in zip(fasta, fasta):
......
""" This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon""" """ This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon"""
import numpy as np import numpy as np
# To do: Taking probabilities of nucleotides from user and raising error if sum != 1 # To do: Taking probabilities of nucleotides from user and raising error if sum != 1
def poly_a_generator( def poly_a_generator(
exon: str, exon: str,
...@@ -12,10 +13,13 @@ def poly_a_generator( ...@@ -12,10 +13,13 @@ def poly_a_generator(
Returns: Returns:
RNA with polyA tail added to its 3' end. RNA with polyA tail added to its 3' end.
""" """
list_of_nucleotides = ['A','T','G','C'] list_of_nucleotides = ["A", "T", "G", "C"]
poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033])) poly_a_string = "".join(
np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033])
)
return exon + poly_a_string return exon + poly_a_string
def poly_a_addition_to_fasta_list( def poly_a_addition_to_fasta_list(
fasta_list: list, fasta_list: list,
) -> list: ) -> list:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment