From 3d7543859bef5ebb6c9e7b90881979fe6b668b15 Mon Sep 17 00:00:00 2001 From: Samuel Mondal <mondal0000@bz-rgab01-pdm02.bioz.unibas.ch> Date: Tue, 13 Dec 2022 17:12:04 +0100 Subject: [PATCH] minor changes based on pylint --- gtf_processing/pre_bedtools.py | 9 ++++----- sequence_extractor/cli.py | 11 ++++++++++- sequence_extractor/exon_concatenation.py | 19 ++++++++++--------- sequence_extractor/poly_a.py | 15 ++++++++------- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/gtf_processing/pre_bedtools.py b/gtf_processing/pre_bedtools.py index 57be294..2088947 100644 --- a/gtf_processing/pre_bedtools.py +++ b/gtf_processing/pre_bedtools.py @@ -1,7 +1,3 @@ -import pandas as pd -import argparse -from gtfparse import read_gtf - """This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript. Args: @@ -11,6 +7,10 @@ from gtfparse import read_gtf BED file with the format: chr, start, end, transcript_id, score, strand, gene_id """ +import argparse +import pandas as pd +from gtfparse import read_gtf + parser = argparse.ArgumentParser( prog = 'pre_bedtools', description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.') @@ -31,4 +31,3 @@ gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"] gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. 
transcript_id)) pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes - diff --git a/sequence_extractor/cli.py b/sequence_extractor/cli.py index 2e05f55..73062da 100644 --- a/sequence_extractor/cli.py +++ b/sequence_extractor/cli.py @@ -1,3 +1,4 @@ +""" command line script to be run on output fasta file from bedtools getfasta """ import argparse import logging from exon_concatenation import exon_concatenation @@ -14,10 +15,18 @@ parser.add_argument('--output_file_name', args = parser.parse_args() def main(): + """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file. + + Args: + None: this will run on its own by taking the information from argparse + + Returns: + A fasta file with a single entry for each transcript ID with polyA tail being added onto the sequence at 3'end + """ LOG.info("sequence_extractor begins") fasta_list = exon_concatenation(args.input_fasta_file) final_list = poly_a_addition_to_fasta_list(fasta_list) - with open(args.output_file_name, 'w') as fasta_out: + with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out: fasta_out.write('\n'.join('%s\n%s' % x for x in final_list)) LOG.info("sequence_extractor ends") diff --git a/sequence_extractor/exon_concatenation.py b/sequence_extractor/exon_concatenation.py index 4a4b1c0..d9f3266 100644 --- a/sequence_extractor/exon_concatenation.py +++ b/sequence_extractor/exon_concatenation.py @@ -1,18 +1,19 @@ +"""Script containing the function to concatenate exons and output the results in a list of tuples""" def exon_concatenation( - post_bedtools_fasta: str + post_bedtools_fasta: str, ) -> list: - """Concatenate all sequences starting with identical transcripit ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples. 
+ """Concatenate all sequences starting with identical transcript ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples. - Args: - post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run + Args: + post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run - Returns: - A list containing transcript ID and concatenated exons in tuples. - """ - with open(post_bedtools_fasta,'r') as fa: + Returns: + A list containing transcript ID and concatenated exons in tuples. + """ + with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta: annotation = [] fasta_format_list = [] - for line1,line2 in zip(fa,fa): + for line1,line2 in zip(fasta,fasta): if len(annotation) == 0: annotation.append(line1[0:16]) read = line2[:-1] diff --git a/sequence_extractor/poly_a.py b/sequence_extractor/poly_a.py index 28a9679..60b2997 100644 --- a/sequence_extractor/poly_a.py +++ b/sequence_extractor/poly_a.py @@ -1,9 +1,10 @@ +""" This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon""" import numpy as np # To do: Taking probabilities of nucleotides from user and raising error if sum != 1 def poly_a_generator( exon: str, ) -> str: - """Adds a PolyA tail to an exon sequence input into the function. + """Adds a PolyA tail to an exon sequence input into the function. Args: exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end. @@ -11,14 +12,14 @@ def poly_a_generator( Returns: RNA with polyA tail added to its 3' end. 
""" - listA = ['A','T','G','C'] - polyA = ''.join(np.random.choice(listA,250,p=[0.914,0.028,0.025,0.033])) - return (exon+polyA) + list_of_nucleotides = ['A','T','G','C'] + poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033])) + return exon+poly_a_string def poly_a_addition_to_fasta_list( fasta_list: list, ) -> list: - """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends. + """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends. Args: fasta_list: List contaning tuples of annotations and exons @@ -26,5 +27,5 @@ def poly_a_addition_to_fasta_list( Returns: A list like the initial list, this time with polyA tail added onto it. """ - mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list] - return mature_rna_list + mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list] + return mature_rna_list -- GitLab