From 3d7543859bef5ebb6c9e7b90881979fe6b668b15 Mon Sep 17 00:00:00 2001
From: Samuel Mondal <mondal0000@bz-rgab01-pdm02.bioz.unibas.ch>
Date: Tue, 13 Dec 2022 17:12:04 +0100
Subject: [PATCH] minor changes based on pylint

---
 gtf_processing/pre_bedtools.py           |  9 ++++-----
 sequence_extractor/cli.py                | 11 ++++++++++-
 sequence_extractor/exon_concatenation.py | 19 ++++++++++---------
 sequence_extractor/poly_a.py             | 15 ++++++++-------
 4 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/gtf_processing/pre_bedtools.py b/gtf_processing/pre_bedtools.py
index 57be294..2088947 100644
--- a/gtf_processing/pre_bedtools.py
+++ b/gtf_processing/pre_bedtools.py
@@ -1,7 +1,3 @@
-import pandas as pd
-import argparse
-from gtfparse import read_gtf
-
 """This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript.
 
     Args:
@@ -11,6 +7,10 @@ from gtfparse import read_gtf
         BED file with the format: chr, start, end, transcript_id, score, strand, gene_id
 """
 
+import argparse
+import pandas as pd
+from gtfparse import read_gtf
+
 parser = argparse.ArgumentParser(
     prog = 'pre_bedtools',
     description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.')
@@ -31,4 +31,3 @@ gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
 gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id))
 
 pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes
-
diff --git a/sequence_extractor/cli.py b/sequence_extractor/cli.py
index 2e05f55..73062da 100644
--- a/sequence_extractor/cli.py
+++ b/sequence_extractor/cli.py
@@ -1,3 +1,4 @@
+"""Command line script to be run on the output fasta file from bedtools getfasta."""
 import argparse
 import logging
 from exon_concatenation import exon_concatenation
@@ -14,10 +15,18 @@ parser.add_argument('--output_file_name',
 args = parser.parse_args()
 
 def main():
+    """Concatenate the exons in the bedtools output, add a polyA tail to each sequence and write the result to a fasta file.
+
+    Args:
+        None: the input and output file names are taken from the module-level argparse arguments.
+
+    Returns:
+        None: a fasta file is written with a single entry for each transcript ID, with a polyA tail added to the 3' end of the sequence.
+    """
     LOG.info("sequence_extractor begins")
     fasta_list = exon_concatenation(args.input_fasta_file)
     final_list = poly_a_addition_to_fasta_list(fasta_list)
-    with open(args.output_file_name, 'w') as fasta_out:
+    with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out:
         fasta_out.write('\n'.join('%s\n%s' % x for x in final_list))
     LOG.info("sequence_extractor ends")
 
diff --git a/sequence_extractor/exon_concatenation.py b/sequence_extractor/exon_concatenation.py
index 4a4b1c0..d9f3266 100644
--- a/sequence_extractor/exon_concatenation.py
+++ b/sequence_extractor/exon_concatenation.py
@@ -1,18 +1,19 @@
+"""Module containing the function that concatenates exons and returns the results as a list of tuples."""
 def exon_concatenation(
-	post_bedtools_fasta: str
+    post_bedtools_fasta: str,
 ) -> list:
-	"""Concatenate all sequences starting with identical transcripit ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples.
+    """Concatenate all sequences starting with an identical transcript ID and output them as a list of tuples, each holding the sequence header (Transcript ID) and the concatenated sequence.
 
-	Args:
-		post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run
+    Args:
+        post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run
 
-	Returns:
-		A list containing transcript ID and concatenated exons in tuples.
-	"""
-    with open(post_bedtools_fasta,'r') as fa:
+    Returns:
+        A list containing transcript ID and concatenated exons in tuples.
+    """
+    with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta:
         annotation = []
         fasta_format_list = []
-        for line1,line2 in zip(fa,fa):
+        for line1,line2 in zip(fasta,fasta):
             if len(annotation) == 0:
                 annotation.append(line1[0:16])
                 read = line2[:-1]
diff --git a/sequence_extractor/poly_a.py b/sequence_extractor/poly_a.py
index 28a9679..60b2997 100644
--- a/sequence_extractor/poly_a.py
+++ b/sequence_extractor/poly_a.py
@@ -1,9 +1,10 @@
+"""Functions for adding a polyA tail to concatenated exons; the first function is called by the second."""
 import numpy as np
 # To do: Taking probabilities of nucleotides from user and raising error if sum != 1
 def poly_a_generator(
 	exon: str,
 ) -> str:
-	"""Adds a PolyA tail to an exon sequence input into the function.
+    """Add a polyA tail to the exon sequence passed to the function.
 
 	 Args:
 		exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end.
@@ -11,14 +12,14 @@ def poly_a_generator(
 	Returns:
 		RNA with polyA tail added to its 3' end.
 	"""
-	listA = ['A','T','G','C']
-	polyA = ''.join(np.random.choice(listA,250,p=[0.914,0.028,0.025,0.033]))
-	return (exon+polyA)
+    list_of_nucleotides = ['A','T','G','C']
+    poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033]))
+    return exon+poly_a_string
 
 def poly_a_addition_to_fasta_list(
 	fasta_list: list,
 ) -> list:
-	"""Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends.
+    """Take a list of tuples of annotations and exons and return a list in which a polyA tail has been added to the 3' end of every exon.
 
 	Args:
 		fasta_list: List contaning tuples of annotations and exons
@@ -26,5 +27,5 @@ def poly_a_addition_to_fasta_list(
 	Returns:
 		A list like the initial list, this time with polyA tail added onto it.
 	"""
-	mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list]
-	return mature_rna_list
+    mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list]
+    return mature_rna_list
-- 
GitLab