changes made by black

6ebd862e · Samuel Mondal · 3d754385 · 6ebd862e · 6ebd862e · 6ebd862e
Commit 6ebd862e authored 2 years ago by Samuel Mondal
--- a/gtf_processing/pre_bedtools.py
+++ b/gtf_processing/pre_bedtools.py
@@ -12,22 +12,36 @@ import pandas as pd
 from gtfparse import read_gtf
 parser = argparse.ArgumentParser(
-    prog = 'pre_bedtools',
+    prog="pre_bedtools",
-    description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.')
+    description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
-parser.add_argument('--input_gtf_file',
+)
-                    help='ordered and processed gtf file')
+parser.add_argument("--input_gtf_file", help="ordered and processed gtf file")
-parser.add_argument('--output_bed_file',
+parser.add_argument(
-                    help='bed file with only exons with strandedness taken into account')
+    "--output_bed_file",
+    help="bed file with only exons with strandedness taken into account",
+)
 args = parser.parse_args()
 gtf = read_gtf(args.input_gtf_file)
 gtf_exons = gtf[gtf["feature"] == "exon"]
-gtf_exons = gtf_exons[["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]]
+gtf_exons = gtf_exons[
+    ["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]
+]
 gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
-gtf_df_neg = gtf_df_neg.sort_values(['transcript_id','start'],ascending=False).groupby('transcript_id').head(len(gtf_df_neg. transcript_id))
+gtf_df_neg = (
+    gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
+    .groupby("transcript_id")
+    .head(len(gtf_df_neg.transcript_id))
+)
 gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
-gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id))
+gtf_df_pos = (
+    gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
-pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes
+    .groupby("transcript_id")
+    .head(len(gtf_df_pos.transcript_id))
+)
+pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
+    args.output_bed_file, sep="\t", index=False
+)  # gtf_df_pos and gtf_df_neg must be dataframes
--- a/sequence_extractor/cli.py
+++ b/sequence_extractor/cli.py
@@ -5,15 +5,15 @@ from exon_concatenation import exon_concatenation
 from poly_a import poly_a_addition_to_fasta_list
 parser = argparse.ArgumentParser(
-    prog = 'transcript_sequence_extractor',
+    prog="transcript_sequence_extractor",
-    description = 'extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them')
+    description="extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them",
-parser.add_argument('--input_fasta_file',
+)
-                    help='fasta file obtained from bedtools')
+parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools")
-parser.add_argument('--output_file_name',
+parser.add_argument("--output_file_name", help="Name of the output fasta file")
-                    help='Name of the output fasta file')
 args = parser.parse_args()
 def main():
    """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file.
@@ -26,11 +26,12 @@ def main():
    LOG.info("sequence_extractor begins")
    fasta_list = exon_concatenation(args.input_fasta_file)
    final_list = poly_a_addition_to_fasta_list(fasta_list)
-    with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out:
+    with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
-        fasta_out.write('\n'.join('%s\n%s' % x for x in final_list))
+        fasta_out.write("\n".join("%s\n%s" % x for x in final_list))
    LOG.info("sequence_extractor ends")
-if ___name__ == 'main':
+if ___name__ == "main":
    logging.basicConfig(
        format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
        level=logging.INFO,

--- a/sequence_extractor/exon_concatenation.py
+++ b/sequence_extractor/exon_concatenation.py
 """Script containing the function to concatenate exons and output the results in a list of tuples"""
 def exon_concatenation(
    post_bedtools_fasta: str,
 ) -> list:
@@ -10,10 +12,10 @@ def exon_concatenation(
    Returns:
        A list containing transcript ID and concatenated exons in tuples.
    """
-    with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta:
+    with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta:
        annotation = []
        fasta_format_list = []
-        for line1,line2 in zip(fasta,fasta):
+        for line1, line2 in zip(fasta, fasta):
            if len(annotation) == 0:
                annotation.append(line1[0:16])
                read = line2[:-1]
@@ -21,8 +23,8 @@ def exon_concatenation(
                if annotation[-1] == line1[0:16]:
                    read += line2[:-1]
                elif annotation[-1] != line1[0:16]:
-                    fasta_format_list.append((annotation[-1],read))
+                    fasta_format_list.append((annotation[-1], read))
                    annotation.append(line1[0:16])
                    read = line2[:-1]
-        fasta_format_list.append((annotation[-1],read))
+        fasta_format_list.append((annotation[-1], read))
    return fasta_format_list
--- a/sequence_extractor/poly_a.py
+++ b/sequence_extractor/poly_a.py
 """ This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon"""
 import numpy as np
 # To do: Taking probabilities of nucleotides from user and raising error if sum != 1
 def poly_a_generator(
-	exon: str,
+    exon: str,
 ) -> str:
    """Adds a PolyA tail to an exon sequence input into the function.
-	 Args:
+     Args:
-		exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end.
+            exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end.
+    Returns:
+            RNA with polyA tail added to its 3' end.
+    """
+    list_of_nucleotides = ["A", "T", "G", "C"]
+    poly_a_string = "".join(
+        np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033])
+    )
+    return exon + poly_a_string
-	Returns:
-		RNA with polyA tail added to its 3' end.
-	"""
-    list_of_nucleotides = ['A','T','G','C']
-    poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033]))
-    return exon+poly_a_string
 def poly_a_addition_to_fasta_list(
-	fasta_list: list,
+    fasta_list: list,
 ) -> list:
    """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends.
-	Args:
+    Args:
-		fasta_list: List contaning tuples of annotations and exons
+            fasta_list: List contaning tuples of annotations and exons
-	Returns:
+    Returns:
-		A list like the initial list, this time with polyA tail added onto it.
+            A list like the initial list, this time with polyA tail added onto it.
-	"""
+    """
-    mature_rna_list = [(i[0],poly_a_generator(i[1])) for i in fasta_list]
+    mature_rna_list = [(i[0], poly_a_generator(i[1])) for i in fasta_list]
    return mature_rna_list