feat: clean code, combine scripts, add tests

1578448c · Mate Balajti · 4e1dac07 · 1578448c · 4e1dac07 · 4e1dac07
Commit 1578448c authored 1 year ago by Mate Balajti
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,12 +15,21 @@ build-job:       # First stage deployment and installation of dependencies.
    - pip install -e .
    - echo "Dependencies successfully deployed."
+unit-test-job:   # This job runs in the test stage.
+  stage: test    # It only starts when the job in the build stage completes successfully.
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements_dev.txt
+    - pip install -e .
+    - coverage run --source sequence_extractor -m pytest
+    - coverage report -m
 lint-test-job:   # Test Stage
  stage: test    # Deploys and runs all 3 linters.
  script:
    - pip install -r requirements.txt
    - pip install -r requirements_dev.txt
    - pip install -e .
-    - flake8 --docstring-convention google sequence_extractor/ gtf_processing/ --ignore=D104,F821
+    - flake8 --docstring-convention google sequence_extractor/
-    - pylint sequence_extractor/ gtf_processing/
+    - pylint sequence_extractor/
-    - mypy sequence_extractor/ gtf_processing/
+    - mypy sequence_extractor/
--- a/gtf_processing/pre_bedtools.py
+++ b/gtf_processing/pre_bedtools.py
-"""This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript.
-Args:
-    GTF file
-Returns:
-    BED file with the format: chr, start, end, transcript_id, score, strand, gene_id
-"""
-import argparse
-import pandas as pd
-from gtfparse import read_gtf
-parser = argparse.ArgumentParser(
-    prog="pre_bedtools",
-    description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
-)
-parser.add_argument("--input_gtf_file", help="ordered and processed gtf file")
-parser.add_argument(
-    "--output_bed_file",
-    help="bed file with only exons with strandedness taken into account",
-)
-args = parser.parse_args()
-gtf = read_gtf(args.input_gtf_file)
-gtf_exons = gtf[gtf["feature"] == "exon"]
-gtf_exons = gtf_exons[
-    ["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]
-]
-gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
-gtf_df_neg = (
-    gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
-    .groupby("transcript_id")
-    .head(len(gtf_df_neg.transcript_id))
-)
-gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
-gtf_df_pos = (
-    gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
-    .groupby("transcript_id")
-    .head(len(gtf_df_pos.transcript_id))
-)
-pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
-    args.output_bed_file, sep="\t", index=False
-)  # gtf_df_pos and gtf_df_neg must be dataframes
--- a/images/Ahmed_mahmoud_git_tutorial.PNG
+++ b/images/Ahmed_mahmoud_git_tutorial.PNG
--- a/images/Gina_Homework2Image.png
+++ b/images/Gina_Homework2Image.png
--- a/images/Markdown_Homework_GinaBoot.png
+++ b/images/Markdown_Homework_GinaBoot.png
--- a/images/Samuel_Mondal_Markdown_tutorial_completion_page.png
+++ b/images/Samuel_Mondal_Markdown_tutorial_completion_page.png
--- a/images/Samuel_Mondal_git_tutorial_completion_page.png
+++ b/images/Samuel_Mondal_git_tutorial_completion_page.png
--- a/images/markdown-tutorial-AM.PNG
+++ b/images/markdown-tutorial-AM.PNG
--- a/requirements.txt
+++ b/requirements.txt
- pandas~=1.5
+numpy>=1.23.3
- numpy~=1.23
+pandas>=1.4.4
- gtfparse~=1.2
+gtfparse
+polars==0.16.17
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
-pytest~=7.2
+pytest
-coverage~=6.5
+coverage
-black~=22.10
+black>=22.10
-flake8~=6.0
+flake8
-flake8-docstrings~=1.6
+flake8-docstrings
-mypy~=0.991
+mypy
-pylint~=2.15
+pylint
--- a/sequence_extractor/__init__.py
+++ b/sequence_extractor/__init__.py
+"""Initalise package."""
--- a/sequence_extractor/cli.py
+++ b/sequence_extractor/cli.py
-"""command line script to be run on output fasta file from bedtools getfasta."""
+"""CLI to be run on output fasta file from bedtools getfasta."""
 import argparse
 import logging
-from exon_concatenation import exon_concatenation
+from sequence_extractor.pre_bedtools import pre_bedtools_mode
-from poly_a import poly_a_addition_to_fasta_list
+from sequence_extractor.exon_concatenation import exon_concatenation
+from sequence_extractor.poly_a import poly_a_addition_to_fasta_list
-parser = argparse.ArgumentParser(
+LOG = logging.getLogger(__name__)
-    prog="transcript_sequence_extractor",
-    description="extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them",
-)
-parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools")
-parser.add_argument("--output_file_name", help="Name of the output fasta file")
-args = parser.parse_args()
 def main():
-    """Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file.
+    """Use CLI arguments to extract sequences.
+    Runs on the output from bedtools and concatenates the exons together
+    and adds a polyA tail and outputs a fasta file.
    Args:
        None: this will run on its own by taking the information from argparse
    Returns:
-        A fasta file with a single entry for each transcript ID with polyA tail being added onto the sequence at 3'end
+        A fasta file with a single entry for each transcript ID with
+        polyA tail being added onto the sequence at 3'end
    """
-    LOG.info("sequence_extractor begins")
+    args = parse_args()
-    fasta_list = exon_concatenation(args.input_fasta_file)
+    setup_logging()
-    final_list = poly_a_addition_to_fasta_list(fasta_list)
-    with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
+    if args.mode == "pre_bedtools":
-        fasta_out.write("\n".join("%s\n%s" % x for x in final_list))
+        pre_bedtools_mode(args)
-    LOG.info("sequence_extractor ends")
+    elif args.mode == "post_bedtools":
+        post_bedtools_mode(args)
+    else:
+        LOG.error(
+            "Invalid mode specified."
+            "Please choose 'pre_bedtools' or 'post_bedtools'.")
-if ___name__ == "main":
+def setup_logging() -> None:
+    """Configure logging."""
    logging.basicConfig(
-        format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
+        format='[%(asctime)s: %(levelname)s] %(message)s '
+               '(module "%(module)s")',
        level=logging.INFO,
    )
-    LOG = logging.getLogger(__name__)
+def parse_args():
+    """Parse arguments for CLI."""
+    parser = argparse.ArgumentParser(
+        description="extracts transcript sequences from genome sequence and"
+                    "ouputs transcripts with PolyA tail added to them",
+        )
+    parser.add_argument(
+        "--mode",
+        choices=["pre_bedtools", "post_bedtools"],
+        required=True,
+        help="Select the mode of operation"
+             "('pre_bedtools' or 'post_bedtools')."
+    )
+    parser.add_argument(
+        "-i", "--input-fasta-file",
+        dest="input_fasta_file",
+        help="Fasta-formatted file obtained from bedtools"
+    )
+    parser.add_argument(
+        "-o", "--output-file-name",
+        dest="output_file_name",
+        help="Name of the output fasta file"
+    )
+    parser.add_argument(
+        "-p", "--polyA-length",
+        dest="poly_a_length",
+        type=int,
+        help="Length of the polyA tail to be added (def: 250)",
+        default=250
+    )
+    parser.add_argument(
+        "--input-gtf-file",
+        dest="input_gtf_file",
+        help="Ordered and processed gtf file for 'pre_bedtools' mode.")
+    parser.add_argument(
+        "--output-bed-file",
+        dest="output_bed_file",
+        help="Bed file with only exons with strandedness"
+             "taken into account for 'pre_bedtools' mode.")
+    args = parser.parse_args()
+    return args
+def post_bedtools_mode(args):
+    """Execute the 'post_bedtools' mode."""
+    LOG.info("Starting 'post_bedtools' mode...")
+    fasta_list = exon_concatenation(args.input_fasta_file)
+    final_list = poly_a_addition_to_fasta_list(fasta_list, args.poly_a_length)
+    if args.output_file_name is None:
+        args.output_file_name = "default_output.fasta"
+    with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
+        fasta_out.write("\n".join(f"{x[0]}\n{x[1]}" for x in final_list))
+    LOG.info("Transcript sequence extractor finished in 'post_bedtools' mode.")
+if __name__ == '__main__':
    main()
--- a/sequence_extractor/exon_concatenation.py
+++ b/sequence_extractor/exon_concatenation.py
-"""Script containing the function to concatenate exons and output the results in a list of tuples."""
+"""Concatenate exons and output the results in a list of tuples."""
 def exon_concatenation(
    post_bedtools_fasta: str,
 ) -> list:
-    """Concatenate all sequences starting with identical transcripit ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples.
+    """Concatenate exons.
+    Concatenate all sequences starting with identical transcript ID and
+    output it as a list with sequence header (Transcript ID) and
+    concatenated sequences as tuples.
    Args:
-        post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run
+        post_bedtools_fasta: The name of the fasta file obtained after
+                             bedtools has been run
    Returns:
        A list containing transcript ID and concatenated exons in tuples.
    """
    with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta:
-        annotation = []
+        annotation: list = []
        fasta_format_list = []
        for line1, line2 in zip(fasta, fasta):
            if len(annotation) == 0:

--- a/sequence_extractor/poly_a.py
+++ b/sequence_extractor/poly_a.py
-"""This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon."""
+"""Add poly A tail to the concatenated exon."""
 import numpy as np
-# To do: Taking probabilities of nucleotides from user and raising error if sum != 1
+# To do: Taking probabilities of nucleotides from user
+# and raising errorif sum != 1
 def poly_a_generator(
    exon: str,
+    poly_a_length: int = 250,
 ) -> str:
-    """Adds a PolyA tail to an exon sequence input into the function.
+    """Add a PolyA tail to an exon sequence input into the function.
     Args:
-            exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end.
+        exon: RNA sequence, obtained from concatenation of exons,
+        that needs polyA to be added to its 3' end.
    Returns:
-            RNA with polyA tail added to its 3' end.
+        RNA with polyA tail added to its 3' end.
    """
    list_of_nucleotides = ["A", "T", "G", "C"]
    poly_a_string = "".join(
-        np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033])
+        np.random.choice(
+            list_of_nucleotides, poly_a_length, p=[0.914, 0.028, 0.025, 0.033]
+            )
    )
    return exon + poly_a_string
 def poly_a_addition_to_fasta_list(
    fasta_list: list,
+    poly_a_length: int = 250,
 ) -> list:
-    """Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends.
+    """Add polyA tail to all the exon 3' ends.
+    Takes in a list of tuples with annotations and exons and outputs a list.
    Args:
-            fasta_list: List contaning tuples of annotations and exons
+        fasta_list: List contaning tuples of annotations and exons
    Returns:
-            A list like the initial list, this time with polyA tail added onto it.
+        A list like the initial list, this time with polyA tail added onto it.
    """
-    mature_rna_list = [(i[0], poly_a_generator(i[1])) for i in fasta_list]
+    mature_rna_list = [
+        (i[0], poly_a_generator(i[1], poly_a_length)) for i in fasta_list
+    ]
    return mature_rna_list
--- a/sequence_extractor/pre_bedtools.py
+++ b/sequence_extractor/pre_bedtools.py
+"""Prepare gtf file for bedtools.
+This script defines a BED from exon annotation in a GTF, to get exon
+coordinates for use in bedtools. It also ensures that the concatenation happens
+in the correct order, regardless of the strandedness of the transcript.
+Args:
+    GTF file
+Returns:
+    BED file with the format:
+        chr, start, end, transcript_id, score, strand, gene_id
+"""
+import pandas as pd  # type: ignore
+from gtfparse import read_gtf  # type: ignore
+def pre_bedtools_mode(args):
+    """Execute the 'pre_bedtools' mode."""
+    gtf = read_gtf(args.input_gtf_file, result_type="pandas")
+    gtf_exons = gtf[gtf["feature"] == "exon"]
+    gtf_exons = gtf_exons[
+        ["seqname", "start", "end",
+         "transcript_id", "score",
+         "strand", "gene_id"]
+    ]
+    gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
+    gtf_df_neg = (
+        gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
+        .groupby("transcript_id")
+        .head(len(gtf_df_neg.transcript_id))
+    )
+    gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
+    gtf_df_pos = (
+        gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
+        .groupby("transcript_id")
+        .head(len(gtf_df_pos.transcript_id))
+    )
+    if args.output_bed_file is None:
+        args.output_bed_file = "default_output.bed"
+    pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
+        args.output_bed_file, sep="\t", index=False
+    )  # gtf_df_pos and gtf_df_neg must be dataframes
--- a/setup.py
+++ b/setup.py
-from setuptools import setup, find_packages
+"""Set up project."""
 from pathlib import Path
+from setuptools import setup, find_packages
 project_root_dir = Path(__file__).parent.resolve()
+with open(project_root_dir / "requirements.txt",
+          "r", encoding="utf-8") as file:
+    INSTALL_REQUIRES = file.read().splitlines()
-with open(project_root_dir / "requirements.txt", "r", encoding="utf-8") as _file:
+URL = ('https://git.scicore.unibas.ch/zavolan_group/'
-    INSTALL_REQUIRES = _file.read().splitlines()
+       'tools/transcript-sequence-extractor')
 setup(
-    name='sequence_extractor',
+    name='transcript-sequence-extractor',
+    version='0.1.0',
+    url=URL,
+    license='MIT',
    author='Samuel Mondal',
    author_email='samuel.mondal@unibas.ch',
-    url='https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sequence-extractor',
+    description=('Extracts transcript sequences from gtf file'
-    license='MIT',
+                 'and adds polyA tail to the output sequence'),
-    version='0.0.1',
-    description='Extracts transcript sequences from gtf file and adds polyA tail to the output sequence',
    packages=find_packages(),
    install_requires=INSTALL_REQUIRES,
-    entrypoints={
+    entry_points={
-        'console_scripts': ['sequence_extractor=sequence_extractor.cli:main']
+        'console_scripts': [
+            'sequence-extractor=sequence_extractor.cli:main'
+            ]
        }
 )
--- a/tests/test_exon_concatenation.py
+++ b/tests/test_exon_concatenation.py
-import pytest
+"""Test exon_concatenation.py."""
-import exon_concatenation from exon_concatenation
+from pathlib import Path
+from sequence_extractor.exon_concatenation import exon_concatenation
+test_dir = Path(__file__).parent.resolve()
+test_fasta_1 = test_dir / "test_files" / "test_1.fa"
+test_fasta_2 = test_dir / "test_files" / "test_2.fa"
-test_fasta_1 = "test_files/test_1.fa"
-test_fasta_2 = "test_files/test_2.fa"
 def test_exon_concatenation():
-    assert exon_concatenation(test_fasta_1) == expected_list_of_tuples
+    """Test exon_concatenation function."""
-    assert exon_concatenation(test_fasta_2) == expected
+    # Test for test_fasta_1
+    expected_fasta_1 = [
+        (">ENST00000673477", "TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGGTTACGCCAAGGAGGCCCTGAATCTGGCGCAGATGCAGGAGCAGACGCTGCAGTTGGAGCAACAGTCCAAGCTCAAA"),
+        (">ENST00000378391", "AAATACTGACGGACGTGGAAGTGTCGCCCCAGGAAGGCTGCATCACAAAGTCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGG"),
+    ]
+    result_fasta_1 = exon_concatenation(test_fasta_1)
+    assert result_fasta_1 == expected_fasta_1
+    # Test for test_fasta_2
+    expected_fasta_2 = [
+        (">ENST00000673477", "ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGACTGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGATCAGCCGGCGGCTCCTCAGTCGACCCCAGGACGTGCTGGAGGGTGTTGTGCTTAGT"),
+    ]
+    result_fasta_2 = exon_concatenation(test_fasta_2)
+    assert result_fasta_2 == expected_fasta_2
--- a/tests/test_files/test.gtf
+++ b/tests/test_files/test.gtf
--- a/tests/test_files/test_1.fa
+++ b/tests/test_files/test_1.fa
 >ENST00000673477::1:1471765-1472089
-TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGGCGGCGGTAGCGGCGGCGGCGGTGCGAGCATGTCGTGGCTCTTCGGCGTTAACAAGGGCCCCAAGGGTGAAGGCGCGGGGCCGCCGCCGCCTTTGCCGCCCGCGCAGCCCGGGGCCGAGGGCGGCGGGGACCGCGGTTTGGGAGACCGGCCGGCGCCCAAGGACAAATGGAGCAACTTCGACCCCACCGGCCTGGAGCGCGCCGCCAAGGCGGCGCGCGAGCTGGAGCACTCGC
+TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGG
 >ENST00000673477::1:1477274-1477350
 TTACGCCAAGGAGGCCCTGAATCTGGCGCAGATGCAGGAGCAGACGCTGCAGTTGGAGCAACAGTCCAAGCTCAAA
 >ENST00000378391::1:3244087-3244137
 AAATACTGACGGACGTGGAAGTGTCGCCCCAGGAAGGCTGCATCACAAAG
 >ENST00000378391::1:3385152-3385286
-TCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGGCTGGCAGCTGGCTCAAGTACATCCGTGTGGCGTGCTCCTGCGATGACCAGAACCTCACCATGTGTCAGATCAGTGAGCAG
+TCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGG
--- a/tests/test_files/test_2.fa
+++ b/tests/test_files/test_2.fa
 >ENST00000673477::1:1482545-1482614
-ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGACAGCCACG
+ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGAC
 >ENST00000673477::1:1485016-1485171
-TGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGACAGCCGTCACTGGCCGCTTCATCGAGGCTCGGCTGGGGAAGCCGTCCCTAGTGAGGGAGACGTCCCGCATCACGGTGCTGGAGGCGCTGCGGCACCCCATCCAG
+TGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGA
 >ENST00000673477::1:1485782-1485838
 TCAGCCGGCGGCTCCTCAGTCGACCCCAGGACGTGCTGGAGGGTGTTGTGCTTAGT
 >ENST00000673477::1:1486110-1486235