Skip to content
Snippets Groups Projects
Commit 1578448c authored by Mate Balajti's avatar Mate Balajti
Browse files

feat: clean code, combine scripts, add tests

parent 4e1dac07
No related branches found
No related tags found
1 merge request!69feat: clean code, combine scripts, add tests
Pipeline #17382 passed
Showing
with 378 additions and 118 deletions
...@@ -15,12 +15,21 @@ build-job: # First stage deployment and installation of dependencies. ...@@ -15,12 +15,21 @@ build-job: # First stage deployment and installation of dependencies.
- pip install -e . - pip install -e .
- echo "Dependencies successfully deployed." - echo "Dependencies successfully deployed."
unit-test-job: # This job runs in the test stage.
stage: test # It only starts when the job in the build stage completes successfully.
script:
- pip install -r requirements.txt
- pip install -r requirements_dev.txt
- pip install -e .
- coverage run --source sequence_extractor -m pytest
- coverage report -m
lint-test-job: # Test Stage lint-test-job: # Test Stage
stage: test # Deploys and runs all 3 linters. stage: test # Deploys and runs all 3 linters.
script: script:
- pip install -r requirements.txt - pip install -r requirements.txt
- pip install -r requirements_dev.txt - pip install -r requirements_dev.txt
- pip install -e . - pip install -e .
- flake8 --docstring-convention google sequence_extractor/ gtf_processing/ --ignore=D104,F821 - flake8 --docstring-convention google sequence_extractor/
- pylint sequence_extractor/ gtf_processing/ - pylint sequence_extractor/
- mypy sequence_extractor/ gtf_processing/ - mypy sequence_extractor/
"""This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript.
Args:
GTF file
Returns:
BED file with the format: chr, start, end, transcript_id, score, strand, gene_id
"""
import argparse
import pandas as pd
from gtfparse import read_gtf
parser = argparse.ArgumentParser(
prog="pre_bedtools",
description="extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.",
)
parser.add_argument("--input_gtf_file", help="ordered and processed gtf file")
parser.add_argument(
"--output_bed_file",
help="bed file with only exons with strandedness taken into account",
)
args = parser.parse_args()
gtf = read_gtf(args.input_gtf_file)
gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[
["seqname", "start", "end", "transcript_id", "score", "strand", "gene_id"]
]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = (
gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
.groupby("transcript_id")
.head(len(gtf_df_neg.transcript_id))
)
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = (
gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
.groupby("transcript_id")
.head(len(gtf_df_pos.transcript_id))
)
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
args.output_bed_file, sep="\t", index=False
) # gtf_df_pos and gtf_df_neg must be dataframes
images/Ahmed_mahmoud_git_tutorial.PNG

31.9 KiB

images/Gina_Homework2Image.png

312 KiB

images/Markdown_Homework_GinaBoot.png

282 KiB

images/Samuel_Mondal_Markdown_tutorial_completion_page.png

161 KiB

images/Samuel_Mondal_git_tutorial_completion_page.png

165 KiB

images/markdown-tutorial-AM.PNG

85.8 KiB

pandas~=1.5 numpy>=1.23.3
numpy~=1.23 pandas>=1.4.4
gtfparse~=1.2 gtfparse
polars==0.16.17
pytest~=7.2 pytest
coverage~=6.5 coverage
black~=22.10 black>=22.10
flake8~=6.0 flake8
flake8-docstrings~=1.6 flake8-docstrings
mypy~=0.991 mypy
pylint~=2.15 pylint
"""Initalise package."""
"""command line script to be run on output fasta file from bedtools getfasta.""" """CLI to be run on output fasta file from bedtools getfasta."""
import argparse import argparse
import logging import logging
from exon_concatenation import exon_concatenation from sequence_extractor.pre_bedtools import pre_bedtools_mode
from poly_a import poly_a_addition_to_fasta_list from sequence_extractor.exon_concatenation import exon_concatenation
from sequence_extractor.poly_a import poly_a_addition_to_fasta_list
parser = argparse.ArgumentParser( LOG = logging.getLogger(__name__)
prog="transcript_sequence_extractor",
description="extracts transcript sequences from genome sequence and ouputs transcripts with PolyA tail added to them",
)
parser.add_argument("--input_fasta_file", help="fasta file obtained from bedtools")
parser.add_argument("--output_file_name", help="Name of the output fasta file")
args = parser.parse_args()
def main(): def main():
"""Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file. """Use CLI arguments to extract sequences.
Runs on the output from bedtools and concatenates the exons together
and adds a polyA tail and outputs a fasta file.
Args: Args:
None: this will run on its own by taking the information from argparse None: this will run on its own by taking the information from argparse
Returns: Returns:
A fasta file with a single entry for each transcript ID with polyA tail being added onto the sequence at 3'end A fasta file with a single entry for each transcript ID with
polyA tail being added onto the sequence at 3'end
""" """
LOG.info("sequence_extractor begins") args = parse_args()
fasta_list = exon_concatenation(args.input_fasta_file) setup_logging()
final_list = poly_a_addition_to_fasta_list(fasta_list)
with open(args.output_file_name, "w", encoding="utf-8") as fasta_out: if args.mode == "pre_bedtools":
fasta_out.write("\n".join("%s\n%s" % x for x in final_list)) pre_bedtools_mode(args)
LOG.info("sequence_extractor ends") elif args.mode == "post_bedtools":
post_bedtools_mode(args)
else:
LOG.error(
"Invalid mode specified."
"Please choose 'pre_bedtools' or 'post_bedtools'.")
if ___name__ == "main": def setup_logging() -> None:
"""Configure logging."""
logging.basicConfig( logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', format='[%(asctime)s: %(levelname)s] %(message)s '
'(module "%(module)s")',
level=logging.INFO, level=logging.INFO,
) )
LOG = logging.getLogger(__name__)
def parse_args():
"""Parse arguments for CLI."""
parser = argparse.ArgumentParser(
description="extracts transcript sequences from genome sequence and"
"ouputs transcripts with PolyA tail added to them",
)
parser.add_argument(
"--mode",
choices=["pre_bedtools", "post_bedtools"],
required=True,
help="Select the mode of operation"
"('pre_bedtools' or 'post_bedtools')."
)
parser.add_argument(
"-i", "--input-fasta-file",
dest="input_fasta_file",
help="Fasta-formatted file obtained from bedtools"
)
parser.add_argument(
"-o", "--output-file-name",
dest="output_file_name",
help="Name of the output fasta file"
)
parser.add_argument(
"-p", "--polyA-length",
dest="poly_a_length",
type=int,
help="Length of the polyA tail to be added (def: 250)",
default=250
)
parser.add_argument(
"--input-gtf-file",
dest="input_gtf_file",
help="Ordered and processed gtf file for 'pre_bedtools' mode.")
parser.add_argument(
"--output-bed-file",
dest="output_bed_file",
help="Bed file with only exons with strandedness"
"taken into account for 'pre_bedtools' mode.")
args = parser.parse_args()
return args
def post_bedtools_mode(args):
"""Execute the 'post_bedtools' mode."""
LOG.info("Starting 'post_bedtools' mode...")
fasta_list = exon_concatenation(args.input_fasta_file)
final_list = poly_a_addition_to_fasta_list(fasta_list, args.poly_a_length)
if args.output_file_name is None:
args.output_file_name = "default_output.fasta"
with open(args.output_file_name, "w", encoding="utf-8") as fasta_out:
fasta_out.write("\n".join(f"{x[0]}\n{x[1]}" for x in final_list))
LOG.info("Transcript sequence extractor finished in 'post_bedtools' mode.")
if __name__ == '__main__':
main() main()
"""Script containing the function to concatenate exons and output the results in a list of tuples.""" """Concatenate exons and output the results in a list of tuples."""
def exon_concatenation( def exon_concatenation(
post_bedtools_fasta: str, post_bedtools_fasta: str,
) -> list: ) -> list:
"""Concatenate all sequences starting with identical transcripit ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples. """Concatenate exons.
Concatenate all sequences starting with identical transcript ID and
output it as a list with sequence header (Transcript ID) and
concatenated sequences as tuples.
Args: Args:
post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run post_bedtools_fasta: The name of the fasta file obtained after
bedtools has been run
Returns: Returns:
A list containing transcript ID and concatenated exons in tuples. A list containing transcript ID and concatenated exons in tuples.
""" """
with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta: with open(post_bedtools_fasta, "r", encoding="utf-8") as fasta:
annotation = [] annotation: list = []
fasta_format_list = [] fasta_format_list = []
for line1, line2 in zip(fasta, fasta): for line1, line2 in zip(fasta, fasta):
if len(annotation) == 0: if len(annotation) == 0:
......
"""This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon.""" """Add poly A tail to the concatenated exon."""
import numpy as np import numpy as np
# To do: Taking probabilities of nucleotides from user and raising error if sum != 1 # To do: Taking probabilities of nucleotides from user
# and raising errorif sum != 1
def poly_a_generator( def poly_a_generator(
exon: str, exon: str,
poly_a_length: int = 250,
) -> str: ) -> str:
"""Adds a PolyA tail to an exon sequence input into the function. """Add a PolyA tail to an exon sequence input into the function.
Args: Args:
exon: RNA sequence, obtained from concatenation of exons, that needs polyA to be added to its 3' end. exon: RNA sequence, obtained from concatenation of exons,
that needs polyA to be added to its 3' end.
Returns: Returns:
RNA with polyA tail added to its 3' end. RNA with polyA tail added to its 3' end.
""" """
list_of_nucleotides = ["A", "T", "G", "C"] list_of_nucleotides = ["A", "T", "G", "C"]
poly_a_string = "".join( poly_a_string = "".join(
np.random.choice(list_of_nucleotides, 250, p=[0.914, 0.028, 0.025, 0.033]) np.random.choice(
list_of_nucleotides, poly_a_length, p=[0.914, 0.028, 0.025, 0.033]
)
) )
return exon + poly_a_string return exon + poly_a_string
def poly_a_addition_to_fasta_list( def poly_a_addition_to_fasta_list(
fasta_list: list, fasta_list: list,
poly_a_length: int = 250,
) -> list: ) -> list:
"""Takes in a list of tuples with annotations and exons and outputs a list where polyA tail has been added to all the exon 3' ends. """Add polyA tail to all the exon 3' ends.
Takes in a list of tuples with annotations and exons and outputs a list.
Args: Args:
fasta_list: List contaning tuples of annotations and exons fasta_list: List contaning tuples of annotations and exons
Returns: Returns:
A list like the initial list, this time with polyA tail added onto it. A list like the initial list, this time with polyA tail added onto it.
""" """
mature_rna_list = [(i[0], poly_a_generator(i[1])) for i in fasta_list] mature_rna_list = [
(i[0], poly_a_generator(i[1], poly_a_length)) for i in fasta_list
]
return mature_rna_list return mature_rna_list
"""Prepare gtf file for bedtools.
This script defines a BED from exon annotation in a GTF, to get exon
coordinates for use in bedtools. It also ensures that the concatenation happens
in the correct order, regardless of the strandedness of the transcript.
Args:
GTF file
Returns:
BED file with the format:
chr, start, end, transcript_id, score, strand, gene_id
"""
import pandas as pd # type: ignore
from gtfparse import read_gtf # type: ignore
def pre_bedtools_mode(args):
"""Execute the 'pre_bedtools' mode."""
gtf = read_gtf(args.input_gtf_file, result_type="pandas")
gtf_exons = gtf[gtf["feature"] == "exon"]
gtf_exons = gtf_exons[
["seqname", "start", "end",
"transcript_id", "score",
"strand", "gene_id"]
]
gtf_df_neg = gtf_exons[gtf_exons["strand"] == "-"]
gtf_df_neg = (
gtf_df_neg.sort_values(["transcript_id", "start"], ascending=False)
.groupby("transcript_id")
.head(len(gtf_df_neg.transcript_id))
)
gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = (
gtf_df_pos.sort_values(["transcript_id", "start"], ascending=True)
.groupby("transcript_id")
.head(len(gtf_df_pos.transcript_id))
)
if args.output_bed_file is None:
args.output_bed_file = "default_output.bed"
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(
args.output_bed_file, sep="\t", index=False
) # gtf_df_pos and gtf_df_neg must be dataframes
from setuptools import setup, find_packages """Set up project."""
from pathlib import Path from pathlib import Path
from setuptools import setup, find_packages
project_root_dir = Path(__file__).parent.resolve() project_root_dir = Path(__file__).parent.resolve()
with open(project_root_dir / "requirements.txt",
"r", encoding="utf-8") as file:
INSTALL_REQUIRES = file.read().splitlines()
with open(project_root_dir / "requirements.txt", "r", encoding="utf-8") as _file: URL = ('https://git.scicore.unibas.ch/zavolan_group/'
INSTALL_REQUIRES = _file.read().splitlines() 'tools/transcript-sequence-extractor')
setup( setup(
name='sequence_extractor', name='transcript-sequence-extractor',
version='0.1.0',
url=URL,
license='MIT',
author='Samuel Mondal', author='Samuel Mondal',
author_email='samuel.mondal@unibas.ch', author_email='samuel.mondal@unibas.ch',
url='https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sequence-extractor', description=('Extracts transcript sequences from gtf file'
license='MIT', 'and adds polyA tail to the output sequence'),
version='0.0.1',
description='Extracts transcript sequences from gtf file and adds polyA tail to the output sequence',
packages=find_packages(), packages=find_packages(),
install_requires=INSTALL_REQUIRES, install_requires=INSTALL_REQUIRES,
entrypoints={ entry_points={
'console_scripts': ['sequence_extractor=sequence_extractor.cli:main'] 'console_scripts': [
'sequence-extractor=sequence_extractor.cli:main'
]
} }
) )
import pytest """Test exon_concatenation.py."""
import exon_concatenation from exon_concatenation from pathlib import Path
from sequence_extractor.exon_concatenation import exon_concatenation
test_dir = Path(__file__).parent.resolve()
test_fasta_1 = test_dir / "test_files" / "test_1.fa"
test_fasta_2 = test_dir / "test_files" / "test_2.fa"
test_fasta_1 = "test_files/test_1.fa"
test_fasta_2 = "test_files/test_2.fa"
def test_exon_concatenation(): def test_exon_concatenation():
assert exon_concatenation(test_fasta_1) == expected_list_of_tuples """Test exon_concatenation function."""
assert exon_concatenation(test_fasta_2) == expected # Test for test_fasta_1
expected_fasta_1 = [
(">ENST00000673477", "TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGGTTACGCCAAGGAGGCCCTGAATCTGGCGCAGATGCAGGAGCAGACGCTGCAGTTGGAGCAACAGTCCAAGCTCAAA"),
(">ENST00000378391", "AAATACTGACGGACGTGGAAGTGTCGCCCCAGGAAGGCTGCATCACAAAGTCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGG"),
]
result_fasta_1 = exon_concatenation(test_fasta_1)
assert result_fasta_1 == expected_fasta_1
# Test for test_fasta_2
expected_fasta_2 = [
(">ENST00000673477", "ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGACTGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGATCAGCCGGCGGCTCCTCAGTCGACCCCAGGACGTGCTGGAGGGTGTTGTGCTTAGT"),
]
result_fasta_2 = exon_concatenation(test_fasta_2)
assert result_fasta_2 == expected_fasta_2
This diff is collapsed.
>ENST00000673477::1:1471765-1472089 >ENST00000673477::1:1471765-1472089
TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGGCGGCGGTAGCGGCGGCGGCGGTGCGAGCATGTCGTGGCTCTTCGGCGTTAACAAGGGCCCCAAGGGTGAAGGCGCGGGGCCGCCGCCGCCTTTGCCGCCCGCGCAGCCCGGGGCCGAGGGCGGCGGGGACCGCGGTTTGGGAGACCGGCCGGCGCCCAAGGACAAATGGAGCAACTTCGACCCCACCGGCCTGGAGCGCGCCGCCAAGGCGGCGCGCGAGCTGGAGCACTCGC TTTCGCCTGCGCAGTGGTCCTGGCCACCGGCTCGCGGCGCGTGGAGGCTGCTCCCAGCCGCGCCCGAGTCAGACTCGGGTGGGGGTCCCGG
>ENST00000673477::1:1477274-1477350 >ENST00000673477::1:1477274-1477350
TTACGCCAAGGAGGCCCTGAATCTGGCGCAGATGCAGGAGCAGACGCTGCAGTTGGAGCAACAGTCCAAGCTCAAA TTACGCCAAGGAGGCCCTGAATCTGGCGCAGATGCAGGAGCAGACGCTGCAGTTGGAGCAACAGTCCAAGCTCAAA
>ENST00000378391::1:3244087-3244137 >ENST00000378391::1:3244087-3244137
AAATACTGACGGACGTGGAAGTGTCGCCCCAGGAAGGCTGCATCACAAAG AAATACTGACGGACGTGGAAGTGTCGCCCCAGGAAGGCTGCATCACAAAG
>ENST00000378391::1:3385152-3385286 >ENST00000378391::1:3385152-3385286
TCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGGCTGGCAGCTGGCTCAAGTACATCCGTGTGGCGTGCTCCTGCGATGACCAGAACCTCACCATGTGTCAGATCAGTGAGCAG TCTCCGAAGACCTGGGCAGTGAGAAGTTCTGCGTGGATGCAAATCAGGCGGGGG
>ENST00000673477::1:1482545-1482614 >ENST00000673477::1:1482545-1482614
ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGACAGCCACG ACGGCTGGCACCTTGTTTGGGGAAGGATTCCGTGCCTTTGTGACAGACCGGGACAAAGTGAC
>ENST00000673477::1:1485016-1485171 >ENST00000673477::1:1485016-1485171
TGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGACAGCCGTCACTGGCCGCTTCATCGAGGCTCGGCTGGGGAAGCCGTCCCTAGTGAGGGAGACGTCCCGCATCACGGTGCTGGAGGCGCTGCGGCACCCCATCCAG TGGCTGGGCTGACGCTGCTGGCTGTCGGGGTCTACTCAGCCAAGAATGCGA
>ENST00000673477::1:1485782-1485838 >ENST00000673477::1:1485782-1485838
TCAGCCGGCGGCTCCTCAGTCGACCCCAGGACGTGCTGGAGGGTGTTGTGCTTAGT TCAGCCGGCGGCTCCTCAGTCGACCCCAGGACGTGCTGGAGGGTGTTGTGCTTAGT
>ENST00000673477::1:1486110-1486235 >ENST00000673477::1:1486110-1486235
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment