"""Generate a poly(A) tail."""

from random import choices
from typing import (List, Tuple)


def generate_poly_a(
    length: int = 100,
    weights: Tuple[float, float, float, float] = (
        0.914, 0.028, 0.025, 0.033
    )
) -> str:
    """Generate a poly(A) tail of specified length and composition.

    This function generates a nucleotide sequence that has compositional
    statistics resembling those of poly(A) tails.

    Args:
        length: Length of the desired tail.
        weights: Tuple of relative `A`, `C`, `G` and `U` frequencies in
            the tail. The values do not need to sum to 1; they are
            normalized before sampling.

    Returns:
        The generated poly(A) tail.

    Raises:
        ValueError: The provided length is not an `int` (booleans are
            rejected as well) or is outside of the accepted range
            (1-200).
        ValueError: The number of provided `weights` does not match the
            number of bases, at least one weight is not a number or is
            negative, or all weights are zero.
    """
    max_len: int = 200
    bases: Tuple[str, str, str, str] = ('A', 'C', 'G', 'U')

    # check parameters
    # `bool` is a subclass of `int`, so `True`/`False` have to be rejected
    # explicitly; otherwise `length=True` would yield a tail of length 1
    if isinstance(length, bool) or not isinstance(length, int):
        raise ValueError(
            f"The provided length is not an integer: {length}"
        )
    if not 1 <= length <= max_len:
        raise ValueError(
            "The provided length is outside of the accepted range "
            f"(1-{max_len}): {length}"
        )
    if len(weights) != len(bases):
        raise ValueError(
            f"There is not a weight provided for each of the bases '{bases}': "
            f"{weights}"
        )
    try:
        # summing doubles as the "are these all numbers?" check; the total
        # is reused for the normalization below
        total: float = float(sum(weights))
    except TypeError as err:
        raise ValueError(
            f"At least one of the provided weights is not a number: {weights}"
        ) from err
    if any(w < 0 for w in weights):
        raise ValueError(
            f"At least one of the provided weights is negative: {weights}"
        )
    if all(w == 0 for w in weights):
        raise ValueError(f"All weights are zero: {weights}")

    # ensure that the values are normalized
    norm_weights: List[float] = [freq / total for freq in weights]
    tail_bases: List[str] = choices(bases, weights=norm_weights, k=length)
    return "".join(tail_bases)
@pytest.mark.parametrize( + "weights, expected", + [ + ((0,0,1), ValueError), + (('a', 0,0,1), ValueError), + ((0,0,0,-1), ValueError), + ((0,0,0,0), ValueError), + ] + ) + def test_wrong_weights(self, expected, weights): + with pytest.raises(expected): + generate_poly_a(weights=weights) diff --git a/tests/test_root.py b/tests/test_root.py new file mode 100644 index 0000000..d15773e --- /dev/null +++ b/tests/test_root.py @@ -0,0 +1,11 @@ +"""Tests for module in root package.""" + +from re import match + +from src import __version__ + + +def test_version(): + """Assert that version matches semantic versioning format.""" + assert match(r'\d\.\d\.\d', __version__) + diff --git a/tests/test_dummy.py b/tests/test_sampleinput.py similarity index 56% rename from tests/test_dummy.py rename to tests/test_sampleinput.py index d15a71f..f73dfb9 100644 --- a/tests/test_dummy.py +++ b/tests/test_sampleinput.py @@ -1,20 +1,14 @@ """Placeholder test for pipeline.""" import pytest -import src -import re -from src import sampleinput as si import pandas as pd -def test_version(): - """Assert that version matches semantic versioning format.""" +from src.sampleinput import sample_from_input - assert re.match(r'\d\.\d\.\d', src.__version__) def test_sampleinput(tmpdir): """Tests the output, input file name and separator.""" - - si.sample_from_input( + sample_from_input( input_file='./tests/resources/Transcript2.tsv', output_file=tmpdir / 'test1.csv', sep='\t', @@ -23,6 +17,6 @@ def test_sampleinput(tmpdir): t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',') assert t1[1].sum()==142958 with pytest.raises(IndexError): - si.sample_from_input(input_file='./tests/resources/Transcript2.tsv') + sample_from_input(input_file='./tests/resources/Transcript2.tsv') with pytest.raises(IOError): - si.sample_from_input(input_file='file_not_existing.txt') \ No newline at end of file + sample_from_input(input_file='file_not_existing.txt') -- GitLab