feat: function to generate poly(A) tail sequence

444cc961 · MihaelaZavolan · Alex Kanitz · e8f8acc9 · 444cc961 · 444cc961
Commit 444cc961 authored 3 years ago by MihaelaZavolan Committed by Alex Kanitz 3 years ago
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
+coverage
 flake8
 flake8-docstrings
-pytest
 mypy
-coverage
+pandas
-pandas
+pytest
\ No newline at end of file
--- a/src/poly_a.py
+++ b/src/poly_a.py
+"""Generate a poly(A) tail."""
+from random import choices
+from typing import (List, Tuple)
+def generate_poly_a(
+    length: int = 100,
+    weights: Tuple[float, float, float, float] = (
+        0.914, 0.028, 0.025, 0.033
+    )
+) -> str:
+    """Generate a poly(A) tail of specified length and composition.
+
+    This function generates a nucleotide sequence that has compositional
+    statistics resembling those of poly(A) tails.
+
+    Args:
+        length: Length of the desired tail; an integer between 1 and 200,
+            inclusive.
+        weights: Tuple of relative `A`, `C`, `G` and `U` frequencies in
+            the tail. The values do not need to sum up to 1; they are
+            normalized before sampling.
+
+    Returns:
+        The generated poly(A) tail.
+
+    Raises:
+        ValueError: The provided length is not an `int` or is outside of
+            the accepted range (1-200).
+        ValueError: The number of provided `weights` does not match the
+            number of bases, at least one of them is not a number or is
+            negative, or all weights are zero.
+    """
+    max_len: int = 200
+    bases: Tuple[str, str, str, str] = ('A', 'C', 'G', 'U')
+
+    # check parameters; `bool` is a subclass of `int` and is rejected
+    # explicitly so that `True` is not silently treated as a length of 1
+    if isinstance(length, bool) or not isinstance(length, int):
+        raise ValueError(
+            f"The provided length is not an integer: {length}"
+        )
+    if not 1 <= length <= max_len:
+        raise ValueError(
+            "The provided length is outside of the accepted range "
+            f"(1-{max_len}): {length}"
+        )
+    if len(weights) != len(bases):
+        raise ValueError(
+            "There is not a weight provided for each of the bases "
+            f"'{bases}': {weights}"
+        )
+    # summing fails with a `TypeError` if any weight is not numeric
+    try:
+        total: float = float(sum(weights))
+    except TypeError as exc:
+        raise ValueError(
+            f"At least one of the provided weights is not a number: {weights}"
+        ) from exc
+    if any(w < 0 for w in weights):
+        raise ValueError(
+            f"At least one of the provided weights is negative: {weights}"
+        )
+    if all(w == 0 for w in weights):
+        raise ValueError(f"All weights are zero: {weights}")
+
+    # ensure that the values are normalized; the total is strictly
+    # positive here because no weight is negative and not all are zero
+    norm_weights: List[float] = [freq / total for freq in weights]
+    tail_bases: List[str] = choices(bases, weights=norm_weights, k=length)
+    return "".join(tail_bases)
--- a/tests/test_poly_a.py
+++ b/tests/test_poly_a.py
+"""Tests for poly_a module."""
+import pytest
+from src.poly_a import generate_poly_a
+class TestGeneratePolyA:
+    """Test suite for the `generate_poly_a()` function."""
+
+    def test_passes_default_args(self):
+        """Call without arguments and check type and length of result."""
+        tail = generate_poly_a()
+        assert isinstance(tail, str)
+        assert len(tail) == 100
+
+    def test_passes_set_all_args(self):
+        """Call with all arguments set; only `A` carries a weight."""
+        kwargs = {
+            'length': 10,
+            'weights': (1, 0, 0, 0),
+        }
+        tail = generate_poly_a(**kwargs)
+        assert isinstance(tail, str)
+        assert len(tail) == 10
+        assert tail == 'A' * len(tail)
+
+    @pytest.mark.parametrize(
+        "length, expected",
+        [
+            ('a', ValueError),
+            (-1, ValueError),
+            (0, ValueError),
+            (250, ValueError),
+        ],
+    )
+    def test_wrong_length(self, expected, length):
+        """Check that each invalid length raises the expected error."""
+        with pytest.raises(expected):
+            generate_poly_a(length=length)
+
+    @pytest.mark.parametrize(
+        "weights, expected",
+        [
+            ((0, 0, 1), ValueError),
+            (('a', 0, 0, 1), ValueError),
+            ((0, 0, 0, -1), ValueError),
+            ((0, 0, 0, 0), ValueError),
+        ],
+    )
+    def test_wrong_weights(self, expected, weights):
+        """Check that each invalid weights tuple raises the expected error."""
+        with pytest.raises(expected):
+            generate_poly_a(weights=weights)
--- a/tests/test_root.py
+++ b/tests/test_root.py
+"""Tests for module in root package."""
+from re import match
+from src import __version__
+def test_version():
+    """Assert that version matches semantic versioning format.
+
+    Each of the major, minor and patch components may span more than one
+    digit (e.g., `0.10.2`). Only the start of the version string is
+    checked, so any trailing identifiers are not validated here.
+    """
+    assert match(r'\d+\.\d+\.\d+', __version__)
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
 """Placeholder test for pipeline."""
 import pytest
-import src
-import re
-from src import sampleinput as si
 import pandas as pd
-def test_version():
+from src.sampleinput import sample_from_input
-    """Assert that version matches semantic versioning format."""
-    assert re.match(r'\d\.\d\.\d', src.__version__)
 def test_sampleinput(tmpdir):
    """Tests the output, input file name and separator."""
+    sample_from_input(
-    si.sample_from_input(
        input_file='./tests/resources/Transcript2.tsv',
        output_file=tmpdir / 'test1.csv',
        sep='\t',
@@ -23,6 +17,6 @@ def test_sampleinput(tmpdir):
    t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',')
    assert t1[1].sum()==142958
    with pytest.raises(IndexError):
-        si.sample_from_input(input_file='./tests/resources/Transcript2.tsv')
+        sample_from_input(input_file='./tests/resources/Transcript2.tsv')
    with pytest.raises(IOError):
-        si.sample_from_input(input_file='file_not_existing.txt')
+        sample_from_input(input_file='file_not_existing.txt')
\ No newline at end of file