From 444cc961dd9768e3ca6e9d92c1c700d41e7d5843 Mon Sep 17 00:00:00 2001
From: BIOPZ-Zavolan Mihaela <mihaela.zavolan@unibas.ch>
Date: Wed, 17 Nov 2021 19:04:20 +0000
Subject: [PATCH] feat: function to generate poly(A) tail sequence

---
 requirements_dev.txt                         |  6 +-
 src/poly_a.py                                | 67 ++++++++++++++++++++
 tests/test_poly_a.py                         | 49 ++++++++++++++
 tests/test_root.py                           | 11 ++++
 tests/{test_dummy.py => test_sampleinput.py} | 14 ++--
 5 files changed, 134 insertions(+), 13 deletions(-)
 create mode 100644 src/poly_a.py
 create mode 100644 tests/test_poly_a.py
 create mode 100644 tests/test_root.py
 rename tests/{test_dummy.py => test_sampleinput.py} (56%)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 61e56cc..720501c 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,6 +1,6 @@
+coverage
 flake8
 flake8-docstrings
-pytest
 mypy
-coverage
-pandas
\ No newline at end of file
+pandas
+pytest
diff --git a/src/poly_a.py b/src/poly_a.py
new file mode 100644
index 0000000..2c3ee30
--- /dev/null
+++ b/src/poly_a.py
@@ -0,0 +1,67 @@
+"""Generate a poly(A) tail."""
+
+from random import choices
+from typing import (List, Tuple)
+
+
+def generate_poly_a(
+    length: int = 100,
+    weights: Tuple[float, float, float, float] = (
+        0.914, 0.028, 0.025, 0.033
+    )
+) -> str:
+    """Generate a poly(A) tail of specified length and composition.
+
+    This function generates a nucleotide sequence that has compositional
+    statistics resembling those of poly(A) tails.
+
+    Args:
+        length: Length of the desired tail (1 to 200 bases).
+        weights: Tuple of relative `A`, `C`, `G` and `U` frequencies in
+            the tail; they are normalized and need not sum to one.
+
+    Returns:
+        The generated poly(A) tail.
+
+    Raises:
+        ValueError: The provided length is not an `int` or is outside
+            of the accepted range (1-200).
+        ValueError: The number of `weights` differs from the number of
+            bases, a weight is not a number or is negative, or all
+            weights are zero.
+    """
+    max_len: int = 200
+    bases: Tuple[str, str, str, str] = ('A', 'C', 'G', 'U')
+
+    # check parameters
+    if not isinstance(length, int):
+        raise ValueError(f"The provided length is not an integer: {length}")
+    if not 1 <= length <= max_len:
+        raise ValueError(
+            "The provided length is outside of the accepted range "
+            f"(1-{max_len}): {length}"
+        )
+    if len(weights) != len(bases):
+        raise ValueError(
+            f"There is not a weight provided for each of the bases {bases}: "
+            f"{weights}"
+        )
+    try:
+        # summing raises a TypeError if any weight is not a number
+        sum(weights)
+    except TypeError as exc:
+        raise ValueError(
+            f"At least one of the provided weights is not a number: {weights}"
+        ) from exc
+    if any(w < 0 for w in weights):
+        raise ValueError(
+            f"At least one of the provided weights is negative: {weights}"
+        )
+    if all(w == 0 for w in weights):
+        raise ValueError(f"All weights are zero: {weights}")
+
+    # ensure that the values are normalized
+    total: float = float(sum(weights))
+    norm_weights: List[float] = [freq / total for freq in weights]
+    tail_bases: List[str] = choices(bases, weights=norm_weights, k=length)
+    return "".join(tail_bases)
diff --git a/tests/test_poly_a.py b/tests/test_poly_a.py
new file mode 100644
index 0000000..81b4a02
--- /dev/null
+++ b/tests/test_poly_a.py
@@ -0,0 +1,49 @@
+"""Tests for poly_a module."""
+
+import pytest
+
+from src.poly_a import generate_poly_a
+
+
+class TestGeneratePolyA:
+    """Tests for poly(A) tail generation.
+
+    Covers valid calls as well as rejected lengths and weights.
+    """
+
+    def test_passes_default_args(self):
+        """Return a string of the default length without arguments."""
+        tail = generate_poly_a()
+        assert isinstance(tail, str)
+        # 100 is the default value of the `length` parameter
+        assert len(tail) == 100
+
+    def test_passes_set_all_args(self):
+        """Return a tail of the requested length and composition."""
+        tail = generate_poly_a(length=10, weights=(1, 0, 0, 0))
+        assert isinstance(tail, str)
+        assert len(tail) == 10
+        # only `A` has a non-zero weight, so the outcome is deterministic
+        assert tail == len(tail) * 'A'
+
+    @pytest.mark.parametrize("length, expected", [
+        ('a', ValueError),
+        (-1, ValueError),
+        (0, ValueError),
+        (250, ValueError),
+    ])
+    def test_wrong_length(self, length, expected):
+        """Raise for non-integer and out-of-range lengths."""
+        with pytest.raises(expected):
+            generate_poly_a(length=length)
+
+    @pytest.mark.parametrize("weights, expected", [
+        ((0, 0, 1), ValueError),
+        (('a', 0, 0, 1), ValueError),
+        ((0, 0, 0, -1), ValueError),
+        ((0, 0, 0, 0), ValueError),
+    ])
+    def test_wrong_weights(self, weights, expected):
+        """Raise for missing, non-numeric, negative or all-zero weights."""
+        with pytest.raises(expected):
+            generate_poly_a(weights=weights)
diff --git a/tests/test_root.py b/tests/test_root.py
new file mode 100644
index 0000000..d15773e
--- /dev/null
+++ b/tests/test_root.py
@@ -0,0 +1,11 @@
+"""Tests for module in root package."""
+
+from re import match
+
+from src import __version__
+
+
+def test_version():
+    """Assert that version starts with numeric MAJOR.MINOR.PATCH parts."""
+    assert match(r'\d+\.\d+\.\d+', __version__)  # parts may be multi-digit
+
diff --git a/tests/test_dummy.py b/tests/test_sampleinput.py
similarity index 56%
rename from tests/test_dummy.py
rename to tests/test_sampleinput.py
index d15a71f..f73dfb9 100644
--- a/tests/test_dummy.py
+++ b/tests/test_sampleinput.py
@@ -1,20 +1,14 @@
 """Placeholder test for pipeline."""
 
 import pytest
-import src
-import re
-from src import sampleinput as si
 import pandas as pd
 
-def test_version():
-    """Assert that version matches semantic versioning format."""
+from src.sampleinput import sample_from_input
 
-    assert re.match(r'\d\.\d\.\d', src.__version__)
 
 def test_sampleinput(tmpdir):
     """Tests the output, input file name and separator."""
-
-    si.sample_from_input(
+    sample_from_input(
         input_file='./tests/resources/Transcript2.tsv',
         output_file=tmpdir / 'test1.csv',
         sep='\t',
@@ -23,6 +17,6 @@ def test_sampleinput(tmpdir):
     t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',')
     assert t1[1].sum()==142958
     with pytest.raises(IndexError):
-        si.sample_from_input(input_file='./tests/resources/Transcript2.tsv')
+        sample_from_input(input_file='./tests/resources/Transcript2.tsv')
     with pytest.raises(IOError):
-        si.sample_from_input(input_file='file_not_existing.txt')
\ No newline at end of file
+        sample_from_input(input_file='file_not_existing.txt')
-- 
GitLab