Skip to content
Snippets Groups Projects
Commit 444cc961 authored by MihaelaZavolan's avatar MihaelaZavolan Committed by Alex Kanitz
Browse files

feat: function to generate poly(A) tail sequence

parent e8f8acc9
No related branches found
No related tags found
1 merge request: !8 "feat: function to generate poly(A) tail sequence"
Pipeline #13587 passed
coverage
flake8 flake8
flake8-docstrings flake8-docstrings
pytest
mypy mypy
coverage pandas
pandas pytest
\ No newline at end of file
"""Generate a poly(A) tail."""
from random import choices
from typing import (List, Tuple)
def generate_poly_a(
    length: int = 100,
    weights: Tuple[float, float, float, float] = (
        0.914, 0.028, 0.025, 0.033
    )
) -> str:
    """Generate a poly(A) tail of specified length and composition.

    This function generates a nucleotide sequence that has compositional
    statistics resembling those of poly(A) tails.

    Args:
        length: Length of the desired tail (1 to 200, inclusive).
        weights: Tuple of relative `A`, `C`, `G` and `U` frequencies in
            the tail. The values do not need to sum to 1; they are
            normalized before sampling.

    Returns:
        The generated poly(A) tail.

    Raises:
        ValueError: The provided length is not an `int` or is outside of
            the accepted range (1-200).
        ValueError: The number of `weights` does not match the number of
            bases, one or more of the `weights` is not a number or is
            negative, or all weights are zero.
    """
    max_len: int = 200
    bases: Tuple[str, str, str, str] = ('A', 'C', 'G', 'U')

    # check parameters
    if not isinstance(length, int):
        raise ValueError(
            f"The provided length is not an integer: {length}"
        )
    if not 1 <= length <= max_len:
        raise ValueError(
            "The provided length is outside of the accepted range "
            f"(1-{max_len}): {length}"
        )
    if len(weights) != len(bases):
        raise ValueError(
            f"There is not a weight provided for each of the bases '{bases}': "
            f"{weights}"
        )
    try:
        # summing doubles as a check that all weights are numeric
        total = sum(weights)
    except TypeError as exc:
        raise ValueError(
            f"At least one of the provided weights is not a number: {weights}"
        ) from exc
    if any(w < 0 for w in weights):
        raise ValueError(
            f"At least one of the provided weights is negative: {weights}"
        )
    if all(w == 0 for w in weights):
        raise ValueError(f"All weights are zero: {weights}")

    # ensure that the values are normalized
    s: float = float(total)
    norm_weights: List[float] = [freq / s for freq in weights]
    tail_bases: List[str] = choices(bases, weights=norm_weights, k=length)
    return "".join(tail_bases)
"""Tests for poly_a module."""
import pytest
from src.poly_a import generate_poly_a
class TestGeneratePolyA:
    """Test suite covering poly(A) tail generation."""

    def test_passes_default_args(self):
        """Calling without arguments yields a string of the default length."""
        tail = generate_poly_a()
        assert isinstance(tail, str)
        assert len(tail) == 100

    def test_passes_set_all_args(self):
        """With all weight on `A`, the tail consists of `A` only."""
        tail = generate_poly_a(length=10, weights=(1, 0, 0, 0))
        assert isinstance(tail, str)
        assert len(tail) == 10
        assert tail == 'A' * len(tail)

    @pytest.mark.parametrize(
        "length, expected",
        [
            ('a', ValueError),
            (-1, ValueError),
            (0, ValueError),
            (250, ValueError),
        ],
    )
    def test_wrong_length(self, expected, length):
        """Non-integer or out-of-range lengths are rejected."""
        with pytest.raises(expected):
            generate_poly_a(length=length)

    @pytest.mark.parametrize(
        "weights, expected",
        [
            ((0, 0, 1), ValueError),
            (('a', 0, 0, 1), ValueError),
            ((0, 0, 0, -1), ValueError),
            ((0, 0, 0, 0), ValueError),
        ],
    )
    def test_wrong_weights(self, expected, weights):
        """Malformed weight tuples are rejected."""
        with pytest.raises(expected):
            generate_poly_a(weights=weights)
"""Tests for module in root package."""
from re import match
from src import __version__
def test_version():
    """Assert that version matches semantic versioning format."""
    # Use `\d+` rather than `\d`: each of major, minor and patch may have
    # several digits (e.g. `0.10.0`), which the single-digit pattern rejects.
    assert match(r'\d+\.\d+\.\d+', __version__)
"""Placeholder test for pipeline.""" """Placeholder test for pipeline."""
import pytest import pytest
import src
import re
from src import sampleinput as si
import pandas as pd import pandas as pd
def test_version(): from src.sampleinput import sample_from_input
"""Assert that version matches semantic versioning format."""
assert re.match(r'\d\.\d\.\d', src.__version__)
def test_sampleinput(tmpdir): def test_sampleinput(tmpdir):
"""Tests the output, input file name and separator.""" """Tests the output, input file name and separator."""
sample_from_input(
si.sample_from_input(
input_file='./tests/resources/Transcript2.tsv', input_file='./tests/resources/Transcript2.tsv',
output_file=tmpdir / 'test1.csv', output_file=tmpdir / 'test1.csv',
sep='\t', sep='\t',
...@@ -23,6 +17,6 @@ def test_sampleinput(tmpdir): ...@@ -23,6 +17,6 @@ def test_sampleinput(tmpdir):
t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',') t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',')
assert t1[1].sum()==142958 assert t1[1].sum()==142958
with pytest.raises(IndexError): with pytest.raises(IndexError):
si.sample_from_input(input_file='./tests/resources/Transcript2.tsv') sample_from_input(input_file='./tests/resources/Transcript2.tsv')
with pytest.raises(IOError): with pytest.raises(IOError):
si.sample_from_input(input_file='file_not_existing.txt') sample_from_input(input_file='file_not_existing.txt')
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment