From e8f8acc93b03f3e91cad3acfd8100cfa9ba1641a Mon Sep 17 00:00:00 2001
From: garion0000 <michele.garioni@unibas.ch>
Date: Wed, 17 Nov 2021 10:54:53 +0100
Subject: [PATCH] feat: add sampling from transcript file

---
 requirements_dev.txt            |  1 +
 src/sampleinput.py              | 73 +++++++++++++++++++++++++++++++++
 tests/resources/Transcript1.csv |  7 ++++
 tests/resources/Transcript2.tsv |  7 ++++
 tests/test_dummy.py             | 20 ++++++++-
 5 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 src/sampleinput.py
 create mode 100644 tests/resources/Transcript1.csv
 create mode 100644 tests/resources/Transcript2.tsv

diff --git a/requirements_dev.txt b/requirements_dev.txt
index a382a35..61e56cc 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -3,3 +3,4 @@ flake8-docstrings
 pytest
 mypy
 coverage
+pandas
\ No newline at end of file
diff --git a/src/sampleinput.py b/src/sampleinput.py
new file mode 100644
index 0000000..a6cc7e7
--- /dev/null
+++ b/src/sampleinput.py
@@ -0,0 +1,73 @@
+"""Samples transcripts from input.
+
+Samples a defined number of transcript following
+the relative RNA abundance per gene of a given input.
+"""
+import logging
+from pathlib import Path
+from random import choices
+
+LOG = logging.getLogger(__name__)
+
+
+def sample_from_input(
+    input_file: Path,
+    output_file: Path = Path.cwd() / 'sampled_cell.csv',
+    n: int = 10000,
+    sep: str = ',',
+) -> None:
+    """Samples transcripts from input.
+
+    Samples a defined number of transcript per gene following
+    the relative RNA abundance per gene of a given input and
+    writes the simulated results in a csv file.
+
+    Args:
+        input_file (string): name of the input gene expression file.
+        output_file (string): name of the sampled gene expression file.
+        n (int): number of total transcripts to be sampled.
+        sep (str): separator of the input file.
+    """
+    myfile = open(input_file, 'r')
+    # initialize empty dictionary
+    input_dc = {}
+
+    # read line, split key-value and assign key and value to the
+    # dictionary after stripping \n character.
+    LOG.info('reading file...')
+    for myline in myfile:
+        gene = myline.split(sep)
+        input_dc[gene[0].strip()] = int(gene[1].strip())
+    myfile.close()
+    LOG.debug(input_dc)
+    LOG.info('file read.')
+
+    # extract count numbers and calculate relative abundance
+    counts = list(input_dc.values())
+    tot_counts = sum(counts)
+    relative_value = [x / tot_counts for x in counts]
+
+    # sampling
+    LOG.info('sampling reads...')
+    sampled_genes = choices(list(input_dc.keys()), weights=relative_value, k=n)
+
+    # initialize empty dictionary
+    sampled_dc = dict()
+
+    # count the genes occurence from the sampled list
+    for i in sampled_genes:
+        if i not in sampled_dc:
+            sampled_dc[i] = 1
+        else:
+            sampled_dc[i] += 1
+    LOG.info('reads sampled.')
+
+    # write sample dictionary to a csv file, joining the
+    # key value pairs with a comma
+    myfile = open(output_file, 'w')
+    LOG.info('writing output...')
+    for (k, v) in sampled_dc.items():
+        line = ','.join([str(k), str(v)])
+        myfile.write(line + '\n')
+    myfile.close()
+    LOG.info('output written.')
diff --git a/tests/resources/Transcript1.csv b/tests/resources/Transcript1.csv
new file mode 100644
index 0000000..4ae4a98
--- /dev/null
+++ b/tests/resources/Transcript1.csv
@@ -0,0 +1,7 @@
+GENE1,92
+GENE2,13
+GENE3,73
+GENE4,83
+GENE5,32
+GENE6,136
+GENE7,36
\ No newline at end of file
diff --git a/tests/resources/Transcript2.tsv b/tests/resources/Transcript2.tsv
new file mode 100644
index 0000000..727c652
--- /dev/null
+++ b/tests/resources/Transcript2.tsv
@@ -0,0 +1,7 @@
+GENE1	92
+GENE2	13
+GENE3	73
+GENE4	83
+GENE5	32
+GENE6	136
+GENE7	36
\ No newline at end of file
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index 73a5117..d15a71f 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -1,10 +1,28 @@
 """Placeholder test for pipeline."""
 
+import pytest
 import src
 import re
-
+from src import sampleinput as si
+import pandas as pd
 
 def test_version():
     """Assert that version matches semantic versioning format."""
 
     assert re.match(r'\d\.\d\.\d', src.__version__)
+
+def test_sampleinput(tmpdir):
+    """Tests the output, input file name and separator."""
+
+    si.sample_from_input(
+        input_file='./tests/resources/Transcript2.tsv',
+        output_file=tmpdir / 'test1.csv',
+        sep='\t',
+        n=142958
+    )
+    t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',')
+    assert t1[1].sum()==142958
+    with pytest.raises(IndexError):
+        si.sample_from_input(input_file='./tests/resources/Transcript2.tsv')
+    with pytest.raises(IOError):
+        si.sample_from_input(input_file='file_not_existing.txt')
\ No newline at end of file
-- 
GitLab