From e8f8acc93b03f3e91cad3acfd8100cfa9ba1641a Mon Sep 17 00:00:00 2001 From: garion0000 <michele.garioni@unibas.ch> Date: Wed, 17 Nov 2021 10:54:53 +0100 Subject: [PATCH] feat: add sampling from transcript file --- requirements_dev.txt | 1 + src/sampleinput.py | 73 +++++++++++++++++++++++++++++++++ tests/resources/Transcript1.csv | 7 ++++ tests/resources/Transcript2.tsv | 7 ++++ tests/test_dummy.py | 20 ++++++++- 5 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 src/sampleinput.py create mode 100644 tests/resources/Transcript1.csv create mode 100644 tests/resources/Transcript2.tsv diff --git a/requirements_dev.txt b/requirements_dev.txt index a382a35..61e56cc 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -3,3 +3,4 @@ flake8-docstrings pytest mypy coverage +pandas \ No newline at end of file diff --git a/src/sampleinput.py b/src/sampleinput.py new file mode 100644 index 0000000..a6cc7e7 --- /dev/null +++ b/src/sampleinput.py @@ -0,0 +1,73 @@ +"""Samples transcripts from input. + +Samples a defined number of transcript following +the relative RNA abundance per gene of a given input. +""" +import logging +from pathlib import Path +from random import choices + +LOG = logging.getLogger(__name__) + + +def sample_from_input( + input_file: Path, + output_file: Path = Path.cwd() / 'sampled_cell.csv', + n: int = 10000, + sep: str = ',', +) -> None: + """Samples transcripts from input. + + Samples a defined number of transcript per gene following + the relative RNA abundance per gene of a given input and + writes the simulated results in a csv file. + + Args: + input_file (string): name of the input gene expression file. + output_file (string): name of the sampled gene expression file. + n (int): number of total transcripts to be sampled. + sep (str): separator of the input file. + """ + myfile = open(input_file, 'r') + # initialize empty dictionary + input_dc = {} + + # read line, split key-value and assign key and value to the + # dictionary after stripping \n character. + LOG.info('reading file...') + for myline in myfile: + gene = myline.split(sep) + input_dc[gene[0].strip()] = int(gene[1].strip()) + myfile.close() + LOG.debug(input_dc) + LOG.info('file read.') + + # extract count numbers and calculate relative abundance + counts = list(input_dc.values()) + tot_counts = sum(counts) + relative_value = [x / tot_counts for x in counts] + + # sampling + LOG.info('sampling reads...') + sampled_genes = choices(list(input_dc.keys()), weights=relative_value, k=n) + + # initialize empty dictionary + sampled_dc = dict() + + # count the genes occurence from the sampled list + for i in sampled_genes: + if i not in sampled_dc: + sampled_dc[i] = 1 + else: + sampled_dc[i] += 1 + LOG.info('reads sampled.') + + # write sample dictionary to a csv file, joining the + # key value pairs with a comma + myfile = open(output_file, 'w') + LOG.info('writing output...') + for (k, v) in sampled_dc.items(): + line = ','.join([str(k), str(v)]) + myfile.write(line + '\n') + myfile.close() + LOG.info('output written.') diff --git a/tests/resources/Transcript1.csv b/tests/resources/Transcript1.csv new file mode 100644 index 0000000..4ae4a98 --- /dev/null +++ b/tests/resources/Transcript1.csv @@ -0,0 +1,7 @@ +GENE1,92 +GENE2,13 +GENE3,73 +GENE4,83 +GENE5,32 +GENE6,136 +GENE7,36 \ No newline at end of file diff --git a/tests/resources/Transcript2.tsv b/tests/resources/Transcript2.tsv new file mode 100644 index 0000000..727c652 --- /dev/null +++ b/tests/resources/Transcript2.tsv @@ -0,0 +1,7 @@ +GENE1 92 +GENE2 13 +GENE3 73 +GENE4 83 +GENE5 32 +GENE6 136 +GENE7 36 \ No newline at end of file diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 73a5117..d15a71f 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,10 +1,28 @@ """Placeholder test for pipeline.""" +import pytest import src import re - +from src import sampleinput as si +import pandas as pd def test_version(): """Assert that version matches semantic versioning format.""" assert re.match(r'\d\.\d\.\d', src.__version__) + +def test_sampleinput(tmpdir): + """Tests the output, input file name and separator.""" + + si.sample_from_input( + input_file='./tests/resources/Transcript2.tsv', + output_file=tmpdir / 'test1.csv', + sep='\t', + n=142958 + ) + t1=pd.read_table(tmpdir / 'test1.csv', header=None, sep=',') + assert t1[1].sum()==142958 + with pytest.raises(IndexError): + si.sample_from_input(input_file='./tests/resources/Transcript2.tsv') + with pytest.raises(IOError): + si.sample_from_input(input_file='file_not_existing.txt') \ No newline at end of file -- GitLab