Michele Garioni · e8f8acc9
--- a/src/sampleinput.py 0 → 100644

+ 73

− 0
+++ b/src/sampleinput.py 0 → 100644

+ 73

− 0
+"""Samples transcripts from input.
+
+Samples a defined number of transcript following
+the relative RNA abundance per gene of a given input.
+"""
+import logging
+from pathlib import Path
+from random import choices
+
+LOG = logging.getLogger(__name__)
+
+
+def sample_from_input(
+    input_file: Path,
+    output_file: Path = Path.cwd() / 'sampled_cell.csv',
+    n: int = 10000,
+    sep: str = ',',
+) -> None:
+    """Samples transcripts from input.
+
+    Samples a defined number of transcript per gene following
+    the relative RNA abundance per gene of a given input and
+    writes the simulated results in a csv file.
+
+    Args:
+        input_file (string): name of the input gene expression file.
+        output_file (string): name of the sampled gene expression file.
+        n (int): number of total transcripts to be sampled.
+        sep (str): separator of the input file.
+    """
+    myfile = open(input_file, 'r')
+    # initialize empty dictionary
+    input_dc = {}
+
+    # read line, split key-value and assign key and value to the
+    # dictionary after stripping \n character.
+    LOG.info('reading file...')
+    for myline in myfile:
+        gene = myline.split(sep)
+        input_dc[gene[0].strip()] = int(gene[1].strip())
+    myfile.close()
+    LOG.debug(input_dc)
+    LOG.info('file read.')
+
+    # extract count numbers and calculate relative abundance
+    counts = list(input_dc.values())
+    tot_counts = sum(counts)
+    relative_value = [x / tot_counts for x in counts]
+
+    # sampling
+    LOG.info('sampling reads...')
+    sampled_genes = choices(list(input_dc.keys()), weights=relative_value, k=n)
+
+    # initialize empty dictionary
+    sampled_dc = dict()
+
+    # count the genes occurence from the sampled list
+    for i in sampled_genes:
+        if i not in sampled_dc:
+            sampled_dc[i] = 1
+        else:
+            sampled_dc[i] += 1
+    LOG.info('reads sampled.')
+
+    # write sample dictionary to a csv file, joining the
+    # key value pairs with a comma
+    myfile = open(output_file, 'w')
+    LOG.info('writing output...')
+    for (k, v) in sampled_dc.items():
+        line = ','.join([str(k), str(v)])
+        myfile.write(line + '\n')
+    myfile.close()
+    LOG.info('output written.')