Skip to content
Snippets Groups Projects

feat: add sampling from transcript file

Merged Michele Garioni requested to merge feature into main
5 files
+ 107
1
Compare changes
  • Side-by-side
  • Inline
Files
5
+ 73
0
"""Samples transcripts from input.
Samples a defined number of transcript following
the relative RNA abundance per gene of a given input.
"""
import logging
from pathlib import Path
from random import choices
LOG = logging.getLogger(__name__)
def sample_from_input(
input_file: Path,
output_file: Path = Path.cwd() / 'sampled_cell.csv',
n: int = 10000,
sep: str = ',',
) -> None:
"""Samples transcripts from input.
Samples a defined number of transcript per gene following
the relative RNA abundance per gene of a given input and
writes the simulated results in a csv file.
Args:
input_file (string): name of the input gene expression file.
output_file (string): name of the sampled gene expression file.
n (int): number of total transcripts to be sampled.
sep (str): separator of the input file.
"""
myfile = open(input_file, 'r')
# initialize empty dictionary
input_dc = {}
# read line, split key-value and assign key and value to the
# dictionary after stripping \n character.
LOG.info('reading file...')
for myline in myfile:
gene = myline.split(sep)
input_dc[gene[0].strip()] = int(gene[1].strip())
myfile.close()
LOG.debug(input_dc)
LOG.info('file read.')
# extract count numbers and calculate relative abundance
counts = list(input_dc.values())
tot_counts = sum(counts)
relative_value = [x / tot_counts for x in counts]
# sampling
LOG.info('sampling reads...')
sampled_genes = choices(list(input_dc.keys()), weights=relative_value, k=n)
# initialize empty dictionary
sampled_dc = dict()
# count the genes occurence from the sampled list
for i in sampled_genes:
if i not in sampled_dc:
sampled_dc[i] = 1
else:
sampled_dc[i] += 1
LOG.info('reads sampled.')
# write sample dictionary to a csv file, joining the
# key value pairs with a comma
myfile = open(output_file, 'w')
LOG.info('writing output...')
for (k, v) in sampled_dc.items():
line = ','.join([str(k), str(v)])
myfile.write(line + '\n')
myfile.close()
LOG.info('output written.')
Loading