# Origin: sampletranscript.py, added by commit da3a722eb995b1fa548e057a5bafe051dd00f4aa
# ("added sample transcript script", LauraU123, Fri, 28 Oct 2022 10:29:45 +0200).
"""Sample transcript

This part of the code does Poisson sampling proportionally to gene expression
levels for each gene.

input:  total transcript number (int)
        csv file with gene id and gene expression levels
        (columns named 'id' and 'level')

output: csv file with gene id and count
        gtf file with transcript samples (not produced by this module yet,
        see the TODO at the end of the file)
"""

import numpy as np
import pandas as pd


def transcript_sampling(total_transcript, transcripts):
    """Poisson-sample a transcript count for every row of an expression table.

    The expected count of each transcript is its share of the summed
    expression level multiplied by ``total_transcript``; the actual count is
    drawn from a Poisson distribution with that mean. Normalising by the sum
    keeps the sampling proportional to expression whether the levels are raw
    numbers or fractions.

    The result is written to ``representative_transcript_numbers.csv`` in the
    current working directory (with pandas' default index column) and is also
    returned.

    Args:
        total_transcript: Total number of transcripts to distribute over the
            rows (non-negative number).
        transcripts: Path or file-like object of a csv file with the columns
            'id' and 'level'.

    Returns:
        pandas.DataFrame with the columns 'id' and 'count'.

    Raises:
        ValueError: If ``total_transcript`` is negative, or if a level is
            negative or missing.
        KeyError: If the csv file lacks an 'id' or 'level' column.
    """
    if total_transcript < 0:
        raise ValueError("total_transcript must be non-negative")

    # Read the file containing representative transcript ids and levels.
    # (The parsed table must be kept: it used to be overwritten by the raw
    # `transcripts` argument, which made the column lookups below fail.)
    transcript = pd.read_csv(transcripts)

    levels = transcript['level'].to_numpy(dtype=float)
    if np.isnan(levels).any() or (levels < 0).any():
        raise ValueError("expression levels must be non-negative numbers")

    # Expected count per transcript, proportional to its expression level.
    # An empty table or all-zero levels give an expectation of zero instead
    # of dividing by zero.
    level_sum = levels.sum()
    if level_sum > 0:
        expected_counts = total_transcript * levels / level_sum
    else:
        expected_counts = np.zeros_like(levels)

    # One vectorised draw: element i is Poisson-distributed around
    # expected_counts[i].
    counts = np.random.poisson(expected_counts)

    # Write the output csv file containing transcript id and count
    # (representative transcript numbers).
    transcript_numbers = pd.DataFrame({'id': transcript['id'], 'count': counts})
    transcript_numbers.to_csv("representative_transcript_numbers.csv")
    return transcript_numbers


# TODO: add a function that keeps the representative transcripts from the
# original input file (gtf) and writes them to an output file.