From da3a722eb995b1fa548e057a5bafe051dd00f4aa Mon Sep 17 00:00:00 2001
From: LauraU123 <laura.urbanska@stud.unibas.ch>
Date: Fri, 28 Oct 2022 10:29:45 +0200
Subject: [PATCH] added sample transcript script

---
 sampletranscript.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 sampletranscript.py

diff --git a/sampletranscript.py b/sampletranscript.py
new file mode 100644
index 0000000..1a36491
--- /dev/null
+++ b/sampletranscript.py
@@ -0,0 +1,42 @@
+import pandas as pd
+import numpy as np
+
+'''
+Sample transcript 
+
+This part of the code does Poisson sampling proportionally to gene expression levels for each gene. 
+ 
+input:  total transcript number (int) 
+        csv file with gene id and gene expression levels (columns named 'id' and 'level')
+
+output: csv file with gene id and count
+        gtf file with transcript samples
+'''
+
+def transcript_sampling(total_transcript, transcripts):
+    '''Poisson-sample a transcript count for every gene and write them to csv.
+
+    Args:
+        total_transcript: total number of transcripts to distribute (int).
+        transcripts: path to a csv file with the columns 'id' and 'level'.
+
+    Writes 'representative_transcript_numbers.csv' with the columns 'id' and
+    'count' to the current working directory; returns nothing.
+    '''
+    # Work on the parsed DataFrame, not on the path argument, so that the
+    # 'level' and 'id' column lookups below are valid.
+    transcript = pd.read_csv(transcripts)
+
+    # One Poisson draw per gene, with rate total_transcript / level.
+    # NOTE(review): if 'level' holds fractions, the rate should probably be
+    # total_transcript * level instead -- confirm against the input data.
+    levels = np.random.poisson(total_transcript / transcript['level'])
+
+    transcript_numbers = pd.DataFrame({'id': transcript['id'], 'count': levels})
+    transcript_numbers.to_csv("representative_transcript_numbers.csv")
+
+
+'''
+This function keeps the representative transcripts from the original input file (gtf)
+and writes them to an output file.
+'''
\ No newline at end of file
-- 
GitLab