From 663d5b8f1dd77f95be59a487bdd473cb0f4360d4 Mon Sep 17 00:00:00 2001
From: RTS1997 <reto.tschannen@unibas.ch>
Date: Sun, 28 Nov 2021 22:49:12 +0100
Subject: [PATCH] feat: Added function to calculate mean and variance from a
 directory with files of gene expression(Work in progress)

---
 src/mean-variance-function.py | 75 +++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/mean-variance-function.py

diff --git a/src/mean-variance-function.py b/src/mean-variance-function.py
new file mode 100644
index 0000000..2dfdf6d
--- /dev/null
+++ b/src/mean-variance-function.py
@@ -0,0 +1,75 @@
+import glob, io
+import matplotlib.pyplot as plt
+import csv
+
+
+
+
+def mean_variance(filepath):
+    """Given the counts observed for a given gene across all cells, calculate the mean and variance.
+
+    Input: 
+        directory with files of gene expression counts in individual cells
+    
+    Output:
+        1. Csv-formatted table with GeneID, Mean, Variance of the counts
+        2. Scatterplot of mean vs variance for all genes
+    """
+    # Open each file in the input directory, raises error if no file is fund
+
+    files = [file for file in glob.glob(filepath)]
+
+    if len(files) == 0:
+        raise ValueError('No files in directory:', filepath)
+
+    nog = {}
+    count = {}
+    test = {}
+    mean = {}
+    variance = {}
+
+    for file_name in files:
+        with io.open(file_name, 'r') as fh:
+            for line in fh:
+                geneid, copies = str(line.split()[0]), int(line.split()[1])
+                if geneid not in nog:
+                    nog[geneid] = copies
+                    count[geneid] = 1
+                    test[geneid] = [copies]
+                else:
+                    nog[geneid] += copies
+                    count[geneid] += 1
+                    test[geneid] += [copies]
+    
+    # Calculate mean of each gene
+    for i in nog:
+        mean[i] = nog[i]/count[i] 
+
+    # Calculate the variance
+    for i in test:
+        for j in range(0, len(test[i])):
+            variance[i] = (test[i][j]-mean[i])**2/count[i] 
+
+    # Plot mean against variance
+
+    plt.scatter(mean.values(), variance.values())
+    for value in list(mean.keys()):
+        plt.text(mean[value], variance[value], value)
+    #plt.annotate(list(mean.keys()), mean.values(), variance.values())
+    plt.xlabel('mean')
+    plt.ylabel('variance')
+    plt.title('Mean gene expression vs. variance')
+    plt.show()
+
+    with open('/home/reto/results_mean_var_function.csv', 'w') as csv_file:
+        filewriter = csv.writer(csv_file, delimiter = ',', quotechar = '|', quoting = csv.QUOTE_MINIMAL)
+        filewriter.writerow(['geneid', 'mean', 'variance'])
+        for id in nog.keys():
+            filewriter.writerow([id, mean[id], variance[id]])
+    
+    return files, nog, count, mean, 'var', variance, test, list(mean.keys())
+
+
+print(mean_variance('/home/reto/2021_project_folder/2021_test/*'))
+
+
-- 
GitLab