From 663d5b8f1dd77f95be59a487bdd473cb0f4360d4 Mon Sep 17 00:00:00 2001 From: RTS1997 <reto.tschannen@unibas.ch> Date: Sun, 28 Nov 2021 22:49:12 +0100 Subject: [PATCH] feat: Added function to calculate mean and variance from a directory with files of gene expression(Work in progress) --- src/mean-variance-function.py | 75 +++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 src/mean-variance-function.py diff --git a/src/mean-variance-function.py b/src/mean-variance-function.py new file mode 100644 index 0000000..2dfdf6d --- /dev/null +++ b/src/mean-variance-function.py @@ -0,0 +1,75 @@ +import glob, io +import matplotlib.pyplot as plt +import csv + + + + +def mean_variance(filepath): + """Given the counts observed for a given gene across all cells, calculate the mean and variance. + + Input: + directory with files of gene expression counts in individual cells + + Output: + 1. Csv-formatted table with GeneID, Mean, Variance of the counts + 2. Scatterplot of mean vs variance for all genes + """ + # Open each file in the input directory, raises error if no file is fund + + files = [file for file in glob.glob(filepath)] + + if len(files) == 0: + raise ValueError('No files in directory:', filepath) + + nog = {} + count = {} + test = {} + mean = {} + variance = {} + + for file_name in files: + with io.open(file_name, 'r') as fh: + for line in fh: + geneid, copies = str(line.split()[0]), int(line.split()[1]) + if geneid not in nog: + nog[geneid] = copies + count[geneid] = 1 + test[geneid] = [copies] + else: + nog[geneid] += copies + count[geneid] += 1 + test[geneid] += [copies] + + # Calculate mean of each gene + for i in nog: + mean[i] = nog[i]/count[i] + + # Calculate the variance + for i in test: + for j in range(0, len(test[i])): + variance[i] = (test[i][j]-mean[i])**2/count[i] + + # Plot mean against variance + + plt.scatter(mean.values(), variance.values()) + for value in list(mean.keys()): + plt.text(mean[value], variance[value], value) + #plt.annotate(list(mean.keys()), mean.values(), variance.values()) + plt.xlabel('mean') + plt.ylabel('variance') + plt.title('Mean gene expression vs. variance') + plt.show() + + with open('/home/reto/results_mean_var_function.csv', 'w') as csv_file: + filewriter = csv.writer(csv_file, delimiter = ',', quotechar = '|', quoting = csv.QUOTE_MINIMAL) + filewriter.writerow(['geneid', 'mean', 'variance']) + for id in nog.keys(): + filewriter.writerow([id, mean[id], variance[id]]) + + return files, nog, count, mean, 'var', variance, test, list(mean.keys()) + + +print(mean_variance('/home/reto/2021_project_folder/2021_test/*')) + + -- GitLab