diff --git a/src/mean-variance-function.py b/src/mean-variance-function.py index 2dfdf6d5cbf50a4c488a4ba33fbc9b01b94514a3..9ce94540ab71122e068aeaba525f4f19be51ae1f 100644 --- a/src/mean-variance-function.py +++ b/src/mean-variance-function.py @@ -1,6 +1,7 @@ import glob, io import matplotlib.pyplot as plt import csv +import os @@ -12,62 +13,62 @@ def mean_variance(filepath): directory with files of gene expression counts in individual cells Output: - 1. Csv-formatted table with GeneID, Mean, Variance of the counts + 1. Path to Csv-formatted table with GeneID, Mean, Variance of the counts 2. Scatterplot of mean vs variance for all genes """ # Open each file in the input directory, raises error if no file is fund - files = [file for file in glob.glob(filepath)] if len(files) == 0: raise ValueError('No files in directory:', filepath) + # Creates all required dictionaries to cinstruct the mean, variance nog = {} - count = {} - test = {} + occurence = {} + individual_values = {} mean = {} variance = {} + # Added together all gene counts in nog, and occurence in occurence for file_name in files: with io.open(file_name, 'r') as fh: for line in fh: geneid, copies = str(line.split()[0]), int(line.split()[1]) if geneid not in nog: nog[geneid] = copies - count[geneid] = 1 - test[geneid] = [copies] + occurence[geneid] = 1 + individual_values[geneid] = [copies] else: nog[geneid] += copies - count[geneid] += 1 - test[geneid] += [copies] + occurence[geneid] += 1 + individual_values[geneid] += [copies] # Calculate mean of each gene for i in nog: - mean[i] = nog[i]/count[i] + mean[i] = nog[i]/occurence[i] # Calculate the variance - for i in test: - for j in range(0, len(test[i])): - variance[i] = (test[i][j]-mean[i])**2/count[i] + for i in individual_values: + for j in range(0, len(individual_values[i])): + variance[i] = (individual_values[i][j]-mean[i])**2/occurence[i] # Plot mean against variance - plt.scatter(mean.values(), variance.values()) for value in list(mean.keys()): plt.text(mean[value], variance[value], value) - #plt.annotate(list(mean.keys()), mean.values(), variance.values()) plt.xlabel('mean') plt.ylabel('variance') plt.title('Mean gene expression vs. variance') plt.show() - - with open('/home/reto/results_mean_var_function.csv', 'w') as csv_file: + + # Constructs csv file and saves it in the users directory + with open(os.path.expanduser("~")+'/results_mean_var_function.csv', 'w') as csv_file: filewriter = csv.writer(csv_file, delimiter = ',', quotechar = '|', quoting = csv.QUOTE_MINIMAL) filewriter.writerow(['geneid', 'mean', 'variance']) for id in nog.keys(): filewriter.writerow([id, mean[id], variance[id]]) - return files, nog, count, mean, 'var', variance, test, list(mean.keys()) + return os.path.expanduser("~")+'/results_mean_var_function.csv' print(mean_variance('/home/reto/2021_project_folder/2021_test/*'))