Skip to content
Snippets Groups Projects

feat: add function to calculate mean and variance

Closed Reto Tschannen requested to merge issue12 into main
Files
2
+ 74
0
import glob, io
import matplotlib.pyplot as plt
import csv
import os
def mean_variance(filepath):
"""Given the counts observed for a given gene across all cells, calculate the mean and variance.
Input:
directory with files of gene expression counts in individual cells
Output:
1. Path to Csv-formatted table with GeneID, Mean, Variance of the counts
2. Scatterplot of mean vs variance for all genes
"""
# Open each file in the input directory, raises error if no file is fund
files = [file for file in glob.glob(filepath)]
if len(files) == 0:
raise ValueError('No files in directory:', filepath)
# Creates all required dictionaries to cinstruct the mean, variance
gene_counts = {}
occurence = {}
individual_values = {}
mean = {}
variance = {}
# Added together all gene counts in gene_counts, and occurence in occurence
for file_name in files:
with io.open(file_name, 'r') as fh:
for line in fh:
geneid, copies = str(line.split()[0]), int(line.split()[1])
if geneid not in gene_counts:
gene_counts[geneid] = copies
occurence[geneid] = 1
individual_values[geneid] = [copies]
else:
gene_counts[geneid] += copies
occurence[geneid] += 1
individual_values[geneid] += [copies]
# Calculate mean of each gene
for i in gene_counts:
mean[i] = gene_counts[i]/occurence[i]
# Calculate the variance
for i in individual_values:
for j in range(0, len(individual_values[i])):
variance[i] = (individual_values[i][j]-mean[i])**2/occurence[i]
# Plot mean against variance
plt.scatter(mean.values(), variance.values())
for value in list(mean.keys()):
plt.text(mean[value], variance[value], value)
plt.xlabel('mean')
plt.ylabel('variance')
plt.title('Mean gene expression vs. variance')
plt.show()
# Constructs csv file and saves it in the users directory
with open(os.path.expanduser("~")+'/results_mean_var_function.csv', 'w') as csv_file:
filewriter = csv.writer(csv_file, delimiter = ',', quotechar = '|', quoting = csv.QUOTE_MINIMAL)
filewriter.writerow(['geneid', 'mean', 'variance'])
for id in gene_counts.keys():
filewriter.writerow([id, mean[id], variance[id]])
return os.path.expanduser("~")+'/results_mean_var_function.csv'
Loading