Skip to content
Snippets Groups Projects
Commit 663d5b8f authored by Reto Tschannen's avatar Reto Tschannen
Browse files

feat: Added function to calculate mean and variance from a directory with...

feat: Added function to calculate mean and variance from a directory with files of gene expression(Work in progress)
parent e8f8acc9
No related branches found
No related tags found
1 merge request!13feat: add function to calculate mean and variance
Pipeline #13624 failed
This commit is part of merge request !13. Comments created here will be created in the context of that merge request.
import glob, io
import matplotlib.pyplot as plt
import csv
def mean_variance(filepath):
"""Given the counts observed for a given gene across all cells, calculate the mean and variance.
Input:
directory with files of gene expression counts in individual cells
Output:
1. Csv-formatted table with GeneID, Mean, Variance of the counts
2. Scatterplot of mean vs variance for all genes
"""
# Open each file in the input directory, raises error if no file is fund
files = [file for file in glob.glob(filepath)]
if len(files) == 0:
raise ValueError('No files in directory:', filepath)
nog = {}
count = {}
test = {}
mean = {}
variance = {}
for file_name in files:
with io.open(file_name, 'r') as fh:
for line in fh:
geneid, copies = str(line.split()[0]), int(line.split()[1])
if geneid not in nog:
nog[geneid] = copies
count[geneid] = 1
test[geneid] = [copies]
else:
nog[geneid] += copies
count[geneid] += 1
test[geneid] += [copies]
# Calculate mean of each gene
for i in nog:
mean[i] = nog[i]/count[i]
# Calculate the variance
for i in test:
for j in range(0, len(test[i])):
variance[i] = (test[i][j]-mean[i])**2/count[i]
# Plot mean against variance
plt.scatter(mean.values(), variance.values())
for value in list(mean.keys()):
plt.text(mean[value], variance[value], value)
#plt.annotate(list(mean.keys()), mean.values(), variance.values())
plt.xlabel('mean')
plt.ylabel('variance')
plt.title('Mean gene expression vs. variance')
plt.show()
with open('/home/reto/results_mean_var_function.csv', 'w') as csv_file:
filewriter = csv.writer(csv_file, delimiter = ',', quotechar = '|', quoting = csv.QUOTE_MINIMAL)
filewriter.writerow(['geneid', 'mean', 'variance'])
for id in nog.keys():
filewriter.writerow([id, mean[id], variance[id]])
return files, nog, count, mean, 'var', variance, test, list(mean.keys())
print(mean_variance('/home/reto/2021_project_folder/2021_test/*'))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment