Skip to content
Snippets Groups Projects

Issue14 prc

Closed Madan Mukundan requested to merge issue14_PRC into main
16 unresolved threads
9 files
+ 260
1
Compare changes
  • Side-by-side
  • Inline
Files
9
+ 118
0
 
"""Plots read counts of gene expression data."""
 
 
import logging
 
from pathlib import Path
 
from typing import Union
 
import pandas
 
import matplotlib.pyplot as plt
 
 
"""Given a GeneID, construct the histogram of read counts per cell."""
Please register or sign in to reply
 
 
 
def _sum_gene_counts(file_path, gene_id, logger):
    """Return ``(found, total)`` for ``gene_id`` in one read-count .csv file.

    The file is read without a header row as two columns, 'Gene' and 'Count'.

    Args:
        file_path: Path of the .csv file to read.
        gene_id: Name of the gene to look up in the 'Gene' column.
        logger: Logger that receives the warnings described below.

    Returns:
        A tuple ``(found, total)``. ``found`` is False when no row matches
        ``gene_id`` (``total`` is then 0). Otherwise ``total`` is the sum of
        all matching counts, with missing counts treated as 0.
    """
    df_read_counts = pandas.read_csv(
        str(file_path), header=None, names=['Gene', 'Count'])
    counts = df_read_counts.loc[df_read_counts.Gene == gene_id, 'Count']

    if counts.empty:
        logger.warning(
            "%s.csv does not contain read of chosen gene, "
            "this is interpreted as 0",
            file_path.stem)
        return False, 0

    if len(counts) > 1:
        logger.warning(
            "%s.csv contains more than one entry for chosen gene - "
            "these will be summed",
            file_path.stem)

    # A row with a gene name but no count is parsed as NaN; without this
    # step the NaN would propagate into the sum and into the plot.
    if counts.isna().any():
        logger.warning(
            "File %s contains an entry for this gene but is missing a "
            "count number, this is interpreted as 0",
            file_path.stem)
        counts = counts.fillna(0)

    return True, sum(counts.tolist())


def plot_read_counts(
    dir_path: Union[str, Path, None] = None,
    gene_id: Union[str, None] = None
) -> bool:
    """Plots a bar chart of read counts per .csv file for one gene.

    This function reads all .csv files in a directory, each formatted
    "GeneID, Counts" without a header row, and plots the read count of the
    chosen gene for every file (one bar per file, labelled with the file
    name without extension).

    Args:
        dir_path: Path of directory containing .csv files of read counts.
        gene_id: Name of gene to plot.

    Returns:
        True once the plot has been shown.

    Raises:
        ValueError: No value is passed for the path.
        ValueError: Path passed in does not exist.
        ValueError: No .csv files in chosen directory.
        ValueError: Gene not found in any of the .csv files.
        TypeError: Gene ID parameter passed is not a string (or is missing).
    """
    # Initialize logger
    logging.basicConfig(
        format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
        level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Inputs are validated up front with one specific exception each, rather
    # than wrapping the whole body in a single broad try/except.
    if dir_path is None:
        logger.critical("Invalid selection, please verify input")
        raise ValueError(
            "Either the path, gene ID, or both were not properly specified")

    dir_path = Path(dir_path)
    if not dir_path.exists():
        logger.critical("Invalid path")
        raise ValueError("Invalid Path")

    if not isinstance(gene_id, str):
        logger.critical("Invalid gene ID, please enter as string")
        raise TypeError("Invalid gene ID")

    # identify csv files from directory, formatted ['Gene', 'Count'];
    # sorted() materialises the glob generator so it can be reused below
    file_paths = sorted(dir_path.glob("*.csv"))
    if not file_paths:
        logger.warning('No csv files found in current directory, exiting')
        # ValueError (not AttributeError) keeps the exception type callers
        # have always received for an empty directory.
        raise ValueError("No .csv files found in chosen directory")

    logger.info("Run with %s and gene %s", dir_path, gene_id)

    gene_count_dict = {}
    gene_found = False
    for file_path in file_paths:
        found, total = _sum_gene_counts(file_path, gene_id, logger)
        gene_found = gene_found or found
        gene_count_dict[file_path.stem] = total

    # Decide on presence, not on the sum: a gene that is listed with zero
    # reads everywhere is a valid result, not a misspelt ID.
    if not gene_found:
        logger.warning(
            "Gene not found -> Please check for correct spelling or ID")
        raise ValueError("Gene not found")

    logger.info('Read complete from %s files', len(file_paths))

    # create and show bar plot of read counts for chosen gene
    file_names = list(gene_count_dict)
    counts = list(gene_count_dict.values())
    plt.bar(range(len(counts)), counts, tick_label=file_names)
    plt.title('Counts for %s' % gene_id)
    plt.show()

    return True
Loading