Skip to content
Snippets Groups Projects
Commit 79401641 authored by Madan Mukundan's avatar Madan Mukundan
Browse files

refactor: outputs to file in ./plots, updated cli and tests

parent 8de4aacd
No related branches found
No related tags found
1 merge request!26Issue14 prc
Pipeline #13909 failed
Showing
with 1046 additions and 174 deletions
......@@ -156,4 +156,7 @@ dmypy.json
# Cython debug symbols
cython_debug/
# VSCode
.vscode/
# End of https://www.toptal.com/developers/gitignore/api/python,git
\ No newline at end of file
This diff is collapsed.
......@@ -3,4 +3,5 @@ flake8
flake8-docstrings
mypy
pandas
pytest
\ No newline at end of file
pytest
matplotlib
"""CLI for plot_read_counts function."""
import argparse
import logging
from plot_read_counts import plot_read_counts
def main():
    """Run the command-line interface for plot_read_counts.

    Parses two positional command-line arguments, configures INFO-level
    logging and passes the arguments on to plot_read_counts.

    Command-line arguments:
        file_path: Path to the directory holding the gene count .csv files.
        gene_id: ID of the gene whose counts are plotted.

    Typical example usage:
        cli_plot_read_counts.py '../tests/resources' GENE1
    """
    parser = argparse.ArgumentParser(
        description='plot the read counts of one gene from a directory of .csv files')
    parser.add_argument('file_path', action='store',
                        type=str, help='path to gene count .csv files')
    parser.add_argument('gene_id', action='store',
                        type=str, help='gene ID of which to plot counts')
    argsin = parser.parse_args()

    # Configure logging only after argument parsing so that --help and usage
    # errors exit without touching the logging setup.
    logging.basicConfig(format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
                        level=logging.INFO)

    plot_read_counts(argsin.file_path, argsin.gene_id)
if __name__ == "__main__":
main()
......@@ -3,19 +3,19 @@
import logging
from pathlib import Path
from typing import Union
from datetime import datetime
import pandas
import matplotlib.pyplot as plt
"""Given a GeneID, construct the histogram of read counts per cell."""
def plot_read_counts(
    dir_path: Union[str, Path],
    gene_id: str
) -> Path:
    """Plot a bar chart of the read counts of one gene across .csv files.

    Every .csv file directly inside ``dir_path`` is read as a header-less
    table with the columns "Gene, Count". For each file the counts of
    ``gene_id`` are summed (a file without the gene contributes 0) and
    drawn as one bar labelled "Read N", where N is the position of the file
    in sorted order. The figure is saved as an .svg file in a ``plots``
    directory under the current working directory, which is created when
    it does not exist yet.

    Args:
        dir_path: Path of directory containing .csv files of read counts.
        gene_id: Name of gene to plot.

    Returns:
        Path to the saved .svg plot file.

    Raises:
        ValueError: Path passed in does not exist.
        TypeError: Gene ID parameter passed is not a string.
        FileNotFoundError: Path does not contain any .csv files.
        TypeError: A file containing an entry of the chosen gene has a count
            that is not a number. This includes a missing count, which
            pandas reads as NaN.
    """
    LOGGER = logging.getLogger(__name__)

    dir_path = Path(dir_path)
    if not dir_path.exists():
        raise ValueError("Invalid Path")
    if not isinstance(gene_id, str):
        raise TypeError("Invalid gene ID")

    # glob() yields a one-shot generator; sorted() turns it into a list that
    # can be traversed repeatedly and gives a deterministic file order.
    file_paths = sorted(dir_path.glob("*.csv"))
    if not file_paths:
        raise FileNotFoundError("No .csv files found in chosen directory")

    LOGGER.info("Running with %s and gene %s", dir_path, gene_id)

    # Bars are labelled "Read N" rather than with the file stem; according to
    # the original author, file-name labels do not autoscale in pyplot.
    gene_count_dict = {
        f"Read {file_number}": _sum_gene_counts(file_path, gene_id, LOGGER)
        for file_number, file_path in enumerate(file_paths, start=1)
    }
    LOGGER.info('Read complete from %s files', len(file_paths))

    file_names = list(gene_count_dict.keys())
    counts = list(gene_count_dict.values())

    # Draw on a dedicated figure and always close it, so that repeated calls
    # neither pile bars onto a shared global figure nor leak open figures.
    fig, ax = plt.subplots()
    try:
        ax.bar(range(len(counts)), counts, tick_label=file_names, width=0.4)
        ax.set_title('Counts for %s' % (gene_id))
        ax.autoscale(axis='both')

        plot_file_path = Path.cwd() / "plots" / f"{get_file_name()}.svg"
        # savefig does not create missing directories.
        plot_file_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(plot_file_path)
    finally:
        plt.close(fig)

    LOGGER.info("Plot successfully generated and saved at %s", plot_file_path)
    return plot_file_path


def _sum_gene_counts(file_path: Path, gene_id: str, logger: logging.Logger) -> int:
    """Return the summed count of gene_id in one "Gene, Count" .csv file."""
    df_read_counts = pandas.read_csv(str(file_path),
                                     header=None, names=['Gene', 'Count'])
    gene_count = df_read_counts.loc[df_read_counts.Gene == gene_id,
                                    'Count'].tolist()

    if not gene_count:
        logger.warning(
            "File %s.csv does not contain an entry for %s, this is interpreted as 0",
            file_path.stem, gene_id)
        return 0

    # A missing count arrives here as NaN and a textual count as str; int()
    # rejects both with ValueError (OverflowError for infinities).
    try:
        gene_count = [int(count) for count in gene_count]
    except (ValueError, OverflowError) as err:
        raise TypeError("%s.csv contains an entry for %s, but is "
                        "not a number." % (file_path.stem, gene_id)) from err

    if len(gene_count) > 1:
        logger.warning(
            "%s.csv contains more than one entry for chosen gene - these will be summed",
            file_path.stem)
    return sum(gene_count)
def get_file_name(plot_dir: Union[str, Path, None] = None) -> str:
    """Return the first unused plot file name for today, without extension.

    Names have the form ``reads_plot_<yymmdd>_<index>``. The index starts at
    1 and is increased until no file ``<name>.svg`` exists in ``plot_dir``,
    so that an earlier plot from the same day is not overwritten.

    Args:
        plot_dir: Directory that is checked for existing plots. Defaults to
            the ``plots`` directory under the current working directory.
            The directory does not have to exist.

    Returns:
        The file name without the ".svg" suffix.
    """
    plot_dir = Path.cwd() / "plots" if plot_dir is None else Path(plot_dir)
    datetime_str = datetime.now().strftime("%y%m%d")

    file_name_index = 1
    # The existence check must include the .svg suffix, because that is the
    # name under which the plot is actually written to disk.
    while (plot_dir / f"reads_plot_{datetime_str}_{file_name_index}.svg").exists():
        file_name_index += 1
    return f"reads_plot_{datetime_str}_{file_name_index}"
def main():
    """Prompt the user on the command line for a path and a gene to plot.

    Configures INFO-level logging, asks for the directory of .csv files and
    for the gene name on standard input, and passes both on to
    plot_read_counts.
    """
    # Initialize logger
    logging.basicConfig(
        format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
        level=logging.INFO)

    # Prompt user for input. Surrounding whitespace is stripped because a
    # pasted path or gene name with a stray space would not be found.
    csv_path = input("\nPlease enter a path: ").strip()
    gene_of_interest = input("\nPlease enter a gene name to plot: ").strip()

    plot_read_counts(csv_path, gene_of_interest)
if __name__ == "__main__":
main()
GENE1,629
GENE2,803
GENE3,106
GENE4,378
GENE5,728
GENE6,783
GENE7,353
GENE8,747
GENE9,253
GENE10,883
GENE11,552
GENE12,655
GENE13,309
GENE14,133
GENE15,131
GENE16,626
GENE17,669
GENE18,493
GENE19,304
GENE1,830
GENE2,602
GENE3,859
GENE4,174
GENE5,525
GENE6,606
GENE7,851
GENE8,910
GENE9,194
GENE10,415
GENE11,953
GENE12,299
GENE13,127
GENE14,750
GENE15,968
GENE16,282
GENE17,868
GENE18,688
GENE19,650
GENE1,676
GENE2,787
GENE3,876
GENE4,368
GENE5,676
GENE6,896
GENE7,865
GENE8,585
GENE9,853
GENE10,447
GENE11,583
GENE12,573
GENE13,895
GENE14,207
GENE15,562
GENE16,411
GENE17,635
GENE18,469
GENE19,128
Random,Data,for
This,is,a
negative,control,test
some,numbers,
234,,
,432,
,,
32,,
Improper,Data
GENE6,nan
......@@ -4,7 +4,7 @@ GENE3,859
GENE4,174
GENE5,525
GENE6,606
,
GENE7,
GENE8,910
GENE9,194
GENE10,415
......
GENE1,92
GENE2,13
GENE3,73
GENE4,83
GENE5,32
GENE6,136
GENE1,92
GENE2,13
GENE3,73
GENE4,83
GENE5,32
GENE6,136
GENE7,36
\ No newline at end of file
......@@ -12,11 +12,10 @@ class TestPlotReadCounts ():
"dir_path, gene_id, expected",
[
('a', 'GENE1', ValueError),
('Path.home()', 5, TypeError),
(None, None, ValueError),
('Path.home()', 'GENE1', AttributeError),
(Path(str(Path.parent())+"/tests/resources/"), 'GENE25', ValueError),
(Path(f"{Path.cwd()}/tests/"), 'GENE1', FileNotFoundError),
(Path(f"{Path.cwd()}/tests/resources/"), 5, TypeError),
(Path(f"{Path.cwd()}/tests/resources/"), 'GENE6', TypeError),
(Path(f"{Path.cwd()}/tests/resources/"), 'GENE7', TypeError),
]
)
def test_invalid_input(self, dir_path, gene_id, expected):
......@@ -25,13 +24,13 @@ class TestPlotReadCounts ():
plot_read_counts(dir_path, gene_id)
@pytest.mark.parametrize(
    "dir_path, gene_id",
    [
        (Path.cwd() / "tests" / "resources", 'GENE1'),
        (Path.cwd() / "tests" / "resources", 'GENE2'),
        (Path.cwd() / "tests" / "resources", 'GENE3'),
    ]
)
def test_valid_input(self, dir_path, gene_id):
    """Tests that valid input produces a plot file at the returned path."""
    # NOTE(review): the resource directory is resolved against the current
    # working directory, so this presumably expects pytest to be started
    # from the repository root - confirm.
    # Wrapping the result in Path() keeps the check valid whether the
    # function returns a str or a Path.
    assert Path(plot_read_counts(dir_path, gene_id)).exists()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment