From c279468e880f63b780d96a8fc86ea933c304732a Mon Sep 17 00:00:00 2001
From: Madan Mukundan <madan.mukundan@gmail.com>
Date: Sat, 18 Dec 2021 15:55:50 +0100
Subject: [PATCH] build: unencapsulated working version with tests

---
 src/plot_read_counts.py                | 256 +++++++++++++++++++++++++
 tests/resources/Transcript2.csv        |  20 +++
 tests/resources/Transcript3.csv        |  19 +++
 tests/resources/Transcript4.csv        |  19 +++
 tests/resources/Transcript5.csv        |  19 +++
 tests/resources/Transcript6.csv        |  19 +++
 tests/resources/Transcript7negctrl.csv |   8 +
 tests/test_plot_read_counts.py         |  32 ++++
 8 files changed, 392 insertions(+)
 create mode 100644 src/plot_read_counts.py
 create mode 100644 tests/resources/Transcript2.csv
 create mode 100644 tests/resources/Transcript3.csv
 create mode 100644 tests/resources/Transcript4.csv
 create mode 100644 tests/resources/Transcript5.csv
 create mode 100644 tests/resources/Transcript6.csv
 create mode 100644 tests/resources/Transcript7negctrl.csv
 create mode 100644 tests/test_plot_read_counts.py

diff --git a/src/plot_read_counts.py b/src/plot_read_counts.py
new file mode 100644
index 0000000..20af36a
--- /dev/null
+++ b/src/plot_read_counts.py
@@ -0,0 +1,256 @@
+"""Plots read counts of gene expression data.
+
+Given a gene ID, construct the histogram of read counts per cell.
+"""
+
+import logging
+import sys
+from pathlib import Path
+from tkinter import Tk, ttk, filedialog, StringVar
+from typing import List, Optional, Union
+
+import pandas
+import matplotlib.pyplot as plt
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _find_csv_files(dir_path: Path) -> List[Path]:
+    """Return the .csv files directly inside dir_path, sorted by path."""
+    # glob() yields a one-shot generator; sorted() materialises it as a list.
+    return sorted(dir_path.glob("*.csv"))
+
+
+def _read_gene_count(file_path: Path, gene_id: str) -> Optional[float]:
+    """Sum the counts recorded for gene_id in a single .csv file.
+
+    Args:
+        file_path: Headerless .csv file formatted "GeneID, Counts".
+        gene_id: Name of gene to look up (case sensitive).
+
+    Returns:
+        The summed count, or None if the file has no row for the gene.
+        Blank or non-numeric count entries are interpreted as 0.
+    """
+    try:
+        df_read_counts = pandas.read_csv(
+            str(file_path), header=None, names=['Gene', 'Count'])
+    except pandas.errors.EmptyDataError:
+        LOGGER.warning(
+            "%s.csv is empty, this is interpreted as 0", file_path.stem)
+        return None
+
+    matches = df_read_counts.loc[df_read_counts.Gene == gene_id, 'Count']
+
+    if matches.empty:
+        LOGGER.warning(
+            "%s.csv does not contain read of chosen gene, "
+            "this is interpreted as 0", file_path.stem)
+        return None
+
+    if len(matches) > 1:
+        LOGGER.warning(
+            "%s.csv contains more than one entry for chosen gene "
+            "- these will be summed", file_path.stem)
+
+    # Coerce unreadable entries to NaN so they are reported and replaced
+    # by 0 instead of silently turning the plotted total into NaN.
+    counts = pandas.to_numeric(matches, errors='coerce')
+    if counts.isna().any():
+        LOGGER.warning(
+            "File %s contains an entry for this gene but is missing a "
+            "count number, this is interpreted as 0", file_path.stem)
+
+    return sum(counts.fillna(0).tolist())
+
+
+def plot_read_counts(
+    dir_path: Union[str, Path] = None,
+    gene_id: str = None
+) -> None:
+    """Plots histogram of gene ID and read counts from .csv files.
+
+    This function reads .csv files from a directory formatted
+    "GeneID, Counts" and plots a histogram of read counts for a
+    chosen geneID. Each file contributes one bar; a file that lacks
+    the gene contributes a count of 0.
+
+    Args:
+        dir_path: Path of directory containing .csv files of read counts.
+        gene_id: Name of gene to plot.
+
+    Raises:
+        ValueError: No value is passed for path and/or gene ID.
+        ValueError: Path passed in does not exist.
+        ValueError: Gene not found in selection of .csv files.
+        TypeError: Gene ID parameter passed is not a String.
+        AttributeError: No .csv files in chosen directory
+    """
+    # Initialize logger (no-op if the root logger is already configured)
+    logging.basicConfig(
+        format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
+        level=logging.INFO)
+
+    # Path(None) raises TypeError, so missing arguments are rejected first
+    if dir_path is None or gene_id is None:
+        LOGGER.critical("Invalid selection, please verify input")
+        raise ValueError(
+            "Either the path, gene ID, or both were not properly specified")
+
+    dir_path = Path(dir_path)
+    if not dir_path.exists():
+        LOGGER.critical("Invalid path")
+        raise ValueError("Invalid Path")
+
+    if not isinstance(gene_id, str):
+        LOGGER.critical("Invalid gene ID, please enter as string")
+        raise TypeError("Invalid gene ID")
+
+    # identify csv files from directory, formatted ['Gene', 'Count']
+    file_paths = _find_csv_files(dir_path)
+    if not file_paths:
+        LOGGER.warning('No csv files found in current directory, exiting')
+        raise AttributeError("No .csv files found in chosen directory")
+
+    LOGGER.info("Run with %s and gene %s", dir_path, gene_id)
+
+    gene_count_dict = {}
+    gene_found = False
+    for file_path in file_paths:
+        gene_count = _read_gene_count(file_path, gene_id)
+        if gene_count is None:
+            gene_count = 0
+        else:
+            gene_found = True
+        gene_count_dict[file_path.stem] = gene_count
+
+    # Track presence explicitly: a gene whose counts are all 0 is still found
+    if not gene_found:
+        LOGGER.warning(
+            "Gene not found -> Please check for correct spelling or ID")
+        raise ValueError("Gene not found")
+
+    LOGGER.info('Read complete from %s files', len(gene_count_dict))
+
+    # create and show bar plot of read counts for chosen gene
+    file_names = list(gene_count_dict.keys())
+    counts = list(gene_count_dict.values())
+    plt.bar(range(len(counts)), counts, tick_label=file_names)
+    plt.title('Counts for %s' % gene_id)
+    plt.show()
+
+
+def prc_dialog() -> None:
+    """Creates a GUI to select directory and enter gene.
+
+    After the window is closed, plots the read counts if both a
+    directory and a gene ID were confirmed; otherwise exits the program.
+    """
+    # Create dialog window
+    dialog = Tk()
+    dialog.title("Plot Gene Read Counts")
+    dialog.resizable(False, False)
+    dialog.geometry("380x210+20+20")
+    dialog.configure(background="black")
+
+    # Create and attach welcome message to GUI
+    wlcom_text = "Welcome to Plot Read Counts "
+    wlcom_msg = ttk.Label(dialog,
+                          text=wlcom_text,
+                          font=("Helvetica", 12, "bold"),
+                          foreground="white",
+                          background="black",
+                          padding=5)
+    wlcom_msg.grid(column=1, row=0, columnspan=2, pady=5)
+
+    # Create and attach instruction message to GUI
+    inst_msg = ttk.Label(dialog,
+                         text="Select a directory of .csv files and a gene ID to plot",
+                         font=("Helvetica", 12),
+                         foreground="white",
+                         background="black",
+                         padding=5)
+    inst_msg.grid(column=1, row=1, columnspan=2, padx=10, pady=5)
+
+    # Create and attach directory selection elements
+    # Either globals are needed or all functions
+    # need to be encapsulated
+    global dir_str_var
+    dir_str_var = StringVar()
+    dir_lbl = ttk.Label(dialog,
+                        textvariable=dir_str_var,
+                        font=("Courier", 10, "bold"),
+                        background="black",
+                        foreground="green")
+    dir_lbl.grid(column=1, row=4, padx=5, pady=1, sticky="e")
+    dir_btn = ttk.Button(dialog, text="Select Directory",
+                         command=get_dir)
+    dir_btn.grid(column=1, row=6, padx=5, pady=1)
+
+    # Create and attach gene selection elements
+    global gene_lbl
+    gene_lbl = StringVar()
+    gene_entry = ttk.Entry(dialog)
+    gene_entry.grid(column=2, row=5, pady=1)
+    # Pass the parent explicitly instead of relying on Tk's default root
+    gene_entry_lbl = ttk.Label(dialog,
+                               anchor="center",
+                               textvariable=gene_lbl,
+                               font=("Courier", 10, "bold"),
+                               background="black",
+                               foreground="green")
+    gene_entry_lbl.grid(column=2, row=4, pady=1)
+    gene_entry_btn = ttk.Button(dialog,
+                                text="Confirm Gene (Case Sensitive)",
+                                command=lambda: get_gene_id(gene_entry))
+    gene_entry_btn.grid(column=2, row=6, padx=2, pady=2)
+
+    # Create and attach confirm and close element
+    kill_btn = ttk.Button(dialog, text="Exit and Plot",
+                          command=dialog.destroy)
+    kill_btn.grid(column=1, row=8, columnspan=2, padx=5, pady=15)
+
+    dialog.mainloop()
+
+    if 'gui_path' in globals() and 'gui_gene_id' in globals():
+        plot_read_counts(gui_path, gui_gene_id)
+    else:
+        sys.exit()
+
+
+def get_dir() -> None:
+    """Support function to retrieve dir from GUI.
+
+    Stores the chosen directory in the global gui_path and shows a
+    shortened form of it in the dialog. A cancelled dialog is ignored.
+    """
+    global gui_path
+    selected = filedialog.askdirectory(
+        title='Please select the directory containing gene reads')
+    # askdirectory returns an empty value on cancel; Path('') would
+    # silently become the current working directory
+    if not selected:
+        return
+    gui_path = Path(selected)
+    short_dir = ("%s/.../%s/" % (gui_path.drive, gui_path.name))
+    dir_str_var.set(short_dir)
+
+
+def get_gene_id(gene_entry: ttk.Entry) -> None:
+    """Support function to retrieve gene ID from GUI.
+
+    Args:
+        gene_entry: Entry widget holding the gene ID typed by the user.
+    """
+    global gui_gene_id
+    gui_gene_id = gene_entry.get()
+    gene_lbl.set(gui_gene_id)
+
+
+def main() -> None:
+    """Main opens GUI for dir and gene ID selection."""
+    prc_dialog()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/resources/Transcript2.csv b/tests/resources/Transcript2.csv
new file mode 100644
index 0000000..2a3dc0c
--- /dev/null
+++ b/tests/resources/Transcript2.csv
@@ -0,0 +1,20 @@
+GENE1,624
+GENE2,895
+GENE3,870
+GENE4,428
+GENE5,449
+GENE6,696
+GENE7,544
+GENE8,338
+GENE9,159
+GENE10,267
+GENE11,268
+GENE12,879
+GENE13,556
+GENE14,613
+GENE15,629
+GENE16,116
+GENE17,762
+GENE18,67
+GENE19,405
+GENE20,3
diff --git a/tests/resources/Transcript3.csv b/tests/resources/Transcript3.csv
new file mode 100644
index 0000000..2473e67
--- /dev/null
+++ b/tests/resources/Transcript3.csv
@@ -0,0 +1,19 @@
+GENE1,830
+GENE2,602
+GENE3,859
+GENE4,174
+GENE5,525
+GENE6,606
+,
+GENE8,910
+GENE9,194
+GENE10,415
+GENE11,953
+GENE12,299
+GENE13,127
+GENE14,750
+GENE15,968
+GENE16,282
+GENE17,868
+GENE18,688
+GENE19,650
diff --git a/tests/resources/Transcript4.csv b/tests/resources/Transcript4.csv
new file mode 100644
index 0000000..ff000bd
--- /dev/null
+++ b/tests/resources/Transcript4.csv
@@ -0,0 +1,19 @@
+GENE1,629
+GENE2,803
+GENE3,106
+GENE4,378
+GENE5,728
+GENE6,783
+GENE7,353
+GENE8,747
+GENE9,253
+GENE10,883
+GENE11,552
+GENE12,655
+GENE13,309
+GENE14,133
+GENE15,131
+GENE16,626
+GENE17,669
+GENE18,493
+GENE19,304
diff --git a/tests/resources/Transcript5.csv b/tests/resources/Transcript5.csv
new file mode 100644
index 0000000..34613ca
--- /dev/null
+++ b/tests/resources/Transcript5.csv
@@ -0,0 +1,19 @@
+GENE1,830
+GENE2,602
+GENE3,859
+GENE4,174
+GENE5,525
+GENE6,606
+GENE7,851
+GENE8,910
+GENE9,194
+GENE10,415
+GENE11,953
+GENE12,299
+GENE13,127
+GENE14,750
+GENE15,968
+GENE16,282
+GENE17,868
+GENE18,688
+GENE19,650
diff --git a/tests/resources/Transcript6.csv b/tests/resources/Transcript6.csv
new file mode 100644
index 0000000..794d1c1
--- /dev/null
+++ b/tests/resources/Transcript6.csv
@@ -0,0 +1,19 @@
+GENE1,676
+GENE2,787
+GENE3,876
+GENE4,368
+GENE5,676
+GENE6,896
+GENE7,865
+GENE8,585
+GENE9,853
+GENE10,447
+GENE11,583
+GENE12,573
+GENE13,895
+GENE14,207
+GENE15,562
+GENE16,411
+GENE17,635
+GENE18,469
+GENE19,128
diff --git a/tests/resources/Transcript7negctrl.csv b/tests/resources/Transcript7negctrl.csv
new file mode 100644
index 0000000..72cb6bf
--- /dev/null
+++ b/tests/resources/Transcript7negctrl.csv
@@ -0,0 +1,8 @@
+Random,Data,for
+This,is,a
+negative,control,test
+some,numbers,
+234,,
+,432,
+,,
+32,,
diff --git a/tests/test_plot_read_counts.py b/tests/test_plot_read_counts.py
new file mode 100644
index 0000000..add8572
--- /dev/null
+++ b/tests/test_plot_read_counts.py
@@ -0,0 +1,32 @@
+"""Tests for the plot_read_counts module."""
+
+from pathlib import Path
+
+import pytest
+
+from src.plot_read_counts import plot_read_counts
+
+
+class TestPlotReadCounts:
+    """Checks that invalid arguments raise the documented exceptions."""
+
+    @pytest.mark.parametrize(
+        "dir_path, gene_id, expected",
+        [
+            ('a', 'GENE1', ValueError),
+            (Path.home(), 5, TypeError),
+            (None, None, ValueError),
+        ]
+    )
+    def test_invalid_input(self, dir_path, gene_id, expected):
+        """Bad path, bad gene ID type and missing values are rejected."""
+        with pytest.raises(expected):
+            plot_read_counts(dir_path, gene_id)
+
+    def test_directory_without_csv(self, tmp_path):
+        """An existing directory holding no .csv files is rejected."""
+        with pytest.raises(AttributeError):
+            plot_read_counts(tmp_path, 'GENE1')
+
+    def validate_input(self):
+        """Placeholder; not collected by pytest until it is implemented."""
-- 
GitLab