From c279468e880f63b780d96a8fc86ea933c304732a Mon Sep 17 00:00:00 2001
From: Madan Mukundan <madan.mukundan@gmail.com>
Date: Sat, 18 Dec 2021 15:55:50 +0100
Subject: [PATCH] build: unencapsulated working version with tests

---
 src/plot_read_counts.py                | 218 +++++++++++++++++++++++++
 tests/resources/Transcript2.csv        |  20 +++
 tests/resources/Transcript3.csv        |  19 +++
 tests/resources/Transcript4.csv        |  19 +++
 tests/resources/Transcript5.csv        |  19 +++
 tests/resources/Transcript6.csv        |  19 +++
 tests/resources/Transcript7negctrl.csv |   8 +
 tests/test_plot_read_counts.py         |  32 ++++
 8 files changed, 354 insertions(+)
 create mode 100644 src/plot_read_counts.py
 create mode 100644 tests/resources/Transcript2.csv
 create mode 100644 tests/resources/Transcript3.csv
 create mode 100644 tests/resources/Transcript4.csv
 create mode 100644 tests/resources/Transcript5.csv
 create mode 100644 tests/resources/Transcript6.csv
 create mode 100644 tests/resources/Transcript7negctrl.csv
 create mode 100644 tests/test_plot_read_counts.py

diff --git a/src/plot_read_counts.py b/src/plot_read_counts.py
new file mode 100644
index 0000000..20af36a
--- /dev/null
+++ b/src/plot_read_counts.py
@@ -0,0 +1,218 @@
+"""Plots read counts of gene expression data."""
+
+import logging
+import sys
+from pathlib import Path
+from tkinter import Tk, ttk, filedialog, StringVar
+from typing import Union
+
+import pandas
+import matplotlib.pyplot as plt
+
+"""Given a GeneID, construct the histogram of read counts per cell."""
+
+
+def plot_read_counts(
+    dir_path: Union[str, Path] = None,
+    gene_id: str = None
+) -> None:
+    """Plot a bar chart of one gene's read counts across .csv files.
+
+    Every ``*.csv`` file directly inside ``dir_path`` is read as a
+    header-less table with the columns "Gene, Count". The counts found
+    for ``gene_id`` are summed per file and drawn as one bar per file,
+    labelled with the file name without its extension.
+
+    Args:
+        dir_path: Path of directory containing .csv files of read counts.
+        gene_id: Name of gene to plot; matched exactly (case sensitive).
+
+    Raises:
+        ValueError: No value is passed for the path.
+        ValueError: Path passed in does not exist.
+        ValueError: No .csv files in chosen directory.
+        ValueError: Gene not found (summed count over all files is 0).
+        TypeError: Gene ID parameter passed is not a string.
+    """
+    # Initialize logger (basicConfig is a no-op once root has handlers)
+    logging.basicConfig(
+        format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
+        level=logging.INFO)
+    logger = logging.getLogger(__name__)
+
+    # Path(None) raises TypeError, so a missing path is rejected explicitly
+    # to honour the documented ValueError.
+    if dir_path is None:
+        logger.critical("Invalid selection, please verify input")
+        raise ValueError(
+            "Either the path, gene ID, or both were not properly specified")
+
+    dir_path = Path(dir_path)
+
+    if not dir_path.exists():
+        logger.critical("Invalid path")
+        raise ValueError("Invalid Path")
+
+    if not isinstance(gene_id, str):
+        logger.critical("Invalid gene ID, please enter as string")
+        raise TypeError("Invalid gene ID")
+
+    # identify csv files from directory, formatted ['Gene', 'Count'];
+    # glob() returns a one-shot generator, so it is materialised into a
+    # sorted list that can be both tested for emptiness and iterated
+    file_paths = sorted(dir_path.glob("*.csv"))
+
+    # a list never compares equal to 0, so emptiness is tested by truthiness
+    if not file_paths:
+        logger.warning('No csv files found in current directory, exiting')
+        raise ValueError("No .csv files found in chosen directory")
+
+    logger.info("Run with %s and gene %s", dir_path, gene_id)
+
+    gene_count_dict = {}
+
+    for file_path in file_paths:
+        # read in csv here and transform to pandas dataframe
+        df_read_counts = pandas.read_csv(str(file_path),
+                                         header=None, names=['Gene', 'Count'])
+
+        # find gene_id within dataframe
+        gene_count = df_read_counts.loc[df_read_counts.Gene == gene_id,
+                                        'Count'].tolist()
+
+        if len(gene_count) > 1:
+            logger.warning(
+                "%s.csv contains more than one entry for chosen gene"
+                " - these will be summed", file_path.stem)
+
+        elif not gene_count:
+            logger.warning(
+                "%s.csv does not contain read of chosen gene,"
+                " this is interpreted as 0", file_path.stem)
+
+        # in case count entry is missing pandas yields NaN for that row;
+        # it is interpreted as 0 so it cannot turn the file total into NaN
+        if any(pandas.isna(count) for count in gene_count):
+            logger.warning(
+                "File %s contains an entry for this gene but is missing"
+                " a count number, this is interpreted as 0", file_path.stem)
+            gene_count = [0 if pandas.isna(count) else count
+                          for count in gene_count]
+
+        # sum() of an empty list is 0, which covers the "gene absent" case
+        gene_count_dict[file_path.stem] = sum(gene_count)
+
+    if sum(gene_count_dict.values()) == 0:
+        logger.warning(
+            "Gene not found -> Please check for correct spelling or ID"
+        )
+        raise ValueError("Gene not found")
+
+    logger.info('Read complete from %s files', len(file_paths))
+
+    # one bar per file, labelled with the file stem
+    file_names = list(gene_count_dict.keys())
+    counts = list(gene_count_dict.values())
+
+    # create and show bar plot of read counts for chosen gene
+    plt.bar(range(len(gene_count_dict)), counts, tick_label=file_names)
+    plt.title('Counts for %s' % (gene_id))
+    plt.show()
+
+
+def prc_dialog() -> None:
+    """Create a GUI to select a directory and a gene ID, then plot them.
+
+    Blocks until the window is closed; plots only if both a directory and
+    a gene ID were confirmed, otherwise exits through sys.exit().
+    """
+    # Create dialog window
+    dialog = Tk()
+    dialog.title("Plot Gene Read Counts")
+    dialog.resizable(False, False)
+    dialog.geometry("380x210+20+20")
+    dialog.configure(background="black")
+
+    # Create and attach welcome message to GUI
+    wlcom_msg = ttk.Label(dialog,
+                          text="Welcome to Plot Read Counts ",
+                          font=("Helvetica", 12, "bold"),
+                          foreground="white",
+                          background="black",
+                          padding=5)
+    wlcom_msg.grid(column=1, row=0, columnspan=2, pady=5)
+
+    # Create and attach instruction message to GUI
+    inst_msg = ttk.Label(dialog,
+                         text="Select a directory of .csv files and a gene ID to plot",
+                         font=("Helvetica", 12),
+                         foreground="white",
+                         background="black",
+                         padding=5)
+    inst_msg.grid(column=1, row=1, columnspan=2, padx=10, pady=5)
+
+    # Create and attach directory selection elements; the StringVars are
+    # globals because the button callbacks live outside this function
+    global dir_str_var
+    dir_str_var = StringVar()
+    dir_lbl = ttk.Label(dialog,
+                        textvariable=dir_str_var,
+                        font=("Courier", 10, "bold"),
+                        background="black",
+                        foreground="green")
+    dir_lbl.grid(column=1, row=4, padx=5, pady=1, sticky="e")
+    dir_btn = ttk.Button(dialog, text="Select Directory", command=get_dir)
+    dir_btn.grid(column=1, row=6, padx=5, pady=1)
+
+    # Create and attach gene selection elements
+    global gene_lbl
+    gene_lbl = StringVar()
+    gene_entry = ttk.Entry(dialog)
+    gene_entry.grid(column=2, row=5, pady=1)
+    gene_entry_lbl = ttk.Label(dialog, anchor="center",
+                               textvariable=gene_lbl,
+                               font=("Courier", 10, "bold"),
+                               background="black",
+                               foreground="green")
+    gene_entry_lbl.grid(column=2, row=4, pady=1)
+    gene_entry_btn = ttk.Button(dialog, text="Confirm Gene (Case Sensitive)",
+                                command=lambda: get_gene_id(gene_entry))
+    gene_entry_btn.grid(column=2, row=6, padx=2, pady=2)
+
+    # Create and attach confirm and close element
+    kill_btn = ttk.Button(dialog, text="Exit and Plot", command=dialog.destroy)
+    kill_btn.grid(column=1, row=8, columnspan=2, padx=5, pady=15)
+
+    dialog.mainloop()
+
+    # the callbacks create these globals only after a selection was made
+    if 'gui_path' in globals() and 'gui_gene_id' in globals():
+        plot_read_counts(gui_path, gui_gene_id)
+    else:
+        sys.exit()
+
+
+def get_dir() -> None:
+    """Ask for a directory and store it in ``gui_path``; no-op on cancel."""
+    global gui_path
+    chosen = filedialog.askdirectory(
+        title='Please select the directory containing gene reads')
+    if chosen:  # askdirectory returns an empty value when cancelled
+        gui_path = Path(chosen)
+        dir_str_var.set("%s/.../%s/" % (gui_path.drive, gui_path.name))
+
+
+def get_gene_id(gene_entry: ttk.Entry) -> None:
+    """Store the entry's text in ``gui_gene_id`` and echo it in the GUI."""
+    global gui_gene_id
+    gui_gene_id = gene_entry.get()  # read once so global and label agree
+    gene_lbl.set(gui_gene_id)
+
+
+def main() -> None:
+    """Entry point: launch the directory / gene ID selection dialog."""
+    prc_dialog()
+
+
+if __name__ == "__main__":
+    main()  # only runs when executed as a script, not on import
diff --git a/tests/resources/Transcript2.csv b/tests/resources/Transcript2.csv
new file mode 100644
index 0000000..2a3dc0c
--- /dev/null
+++ b/tests/resources/Transcript2.csv
@@ -0,0 +1,20 @@
+GENE1,624
+GENE2,895
+GENE3,870
+GENE4,428
+GENE5,449
+GENE6,696
+GENE7,544
+GENE8,338
+GENE9,159
+GENE10,267
+GENE11,268
+GENE12,879
+GENE13,556
+GENE14,613
+GENE15,629
+GENE16,116
+GENE17,762
+GENE18,67
+GENE19,405
+GENE20,3
diff --git a/tests/resources/Transcript3.csv b/tests/resources/Transcript3.csv
new file mode 100644
index 0000000..2473e67
--- /dev/null
+++ b/tests/resources/Transcript3.csv
@@ -0,0 +1,19 @@
+GENE1,830
+GENE2,602
+GENE3,859
+GENE4,174
+GENE5,525
+GENE6,606
+,
+GENE8,910
+GENE9,194
+GENE10,415
+GENE11,953
+GENE12,299
+GENE13,127
+GENE14,750
+GENE15,968
+GENE16,282
+GENE17,868
+GENE18,688
+GENE19,650
diff --git a/tests/resources/Transcript4.csv b/tests/resources/Transcript4.csv
new file mode 100644
index 0000000..ff000bd
--- /dev/null
+++ b/tests/resources/Transcript4.csv
@@ -0,0 +1,19 @@
+GENE1,629
+GENE2,803
+GENE3,106
+GENE4,378
+GENE5,728
+GENE6,783
+GENE7,353
+GENE8,747
+GENE9,253
+GENE10,883
+GENE11,552
+GENE12,655
+GENE13,309
+GENE14,133
+GENE15,131
+GENE16,626
+GENE17,669
+GENE18,493
+GENE19,304
diff --git a/tests/resources/Transcript5.csv b/tests/resources/Transcript5.csv
new file mode 100644
index 0000000..34613ca
--- /dev/null
+++ b/tests/resources/Transcript5.csv
@@ -0,0 +1,19 @@
+GENE1,830
+GENE2,602
+GENE3,859
+GENE4,174
+GENE5,525
+GENE6,606
+GENE7,851
+GENE8,910
+GENE9,194
+GENE10,415
+GENE11,953
+GENE12,299
+GENE13,127
+GENE14,750
+GENE15,968
+GENE16,282
+GENE17,868
+GENE18,688
+GENE19,650
diff --git a/tests/resources/Transcript6.csv b/tests/resources/Transcript6.csv
new file mode 100644
index 0000000..794d1c1
--- /dev/null
+++ b/tests/resources/Transcript6.csv
@@ -0,0 +1,19 @@
+GENE1,676
+GENE2,787
+GENE3,876
+GENE4,368
+GENE5,676
+GENE6,896
+GENE7,865
+GENE8,585
+GENE9,853
+GENE10,447
+GENE11,583
+GENE12,573
+GENE13,895
+GENE14,207
+GENE15,562
+GENE16,411
+GENE17,635
+GENE18,469
+GENE19,128
diff --git a/tests/resources/Transcript7negctrl.csv b/tests/resources/Transcript7negctrl.csv
new file mode 100644
index 0000000..72cb6bf
--- /dev/null
+++ b/tests/resources/Transcript7negctrl.csv
@@ -0,0 +1,8 @@
+Random,Data,for
+This,is,a
+negative,control,test
+some,numbers,
+234,,
+,432,
+,,
+32,,
diff --git a/tests/test_plot_read_counts.py b/tests/test_plot_read_counts.py
new file mode 100644
index 0000000..add8572
--- /dev/null
+++ b/tests/test_plot_read_counts.py
@@ -0,0 +1,32 @@
+"""Tests for the plot_read_counts module."""
+
+import pytest
+from pathlib import Path
+from src.plot_read_counts import plot_read_counts
+
+class TestPlotReadCounts:
+    """Input-validation tests for plot_read_counts."""
+
+    @pytest.mark.parametrize(
+        "dir_path, gene_id, expected",
+        [
+            # A relative path that does not exist is rejected.
+            ('a', 'GENE1', ValueError),
+            # Missing arguments are documented to raise ValueError.
+            (None, None, ValueError),
+        ]
+    )
+    def test_invalid_input(self, dir_path, gene_id, expected):
+        """Invalid path arguments raise the documented exception."""
+        with pytest.raises(expected):
+            plot_read_counts(dir_path, gene_id)
+
+    def test_non_string_gene_id(self, tmp_path):
+        """An existing directory with a non-string gene ID raises TypeError."""
+        with pytest.raises(TypeError):
+            plot_read_counts(tmp_path, 5)
+
+    def test_directory_without_csv(self, tmp_path):
+        """A directory holding no .csv files cannot yield the gene."""
+        with pytest.raises(ValueError):
+            plot_read_counts(tmp_path, 'GENE1')
\ No newline at end of file
-- 
GitLab