Madan Mukundan · 8de4aacd · 79401641 · 8de4aacd
--- a/src/plot_read_counts.py 0 → 100644

+ 118

− 0
+++ b/src/plot_read_counts.py 0 → 100644

+ 118

− 0
+"""Plots read counts of gene expression data."""
+import logging
+from pathlib import Path
+from typing import Union
+import pandas
+import matplotlib.pyplot as plt
+"""Given a GeneID, construct the histogram of read counts per cell."""
+def _gene_count_in_file(file_path, gene_id, logger):
+    """Return the summed read count of ``gene_id`` in one .csv file.
+
+    The file is read with ``header=None``, so its first line is treated as
+    data and the two columns are named 'Gene' and 'Count'. A gene that is
+    absent, or whose count cell is empty, contributes 0 (with a warning).
+    """
+    df_read_counts = pandas.read_csv(
+        str(file_path), header=None, names=['Gene', 'Count'])
+    counts = df_read_counts.loc[
+        df_read_counts.Gene == gene_id, 'Count'].tolist()
+    if not counts:
+        logger.warning(
+            "%s.csv does not contain read of chosen gene, "
+            "this is interpreted as 0", file_path.stem)
+        return 0
+    if len(counts) > 1:
+        logger.warning(
+            "%s.csv contains more than one entry for chosen gene "
+            "- these will be summed", file_path.stem)
+    # An empty count cell is parsed by pandas as NaN; left alone it would turn
+    # the sum into NaN and slip past the "gene not found" check in the caller.
+    if any(pandas.isna(count) for count in counts):
+        logger.warning(
+            "File %s contains an entry for this gene but is missing a count "
+            "number, this is interpreted as 0", file_path.stem)
+        counts = [0 if pandas.isna(count) else count for count in counts]
+    return sum(counts)
+
+
+def plot_read_counts(
+    dir_path: Union[str, Path, None] = None,
+    gene_id: Union[str, None] = None
+) -> bool:
+    """Plot the read counts of one gene across the .csv files of a directory.
+
+    Every ``*.csv`` file directly inside ``dir_path`` is read as
+    "GeneID, Counts" rows. The counts recorded for ``gene_id`` are summed per
+    file and shown as a bar plot with one bar per file, labelled by file stem.
+
+    Args:
+        dir_path: Path of directory containing .csv files of read counts.
+        gene_id: Name of gene to plot.
+
+    Returns:
+        True once the plot has been shown.
+
+    Raises:
+        ValueError: The path does not exist, the directory holds no .csv
+            files, or the summed count of the gene over all files is 0
+            (reported as "Gene not found").
+        TypeError: Gene ID is not a string, or ``dir_path`` is a value that
+            ``Path()`` rejects (e.g. it was left as None).
+    """
+    # Initialize logger
+    logging.basicConfig(
+        format='[%(asctime)s: %(levelname)s] "%(module)s" %(message)s',
+        level=logging.INFO)
+    logger = logging.getLogger(__name__)
+    try:
+        dir_path = Path(dir_path)
+        if not dir_path.exists():
+            logger.critical("Invalid path")
+            raise ValueError("Invalid Path")
+        if not isinstance(gene_id, str):
+            logger.critical("Invalid gene ID, please enter as string")
+            raise TypeError("Invalid gene ID")
+        # glob() yields a one-shot generator; sorted() materialises it into a
+        # list and gives a stable bar order.
+        file_paths = sorted(dir_path.glob("*.csv"))
+        if not file_paths:
+            logger.warning('No csv files found in current directory, exiting')
+            # ValueError keeps the exception type callers already received
+            # for an empty directory, now with an accurate message.
+            raise ValueError("No .csv files found in chosen directory")
+        logger.info("Run with %s and gene %s", dir_path, gene_id)
+        gene_count_dict = {
+            file_path.stem: _gene_count_in_file(file_path, gene_id, logger)
+            for file_path in file_paths
+        }
+    except AttributeError as err:
+        # Kept for backward compatibility: any AttributeError raised while
+        # validating or reading is reported to callers as a ValueError.
+        logger.fatal("Invalid selection, please verify input")
+        raise ValueError(
+            "Either the path, gene ID, or both were not properly specified"
+        ) from err
+    if sum(gene_count_dict.values()) == 0:
+        logger.warning(
+            "Gene not found -> Please check for correct spelling or ID"
+        )
+        raise ValueError("Gene not found")
+    logger.info('Read complete from %s files', len(file_paths))
+    file_names = list(gene_count_dict.keys())
+    counts = list(gene_count_dict.values())
+    # create and show bar plot of read counts for chosen gene
+    plt.bar(range(len(gene_count_dict)), counts, tick_label=file_names)
+    plt.title('Counts for %s' % (gene_id))
+    plt.show()
+    return True