Minor changes in wording/spelling and added more Doc Strings

125e83ee · Michael Zimmermann · Larissa Glass · de3ff883 · 125e83ee · 125e83ee
Commit 125e83ee authored Nov 21, 2022 by Michael Zimmermann Committed by Larissa Glass Nov 21, 2022
--- a/README.md
+++ b/README.md
 # Synopsis
-The human contains countless variety and diversity of cell types, states, and interactions. We wish to understand these tissues and the cell types at much deeper level. Single-cell RNA-seq (scRNA-seq) offers a look into what genes are being expressed at the level of individual cells. Overall this method allows on to identify cell types, find rare or unidentified cell types or states, identify genes that are differently expressed in different cell types, and explore changes in expression whilst including spatial, regulatory, and protein interactions. 
+The human body contains a countless variety and diversity of cell types, states, and interactions. We wish to understand these tissues and the cell types at a much deeper level. Single-cell RNA-seq (scRNA-seq) offers a look into what genes are being expressed at the level of individual cells. Overall this method allows one to identify cell types, find rare or unidentified cell types or states, identify genes that are differentially expressed in different cell types, and explore changes in expression whilst including spatial, regulatory, and protein interactions.
-We hope that other would find use for this transcript_structure generator that allows one to take input gtf files of specific gene transcripts and outputs a gtf  containing intron/exon structures per inputed transcript. 
+We hope that others will find use for this transcript_structure generator, which takes input gtf-files of specific gene transcripts and outputs a gtf-file containing the intron/exon structure of each input transcript. Moreover, one can specify a probability of intron inclusion, which is used to simulate incorrect splicing.
 # Installation


--- a/tsg/cli.py
+++ b/tsg/cli.py
@@ -13,6 +13,9 @@ def setup_logging(loglevel: str = None) -> None:
    Raises:
        ValueError: If string that is not a log level is passed, raise error.
+    Returns:
+        None
    """
    if loglevel:
        numeric_level = getattr(logging, loglevel.upper())
@@ -28,6 +31,20 @@ def setup_logging(loglevel: str = None) -> None:
 def build_arg_parser() -> argparse.ArgumentParser:
+    """ Builds the argument parser.
+    Args:
+        1) path to the csv-file with the number of transcripts (str)
+        2) path to the gtf-file with the annotations for each transcript (str)
+        3) a value for the probability of intron inclusion (float)
+        4) the log level (str)
+    Raises:
+        None  
+    Returns:
+        parser  
+    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--transcripts", type=str)
    parser.add_argument("--annotation", type=str)
@@ -38,8 +55,18 @@ def build_arg_parser() -> argparse.ArgumentParser:
 def get_args() -> argparse.Namespace:
-    parser = build_arg_parser()
+    """Builds a parser and returns its arguments.
+    Args:
+        None
+    Raises:
+        None 
+    Returns:
+        arguments for parser   
+    """
+    parser = build_arg_parser()
    args = parser.parse_args()
    return args
@@ -73,6 +100,20 @@ def output_filename(filename: str) -> str:
 def app():
+    """Gets the args, sets up the logging and starts the program with the provided parameters.
+    Args: 
+        1) path to the csv-file with the number of transcripts (str)
+        2) path to the gtf-file with the annotations for each transcript (str)
+        3) a value for the probability of intron inclusion (float)
+        4) the log level (str)
+    Raises:
+        None  
+    Returns:
+        None  
+    """
    args = get_args()
    setup_logging(args.log)


--- a/tsg/main.py
+++ b/tsg/main.py
@@ -11,7 +11,7 @@ LOG = logging.getLogger(__name__)
 def read_abundances(transcripts_file: str) -> pd.DataFrame:
-    """Read abundance file into dataframe.
+    """Read transcript-abundance file into dataframe.
    Args:
        transcripts_file (str): Input filename
@@ -32,7 +32,7 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
 def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
-    """Filter annotations to include only exons with the highest transcript support level (TSL1).
+    """Filter annotations to include only exons with the highest transcript support level, i.e. TSL1.
    `feature` column is filtered on value "exon" and
    `free_text` column is filtered to include the string denoting the highest transcript support level
@@ -41,7 +41,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
    If a list of transcript IDs is given, `free_text` column is filtered to include one of the IDs.
    Args:
-        df: A pd.DataFrame containing an unparsed gtf file
+        df: A pd.DataFrame containing an unparsed gtf-file
        transcripts: list of transcript IDs
    Returns:
@@ -63,8 +63,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
 def str_to_dict(s: str) -> dict:
    """Split between key/value pairs.
-    Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value
+    Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value pairs.
-    pairs.
    Remove quotes from value strings and create a dictionary.
    Args:
@@ -109,9 +108,9 @@ def dict_to_str(d: dict) -> str:
 def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
    """Reverse parsing of gtf based pd.DataFrame to include only columns that are well defined by gtf file standards.
-    The first 8 defined columns are constant as defined by gtf file standards
+    The first 8 defined columns are constant as defined by gtf file standards.
-    Further columns are assumed to be parsed free text columns (see Gtf.parse_free_text()).
+    Further columns are assumed to be parsed free-text columns (see Gtf.parse_free_text()).
-    The parsed free text columns are aggregated as a dictionary and the dictionry is parsed as a string in gtf format.
+    The parsed free-text columns are aggregated as a dictionary and the dictionary is parsed as a string in gtf format.
    Args:
        df_all: A pd.DataFrame containing a parsed gtf file.
@@ -119,11 +118,11 @@ def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
    Returns:
        A DataFrame with the columns as defined by gtf file standards.
    """
-    # Define pd.DataFrame containing only parsed free text columns
+    # Define pd.DataFrame containing only parsed free-text columns
    df_free_text = df_all.iloc[:, 8:]
    # Define pd.DataFrame containing only non-parsed columns
    df = df_all.iloc[:, :8]
-    # Reverse parsing of free text columns and add the result as column `free_text` to output pd.DataFrame
+    # Reverse parsing of free-text columns and add the result as column `free_text` to output pd.DataFrame
    df["free_text"] = df_free_text.agg(pd.Series.to_dict, axis=1).apply(dict_to_str)
    return df
@@ -236,7 +235,7 @@ class Gtf:
    def parse_free_text(self):
        """Parse key/value pairs from `free_text` column into column `key` with row entry `value`.
-        Creates a dataframe with columns for keys in the free text column instead of `free_text` column.
+        Creates a dataframe with columns for keys in the free-text column instead of `free_text` column.
        Saves it to Gtf.df attribute.
        """
        assert self.parsed == False
@@ -254,7 +253,7 @@ class Gtf:
    def reverse_parse_free_text(self):
        """Reverses parsing of `free_text` column.
-        Creates a data frame that can be written in gtf format to file. Parsed free text columns are aggregated
+        Creates a data frame that can be written in gtf format to file. Parsed free-text columns are aggregated
        into `free_text` column according to gtf format specification.
        """
        assert self.parsed == True