Skip to content
Snippets Groups Projects
Commit 125e83ee authored by Michael Zimmermann's avatar Michael Zimmermann Committed by Larissa Glass
Browse files

Minor changes in wording/spelling and added more Doc Strings

parent de3ff883
No related branches found
No related tags found
1 merge request!19Minor changes in wording/spelling and added more Doc Strings
# Synopsis # Synopsis
The human contains countless variety and diversity of cell types, states, and interactions. We wish to understand these tissues and the cell types at much deeper level. Single-cell RNA-seq (scRNA-seq) offers a look into what genes are being expressed at the level of individual cells. Overall this method allows on to identify cell types, find rare or unidentified cell types or states, identify genes that are differently expressed in different cell types, and explore changes in expression whilst including spatial, regulatory, and protein interactions. The human body contains a countless variety and diversity of cell types, states, and interactions. We wish to understand these tissues and the cell types at a much deeper level. Single-cell RNA-seq (scRNA-seq) offers a look into what genes are being expressed at the level of individual cells. Overall this method allows one to identify cell types, find rare or unidentified cell types or states, identify genes that are differently expressed in different cell types, and explore changes in expression whilst including spatial, regulatory, and protein interactions.
We hope that other would find use for this transcript_structure generator that allows one to take input gtf files of specific gene transcripts and outputs a gtf containing intron/exon structures per inputed transcript. We hope that others would find use for this transcript_structure generator that allows one to take input gtf-files of specific gene transcripts and outputs a gtf-file containing intron/exon structures per input transcript. Moreover, one can specify a probability for intron-inclusion which is used to simulate incorrect splicing.
# Installation # Installation
... ...
......
...@@ -13,6 +13,9 @@ def setup_logging(loglevel: str = None) -> None: ...@@ -13,6 +13,9 @@ def setup_logging(loglevel: str = None) -> None:
Raises: Raises:
ValueError: If string that is not a log level is passed, raise error. ValueError: If string that is not a log level is passed, raise error.
Returns:
None
""" """
if loglevel: if loglevel:
numeric_level = getattr(logging, loglevel.upper()) numeric_level = getattr(logging, loglevel.upper())
...@@ -28,6 +31,20 @@ def setup_logging(loglevel: str = None) -> None: ...@@ -28,6 +31,20 @@ def setup_logging(loglevel: str = None) -> None:
def build_arg_parser() -> argparse.ArgumentParser: def build_arg_parser() -> argparse.ArgumentParser:
""" Builds the argument parser.
Args:
1) path to the csv-file with the number of transcripts (str)
2) path to the gtf-file with the annotations for each transcript (str)
3) a value for the probability of intron inclusion (float)
4) a log level (str)
Raises:
None
Returns:
parser
"""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--transcripts", type=str) parser.add_argument("--transcripts", type=str)
parser.add_argument("--annotation", type=str) parser.add_argument("--annotation", type=str)
...@@ -38,8 +55,18 @@ def build_arg_parser() -> argparse.ArgumentParser: ...@@ -38,8 +55,18 @@ def build_arg_parser() -> argparse.ArgumentParser:
def get_args() -> argparse.Namespace: def get_args() -> argparse.Namespace:
parser = build_arg_parser() """Builds a parser and returns its arguments.
Args:
None
Raises:
None
Returns:
arguments for parser
"""
parser = build_arg_parser()
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -73,6 +100,20 @@ def output_filename(filename: str) -> str: ...@@ -73,6 +100,20 @@ def output_filename(filename: str) -> str:
def app(): def app():
"""Gets the args, sets up the logging and starts the program with the provided parameters.
Args:
1) path to the csv-file with the number of transcripts (str)
2) path to the gtf-file with the annotations for each transcript (str)
3) a value for the probability of intron inclusion (float)
4) a log level (str)
Raises:
None
Returns:
None
"""
args = get_args() args = get_args()
setup_logging(args.log) setup_logging(args.log)
... ...
......
...@@ -11,7 +11,7 @@ LOG = logging.getLogger(__name__) ...@@ -11,7 +11,7 @@ LOG = logging.getLogger(__name__)
def read_abundances(transcripts_file: str) -> pd.DataFrame: def read_abundances(transcripts_file: str) -> pd.DataFrame:
"""Read abundance file into dataframe. """Read transcript-abundance file into dataframe.
Args: Args:
transcripts_file (str): Input filename transcripts_file (str): Input filename
...@@ -32,7 +32,7 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame: ...@@ -32,7 +32,7 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame: def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
"""Filter annotations to include only exons with the highest transcript support level (TSL1). """Filter annotations to include only exons with the highest transcript support level, i.e. TSL1.
`feature` column is filtered on value "exon" and `feature` column is filtered on value "exon" and
`free_text` column is filtered to include the string denoting the highest transcript support level `free_text` column is filtered to include the string denoting the highest transcript support level
...@@ -41,7 +41,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame: ...@@ -41,7 +41,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
If a list of transcript IDs is given, `free_text` column is filtered to include one of the IDs. If a list of transcript IDs is given, `free_text` column is filtered to include one of the IDs.
Args: Args:
df: A pd.DataFrame containing an unparsed gtf file df: A pd.DataFrame containing an unparsed gtf-file
transcript: list of transcript IDs transcript: list of transcript IDs
Returns: Returns:
...@@ -63,8 +63,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame: ...@@ -63,8 +63,7 @@ def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
def str_to_dict(s: str) -> dict: def str_to_dict(s: str) -> dict:
"""Split between key/value pairs. """Split between key/value pairs.
Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value pairs.
pairs.
Remove quotes from value strings and create a dictionary. Remove quotes from value strings and create a dictionary.
Args: Args:
...@@ -109,9 +108,9 @@ def dict_to_str(d: dict) -> str: ...@@ -109,9 +108,9 @@ def dict_to_str(d: dict) -> str:
def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame: def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
"""Reverse parsing of gtf based pd.DataFrame to include only columns that are well defined by gtf file standards. """Reverse parsing of gtf based pd.DataFrame to include only columns that are well defined by gtf file standards.
The first 8 defined columns are constant as defined by gtf file standards The first 8 defined columns are constant as defined by gtf file standards.
Further columns are assumed to be parsed free text columns (see Gtf.parse_free_text()). Further columns are assumed to be parsed free-text columns (see Gtf.parse_free_text()).
The parsed free text columns are aggregated as a dictionary and the dictionary is parsed as a string in gtf format. The parsed free-text columns are aggregated as a dictionary and the dictionary is parsed as a string in gtf format.
Args: Args:
df_all: A pd.DataFrame containing a parsed gtf file. df_all: A pd.DataFrame containing a parsed gtf file.
...@@ -119,11 +118,11 @@ def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame: ...@@ -119,11 +118,11 @@ def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
Returns: Returns:
A DataFrame with the columns as defined by gtf file standards. A DataFrame with the columns as defined by gtf file standards.
""" """
# Define pd.DataFrame containing only parsed free text columns # Define pd.DataFrame containing only parsed free-text columns
df_free_text = df_all.iloc[:, 8:] df_free_text = df_all.iloc[:, 8:]
# Define pd.DataFrame containing only non-parsed columns # Define pd.DataFrame containing only non-parsed columns
df = df_all.iloc[:, :8] df = df_all.iloc[:, :8]
# Reverse parsing of free text columns and add the result as column `free_text` to output pd.DataFrame # Reverse parsing of free-text columns and add the result as column `free_text` to output pd.DataFrame
df["free_text"] = df_free_text.agg(pd.Series.to_dict, axis=1).apply(dict_to_str) df["free_text"] = df_free_text.agg(pd.Series.to_dict, axis=1).apply(dict_to_str)
return df return df
...@@ -236,7 +235,7 @@ class Gtf: ...@@ -236,7 +235,7 @@ class Gtf:
def parse_free_text(self): def parse_free_text(self):
"""Parse key/value pairs from `free_text` column into column `key` with row entry `value`. """Parse key/value pairs from `free_text` column into column `key` with row entry `value`.
Creates a dataframe with columns for keys in the free text column instead of `free_text` column. Creates a dataframe with columns for keys in the free-text column instead of `free_text` column.
Saves it to Gtf.df attribute. Saves it to Gtf.df attribute.
""" """
assert self.parsed == False assert self.parsed == False
...@@ -254,7 +253,7 @@ class Gtf: ...@@ -254,7 +253,7 @@ class Gtf:
def reverse_parse_free_text(self): def reverse_parse_free_text(self):
"""Reverses parsing of `free_text` column. """Reverses parsing of `free_text` column.
Creates a data frame that can be written in gtf format to file. Parsed free text columns are aggregated Creates a data frame that can be written in gtf format to file. Parsed free-text columns are aggregated
into `free_text` column according to gtf format specification. into `free_text` column according to gtf format specification.
""" """
assert self.parsed == True assert self.parsed == True
... ...
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment