From 77ae9776f7e4f4ea652f8aa03077dbdb4723e737 Mon Sep 17 00:00:00 2001
From: Larissa Glass <larissa.glass@unibas.ch>
Date: Tue, 29 Nov 2022 15:00:17 +0000
Subject: [PATCH] Milestone Review

---
 .flake8            |   3 +
 .pylintrc          |   5 ++
 Dockerfile         |   6 +-
 README.md          |  14 ++++-
 environment.yml    |   1 +
 pyproject.toml     |   2 +-
 tests/test_main.py |   2 +-
 tsg/__init__.py    |   5 +-
 tsg/__main__.py    |   3 +-
 tsg/cli.py         | 104 ++++++++++++++++--------------
 tsg/main.py        | 154 ++++++++++++++++++++++++++------------------
 11 files changed, 181 insertions(+), 118 deletions(-)
 create mode 100644 .flake8
 create mode 100644 .pylintrc

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..1d48b94
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 120
+docstring-convention = google
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..f6b4eac
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,5 @@
+[FORMAT]
+max-line-length=120
+
+[BASIC]
+good-names=df, i, fh, id, s, d
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index be953d5..4d99edb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,11 @@
-FROM python:3-slim
+FROM python:3.9-slim-buster
 
 WORKDIR /usr/src/app
 
-COPY README.md requirements.txt pyproject.toml ./
+COPY README.md pyproject.toml ./
 ADD tsg ./tsg
 ADD data ./data
 
 RUN pip install .
 
-CMD [ "transcript-generator", "--transcripts", "data/transcripts_short.csv", "--annotation", "data/Homo_sapiens.GRCh38.103.chr.gtf", "--prob_inclusion", "0.05" ]
\ No newline at end of file
+ENTRYPOINT [ "transcript-generator" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index b336096..d520651 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ To generate the sampled transcripts, open a new shell, activate your environmen
 
 ```
 conda activate transcript-structure-generator
-transcript-generator --transcripts <transcripts_file> --annotation <annotations_file> --prob_inclusion=<probability_inclusion> [--log "INFO"]
+transcript-generator --prob-inclusion <probability_inclusion> [--log "INFO"] <transcripts_file> <annotations_file>
 ```
 
 where the transcripts file should be csv-formatted, the annotation file gtf-formatted and the inclusion probability for introns a float in the range [0,1]. The log parameter is optional and can be one of `["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"]`. The default is `INFO`.
@@ -53,6 +53,18 @@ To perform all tests, make sure your environment corresponds to the `environmen
 pytest tests
 ```
 
+To build the Docker image, run
+
+```
+docker build -t transcript-app .
+```
+
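+As a quick smoke test, asking the container for its help text should print the usage message; the image's entry point is `transcript-generator`, and `argparse` provides `--help` automatically:
+
+```
+docker run transcript-app --help
+```
+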
+Afterwards, you can use the Docker image like so
+
+```
+docker run transcript-app --prob-inclusion <probability_inclusion> [--log "INFO"] <transcripts_file> <annotations_file>
+```
+
 # License
 
 MIT license, Copyright (c) 2021 Zavolan Lab, Biozentrum, University of Basel
diff --git a/environment.yml b/environment.yml
index ac26257..bf74421 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,4 +12,5 @@ dependencies:
   - mypy
   - flake8
   - pytest
+  - pylint
   - coverage
diff --git a/pyproject.toml b/pyproject.toml
index 1214e21..6cb72f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tsg"
-version = "0.1.0"
+version = "0.2.0"
 authors = [
   { name="Michael Zimmermann", email="michael.zimmermann@unibas.ch" },
   { name="Andri Fraenkl", email="andri.fraenkl@unibas.ch" },
diff --git a/tests/test_main.py b/tests/test_main.py
index 48b9b55..49aaeea 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -62,7 +62,7 @@ class TestGtf:
     def test_parsed(self):
         annotations = Gtf()
         annotations.read_file("tests/resources/Annotation1.gtf")
-        annotations.parse_free_text()
+        annotations.parse_key_value()
 
         assert annotations.parsed == True
         assert set(annotations.free_text_columns) == set(
diff --git a/tsg/__init__.py b/tsg/__init__.py
index 229aaf5..75a11c4 100644
--- a/tsg/__init__.py
+++ b/tsg/__init__.py
@@ -1,4 +1,7 @@
 """Transcript structure generator package."""
+import importlib.metadata
 
+from tsg.main import Gtf, TranscriptGenerator, sample_transcripts
 
-__version__ = '0.0.0'
+__version__ = importlib.metadata.version("tsg")
+__all__ = ["Gtf", "TranscriptGenerator", "sample_transcripts"]
diff --git a/tsg/__main__.py b/tsg/__main__.py
index cd76d30..b4fac98 100644
--- a/tsg/__main__.py
+++ b/tsg/__main__.py
@@ -1,4 +1,5 @@
+"""Main module."""
 from tsg.cli import app
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/tsg/cli.py b/tsg/cli.py
index ee82182..63996d9 100644
--- a/tsg/cli.py
+++ b/tsg/cli.py
@@ -1,3 +1,4 @@
+"""Command line interface."""
 import argparse
 import logging
 from pathlib import Path
@@ -9,7 +10,7 @@ def setup_logging(loglevel: str = None) -> None:
     """Set up logging. Loglevel can be one of ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"].
 
     Args:
-        loglevel (str, optional): Level of log output. Defaults to None.
+        loglevel: Level of log output.
 
     Raises:
         ValueError: If string that is not a log level is passed, raise error.
@@ -17,12 +18,15 @@ def setup_logging(loglevel: str = None) -> None:
     Returns:
         None
     """
+    # set default log level
+    numeric_level = getattr(logging, "INFO")
+
     if loglevel:
-        numeric_level = getattr(logging, loglevel.upper())
-        if not isinstance(numeric_level, int):
-            raise ValueError("Invalid log level: %s" % loglevel)
-    else:
-        numeric_level = getattr(logging, "INFO")
+        try:
+            numeric_level = getattr(logging, loglevel.upper())
+        except AttributeError as err:
+            # re-raise as the ValueError documented above
+            raise ValueError(f"Invalid log level: {loglevel}") from err
 
     logging.basicConfig(
         format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
@@ -31,44 +35,50 @@ def setup_logging(loglevel: str = None) -> None:
 
 
-def build_arg_parser() -> argparse.ArgumentParser:
-    """ Builds the argument parser.
+def build_arg_parser() -> argparse.Namespace:
+    """Builds the argument parser and parses the command line.
     Args:
-        1) path to the csv-file with the number of transcripts (str)
-        2) path to the gtf-file with the annotations for each transcript (str)
-        3) a value for the probability of intron inclusion (float)
-        4) a log message (str)
-
+        1) path to the csv-file with the number of transcripts
+        2) path to the gtf-file with the annotations for each transcript
+        3) a value for the probability of intron inclusion
+        4) a log message
+
     Raises:
-        None
+        None
 
     Returns:
-        parser
+        Parsed command-line arguments.
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument("--transcripts", type=str)
-    parser.add_argument("--annotation", type=str)
-    parser.add_argument("--prob_inclusion", type=float)
-    parser.add_argument("--log", type=str)
-
-    return parser
-
-
-def get_args() -> argparse.Namespace:
-    """Builds a parser and returns its arguments.
-
-    Args:
-        None
-
-    Raises:
-        None
+    parser.add_argument(
+        "transcripts",
+        type=str,
+        help="Path to csv file with number of transcripts (ID,Count).",
+    )
+    parser.add_argument(
+        "annotation",
+        type=str,
+        help="Path to gtf-file with exon annotation.",
+    )
+    parser.add_argument(
+        "-p",
+        "--prob-inclusion",
+        type=float,
+        default=0.05,
+        help="Probability of intron inclusion.",
+    )
+    parser.add_argument(
+        "--log",
+        type=str,
+        default="INFO",
+        help='Level of logging. Can be one of ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"]',
+    )
 
-    Returns:
-        arguments for parser
-    """
-    parser = build_arg_parser()
     args = parser.parse_args()
 
+    assert 0 <= args.prob_inclusion <= 1, "prob_inclusion must be in [0, 1]"
+
     return args
 
 
@@ -76,7 +86,7 @@ def output_filename(filename: str) -> str:
     """Generate output filename for given input filename.
 
     Args:
-        filename (str): Input filename
+        filename: Input filename
 
     Raises:
         NotImplementedError: Only accept filetypes .csv, .tsv and .gtf.
@@ -86,7 +96,7 @@ def output_filename(filename: str) -> str:
         str: Output filename
     """
     filepath = Path(filename)
-    if filepath.suffix == ".csv" or filepath.suffix == ".tsv":
+    if filepath.suffix in (".csv", ".tsv"):
         outfile = "generated_" + filepath.stem + ".csv"
     elif filepath.suffix == ".gtf":
         outfile = "generated_" + filepath.name
@@ -95,26 +105,26 @@ def output_filename(filename: str) -> str:
 
     if Path(outfile).exists():
         raise FileExistsError(f"The output file {outfile} already exists.")
-    
+
     return outfile
 
 
 def app():
-    """Gets the args, sets up the logging and starts the programm with the provided parameters.
+    """Gets the args, sets up the logging and starts the program with the provided parameters.
 
-    Args: 
+    Args:
         1) path to the csv-file with the number of transcripts
         2) path to the gtf-file with the annotations for each transcript
         3) a value for the probability of intron inclusion
         4) a log message
+
     Raises:
-        None 
+        None
 
     Returns:
-        None 
+        None
     """
-    args = get_args()
+    args = build_arg_parser()
     setup_logging(args.log)
 
     sample_transcripts(
         args.transcripts,
         args.annotation,
         args.prob_inclusion,
         output_filename(args.transcripts),
         output_filename(args.annotation),
-    )
\ No newline at end of file
+    )
diff --git a/tsg/main.py b/tsg/main.py
index e7bd9db..4ad774f 100644
--- a/tsg/main.py
+++ b/tsg/main.py
@@ -14,10 +14,10 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
     """Read transcript-abundance file into dataframe.
     Args:
-        transcripts_file (str): Input filename
+        transcripts_file: Input filename
 
     Returns:
-        pd.DataFrame: Transcript abundances ("id", "count")
+        A pd.DataFrame with the transcript abundances ("id", "count").
 
     Raises:
         ValueError: When the input file is neither csv or tsv
     """
     cols: list = ["id", "count"]
     if transcripts_file.endswith(".tsv"):
         return pd.read_table(transcripts_file, header=None, names=cols)
-    elif transcripts_file.endswith(".csv"):
+    if transcripts_file.endswith(".csv"):
         return pd.read_csv(transcripts_file, header=None, names=cols)
-    else:
-        raise ValueError("File type needs to be either csv or tsv")
+    raise ValueError("File type needs to be either csv or tsv")
 
 
-def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
+def filter_df(df: pd.DataFrame, transcripts: list = None) -> pd.DataFrame:
     """Filter annotations to include only exons with the highest transcript support level, i.e. TSL1.
 
     `feature` column is filtered on value "exon" and
@@ -48,6 +47,8 @@ def filter_df(df: pd.DataFrame, transcripts: list = None) -> pd.DataFrame:
         A pd.DataFrame containing only rows with exon annotations of highest transcript support level and,
         if provided, belonging to one of the given transcripts
     """
+    if transcripts is None:
+        transcripts = []
     df_filter = df[
         (df["feature"] == "exon")
         & (df["free_text"].str.contains('transcript_support_level "1"'))
@@ -63,8 +64,8 @@ def str_to_dict(s: str) -> dict:
     """Split between key/value pairs.
 
-    Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value pairs.
-    Remove quotes from value strings and create a dictionary.
+    Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into
+    key/value pairs. Remove quotes from value strings and create a dictionary.
 
     Args:
         s: A string of the form 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'
@@ -97,7 +98,7 @@ def dict_to_str(d: dict) -> str:
     # join key, value pairs from dictionary with a space in a list,
     # then join items in list by ;
     # end on ;
-    # check if value is nan
+    # value == value checks that value is not nan
     s: str = (
         "; ".join([f'{key} "{value}"' for key, value in d.items() if value == value])
         + ";"
     )
 
 
 def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
-    """Reverse parsing of gtf based pd.DataFrame to include only columns that are well defnined by gtf file standards.
+    """Reverse parsing of gtf based pd.DataFrame to include only columns that are well defined by gtf-file standards.
 
-    The first 8 defined columns are constant as defined by gtf file standards.
-    Further columns are assumed to be parsed free-text columns (see Gtf.parse_free_text()).
-    The parsed free-text columns are aggregated as a dictionary and the dictionry is parsed as a string in gtf format.
+    The first 8 defined columns are constant as defined by gtf-file standards.
+    Further columns are assumed to be parsed free-text columns (see Gtf.parse_key_value()).
+    The parsed free-text columns are aggregated as a dictionary and the dictionary is parsed as a string in gtf format.
 
     Args:
-        df_all: A pd.DataFrame containing a parsed gtf file.
+        df_all: A pd.DataFrame containing a parsed gtf-file.
 
     Returns:
-        A DataFrame with the columns as defined by gtf file standards.
+        A pd.DataFrame with the columns as defined by gtf-file standards.
""" # Define pd.DataFrame containing only parsed free-text columns df_free_text = df_all.iloc[:, 8:] @@ -133,8 +134,8 @@ def write_gtf(df: pd.DataFrame, filename: str) -> None: Makes sure data types are correct and saves object in gtf format. Args: - df: A pd.DataFrame containing a gtf file. - Filename: File to save to. + df: A pd.DataFrame containing a gtf-file. + filename: File to save to. """ # Make sure the data types are correct. df = df.astype(Gtf.dtypes) @@ -156,7 +157,7 @@ def write_header(annotations_file: str) -> None: Args: annotations_file: Filename to write header to. """ - with open(annotations_file, "w") as fh: + with open(annotations_file, "w", encoding="utf_8") as fh: fh.write("\t".join(Gtf.dtypes.keys()) + "\n") @@ -184,14 +185,15 @@ class Gtf: def __init__(self): """Initialize Gtf object.""" + self.df = None self.parsed = False self.original_columns = list(self.dtypes.keys()) self.free_text_columns = [] def read_file(self, annotations_file: str) -> None: - """Read gtf file. + """Read gtf-file. - Iterate over chunks of the gtf file reading 100000 rows at a time. Filter chunks for exon annotations of + Iterate over chunks of the gtf-file reading 100000 rows at a time. Filter chunks for exon annotations of the highest transcript support level. Concatenate chunks to get resulting pd.DataFrame. Args: @@ -219,11 +221,11 @@ class Gtf: Part of initialization is: Set dataframe attribute - Check which columns belong to the free-text part of the GTF-file. + Check which columns belong to the free-text part of the gtf-file. Check if there are no columns called free-text and if so, sets the value of parsed attribute to TRUE. Args: - df: pd.DataFrame + df: A pd.DataFrame containing a gtf-file. """ self.free_text_columns = [ col for col in df.columns if col not in self.original_columns @@ -232,13 +234,13 @@ class Gtf: if "free_text" not in df.columns: self.parsed = True - def parse_free_text(self): + def parse_key_value(self): """Parse key/value pairs from `free_text` column into column `key` with row entry `value`. Creates a dataframe with columns for keys in the free-text column instead of `free_text` column. Saves it to Gtf.df attribute. """ - assert self.parsed == False + assert self.parsed is False # create dataframe with columns for values in free_text column df_free_text = self.df["free_text"].map(str_to_dict).apply(pd.Series) # remember which columns come from free_text @@ -256,7 +258,7 @@ class Gtf: Creates a data frame that can be written in gtf format to file. Parsed free-text columns are aggregated into `free_text` column according to gtf format specification. """ - assert self.parsed == True + assert self.parsed is True # create dataframe with only free_text columns df_free_text = self.df[self.free_text_columns] # filter current dataframe to only original columns, except "free_text" column @@ -276,6 +278,32 @@ class Gtf: class TranscriptGenerator: """Class to sample a transcript.""" + def __new__( + cls, + transcript_id: str, + transcript_count: int, + transcript_df: pd.DataFrame, + prob_inclusion: float, + ): + """Initialize TranscriptGenerator object.""" + strands = transcript_df["strand"].unique() + if len(transcript_df) == 0: + LOG.warning( + "Transcript %s can't be sampled. Annotation is missing", transcript_id + ) + instance = None + elif len(strands) > 1: + LOG.warning( + "Transcript %s can't be sampled. 
+        else:
+            instance = super().__new__(cls)
+
+        return instance
+
     def __init__(
         self,
         transcript_id: str,
         transcript_count: int,
         transcript_df: pd.DataFrame,
         prob_inclusion: float,
     ):
         """Initialize TranscriptGenerator object."""
-        assert len(transcript_df) > 0
-        assert transcript_count > 0
-        assert (prob_inclusion >= 0) and (prob_inclusion <= 1)
-
         self.id = transcript_id
         self.count = transcript_count
         self.df = transcript_df
@@ -301,7 +325,7 @@ class TranscriptGenerator:
 
         Each column corresponds to one sample and the number of columns corresponds to the number of samples.
 
         Returns:
-            inclusion_arr: A boolean np.array, where True means intron inclusion.
+            A boolean np.array, where True means intron inclusion.
         """
         inclusion_arr = np.random.rand(self.no_exons, self.count) < self.prob_inclusion
         if self.strand == "+":
@@ -317,10 +341,9 @@ class TranscriptGenerator:
 
         Args:
 
         Returns:
-            names: List of names for generated exons.
-            inclusion_arr_unique: A boolean np.array where columns correspond to generated transcripts and rows to
-                intron inclusion.
-            counts: A np.array containing sample number per generated inclusions, i.e. transcript.
+            - List of names for generated transcripts.
+            - A boolean np.array where columns correspond to generated transcripts and rows to intron inclusion.
+            - A np.array containing the sample count per generated transcript.
         """
         inclusion_arr = self._get_inclusions()
         # Unique intron inclusion arrays and counts
@@ -330,7 +353,7 @@
         # Name for each generated transcript
         names = []
         for i in range(inclusion_arr_unique.shape[1]):
-            if np.all(inclusion_arr_unique[:, i] == False, axis=0):
+            if not inclusion_arr_unique[:, i].any():
                 names.append(self.id)
             else:
                 names.append(f"{self.id}_{i}")
@@ -341,26 +364,26 @@ class TranscriptGenerator:
         """Take as input a dataframe filtered to one transcript and a boolean vector denoting intron inclusions.
 
         Args:
-            inclusions (np.array): A boolean vector denoting intron inclusion.
-            transcript_id (str): The transcript id.
+            inclusions: A boolean vector denoting intron inclusion.
+            transcript_id: The transcript id.
 
         Returns:
             The generated transcript as a pd.DataFrame.
         """
         df_generated = self.df.copy()
         if self.strand == "+":
-            origninal_end = df_generated["end"]
+            original_end = df_generated["end"]
             df_generated["end"] = np.where(
                 inclusions,
                 df_generated["start"].shift(periods=-1, fill_value=-1) - 1,
-                origninal_end,
+                original_end,
             )
         if self.strand == "-":
-            origninal_start = df_generated["start"]
+            original_start = df_generated["start"]
             df_generated["start"] = np.where(
                 inclusions,
                 df_generated["end"].shift(periods=-1, fill_value=-1) + 1,
-                origninal_start,
+                original_start,
             )
 
         original_id = df_generated["exon_id"]
@@ -373,39 +396,36 @@
         df_generated["transcript_id"] = transcript_id
         return df_generated
 
-    def generate_transcripts(self, filename: str) -> None:
+    def write_sequences(self, filename: str) -> None:
         """Write transcripts to file.
 
         Args:
-            filename (str): Output csv filename.
+            filename: Output csv filename.
""" - ids, inclusions, counts = self._get_unique_inclusions() - with open(filename, "a") as fh: + ids, _, counts = self._get_unique_inclusions() + with open(filename, "a", encoding="utf_8") as fh: for transcript_id, transcript_count in zip(ids, counts): fh.write(f"{transcript_id},{self.id},{transcript_count}\n") - def generate_annotations(self, filename: str) -> None: + def write_annotations(self, filename: str) -> None: """Generate a annotations in gtf format for sampled transcript. Args: - Filename: Output gtf filename. + Filename: Output gtf-filename. Raises: ValueError: If given transcript ID could not be sampled. """ - ids, inclusions, counts = self._get_unique_inclusions() + ids, inclusions, _ = self._get_unique_inclusions() n_unique = len(ids) - try: - df = pd.concat( - [self._get_df(inclusions[:, i], ids[i]) for i in range(n_unique)] - ) - df = reverse_parse_free_text(df) + df = pd.concat( + [self._get_df(inclusions[:, i], ids[i]) for i in range(n_unique)] + ) + df = reverse_parse_free_text(df) - write_gtf(df, filename) - LOG.debug(f"Transcript {self.id} sampled") - except ValueError: - LOG.error(f"Transcript {self.id} could not be sampled.") + write_gtf(df, filename) + LOG.debug("Transcript %s sampled", self.id) def sample_transcripts( @@ -418,18 +438,23 @@ def sample_transcripts( """Read input files, iterate over transcript IDs, sample each transcript and save results. Args: - input_transcripts_file (str): Filename of transcript abundances, needs to be csv or tsv. - input_annotations_file (str): Filename of annotations, needs to be gtf. - prob_inclusion (float): Probability of intron inclusion, needs to be float in range [0,1]. - output_transcripts_file (str): Filename of file to write sampled transcripts to. - output_annotations_file (str): Filename of file to write generated annotations to. + input_transcripts_file: Filename of transcript abundances, needs to be csv or tsv. + input_annotations_file: Filename of annotations, needs to be gtf. + prob_inclusion: Probability of intron inclusion, needs to be float in range [0,1]. + output_transcripts_file: Filename of file to write sampled transcripts to. + output_annotations_file: Filename of file to write generated annotations to. """ + LOG.info("Probability of intron inclusion: %s", str(prob_inclusion)) + LOG.info("Parsing transcript abundances...") transcripts = read_abundances(input_transcripts_file) + # Make sure that abundance is positive + transcripts = transcripts[transcripts["count"] > 0] + LOG.info("Done parsing...") LOG.info("Parsing annotations...") annotations = Gtf() annotations.read_file(input_annotations_file) - annotations.parse_free_text() + annotations.parse_key_value() LOG.info("Done parsing...") LOG.info("Start sampling transcripts...") @@ -447,6 +472,9 @@ def sample_transcripts( transcript_df, prob_inclusion=prob_inclusion, ) - transcripts.generate_annotations(output_annotations_file) - transcripts.generate_transcripts(output_transcripts_file) + try: + transcripts.write_annotations(output_annotations_file) + transcripts.write_sequences(output_transcripts_file) + except AttributeError: + pass LOG.info("Done.") -- GitLab