From 30ca7f20184b6c9e804fc89d27ca2864ddbfbb9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch> Date: Mon, 4 Sep 2023 17:07:48 +0200 Subject: [PATCH] refactor: remove tqdm, update cli and main --- requirements.txt | 1 - requirements_dev.txt | 1 - tsg/cli.py | 4 +- tsg/main.py | 93 ++++++++++++++++++++++++-------------------- 4 files changed, 53 insertions(+), 46 deletions(-) diff --git a/requirements.txt b/requirements.txt index bf93cc4..babdd14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ matplotlib pandas -tqdm diff --git a/requirements_dev.txt b/requirements_dev.txt index 558b809..f973661 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,7 +1,6 @@ matplotlib pandas pip -tqdm mypy flake8 flake8-docstrings diff --git a/tsg/cli.py b/tsg/cli.py index fcc0e71..30a2dec 100644 --- a/tsg/cli.py +++ b/tsg/cli.py @@ -30,8 +30,8 @@ def setup_logging(loglevel: str) -> None: raise logging.basicConfig( - format='[%(asctime)s: %(levelname)s] \ - %(message)s (module "%(module)s")', + format=('[%(asctime)s: %(levelname)s] ' + '%(message)s (module "%(module)s")'), level=numeric_level, ) diff --git a/tsg/main.py b/tsg/main.py index 8307b79..d2c1bb6 100644 --- a/tsg/main.py +++ b/tsg/main.py @@ -4,8 +4,6 @@ import logging import numpy as np import pandas as pd # type: ignore -from tqdm import tqdm # type: ignore - LOG = logging.getLogger(__name__) @@ -31,15 +29,17 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame: def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame: - """Filter annotations to include only exons \ - with the highest transcript support level, i.e. TSL1. + """Filter dataframe. + + Filter annotations to include only exons + with the highest transcript support level, i.e. TSL1. 
`feature` column is filtered on value "exon" and - `free_text` column is filtered to include the string \ + `free_text` column is filtered to include the string denoting the highest transcript support level ('transcript_support_level "1"'). - If a list of transcript IDs is given, `free_text` column \ + If a list of transcript IDs is given, `free_text` column is filtered to include one of the IDs. Args: @@ -47,7 +47,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame: transcript: list of transcript IDs Returns: - A pd.DataFrame containing only rows with exon annotations \ + A pd.DataFrame containing only rows with exon annotations of highest transcript support level and, if provided, belonging to one of the given transcripts """ @@ -68,7 +68,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame: def str_to_dict(gene_string: str) -> dict: """Split between key/value pairs. - Split string based on delimiter ';' into items, remove empty items and \ + Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value pairs. Remove quotes from value strings and create a dictionary. @@ -92,9 +92,9 @@ def dict_to_str(gene_dict: dict) -> str: Takes e.g. dictionary {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'} and returns string 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'. - Key/value pairs are joined by space to form an item and items are \ + Key/value pairs are joined by space to form an item and items are joinded by ';' to form a string. - If a value is Not a Number (nan), the key/value pair is omitted \ + If a value is Not a Number (nan), the key/value pair is omitted from the string. Args: @@ -115,13 +115,15 @@ def dict_to_str(gene_dict: dict) -> str: def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame: - """Reverse parsing of gtf based pd.DataFrame to include only columns that \ - are well defnined by gtf-file standards. 
+ """Reverse parse a gtf based pd.DataFrame. + + The data frame will include only columns that + are well defined by gtf-file standards. The first 8 defined columns are constant as defined by gtf-file standards. - Further columns are assumed to be parsed free-text columns \ + Further columns are assumed to be parsed free-text columns (see Gtf.parse_free_text()). - The parsed free-text columns are aggregated as a dictionary and \ + The parsed free-text columns are aggregated as a dictionary and the dictionry is parsed as a string in gtf format. Args: @@ -165,8 +167,9 @@ def write_gtf(gtf_df: pd.DataFrame, filename: str) -> None: def write_header(annotations_file: str) -> None: - """Write the header of an annotations file, consisting of the \ - tab delimited column names. + """Write the header of an annotations file. + + It consists of the tab delimited column names. Args: annotations_file: Filename to write header to. @@ -182,7 +185,7 @@ class Gtf: dtypes: A dictionary containing column names and respective data types. parsed: A boolean indicating if the pd.DataFrame is parsed. original_columns: A list of columns not touched by parsing. - free_text_columns: A list of columns created during parsing \ + free_text_columns: A list of columns created during parsing of column `free_text`. """ @@ -240,7 +243,7 @@ class Gtf: Part of initialization is: Set dataframe attribute Check which columns belong to the free-text part of the gtf-file. - Check if there are no columns called free-text and if so, sets \ + Check if there are no columns called free-text and if so, sets the value of parsed attribute to TRUE. Args: @@ -254,11 +257,11 @@ class Gtf: self.parsed = True def parse_key_value(self): - """Parse key/value pairs from `free_text` column into column `key` \ - with row entry `value`. + """Parse key/value pairs. - Creates a dataframe with columns for keys in the free-text column \ - instead of `free_text` column.
+ From `free_text` column into column `key` with row entry `value`. + Creates a dataframe with columns for keys in the free-text column + instead of `free_text` column. Saves it to Gtf.df attribute. """ assert self.parsed is False @@ -316,15 +319,15 @@ class TranscriptGenerator: strands = transcript_df["strand"].unique() if len(transcript_df) == 0: LOG.warning( - "Transcript %s can't be sampled. " + "Transcript \"%s\" can't be sampled: " "Annotation is missing or TSL is not 1.", transcript_id ) instance = None elif len(strands) > 1: LOG.warning( - "Transcript %s can't be sampled. Transcript generator is " + "Transcript \"%s\" can't be sampled: Transcript generator is " "not implemented for transcripts with exons annotated on " - "different strands", transcript_id, + "different strands.", transcript_id, ) instance = None else: @@ -350,8 +353,8 @@ class TranscriptGenerator: def get_inclusions(self) -> np.ndarray: """Generate inclusions array. - Each column corresponds to one sample and the number of columns \ - corresponds to the number of samples. + Each column corresponds to one sample and the number of columns + corresponds to the number of samples. Returns: A boolean np.array, where True means intron inclusion. @@ -367,16 +370,18 @@ class TranscriptGenerator: return inclusion_arr def get_unique_inclusions(self) -> tuple[list, np.ndarray, np.ndarray]: - """Inclusion of unique intron inclusion via arrays and counts and \ - name generation of each unique count. + """Get unique inclusions. + + Inclusion of unique intron inclusion via arrays and counts and + name generation of each unique count. Args: Returns: - List of names for generated exons. - - A boolean np.array where columns correspond to generated \ + - A boolean np.array where columns correspond to generated transcripts and rows to intron inclusion. - - A np.array containing sample number per generated inclusions, \ + - A np.array containing sample number per generated inclusions, i.e. transcript. 
""" inclusion_arr = self.get_inclusions() @@ -397,8 +402,10 @@ class TranscriptGenerator: def get_df( self, inclusions: np.ndarray, transcript_id: str ) -> pd.DataFrame: - """Take as input a dataframe filtered to one transcript and \ - a boolean vector denoting intron inclusions. + """Get dataframe. + + Take as input a dataframe filtered to one transcript and + a boolean vector denoting intron inclusions. Args: inclusions: A boolean vector denoting intron inclusion. @@ -466,7 +473,7 @@ class TranscriptGenerator: data_frame = reverse_parse_free_text(data_frame) write_gtf(data_frame, filename) - LOG.debug("Transcript %s sampled", self.ts_id) + LOG.debug("Transcript \"%s\" sampled.", self.ts_id) def sample_transcripts( @@ -476,19 +483,21 @@ def sample_transcripts( output_transcripts_file: str, output_annotations_file: str, ): - """Read input files, iterate over transcript IDs, \ - sample each transcript and save results. + """Sample transcripts. + + Read input files, iterate over transcript IDs, + sample each transcript and save results. Args: - input_transcripts_file: Filename of transcript abundances, \ + input_transcripts_file: Filename of transcript abundances, needs to be csv or tsv. - input_annotations_file: Filename of annotations, \ + input_annotations_file: Filename of annotations, needs to be gtf. - prob_inclusion: Probability of intron inclusion, \ + prob_inclusion: Probability of intron inclusion, needs to be float in range [0,1]. - output_transcripts_file: Filename of file to write \ + output_transcripts_file: Filename of file to write sampled transcripts to. - output_annotations_file: Filename of file to write \ + output_annotations_file: Filename of file to write generated annotations to. 
""" LOG.info("Probability of intron inclusion: %s", str(prob_inclusion)) @@ -508,7 +517,7 @@ def sample_transcripts( # Set up output file, write header once and append data in loop write_header(output_annotations_file) - for _, row in tqdm(transcripts.iterrows()): + for _, row in transcripts.iterrows(): transcript_id = row["id"] transcript_count = row["count"] -- GitLab