Skip to content
Snippets Groups Projects
Commit 0364a552 authored by Mate Balajti's avatar Mate Balajti
Browse files

feat: add support for reassigned Transcript Support Level

parent f0286852
No related branches found
No related tags found
1 merge request!47feat: add support for reassigned Transcript Support Level
Pipeline #17352 passed
matplotlib
pandas
pandas-stubs
pip
tqdm
types-tqdm
flake8-docstrings
mypy
flake8
flake8-docstrings
pytest
pylint
coverage
setup.py 0 → 100644
"""Set up project."""
from pathlib import Path
from setuptools import setup, find_packages
project_root_dir = Path(__file__).parent.resolve()
with open(project_root_dir / "requirements.txt",
"r", encoding="utf-8") as f:
INSTALL_REQUIRES = f.read().splitlines()
URL = ('https://git.scicore.unibas.ch/zavolan_group/'
'tools/transcript-structure-generator')
setup(
name='transcript-structure-generator',
version='0.2.0',
url=URL,
license='MIT',
author='Larissa Glass, Michael Zimmermann, Andri Fraenkl',
author_email='mate.balajti@unibas.ch',
description='Transcript structure generator',
packages=find_packages(),
install_requires=INSTALL_REQUIRES,
entry_points={
'console_scripts': ['transcript-structure-generator=tsg.cli:app']
}
)
"""Tests for main module."""
import pandas as pd
import pandas as pd # type: ignore
from tsg.main import Gtf, TranscriptGenerator, dict_to_str, str_to_dict
......
......@@ -30,8 +30,8 @@ def setup_logging(loglevel: str) -> None:
raise
logging.basicConfig(
format='[%(asctime)s: %(levelname)s] \
%(message)s (module "%(module)s")',
format=('[%(asctime)s: %(levelname)s] '
'%(message)s (module "%(module)s")'),
level=numeric_level,
)
......
......@@ -3,9 +3,7 @@
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
import pandas as pd # type: ignore
LOG = logging.getLogger(__name__)
......@@ -31,15 +29,17 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
"""Filter annotations to include only exons \
with the highest transcript support level, i.e. TSL1.
"""Filter dataframe.
Filter annotations to include only exons
with the highest transcript support level, i.e. TSL1.
`feature` column is filtered on value "exon" and
`free_text` column is filtered to include the string \
`free_text` column is filtered to include the string
denoting the highest transcript support level
('transcript_support_level "1"').
If a list of transcript IDs is given, `free_text` column \
If a list of transcript IDs is given, `free_text` column
is filtered to include one of the IDs.
Args:
......@@ -47,7 +47,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
transcript: list of transcript IDs
Returns:
A pd.DataFrame containing only rows with exon annotations \
A pd.DataFrame containing only rows with exon annotations
of highest transcript support level and,
if provided, belonging to one of the given transcripts
"""
......@@ -55,7 +55,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
transcripts = []
df_filter = gtf_df[
(gtf_df["feature"] == "exon")
& (gtf_df["free_text"].str.contains('transcript_support_level "1"'))
& (gtf_df["free_text"].str.contains('transcript_support_level "1'))
]
if len(transcripts) > 0:
df_filter = df_filter[df_filter["free_text"].str.contains(
......@@ -68,7 +68,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
def str_to_dict(gene_string: str) -> dict:
"""Split between key/value pairs.
Split string based on delimiter ';' into items, remove empty items and \
Split string based on delimiter ';' into items, remove empty items and
split items on delimiter ' ' into
key/value pairs. Remove quotes from value strings and create a dictionary.
......@@ -92,9 +92,9 @@ def dict_to_str(gene_dict: dict) -> str:
Takes e.g. dictionary {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}
and returns string 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'.
Key/value pairs are joined by space to form an item and items are \
Key/value pairs are joined by space to form an item and items are
joined by ';' to form a string.
If a value is Not a Number (nan), the key/value pair is omitted \
If a value is Not a Number (nan), the key/value pair is omitted
from the string.
Args:
......@@ -115,13 +115,15 @@ def dict_to_str(gene_dict: dict) -> str:
def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
"""Reverse parsing of gtf based pd.DataFrame to include only columns that \
are well defined by gtf-file standards.
"""Reverse parse a gtf based pd.DataFrame.
The data frame will include only columns that
are well defined by gtf-file standards.
The first 8 defined columns are constant as defined by gtf-file standards.
Further columns are assumed to be parsed free-text columns \
Further columns are assumed to be parsed free-text columns
(see Gtf.parse_free_text()).
The parsed free-text columns are aggregated as a dictionary and \
The parsed free-text columns are aggregated as a dictionary and
the dictionary is parsed as a string in gtf format.
Args:
......@@ -165,8 +167,9 @@ def write_gtf(gtf_df: pd.DataFrame, filename: str) -> None:
def write_header(annotations_file: str) -> None:
"""Write the header of an annotations file, consisting of the \
tab delimited column names.
"""Write the header of an annotations file.
It consists of the tab delimited column names.
Args:
annotations_file: Filename to write header to.
......@@ -182,7 +185,7 @@ class Gtf:
dtypes: A dictionary containing column names and respective data types.
parsed: A boolean indicating if the pd.DataFrame is parsed.
original_columns: A list of columns not touched by parsing.
free_text_columns: A list of columns created during parsing \
free_text_columns: A list of columns created during parsing
of column `free_text`.
"""
......@@ -240,7 +243,7 @@ class Gtf:
Part of initialization is:
Set dataframe attribute
Check which columns belong to the free-text part of the gtf-file.
Check if there are no columns called free-text and if so, sets \
Check if there are no columns called free-text and if so, sets
the value of parsed attribute to TRUE.
Args:
......@@ -254,11 +257,11 @@ class Gtf:
self.parsed = True
def parse_key_value(self):
"""Parse key/value pairs from `free_text` column into column `key` \
with row entry `value`.
"""Parse key/value pairs.
Creates a dataframe with columns for keys in the free-text column \
instead of `free_text` column.
From `free_text` column into column `key` with row entry `value`.
Creates a dataframe with columns for keys in the free-text column
instead of `free_text` column.
Saves it to Gtf.df attribute.
"""
assert self.parsed is False
......@@ -316,16 +319,15 @@ class TranscriptGenerator:
strands = transcript_df["strand"].unique()
if len(transcript_df) == 0:
LOG.warning(
"Transcript %s can't be sampled. \
Annotation is missing", transcript_id
"Transcript \"%s\" can't be sampled: "
"Annotation is missing or TSL is not 1.", transcript_id
)
instance = None
elif len(strands) > 1:
LOG.warning(
"Transcript %s can't be sampled. Transcript generator \
is not implemented for transcripts with \
exons annotated on different strands",
transcript_id,
"Transcript \"%s\" can't be sampled: Transcript generator is "
"not implemented for transcripts with exons annotated on "
"different strands.", transcript_id,
)
instance = None
else:
......@@ -351,8 +353,8 @@ class TranscriptGenerator:
def get_inclusions(self) -> np.ndarray:
"""Generate inclusions array.
Each column corresponds to one sample and the number of columns \
corresponds to the number of samples.
Each column corresponds to one sample and the number of columns
corresponds to the number of samples.
Returns:
A boolean np.array, where True means intron inclusion.
......@@ -368,16 +370,18 @@ class TranscriptGenerator:
return inclusion_arr
def get_unique_inclusions(self) -> tuple[list, np.ndarray, np.ndarray]:
"""Inclusion of unique intron inclusion via arrays and counts and \
name generation of each unique count.
"""Get unique inclusions.
Inclusion of unique intron inclusion via arrays and counts and
name generation of each unique count.
Args:
Returns:
- List of names for generated exons.
- A boolean np.array where columns correspond to generated \
- A boolean np.array where columns correspond to generated
transcripts and rows to intron inclusion.
- A np.array containing sample number per generated inclusions, \
- A np.array containing sample number per generated inclusions,
i.e. transcript.
"""
inclusion_arr = self.get_inclusions()
......@@ -398,8 +402,10 @@ class TranscriptGenerator:
def get_df(
self, inclusions: np.ndarray, transcript_id: str
) -> pd.DataFrame:
"""Take as input a dataframe filtered to one transcript and \
a boolean vector denoting intron inclusions.
"""Get dataframe.
Take as input a dataframe filtered to one transcript and
a boolean vector denoting intron inclusions.
Args:
inclusions: A boolean vector denoting intron inclusion.
......@@ -467,7 +473,7 @@ class TranscriptGenerator:
data_frame = reverse_parse_free_text(data_frame)
write_gtf(data_frame, filename)
LOG.debug("Transcript %s sampled", self.ts_id)
LOG.debug("Transcript \"%s\" sampled.", self.ts_id)
def sample_transcripts(
......@@ -477,19 +483,21 @@ def sample_transcripts(
output_transcripts_file: str,
output_annotations_file: str,
):
"""Read input files, iterate over transcript IDs, \
sample each transcript and save results.
"""Sample transcripts.
Read input files, iterate over transcript IDs,
sample each transcript and save results.
Args:
input_transcripts_file: Filename of transcript abundances, \
input_transcripts_file: Filename of transcript abundances,
needs to be csv or tsv.
input_annotations_file: Filename of annotations, \
input_annotations_file: Filename of annotations,
needs to be gtf.
prob_inclusion: Probability of intron inclusion, \
prob_inclusion: Probability of intron inclusion,
needs to be float in range [0,1].
output_transcripts_file: Filename of file to write \
output_transcripts_file: Filename of file to write
sampled transcripts to.
output_annotations_file: Filename of file to write \
output_annotations_file: Filename of file to write
generated annotations to.
"""
LOG.info("Probability of intron inclusion: %s", str(prob_inclusion))
......@@ -509,7 +517,7 @@ def sample_transcripts(
# Set up output file, write header once and append data in loop
write_header(output_annotations_file)
for _, row in tqdm(transcripts.iterrows()):
for _, row in transcripts.iterrows():
transcript_id = row["id"]
transcript_count = row["count"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment