Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: zavolan_group/tools/transcript-structure-generator

Commits on Source (3)
matplotlib
pandas
pip
tqdm
mypy
flake8
flake8-docstrings
......
"""Set up project."""
from pathlib import Path
from setuptools import setup, find_packages
project_root_dir = Path(__file__).parent.resolve()
with open(project_root_dir / "requirements.txt",
"r", encoding="utf-8") as f:
INSTALL_REQUIRES = f.read().splitlines()
URL = ('https://git.scicore.unibas.ch/zavolan_group/'
'tools/transcript-structure-generator')
setup(
name='transcript-structure-generator',
version='0.2.0',
url=URL,
license='MIT',
author='Larissa Glass, Michael Zimmermann, Andri Fraenkl',
author_email='mate.balajti@unibas.ch',
description='Transcript structure generator',
packages=find_packages(),
install_requires=INSTALL_REQUIRES,
entry_points={
'console_scripts': ['transcript-structure-generator=tsg.cli:app']
}
)
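The `console_scripts` entry point above means that, on install, pip generates a `transcript-structure-generator` executable that imports `tsg.cli` and calls the callable named `app`. A minimal sketch of what such a module could look like; the argparse flag below is hypothetical and not the project's actual CLI:

"""Hypothetical tsg/cli.py; only the `app` name is implied by setup.py."""
import argparse


def app() -> None:
    # Resolved from 'transcript-structure-generator=tsg.cli:app'.
    parser = argparse.ArgumentParser(prog="transcript-structure-generator")
    parser.add_argument("--log", default="INFO",
                        help="log level (assumed flag)")
    args = parser.parse_args()
    print(f"log level: {args.log}")


if __name__ == "__main__":
    app()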
@@ -30,8 +30,8 @@ def setup_logging(loglevel: str) -> None:
        raise
    logging.basicConfig(
-        format='[%(asctime)s: %(levelname)s] \
-            %(message)s (module "%(module)s")',
+        format=('[%(asctime)s: %(levelname)s] '
+                '%(message)s (module "%(module)s")'),
        level=numeric_level,
    )
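The reason for this change: a backslash continuation inside a string literal removes the newline but keeps the indentation of the continued line, so the old format string carried a run of stray spaces into every log line. Implicit concatenation of adjacent literals avoids that. A quick demonstration:

# Backslash continuation keeps the next line's indentation in the string:
old = '[%(levelname)s] \
    %(message)s'
# Adjacent string literals are concatenated without extra whitespace:
new = ('[%(levelname)s] '
       '%(message)s')
assert "  " in old      # stray spaces survive in the format string
assert "  " not in new  # clean single-space format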
......
@@ -4,8 +4,6 @@ import logging
import numpy as np
import pandas as pd  # type: ignore
-from tqdm import tqdm  # type: ignore

LOG = logging.getLogger(__name__)
@@ -31,15 +29,17 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
-    """Filter annotations to include only exons \
-    with the highest transcript support level, i.e. TSL1.
+    """Filter dataframe.
+
+    Filter annotations to include only exons
+    with the highest transcript support level, i.e. TSL1.

    `feature` column is filtered on value "exon" and
-    `free_text` column is filtered to include the string \
+    `free_text` column is filtered to include the string
    denoting the highest transcript support level
    ('transcript_support_level "1"').

-    If a list of transcript IDs is given, `free_text` column \
+    If a list of transcript IDs is given, `free_text` column
    is filtered to include one of the IDs.

    Args:
@@ -47,7 +47,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
        transcripts: list of transcript IDs

    Returns:
-        A pd.DataFrame containing only rows with exon annotations \
+        A pd.DataFrame containing only rows with exon annotations
        of highest transcript support level and,
        if provided, belonging to one of the given transcripts
    """
@@ -55,7 +55,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
        transcripts = []
    df_filter = gtf_df[
        (gtf_df["feature"] == "exon")
-        & (gtf_df["free_text"].str.contains('transcript_support_level "1"'))
+        & (gtf_df["free_text"].str.contains('transcript_support_level "1'))
    ]
    if len(transcripts) > 0:
        df_filter = df_filter[df_filter["free_text"].str.contains(
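The relaxed pattern in the hunk above (no closing quote after the 1) also matches Ensembl-style annotations where the TSL value carries a suffix inside the quotes, e.g. transcript_support_level "1 (assigned to previous version 5)". A toy demonstration, with made-up `free_text` values for illustration:

import pandas as pd

toy = pd.DataFrame({
    "feature": ["exon", "exon", "exon"],
    "free_text": [
        'transcript_support_level "1";',
        'transcript_support_level "1 (assigned to previous version 5)";',
        'transcript_support_level "2";',
    ],
})
# The old pattern requires the closing quote and misses the second row:
print(toy["free_text"].str.contains('transcript_support_level "1"').tolist())
# [True, False, False]
# The relaxed pattern matches both TSL1 variants:
print(toy["free_text"].str.contains('transcript_support_level "1').tolist())
# [True, True, False]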
@@ -68,7 +68,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
def str_to_dict(gene_string: str) -> dict:
    """Split between key/value pairs.

-    Split string based on delimiter ';' into items, remove empty items and \
+    Split string based on delimiter ';' into items, remove empty items and
    split items on delimiter ' ' into
    key/value pairs. Remove quotes from value strings and create a dictionary.
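A minimal re-implementation of the splitting this docstring describes, to make the behavior concrete; this mirrors the docstring, not the project's actual code:

def str_to_dict_sketch(gene_string: str) -> dict:
    # Split on ';', drop empty items, split each item once on ' ',
    # and strip the quotes around values.
    items = [item.strip() for item in gene_string.split(";") if item.strip()]
    return {key: value.strip('"') for key, value in
            (item.split(" ", 1) for item in items)}


expected = {"gene_id": "GENE1", "transcript_id": "TRANSCRIPT1"}
assert str_to_dict_sketch('gene_id "GENE1"; transcript_id "TRANSCRIPT1";') == expected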
@@ -92,9 +92,9 @@ def dict_to_str(gene_dict: dict) -> str:
    Takes e.g. dictionary {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}
    and returns string 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'.

-    Key/value pairs are joined by space to form an item and items are \
+    Key/value pairs are joined by space to form an item and items are
    joined by ';' to form a string.

-    If a value is Not a Number (nan), the key/value pair is omitted \
+    If a value is Not a Number (nan), the key/value pair is omitted
    from the string.

    Args:
@@ -115,13 +115,15 @@ def dict_to_str(gene_dict: dict) -> str:
def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
-    """Reverse parsing of gtf based pd.DataFrame to include only columns that \
-    are well defined by gtf-file standards.
+    """Reverse parse a gtf based pd.DataFrame.
+
+    The data frame will include only columns that
+    are well defined by gtf-file standards.

    The first 8 defined columns are constant as defined by gtf-file standards.
-    Further columns are assumed to be parsed free-text columns \
+    Further columns are assumed to be parsed free-text columns
    (see Gtf.parse_free_text()).

-    The parsed free-text columns are aggregated as a dictionary and \
+    The parsed free-text columns are aggregated as a dictionary and
    the dictionary is parsed as a string in gtf format.

    Args:
@@ -165,8 +167,9 @@ def write_gtf(gtf_df: pd.DataFrame, filename: str) -> None:
def write_header(annotations_file: str) -> None:
-    """Write the header of an annotations file, consisting of the \
-    tab delimited column names.
+    """Write the header of an annotations file.
+
+    It consists of the tab delimited column names.

    Args:
        annotations_file: Filename to write header to.
@@ -182,7 +185,7 @@ class Gtf:
        dtypes: A dictionary containing column names and respective data types.
        parsed: A boolean indicating if the pd.DataFrame is parsed.
        original_columns: A list of columns not touched by parsing.
-        free_text_columns: A list of columns created during parsing \
+        free_text_columns: A list of columns created during parsing
        of column `free_text`.
    """
@@ -240,7 +243,7 @@ class Gtf:
        Part of initialization is:
        Set dataframe attribute
        Check which columns belong to the free-text part of the gtf-file.
-        Check if there are no columns called free-text and if so, sets \
+        Check if there are no columns called free-text and if so, sets
        the value of parsed attribute to TRUE.

        Args:
        self.parsed = True

    def parse_key_value(self):
-        """Parse key/value pairs from `free_text` column into column `key` \
-        with row entry `value`.
+        """Parse key/value pairs.
+
+        From `free_text` column into column `key` with row entry `value`.

-        Creates a dataframe with columns for keys in the free-text column \
-        instead of `free_text` column.
+        Creates a dataframe with columns for keys in the free-text column
+        instead of `free_text` column.
        Saves it to Gtf.df attribute.
        """
        assert self.parsed is False
@@ -316,16 +319,15 @@ class TranscriptGenerator:
        strands = transcript_df["strand"].unique()
        if len(transcript_df) == 0:
            LOG.warning(
-                "Transcript %s can't be sampled. \
-                Annotation is missing", transcript_id
+                "Transcript \"%s\" can't be sampled: "
+                "Annotation is missing or TSL is not 1.", transcript_id
            )
            instance = None
        elif len(strands) > 1:
            LOG.warning(
-                "Transcript %s can't be sampled. Transcript generator \
-                is not implemented for transcripts with \
-                exons annotated on different strands",
-                transcript_id,
+                "Transcript \"%s\" can't be sampled: Transcript generator is "
+                "not implemented for transcripts with exons annotated on "
+                "different strands.", transcript_id,
            )
            instance = None
        else:
@@ -351,8 +353,8 @@ class TranscriptGenerator:
    def get_inclusions(self) -> np.ndarray:
        """Generate inclusions array.

-        Each column corresponds to one sample and the number of columns \
-        corresponds to the number of samples.
+        Each column corresponds to one sample and the number of columns
+        corresponds to the number of samples.

        Returns:
            A boolean np.array, where True means intron inclusion.
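The docstring implies a matrix of Bernoulli draws, one row per intron and one column per sample. A plausible sketch; the shape and the use of `prob_inclusion` are assumptions, since the method body is not shown in this diff:

import numpy as np

rng = np.random.default_rng(seed=0)
n_introns, n_samples = 4, 5
prob_inclusion = 0.25
# True with probability `prob_inclusion`: the intron is retained.
inclusion_arr = rng.random((n_introns, n_samples)) < prob_inclusion
print(inclusion_arr.shape)  # (4, 5): one row per intron, one column per sample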
@@ -368,16 +370,18 @@ class TranscriptGenerator:
        return inclusion_arr

    def get_unique_inclusions(self) -> tuple[list, np.ndarray, np.ndarray]:
-        """Inclusion of unique intron inclusion via arrays and counts and \
-        name generation of each unique count.
+        """Get unique inclusions.
+
+        Inclusion of unique intron inclusion via arrays and counts and
+        name generation of each unique count.

        Args:

        Returns:
            - List of names for generated exons.
-            - A boolean np.array where columns correspond to generated \
+            - A boolean np.array where columns correspond to generated
            transcripts and rows to intron inclusion.
-            - A np.array containing sample number per generated inclusions, \
+            - A np.array containing sample number per generated inclusions,
            i.e. transcript.
        """
        inclusion_arr = self.get_inclusions()
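One way to realize "unique inclusion arrays and counts" is np.unique over columns; this is sketched as an assumption about the implementation, not taken from the diff:

import numpy as np

inclusion_arr = np.array([[True, True, False],
                          [False, False, False]])
# Unique columns = unique generated transcripts; counts = samples per transcript.
unique_arr, counts = np.unique(inclusion_arr, axis=1, return_counts=True)
print(unique_arr)  # the two distinct inclusion patterns
print(counts)      # e.g. [1 2]; column order follows np.unique's sorting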
@@ -398,8 +402,10 @@ class TranscriptGenerator:
    def get_df(
        self, inclusions: np.ndarray, transcript_id: str
    ) -> pd.DataFrame:
-        """Take as input a dataframe filtered to one transcript and \
-        a boolean vector denoting intron inclusions.
+        """Get dataframe.
+
+        Take as input a dataframe filtered to one transcript and
+        a boolean vector denoting intron inclusions.

        Args:
            inclusions: A boolean vector denoting intron inclusion.
@@ -467,7 +473,7 @@ class TranscriptGenerator:
        data_frame = reverse_parse_free_text(data_frame)
        write_gtf(data_frame, filename)
-        LOG.debug("Transcript %s sampled", self.ts_id)
+        LOG.debug("Transcript \"%s\" sampled.", self.ts_id)
def sample_transcripts(
@@ -477,19 +483,21 @@ def sample_transcripts(
    output_transcripts_file: str,
    output_annotations_file: str,
):
"""Read input files, iterate over transcript IDs, \
sample each transcript and save results.
"""Sample transcripts.
Read input files, iterate over transcript IDs,
sample each transcript and save results.
Args:
input_transcripts_file: Filename of transcript abundances, \
input_transcripts_file: Filename of transcript abundances,
needs to be csv or tsv.
input_annotations_file: Filename of annotations, \
input_annotations_file: Filename of annotations,
needs to be gtf.
prob_inclusion: Probability of intron inclusion, \
prob_inclusion: Probability of intron inclusion,
needs to be float in range [0,1].
output_transcripts_file: Filename of file to write \
output_transcripts_file: Filename of file to write
sampled transcripts to.
output_annotations_file: Filename of file to write \
output_annotations_file: Filename of file to write
generated annotations to.
"""
LOG.info("Probability of intron inclusion: %s", str(prob_inclusion))
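A hypothetical invocation, with made-up filenames and keyword arguments matching the documented parameters:

sample_transcripts(
    input_transcripts_file="abundances.tsv",        # csv or tsv
    input_annotations_file="annotations.gtf",
    prob_inclusion=0.05,                            # float in [0, 1]
    output_transcripts_file="sampled_abundances.csv",
    output_annotations_file="sampled_annotations.gtf",
)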
@@ -509,7 +517,7 @@ def sample_transcripts(
    # Set up output file, write header once and append data in loop
    write_header(output_annotations_file)
-    for _, row in tqdm(transcripts.iterrows()):
+    for _, row in transcripts.iterrows():
        transcript_id = row["id"]
        transcript_count = row["count"]
......