From 2b09fdfc8641a7c8d4329936c743f87254027e13 Mon Sep 17 00:00:00 2001 From: Alex Kanitz <alexander.kanitz@alumni.ethz.ch> Date: Fri, 18 Nov 2022 16:15:17 +0100 Subject: [PATCH] DO NOT MERGE: remove all content for review --- .gitignore | 161 ----------- Dockerfile | 11 - LICENSE.md | 21 -- README.md | 65 ----- environment.yml | 15 - pyproject.toml | 25 -- tests/__init__.py | 1 - tests/resources/Annotation1.gtf | 5 - tests/resources/Annotations2.gtf | 12 - tests/resources/Transcript1.csv | 7 - tests/resources/Transcript2.tsv | 7 - tests/test_main.py | 135 --------- tsg/__init__.py | 4 - tsg/__main__.py | 4 - tsg/cli.py | 85 ------ tsg/main.py | 453 ------------------------------- 16 files changed, 1011 deletions(-) delete mode 100644 .gitignore delete mode 100644 Dockerfile delete mode 100644 LICENSE.md delete mode 100644 README.md delete mode 100644 environment.yml delete mode 100644 pyproject.toml delete mode 100644 tests/__init__.py delete mode 100644 tests/resources/Annotation1.gtf delete mode 100644 tests/resources/Annotations2.gtf delete mode 100644 tests/resources/Transcript1.csv delete mode 100644 tests/resources/Transcript2.tsv delete mode 100644 tests/test_main.py delete mode 100644 tsg/__init__.py delete mode 100644 tsg/__main__.py delete mode 100644 tsg/cli.py delete mode 100644 tsg/main.py diff --git a/.gitignore b/.gitignore deleted file mode 100644 index dc92f04..0000000 --- a/.gitignore +++ /dev/null @@ -1,161 +0,0 @@ -# Created by https://www.toptal.com/developers/gitignore/api/python,git -# Edit at https://www.toptal.com/developers/gitignore?templates=python,git - -### Git ### -# Created by git for backups. To disable backups in Git: -# $ git config --global mergetool.keepBackup false -*.orig - -# Created by git when using merge tools for conflicts -*.BACKUP.* -*.BASE.* -*.LOCAL.* -*.REMOTE.* -*_BACKUP_*.txt -*_BASE_*.txt -*_LOCAL_*.txt -*_REMOTE_*.txt - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. 
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# End of https://www.toptal.com/developers/gitignore/api/python,git
-
-data
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index be953d5..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM python:3-slim
-
-WORKDIR /usr/src/app
-
-COPY README.md requirements.txt pyproject.toml ./
-ADD tsg ./tsg
-ADD data ./data
-
-RUN pip install .
-
-CMD [ "transcript-generator", "--transcripts", "data/transcripts_short.csv", "--annotation", "data/Homo_sapiens.GRCh38.103.chr.gtf", "--prob_inclusion", "0.05" ]
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
deleted file mode 100644
index 2313fb3..0000000
--- a/LICENSE.md
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 Zavolan Lab, Biozentrum, University of Basel
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index 9c0d0c7..0000000
--- a/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Synopsis
-
-The human body contains a countless variety and diversity of cell types, states, and interactions. We wish to understand these tissues and cell types at a much deeper level. Single-cell RNA-seq (scRNA-seq) offers a look into which genes are being expressed at the level of individual cells. Overall, this method allows one to identify cell types, find rare or unidentified cell types or states, identify genes that are differentially expressed in different cell types, and explore changes in expression while accounting for spatial, regulatory, and protein interactions.
-
-We hope that others will find use for this transcript structure generator, which takes as input a GTF file of specific gene transcripts and outputs a GTF file containing intron/exon structures per input transcript.
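-
-For illustration (a hypothetical three-exon transcript; the coordinates are made up, mirroring the test resources), sampling with an intron inclusion after the second exon extends that exon up to the start of the following one:
-
-```
-input:   exon1 1-20    exon2 50-70    exon3 80-100
-sampled: exon1 1-20    exon2 50-79    exon3 80-100
-```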
-
-# Installation
-
-To create and activate the Conda environment, run
-
-```
-conda env create --file environment.yml
-conda activate transcript-structure-generator
-```
-
-# Usage
-
-Input:
-- csv-formatted file ("ID,Count") with counts for individual transcripts
-- probability of intron inclusion (float in range [0,1])
-- gtf-formatted file with exon coordinates of the transcripts included in the csv file
-
-Output:
-- gtf-formatted file containing generated intron/exon structures per transcript
-- csv-formatted file ("NewTranscriptID,ID,Count") with
-    - id of generated transcript
-    - id of original transcript (without intron inclusions)
-    - count
-
-To install the package, run
-
-```
-pip install .
-```
-
-To generate the sampled transcripts, open a new shell, activate your environment and run
-
-```
-conda activate transcript-structure-generator
-
-transcript-generator --transcripts <transcripts_file> --annotation <annotations_file> --prob_inclusion=<probability_inclusion> [--log "INFO"]
-```
-
-where the transcripts file should be csv-formatted, the annotation file gtf-formatted, and the inclusion probability for introns a float in the range [0,1]. The `--log` parameter is optional and can be one of `["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"]`. The default is `INFO`.
-
-
-# Development
-
-To perform all tests, make sure your environment corresponds to the `environment.yml` file and run
-
-```
-pytest tests
-```
-
-# License
-
-MIT license, Copyright (c) 2021 Zavolan Lab, Biozentrum, University of Basel
-
-# Contributors
-
-- Larissa Glass
-- Michael Zimmermann
-- Andri Fraenkl
-
-
diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index ac26257..0000000
--- a/environment.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-name: transcript-structure-generator
-channels:
-  - defaults
-  - conda-forge
-dependencies:
-  - python=3.9
-  - matplotlib
-  - pandas
-  - pip
-  - tqdm
-  - flake8-docstrings
-  - mypy
-  - flake8
-  - pytest
-  - coverage
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 1214e21..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,25 +0,0 @@
-[project]
-name = "tsg"
-version = "0.1.0"
-authors = [
-    { name="Michael Zimmermann", email="michael.zimmermann@unibas.ch" },
-    { name="Andri Fraenkl", email="andri.fraenkl@unibas.ch" },
-    { name="Larissa Glass", email="larissa.glass@unibas.ch" },
-]
-description = "Transcript Structure Generator"
-readme = "README.md"
-requires-python = ">=3.9"
-license = {text = "MIT"}
-dependencies = [
-    "pandas",
-    "tqdm",
-]
-
-[tool.setuptools]
-packages = ["tsg"]
-
-[project.urls]
-"Homepage" = "https://git.scicore.unibas.ch/zavolan_group/tools/transcript-structure-generator"
-
-[project.scripts]
-transcript-generator = "tsg.cli:app"
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index 8c26b48..0000000
--- a/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Tests for the package tsg."""
diff --git a/tests/resources/Annotation1.gtf b/tests/resources/Annotation1.gtf
deleted file mode 100644
index c794aaa..0000000
--- a/tests/resources/Annotation1.gtf
+++ /dev/null
@@ -1,5 +0,0 @@
-1	havana	gene	1	100	.	+	.	gene_id "GENE1";
-1	havana	transcript	1	100	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; transcript_support_level "1";
-1	havana	exon	1	20	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "1"; exon_id "ENSE00002234944"; transcript_support_level "1";
-1	havana	exon	50	70	.	+	.	
gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "2"; exon_id "ENSE00003582793"; transcript_support_level "1"; -1 havana exon 80 100 . + . gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "3"; exon_id "ENSE00002312635"; transcript_support_level "1"; diff --git a/tests/resources/Annotations2.gtf b/tests/resources/Annotations2.gtf deleted file mode 100644 index d7f079e..0000000 --- a/tests/resources/Annotations2.gtf +++ /dev/null @@ -1,12 +0,0 @@ -1 havana transcript 1000 2000 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; transcript_support_level "1"; -1 havana exon 1980 2000 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "ENSE00001890219"; transcript_support_level "1"; -1 havana exon 1900 1950 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "2"; exon_id "ENSE00003507205"; transcript_support_level "1"; -1 havana exon 1800 1850 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "3"; exon_id "ENSE00003477500"; transcript_support_level "1"; -1 havana exon 1700 1750 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "4"; exon_id "ENSE00003565697"; transcript_support_level "1"; -1 havana exon 1600 1650 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "5"; exon_id "ENSE00003475637"; transcript_support_level "1"; -1 havana exon 1500 1550 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "6"; exon_id "ENSE00003502542"; transcript_support_level "1"; -1 havana exon 1400 1450 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "7"; exon_id "ENSE00003553898"; transcript_support_level "1"; -1 havana exon 1300 1350 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "8"; exon_id "ENSE00003621279"; transcript_support_level "1"; -1 havana exon 1200 1250 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "9"; exon_id "ENSE00002030414"; transcript_support_level "1"; -1 havana exon 1100 1150 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "10"; exon_id "ENSE00001935574"; transcript_support_level "1"; -1 havana exon 1000 1050 . - . 
gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "11"; exon_id "ENSE00001843071"; transcript_support_level "1";
diff --git a/tests/resources/Transcript1.csv b/tests/resources/Transcript1.csv
deleted file mode 100644
index 6635236..0000000
--- a/tests/resources/Transcript1.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-TRANSCRIPT1,92
-TRANSCRIPT2,13
-TRANSCRIPT3,73
-TRANSCRIPT4,83
-TRANSCRIPT5,32
-TRANSCRIPT6,136
-TRANSCRIPT7,36
\ No newline at end of file
diff --git a/tests/resources/Transcript2.tsv b/tests/resources/Transcript2.tsv
deleted file mode 100644
index 06b14c4..0000000
--- a/tests/resources/Transcript2.tsv
+++ /dev/null
@@ -1,7 +0,0 @@
-TRANSCRIPT1	92
-TRANSCRIPT2	13
-TRANSCRIPT3	73
-TRANSCRIPT4	83
-TRANSCRIPT5	32
-TRANSCRIPT6	136
-TRANSCRIPT7	36
\ No newline at end of file
diff --git a/tests/test_main.py b/tests/test_main.py
deleted file mode 100644
index 48b9b55..0000000
--- a/tests/test_main.py
+++ /dev/null
@@ -1,135 +0,0 @@
-"""Tests for the main module."""
-
-import numpy as np
-import pandas as pd
-import pytest
-from tsg.main import Gtf, TranscriptGenerator, dict_to_str, str_to_dict
-
-
-class TestFreeTextParsing:
-    """Test if free text dictionary is correctly parsed."""
-
-    def test_str2dict(self):
-        res = str_to_dict(
-            'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
-        )
-
-        assert res == {
-            "gene_id": "GENE2",
-            "transcript_id": "TRANSCRIPT2",
-            "exon_number": "1",
-            "exon_id": "EXON1",
-        }
-
-    def test_dict2str(self):
-        res = dict_to_str(
-            {
-                "gene_id": "GENE2",
-                "transcript_id": "TRANSCRIPT2",
-                "exon_number": "1",
-                "exon_id": "EXON1",
-            }
-        )
-        assert (
-            res
-            == 'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
-        )
-
-
-class TestGtf:
-    """Test if Gtf class works correctly."""
-
-    cols = [
-        "seqname",
-        "source",
-        "feature",
-        "start",
-        "end",
-        "score",
-        "strand",
-        "frame",
-        "free_text",
-    ]
-
-    def test_init(self):
-        annotations = Gtf()
-        annotations.read_file("tests/resources/Annotation1.gtf")
-
-        assert not annotations.parsed
-        assert annotations.original_columns == self.cols
-        assert annotations.free_text_columns == []
-
-    def test_parsed(self):
-        annotations = Gtf()
-        annotations.read_file("tests/resources/Annotation1.gtf")
-        annotations.parse_free_text()
-
-        assert annotations.parsed
-        assert set(annotations.free_text_columns) == set(
-            [
-                "gene_id",
-                "transcript_id",
-                "exon_number",
-                "exon_id",
-                "transcript_support_level",
-            ]
-        )
-        assert set(annotations.original_columns) == set(
-            ["seqname", "source", "feature", "start", "end", "score", "strand", "frame"]
-        )
-
-
-class TestTranscriptGenerator:
-    """Test if TranscriptGenerator class works correctly."""
-
-    cols = [
-        "start",
-        "end",
-        "strand",
-        "transcript_id",
-    ]
-
-    df1 = pd.DataFrame(
-        {
-            "start": [1, 50, 80],
-            "end": [20, 70, 100],
-            "strand": ["+", "+", "+"],
-            "exon_id": ["EXON1", "EXON2", "EXON3"],
-        }
-    )
-    df2 = pd.DataFrame(columns=["start", "end", "strand"])
-
-    def test_init(self):
-        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.05)
-
-        assert transcripts.strand == "+"
-
-    def test_init_2(self):
-        with pytest.raises(AssertionError):
-            TranscriptGenerator("TRANSCRIPT2", 3, self.df2, 0.05)
-
-    def test_init_3(self):
-        with pytest.raises(AssertionError):
-            TranscriptGenerator("TRANSCRIPT1", 0, self.df1, 0.05)
-
-    def test_inclusions(self):
-        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
-        res = transcripts._get_inclusions()
-
-        assert res.shape == (3, 3)
-
-    def test_unique_inclusions(self):
-        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
-        res1, res2, res3 = transcripts._get_unique_inclusions()
-
-        assert len(res1) == res2.shape[1] == len(res3)
-
-    def test_get_df(self):
-        inclusions = [False, True, False]
-        expected_end = pd.Series([20, 79, 100], name="end")
-        transcript_id = "TRANSCRIPT1_1"
-
-        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
-        res = transcripts._get_df(inclusions, transcript_id)
-
-        assert res["transcript_id"].unique().item() == "TRANSCRIPT1_1"
-        assert res["strand"].unique().item() == "+"
-        assert res["exon_id"].tolist() == ["EXON1", "EXON2_1", "EXON3"]
-        pd.testing.assert_series_equal(res["start"], self.df1["start"])
-        pd.testing.assert_series_equal(res["end"], expected_end)
diff --git a/tsg/__init__.py b/tsg/__init__.py
deleted file mode 100644
index 229aaf5..0000000
--- a/tsg/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Transcript structure generator package."""
-
-
-__version__ = '0.0.0'
diff --git a/tsg/__main__.py b/tsg/__main__.py
deleted file mode 100644
index cd76d30..0000000
--- a/tsg/__main__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from tsg.cli import app
-
-if __name__ == "__main__":
-    app()
\ No newline at end of file
diff --git a/tsg/cli.py b/tsg/cli.py
deleted file mode 100644
index b67e352..0000000
--- a/tsg/cli.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import argparse
-import logging
-from pathlib import Path
-from typing import Optional
-
-from tsg.main import sample_transcripts
-
-
-def setup_logging(loglevel: Optional[str] = None) -> None:
-    """Set up logging. Loglevel can be one of ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"].
-
-    Args:
-        loglevel (str, optional): Level of log output. Defaults to None.
-
-    Raises:
-        ValueError: If a string that is not a log level is passed, raise an error.
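-
-    Example:
-        >>> setup_logging("DEBUG")  # emit all messages of severity DEBUG and up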
-    """
-    if loglevel:
-        # Use a default of None so that an invalid name is caught by the
-        # isinstance check below instead of raising an AttributeError here.
-        numeric_level = getattr(logging, loglevel.upper(), None)
-        if not isinstance(numeric_level, int):
-            raise ValueError("Invalid log level: %s" % loglevel)
-    else:
-        numeric_level = logging.INFO
-
-    logging.basicConfig(
-        format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
-        level=numeric_level,
-    )
-
-
-def build_arg_parser() -> argparse.ArgumentParser:
-    """Build the command-line argument parser."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--transcripts", type=str, help="Path to csv/tsv file with transcript counts.")
-    parser.add_argument("--annotation", type=str, help="Path to gtf file with transcript annotations.")
-    parser.add_argument("--prob_inclusion", type=float, help="Probability of intron inclusion, a float in range [0,1].")
-    parser.add_argument("--log", type=str, help="Log level, e.g. INFO or DEBUG.")
-
-    return parser
-
-
-def get_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = build_arg_parser()
-
-    args = parser.parse_args()
-
-    return args
-
-
-def output_filename(filename: str) -> str:
-    """Generate output filename for given input filename.
-
-    Args:
-        filename (str): Input filename
-
-    Raises:
-        NotImplementedError: Only accept filetypes .csv, .tsv and .gtf.
-        FileExistsError: If the output file exists, raise error.
-
-    Returns:
-        str: Output filename
-    """
-    filepath = Path(filename)
-    if filepath.suffix in (".csv", ".tsv"):
-        outfile = "generated_" + filepath.stem + ".csv"
-    elif filepath.suffix == ".gtf":
-        outfile = "generated_" + filepath.name
-    else:
-        raise NotImplementedError("Only .csv, .tsv and .gtf files are supported.")
-
-    if Path(outfile).exists():
-        raise FileExistsError(f"The output file {outfile} already exists.")
-
-    return outfile
-
-
-def app():
-    """Run the transcript sampling tool with command-line arguments."""
-    args = get_args()
-
-    setup_logging(args.log)
-    sample_transcripts(
-        args.transcripts,
-        args.annotation,
-        args.prob_inclusion,
-        output_filename(args.transcripts),
-        output_filename(args.annotation),
-    )
diff --git a/tsg/main.py b/tsg/main.py
deleted file mode 100644
index ca96955..0000000
--- a/tsg/main.py
+++ /dev/null
@@ -1,453 +0,0 @@
-"""Sample transcripts."""
-
-import logging
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-
-LOG = logging.getLogger(__name__)
-
-
-def read_abundances(transcripts_file: str) -> pd.DataFrame:
-    """Read abundance file into dataframe.
-
-    Args:
-        transcripts_file (str): Input filename
-
-    Returns:
-        pd.DataFrame: Transcript abundances ("id", "count")
-
-    Raises:
-        ValueError: When the input file is neither csv nor tsv
-    """
-    cols: list = ["id", "count"]
-    if transcripts_file.endswith(".tsv"):
-        return pd.read_table(transcripts_file, header=None, names=cols)
-    elif transcripts_file.endswith(".csv"):
-        return pd.read_csv(transcripts_file, header=None, names=cols)
-    else:
-        raise ValueError("File type needs to be either csv or tsv")
-
-
-def filter_df(df: pd.DataFrame, transcripts: list = []) -> pd.DataFrame:
-    """Filter annotations to include only exons with the highest transcript support level (TSL1).
-
-    The `feature` column is filtered on the value "exon" and the
-    `free_text` column is filtered to include the string denoting the highest transcript support level
-    ('transcript_support_level "1"').
-
-    If a list of transcript IDs is given, the `free_text` column is additionally filtered to include one of the IDs.
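-    For example, `transcripts=["TRANSCRIPT1"]` would keep only exon rows whose
-    `free_text` field mentions "TRANSCRIPT1" (an ID from the test resources,
-    used here for illustration).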
-
-    Args:
-        df: A pd.DataFrame containing an unparsed gtf file
-        transcripts: list of transcript IDs
-
-    Returns:
-        A pd.DataFrame containing only rows with exon annotations of the highest transcript support level and,
-        if provided, belonging to one of the given transcripts
-    """
-    df_filter = df[
-        (df["feature"] == "exon")
-        & (df["free_text"].str.contains('transcript_support_level "1"'))
-    ]
-    if len(transcripts) > 0:
-        # Use the boolean mask to subset the dataframe, rather than
-        # assigning the mask itself to df_filter.
-        df_filter = df_filter[
-            df_filter["free_text"].str.contains("|".join(transcripts), regex=True)
-        ]
-
-    return df_filter
-
-
-def str_to_dict(s: str) -> dict:
-    """Split between key/value pairs.
-
-    Split string based on delimiter ';' into items, remove empty items and split items on delimiter ' ' into key/value
-    pairs.
-    Remove quotes from value strings and create a dictionary.
-
-    Args:
-        s: A string of the form 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'
-
-    Returns:
-        A dictionary containing e.g. {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}
-    """
-    # split into items
-    # remove empty items
-    # split items into key/value pairs
-    item_list: list = [x.split() for x in s.split(";") if len(x) > 0]
-    # remove quotes around values and return dictionary
-    return {item[0]: item[1].strip('"') for item in item_list}
-
-
-def dict_to_str(d: dict) -> str:
-    """Parse dictionary into gtf free_text column format.
-
-    Takes e.g. dictionary {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'} and returns
-    string 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'.
-    Key/value pairs are joined by a space to form an item and items are joined by '; ' to form a string.
-    If a value is Not a Number (nan), the key/value pair is omitted from the string.
-
-    Args:
-        d: A dictionary of the form {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}
-
-    Returns:
-        A string, e.g. 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'.
-    """
-    # join key, value pairs from dictionary with a space in a list,
-    # then join items in list by ;
-    # end on ;
-    # the comparison `value == value` is False only for nan values, which are skipped
-    s: str = (
-        "; ".join([f'{key} "{value}"' for key, value in d.items() if value == value])
-        + ";"
-    )
-    return s
-
-
-def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
-    """Reverse parsing of gtf based pd.DataFrame to include only columns that are well defined by gtf file standards.
-
-    The first 8 columns are fixed, as defined by gtf file standards.
-    Further columns are assumed to be parsed free text columns (see Gtf.parse_free_text()).
-    The parsed free text columns are aggregated as a dictionary and the dictionary is serialized as a string in
-    gtf format.
-
-    Args:
-        df_all: A pd.DataFrame containing a parsed gtf file.
-
-    Returns:
-        A pd.DataFrame with the columns as defined by gtf file standards.
-    """
-    # Define pd.DataFrame containing only parsed free text columns
-    df_free_text = df_all.iloc[:, 8:]
-    # Define pd.DataFrame containing only non-parsed columns; copy to avoid
-    # writing to a view of df_all below
-    df = df_all.iloc[:, :8].copy()
-    # Reverse parsing of free text columns and add the result as column `free_text` to output pd.DataFrame
-    df["free_text"] = df_free_text.agg(pd.Series.to_dict, axis=1).apply(dict_to_str)
-    return df
-
-
-def write_gtf(df: pd.DataFrame, filename: str) -> None:
-    """Save a Gtf object to file in gtf format.
-
-    Makes sure data types are correct and saves object in gtf format.
-
-    Args:
-        df: A pd.DataFrame containing a gtf file.
-        filename: File to save to.
-    """
-    # Make sure the data types are correct.
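-    # The file is opened in append mode: write_header() creates the output
-    # file once, and annotations are then appended transcript by transcript.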
-    df = df.astype(Gtf.dtypes)
-
-    df.to_csv(
-        filename,
-        sep="\t",
-        header=False,
-        index=False,
-        quoting=None,
-        quotechar="'",
-        mode="a",
-    )
-
-
-def write_header(annotations_file: str) -> None:
-    """Write the header of an annotations file, consisting of the tab delimited column names.
-
-    Args:
-        annotations_file: Filename to write header to.
-    """
-    with open(annotations_file, "w") as fh:
-        fh.write("\t".join(Gtf.dtypes.keys()) + "\n")
-
-
-class Gtf:
-    """Class to read a transcript annotations file into a Gtf object.
-
-    Attributes:
-        dtypes: A dictionary containing column names and respective data types.
-        parsed: A boolean indicating if the pd.DataFrame is parsed.
-        original_columns: A list of columns not touched by parsing.
-        free_text_columns: A list of columns created during parsing of column `free_text`.
-    """
-
-    dtypes = {
-        "seqname": object,
-        "source": object,
-        "feature": object,
-        "start": int,
-        "end": int,
-        "score": object,
-        "strand": object,
-        "frame": object,
-        "free_text": object,
-    }
-
-    def __init__(self):
-        """Initialize Gtf object."""
-        self.parsed = False
-        self.original_columns = list(self.dtypes.keys())
-        self.free_text_columns = []
-
-    def read_file(self, annotations_file: str) -> None:
-        """Read gtf file.
-
-        Iterate over chunks of the gtf file, reading 100000 rows at a time. Filter chunks for exon annotations of
-        the highest transcript support level. Concatenate chunks to get the resulting pd.DataFrame.
-
-        Args:
-            annotations_file: Filename of annotations.
-
-        Raises:
-            ValueError: The file type is required to be gtf.
-        """
-        if not annotations_file.endswith("gtf"):
-            raise ValueError("File type needs to be gtf")
-
-        reader = pd.read_table(
-            annotations_file,
-            sep="\t",
-            comment="#",
-            names=self.dtypes.keys(),
-            dtype=self.dtypes,
-            chunksize=100000,
-            iterator=True,
-        )
-        self.df = pd.concat([filter_df(chunk) for chunk in reader])
-
-    def from_dataframe(self, df: pd.DataFrame) -> None:
-        """Initialize Gtf object from a pandas DataFrame.
-
-        Initialization includes:
-        Setting the dataframe attribute.
-        Checking which columns belong to the free-text part of the gtf file.
-        If no column is called `free_text`, setting the `parsed` attribute to True.
-
-        Args:
-            df: pd.DataFrame
-        """
-        self.free_text_columns = [
-            col for col in df.columns if col not in self.original_columns
-        ]
-        self.df = df
-        if "free_text" not in df.columns:
-            self.parsed = True
-
-    def parse_free_text(self):
-        """Parse key/value pairs from `free_text` column into column `key` with row entry `value`.
-
-        Creates a dataframe with one column per key found in the free text column, replacing the `free_text` column
-        itself. Saves it to the Gtf.df attribute.
-        """
-        assert not self.parsed
-        # create dataframe with columns for values in free_text column
-        df_free_text = self.df["free_text"].map(str_to_dict).apply(pd.Series)
-        # remember which columns come from free_text
-        self.free_text_columns = df_free_text.columns
-        # join free_text columns to original dataframe and drop the "free_text" column itself
-        self.df = self.df.drop("free_text", axis=1)
-        self.original_columns = self.df.columns
-        self.df = self.df.join(df_free_text, how="inner")
-        # remember that current dataframe is parsed, i.e. can't be written in gtf format
-        self.parsed = True
-
-    def reverse_parse_free_text(self):
-        """Reverse parsing of `free_text` column.
-
-        Creates a dataframe that can be written in gtf format to file.
        Parsed free text columns are aggregated
-        into a `free_text` column according to the gtf format specification.
-        """
-        assert self.parsed
-        # create dataframe with only free_text columns
-        df_free_text = self.df[self.free_text_columns]
-        # filter current dataframe to only original columns, except "free_text" column
-        self.df = self.df[self.original_columns]
-        # undo parsing and save result in "free_text" column
-        self.df["free_text"] = df_free_text.agg(pd.Series.to_dict, axis=1).apply(
-            dict_to_str
-        )
-        # remember that current dataframe is not parsed
-        self.parsed = False
-
-    def pick_transcript(self, transcript_id: str) -> pd.DataFrame:
-        """Filter annotations to a given transcript ID."""
-        return self.df.query(f"transcript_id == '{transcript_id}'")
-
-
-class TranscriptGenerator:
-    """Class to sample a transcript."""
-
-    def __init__(
-        self,
-        transcript_id: str,
-        transcript_count: int,
-        transcript_df: pd.DataFrame,
-        prob_inclusion: float,
-    ):
-        """Initialize TranscriptGenerator object."""
-        assert len(transcript_df) > 0
-        assert transcript_count > 0
-        assert (prob_inclusion >= 0) and (prob_inclusion <= 1)
-
-        self.id = transcript_id
-        self.count = transcript_count
-        self.df = transcript_df
-        self.no_exons = len(transcript_df)
-        self.strand = self.df["strand"].unique().item()
-        self.prob_inclusion = prob_inclusion
-
-    def _get_inclusions(self) -> np.ndarray:
-        """Generate inclusions array.
-
-        Rows correspond to the exons of the transcript and each of the `count` columns corresponds to one sample.
-
-        Returns:
-            inclusion_arr: A boolean np.ndarray, where True means intron inclusion.
-        """
-        inclusion_arr = np.random.rand(self.no_exons, self.count) < self.prob_inclusion
-        # The last exon has no downstream intron that could be included,
-        # irrespective of strand.
-        inclusion_arr[-1, :] = False
-
-        return inclusion_arr
-
-    def _get_unique_inclusions(self) -> tuple[list, np.ndarray, np.ndarray]:
-        """Deduplicate sampled intron inclusions and generate a name for each unique inclusion pattern.
-
-        Returns:
-            names: List of names for generated transcripts.
-            inclusion_arr_unique: A boolean np.ndarray where columns correspond to generated transcripts and rows to
-                intron inclusion.
-            counts: A np.ndarray containing the sample number per generated inclusion pattern, i.e. transcript.
-        """
-        inclusion_arr = self._get_inclusions()
-        # Unique intron inclusion arrays and counts
-        inclusion_arr_unique, counts = np.unique(
-            inclusion_arr, axis=1, return_counts=True
-        )
-        # Name for each generated transcript; a pattern without any inclusion
-        # keeps the original transcript ID
-        names = []
-        for i in range(inclusion_arr_unique.shape[1]):
-            if not inclusion_arr_unique[:, i].any():
-                names.append(self.id)
-            else:
-                names.append(f"{self.id}_{i}")
-
-        return names, inclusion_arr_unique, counts
-
-    def _get_df(self, inclusions: np.ndarray, transcript_id: str) -> pd.DataFrame:
-        """Build the annotation entries for one generated transcript from a boolean vector of intron inclusions.
-
-        Args:
-            inclusions (np.ndarray): A boolean vector denoting intron inclusion.
-            transcript_id (str): The transcript id.
-
-        Returns:
-            The generated transcript as a pd.DataFrame.
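-
-        Example (mirroring the unit tests): for a "+" strand transcript with
-        exons at (1-20, 50-70, 80-100) and inclusions [False, True, False],
-        the second exon's end is moved to 79, i.e. extended up to the start
-        of the following exon.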
-        """
-        df_generated = self.df.copy()
-        if self.strand == "+":
-            original_end = df_generated["end"]
-            df_generated["end"] = np.where(
-                inclusions,
-                df_generated["start"].shift(periods=-1, fill_value=-1) - 1,
-                original_end,
-            )
-        elif self.strand == "-":
-            original_start = df_generated["start"]
-            df_generated["start"] = np.where(
-                inclusions,
-                df_generated["end"].shift(periods=-1, fill_value=-1) + 1,
-                original_start,
-            )
-
-        original_id = df_generated["exon_id"]
-        df_generated["exon_id"] = np.where(
-            inclusions,
-            df_generated["exon_id"] + "_" + np.arange(len(df_generated)).astype(str),
-            original_id,
-        )
-
-        df_generated["transcript_id"] = transcript_id
-        return df_generated
-
-    def generate_transcripts(self, filename: str) -> None:
-        """Write the sampled transcripts and their counts to file.
-
-        Args:
-            filename (str): Output csv filename.
-        """
-        ids, _, counts = self._get_unique_inclusions()
-        with open(filename, "a") as fh:
-            for transcript_id, transcript_count in zip(ids, counts):
-                fh.write(f"{transcript_id},{self.id},{transcript_count}\n")
-
-    def generate_annotations(self, filename: str) -> None:
-        """Generate annotations in gtf format for the sampled transcripts.
-
-        Args:
-            filename: Output gtf filename.
-
-        Raises:
-            ValueError: If the given transcript ID could not be sampled.
-        """
-        ids, inclusions, counts = self._get_unique_inclusions()
-        n_unique = len(ids)
-
-        try:
-            df = pd.concat(
-                [self._get_df(inclusions[:, i], ids[i]) for i in range(n_unique)]
-            )
-            df = reverse_parse_free_text(df)
-
-            write_gtf(df, filename)
-            LOG.debug(f"Transcript {self.id} sampled")
-        except ValueError:
-            LOG.error(f"Transcript {self.id} could not be sampled.")
-
-
-def sample_transcripts(
-    input_transcripts_file: str,
-    input_annotations_file: str,
-    prob_inclusion: float,
-    output_transcripts_file: str,
-    output_annotations_file: str,
-):
-    """Read input files, iterate over transcript IDs, sample each transcript and save the results.
-
-    Args:
-        input_transcripts_file (str): Filename of transcript abundances, needs to be csv or tsv.
-        input_annotations_file (str): Filename of annotations, needs to be gtf.
-        prob_inclusion (float): Probability of intron inclusion, needs to be a float in range [0,1].
-        output_transcripts_file (str): Filename of file to write sampled transcripts to.
-        output_annotations_file (str): Filename of file to write generated annotations to.
-    """
-    transcripts = read_abundances(input_transcripts_file)
-
-    LOG.info("Parsing annotations...")
-    annotations = Gtf()
-    annotations.read_file(input_annotations_file)
-    annotations.parse_free_text()
-    LOG.info("Done parsing...")
-
-    LOG.info("Start sampling transcripts...")
-    # Set up output file, write header once and append data in loop
-    write_header(output_annotations_file)
-
-    for _, row in tqdm(transcripts.iterrows()):
-        transcript_id = row["id"]
-        transcript_count = row["count"]
-
-        transcript_df = annotations.pick_transcript(transcript_id)
-        generator = TranscriptGenerator(
-            transcript_id,
-            transcript_count,
-            transcript_df,
-            prob_inclusion=prob_inclusion,
-        )
-        generator.generate_annotations(output_annotations_file)
-        generator.generate_transcripts(output_transcripts_file)
-    LOG.info("Done.")
--
GitLab