Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: zavolan_group/tools/transcript-structure-generator
Commits on Source (3)
requirements.txt:

 matplotlib
 pandas
 pip
-tqdm
 mypy
 flake8
 flake8-docstrings
"""Set up project."""
from pathlib import Path
from setuptools import setup, find_packages
project_root_dir = Path(__file__).parent.resolve()
with open(project_root_dir / "requirements.txt",
"r", encoding="utf-8") as f:
INSTALL_REQUIRES = f.read().splitlines()
URL = ('https://git.scicore.unibas.ch/zavolan_group/'
'tools/transcript-structure-generator')
setup(
name='transcript-structure-generator',
version='0.2.0',
url=URL,
license='MIT',
author='Larissa Glass, Michael Zimmermann, Andri Fraenkl',
author_email='mate.balajti@unibas.ch',
description='Transcript structure generator',
packages=find_packages(),
install_requires=INSTALL_REQUIRES,
entry_points={
'console_scripts': ['transcript-structure-generator=tsg.cli:app']
}
)
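Note: the console_scripts entry point wires the installed transcript-structure-generator command to the app object in tsg.cli. A minimal sketch of the equivalent direct call (assuming only what the entry point above declares):

# Equivalent to running the installed `transcript-structure-generator`
# console script: setuptools imports `app` from `tsg.cli` and calls it.
from tsg.cli import app

if __name__ == "__main__":
    app()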
@@ -30,8 +30,8 @@ def setup_logging(loglevel: str) -> None:
         raise
     logging.basicConfig(
-        format='[%(asctime)s: %(levelname)s] \
-            %(message)s (module "%(module)s")',
+        format=('[%(asctime)s: %(levelname)s] '
+                '%(message)s (module "%(module)s")'),
         level=numeric_level,
     )
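Rationale for the format change: a backslash continuation inside a string literal removes the newline but splices the next line's indentation into the string, so log messages carried stray spaces. Adjacent string literals are concatenated at compile time and avoid this. A minimal illustration:

# Backslash continuation: the indentation of the continued line leaks
# into the string itself.
broken = '[%(asctime)s: %(levelname)s] \
    %(message)s (module "%(module)s")'

# Implicit concatenation of adjacent literals keeps the format clean.
fixed = ('[%(asctime)s: %(levelname)s] '
         '%(message)s (module "%(module)s")')

assert "    " in broken and "    " not in fixed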
@@ -4,8 +4,6 @@ import logging
 import numpy as np
 import pandas as pd  # type: ignore
-from tqdm import tqdm  # type: ignore
-

 LOG = logging.getLogger(__name__)
@@ -31,15 +29,17 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
 def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
-    """Filter annotations to include only exons \
-    with the highest transcript support level, i.e. TSL1.
+    """Filter dataframe.
+
+    Filter annotations to include only exons
+    with the highest transcript support level, i.e. TSL1.

     `feature` column is filtered on value "exon" and
-    `free_text` column is filtered to include the string \
+    `free_text` column is filtered to include the string
     denoting the highest transcript support level
     ('transcript_support_level "1"').

-    If a list of transcript IDs is given, `free_text` column \
+    If a list of transcript IDs is given, `free_text` column
     is filtered to include one of the IDs.

     Args:
@@ -47,7 +47,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
         transcript: list of transcript IDs

     Returns:
-        A pd.DataFrame containing only rows with exon annotations \
+        A pd.DataFrame containing only rows with exon annotations
         of highest transcript support level and,
         if provided, belonging to one of the given transcripts
     """
@@ -55,7 +55,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
         transcripts = []
     df_filter = gtf_df[
         (gtf_df["feature"] == "exon")
-        & (gtf_df["free_text"].str.contains('transcript_support_level "1"'))
+        & (gtf_df["free_text"].str.contains('transcript_support_level "1'))
     ]
     if len(transcripts) > 0:
         df_filter = df_filter[df_filter["free_text"].str.contains(
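A self-contained sketch of the filtering logic on toy data (column names follow the docstring; the rows are made up). Dropping the closing quote from the pattern presumably also matches Ensembl-style values such as 'transcript_support_level "1 (assigned to previous version 5)"':

import pandas as pd

# Toy stand-in for a parsed GTF dataframe (hypothetical data).
gtf_df = pd.DataFrame({
    "feature": ["exon", "exon", "gene"],
    "free_text": [
        'gene_id "G1"; transcript_id "T1"; transcript_support_level "1";',
        'gene_id "G1"; transcript_id "T2"; transcript_support_level "5";',
        'gene_id "G1";',
    ],
})

# Keep exon rows whose attributes mention the highest support level.
df_filter = gtf_df[
    (gtf_df["feature"] == "exon")
    & (gtf_df["free_text"].str.contains('transcript_support_level "1'))
]
print(df_filter)  # only the T1 exon row remains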
@@ -68,7 +68,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
 def str_to_dict(gene_string: str) -> dict:
     """Split between key/value pairs.

-    Split string based on delimiter ';' into items, remove empty items and \
+    Split string based on delimiter ';' into items, remove empty items and
     split items on delimiter ' ' into
     key/value pairs. Remove quotes from value strings and create a dictionary.
@@ -92,9 +92,9 @@ def dict_to_str(gene_dict: dict) -> str:
     Takes e.g. dictionary {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}
     and returns string 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'.

-    Key/value pairs are joined by space to form an item and items are \
-    joinded by ';' to form a string.
+    Key/value pairs are joined by space to form an item and items are
+    joined by ';' to form a string.

-    If a value is Not a Number (nan), the key/value pair is omitted \
+    If a value is Not a Number (nan), the key/value pair is omitted
     from the string.

     Args:
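Taken together, the two helpers round-trip the documented example (a sketch; str_to_dict and dict_to_str are the functions from this module):

attrs = str_to_dict('gene_id "GENE1"; transcript_id "TRANSCRIPT1";')
# -> {'gene_id': 'GENE1', 'transcript_id': 'TRANSCRIPT1'}

line = dict_to_str(attrs)
# -> 'gene_id "GENE1"; transcript_id "TRANSCRIPT1";'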
@@ -115,13 +115,15 @@ def dict_to_str(gene_dict: dict) -> str:
 def reverse_parse_free_text(df_all: pd.DataFrame) -> pd.DataFrame:
-    """Reverse parsing of gtf based pd.DataFrame to include only columns that \
-    are well defnined by gtf-file standards.
+    """Reverse parse a gtf based pd.DataFrame.
+
+    The data frame will include only columns that
+    are well defined by gtf-file standards.

     The first 8 defined columns are constant as defined by gtf-file standards.
-    Further columns are assumed to be parsed free-text columns \
+    Further columns are assumed to be parsed free-text columns
     (see Gtf.parse_free_text()).

-    The parsed free-text columns are aggregated as a dictionary and \
-    the dictionry is parsed as a string in gtf format.
+    The parsed free-text columns are aggregated as a dictionary and
+    the dictionary is parsed as a string in gtf format.

     Args:
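Conceptually, reverse parsing keeps the eight fixed GTF columns and folds every parsed attribute column back into one free_text string. A rough sketch under that assumption (not the module's actual implementation; dict_to_str as documented above):

# The eight fixed columns defined by the GTF standard.
GTF_COLUMNS = ["seqname", "source", "feature", "start",
               "end", "score", "strand", "frame"]

def reverse_parse_sketch(df_all):
    # Columns beyond the fixed eight are assumed to be parsed free-text
    # columns; aggregate each row into a dict and render it with dict_to_str.
    free_text = df_all.drop(columns=GTF_COLUMNS).apply(
        lambda row: dict_to_str(row.to_dict()), axis=1
    )
    return df_all[GTF_COLUMNS].assign(free_text=free_text)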
@@ -165,8 +167,9 @@ def write_gtf(gtf_df: pd.DataFrame, filename: str) -> None:
 def write_header(annotations_file: str) -> None:
-    """Write the header of an annotations file, consisting of the \
-    tab delimited column names.
+    """Write the header of an annotations file.
+
+    It consists of the tab delimited column names.

     Args:
         annotations_file: Filename to write header to.
@@ -182,7 +185,7 @@ class Gtf:
         dtypes: A dictionary containing column names and respective data types.
         parsed: A boolean indicating if the pd.DataFrame is parsed.
         original_columns: A list of columns not touched by parsing.
-        free_text_columns: A list of columns created during parsing \
+        free_text_columns: A list of columns created during parsing
             of column `free_text`.
     """
@@ -240,7 +243,7 @@ class Gtf:
         Part of initialization is:
         Set dataframe attribute
         Check which columns belong to the free-text part of the gtf-file.
-        Check if there are no columns called free-text and if so, sets \
+        Check if there are no columns called free-text and if so, sets
             the value of parsed attribute to TRUE.

         Args:
@@ -254,11 +257,11 @@ class Gtf:
         self.parsed = True

     def parse_key_value(self):
-        """Parse key/value pairs from `free_text` column into column `key` \
-        with row entry `value`.
+        """Parse key/value pairs.

-        Creates a dataframe with columns for keys in the free-text column \
-        instead of `free_text` column.
+        From `free_text` column into column `key` with row entry `value`.
+        Creates a dataframe with columns for keys in the free-text column
+        instead of `free_text` column.
         Saves it to Gtf.df attribute.
         """
         assert self.parsed is False
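What parse_key_value produces can be sketched on a toy row (a sketch only; the real method reads and writes the Gtf.df attribute, and str_to_dict is the module helper above):

import pandas as pd

df = pd.DataFrame({"free_text": ['gene_id "G1"; transcript_id "T1";']})

# Expand each attribute string into one column per key.
parsed = df["free_text"].map(str_to_dict).apply(pd.Series)
# parsed has columns `gene_id` and `transcript_id` instead of `free_text`.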
@@ -316,16 +319,15 @@ class TranscriptGenerator:
         strands = transcript_df["strand"].unique()
         if len(transcript_df) == 0:
             LOG.warning(
-                "Transcript %s can't be sampled. \
-                Annotation is missing", transcript_id
+                "Transcript \"%s\" can't be sampled: "
+                "Annotation is missing or TSL is not 1.", transcript_id
             )
             instance = None
         elif len(strands) > 1:
             LOG.warning(
-                "Transcript %s can't be sampled. Transcript generator \
-                is not implemented for transcripts with \
-                exons annotated on different strands",
-                transcript_id,
+                "Transcript \"%s\" can't be sampled: Transcript generator is "
+                "not implemented for transcripts with exons annotated on "
+                "different strands.", transcript_id,
             )
             instance = None
         else:
@@ -351,8 +353,8 @@ class TranscriptGenerator:
     def get_inclusions(self) -> np.ndarray:
         """Generate inclusions array.

-        Each column corresponds to one sample and the number of columns \
+        Each column corresponds to one sample and the number of columns
         corresponds to the number of samples.

         Returns:
             A boolean np.array, where True means intron inclusion.
@@ -368,16 +370,18 @@ class TranscriptGenerator:
         return inclusion_arr

     def get_unique_inclusions(self) -> tuple[list, np.ndarray, np.ndarray]:
-        """Inclusion of unique intron inclusion via arrays and counts and \
-        name generation of each unique count.
+        """Get unique inclusions.
+
+        Inclusion of unique intron inclusion via arrays and counts and
+        name generation of each unique count.

         Args:

         Returns:
             - List of names for generated exons.
-            - A boolean np.array where columns correspond to generated \
+            - A boolean np.array where columns correspond to generated
             transcripts and rows to intron inclusion.
-            - A np.array containing sample number per generated inclusions, \
+            - A np.array containing sample number per generated inclusions,
             i.e. transcript.
         """
         inclusion_arr = self.get_inclusions()
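The unique-pattern step described in the docstring maps directly onto numpy's column-wise unique; a toy illustration of the counting (name generation omitted):

import numpy as np

# Toy inclusions matrix: rows = introns, columns = samples (made up).
inclusion_arr = np.array([[True, True, False],
                          [False, False, False]])

# One column per distinct inclusion pattern, plus how many samples
# share each pattern.
unique_cols, counts = np.unique(inclusion_arr, axis=1, return_counts=True)
# counts -> array([1, 2]): one sample without, two samples with intron 1.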
@@ -398,8 +402,10 @@ class TranscriptGenerator:
     def get_df(
         self, inclusions: np.ndarray, transcript_id: str
     ) -> pd.DataFrame:
-        """Take as input a dataframe filtered to one transcript and \
-        a boolean vector denoting intron inclusions.
+        """Get dataframe.
+
+        Take as input a dataframe filtered to one transcript and
+        a boolean vector denoting intron inclusions.

         Args:
             inclusions: A boolean vector denoting intron inclusion.
@@ -467,7 +473,7 @@ class TranscriptGenerator:
         data_frame = reverse_parse_free_text(data_frame)

         write_gtf(data_frame, filename)
-        LOG.debug("Transcript %s sampled", self.ts_id)
+        LOG.debug("Transcript \"%s\" sampled.", self.ts_id)


 def sample_transcripts(
@@ -477,19 +483,21 @@ def sample_transcripts(
     output_transcripts_file: str,
     output_annotations_file: str,
 ):
-    """Read input files, iterate over transcript IDs, \
-    sample each transcript and save results.
+    """Sample transcripts.
+
+    Read input files, iterate over transcript IDs,
+    sample each transcript and save results.

     Args:
-        input_transcripts_file: Filename of transcript abundances, \
+        input_transcripts_file: Filename of transcript abundances,
             needs to be csv or tsv.
-        input_annotations_file: Filename of annotations, \
+        input_annotations_file: Filename of annotations,
             needs to be gtf.
-        prob_inclusion: Probability of intron inclusion, \
+        prob_inclusion: Probability of intron inclusion,
             needs to be float in range [0,1].
-        output_transcripts_file: Filename of file to write \
+        output_transcripts_file: Filename of file to write
             sampled transcripts to.
-        output_annotations_file: Filename of file to write \
+        output_annotations_file: Filename of file to write
             generated annotations to.
     """
     LOG.info("Probability of intron inclusion: %s", str(prob_inclusion))
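A hypothetical invocation matching the documented arguments (all file names are placeholders):

sample_transcripts(
    input_transcripts_file="abundances.tsv",
    input_annotations_file="annotations.gtf",
    prob_inclusion=0.05,
    output_transcripts_file="sampled_transcripts.csv",
    output_annotations_file="sampled_annotations.gtf",
)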
@@ -509,7 +517,7 @@ def sample_transcripts(
     # Set up output file, write header once and append data in loop
     write_header(output_annotations_file)

-    for _, row in tqdm(transcripts.iterrows()):
+    for _, row in transcripts.iterrows():
         transcript_id = row["id"]
         transcript_count = row["count"]
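With this change tqdm is gone from both the loop and requirements.txt. If optional progress reporting were ever wanted back, a guarded import keeps the dependency soft; a sketch, not part of this change:

try:
    from tqdm import tqdm  # type: ignore
except ImportError:
    # No-op fallback so tqdm stays optional.
    def tqdm(iterable, **kwargs):
        return iterable

for _, row in tqdm(transcripts.iterrows(), total=len(transcripts)):
    ...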