Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
Transcript structure generator
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
zavolan_group
tools
Transcript structure generator
Compare revisions
261ddccada388c0ee1ec3e691e8d84a7845b2fac to 30ca7f20184b6c9e804fc89d27ca2864ddbfbb9c
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
zavolan_group/tools/transcript-structure-generator
Select target project
No results found
30ca7f20184b6c9e804fc89d27ca2864ddbfbb9c
Select Git revision
Branches
main
review_milestone_2
Swap
Target
zavolan_group/tools/transcript-structure-generator
Select target project
zavolan_group/tools/transcript-structure-generator
1 result
261ddccada388c0ee1ec3e691e8d84a7845b2fac
Select Git revision
Branches
main
review_milestone_2
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (3)
feat: add setup.py
· b23e7f3f
Mate Balajti
authored
1 year ago
b23e7f3f
refactor: update main.py with TSL support
· f8aefb82
Mate Balajti
authored
1 year ago
f8aefb82
refactor: remove tqdm, update cli and main
· 30ca7f20
Mate Balajti
authored
1 year ago
30ca7f20
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
requirements.txt
+0
-1
0 additions, 1 deletion
requirements.txt
requirements_dev.txt
+0
-1
0 additions, 1 deletion
requirements_dev.txt
setup.py
+26
-0
26 additions, 0 deletions
setup.py
tsg/cli.py
+2
-2
2 additions, 2 deletions
tsg/cli.py
tsg/main.py
+54
-46
54 additions, 46 deletions
tsg/main.py
with
82 additions
and
50 deletions
requirements.txt
View file @
30ca7f20
matplotlib
pandas
tqdm
This diff is collapsed.
Click to expand it.
requirements_dev.txt
View file @
30ca7f20
matplotlib
pandas
pip
tqdm
mypy
flake8
flake8-docstrings
...
...
This diff is collapsed.
Click to expand it.
setup.py
0 → 100644
View file @
30ca7f20
"""
Set up project.
"""
from
pathlib
import
Path
from
setuptools
import
setup
,
find_packages
project_root_dir
=
Path
(
__file__
).
parent
.
resolve
()
with
open
(
project_root_dir
/
"
requirements.txt
"
,
"
r
"
,
encoding
=
"
utf-8
"
)
as
f
:
INSTALL_REQUIRES
=
f
.
read
().
splitlines
()
URL
=
(
'
https://git.scicore.unibas.ch/zavolan_group/
'
'
tools/transcript-structure-generator
'
)
setup
(
name
=
'
transcript-structure-generator
'
,
version
=
'
0.2.0
'
,
url
=
URL
,
license
=
'
MIT
'
,
author
=
'
Larissa Glass, Michael Zimmermann, Andri Fraenkl
'
,
author_email
=
'
mate.balajti@unibas.ch
'
,
description
=
'
Transcript structure generator
'
,
packages
=
find_packages
(),
install_requires
=
INSTALL_REQUIRES
,
entry_points
=
{
'
console_scripts
'
:
[
'
transcript-structure-generator=tsg.cli:app
'
]
}
)
This diff is collapsed.
Click to expand it.
tsg/cli.py
View file @
30ca7f20
...
...
@@ -30,8 +30,8 @@ def setup_logging(loglevel: str) -> None:
raise
logging
.
basicConfig
(
format
=
'
[%(asctime)s: %(levelname)s]
\
%(message)s (module
"
%(module)s
"
)
'
,
format
=
(
'
[%(asctime)s: %(levelname)s]
'
'
%(message)s (module
"
%(module)s
"
)
'
)
,
level
=
numeric_level
,
)
...
...
This diff is collapsed.
Click to expand it.
tsg/main.py
View file @
30ca7f20
...
...
@@ -4,8 +4,6 @@ import logging
import
numpy
as
np
import
pandas
as
pd
# type: ignore
from
tqdm
import
tqdm
# type: ignore
LOG
=
logging
.
getLogger
(
__name__
)
...
...
@@ -31,15 +29,17 @@ def read_abundances(transcripts_file: str) -> pd.DataFrame:
def
filter_df
(
gtf_df
:
pd
.
DataFrame
,
transcripts
:
list
)
->
pd
.
DataFrame
:
"""
Filter annotations to include only exons
\
with the highest transcript support level, i.e. TSL1.
"""
Filter dataframe.
Filter annotations to include only exons
with the highest transcript support level, i.e. TSL1.
`feature` column is filtered on value
"
exon
"
and
`free_text` column is filtered to include the string
\
`free_text` column is filtered to include the string
denoting the highest transcript support level
(
'
transcript_support_level
"
1
"'
).
If a list of transcript IDs is given, `free_text` column
\
If a list of transcript IDs is given, `free_text` column
is filtered to include one of the IDs.
Args:
...
...
@@ -47,7 +47,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
transcript: list of transcript IDs
Returns:
A pd.DataFrame containing only rows with exon annotations
\
A pd.DataFrame containing only rows with exon annotations
of highest transcript support level and,
if provided, belonging to one of the given transcripts
"""
...
...
@@ -55,7 +55,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
transcripts
=
[]
df_filter
=
gtf_df
[
(
gtf_df
[
"
feature
"
]
==
"
exon
"
)
&
(
gtf_df
[
"
free_text
"
].
str
.
contains
(
'
transcript_support_level
"
1
"
'
))
&
(
gtf_df
[
"
free_text
"
].
str
.
contains
(
'
transcript_support_level
"
1
'
))
]
if
len
(
transcripts
)
>
0
:
df_filter
=
df_filter
[
df_filter
[
"
free_text
"
].
str
.
contains
(
...
...
@@ -68,7 +68,7 @@ def filter_df(gtf_df: pd.DataFrame, transcripts: list) -> pd.DataFrame:
def
str_to_dict
(
gene_string
:
str
)
->
dict
:
"""
Split between key/value pairs.
Split string based on delimiter
'
;
'
into items, remove empty items and
\
Split string based on delimiter
'
;
'
into items, remove empty items and
split items on delimiter
'
'
into
key/value pairs. Remove quotes from value strings and create a dictionary.
...
...
@@ -92,9 +92,9 @@ def dict_to_str(gene_dict: dict) -> str:
Takes e.g. dictionary {
'
gene_id
'
:
'
GENE1
'
,
'
transcript_id
'
:
'
TRANSCRIPT1
'
}
and returns string
'
gene_id
"
GENE1
"
; transcript_id
"
TRANSCRIPT1
"
;
'
.
Key/value pairs are joined by space to form an item and items are
\
Key/value pairs are joined by space to form an item and items are
joined by
'
;
'
to form a string.
If a value is Not a Number (nan), the key/value pair is omitted
\
If a value is Not a Number (nan), the key/value pair is omitted
from the string.
Args:
...
...
@@ -115,13 +115,15 @@ def dict_to_str(gene_dict: dict) -> str:
def
reverse_parse_free_text
(
df_all
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
"""
Reverse parsing of gtf based pd.DataFrame to include only columns that
\
are well defined by gtf-file standards.
"""
Reverse parse a gtf based pd.DataFrame.
The data frame will include only columns that
are well defined by gtf-file standards.
The first 8 defined columns are constant as defined by gtf-file standards.
Further columns are assumed to be parsed free-text columns
\
Further columns are assumed to be parsed free-text columns
(see Gtf.parse_free_text()).
The parsed free-text columns are aggregated as a dictionary and
\
The parsed free-text columns are aggregated as a dictionary and
the dictionary is parsed as a string in gtf format.
Args:
...
...
@@ -165,8 +167,9 @@ def write_gtf(gtf_df: pd.DataFrame, filename: str) -> None:
def
write_header
(
annotations_file
:
str
)
->
None
:
"""
Write the header of an annotations file, consisting of the
\
tab delimited column names.
"""
Write the header of an annotations file.
It consists of the tab delimited column names.
Args:
annotations_file: Filename to write header to.
...
...
@@ -182,7 +185,7 @@ class Gtf:
dtypes: A dictionary containing column names and respective data types.
parsed: A boolean indicating if the pd.DataFrame is parsed.
original_columns: A list of columns not touched by parsing.
free_text_columns: A list of columns created during parsing
\
free_text_columns: A list of columns created during parsing
of column `free_text`.
"""
...
...
@@ -240,7 +243,7 @@ class Gtf:
Part of initialization is:
Set dataframe attribute
Check which columns belong to the free-text part of the gtf-file.
Check if there are no columns called free-text and if so, sets
\
Check if there are no columns called free-text and if so, sets
the value of parsed attribute to TRUE.
Args:
...
...
@@ -254,11 +257,11 @@ class Gtf:
self
.
parsed
=
True
def
parse_key_value
(
self
):
"""
Parse key/value pairs from `free_text` column into column `key`
\
with row entry `value`.
"""
Parse key/value pairs.
Creates a dataframe with columns for keys in the free-text column
\
instead of `free_text` column.
From `free_text` column into column `key` with row entry `value`.
Creates a dataframe with columns for keys in the free-text column
instead of `free_text` column.
Saves it to Gtf.df attribute.
"""
assert
self
.
parsed
is
False
...
...
@@ -316,16 +319,15 @@ class TranscriptGenerator:
strands
=
transcript_df
[
"
strand
"
].
unique
()
if
len
(
transcript_df
)
==
0
:
LOG
.
warning
(
"
Transcript
%s
can
'
t be sampled
.
\
Annotation is missing
"
,
transcript_id
"
Transcript
\"
%s
\"
can
'
t be sampled
:
"
"
Annotation is missing
or TSL is not 1.
"
,
transcript_id
)
instance
=
None
elif
len
(
strands
)
>
1
:
LOG
.
warning
(
"
Transcript %s can
'
t be sampled. Transcript generator
\
is not implemented for transcripts with
\
exons annotated on different strands
"
,
transcript_id
,
"
Transcript
\"
%s
\"
can
'
t be sampled: Transcript generator is
"
"
not implemented for transcripts with exons annotated on
"
"
different strands.
"
,
transcript_id
,
)
instance
=
None
else
:
...
...
@@ -351,8 +353,8 @@ class TranscriptGenerator:
def
get_inclusions
(
self
)
->
np
.
ndarray
:
"""
Generate inclusions array.
Each column corresponds to one sample and the number of columns
\
corresponds to the number of samples.
Each column corresponds to one sample and the number of columns
corresponds to the number of samples.
Returns:
A boolean np.array, where True means intron inclusion.
...
...
@@ -368,16 +370,18 @@ class TranscriptGenerator:
return
inclusion_arr
def
get_unique_inclusions
(
self
)
->
tuple
[
list
,
np
.
ndarray
,
np
.
ndarray
]:
"""
Inclusion of unique intron inclusion via arrays and counts and
\
name generation of each unique count.
"""
Get unique inclusions.
Inclusion of unique intron inclusion via arrays and counts and
name generation of each unique count.
Args:
Returns:
- List of names for generated exons.
- A boolean np.array where columns correspond to generated
\
- A boolean np.array where columns correspond to generated
transcripts and rows to intron inclusion.
- A np.array containing sample number per generated inclusions,
\
- A np.array containing sample number per generated inclusions,
i.e. transcript.
"""
inclusion_arr
=
self
.
get_inclusions
()
...
...
@@ -398,8 +402,10 @@ class TranscriptGenerator:
def
get_df
(
self
,
inclusions
:
np
.
ndarray
,
transcript_id
:
str
)
->
pd
.
DataFrame
:
"""
Take as input a dataframe filtered to one transcript and
\
a boolean vector denoting intron inclusions.
"""
Get dataframe.
Take as input a dataframe filtered to one transcript and
a boolean vector denoting intron inclusions.
Args:
inclusions: A boolean vector denoting intron inclusion.
...
...
@@ -467,7 +473,7 @@ class TranscriptGenerator:
data_frame
=
reverse_parse_free_text
(
data_frame
)
write_gtf
(
data_frame
,
filename
)
LOG
.
debug
(
"
Transcript
%s
sampled
"
,
self
.
ts_id
)
LOG
.
debug
(
"
Transcript
\"
%s
\"
sampled
.
"
,
self
.
ts_id
)
def
sample_transcripts
(
...
...
@@ -477,19 +483,21 @@ def sample_transcripts(
output_transcripts_file
:
str
,
output_annotations_file
:
str
,
):
"""
Read input files, iterate over transcript IDs,
\
sample each transcript and save results.
"""
Sample transcripts.
Read input files, iterate over transcript IDs,
sample each transcript and save results.
Args:
input_transcripts_file: Filename of transcript abundances,
\
input_transcripts_file: Filename of transcript abundances,
needs to be csv or tsv.
input_annotations_file: Filename of annotations,
\
input_annotations_file: Filename of annotations,
needs to be gtf.
prob_inclusion: Probability of intron inclusion,
\
prob_inclusion: Probability of intron inclusion,
needs to be float in range [0,1].
output_transcripts_file: Filename of file to write
\
output_transcripts_file: Filename of file to write
sampled transcripts to.
output_annotations_file: Filename of file to write
\
output_annotations_file: Filename of file to write
generated annotations to.
"""
LOG
.
info
(
"
Probability of intron inclusion: %s
"
,
str
(
prob_inclusion
))
...
...
@@ -509,7 +517,7 @@ def sample_transcripts(
# Set up output file, write header once and append data in loop
write_header
(
output_annotations_file
)
for
_
,
row
in
tqdm
(
transcripts
.
iterrows
()
)
:
for
_
,
row
in
transcripts
.
iterrows
():
transcript_id
=
row
[
"
id
"
]
transcript_count
=
row
[
"
count
"
]
...
...
This diff is collapsed.
Click to expand it.