Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
Transcript sampler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
zavolan_group
tools
Transcript sampler
Commits
441f364b
Commit
441f364b
authored
2 years ago
by
Laura Urbanska
Browse files
Options
Downloads
Patches
Plain Diff
added script and updated representative transcript script
parent
1d5a1253
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
scripts/match_reprTranscript_expressionLevel.py
+200
-0
200 additions, 0 deletions
scripts/match_reprTranscript_expressionLevel.py
scripts/representative_v3.py
+0
-63
0 additions, 63 deletions
scripts/representative_v3.py
scripts/representative_v4.py
+96
-0
96 additions, 0 deletions
scripts/representative_v4.py
with
296 additions
and
63 deletions
scripts/match_reprTranscript_expressionLevel.py
0 → 100644
+
200
−
0
View file @
441f364b
import
pandas
as
pd
import
json
import
re
import
rerpresentative_v4
as
repr
import
os
def dict_reprTrans_to_df(dict_reprTrans: dict) -> pd.DataFrame:
    """
    Convert a dictionary of genes and their representative transcript
    into a dataframe

        Args:
            dict_reprTrans (dict) : {'Gene': ['transcriptA'], ...}
                NOTE(review): a single "reprTrans" column is built, so each
                value is presumably expected to hold exactly one transcript;
                confirm against the caller.

        Returns:
            Pandas dataframe having "Gene" and "reprTrans" as columns, one row
            per key of the dictionary. Version suffixes (a dot followed by
            digits) are removed from the transcript identifiers.

        Raises:
            None explicitly; errors raised by pandas propagate unchanged.
    """
    df_reprTrans = pd.DataFrame.from_dict(
        dict_reprTrans, orient="index", columns=["reprTranscript"]
    )
    # Move the gene names from the index into a regular column.
    df_reprTrans = df_reprTrans.reset_index(level=0)
    df_reprTrans.columns = ["Gene", "reprTrans"]
    # Remove the complete version suffix. The previous pattern matched one
    # digit in 1-9 only, so "ENST0001.10" became "ENST00010".
    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
        r"\.\d+", "", regex=True
    )
    return df_reprTrans
def txt_to_dict(dict_txt: str) -> dict:
    """
    Convert a txt file into a dictionary

        Args:
            dict_txt (str) : path to a txt file of a dict
                structured as {'Gene': ['transcriptA', 'transcriptB'], ...}

        Returns:
            dict : the object decoded from the file content, e.g.
                {'Gene': ['transcriptA', 'transcriptB'], ...}

        Raises:
            OSError : the file cannot be opened
            json.JSONDecodeError : the content is not valid JSON once the
                single quotes have been turned into double quotes
    """
    # The context manager closes the handle even when reading fails.
    with open(dict_txt, "r") as handle:
        raw_text = handle.read()
    # A printed Python dict uses single quotes; JSON only accepts double ones.
    json_text = raw_text.replace("'", '"')
    return json.loads(json_text)
def transcripts_by_gene_inDf(df_gtfSelection: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce the gtf selection dataframe to its genes and transcripts

        Args:
            df_gtfSelection (pd.DataFrame): dataframe with "Gene",
                "Transcript" and "Support_level" columns, presumably the
                output of import_gtfSelection_to_df()

        Returns:
            df_gene (pd.DataFrame): copy of the input without the
                "Support_level" column, with "Gene" as first column and the
                version suffix (a dot followed by digits) removed from the
                "Transcript" values. The input dataframe is left unchanged.

        Raises:
            KeyError : one of the required columns is missing
    """
    df_gene = df_gtfSelection.set_index(["Gene"]).drop(columns=["Support_level"])
    # Remove the complete version suffix. The previous pattern matched one
    # digit only, so "ENST0001.12" became "ENST00012".
    df_gene["Transcript"] = df_gene["Transcript"].str.replace(
        r"\.\d+", "", regex=True
    )
    # Bring "Gene" back as the first regular column.
    return df_gene.reset_index(level=0)
def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
    """
    Convert tsv or csv file into a pandas dataframe

        Args:
            input_txt (str): csv or tsv file containing transcript
                expression level (passed straight to pandas.read_csv)

        Returns:
            df_input (pd.DataFrame): dataframe having "Transcript" and
                "Expression_level" as columns; every line of the file is
                read as data, no header row is expected

        Raises:
            None explicitly; errors raised by pandas propagate unchanged.
    """
    # The regular-expression separator accepts tabs and commas alike, which
    # requires the python parsing engine. "lineterminator" is no longer
    # passed: pandas documents it as a C-parser-only option.
    df_input = pd.read_csv(
        input_txt,
        sep=r"[\t,]",
        names=["Transcript", "Expression_level"],
        engine="python",
    )
    return df_input
def exprLevel_byGene(
    df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame
) -> pd.DataFrame:
    """
    Sum the expression level of the transcripts of each gene

        Args:
            df_exprTrasncript (pd.DataFrame): dataframe with "Transcript"
                and "Expression_level" columns
            df_output_gtf_selection (pd.DataFrame): dataframe with "Gene"
                and "Transcript" columns

        Returns:
            Pandas dataframe indexed by "Gene" with one "Expression_level"
            column holding the sum over the transcripts found in both inputs

        Raises:
            KeyError : a required column is missing
    """
    # Inner merge: only transcripts present in both dataframes are kept.
    df_merged = pd.merge(
        df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript"
    )
    # Select the column explicitly. The previous call passed the column name
    # as first argument of sum(), which is the "numeric_only" flag.
    df_sum = df_merged.groupby("Gene")[["Expression_level"]].sum()
    return df_sum
def match_byGene(
    df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame
) -> pd.DataFrame:
    """
    Find matching genes between the 2 args

        Args:
            df_reprTranscript (pd.DataFrame): dataframe containing genes
                ("Gene") and their representative transcript ("reprTrans")
            df_expressionLevel_byGene (pd.DataFrame): dataframe containing
                genes ("Gene") and their expression level
                ("Expression_level")

        Returns:
            Pandas dataframe having "reprTrans" and "Expression_level" as
            columns, restricted to the rows without any missing value

        Raises:
            KeyError : a required column is missing
    """
    df_merged = pd.merge(
        df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene"
    )
    # Genes found in only one input carry NaN after the outer merge and are
    # removed here, together with any row that was already incomplete.
    df_clean = df_merged.dropna(axis=0)
    return df_clean.loc[:, ["reprTrans", "Expression_level"]]
def output_tsv(dataframe: pd.DataFrame) -> None:
    """
    Convert pandas dataframe into a tsv file

        Args:
            dataframe (pd.DataFrame): dataframe containing
                representative transcripts and their expression level

        Returns:
            None (what DataFrame.to_csv returns when given a path). The file
            "ReprTrans_ExpressionLevel.tsv" is written in the current working
            directory, tab separated, without index and without header.

        Raises:
            OSError : the file cannot be written
    """
    # os.path.join picks the separator of the running platform. The previous
    # hard-coded backslash only worked on Windows and relied on the invalid
    # escape sequence "\R".
    out_path = os.path.join(os.getcwd(), "ReprTrans_ExpressionLevel.tsv")
    csv_file = dataframe.to_csv(out_path, sep="\t", index=False, header=False)
    return csv_file
### Functions to run this part of the program
def match_reprTranscript_expressionLevel(
    exprTrans: str, dict_reprTrans: dict, intermediate_file: str
):
    """
    Chain the helper functions of this file to replace the transcripts of an
    expression level csv/tsv file with representative transcripts

        Args:
            exprTrans (str): csv or tsv file containing transcripts
                and their expression level
            dict_reprTrans (dict) : dict of genes and their
                representative transcript
            intermediate_file (str) : intermediate txt file, read through
                import_gtfSelection_to_df() of the module imported as `repr`

        Returns:
            Whatever output_tsv() returns once the tsv file of representative
            transcripts and their expression level has been written

        Raises:
            None explicitly; errors of the helpers propagate unchanged.

        NOTE(review): the file-level import spells the module
        "rerpresentative_v4"; confirm that it resolves to representative_v4.
    """
    # Genes and their transcripts, taken from the intermediate file.
    gene_transcripts = transcripts_by_gene_inDf(
        repr.import_gtfSelection_to_df(intermediate_file)
    )
    # Expression level of each transcript.
    expression = tsv_or_csv_to_df(exprTrans)
    # Representative transcript of each gene.
    representative = dict_reprTrans_to_df(dict_reprTrans)
    # Expression summed per gene, then attached to the representative one.
    per_gene = exprLevel_byGene(expression, gene_transcripts)
    return output_tsv(match_byGene(representative, per_gene))
# Run the program
def main(argv=None):
    """
    Command line entry point: read the three input paths, build the
    dictionary of representative transcripts and run the matching

        Args:
            argv (list) : command line arguments without the program name;
                None lets argparse read sys.argv

        Returns:
            None

        Raises:
            SystemExit : raised by argparse when the arguments are invalid
    """
    # Local import so that the file-level import block stays untouched.
    import argparse

    parser = argparse.ArgumentParser(
        description="Replace transcripts of an expression level file "
        "with representative transcripts"
    )
    parser.add_argument(
        "dict_txt",
        help="dict of {gene:reprTrans} in the form of a txt file",
    )
    parser.add_argument(
        "input_intermediate_file",
        help="intermediate file generated by the transcript extractor",
    )
    parser.add_argument(
        "input_expr",
        help="csv or tsv file containing the expression level",
    )
    args = parser.parse_args(argv)

    dict_reprTrans = txt_to_dict(args.dict_txt)
    match_final = match_reprTranscript_expressionLevel(
        args.input_expr, dict_reprTrans, args.input_intermediate_file
    )
    print("this is the function :\n\n{}".format(match_final))


# The previous top-level code used the undefined names a, b and c, so the
# module failed on import, and the guard called the entry point without its
# arguments. Everything now runs only when the file is executed as a script.
if __name__ == "__main__":
    main()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
scripts/representative_v3.py
deleted
100644 → 0
+
0
−
63
View file @
1d5a1253
import
pandas
as
pd
import
re
import
itertools
'''
This code takes a gtf file as input and returns a dictionary of the transcripts with the best support level for each gene of the input
'''
##import modified gtf file and create a df##
def import_gtfSelection_to_df(gtf_modified_file):
    """
    Import the modified gtf file and create a df

        Args:
            gtf_modified_file : tab separated file (anything accepted by
                pandas.read_csv), read without header into the columns
                "Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"

        Returns:
            Pandas dataframe having "Gene", "Transcript" and "Support_level"
            as columns, without the rows that still hold a missing value
    """
    # Create a df from the tab separated file input
    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )
    # NOTE(review): Series.replace swaps whole cell values, not substrings;
    # confirm this is the intended cleaning of the support level.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
    # Gene name: first run of word characters starting with a capital letter
    # (raw string, so that "\w" is not an invalid string escape).
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
        r"([A-Z]\w{0,})", expand=False
    )
    # The "Transcript_number" column was computed and never used afterwards,
    # it is no longer built.
    # Keep the relevant columns; the copy avoids writing into a slice.
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name take the last gene name seen above them.
    # ffill() replaces the deprecated fillna(method='ffill').
    df_clean["Gene"] = df_clean["Gene"].ffill()
    return df_clean.dropna(axis=0)
##Returns a dict mapping each gene to the list of its transcripts taken from the df##
def representative_transcripts_inDict(df_gtfSelection):
    """
    Return, for each gene, the transcripts with the best support level

        Args:
            df_gtfSelection : pandas dataframe having "Gene", "Transcript"
                and "Support_level" as columns

        Returns:
            Dict {'Gene': ['transcriptA', 'transcriptB'], ...} holding, for
            every gene, the transcripts whose support level equals the
            lowest value of that gene (the best level is presumably 1)

        Raises:
            KeyError : a required column is missing
    """
    # Lowest support level of each gene, repeated on every row of the gene.
    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
    # Keep the rows that reach that level. The previous code computed the
    # minimum without using it and returned every transcript; its debug
    # print of the minimum values is gone as well.
    df_best = df_gtfSelection.loc[df_gtfSelection["Support_level"] == best_level]
    # Create a dict with only Gene and representative transcripts
    dict_representative_transcripts = (
        df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
    )
    return dict_representative_transcripts
### Add your inputs here! ###
# Path of the input file handed to import_gtfSelection_to_df().
gtf_file = "Homo_sapiens.GRCh38.107_intermediat_file.txt"

# Load the file into a df, then collect the transcripts of each gene.
df_gtf = import_gtfSelection_to_df(gtf_file)
dictionary_of_representative_transcripts = representative_transcripts_inDict(
    df_gtf
)
This diff is collapsed.
Click to expand it.
scripts/representative_v4.py
0 → 100644
+
96
−
0
View file @
441f364b
import
pandas
as
pd
'''
This part of the code takes a modified gtf file as input
and returns a dictionary of the transcripts with the best
support level for each gene of the input
'''
def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame:
    """
    Import intermediate file from gtf and create a df

        Args:
            gtf_modified_file (str) : path to the intermediate file, a tab
                separated file read without header into the columns
                "Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"

        Returns:
            Pandas dataframe having "Gene", "Transcript" and "Support_level"
            as columns, without the rows that still hold a missing value

        Raises:
            TypeError : Only str path is allowed
    """
    if not isinstance(gtf_modified_file, str):
        raise TypeError("Only str path is allowed")
    df_input = pd.read_csv(
        gtf_modified_file,
        sep="\t",
        lineterminator="\n",
        names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"],
    )
    # NOTE(review): Series.replace swaps whole cell values, not substrings;
    # confirm this is the intended cleaning of the support level.
    df_input["Support_level"] = df_input["Support_level"].replace(" ", "")
    # Gene name: first run of word characters starting with a capital letter
    # (raw string, so that "\w" is not an invalid string escape).
    df_input["Gene"] = df_input["Gene_mixed"].str.extract(
        r"([A-Z]\w{0,})", expand=False
    )
    # The "Transcript_number" column was computed and never used afterwards,
    # it is no longer built.
    # Keep the relevant columns; the copy avoids writing into a slice.
    df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]].copy()
    # Rows without a gene name take the last gene name seen above them.
    # ffill() replaces the deprecated fillna(method='ffill').
    df_clean["Gene"] = df_clean["Gene"].ffill()
    return df_clean.dropna(axis=0)
def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> dict:
    """
    Return a dict containing for each gene transcripts
    with highest confidence level

        Args:
            df_gtfSelection (pd.DataFrame): Pandas dataframe having "Gene",
                "Transcript" and "Support_level" as columns

        Returns:
            Dict {'Gene': ['transcriptA', 'transcriptB'], ...} holding, for
            every gene, the transcripts whose support level equals the
            lowest value of that gene

        Raises:
            TypeError : Only pandas DataFrame is allowed
            KeyError : a required column is missing
    """
    if not isinstance(df_gtfSelection, pd.DataFrame):
        raise TypeError("Only pandas DataFrame is allowed")
    # Lowest support level of each gene, repeated on every row of the gene
    # (per the original note: highest support level = 1, worst = 5, NA = 100).
    best_level = df_gtfSelection.groupby("Gene")["Support_level"].transform("min")
    # Keep the rows that reach that level. transform("min") alone only
    # broadcasts the minimum, so the previous code returned every transcript.
    df_best = df_gtfSelection.loc[df_gtfSelection["Support_level"] == best_level]
    dict_representative_transcripts = (
        df_best.groupby("Gene")["Transcript"].apply(list).to_dict()
    )
    return dict_representative_transcripts
def find_repr_by_SupportLevel(intermediate_file: str):
    """
    Run import_gtfSelection_to_df() on the intermediate file and hand its
    result to representative_transcripts_inDict()

        Args:
            intermediate_file : path to the intermediate file

        Returns:
            The value returned by representative_transcripts_inDict(),
            a dict {'Gene': ['transcriptA', 'transcriptB'], ...}

        Raises:
            None explicitly; errors of the two helpers propagate unchanged.
    """
    return representative_transcripts_inDict(
        import_gtfSelection_to_df(intermediate_file)
    )
if __name__ == "__main__":
    # find_repr_by_SupportLevel() needs the path of the intermediate file;
    # the previous call passed no argument and failed with a TypeError.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: representative_v4.py INTERMEDIATE_FILE")
    print(find_repr_by_SupportLevel(sys.argv[1]))
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment