Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
scRNA-seq-simulation
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
zavolan_group
pipelines
scRNA-seq-simulation
Merge requests
!14
adding cDNA generation script
Code
Review changes
Check out branch
Download
Patches
Plain diff
Closed
adding cDNA generation script
suvi
into
main
Overview
1
Commits
1
Pipelines
1
Changes
1
Closed
Suvarnan Selliah
requested to merge
suvi
into
main
3 years ago
Overview
1
Commits
1
Pipelines
1
Changes
1
Expand
solving
#5
0
0
Merge request reports
Compare
main
main (base)
and
latest version
latest version
1036b432
1 commit,
3 years ago
1 file
+
239
−
0
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
src/generate_cdna.py
0 → 100644
+
239
−
0
Options
#!/usr/bin/env python3
#Author: Suvarnan Selliah
class
GeneratecDNA
:
def
generatecDNA
(
fasta
,
gtf
,
cp_nr
,
my_output_fasta
=
"
cDNA.fasta
"
,
my_output_csv
=
"
cDNA.csv
"
,
placeholder
=
"
ph-file.csv
"
):
cDNA_transcript_id
=
1
#READING INPUT FILES / PART I
#open files
with
open
(
fasta
,
'
r
'
)
as
fa
,
open
(
gtf
,
'
r
'
)
as
gt
,
open
(
cp_nr
,
'
r
'
)
as
cp
:
#read fasta-file
for
myFastaline
in
fa
:
#search for transcript id and transcript sequence in fasta-file
fasta_id
=
""
fasta_seq
=
""
fasta_id_found
=
False
fasta_seq_found
=
False
currentFastaString
=
myFastaline
if
fasta_id_found
==
False
:
position_of_start
=
currentFastaString
.
find
(
'
>
'
)
if
position_of_start
!=
0
:
continue
elif
position_of_start
==
0
:
fasta_id
=
myFastaline
fasta_id
=
fasta_id
.
replace
(
"
>
"
,
""
)
fasta_id_found
=
True
continue
else
:
print
(
"
FASTA: Start position in fasta file not found
"
)
break
if
fasta_id_found
==
True
and
fasta_seq_found
==
False
:
while
fasta_seq_found
==
False
:
currentFastaString
=
fa
.
readline
()
zero_position
=
currentFastaString
[
0
]
if
zero_position
==
"
;
"
:
continue
elif
zero_position
==
"
>
"
:
print
(
"
FASTA: No Sequence after headline
"
)
break
else
:
fasta_seq
=
currentFastaString
fasta_seq_found
=
True
#starting to work with gtf-file
#defining variables for gtf-file
gtf_seqname
=
''
gtf_start
=
0
gtf_end
=
0
gtf_score
=
0.0
gtf_info_found
=
False
gtf_entries
=
0
gtf_list_of_lines
=
[]
gtf_prob_list
=
[]
#defining variables for csv-file
csv_trans_id
=
""
csv_gene_id
=
""
csv_count
=
0
csv_info_found
=
False
if
fasta_id_found
==
True
and
fasta_seq_found
==
True
:
# search for transcript id from fasta-file in gtf-file
for
myGTFline
in
gt
:
currentGTFString
=
myGTFline
gtf_list
=
currentGTFString
.
split
(
'
\t
'
)
gtf_seqname
=
gtf_list
[
0
]
if
gtf_seqname
==
fasta_id
:
gtf_entries
+=
1
gtf_start
=
gtf_list
[
3
]
gtf_end
=
gtf_list
[
4
]
gtf_score
=
gtf_list
[
5
]
gtf_temp_list
=
[]
gtf_temp_list
.
append
(
gtf_start
)
gtf_temp_list
.
append
(
gtf_end
)
gtf_temp_list
.
append
(
gtf_score
)
gtf_list_of_lines
.
append
(
gtf_temp_list
)
gtf_prob_list
.
append
(
gtf_score
)
else
:
continue
if
gtf_entries
!=
0
:
gtf_info_found
=
True
fasta_id_found
=
False
fasta_seq_found
=
False
assert
gtf_info_found
,
"
Sequence ID from fasta-file not found in gtf-file
"
#search copy number of transcript in 3. file/csv-file
for
myCSVline
in
cp
:
currentCSVString
=
myCSVline
csv_list
=
currentCSVString
.
split
(
'
,
'
)
csv_trans_id
=
csv_list
[
0
]
if
csv_trans_id
==
fasta_id
:
csv_gene_id
=
csv_list
[
1
]
csv_count
=
csv_list
[
2
]
csv_info_found
=
True
gtf_info_found
=
False
break
else
:
continue
assert
csv_info_found
,
"
Data (TranscriptID,GeneID,Count) from csv-file/3.file not found
"
#COMPUTATION & OUTPUT / PART II
#set score to 0 for primimg sites close to the end of sequence
csv_info_found
=
False
seq_len
=
len
(
fasta_seq
)
seq_len
-=
22
gtf_list_of_lines_len
=
len
(
gtf_list_of_lines
)
for
i
in
range
(
gtf_list_of_lines_len
):
if
gtf_list_of_lines
[
i
][
1
]
>
seq_len
:
gtf_list_of_lines
[
i
][
2
]
=
0.0
gtf_prob_list
[
i
]
=
0.0
#assign priming sites (according to score) to copy number
sum_of_score
=
0.0
for
i
in
gtf_prob_list
:
sum_of_score
+=
i
one_score
=
100
/
sum_of_score
norm_list
=
[]
gtf_prob_list_len
=
len
(
gtf_prob_list
)
for
i
in
range
(
gtf_prob_list_len
):
norm_list
.
append
((
one_score
*
gtf_prob_list
[
i
]))
one_norm
=
csv_count
/
100
distr_RNA_to_prim_sites
=
[]
norm_list_len
=
len
(
norm_list
)
for
i
in
range
(
norm_list_len
):
distr_RNA_to_prim_sites
.
append
((
one_norm
*
norm_list
[
i
]))
total_RNA_number
=
0
distr_RNA_to_prim_sites_len
=
len
(
distr_RNA_to_prim_sites
)
for
i
in
range
(
distr_RNA_to_prim_sites_len
):
total_RNA_number
+=
distr_RNA_to_prim_sites
[
i
]
new_distr_RNA_to_prim_sites
=
[]
if
total_RNA_number
!=
csv_count
:
for
i
in
range
(
distr_RNA_to_prim_sites_len
):
new_distr_RNA_to_prim_sites
.
append
(
int
(
distr_RNA_to_prim_sites
[
i
]))
new_distr_RNA_to_prim_sites_len
=
len
(
new_distr_RNA_to_prim_sites
)
counter
=
0
while
total_RNA_number
!=
csv_count
:
new_distr_RNA_to_prim_sites
[
counter
]
=
round
(
distr_RNA_to_prim_sites
[
counter
])
counter
+=
1
assert
counter
<=
new_distr_RNA_to_prim_sites_len
,
"
Calculated RNA transcripts (assigned to priming sites) are more than initial count
"
#order the priming sites
prim_sites_ordered
=
[]
for
i
in
range
(
gtf_list_of_lines_len
):
prim_sites_ordered
.
append
(
gtf_list_of_lines
[
i
][
0
])
prim_sites_ordered
.
sort
()
#searching for 2 priming sites
prim_sites_ordered_len
=
len
(
prim_sites_ordered
)
ph_1
=
0
ph_2
=
0
for
i
in
range
(
0
,
(
prim_sites_ordered_len
-
1
),
2
):
if
prim_sites_ordered
[
i
]
==
0.0
:
continue
else
:
search_for_1
=
prim_sites_ordered
[
i
]
search_for_2
=
prim_sites_ordered
[
i
+
1
]
for
j
in
range
(
gtf_list_of_lines_len
):
if
gtf_list_of_lines
[
j
][
0
]
==
search_for_1
:
ph_1
=
j
if
gtf_list_of_lines
[
j
][
0
]
==
search_for_2
:
ph_2
=
j
#making cDNA and comparing in library
start_1
=
gtf_list_of_lines
[
ph_1
][
0
]
start_2
=
gtf_list_of_lines
[
ph_2
][
0
]
start_1
-=
1
start_2
-=
1
trans_between_prim_sites
=
fasta_seq
[
start_1
:
start_2
]
cDNA
=
''
for
element
in
range
(
0
,
len
(
trans_between_prim_sites
)):
if
trans_between_prim_sites
[
element
]
==
'
A
'
:
cDNA
[
element
]
=
'
T
'
elif
trans_between_prim_sites
[
element
]
==
'
U
'
:
cDNA
[
element
]
=
'
A
'
elif
trans_between_prim_sites
[
element
]
==
'
G
'
:
cDNA
[
element
]
=
'
C
'
elif
trans_between_prim_sites
[
element
]
==
'
C
'
:
cDNA
[
element
]
=
'
G
'
else
:
assert
False
,
"
cDNA synthesis failed, position is not A,U,G or C in transcript
"
# open output files
if
i
==
0
:
with
open
(
my_output_fasta
,
'
a
'
)
as
myfasta
,
open
(
my_output_csv
,
'
a
'
)
as
mycsv
:
myfasta
.
write
(
"
>
"
+
string
(
cDNA_transcript_id
))
myfasta
.
write
(
"
\n
"
)
myfasta
.
write
(
cDNA
)
mycsv
.
write
(
"
,
"
.
join
([
cDNA_transcript_id
,
csv_gene_id
,
new_distr_RNA_to_prim_sites
[
ph_1
]]))
else
:
found_cDNA_id
=
''
found_cDNA_id_bool
=
False
with
open
(
my_output_fasta
,
'
r
'
)
as
myfasta
,
open
(
my_output_csv
,
'
r
'
)
as
mycsv
,
open
(
placeholder
,
'
w
'
)
as
phf
:
for
myline
in
myfasta
:
pos
=
myline
.
find
(
'
>
'
)
if
pos
!=
0
:
continue
if
pos
==
0
:
fid
=
myline
fid
=
fid
.
replace
(
"
>
"
,
""
)
fbool
=
False
while
fbool
==
False
:
myline
=
myfasta
.
readline
()
zer
=
myline
[
0
]
if
zer
==
"
;
"
:
continue
elif
zer
==
"
>
"
:
assert
False
,
"
Error in searching cDNA in output fasta-file
"
else
:
fseq
=
myline
fbool
=
True
if
fseq
==
cDNA
:
found_cDNA_id
=
fid
found_cDNA_id_bool
=
True
break
if
found_cDNA_id_bool
==
True
:
for
myline_csv_out
in
mycsv
:
csvlist
=
myline_csv_out
.
split
(
'
,
'
)
csvcDNAid
=
csvlist
[
0
]
if
csvcDNAid
==
found_cDNA_id
:
csvgeneid
=
csvlist
[
1
]
csvcDNAcount
=
csvlist
[
2
]
csvcDNAcount
+=
new_distr_RNA_to_prim_sites
[
ph_1
]
phf
.
write
(
"
,
"
.
join
([
csvcDNAid
,
csvgeneid
,
csvcDNAcount
]))
else
:
phf
.
write
(
myline_csv_out
)
if
found_cDNA_id_bool
==
True
:
with
open
(
my_output_csv
,
'
w
'
)
as
mycsv
,
open
(
placeholder
,
'
r
'
)
as
phf
:
for
myplaceholder
in
phf
:
mycsv
.
write
(
myplaceholder
)
else
:
cDNA_transcript_id
+=
1
with
open
(
my_output_fasta
,
'
a
'
)
as
myfasta
,
open
(
my_output_csv
,
'
a
'
)
as
mycsv
:
myfasta
.
write
(
"
>
"
+
string
(
cDNA_transcript_id
))
myfasta
.
write
(
"
\n
"
)
myfasta
.
write
(
cDNA
)
mycsv
.
write
(
"
,
"
.
join
(
[
cDNA_transcript_id
,
csv_gene_id
,
new_distr_RNA_to_prim_sites
[
ph_1
]]))
return
my_output_fasta
,
my_output_csv
\ No newline at end of file
Loading