Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
scRNA-seq-simulation
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
zavolan_group
pipelines
scRNA-seq-simulation
Commits
3f9b6546
Commit
3f9b6546
authored
3 years ago
by
Suvarnan Selliah
Browse files
Options
Downloads
Patches
Plain Diff
initial commit
parents
No related branches found
No related tags found
1 merge request
!27
Issue #5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Generate_cDNA.py
+238
-0
238 additions, 0 deletions
Generate_cDNA.py
with
238 additions
and
0 deletions
Generate_cDNA.py
0 → 100644
+
238
−
0
View file @
3f9b6546
#!/usr/bin/env python3
#Author: Suvarnan Selliah
class
GeneratecDNA
:
def
generatecDNA
(
fasta
,
gtf
,
cp_nr
,
my_output_fasta
=
"
cDNA.fasta
"
,
my_output_csv
=
"
cDNA.csv
"
,
placeholder
=
"
ph-file.csv
"
):
cDNA_transcript_id
=
1
#READING INPUT FILES / PART I
#open files
with
open
(
fasta
,
'
r
'
)
as
fa
,
open
(
gtf
,
'
r
'
)
as
gt
,
open
(
cp_nr
,
'
r
'
)
as
cp
:
#read fasta-file
for
myFastaline
in
fa
:
#search for transcript id and transcript sequence in fasta-file
fasta_id
=
""
fasta_seq
=
""
fasta_id_found
=
False
fasta_seq_found
=
False
currentFastaString
=
myFastaline
if
fasta_id_found
==
False
:
position_of_start
=
currentFastaString
.
find
(
'
>
'
)
if
position_of_start
!=
0
:
continue
elif
position_of_start
==
0
:
fasta_id
=
myFastaline
fasta_id
=
fasta_id
.
replace
(
"
>
"
,
""
)
fasta_id_found
=
True
continue
else
:
print
(
"
FASTA: Start position in fasta file not found
"
)
break
if
fasta_id_found
==
True
and
fasta_seq_found
==
False
:
while
fasta_seq_found
==
False
:
currentFastaString
=
fa
.
readline
()
zero_position
=
currentFastaString
[
0
]
if
zero_position
==
"
;
"
:
continue
elif
zero_position
==
"
>
"
:
print
(
"
FASTA: No Sequence after headline
"
)
break
else
:
fasta_seq
=
currentFastaString
fasta_seq_found
=
True
#starting to work with gtf-file
#defining variables for gtf-file
gtf_seqname
=
''
gtf_start
=
0
gtf_end
=
0
gtf_score
=
0.0
gtf_info_found
=
False
gtf_entries
=
0
gtf_list_of_lines
=
[]
gtf_prob_list
=
[]
#defining variables for csv-file
csv_trans_id
=
""
csv_gene_id
=
""
csv_count
=
0
csv_info_found
=
False
if
fasta_id_found
==
True
and
fasta_seq_found
==
True
:
# search for transcript id from fasta-file in gtf-file
for
myGTFline
in
gt
:
currentGTFString
=
myGTFline
gtf_list
=
currentGTFString
.
split
(
'
\t
'
)
gtf_seqname
=
gtf_list
[
0
]
if
gtf_seqname
==
fasta_id
:
gtf_entries
+=
1
gtf_start
=
gtf_list
[
3
]
gtf_end
=
gtf_list
[
4
]
gtf_score
=
gtf_list
[
5
]
gtf_temp_list
=
[]
gtf_temp_list
.
append
(
gtf_start
)
gtf_temp_list
.
append
(
gtf_end
)
gtf_temp_list
.
append
(
gtf_score
)
gtf_list_of_lines
.
append
(
gtf_temp_list
)
gtf_prob_list
.
append
(
gtf_score
)
else
:
continue
if
gtf_entries
!=
0
:
gtf_info_found
=
True
fasta_id_found
=
False
fasta_seq_found
=
False
assert
gtf_info_found
,
"
Sequence ID from fasta-file not found in gtf-file
"
#search copy number of transcript in 3. file/csv-file
for
myCSVline
in
cp
:
currentCSVString
=
myCSVline
csv_list
=
currentCSVString
.
split
(
'
,
'
)
csv_trans_id
=
csv_list
[
0
]
if
csv_trans_id
==
fasta_id
:
csv_gene_id
=
csv_list
[
1
]
csv_count
=
csv_list
[
2
]
csv_info_found
=
True
gtf_info_found
=
False
break
else
:
continue
assert
csv_info_found
,
"
Data (TranscriptID,GeneID,Count) from csv-file/3.file not found
"
#COMPUTATION & OUTPUT / PART II
#set score to 0 for primimg sites close to the end of sequence
seq_len
=
len
(
fasta_seq
)
seq_len
-=
22
gtf_list_of_lines_len
=
len
(
gtf_list_of_lines
)
for
i
in
range
(
gtf_list_of_lines_len
):
if
gtf_list_of_lines
[
i
][
1
]
>
seq_len
:
gtf_list_of_lines
[
i
][
2
]
=
0.0
gtf_prob_list
[
i
]
=
0.0
#assign priming sites (according to score) to copy number
sum_of_score
=
0.0
for
i
in
gtf_prob_list
:
sum_of_score
+=
i
one_score
=
100
/
sum_of_score
norm_list
=
[]
gtf_prob_list_len
=
len
(
gtf_prob_list
)
for
i
in
range
(
gtf_prob_list_len
):
norm_list
.
append
((
one_score
*
gtf_prob_list
[
i
]))
one_norm
=
csv_count
/
100
distr_RNA_to_prim_sites
=
[]
norm_list_len
=
len
(
norm_list
)
for
i
in
range
(
norm_list_len
):
distr_RNA_to_prim_sites
.
append
((
one_norm
*
norm_list
[
i
]))
total_RNA_number
=
0
distr_RNA_to_prim_sites_len
=
len
(
distr_RNA_to_prim_sites
)
for
i
in
range
(
distr_RNA_to_prim_sites_len
):
total_RNA_number
+=
distr_RNA_to_prim_sites
[
i
]
new_distr_RNA_to_prim_sites
=
[]
if
total_RNA_number
!=
csv_count
:
for
i
in
range
(
distr_RNA_to_prim_sites_len
):
new_distr_RNA_to_prim_sites
.
append
(
int
(
distr_RNA_to_prim_sites
[
i
]))
new_distr_RNA_to_prim_sites_len
=
len
(
new_distr_RNA_to_prim_sites
)
counter
=
0
while
total_RNA_number
!=
csv_count
:
new_distr_RNA_to_prim_sites
[
counter
]
=
round
(
distr_RNA_to_prim_sites
[
counter
])
counter
+=
1
assert
counter
<=
new_distr_RNA_to_prim_sites_len
,
"
Calculated RNA transcripts (assigned to priming sites) are more than initial count
"
#order the priming sites
prim_sites_ordered
=
[]
for
i
in
range
(
gtf_list_of_lines_len
):
prim_sites_ordered
.
append
(
gtf_list_of_lines
[
i
][
0
])
prim_sites_ordered
.
sort
()
#searching for 2 priming sites
prim_sites_ordered_len
=
len
(
prim_sites_ordered
)
ph_1
=
0
ph_2
=
0
for
i
in
range
(
0
,
(
prim_sites_ordered_len
-
1
),
2
):
if
prim_sites_ordered
[
i
]
==
0.0
:
continue
else
:
search_for_1
=
prim_sites_ordered
[
i
]
search_for_2
=
prim_sites_ordered
[
i
+
1
]
for
j
in
range
(
gtf_list_of_lines_len
):
if
gtf_list_of_lines
[
j
][
0
]
==
search_for_1
:
ph_1
=
j
if
gtf_list_of_lines
[
j
][
0
]
==
search_for_2
:
ph_2
=
j
#making cDNA and comparing in library
start_1
=
gtf_list_of_lines
[
ph_1
][
0
]
start_2
=
gtf_list_of_lines
[
ph_2
][
0
]
start_1
-=
1
start_2
-=
1
trans_between_prim_sites
=
fasta_seq
[
start_1
:
start_2
]
cDNA
=
''
for
element
in
range
(
0
,
len
(
trans_between_prim_sites
)):
if
trans_between_prim_sites
[
element
]
==
'
A
'
:
cDNA
[
element
]
=
'
T
'
elif
trans_between_prim_sites
[
element
]
==
'
U
'
:
cDNA
[
element
]
=
'
A
'
elif
trans_between_prim_sites
[
element
]
==
'
G
'
:
cDNA
[
element
]
=
'
C
'
elif
trans_between_prim_sites
[
element
]
==
'
C
'
:
cDNA
[
element
]
=
'
G
'
else
:
assert
False
,
"
cDNA synthesis failed, position is not A,U,G or C in transcript
"
# open output files
if
i
==
0
:
with
open
(
my_output_fasta
,
'
a
'
)
as
myfasta
,
open
(
my_output_csv
,
'
a
'
)
as
mycsv
:
myfasta
.
write
(
"
>
"
+
string
(
cDNA_transcript_id
))
myfasta
.
write
(
"
\n
"
)
myfasta
.
write
(
cDNA
)
mycsv
.
write
(
"
,
"
.
join
([
cDNA_transcript_id
,
csv_gene_id
,
new_distr_RNA_to_prim_sites
[
ph_1
]]))
else
:
found_cDNA_id
=
''
found_cDNA_id_bool
=
False
with
open
(
my_output_fasta
,
'
r
'
)
as
myfasta
,
open
(
my_output_csv
,
'
r
'
)
as
mycsv
,
open
(
placeholder
,
'
w
'
)
as
phf
:
for
myline
in
myfasta
:
pos
=
myline
.
find
(
'
>
'
)
if
pos
!=
0
:
continue
if
pos
==
0
:
fid
=
myline
fid
=
fid
.
replace
(
"
>
"
,
""
)
fbool
=
False
while
fbool
==
False
:
myline
=
myfasta
.
readline
()
zer
=
myline
[
0
]
if
zer
==
"
;
"
:
continue
elif
zer
==
"
>
"
:
assert
False
,
"
Error in searching cDNA in output fasta-file
"
else
:
fseq
=
myline
fbool
=
True
if
fseq
==
cDNA
:
found_cDNA_id
=
fid
found_cDNA_id_bool
=
True
break
if
found_cDNA_id_bool
==
True
:
for
myline_csv_out
in
mycsv
:
csvlist
=
myline_csv_out
.
split
(
'
,
'
)
csvcDNAid
=
csvlist
[
0
]
if
csvcDNAid
==
found_cDNA_id
:
csvgeneid
=
csvlist
[
1
]
csvcDNAcount
=
csvlist
[
2
]
csvcDNAcount
+=
new_distr_RNA_to_prim_sites
[
ph_1
]
phf
.
write
(
"
,
"
.
join
([
csvcDNAid
,
csvgeneid
,
csvcDNAcount
]))
else
:
phf
.
write
(
myline_csv_out
)
if
found_cDNA_id_bool
==
True
:
with
open
(
my_output_csv
,
'
w
'
)
as
mycsv
,
open
(
placeholder
,
'
r
'
)
as
phf
:
for
myplaceholder
in
phf
:
mycsv
.
write
(
myplaceholder
)
else
:
cDNA_transcript_id
+=
1
with
open
(
my_output_fasta
,
'
a
'
)
as
myfasta
,
open
(
my_output_csv
,
'
a
'
)
as
mycsv
:
myfasta
.
write
(
"
>
"
+
string
(
cDNA_transcript_id
))
myfasta
.
write
(
"
\n
"
)
myfasta
.
write
(
cDNA
)
mycsv
.
write
(
"
,
"
.
join
(
[
cDNA_transcript_id
,
csv_gene_id
,
new_distr_RNA_to_prim_sites
[
ph_1
]]))
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment