Skip to content
Snippets Groups Projects
Commit 3bcdf893 authored by Eric Boittier
Browse files

[add] output csv file

parent 5c718f4b
No related branches found
No related tags found
No related merge requests found
Pipeline #14824 failed
%% Cell type:code id:96ccbf23-4a46-4bae-bef2-fdf4c2466ad3 tags:
``` python
```
%% Cell type:code id:3157488c-08cf-4f10-95f6-745613382000 tags:
``` python
from cdna.cdna import cDNA_Gen
```
%% Cell type:code id:a6f5bc24-e454-49ad-b1e3-1f5b63c66ab2 tags:
``` python
```
%% Cell type:code id:ceb225a6-3a63-49e7-a0d3-a92256012261 tags:
``` python
# Directory holding the example inputs (hard-coded absolute path).
test_path = "/Users/ericboittier/Documents/github/cdna-generator/test_files/"
fasta = f"{test_path}yeast_example.fa"
gtf = f"{test_path}Example_GTF_Input.gtf"
cpn = f"{test_path}copy_number_input.csv"
# Output locations are given relative to the current working directory.
G = cDNA_Gen(
    fasta,
    gtf,
    cpn,
    output_fasta="test_files/cDNA.fasta",
    output_csv="test_files/cDNA.csv",
)
```
%% Output
INFO:root:Extracted GTF attributes: ['Accessibility_Energy', 'Hybridization_Energy', 'Interaction_Energy', 'Number_of_binding_sites', 'Binding_Probability']
[SeqRecord(seq=Seq('ATCCATAAAAAAAAA'), id='Transcript_1', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('CATCTCAAAAAGTCT'), id='Transcript_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAA'), id='Transcript_2', name='Transcript copy number: 11.0', description='', dbxrefs=[])]
[SeqRecord(seq=Seq('ATCCATAAAAAAAAA'), id='Transcript_1_0', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('CATCTCAAAAAGTCT'), id='Transcript_1_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAA'), id='Transcript_2_0', name='Transcript copy number: 11.0', description='', dbxrefs=[])]
%% Cell type:code id:24976988-4a28-4932-8f53-d8ac7bb018f8 tags:
``` python
G.df_input_CSV
```
%% Output
index ID of transcript ID of parent transcript Transcript copy number
0 0 1 1 12
1 1 2 1 11
2 2 3 2 33
3 3 4 3 11
4 4 5 4 55
%% Cell type:code id:f7fdfef3-58b5-45c5-bd0f-e215a3b13636 tags:
``` python
G.df_input_GTF
```
%% Output
seqname source feature start end score strand frame \
0 Transcript_1 RIBlast Priming_site 10 25 NaN + 0
1 Transcript_1 RIBlast Priming_site 640 655 NaN + 0
2 Transcript_2 RIBlast Priming_site 3 18 NaN + 0
3 Transcript_3 RIBlast Priming_site 5 35 NaN + 0
4 Transcript_4 RIBlast Priming_site 5 35 NaN + 0
5 Transcript_5 RIBlast Priming_site 5 35 NaN + 0
Accessibility_Energy Hybridization_Energy Interaction_Energy \
0 1.49 -9.76 -8.74
1 1.71 -9.12 -8.34
2 1.21 -5.12 -2.34
3 1.21 -5.12 -2.34
4 1.21 -5.12 -2.34
5 1.21 -5.12 -2.34
Number_of_binding_sites Binding_Probability \
0 2 0.12
1 2 0.05
2 1 0.15
3 1 0.25
4 1 0.15
5 1 0.15
Normalized_Binding_Probability Transcript_Copy_Number \
0 0.705882 8.0
1 0.294118 4.0
2 1.000000 11.0
3 1.000000 33.0
4 1.000000 11.0
5 1.000000 55.0
Normalized_Binding_Probability Transcript_Copy_Number cdna_ID \
0 0.705882 8.0 Transcript_1_0
1 0.294118 4.0 Transcript_1_1
2 1.000000 11.0 Transcript_2_0
3 1.000000 33.0 Transcript_3_0
4 1.000000 11.0 Transcript_4_0
5 1.000000 55.0 Transcript_5_0
priming_site compliment
0 (T, T, T, T, T, T, T, T, T, A, T, G, G, A, T) ATCCATAAAAAAAAA
1 (A, G, A, C, T, T, T, T, T, G, A, G, A, T, G) CATCTCAAAAAGTCT
2 (T, T, T, T, T, T, T, T, T, T, T, T, T, T, T) AAAAAAAAAAAAAAA
3 None None
4 None None
5 None None
%% Cell type:code id:0511af5c-780e-41ba-9fb2-17e6640c5822 tags:
``` python
G.write_csv()
```
%% Cell type:code id:9e7fe7be-d134-4c1e-8eea-55273c79e39d tags:
``` python
G.add_sequences()
```
%% Cell type:code id:2eb8f224-0292-4bb0-9e27-ef3bc3676f1d tags:
``` python
G.df_input_GTF
```
%% Output
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 G.df_input_GTF
NameError: name 'G' is not defined
%% Cell type:code id:4a482960-0e05-4355-b91c-fc7f51c138c2 tags:
``` python
# Row-wise: derive column "A" from each row's seqname/start/end.
# NOTE(review): `foo` is not defined in any cell shown here — confirm where
# it is supposed to come from before re-running this cell.
G.df_input_GTF["A"] = G.df_input_GTF.apply(
    lambda row: foo(row["seqname"], row["start"], row["end"]),
    axis=1,
)
```
%% Output
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 G.df_input_GTF["A"] = G.df_input_GTF.apply(
2 lambda row: foo(row["seqname"],
3 row["start"],
4 row["end"]),
5 axis=1
6 )
NameError: name 'G' is not defined
%% Cell type:code id:304fd031-524a-4fc3-9322-9b3dd55268fb tags:
``` python
# G.df_input_GTF["A"] = G.df_input_GTF.apply(lambda row: foo(row), axis=1)
```
%% Cell type:code id:a8f7a36d-d768-4daf-8705-4cab5e06b562 tags:
``` python
G.df_input_GTF[G.df_input_GTF["seqname"]=="Transcript_1"]
```
%% Cell type:code id:b0d7f2e4-c590-49a9-89eb-b53144dba9e8 tags:
``` python
```
......
......@@ -28,6 +28,15 @@ class cDNA_Gen:
def __init__(
self, fasta, gtf, cpn, output_fasta="cDNA.fasta", output_csv="cDNA.csv"
):
"""_summary_
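Args:
fasta: FASTA input; stored as ``self.fasta`` (the example notebook passes a file path).
gtf: GTF input; stored as ``self.gtf`` and parsed with ``read_gtf``.
cpn: Copy-number CSV input (the example notebook passes a file path).
output_fasta (str, optional): Destination handed to ``SeqIO.write``. Defaults to "cDNA.fasta".
output_csv (str, optional): Destination handed to ``to_csv``. Defaults to "cDNA.csv".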
"""
# inputs
self.fasta = fasta
self.gtf = gtf
......@@ -49,6 +58,7 @@ class cDNA_Gen:
self.add_compliment()
self.add_records()
self.write_fasta()
self.write_csv()
def add_records(self):
self.fasta_records = []
......@@ -57,7 +67,7 @@ class cDNA_Gen:
copy_number = row["Transcript_Copy_Number"]
record = SeqRecord(
Seq(row["compliment"]),
row["seqname"],
row["cdna_ID"],
f"Transcript copy number: {copy_number}",
"")
self.fasta_records.append(record)
......@@ -96,15 +106,17 @@ class cDNA_Gen:
df_input_GTF = read_gtf(self.gtf)
df_input_GTF['Binding_Probability'] = pd.to_numeric(df_input_GTF['Binding_Probability']) # convert to numeric
df_normalization_bind_probablility = df_input_GTF.groupby('seqname')['Binding_Probability'].sum() # extract binding probablility
# # Add New columns to the existing DataFrame
# df_input_GTF["Normalized_Binding_Probability"] = ''
# df_input_GTF["Transcript_Copy_Number"] = ''
count = 0
prev_id = None
# Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe
for index, row in df_input_GTF.iterrows():
# GTF transcript ID
id_GTF = str(row['seqname'])
id_GTF = str(row['seqname'])
if id_GTF == prev_id:
count += 1
else:
prev_id = None
count = 0
# CSV transcript ID
id_CSV = str(row['seqname']).split('_')[1]
# Calculate Normalized_Binding_Probability and add to GTF dataframe
......@@ -112,7 +124,9 @@ class cDNA_Gen:
# Calculate Normalized_Binding_Probability and add to GTF dataframe
csv_transcript_copy_number = self.df_input_CSV.loc[self.df_input_CSV['ID of transcript'] == int(id_CSV), 'Transcript copy number'].iloc[0]
df_input_GTF.loc[index,'Transcript_Copy_Number'] = round(csv_transcript_copy_number * df_input_GTF.loc[index,'Normalized_Binding_Probability'])
df_input_GTF.loc[index,'cdna_ID'] = f"{id_GTF}_{count}"
prev_id = id_GTF
self.df_input_GTF = df_input_GTF
def write_fasta(self):
......@@ -120,7 +134,7 @@ class cDNA_Gen:
SeqIO.write(self.fasta_records, self.output_fasta, "fasta")
def write_csv(self):
    """Write the cDNA IDs and their copy numbers to ``self.output_csv``.

    Only the ``cdna_ID`` and ``Transcript_Copy_Number`` columns of
    ``self.df_input_GTF`` are exported, and the row index is left out of
    the file.
    """
    # The leftover ``pass`` placeholder was dead code once the real body
    # was added, so it is dropped here.
    exported_columns = ["cdna_ID", "Transcript_Copy_Number"]
    self.df_input_GTF[exported_columns].to_csv(self.output_csv, index=False)
def return_output(self):
    """Return the output locations as an ``(output_fasta, output_csv)`` tuple."""
    fasta_target = self.output_fasta
    csv_target = self.output_csv
    return (fasta_target, csv_target)
......
gtfparse
biopython
\ No newline at end of file
gtfparse
biopython
black
flake8
flake8-docstrings
......
cdna_ID,Transcript_Copy_Number
Transcript_1_0,8.0
Transcript_1_1,4.0
Transcript_2_0,11.0
Transcript_3_0,33.0
Transcript_4_0,11.0
Transcript_5_0,55.0
>Transcript_1
>Transcript_1_0
ATCCATAAAAAAAAA
>Transcript_1
>Transcript_1_1
CATCTCAAAAAGTCT
>Transcript_2
>Transcript_2_0
AAAAAAAAAAAAAAA
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment