diff --git a/Untitled.ipynb b/Untitled.ipynb index 14c4f123ed3e8ce2c38692cb7fbea30d09590c5b..f05859bb38cbefb0db8b6a4a7ae94d302ee60aa1 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -43,7 +43,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[SeqRecord(seq=Seq('ATCCATAAAAAAAAA'), id='Transcript_1', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('CATCTCAAAAAGTCT'), id='Transcript_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAA'), id='Transcript_2', name='Transcript copy number: 11.0', description='', dbxrefs=[])]\n" + "[SeqRecord(seq=Seq('ATCCATAAAAAAAAA'), id='Transcript_1_0', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('CATCTCAAAAAGTCT'), id='Transcript_1_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAA'), id='Transcript_2_0', name='Transcript copy number: 11.0', description='', dbxrefs=[])]\n" ] } ], @@ -189,6 +189,7 @@ " <th>Binding_Probability</th>\n", " <th>Normalized_Binding_Probability</th>\n", " <th>Transcript_Copy_Number</th>\n", + " <th>cdna_ID</th>\n", " <th>priming_site</th>\n", " <th>compliment</th>\n", " </tr>\n", @@ -211,6 +212,7 @@ " <td>0.12</td>\n", " <td>0.705882</td>\n", " <td>8.0</td>\n", + " <td>Transcript_1_0</td>\n", " <td>(T, T, T, T, T, T, T, T, T, A, T, G, G, A, T)</td>\n", " <td>ATCCATAAAAAAAAA</td>\n", " </tr>\n", @@ -231,6 +233,7 @@ " <td>0.05</td>\n", " <td>0.294118</td>\n", " <td>4.0</td>\n", + " <td>Transcript_1_1</td>\n", " <td>(A, G, A, C, T, T, T, T, T, G, A, G, A, T, G)</td>\n", " <td>CATCTCAAAAAGTCT</td>\n", " </tr>\n", @@ -251,6 +254,7 @@ " <td>0.15</td>\n", " <td>1.000000</td>\n", " <td>11.0</td>\n", + " <td>Transcript_2_0</td>\n", " <td>(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)</td>\n", " <td>AAAAAAAAAAAAAAA</td>\n", " </tr>\n", @@ -271,6 +275,7 @@ " <td>0.25</td>\n", " <td>1.000000</td>\n", " <td>33.0</td>\n", + " <td>Transcript_3_0</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", @@ -291,6 +296,7 @@ " <td>0.15</td>\n", " <td>1.000000</td>\n", " <td>11.0</td>\n", + " <td>Transcript_4_0</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", @@ -311,6 +317,7 @@ " <td>0.15</td>\n", " <td>1.000000</td>\n", " <td>55.0</td>\n", + " <td>Transcript_5_0</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", @@ -343,13 +350,13 @@ "4 1 0.15 \n", "5 1 0.15 \n", "\n", - " Normalized_Binding_Probability Transcript_Copy_Number \\\n", - "0 0.705882 8.0 \n", - "1 0.294118 4.0 \n", - "2 1.000000 11.0 \n", - "3 1.000000 33.0 \n", - "4 1.000000 11.0 \n", - "5 1.000000 55.0 \n", + " Normalized_Binding_Probability Transcript_Copy_Number cdna_ID \\\n", + "0 0.705882 8.0 Transcript_1_0 \n", + "1 0.294118 4.0 Transcript_1_1 \n", + "2 1.000000 11.0 Transcript_2_0 \n", + "3 1.000000 33.0 Transcript_3_0 \n", + "4 1.000000 11.0 Transcript_4_0 \n", + "5 1.000000 55.0 Transcript_5_0 \n", "\n", " priming_site compliment \n", "0 (T, T, T, T, T, T, T, T, T, A, T, G, G, A, T) ATCCATAAAAAAAAA \n", @@ -371,7 +378,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "id": "0511af5c-780e-41ba-9fb2-17e6640c5822", + "metadata": {}, + "outputs": [], + "source": [ + "G.write_csv()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "9e7fe7be-d134-4c1e-8eea-55273c79e39d", "metadata": {}, "outputs": [], diff --git a/cdna/cdna.py b/cdna/cdna.py index edd15c4432d0f1c1dae902715126c4b3d95376dd..5615eb5f7887c9a3ab7303842c867d40ac567635 100644 --- a/cdna/cdna.py +++ b/cdna/cdna.py @@ -28,6 +28,15 @@ class cDNA_Gen: def __init__( self, fasta, gtf, cpn, output_fasta="cDNA.fasta", output_csv="cDNA.csv" ): + """_summary_ + + Args: + fasta (_type_): _description_ + gtf (_type_): _description_ + cpn (_type_): _description_ + output_fasta (str, optional): _description_. Defaults to "cDNA.fasta". + output_csv (str, optional): _description_. Defaults to "cDNA.csv". + """ # inputs self.fasta = fasta self.gtf = gtf @@ -49,6 +58,7 @@ class cDNA_Gen: self.add_compliment() self.add_records() self.write_fasta() + self.write_csv() def add_records(self): self.fasta_records = [] @@ -57,7 +67,7 @@ class cDNA_Gen: copy_number = row["Transcript_Copy_Number"] record = SeqRecord( Seq(row["compliment"]), - row["seqname"], + row["cdna_ID"], f"Transcript copy number: {copy_number}", "") self.fasta_records.append(record) @@ -96,15 +106,17 @@ class cDNA_Gen: df_input_GTF = read_gtf(self.gtf) df_input_GTF['Binding_Probability'] = pd.to_numeric(df_input_GTF['Binding_Probability']) # convert to numeric df_normalization_bind_probablility = df_input_GTF.groupby('seqname')['Binding_Probability'].sum() # extract binding probablility - - # # Add New columns to the existing DataFrame - # df_input_GTF["Normalized_Binding_Probability"] = '' - # df_input_GTF["Transcript_Copy_Number"] = '' - + count = 0 + prev_id = None # Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe for index, row in df_input_GTF.iterrows(): # GTF transcript ID - id_GTF = str(row['seqname']) + id_GTF = str(row['seqname']) + if id_GTF == prev_id: + count += 1 + else: + prev_id = None + count = 0 # CVS transcript ID id_CSV = str(row['seqname']).split('_')[1] # Calculate Normalized_Binding_Probability and add to GTF dataframe @@ -112,7 +124,9 @@ class cDNA_Gen: # Calculate Normalized_Binding_Probability and add to GTF dataframe csv_transcript_copy_number = self.df_input_CSV.loc[self.df_input_CSV['ID of transcript'] == int(id_CSV), 'Transcript copy number'].iloc[0] df_input_GTF.loc[index,'Transcript_Copy_Number'] = round(csv_transcript_copy_number * df_input_GTF.loc[index,'Normalized_Binding_Probability']) - + df_input_GTF.loc[index,'cdna_ID'] = f"{id_GTF}_{count}" + prev_id = id_GTF + self.df_input_GTF = df_input_GTF def write_fasta(self): @@ -120,7 +134,7 @@ class cDNA_Gen: SeqIO.write(self.fasta_records, self.output_fasta, "fasta") def write_csv(self): - pass + self.df_input_GTF[["cdna_ID", "Transcript_Copy_Number"]].to_csv(self.output_csv, index=False) def return_output(self): return self.output_fasta, self.output_csv diff --git a/requirements.txt b/requirements.txt index cd291123f6021cb082303d6be191fc008d64026f..3abd0040d0dc49c74b00ed3979faa68cb8b5c07d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ gtfparse +biopython \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt index 3135aba351690e706d8e7721ce014ebcf645fec7..5c28b616b49842bc376717e7a65ece2468a760c0 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,4 +1,5 @@ gtfparse +biopython black flake8 flake8-docstrings diff --git a/test_files/cDNA.csv b/test_files/cDNA.csv new file mode 100644 index 0000000000000000000000000000000000000000..d88da423c611025b828e53eb4de0409f8f7d4cbe --- /dev/null +++ b/test_files/cDNA.csv @@ -0,0 +1,7 @@ +cdna_ID,Transcript_Copy_Number +Transcript_1_0,8.0 +Transcript_1_1,4.0 +Transcript_2_0,11.0 +Transcript_3_0,33.0 +Transcript_4_0,11.0 +Transcript_5_0,55.0 diff --git a/test_files/cDNA.fasta b/test_files/cDNA.fasta index 1b8d3989aede7948effa7afb4c43a10eac63ebb1..18bd991775a765df6bad34d73d75e47e94aedf4b 100644 --- a/test_files/cDNA.fasta +++ b/test_files/cDNA.fasta @@ -1,6 +1,6 @@ ->Transcript_1 +>Transcript_1_0 ATCCATAAAAAAAAA ->Transcript_1 +>Transcript_1_1 CATCTCAAAAAGTCT ->Transcript_2 +>Transcript_2_0 AAAAAAAAAAAAAAA