diff --git a/Untitled.ipynb b/Untitled.ipynb index f05859bb38cbefb0db8b6a4a7ae94d302ee60aa1..9c9d32789d52e3a957e860b3394094ed301673fe 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -43,7 +43,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[SeqRecord(seq=Seq('ATCCATAAAAAAAAA'), id='Transcript_1_0', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('CATCTCAAAAAGTCT'), id='Transcript_1_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAA'), id='Transcript_2_0', name='Transcript copy number: 11.0', description='', dbxrefs=[])]\n" + "[SeqRecord(seq=Seq('TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATACAACCAAGT...AAA'), id='Transcript_1_0', name='Transcript copy number: 8.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATACAACCAAGT...TCT'), id='Transcript_1_1', name='Transcript copy number: 4.0', description='', dbxrefs=[]), SeqRecord(seq=Seq('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAAATTCTTTC...AAA'), id='Transcript_2_0', name='Transcript copy number: 11.0', description='', dbxrefs=[])]\n" ] } ], @@ -213,8 +213,8 @@ " <td>0.705882</td>\n", " <td>8.0</td>\n", " <td>Transcript_1_0</td>\n", - " <td>(T, T, T, T, T, T, T, T, T, A, T, G, G, A, T)</td>\n", - " <td>ATCCATAAAAAAAAA</td>\n", + " <td>(T, T, T, T, T, T, T, T, T, A, T, G, G, A, T, ...</td>\n", + " <td>TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", @@ -234,8 +234,8 @@ " <td>0.294118</td>\n", " <td>4.0</td>\n", " <td>Transcript_1_1</td>\n", - " <td>(A, G, A, C, T, T, T, T, T, G, A, G, A, T, G)</td>\n", - " <td>CATCTCAAAAAGTCT</td>\n", + " <td>(A, G, A, C, T, T, T, T, T, G, A, G, A, T, G, ...</td>\n", + " <td>TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", @@ -255,8 +255,8 @@ " <td>1.000000</td>\n", " <td>11.0</td>\n", " <td>Transcript_2_0</td>\n", - " <td>(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)</td>\n", - " <td>AAAAAAAAAAAAAAA</td>\n", + " <td>(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, ...</td>\n", + " <td>AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAA...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", @@ -358,13 +358,21 @@ "4 1.000000 11.0 Transcript_4_0 \n", "5 1.000000 55.0 Transcript_5_0 \n", "\n", - " priming_site compliment \n", - "0 (T, T, T, T, T, T, T, T, T, A, T, G, G, A, T) ATCCATAAAAAAAAA \n", - "1 (A, G, A, C, T, T, T, T, T, G, A, G, A, T, G) CATCTCAAAAAGTCT \n", - "2 (T, T, T, T, T, T, T, T, T, T, T, T, T, T, T) AAAAAAAAAAAAAAA \n", - "3 None None \n", - "4 None None \n", - "5 None None " + " priming_site \\\n", + "0 (T, T, T, T, T, T, T, T, T, A, T, G, G, A, T, ... \n", + "1 (A, G, A, C, T, T, T, T, T, G, A, G, A, T, G, ... \n", + "2 (T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, ... \n", + "3 None \n", + "4 None \n", + "5 None \n", + "\n", + " compliment \n", + "0 TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC... \n", + "1 TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC... \n", + "2 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAA... \n", + "3 None \n", + "4 None \n", + "5 None " ] }, "execution_count": 4, @@ -398,20 +406,235 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "2eb8f224-0292-4bb0-9e27-ef3bc3676f1d", "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'G' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mG\u001b[49m\u001b[38;5;241m.\u001b[39mdf_input_GTF\n", - "\u001b[0;31mNameError\u001b[0m: name 'G' is not defined" - ] + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>seqname</th>\n", + " <th>source</th>\n", + " <th>feature</th>\n", + " <th>start</th>\n", + " <th>end</th>\n", + " <th>score</th>\n", + " <th>strand</th>\n", + " <th>frame</th>\n", + " <th>Accessibility_Energy</th>\n", + " <th>Hybridization_Energy</th>\n", + " <th>Interaction_Energy</th>\n", + " <th>Number_of_binding_sites</th>\n", + " <th>Binding_Probability</th>\n", + " <th>Normalized_Binding_Probability</th>\n", + " <th>Transcript_Copy_Number</th>\n", + " <th>cdna_ID</th>\n", + " <th>priming_site</th>\n", + " <th>compliment</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Transcript_1</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>10</td>\n", + " <td>25</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.49</td>\n", + " <td>-9.76</td>\n", + " <td>-8.74</td>\n", + " <td>2</td>\n", + " <td>0.12</td>\n", + " <td>0.705882</td>\n", + " <td>8.0</td>\n", + " <td>Transcript_1_0</td>\n", + " <td>(T, T, T, T, T, T, T, T, T, A, T, G, G, A, T, ...</td>\n", + " <td>TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Transcript_1</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>640</td>\n", + " <td>655</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.71</td>\n", + " <td>-9.12</td>\n", + " <td>-8.34</td>\n", + " <td>2</td>\n", + " <td>0.05</td>\n", + " <td>0.294118</td>\n", + " <td>4.0</td>\n", + " <td>Transcript_1_1</td>\n", + " <td>(A, G, A, C, T, T, T, T, T, G, A, G, A, T, G, ...</td>\n", + " <td>TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Transcript_2</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>3</td>\n", + " <td>18</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.21</td>\n", + " <td>-5.12</td>\n", + " <td>-2.34</td>\n", + " <td>1</td>\n", + " <td>0.15</td>\n", + " <td>1.000000</td>\n", + " <td>11.0</td>\n", + " <td>Transcript_2_0</td>\n", + " <td>(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, ...</td>\n", + " <td>AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAA...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Transcript_3</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>5</td>\n", + " <td>35</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.21</td>\n", + " <td>-5.12</td>\n", + " <td>-2.34</td>\n", + " <td>1</td>\n", + " <td>0.25</td>\n", + " <td>1.000000</td>\n", + " <td>33.0</td>\n", + " <td>Transcript_3_0</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Transcript_4</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>5</td>\n", + " <td>35</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.21</td>\n", + " <td>-5.12</td>\n", + " <td>-2.34</td>\n", + " <td>1</td>\n", + " <td>0.15</td>\n", + " <td>1.000000</td>\n", + " <td>11.0</td>\n", + " <td>Transcript_4_0</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Transcript_5</td>\n", + " <td>RIBlast</td>\n", + " <td>Priming_site</td>\n", + " <td>5</td>\n", + " <td>35</td>\n", + " <td>NaN</td>\n", + " <td>+</td>\n", + " <td>0</td>\n", + " <td>1.21</td>\n", + " <td>-5.12</td>\n", + " <td>-2.34</td>\n", + " <td>1</td>\n", + " <td>0.15</td>\n", + " <td>1.000000</td>\n", + " <td>55.0</td>\n", + " <td>Transcript_5_0</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " seqname source feature start end score strand frame \\\n", + "0 Transcript_1 RIBlast Priming_site 10 25 NaN + 0 \n", + "1 Transcript_1 RIBlast Priming_site 640 655 NaN + 0 \n", + "2 Transcript_2 RIBlast Priming_site 3 18 NaN + 0 \n", + "3 Transcript_3 RIBlast Priming_site 5 35 NaN + 0 \n", + "4 Transcript_4 RIBlast Priming_site 5 35 NaN + 0 \n", + "5 Transcript_5 RIBlast Priming_site 5 35 NaN + 0 \n", + "\n", + " Accessibility_Energy Hybridization_Energy Interaction_Energy \\\n", + "0 1.49 -9.76 -8.74 \n", + "1 1.71 -9.12 -8.34 \n", + "2 1.21 -5.12 -2.34 \n", + "3 1.21 -5.12 -2.34 \n", + "4 1.21 -5.12 -2.34 \n", + "5 1.21 -5.12 -2.34 \n", + "\n", + " Number_of_binding_sites Binding_Probability \\\n", + "0 2 0.12 \n", + "1 2 0.05 \n", + "2 1 0.15 \n", + "3 1 0.25 \n", + "4 1 0.15 \n", + "5 1 0.15 \n", + "\n", + " Normalized_Binding_Probability Transcript_Copy_Number cdna_ID \\\n", + "0 0.705882 8.0 Transcript_1_0 \n", + "1 0.294118 4.0 Transcript_1_1 \n", + "2 1.000000 11.0 Transcript_2_0 \n", + "3 1.000000 33.0 Transcript_3_0 \n", + "4 1.000000 11.0 Transcript_4_0 \n", + "5 1.000000 55.0 Transcript_5_0 \n", + "\n", + " priming_site \\\n", + "0 (T, T, T, T, T, T, T, T, T, A, T, G, G, A, T, ... \n", + "1 (A, G, A, C, T, T, T, T, T, G, A, G, A, T, G, ... \n", + "2 (T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, ... \n", + "3 None \n", + "4 None \n", + "5 None \n", + "\n", + " compliment \n", + "0 TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC... \n", + "1 TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATAC... \n", + "2 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAA... \n", + "3 None \n", + "4 None \n", + "5 None " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -420,30 +643,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "4a482960-0e05-4355-b91c-fc7f51c138c2", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'G' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [4]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m G\u001b[38;5;241m.\u001b[39mdf_input_GTF[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mA\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mG\u001b[49m\u001b[38;5;241m.\u001b[39mdf_input_GTF\u001b[38;5;241m.\u001b[39mapply(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mlambda\u001b[39;00m row: foo(row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mseqname\u001b[39m\u001b[38;5;124m\"\u001b[39m], \n\u001b[1;32m 3\u001b[0m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstart\u001b[39m\u001b[38;5;124m\"\u001b[39m], \n\u001b[1;32m 4\u001b[0m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mend\u001b[39m\u001b[38;5;124m\"\u001b[39m]), \n\u001b[1;32m 5\u001b[0m axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 6\u001b[0m )\n", - "\u001b[0;31mNameError\u001b[0m: name 'G' is not defined" - ] - } - ], - "source": [ - "G.df_input_GTF[\"A\"] = G.df_input_GTF.apply(\n", - " lambda row: foo(row[\"seqname\"], \n", - " row[\"start\"], \n", - " row[\"end\"]), \n", - " axis=1\n", - ")" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", diff --git a/cdna/cdna.py b/cdna/cdna.py index 5615eb5f7887c9a3ab7303842c867d40ac567635..a6025f8f86b5a9634b809e8fa7b6a1f0dd63a041 100644 --- a/cdna/cdna.py +++ b/cdna/cdna.py @@ -1,6 +1,8 @@ import sys import warnings +import logging +from cli import parser import pandas as pd from Bio import SeqIO from Bio.Seq import Seq @@ -8,41 +10,32 @@ from Bio.SeqRecord import SeqRecord from gtfparse import read_gtf # ignore warnings from read_gtf -warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action="ignore", category=FutureWarning) -def compliment(res): + +def compliment(res: str) -> str: translate_dict = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G"} if res not in translate_dict.keys(): print(f"Unknown character, {res}") - sys.exit(1) + sys.exit(1) return translate_dict[res] -def seq_compliment(sequence): + +def seq_compliment(sequence: str) -> str: if sequence is None: - return None - _ = "".join([compliment(char) for char in str(sequence)])[::-1] # reverse string + return "None" + _ = "".join([compliment(char) for char in str(sequence)])[::-1] # reverse string return _ -class cDNA_Gen: - def __init__( - self, fasta, gtf, cpn, output_fasta="cDNA.fasta", output_csv="cDNA.csv" - ): - """_summary_ - - Args: - fasta (_type_): _description_ - gtf (_type_): _description_ - cpn (_type_): _description_ - output_fasta (str, optional): _description_. Defaults to "cDNA.fasta". - output_csv (str, optional): _description_. Defaults to "cDNA.csv". - """ +class CDNAGen: + def __init__(self, ifasta: str, igtf: str, icpn: str, ofasta: str, ocsv: str): # inputs - self.fasta = fasta - self.gtf = gtf - self.cpn = cpn - self.output_fasta = output_fasta - self.output_csv = output_csv + self.fasta = ifasta + self.gtf = igtf + self.cpn = icpn + self.output_fasta = ofasta + self.output_csv = ocsv # variables self.fasta_dict = None @@ -66,27 +59,28 @@ class cDNA_Gen: if row["compliment"] is not None: copy_number = row["Transcript_Copy_Number"] record = SeqRecord( - Seq(row["compliment"]), - row["cdna_ID"], + Seq(row["compliment"]), + row["cdna_ID"], f"Transcript copy number: {copy_number}", - "") + "", + ) self.fasta_records.append(record) def add_sequences(self): self.df_input_GTF["priming_site"] = self.df_input_GTF.apply( - lambda row: self.read_primingsite(row["seqname"], - row["start"], - row["end"]), - axis=1) + lambda row: self.read_primingsite(row["seqname"], row["start"]), + axis=1, + ) def add_compliment(self): self.df_input_GTF["compliment"] = self.df_input_GTF["priming_site"].apply( - lambda x: seq_compliment(x)) + lambda x: seq_compliment(x) + ) - def read_primingsite(self, sequence, start, end): + def read_primingsite(self, sequence, start): if sequence not in self.fasta_dict.keys(): return None - _ = self.fasta_dict[sequence].seq[start:end] + _ = self.fasta_dict[sequence].seq[start:] return _ def read_fasta(self): @@ -96,35 +90,48 @@ class cDNA_Gen: def read_csv(self): df_input_CSV = pd.read_csv(self.cpn, index_col=False) - df_input_CSV = df_input_CSV.reset_index() # make sure indexes pair with number of rows + df_input_CSV = ( + df_input_CSV.reset_index() + ) # make sure indexes pair with number of rows self.df_input_CSV = df_input_CSV - def read_gtf(self): # returns GTF with essential columns such as "feature", "seqname", "start", "end" # alongside the names of any optional keys which appeared in the attribute column df_input_GTF = read_gtf(self.gtf) - df_input_GTF['Binding_Probability'] = pd.to_numeric(df_input_GTF['Binding_Probability']) # convert to numeric - df_normalization_bind_probablility = df_input_GTF.groupby('seqname')['Binding_Probability'].sum() # extract binding probablility + df_input_GTF["Binding_Probability"] = pd.to_numeric( + df_input_GTF["Binding_Probability"] + ) # convert to numeric + df_normalization_bind_probablility = df_input_GTF.groupby("seqname")[ + "Binding_Probability" + ].sum() # extract binding probablility count = 0 prev_id = None # Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe for index, row in df_input_GTF.iterrows(): - # GTF transcript ID - id_GTF = str(row['seqname']) + # GTF transcript ID + id_GTF = str(row["seqname"]) if id_GTF == prev_id: count += 1 else: prev_id = None - count = 0 - # CVS transcript ID - id_CSV = str(row['seqname']).split('_')[1] + count = 0 + # CVS transcript ID + id_CSV = str(row["seqname"]).split("_")[1] # Calculate Normalized_Binding_Probability and add to GTF dataframe - df_input_GTF.loc[index, 'Normalized_Binding_Probability'] = row['Binding_Probability'] / df_normalization_bind_probablility[id_GTF] + df_input_GTF.loc[index, "Normalized_Binding_Probability"] = ( + row["Binding_Probability"] / df_normalization_bind_probablility[id_GTF] + ) # Calculate Normalized_Binding_Probability and add to GTF dataframe - csv_transcript_copy_number = self.df_input_CSV.loc[self.df_input_CSV['ID of transcript'] == int(id_CSV), 'Transcript copy number'].iloc[0] - df_input_GTF.loc[index,'Transcript_Copy_Number'] = round(csv_transcript_copy_number * df_input_GTF.loc[index,'Normalized_Binding_Probability']) - df_input_GTF.loc[index,'cdna_ID'] = f"{id_GTF}_{count}" + csv_transcript_copy_number = self.df_input_CSV.loc[ + self.df_input_CSV["ID of transcript"] == int(id_CSV), + "Transcript copy number", + ].iloc[0] + df_input_GTF.loc[index, "Transcript_Copy_Number"] = round( + csv_transcript_copy_number + * df_input_GTF.loc[index, "Normalized_Binding_Probability"] + ) + df_input_GTF.loc[index, "cdna_ID"] = f"{id_GTF}_{count}" prev_id = id_GTF self.df_input_GTF = df_input_GTF @@ -134,13 +141,18 @@ class cDNA_Gen: SeqIO.write(self.fasta_records, self.output_fasta, "fasta") def write_csv(self): - self.df_input_GTF[["cdna_ID", "Transcript_Copy_Number"]].to_csv(self.output_csv, index=False) + self.df_input_GTF[["cdna_ID", "Transcript_Copy_Number"]].to_csv( + self.output_csv, index=False + ) def return_output(self): return self.output_fasta, self.output_csv - -if __name__ == "__main__": - import argparse - pass +if __name__ == "main": + logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', + level=logging.INFO, + ) + LOG = logging.getLogger(__name__) + cnda_object = parser() diff --git a/cdna/cli.py b/cdna/cli.py index 8a623693ef8b7df6bfea53c8825d65639ba81a6a..8c533020acc431a65bdf5766a42eb4a94a1c4351 100644 --- a/cdna/cli.py +++ b/cdna/cli.py @@ -1 +1,23 @@ import cdna +import argparse + + +def parser(): + parser = argparse.ArgumentParser( + prog="cDNA generator", + description="Generate cDNA sequences based on primer probabilities.", + ) + parser.add_argument("--input_fasta_file", help="genome fasta file") + parser.add_argument("--input_gtf", help="gtf file") + parser.add_argument("--output_fasta_name", help="output fasta file") + parser.add_argument("--input_copy_number", help="input copy number (csv) file") + parser.add_argument("--output_csv_name", help="output fasta file") + args = parser.parse_args() + CDNA = cdna.cdna.CDNAGen( + ifasta=args["input_fasta_file"], + igtf=args["input_gtf_file"], + icpn=args["input_copy_number"], + ocsv=args["output_csv_name"], + ofasta=args["output_fasta_name"], + ) + return CDNA diff --git a/test_files/cDNA.fasta b/test_files/cDNA.fasta index 18bd991775a765df6bad34d73d75e47e94aedf4b..2168d9f9c386c2bf4ef6db01ea350623b1a055ef 100644 --- a/test_files/cDNA.fasta +++ b/test_files/cDNA.fasta @@ -1,6 +1,123 @@ >Transcript_1_0 -ATCCATAAAAAAAAA +TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATACAACCAAGTGTTCCC +AGAAATTTTCTTAAAATGCACACCGGCTAAACCAACAATACGAACCTTGACGATATGAAT +TTCGAATTTGATGGGCTCTTTTTCTGTTGTAGTCATGCCTTCGCTACCGTCACCTTGTTG +GTGGATGTTCTCCAAAGATGTTCCCGAATGCTTATCATCTCCATAGGTTTCGGTAATAGA +AAAAAAAAAAAAAAAAAAATGTTTTTCTCTGGTGAATGGAATTAGTTGTCAAAGGAGAAG +AAGGTGAGTATTTATTAGAGCCTTGACGTCTAATGGATTCGTGGTGCGAAAGTGGCATGA +TCGATCTTGGAGTCCCTTCTCTCTTCGCTGCCACATTATTGGAAGAGAACCTTTGCATAC +AAACAAAACCACCCTTAACCTCCTTGAACTCTATATTCATTTTCCTCAGAACAAACATAA +TCTTGTATCTAACAATTGGCAAAGGCTTCGATGAAGTTGTCTGAACAGAAAAGAAACCCT +TCAAAAATAAAGAACGGGGGAAATCGATGGATGGCATAGAACCGGCTGGAGCTCTAGAAG +CTTCCTGCAATATCTCTTCATCTGTTAATTCCTTCTCCACAGATGGCGAGTTTATGTGTG +CGTTCACAGAATGCACGCTGACGCTATCATTCGTCTGTGAAGTAACATTTCCCAAACTAT +CGGATTTGTTATCGTCACTTGATTCTAAGAACCCAGTATCTATAAGCTCTTGCTGAGGAT +AGGCAGATGAAGGCATTGGGGGTCTCATATATTTGAGTGATTCACGACGAGCATGACCAA +CTGATTTAGCTCTTGCGCTTGGATGTAGTTTTCGACCTTTTGCAACATTGAGAGGTGGTA +AGGGATGATTCTCGTCGTCGTCGCTTCCTTCAAACTTCTGTGCAGCGTAAGCTTCCCGCC +CGCTTCCTTTTGCAACTTCATTATTCTGTTTATTAATATCATGCAATTTCATTATATCTT +CTTCAATTTGCTTCTCCCTTTGCTTCTCTACCATATTTTCTGCATCCGCAGGCAATGCTG +GAAAGTCACCTTTATTGCCACCTCTGCTATCGTTTGTCTTGACAGGCTCATTCATGGTAT +AATTTGAATCATAACTCGGTTTAGCAAAACCAGGGACAAAGTCTGAAACAGCGCGAGAAT +GCGTCTTTTGCACAGGTTTATTGATACTATTATTACTATTAACAGATGTCTGTTCAATGG +TCTTTTTACGGCGCTGTGATAATTTTCTAAATATATTCCCAAACTTCGCCTTTTCTGAGG +GATCGCCAGTAGTTGAAGGTACCCTAAATTGTTGATAGTCATTTCCCTGAGGAGTTGGAG +AGAGAGCATATTCCATTTCACGGCGTTGATTATCAGAACTTTTTCTACTCGTTGGTGACG +TATGCGCTTGTTCTGGCATTGCTAACCTCGGTGGAATTAAGACATGTAAGGGGGGCTCTG +TGCCATCGCTATTCTTTGGAGTGAATGGCACAGAGGTATCTTTTGTCGCCAACGTGGCTT +CTGGCTCTGATTTCATAGCGGTTGTATTGACCTCTACGTCCCTTTGCTTCACACTTTCCG +GTATTTCAGAAAGCTTTTCTATATTTTCGTGGCTCTGTCGCTGTTGTTGATTCCGAATTT +TCGCATGTTTCCTATCAAGCATCTCAGAAGTCAAGTAATATATGGAAATCATTGGATGAT +ATGCCCGAGTTGGGTCTTCAAAACTCTCTGTTATGTTTGGCGCGTTTCCGTTATCACTGG +ATTCTGCGTTCATTTTGGCCGCTAAAGTCCAGTATTGTCTAGAGAGAAGAACGTAAGTAG +GATCTGTGATTATACTGACCAAAACACTTCTTGTTTCCTCTACATCGTCAATAAATTCTA +AACGGTACATTTCTTTTAAGACATTTATATCAAGCATTTCGATAGTTAGGGGAACTCTTT +TCGGTAAGTAAGAAGGAGGGGGACCATTGAACCCTCTTACCATCCAGTGGTGCTCCACAA +CCTGTTTAAGTGTGGCTCTTCTTTTCGGATCTACTACCAACATTTTGGATAACAGTGATA +TTACTTCGATAGATAAATGTTGGGGATATTCAACCTTACCTTGCTTGATCTTTTCATGTA +AAACGCTCGAATTTTCGTCGTCAAATGGCACTTTACCGCATACCAAAACAAATAAAACTA +CACCAAATGACCAGACATCTACTTCAGGTCCTGTATAAGGATTCGCTTTTAACAGCTCGG +GAGCGGCAAAATACAGAGAGCCACAGAATGTATGAAGCTGCTTCCTAGAATCATAAATAT +TTGAAAGTCCAAAATCAATTATCTTGATTTCACTGGAATCTGAAATCATTATATTTTCTA +TCTTCAAATCTCTATGGACGATGTTGTTAGCATGCAAATATATTAAGGCGCTCGCGATAC +CCCTAGCAAACTTTCTCGCTTGGTGTTCTCGTATTGACCCATGTTGGATGATATAGTCTA +ACAGCTGACCACCTGAAACATATTCAAACAACATATAGAAATGATTTGACAACGTGCACA +TCTCAAAAAGTCTACATATGTGTGGATGATATAAGATTTGTCCCAAGGACGCTTCTCGAA +TAGTTCTTTTGTCCCTAGATATTTCCTTCTCTAATTTCTTCTGTCTCTCCAAAACATCTT +GTTCATTTTTGGGTGGTGGCAGCATTTGTTCTTTATGTAGGAAAGCCTTTGTAGCACGGT +TTACAATTTTTACTGCACAAACCTCATTAGTGTAACGATGTTTTGCCAGCTTCACTTTAC +CCATAGAACCTGCACCAACTGTTTCAACAAACTCCCAATCTCCTAACGATTTTCGATGAA +ACTGCTTAGGCATGCCCTGAGAAGACGAAACTCGACTCTGGCTGGTAGTATTTGGCTTTG +GTGCATTCTCTCTCGATTTACCTTCAAGTTCTACTTGTCTCTCCTTTTGCTCTGCTTGAT +GTGAGTTACCATTATTGGCGTATTTGATATCCGCCGGGGGCATTAGCGGTGTGTTCTGCT +GCTGCTGCTGCTGTGGACTTTTTCCCATCATTCTCAGCGTAGCGGGCGCCATAGTGCTTG +GTTGTGTATGCATGCTGTTGCTTTCACTATTGCCATCATCCTGCTGGTTACCTCTGCCCA +TTGAGAAGGCAGTATTTACGTGATAATCATCCATAAAAAAAAA >Transcript_1_1 -CATCTCAAAAAGTCT +TTACAACTTTAGTTCTTTTAATATACTAGAAGCCAGCTCTTTATACAACCAAGTGTTCCC +AGAAATTTTCTTAAAATGCACACCGGCTAAACCAACAATACGAACCTTGACGATATGAAT +TTCGAATTTGATGGGCTCTTTTTCTGTTGTAGTCATGCCTTCGCTACCGTCACCTTGTTG +GTGGATGTTCTCCAAAGATGTTCCCGAATGCTTATCATCTCCATAGGTTTCGGTAATAGA +AAAAAAAAAAAAAAAAAAATGTTTTTCTCTGGTGAATGGAATTAGTTGTCAAAGGAGAAG +AAGGTGAGTATTTATTAGAGCCTTGACGTCTAATGGATTCGTGGTGCGAAAGTGGCATGA +TCGATCTTGGAGTCCCTTCTCTCTTCGCTGCCACATTATTGGAAGAGAACCTTTGCATAC +AAACAAAACCACCCTTAACCTCCTTGAACTCTATATTCATTTTCCTCAGAACAAACATAA +TCTTGTATCTAACAATTGGCAAAGGCTTCGATGAAGTTGTCTGAACAGAAAAGAAACCCT +TCAAAAATAAAGAACGGGGGAAATCGATGGATGGCATAGAACCGGCTGGAGCTCTAGAAG +CTTCCTGCAATATCTCTTCATCTGTTAATTCCTTCTCCACAGATGGCGAGTTTATGTGTG +CGTTCACAGAATGCACGCTGACGCTATCATTCGTCTGTGAAGTAACATTTCCCAAACTAT +CGGATTTGTTATCGTCACTTGATTCTAAGAACCCAGTATCTATAAGCTCTTGCTGAGGAT +AGGCAGATGAAGGCATTGGGGGTCTCATATATTTGAGTGATTCACGACGAGCATGACCAA +CTGATTTAGCTCTTGCGCTTGGATGTAGTTTTCGACCTTTTGCAACATTGAGAGGTGGTA +AGGGATGATTCTCGTCGTCGTCGCTTCCTTCAAACTTCTGTGCAGCGTAAGCTTCCCGCC +CGCTTCCTTTTGCAACTTCATTATTCTGTTTATTAATATCATGCAATTTCATTATATCTT +CTTCAATTTGCTTCTCCCTTTGCTTCTCTACCATATTTTCTGCATCCGCAGGCAATGCTG +GAAAGTCACCTTTATTGCCACCTCTGCTATCGTTTGTCTTGACAGGCTCATTCATGGTAT +AATTTGAATCATAACTCGGTTTAGCAAAACCAGGGACAAAGTCTGAAACAGCGCGAGAAT +GCGTCTTTTGCACAGGTTTATTGATACTATTATTACTATTAACAGATGTCTGTTCAATGG +TCTTTTTACGGCGCTGTGATAATTTTCTAAATATATTCCCAAACTTCGCCTTTTCTGAGG +GATCGCCAGTAGTTGAAGGTACCCTAAATTGTTGATAGTCATTTCCCTGAGGAGTTGGAG +AGAGAGCATATTCCATTTCACGGCGTTGATTATCAGAACTTTTTCTACTCGTTGGTGACG +TATGCGCTTGTTCTGGCATTGCTAACCTCGGTGGAATTAAGACATGTAAGGGGGGCTCTG +TGCCATCGCTATTCTTTGGAGTGAATGGCACAGAGGTATCTTTTGTCGCCAACGTGGCTT +CTGGCTCTGATTTCATAGCGGTTGTATTGACCTCTACGTCCCTTTGCTTCACACTTTCCG +GTATTTCAGAAAGCTTTTCTATATTTTCGTGGCTCTGTCGCTGTTGTTGATTCCGAATTT +TCGCATGTTTCCTATCAAGCATCTCAGAAGTCAAGTAATATATGGAAATCATTGGATGAT +ATGCCCGAGTTGGGTCTTCAAAACTCTCTGTTATGTTTGGCGCGTTTCCGTTATCACTGG +ATTCTGCGTTCATTTTGGCCGCTAAAGTCCAGTATTGTCTAGAGAGAAGAACGTAAGTAG +GATCTGTGATTATACTGACCAAAACACTTCTTGTTTCCTCTACATCGTCAATAAATTCTA +AACGGTACATTTCTTTTAAGACATTTATATCAAGCATTTCGATAGTTAGGGGAACTCTTT +TCGGTAAGTAAGAAGGAGGGGGACCATTGAACCCTCTTACCATCCAGTGGTGCTCCACAA +CCTGTTTAAGTGTGGCTCTTCTTTTCGGATCTACTACCAACATTTTGGATAACAGTGATA +TTACTTCGATAGATAAATGTTGGGGATATTCAACCTTACCTTGCTTGATCTTTTCATGTA +AAACGCTCGAATTTTCGTCGTCAAATGGCACTTTACCGCATACCAAAACAAATAAAACTA +CACCAAATGACCAGACATCTACTTCAGGTCCTGTATAAGGATTCGCTTTTAACAGCTCGG +GAGCGGCAAAATACAGAGAGCCACAGAATGTATGAAGCTGCTTCCTAGAATCATAAATAT +TTGAAAGTCCAAAATCAATTATCTTGATTTCACTGGAATCTGAAATCATTATATTTTCTA +TCTTCAAATCTCTATGGACGATGTTGTTAGCATGCAAATATATTAAGGCGCTCGCGATAC +CCCTAGCAAACTTTCTCGCTTGGTGTTCTCGTATTGACCCATGTTGGATGATATAGTCTA +ACAGCTGACCACCTGAAACATATTCAAACAACATATAGAAATGATTTGACAACGTGCACA +TCTCAAAAAGTCT >Transcript_2_0 -AAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAAAATTCTTTCATTAAA +TCCATATATGGATCCTCCCCTTGAATTCCATAGTTGAATTCCTCTTCAGGGTATCTGTCA +AATTGAGAAGTGTCACCTTGGCCCTGTTGGATTGGTGGTTCGTACGGCGTTTCTATGTAT +CTTGCTAACAATTTCTCCCATATCACTTCGTTAAACCACGGATGGTTCTTGACATCTTCA +CTTCCATTTTGTAAGTTACCCAACCTTTCACTTAAGTCTCTGGTAATTAGCTTCTTCAAT +AAGTCCTGCGCGTCTGGATGGAAAAATGGTGGGAACTTCAATTCGGCGTTCAGTATATTT +TCGTAAGTTTTCATGGTGTTCGAATTGTAAAATGGAGTGTATCCGGCAAGCATTTCATAG +ATTAGCACACCAAAACTCCACCAATCCACTGATTTATTATACGGTTTTGTACTGACCACT +TCCGGCGCTATGTAATCTGGTGTGCCACATAATGTGTATGTGACATCGGGAACGTATTTT +GCGAAGCCAAAGTCGGTTATCTTGATATGGCCGTTTTTATCTAGAAGGATATTTTCTGGT +TTCAAGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAATATATATTATATCCTTACTGTGCA +AATATTCCAACGCTAAGCATACCTCTGCGGCATAAAATTTGGCTACTGGGTTGGGAAATC +TTTGAGATTTACGTAGTAAAGAAAATAATTCACCACCTTCAATGTAGTCCATTACCATGA +AAACTTGCTGAGAATCTTGGAACGTTCCCCACATTCGAATGATGAATGGATGTGAAACAA +TTGAAAGCATTCGGCGTTCGTCATTGGTGTGTTCAACCTGCTTCAGCTTCACTATAGTGT +GCTTTTTCAATGTCTTCAAAGCGTAAAACCTCCCATTGTGATTGGAACGAATTAGGTGAA +CTCTCCCAAATGAGCCAGTTCCCAAAGTTCTTAAAATCTGAAAGTCACTTAGGGAATACT +TTCCTGAGGTATCTCTATATTGTAGCATTGGTTTTTTAACCAAACAAATACCTGCAGACG +CTTCTTCTTTCAACTTTCCGCTGTTTCTTCCATTTATCTCCACGGGTGTGTTGGAAGAAC +ATTCCTCATGCACCGAATGTGCGTTTACAGGAATGTCTTGTCCAACGTTATCTGGAGTTG +TTTCTGTCTTGGCAGTAATGCTTAATTTCCTGATTTCATTGTTGTTCATCGGATCAACAT +ACATAAAAAAAAAAAAAAAAAA