diff --git a/tests/test_transcript_structure.py b/tests/test_transcript_structure.py index 877112cd23e3f31a91f6559467309debf56d8117..3535d8fcc228f0f79687f05ffdb2f72af5ae8622 100644 --- a/tests/test_transcript_structure.py +++ b/tests/test_transcript_structure.py @@ -22,7 +22,7 @@ P_INTRON_1: float = 1 ) def test_csv_2_dict(test_input): builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2]) - builder.csv_2_dict() + builder._BuildTranscriptStructure__csv_2_dict() with open(TEST_CSV_TITLE) as csv: csv_lines = csv.readlines() first_line = csv_lines[0].split(',') @@ -38,7 +38,7 @@ def test_csv_2_dict(test_input): def test_gtf_2_dict(): builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0) - builder.gtf_2_dict() + builder._BuildTranscriptStructure__gtf_2_dict() assert len(builder.gene_sequences_dict) == 2 # Two genes read in the dictionary. assert len(builder.gene_sequences_dict[GENE_KEYS[0]]) == 5 assert len(builder.gene_sequences_dict[GENE_KEYS[1]]) == 5 @@ -63,9 +63,9 @@ def test_gtf_2_dict(): ) def test_make_new_transcripts(test_input): builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2]) - builder.csv_2_dict() # Generates dictionary from gene count csv file. - builder.gtf_2_dict() # Generates dictionary from gtf file. - builder.make_new_transcripts() # Generates the differently spliced transcripts. + builder._BuildTranscriptStructure__csv_2_dict() # Generates dictionary from gene count csv file. + builder._BuildTranscriptStructure__gtf_2_dict() # Generates dictionary from gtf file. + builder._BuildTranscriptStructure__make_new_transcripts() # Generates the differently spliced transcripts. numb_trans_dict = 0 numb_trans_csv = 10 @@ -86,10 +86,10 @@ def test_make_new_transcripts(test_input): ) def test_make_gtf_lines(test_input): builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2]) - builder.csv_2_dict() # Generates dictionary from gene count csv file. - builder.gtf_2_dict() # Generates dictionary from gtf file. 
- builder.make_new_transcripts() # Generates the differently spliced transcripts. - builder.make_gtf_info() + builder._BuildTranscriptStructure__csv_2_dict() # Generates dictionary from gene count csv file. + builder._BuildTranscriptStructure__gtf_2_dict() # Generates dictionary from gtf file. + builder._BuildTranscriptStructure__make_new_transcripts() # Generates the differently spliced transcripts. + builder._BuildTranscriptStructure__make_gtf_info() for line in builder.gtf_lines: columns = line.split('\t') assert columns[3] < columns[4] # Tests that the coordinates are increasing. @@ -104,10 +104,10 @@ def test_make_gtf_lines(test_input): ) def test_sort_gtf_lines(test_input): builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2]) - builder.csv_2_dict() # Generates dictionary from gene count csv file. - builder.gtf_2_dict() # Generates dictionary from gtf file. - builder.make_new_transcripts() # Generates the differently spliced transcripts. - builder.make_gtf_info() + builder._BuildTranscriptStructure__csv_2_dict() # Generates dictionary from gene count csv file. + builder._BuildTranscriptStructure__gtf_2_dict() # Generates dictionary from gtf file. + builder._BuildTranscriptStructure__make_new_transcripts() # Generates the differently spliced transcripts. + builder._BuildTranscriptStructure__make_gtf_info() starts_before = [] # Verifies that the function actually has to sort. for line in builder.gtf_lines: @@ -116,9 +116,9 @@ def test_sort_gtf_lines(test_input): starts_before.append(columns[3]) for ii in range(len(starts_before)-1): assert starts_before[ii] > starts_before[ii+1] - builder.sort_gtf_lines() + builder._BuildTranscriptStructure__sort_gtf_lines() - builder.sort_gtf_lines() + builder._BuildTranscriptStructure__sort_gtf_lines() starts_after = [] # Verifies that the function sorted. 
for line in builder.gtf_lines: columns = line.split('\t') @@ -126,7 +126,7 @@ def test_sort_gtf_lines(test_input): starts_after.append(columns[3]) for ii in range(len(starts_before)-1): assert starts_after[ii] < starts_after[ii+1] - builder.sort_gtf_lines() + builder._BuildTranscriptStructure__sort_gtf_lines() def test_write_gtf(): diff --git a/transcript_structure/Generate_transcript_structure.py b/transcript_structure/Generate_transcript_structure.py index 82558f451126c67bcc0a20f27c85fecf22b8be3f..bf491701edc078cd4a630c53d6726742f2e6b27e 100644 --- a/transcript_structure/Generate_transcript_structure.py +++ b/transcript_structure/Generate_transcript_structure.py @@ -32,6 +32,7 @@ class BuildTranscriptStructure: E.g. '01' means the intron between original (fully spliced) exon 1 and exon 2 was spliced away, but the intron between exon 2 and exon 3 is included in the transcript. gtf_lines(list): List with all newly created gtf lines. + _transcripts_generated(bool): Indicates whether splicing was conducted or not yet. """ def __init__(self, @@ -47,16 +48,18 @@ class BuildTranscriptStructure: self.gene_sequences_dict = {} self.gene_transcript_dict = {} self.gtf_lines = [] + self._transcripts_generated = False def generate_transcript_structure(self): """Computes distribution and gene coordinates of differently spliced mRNA.""" - self.csv_2_dict() # Generates dictionary from gene count csv file. - self.gtf_2_dict() # Generates dictionary from gtf file. - self.make_new_transcripts() # Generates the differently spliced transcripts. - self.make_gtf_info() # Builds the gtf file of all newly created transcripts. - self.sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. - - def csv_2_dict(self) -> None: + self.__csv_2_dict() # Generates dictionary from gene count csv file. + self.__gtf_2_dict() # Generates dictionary from gtf file. + self.__make_new_transcripts() # Generates the differently spliced transcripts. 
+ self.__make_gtf_info() # Builds the gtf file of all newly created transcripts. + self.__sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. + self._transcripts_generated = True # Adapts state variable. + + def __csv_2_dict(self) -> None: """Converts the csv file with gene count into a dictionary.""" with open(self.gene_count) as g_c: lines = g_c.readlines() @@ -69,7 +72,7 @@ class BuildTranscriptStructure: line_entries = line.split(',') self.gene_count_dict[line_entries[0]] = int(line_entries[1]) - def gtf_2_dict(self) -> None: + def __gtf_2_dict(self) -> None: """Converts the gtf file into a nested dictionary.""" with open(self.input_coords) as c_g: # Reads coordinates from .gtf file. lines = c_g.readlines() @@ -106,7 +109,7 @@ class BuildTranscriptStructure: gene_info['exon_seq'] = coordinates self.gene_sequences_dict[gene_name] = gene_info - def make_new_transcripts(self) -> None: + def __make_new_transcripts(self) -> None: """ Generates the differently spliced transcripts.""" for gene in self.gene_count_dict: @@ -132,24 +135,7 @@ class BuildTranscriptStructure: self.gene_transcript_dict[gene] = transcript_numbers - def write_csv(self, - output_transcript_count: str - ) -> None: - - """ Writes a csv file containing the number of differently spliced transcripts. - - Args: - output_transcript_count(str): Path and name of the output cvs file: "transcript_ID", "gene_ID", count. - """ - - with open(output_transcript_count, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(['Transcript_ID', 'Gene_ID', 'count']) - for gene in self.gene_transcript_dict: - for transcript_ID in self.gene_transcript_dict[gene]: - writer.writerow([transcript_ID, gene, self.gene_transcript_dict[gene][transcript_ID]]) - - def make_gtf_info(self) -> None: + def __make_gtf_info(self) -> None: """ Writes the lines of the new gtf file for the differently spliced transcripts.""" for gene in self.gene_transcript_dict: # Iterates over all genes required. 
self.gtf_lines.append(self.gene_sequences_dict[gene]['gene_line']) # Add gene line to list. @@ -202,7 +188,7 @@ class BuildTranscriptStructure: exon_lines.reverse() self.gtf_lines.extend(exon_lines) - def sort_gtf_lines(self) -> None: + def __sort_gtf_lines(self) -> None: """ Sorts the gtf lines by the position of the genes (increasing) and returns it.""" @@ -225,16 +211,33 @@ class BuildTranscriptStructure: self.gtf_lines = sorted_gtf_lines + def write_csv(self, + csv_output: str + ) -> None: + + """ Writes a csv file containing the number of differently spliced transcripts. + + Args: + csv_output(str): Path and name of the output csv file with the columns "Transcript_ID", "Gene_ID", "count". + """ + + with open(csv_output, 'w', newline='') as file: + writer = csv.writer(file) + writer.writerow(['Transcript_ID', 'Gene_ID', 'count']) + for gene in self.gene_transcript_dict: + for transcript_ID in self.gene_transcript_dict[gene]: + writer.writerow([transcript_ID, gene, self.gene_transcript_dict[gene][transcript_ID]]) + def write_gtf(self, - output_coords: str + gtf_output: str ) -> None: """ Writes a gtf file with the information about the differently spliced transcripts. Args: - output_coords(str): Path and name of the output gtf file with the information of all relevant transcripts. + gtf_output(str): Path and name of the output gtf file with the information of all relevant transcripts. """ - with open(output_coords, 'w') as gtf_file: + with open(gtf_output, 'w') as gtf_file: gtf_file.writelines(self.gtf_lines) @@ -254,11 +257,7 @@ def main(): random.seed(10) # Initializes seed for random functions for reproducibility. bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron) - bts.csv_2_dict() # Generates dictionary from gene count csv file. - bts.gtf_2_dict() # Generates dictionary from gtf file. - bts.make_new_transcripts() # Generates the differently spliced transcripts. - bts.make_gtf_info() # Builds the gtf file of all newly created transcripts. 
- bts.sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. + bts.generate_transcript_structure() # Creates the transcript structures. bts.write_gtf(name_gtf_output) # Writes the new gtf file. bts.write_csv(name_csv_output) # Writes the new csv file with the count of the transcripts.