fix: indicates private class functions with underscores. Adds state variable,...

fix: marks private class methods with leading underscores. Adds a state variable indicating whether transcript construction has already taken place.

fix: indicates private class functions with underscores. Adds state variable,...
4f45b6cb · Timon Baltisberger · 8e943c18 · 4f45b6cb · 4f45b6cb
Commit 4f45b6cb authored 3 years ago by Timon Baltisberger
--- a/tests/test_transcript_structure.py
+++ b/tests/test_transcript_structure.py
@@ -22,7 +22,7 @@ P_INTRON_1: float = 1
 )
 def test_csv_2_dict(test_input):
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
-    builder.csv_2_dict()
+    builder.__csv_2_dict()
    with open(TEST_CSV_TITLE) as csv:
        csv_lines = csv.readlines()
    first_line = csv_lines[0].split(',')
@@ -38,7 +38,7 @@ def test_csv_2_dict(test_input):

 def test_gtf_2_dict():
    builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
-    builder.gtf_2_dict()
+    builder.__gtf_2_dict()
    assert len(builder.gene_sequences_dict) == 2  # Two genes read in the dictionary.
    assert len(builder.gene_sequences_dict[GENE_KEYS[0]]) == 5
    assert len(builder.gene_sequences_dict[GENE_KEYS[1]]) == 5
@@ -63,9 +63,9 @@ def test_gtf_2_dict():
 )
 def test_make_new_transcripts(test_input):
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
-    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
-    builder.gtf_2_dict()  # Generates dictionary from gtf file.
-    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
+    builder.__csv_2_dict()  # Generates dictionary from gene count csv file.
+    builder.__gtf_2_dict()  # Generates dictionary from gtf file.
+    builder.__make_new_transcripts()  # Generates the differently spliced transcripts.

    numb_trans_dict = 0
    numb_trans_csv = 10
@@ -86,10 +86,10 @@ def test_make_new_transcripts(test_input):
 )
 def test_make_gtf_lines(test_input):
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
-    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
-    builder.gtf_2_dict()  # Generates dictionary from gtf file.
-    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
-    builder.make_gtf_info()
+    builder.__csv_2_dict()  # Generates dictionary from gene count csv file.
+    builder.__gtf_2_dict()  # Generates dictionary from gtf file.
+    builder.__make_new_transcripts()  # Generates the differently spliced transcripts.
+    builder.__make_gtf_info()
    for line in builder.gtf_lines:
        columns = line.split('\t')
        assert columns[3] < columns[4]  # Tests that the coordinates are increasing.
@@ -104,10 +104,10 @@ def test_make_gtf_lines(test_input):
 )
 def test_sort_gtf_lines(test_input):
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
-    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
-    builder.gtf_2_dict()  # Generates dictionary from gtf file.
-    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
-    builder.make_gtf_info()
+    builder.__csv_2_dict()  # Generates dictionary from gene count csv file.
+    builder.__gtf_2_dict()  # Generates dictionary from gtf file.
+    builder.__make_new_transcripts()  # Generates the differently spliced transcripts.
+    builder.__make_gtf_info()

    starts_before = []  # Verifies that the function actually has to sort.
    for line in builder.gtf_lines:
@@ -116,9 +116,9 @@ def test_sort_gtf_lines(test_input):
            starts_before.append(columns[3])
    for ii in range(len(starts_before)-1):
        assert starts_before[ii] > starts_before[ii+1]
-    builder.sort_gtf_lines()
+    builder.__sort_gtf_lines()

-    builder.sort_gtf_lines()
+    builder.__sort_gtf_lines()
    starts_after = []  # Verifies that the function sorted.
    for line in builder.gtf_lines:
        columns = line.split('\t')
@@ -126,7 +126,7 @@ def test_sort_gtf_lines(test_input):
            starts_after.append(columns[3])
    for ii in range(len(starts_before)-1):
        assert starts_after[ii] < starts_after[ii+1]
-    builder.sort_gtf_lines()
+    builder.__sort_gtf_lines()


 def test_write_gtf():

--- a/transcript_structure/Generate_transcript_structure.py
+++ b/transcript_structure/Generate_transcript_structure.py
@@ -32,6 +32,7 @@ class BuildTranscriptStructure:
            E.g. '01' means the intron between original (fully spliced) exon 1 and exon 2 was spliced away,
                        but the intron between exon 2 and exon 3 is included in the transcript.
        gtf_lines(list): List with all newly created gtf lines.
+        _transcripts_generated(bool): Indicates whether the transcript structure has already been generated.
    """

    def __init__(self,
@@ -47,16 +48,18 @@ class BuildTranscriptStructure:
        self.gene_sequences_dict = {}
        self.gene_transcript_dict = {}
        self.gtf_lines = []
+        self._transcripts_generated = False

    def generate_transcript_structure(self):
        """Computes distribution and gene coordinates of differently spliced mRNA."""
-        self.csv_2_dict()  # Generates dictionary from gene count csv file.
-        self.gtf_2_dict()  # Generates dictionary from gtf file.
-        self.make_new_transcripts()  # Generates the differently spliced transcripts.
-        self.make_gtf_info()  # Builds the gtf file of all newly created transcripts.
-        self.sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.
-
-    def csv_2_dict(self) -> None:
+        self.__csv_2_dict()  # Generates dictionary from gene count csv file.
+        self.__gtf_2_dict()  # Generates dictionary from gtf file.
+        self.__make_new_transcripts()  # Generates the differently spliced transcripts.
+        self.__make_gtf_info()  # Builds the gtf file of all newly created transcripts.
+        self.__sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.
+        self._transcripts_generated = True  # Adapts state variable.
+
+    def __csv_2_dict(self) -> None:
        """Converts the csv file with gene count into a dictionary."""
        with open(self.gene_count) as g_c:
            lines = g_c.readlines()
@@ -69,7 +72,7 @@ class BuildTranscriptStructure:
            line_entries = line.split(',')
            self.gene_count_dict[line_entries[0]] = int(line_entries[1])

-    def gtf_2_dict(self) -> None:
+    def __gtf_2_dict(self) -> None:
        """Converts the gtf file into a nested dictionary."""
        with open(self.input_coords) as c_g:  # Reads coordinates from .gtf file.
            lines = c_g.readlines()
@@ -106,7 +109,7 @@ class BuildTranscriptStructure:
                gene_info['exon_seq'] = coordinates
                self.gene_sequences_dict[gene_name] = gene_info

-    def make_new_transcripts(self) -> None:
+    def __make_new_transcripts(self) -> None:
        """ Generates the differently spliced transcripts."""
        for gene in self.gene_count_dict:

@@ -132,24 +135,7 @@ class BuildTranscriptStructure:

            self.gene_transcript_dict[gene] = transcript_numbers

-    def write_csv(self,
-                  output_transcript_count: str
-                  ) -> None:
-
-        """ Writes a csv file containing the number of differently spliced transcripts.
-
-        Args:
-             output_transcript_count(str): Path and name of the output cvs file: "transcript_ID", "gene_ID", count.
-        """
-
-        with open(output_transcript_count, 'w', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
-            for gene in self.gene_transcript_dict:
-                for transcript_ID in self.gene_transcript_dict[gene]:
-                    writer.writerow([transcript_ID, gene, self.gene_transcript_dict[gene][transcript_ID]])
-
-    def make_gtf_info(self) -> None:
+    def __make_gtf_info(self) -> None:
        """ Writes the lines of the new gtf file for the differently spliced transcripts."""
        for gene in self.gene_transcript_dict:  # Iterates over all genes required.
            self.gtf_lines.append(self.gene_sequences_dict[gene]['gene_line'])  # Add gene line to list.
@@ -202,7 +188,7 @@ class BuildTranscriptStructure:
                    exon_lines.reverse()
                self.gtf_lines.extend(exon_lines)

-    def sort_gtf_lines(self) -> None:
+    def __sort_gtf_lines(self) -> None:

        """ Sorts the gtf lines by the position of the genes (increasing) and returns it."""

@@ -225,16 +211,33 @@ class BuildTranscriptStructure:

        self.gtf_lines = sorted_gtf_lines

+    def write_csv(self,
+                  csv_output: str
+                  ) -> None:
+
+        """ Writes a csv file containing the number of differently spliced transcripts.
+
+        Args:
+             csv_output(str): Path and name of the output csv file with the columns "Transcript_ID", "Gene_ID", "count".
+        """
+
+        with open(csv_output, 'w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
+            for gene in self.gene_transcript_dict:
+                for transcript_ID in self.gene_transcript_dict[gene]:
+                    writer.writerow([transcript_ID, gene, self.gene_transcript_dict[gene][transcript_ID]])
+
    def write_gtf(self,
-                  output_coords: str
+                  gtf_output: str
                  ) -> None:

        """ Writes a gtf file with the information about the differently spliced transcripts.

        Args:
-            output_coords(str): Path and name of the output gtf file with the information of all relevant transcripts.
+            gtf_output(str): Path and name of the output gtf file with the information of all relevant transcripts.
        """
-        with open(output_coords, 'w') as gtf_file:
+        with open(gtf_output, 'w') as gtf_file:
            gtf_file.writelines(self.gtf_lines)


@@ -254,11 +257,7 @@ def main():
    random.seed(10)  # Initializes seed for random functions for reproducibility.

    bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron)
-    bts.csv_2_dict()  # Generates dictionary from gene count csv file.
-    bts.gtf_2_dict()  # Generates dictionary from gtf file.
-    bts.make_new_transcripts()  # Generates the differently spliced transcripts.
-    bts.make_gtf_info()  # Builds the gtf file of all newly created transcripts.
-    bts.sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.
+    bts.generate_transcript_structure()  # Creates the transcript structures.
    bts.write_gtf(name_gtf_output)  # Writes the new gtf file.
    bts.write_csv(name_csv_output)  # Writes the new csv file with the count of the transcripts.