Skip to content
Snippets Groups Projects
Commit 4f45b6cb authored by Timon Baltisberger's avatar Timon Baltisberger
Browse files

fix: indicates private class functions with underscores. Adds state variable,...

fix: indicates private class functions with underscores. Adds state variable, indicating whether transcript construction took place already.
parent 8e943c18
No related branches found
No related tags found
1 merge request: !15 "add: generate transcript structure"
Pipeline #13636 failed
......@@ -22,7 +22,7 @@ P_INTRON_1: float = 1
)
def test_csv_2_dict(test_input):
builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
builder.csv_2_dict()
builder.__csv_2_dict()
with open(TEST_CSV_TITLE) as csv:
csv_lines = csv.readlines()
first_line = csv_lines[0].split(',')
......@@ -38,7 +38,7 @@ def test_csv_2_dict(test_input):
def test_gtf_2_dict():
builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
builder.gtf_2_dict()
builder.__gtf_2_dict()
assert len(builder.gene_sequences_dict) == 2 # Two genes read in the dictionary.
assert len(builder.gene_sequences_dict[GENE_KEYS[0]]) == 5
assert len(builder.gene_sequences_dict[GENE_KEYS[1]]) == 5
......@@ -63,9 +63,9 @@ def test_gtf_2_dict():
)
def test_make_new_transcripts(test_input):
builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
builder.csv_2_dict() # Generates dictionary from gene count csv file.
builder.gtf_2_dict() # Generates dictionary from gtf file.
builder.make_new_transcripts() # Generates the differently spliced transcripts.
builder.__csv_2_dict() # Generates dictionary from gene count csv file.
builder.__gtf_2_dict() # Generates dictionary from gtf file.
builder.__make_new_transcripts() # Generates the differently spliced transcripts.
numb_trans_dict = 0
numb_trans_csv = 10
......@@ -86,10 +86,10 @@ def test_make_new_transcripts(test_input):
)
def test_make_gtf_lines(test_input):
builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
builder.csv_2_dict() # Generates dictionary from gene count csv file.
builder.gtf_2_dict() # Generates dictionary from gtf file.
builder.make_new_transcripts() # Generates the differently spliced transcripts.
builder.make_gtf_info()
builder.__csv_2_dict() # Generates dictionary from gene count csv file.
builder.__gtf_2_dict() # Generates dictionary from gtf file.
builder.__make_new_transcripts() # Generates the differently spliced transcripts.
builder.__make_gtf_info()
for line in builder.gtf_lines:
columns = line.split('\t')
assert columns[3] < columns[4] # Tests that the coordinates are increasing.
......@@ -104,10 +104,10 @@ def test_make_gtf_lines(test_input):
)
def test_sort_gtf_lines(test_input):
builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
builder.csv_2_dict() # Generates dictionary from gene count csv file.
builder.gtf_2_dict() # Generates dictionary from gtf file.
builder.make_new_transcripts() # Generates the differently spliced transcripts.
builder.make_gtf_info()
builder.__csv_2_dict() # Generates dictionary from gene count csv file.
builder.__gtf_2_dict() # Generates dictionary from gtf file.
builder.__make_new_transcripts() # Generates the differently spliced transcripts.
builder.__make_gtf_info()
starts_before = [] # Verifies that the function actually has to sort.
for line in builder.gtf_lines:
......@@ -116,9 +116,9 @@ def test_sort_gtf_lines(test_input):
starts_before.append(columns[3])
for ii in range(len(starts_before)-1):
assert starts_before[ii] > starts_before[ii+1]
builder.sort_gtf_lines()
builder.__sort_gtf_lines()
builder.sort_gtf_lines()
builder.__sort_gtf_lines()
starts_after = [] # Verifies that the function sorted.
for line in builder.gtf_lines:
columns = line.split('\t')
......@@ -126,7 +126,7 @@ def test_sort_gtf_lines(test_input):
starts_after.append(columns[3])
for ii in range(len(starts_before)-1):
assert starts_after[ii] < starts_after[ii+1]
builder.sort_gtf_lines()
builder.__sort_gtf_lines()
def test_write_gtf():
......
......@@ -32,6 +32,7 @@ class BuildTranscriptStructure:
E.g. '01' means the intron between original (fully spliced) exon 1 and exon 2 was spliced away,
but the intron between exon 2 and exon 3 is included in the transcript.
gtf_lines(list): List with all newly created gtf lines.
_transcripts_generated(bool): Indicates whether splicing was conducted or not yet.
"""
def __init__(self,
......@@ -47,16 +48,18 @@ class BuildTranscriptStructure:
self.gene_sequences_dict = {}
self.gene_transcript_dict = {}
self.gtf_lines = []
self._transcripts_generated = False
def generate_transcript_structure(self) -> None:
"""Computes distribution and gene coordinates of differently spliced mRNA.

Runs the processing steps in order: reads the gene count csv file, reads the gtf
file, generates the differently spliced transcripts, builds the gtf lines of the
newly created transcripts and finally sorts those lines.
"""
self.csv_2_dict() # Generates dictionary from gene count csv file.
self.gtf_2_dict() # Generates dictionary from gtf file.
self.make_new_transcripts() # Generates the differently spliced transcripts.
self.make_gtf_info() # Builds the gtf file of all newly created transcripts.
self.sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence.
def csv_2_dict(self) -> None:
self.__csv_2_dict() # Generates dictionary from gene count csv file.
self.__gtf_2_dict() # Generates dictionary from gtf file.
self.__make_new_transcripts() # Generates the differently spliced transcripts.
self.__make_gtf_info() # Builds the gtf file of all newly created transcripts.
self.__sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence.
self._transcripts_generated = True # Adapts state variable.
def __csv_2_dict(self) -> None:
"""Converts the csv file with gene count into a dictionary."""
with open(self.gene_count) as g_c:
lines = g_c.readlines()
......@@ -69,7 +72,7 @@ class BuildTranscriptStructure:
line_entries = line.split(',')
self.gene_count_dict[line_entries[0]] = int(line_entries[1])
def gtf_2_dict(self) -> None:
def __gtf_2_dict(self) -> None:
"""Converts the gtf file into a nested dictionary."""
with open(self.input_coords) as c_g: # Reads coordinates from .gtf file.
lines = c_g.readlines()
......@@ -106,7 +109,7 @@ class BuildTranscriptStructure:
gene_info['exon_seq'] = coordinates
self.gene_sequences_dict[gene_name] = gene_info
def make_new_transcripts(self) -> None:
def __make_new_transcripts(self) -> None:
""" Generates the differently spliced transcripts."""
for gene in self.gene_count_dict:
......@@ -132,24 +135,7 @@ class BuildTranscriptStructure:
self.gene_transcript_dict[gene] = transcript_numbers
def write_csv(self,
output_transcript_count: str
) -> None:
"""Writes a csv file containing the number of differently spliced transcripts.

The file starts with the header row 'Transcript_ID', 'Gene_ID', 'count', followed by
one row per transcript stored in self.gene_transcript_dict.

Args:
output_transcript_count(str): Path and name of the output csv file.
"""
# newline='' leaves the handling of line endings to the csv writer.
with open(output_transcript_count, 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
for gene in self.gene_transcript_dict:
for transcript_ID in self.gene_transcript_dict[gene]:
# One row per transcript: its ID, the gene it belongs to and its count.
writer.writerow([transcript_ID, gene, self.gene_transcript_dict[gene][transcript_ID]])
def make_gtf_info(self) -> None:
def __make_gtf_info(self) -> None:
""" Writes the lines of the new gtf file for the differently spliced transcripts."""
for gene in self.gene_transcript_dict: # Iterates over all genes required.
self.gtf_lines.append(self.gene_sequences_dict[gene]['gene_line']) # Add gene line to list.
......@@ -202,7 +188,7 @@ class BuildTranscriptStructure:
exon_lines.reverse()
self.gtf_lines.extend(exon_lines)
def sort_gtf_lines(self) -> None:
def __sort_gtf_lines(self) -> None:
""" Sorts the gtf lines by the position of the genes (increasing) and returns it."""
......@@ -225,16 +211,33 @@ class BuildTranscriptStructure:
self.gtf_lines = sorted_gtf_lines
def write_csv(self,
              csv_output: str
              ) -> None:
    """Writes a csv file containing the number of differently spliced transcripts.

    The file starts with the header row 'Transcript_ID', 'Gene_ID', 'count', followed by
    one row per transcript stored in self.gene_transcript_dict, which is read as a
    mapping of gene -> {transcript_ID: count}.

    Args:
        csv_output(str): Path and name of the output csv file.
    """
    # newline='' leaves the handling of line endings to the csv writer.
    with open(csv_output, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
        for gene, transcript_counts in self.gene_transcript_dict.items():
            # One row per transcript: its ID, the gene it belongs to and its count.
            writer.writerows(
                [transcript_id, gene, count]
                for transcript_id, count in transcript_counts.items()
            )
def write_gtf(self,
              gtf_output: str
              ) -> None:
    """Writes a gtf file with the information about the differently spliced transcripts.

    The lines stored in self.gtf_lines are written unchanged, in their current order.

    Args:
        gtf_output(str): Path and name of the output gtf file with the information of all relevant transcripts.
    """
    with open(gtf_output, 'w') as gtf_file:
        # writelines adds no separators, so each entry is written exactly as stored.
        gtf_file.writelines(self.gtf_lines)
......@@ -254,11 +257,7 @@ def main():
random.seed(10) # Initializes seed for random functions for reproducibility.
bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron)
bts.csv_2_dict() # Generates dictionary from gene count csv file.
bts.gtf_2_dict() # Generates dictionary from gtf file.
bts.make_new_transcripts() # Generates the differently spliced transcripts.
bts.make_gtf_info() # Builds the gtf file of all newly created transcripts.
bts.sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence.
bts.generate_transcript_structure() # Creates the transcript structures.
bts.write_gtf(name_gtf_output) # Writes the new gtf file.
bts.write_csv(name_csv_output) # Writes the new csv file with the count of the transcripts.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment