add: generate transcript structure
Working on issue #2, generating the transcript structures given intron splicing probability. I am still working on a command line interface. Feedback on the functionality of the code would be appreciated.
Merge request reports
Activity
Working on issue #2, generating transcript structure
requested review from @kanitz
49 self.gene_transcript_dict = {} 50 self.gtf_lines = [] 51 self._transcripts_generated = False 52 53 def generate_transcript_structure(self): 54 """Computes distribution and gene coordinates of differently spliced mRNA.""" 55 self._csv_2_dict() # Generates dictionary from gene count csv file. 56 self._gtf_2_dict() # Generates dictionary from gtf file. 57 self._make_new_transcripts() # Generates the differently spliced transcripts. 58 self._make_gtf_info() # Builds the gtf file of all newly created transcripts. 59 self._sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. 60 self._transcripts_generated = True # Adapts state variable. 61 62 def _csv_2_dict(self) -> None: 63 """Converts the csv file with gene count into a dictionary.""" 64 with open(self.gene_count) as g_c: 52 53 def generate_transcript_structure(self): 54 """Computes distribution and gene coordinates of differently spliced mRNA.""" 55 self._csv_2_dict() # Generates dictionary from gene count csv file. 56 self._gtf_2_dict() # Generates dictionary from gtf file. 57 self._make_new_transcripts() # Generates the differently spliced transcripts. 58 self._make_gtf_info() # Builds the gtf file of all newly created transcripts. 59 self._sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. 60 self._transcripts_generated = True # Adapts state variable. 61 62 def _csv_2_dict(self) -> None: 63 """Converts the csv file with gene count into a dictionary.""" 64 with open(self.gene_count) as g_c: 65 lines = g_c.readlines() 66 67 first_line = lines[0].split(',') # Removes the first line if it is a title. If you are concerned about the correctness of the file, you should implement checking the correctness of the field throughout. Header (and comment) lines usually start with "#", so you should implement skipping those lines. The check that you implemented now could turn out to pass for lines that are not data lines.
54 """Computes distribution and gene coordinates of differently spliced mRNA.""" 55 self._csv_2_dict() # Generates dictionary from gene count csv file. 56 self._gtf_2_dict() # Generates dictionary from gtf file. 57 self._make_new_transcripts() # Generates the differently spliced transcripts. 58 self._make_gtf_info() # Builds the gtf file of all newly created transcripts. 59 self._sort_gtf_lines() # Sorts the gtf file by gene occurrence in sequence. 60 self._transcripts_generated = True # Adapts state variable. 61 62 def _csv_2_dict(self) -> None: 63 """Converts the csv file with gene count into a dictionary.""" 64 with open(self.gene_count) as g_c: 65 lines = g_c.readlines() 66 67 first_line = lines[0].split(',') # Removes the first line if it is a title. 68 if not first_line[1][0].isnumeric(): 69 del lines[0] 70 71 for line in lines: 72 line_entries = line.split(',') 73 self.gene_count_dict[line_entries[0]] = int(line_entries[1]) 74 75 def _gtf_2_dict(self) -> None: 76 """Converts the gtf file into a nested dictionary.""" 77 with open(self.input_coords) as c_g: # Reads coordinates from .gtf file. 78 lines = c_g.readlines() 79 lines = [i for i in lines if i[0] != '#'] # Exclude comments 80 81 for gene_line in range(len(lines)): 82 gene_info = {} # Dictionary with information of a single gene. 83 line_entries = lines[gene_line].split('\t') 84 if line_entries[2] == 'gene': # The line indeed describes a gene. 85 attribute = line_entries[8].split(';') 73 self.gene_count_dict[line_entries[0]] = int(line_entries[1]) 74 75 def _gtf_2_dict(self) -> None: 76 """Converts the gtf file into a nested dictionary.""" 77 with open(self.input_coords) as c_g: # Reads coordinates from .gtf file. 78 lines = c_g.readlines() 79 lines = [i for i in lines if i[0] != '#'] # Exclude comments 80 81 for gene_line in range(len(lines)): 82 gene_info = {} # Dictionary with information of a single gene. 83 line_entries = lines[gene_line].split('\t') 84 if line_entries[2] == 'gene': # The line indeed describes a gene. 85 attribute = line_entries[8].split(';') 86 gene_name = attribute[2][12:-1] # Extracts the gene name from the attributes. 87 gene_info['gene_line'] = lines[gene_line] 88 gene_info['transcript_line'] = lines[gene_line + 1] 1 1 ensembl_havana gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; - tests/test_transcript_structure.py 0 → 100644
1 """Tests the transcript structure generator module.""" 2 3 import pytest 4 from transcript_structure import Generate_transcript_structure as Gts 5 - tests/test_transcript_structure.py 0 → 100644
1 """Tests the transcript structure generator module.""" 2 3 import pytest 4 from transcript_structure import Generate_transcript_structure as Gts 5 6 TEST_CSV_TITLE = './tests/resources/test_transcript_structure/Rik_5_Rp1_5_title.csv' This is not very robust. If I try to run tests from anywhere but the root directory, tests will fail. Better to use this file as your reference for relative paths. You can get the path of a module with
__file__
. From here you can then make your way to the test files and this will work regardless of the directory from where the code is executed.
- transcript_structure/Outputs/csv_new.csv 0 → 100644
1 """Creates differently spliced transcripts.""" Please don't include binary, proprietary file formats (XLSX) - we can't see what's in it easily and we need unfree tools to properly deal with them. If it contains some important usage information, create a markdown file or similar. Also, please consider moving
coordinates.gtf
totests/test_files
.
1 """Creates differently spliced transcripts.""" 2 3 import random 4 import csv 5 import copy 6 7 8 class BuildTranscriptStructure: 9 """Creates differently spliced transcripts. 10 11 Args: 12 input_gene_count(str): Path to csv file of type "geneID", number to sample. 13 input_coordinates(str): Path to gtf file of relevant genes. 14 p_intron(float): Probability to include each intron in the mRNA sequence. 15 16 Attributes: 244 coordinates_genes = 'gtf/coordinates.gtf' 245 p_intron = 0.3 246 247 # Output paths and names. 248 name_csv_output = 'Outputs/csv_new.csv' 249 name_gtf_output = 'Outputs/gtf_new.gtf' 250 251 random.seed(10) # Initializes seed for random functions for reproducibility. 252 253 bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron) 254 bts.generate_transcript_structure() # Creates the transcript structures. 255 bts.write_gtf(name_gtf_output) # Writes the new gtf file. 256 bts.write_csv(name_csv_output) # Writes the new csv file with the count of the transcripts. 257 258 259 if __name__ == '__main__':