diff --git a/tests/.DS_Store b/tests/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/tests/.DS_Store differ diff --git a/tests/resources/.DS_Store b/tests/resources/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..7ad1176167dc0ba71da2384413b97af9cb814803 Binary files /dev/null and b/tests/resources/.DS_Store differ diff --git a/tests/resources/test_transcript_structure/RP1_RIK.gtf b/tests/resources/test_transcript_structure/RP1_RIK.gtf new file mode 100644 index 0000000000000000000000000000000000000000..d8e59e5114bfabdec1abb86b7b83748d70b5bf5f --- /dev/null +++ b/tests/resources/test_transcript_structure/RP1_RIK.gtf @@ -0,0 +1,12 @@ +1 ensembl_havana gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4352202 4352837 . - . 
gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001377400"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001380148"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "4"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001345123"; exon_version "1"; transcript_support_level "1"; +1 havana gene 9747648 9791924 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; +1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1"; +1 havana exon 9747648 9748604 . + . 
gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9752449 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001144804"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001175967"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9791125 9791922 . + . 
"""Tests for transcript_structure.Generate_transcript_structure."""
import pytest
from transcript_structure import Generate_transcript_structure as Gts

TEST_CSV_TITLE = './tests/resources/test_transcript_structure/Rik_5_Rp1_5_title.csv'
TEST_CSV_NO_TITLE = './tests/resources/test_transcript_structure/Rik_5_Rp1_5_no_title.csv'

# The gtf fixture lives next to the csv fixtures in test_transcript_structure/.
GENE_COORDS = './tests/resources/test_transcript_structure/RP1_RIK.gtf'
GENE_KEYS = ['Rp1', '1700034P13Rik']

P_INTRON_0: float = 0
P_INTRON_0_2 = 0.2
P_INTRON_1: float = 1


def _build_until_gtf_info(test_input):
    """Runs the pipeline up to (and including) make_gtf_info and returns the builder."""
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
    builder.make_gtf_info()
    return builder


def _gene_starts(gtf_lines):
    """Returns the start coordinate (as int) of every gene line, in list order."""
    starts = []
    for line in gtf_lines:
        columns = line.split('\t')
        if columns[2] == 'gene':
            starts.append(int(columns[3]))
    return starts


@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_NO_TITLE, GENE_COORDS, P_INTRON_0)
     ],
)
def test_csv_2_dict(test_input):
    """The dictionary mirrors the csv rows (title row excluded), in file order."""
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()

    # Read the file that was actually handed to the builder, not a fixed one.
    with open(test_input[0]) as csv_file:
        rows = [line.strip().split(',') for line in csv_file if line.strip()]
    if not rows[0][1].isnumeric():
        del rows[0]  # Removes title.

    assert list(builder.gene_count_dict) == [gene for gene, _ in rows]
    assert builder.gene_count_dict == {gene: int(count) for gene, count in rows}


def test_gtf_2_dict():
    """Both genes are read, with all five info entries and every exon of the gtf."""
    builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
    builder.gtf_2_dict()
    assert len(builder.gene_sequences_dict) == 2  # Two genes read in the dictionary.
    assert len(builder.gene_sequences_dict[GENE_KEYS[0]]) == 5
    assert len(builder.gene_sequences_dict[GENE_KEYS[1]]) == 5
    gene_line_rik = ('1\thavana\tgene\t9747648\t9791924\t.\t+\t.\tgene_id "ENSMUSG00000097893"; gene_version "8"; '
                     'gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA";\n')
    assert gene_line_rik == builder.gene_sequences_dict[GENE_KEYS[1]]['gene_line']

    with open(GENE_COORDS) as gtf:
        numb_exons_gtf = sum(1 for line in gtf if line.split('\t')[2] == 'exon')
    numb_exons_dict = 0
    for gene_key in GENE_KEYS:
        numb_exons_dict += len(builder.gene_sequences_dict[gene_key]['exon_seq'])
    assert numb_exons_gtf == numb_exons_dict


@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_make_new_transcripts(test_input):
    """All sampled transcripts are accounted for; p=0 and p=1 give one variant per gene."""
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.

    numb_trans_dict = 0
    numb_trans_csv = 10
    for gene_key in GENE_KEYS:
        for trans_id in builder.gene_transcript_dict[gene_key]:
            numb_trans_dict += builder.gene_transcript_dict[gene_key][trans_id]
    assert numb_trans_dict == numb_trans_csv  # Compare the computed total, not the constant with itself.

    for gene_key in GENE_KEYS:
        assert len(builder.gene_transcript_dict[gene_key]) == 1  # All have identical transcript IDs.


@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_make_gtf_lines(test_input):
    """Every generated gtf line has start < end (compared as numbers)."""
    builder = _build_until_gtf_info(test_input)
    for line in builder.gtf_lines:
        columns = line.split('\t')
        # Numeric comparison: string comparison breaks as soon as the digit counts differ.
        assert int(columns[3]) < int(columns[4])


@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_sort_gtf_lines(test_input):
    """Genes start out in decreasing order and are increasing after sorting."""
    builder = _build_until_gtf_info(test_input)

    starts_before = _gene_starts(builder.gtf_lines)  # Verifies that the function actually has to sort.
    for ii in range(len(starts_before) - 1):
        assert starts_before[ii] > starts_before[ii + 1]

    builder.sort_gtf_lines()

    starts_after = _gene_starts(builder.gtf_lines)  # Verifies that the function sorted.
    assert len(starts_after) == len(starts_before)
    for ii in range(len(starts_after) - 1):
        assert starts_after[ii] < starts_after[ii + 1]


def test_write_gtf():
    pass


def test_write_csv():
    pass
import copy
import csv
import random
import re
from collections import Counter


class BuildTranscriptStructure:
    """Creates differently spliced transcripts.

    Args:
        input_gene_count(str): Path to csv file of type "geneID", number to sample.
        input_coordinates(str): Path to gtf file of relevant genes.
        p_intron(float): Probability to include each intron in the mRNA sequence.

    Attributes:
        gene_count_dict(dict): Dictionary of format {"gene_ID": number_of_samplings}
        gene_sequences_dict(dict): Nested dictionary with information about each gene.
            Format: {"gene_ID": {"gene_line": gene_gtf_line,
                                 "transcript_line": transcript_gtf_line,
                                 "exon_line": exemplary_exon_line,
                                 "strand_sense": True for '+', False otherwise,
                                 "exon_seq": [[start_exon1, end_exon1],[start_exon2, end_exon2],...]
                                 }
                     }
            "exon_seq" is always stored in increasing genomic order, whatever the strand.
        gene_transcript_dict(dict): Nested dictionary with amount of each differently spliced transcript.
            Format: {"gene_ID": {"transcript_ID": n_copies}}
            The transcript ID is "<gene_ID>-<binary code>", the code signifying whether a certain
            intron is included or not.
            E.g. gene with 4 exons:
            transcript code 001: The 1. and 2. introns are not included (spliced away), the 3. is included.
            The transcript will therefore have 3 exons (exon 3 and 4 are combined).
            For sequences with negative strand senses, the exon numbering is determining the direction the
            transcript code is to be applied, and not the occurrence in the gene sequence (inverted by
            convention).
            E.g. '01' means the intron between original (fully spliced) exon 1 and exon 2 was spliced away,
            but the intron between exon 2 and exon 3 is included in the transcript.
        gtf_lines(list): List with all newly created gtf lines.

    Limitations:
        Only the exons listed directly after the first transcript line of a gene are read.
        Generated transcript and exon lines are copies of the gene's first transcript/exon line in which
        only the coordinates, "exon_number" and "transcript_name" are rewritten; every other attribute
        (e.g. "transcript_id", "exon_id") is the template's value.
    """

    def __init__(self,
                 input_gene_count: str,
                 input_coordinates: str,
                 p_intron: float,
                 ) -> None:
        """Class constructor."""
        self.gene_count = input_gene_count
        self.input_coords = input_coordinates
        self.p_intron = p_intron
        self.gene_count_dict = {}
        self.gene_sequences_dict = {}
        self.gene_transcript_dict = {}
        self.gtf_lines = []

    @staticmethod
    def _attribute_pattern(key: str) -> "re.Pattern[str]":
        """Returns a pattern matching `key "value"` (or an unquoted value) in a gtf attribute column."""
        # The look-behind keeps e.g. "gene_name" from matching inside a longer key.
        return re.compile(r'(?<![\w.])' + re.escape(key) + r'\s+(?:"([^"]*)"|([^;\s]*))')

    @classmethod
    def _get_attribute(cls, attributes: str, key: str):
        """Returns the value of `key` in a gtf attribute column, or None if the key is absent."""
        match = cls._attribute_pattern(key).search(attributes)
        if match is None:
            return None
        return match.group(1) if match.group(1) is not None else match.group(2)

    @classmethod
    def _set_attribute(cls, attributes: str, key: str, value: str) -> str:
        """Returns the attribute column with `key` set to `value`, leaving all other text untouched.

        An existing entry is replaced in place; a missing one is appended before the line ending.
        """
        entry = '{} "{}"'.format(key, value)
        # A callable replacement keeps backslashes in `value` from being read as escapes.
        updated, n_replaced = cls._attribute_pattern(key).subn(lambda _match: entry, attributes, count=1)
        if n_replaced:
            return updated
        stripped = attributes.rstrip()
        line_end = '\n' if attributes.endswith('\n') else ''
        if not stripped:
            return '{};{}'.format(entry, line_end)
        separator = '' if stripped.endswith(';') else ';'
        return '{}{} {};{}'.format(stripped, separator, entry, line_end)

    def generate_transcript_structure(self) -> None:
        """Computes distribution and gene coordinates of differently spliced mRNA."""
        self.csv_2_dict()  # Generates dictionary from gene count csv file.
        self.gtf_2_dict()  # Generates dictionary from gtf file.
        self.make_new_transcripts()  # Generates the differently spliced transcripts.
        self.make_gtf_info()  # Builds the gtf file of all newly created transcripts.
        self.sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.

    def csv_2_dict(self) -> None:
        """Converts the csv file with gene count into a dictionary.

        The first row is skipped as a title when its second column does not start with a digit.
        Blank rows are ignored; an empty file leaves the dictionary unchanged.

        Raises:
            ValueError: If a row has fewer than two columns or a count that is not an integer.
        """
        with open(self.gene_count, newline='') as g_c:
            rows = [row for row in csv.reader(g_c) if any(cell.strip() for cell in row)]

        for row_number, row in enumerate(rows):
            if len(row) < 2:
                raise ValueError('Gene count row needs "gene,count" but got: {!r}'.format(row))
            if row_number == 0 and not row[1].strip()[:1].isnumeric():
                continue  # Title row.
            self.gene_count_dict[row[0]] = int(row[1])

    def gtf_2_dict(self) -> None:
        """Converts the gtf file into a nested dictionary.

        Expects every gene line to be followed by one transcript line and then that transcript's exon
        lines. Comment lines ('#') and blank lines are ignored.

        Raises:
            ValueError: If a gene line carries no "gene_name" attribute.
            IndexError: If a gene line is not followed by at least two more lines.
        """
        with open(self.input_coords) as c_g:  # Reads coordinates from .gtf file.
            lines = [
                # A missing newline on the last line would glue lines together when they are written out.
                line if line.endswith('\n') else line + '\n'
                for line in c_g
                if line.strip() and not line.startswith('#')  # Exclude blank lines and comments.
            ]

        for gene_index, line in enumerate(lines):
            line_entries = line.split('\t')
            if line_entries[2] != 'gene':  # Only gene lines open a new entry.
                continue

            gene_name = self._get_attribute(line_entries[8], 'gene_name')
            if gene_name is None:
                raise ValueError('Gene line without gene_name attribute: {!r}'.format(line))

            gene_info = {  # Dictionary with information of a single gene.
                'gene_line': line,
                'transcript_line': lines[gene_index + 1],
                'exon_line': lines[gene_index + 2],  # Exemplary line of an exon.
                'strand_sense': line_entries[6] == '+',  # Strand sense, taken from the gene's own line.
            }

            coordinates = []
            position = gene_index + 2  # Lines after the gene line (+1 is transcript description).
            while position < len(lines):
                exon_entries = lines[position].split('\t')
                if exon_entries[2] != 'exon':  # End of exon list of this gene is reached.
                    break
                coordinates.append([int(exon_entries[3]), int(exon_entries[4])])
                position += 1

            # Always store exons in increasing genomic order. Minus-strand genes list them in
            # transcript order (decreasing); deciding from the line that ended the scan would use
            # the strand of the *next* gene.
            coordinates.sort()

            gene_info['exon_seq'] = coordinates
            self.gene_sequences_dict[gene_name] = gene_info

    def make_new_transcripts(self) -> None:
        """Generates the differently spliced transcripts.

        For every gene of the gene count dictionary the requested number of transcripts is sampled; each
        intron is kept with probability `p_intron`. A gene with count 0 gets an empty dictionary.

        Raises:
            KeyError: If a gene of the count file is missing in the gtf dictionary.
        """
        for gene, n_samples in self.gene_count_dict.items():
            n_introns = len(self.gene_sequences_dict[gene]['exon_seq']) - 1

            # Computes the intron splicing for each transcript ('0': spliced away, '1': retained).
            transcript_ids = [
                ''.join('0' if random.random() > self.p_intron else '1' for _ in range(n_introns))
                for _ in range(n_samples)
            ]

            # Counts how often each transcript is in the list. Counting from the back keeps the
            # key order of the dictionary: last sampled code first.
            code_counts = Counter(reversed(transcript_ids))
            self.gene_transcript_dict[gene] = {
                '-'.join([gene, code]): count for code, count in code_counts.items()
            }

    def write_csv(self,
                  output_transcript_count: str
                  ) -> None:
        """Writes a csv file containing the number of differently spliced transcripts.

        Args:
            output_transcript_count(str): Path and name of the output csv file: "transcript_ID", "gene_ID", count.
        """
        with open(output_transcript_count, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
            for gene, transcripts in self.gene_transcript_dict.items():
                for transcript_ID, count in transcripts.items():
                    writer.writerow([transcript_ID, gene, count])

    def make_gtf_info(self) -> None:
        """Writes the lines of the new gtf file for the differently spliced transcripts.

        Appends to `gtf_lines`, per gene: the gene line, and per splicing variant one transcript line
        followed by its exon lines in exon-number order. Exons on both sides of a retained intron are
        merged into one exon whose start/end are the outer genomic coordinates (start <= end).
        """
        for gene, transcripts in self.gene_transcript_dict.items():  # Iterates over all genes required.
            gene_info = self.gene_sequences_dict[gene]
            sense = gene_info['strand_sense']
            self.gtf_lines.append(gene_info['gene_line'])  # Add gene line to list.

            for transcript_ID in transcripts:  # Iterates over all occurring types of splicings.
                # Modifies the transcript line according to the splicing.
                transcript_line = gene_info['transcript_line'].split('\t')
                transcript_line[8] = self._set_attribute(transcript_line[8], 'transcript_name', transcript_ID)
                self.gtf_lines.append('\t'.join(transcript_line))

                # Extract the splicing code (the part after "<gene>-") as int flags.
                code = [int(flag) for flag in transcript_ID[len(gene) + 1:]]
                # The code follows the exon numbering; exon_seq is in genomic order. On the minus
                # strand exon 1 is the genomically last one, so the code is read backwards.
                retained = code if sense else code[::-1]

                # Merge neighbouring exons across every retained intron.
                merged = []
                for index, (start, end) in enumerate(gene_info['exon_seq']):
                    if index > 0 and index - 1 < len(retained) and retained[index - 1]:
                        merged[-1][1] = end  # Extends the current exon; its start stays untouched.
                    else:
                        merged.append([start, end])

                numb_exons = len(merged)
                exon_lines = []
                for index, (start, end) in enumerate(merged):
                    exon_line = gene_info['exon_line'].split('\t')  # Initializes exon line.
                    exon_line[2] = 'exon'
                    exon_line[3] = str(start)
                    exon_line[4] = str(end)
                    exon_number = index + 1 if sense else numb_exons - index
                    attributes = self._set_attribute(exon_line[8], 'exon_number', str(exon_number))
                    exon_line[8] = self._set_attribute(attributes, 'transcript_name', transcript_ID)
                    exon_lines.append('\t'.join(exon_line))

                if not sense:  # Negative strand sense need reversed order (exon 1 first).
                    exon_lines.reverse()
                self.gtf_lines.extend(exon_lines)

    def sort_gtf_lines(self) -> None:
        """Sorts the gtf lines by the start position of the genes (increasing, numeric).

        All lines following a gene line stay attached to it. Genes sharing a start position keep their
        relative order. Lines before the first gene line (none are expected) stay at the front.
        """
        gene_groups = []  # [(start_gene(int), [gene_line, transcript_line, exon_line1, ...]), ...]
        for line in self.gtf_lines:
            line_content = line.split('\t')
            if line_content[2] == 'gene':  # This is the next gene line. Opens a new group.
                gene_groups.append((int(line_content[3]), []))
            elif not gene_groups:
                gene_groups.append((0, []))  # Lines without a preceding gene line.
            gene_groups[-1][1].append(line)  # Append all lines related to this gene.

        gene_groups.sort(key=lambda group: group[0])  # Stable: ties keep their original order.
        self.gtf_lines = [line for _start, lines in gene_groups for line in lines]

    def write_gtf(self,
                  output_coords: str
                  ) -> None:
        """Writes a gtf file with the information about the differently spliced transcripts.

        Args:
            output_coords(str): Path and name of the output gtf file with the information of all relevant transcripts.
        """
        with open(output_coords, 'w') as gtf_file:
            gtf_file.writelines(self.gtf_lines)


def main():
    """Main Function: runs the pipeline on the example inputs and writes both output files."""
    # Inputs
    # gene_count = 'gene_count/Rik_5.csv'  # Strand with + sense
    # gene_count = 'gene_count/Rp1_5.csv'  # Strand with - sense.
    gene_count = 'gene_count/Rik_5_Rp1_5.csv'  # Both strand senses combined
    coordinates_genes = 'gtf/coordinates.gtf'
    p_intron = 0.3

    # Output paths and names.
    name_csv_output = 'Outputs/csv_new.csv'
    name_gtf_output = 'Outputs/gtf_new.gtf'

    random.seed(10)  # Initializes seed for random functions for reproducibility.

    bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron)
    bts.csv_2_dict()  # Generates dictionary from gene count csv file.
    bts.gtf_2_dict()  # Generates dictionary from gtf file.
    bts.make_new_transcripts()  # Generates the differently spliced transcripts.
    bts.make_gtf_info()  # Builds the gtf file of all newly created transcripts.
    bts.sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.
    bts.write_gtf(name_gtf_output)  # Writes the new gtf file.
    bts.write_csv(name_csv_output)  # Writes the new csv file with the count of the transcripts.


if __name__ == '__main__':
    main()
    print('process completed')
gene_id "ENSMUSG00000025900"; gene_version "12"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-111"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 4409170 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-111"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4352202 4352837 . - . 
gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "4"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4352202 4352081 . 
- . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4352202 4352837 . - . 
gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana exon 4351910 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1"; +1 havana gene 9747648 9791924 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; +1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1"; +1 havana exon 9747648 9748604 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9752449 9789780 . + . 
gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9791125 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1"; +1 havana exon 9747648 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9791125 9791922 . + . 
gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1"; +1 havana exon 9747648 9748604 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9752449 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; +1 havana exon 9791125 9791922 . + . 
gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "4"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1"; diff --git a/transcript_structure/__init__.py b/transcript_structure/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..db9c290dedfc90e7b0fdd03739ac23cc90d5d051 --- /dev/null +++ b/transcript_structure/__init__.py @@ -0,0 +1,2 @@ +# Transcript Structure root package +__version__ = "1.1.0" diff --git a/transcript_structure/gene_count/.DS_Store b/transcript_structure/gene_count/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/transcript_structure/gene_count/.DS_Store differ diff --git a/transcript_structure/gene_count/Rik_5.csv b/transcript_structure/gene_count/Rik_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..d437d8ec85888623dbc16f18beec306929a4c38f --- /dev/null +++ b/transcript_structure/gene_count/Rik_5.csv @@ -0,0 +1,2 @@ +GeneID,count +1700034P13Rik,5 \ No newline at end of file diff --git a/transcript_structure/gene_count/Rik_5_Rp1_5.csv b/transcript_structure/gene_count/Rik_5_Rp1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..b6be973a2fcec97d9ed65dc4e600ded43d94b7db --- /dev/null +++ b/transcript_structure/gene_count/Rik_5_Rp1_5.csv @@ -0,0 +1,3 @@ +GeneID,count +1700034P13Rik,5 +Rp1,5 \ No newline at end of file diff --git a/transcript_structure/gene_count/Rp1_5.csv b/transcript_structure/gene_count/Rp1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..e7bedd16273dbb4e829a16d37f9b60f654115b1f --- /dev/null +++ b/transcript_structure/gene_count/Rp1_5.csv @@ -0,0 +1,2 @@ +GeneID,count 
+Rp1,5 \ No newline at end of file diff --git a/transcript_structure/gtf/.DS_Store b/transcript_structure/gtf/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/transcript_structure/gtf/.DS_Store differ diff --git a/transcript_structure/gtf/coordinates.gtf b/transcript_structure/gtf/coordinates.gtf new file mode 100644 index 0000000000000000000000000000000000000000..cb15d1672584996c6468f4e08d555a0c2ff30818 Binary files /dev/null and b/transcript_structure/gtf/coordinates.gtf differ diff --git a/transcript_structure/gtf/gtf_explained.xlsx b/transcript_structure/gtf/gtf_explained.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5cc4a71bb5c7db6720aa43375fc9c45c8441581f Binary files /dev/null and b/transcript_structure/gtf/gtf_explained.xlsx differ