Skip to content
Snippets Groups Projects
Commit b255c509 authored by Timon Baltisberger's avatar Timon Baltisberger
Browse files

feat: adds transcript structure generator

parent 95d07fbd
No related branches found
No related tags found
1 merge request: !15 "add: generate transcript structure"
Showing
with 467 additions and 0 deletions
File added
File added
1 ensembl_havana gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1";
1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4352202 4352837 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001377400"; exon_version "1"; transcript_support_level "1";
1 havana exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001380148"; exon_version "1"; transcript_support_level "1";
1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "4"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001345123"; exon_version "1"; transcript_support_level "1";
1 havana gene 9747648 9791924 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA";
1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1";
1 havana exon 9747648 9748604 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9752449 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001144804"; exon_version "1"; transcript_support_level "1";
1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001175967"; exon_version "1"; transcript_support_level "1";
1 havana exon 9791125 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "4"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-202"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001174858"; exon_version "1"; transcript_support_level "1";
\ No newline at end of file
1700034P13Rik,5
Rp1,5
\ No newline at end of file
GeneID,count
1700034P13Rik,5
Rp1,5
\ No newline at end of file
import pytest
from transcript_structure import Generate_transcript_structure as Gts
# Gene-count fixtures: the same two genes, once with and once without a title row.
TEST_CSV_TITLE = './tests/resources/test_transcript_structure/Rik_5_Rp1_5_title.csv'
TEST_CSV_NO_TITLE = './tests/resources/test_transcript_structure/Rik_5_Rp1_5_no_title.csv'

# Annotation fixture holding the gene, transcript and exon lines of both genes.
GENE_COORDS = './tests/resources/RP1_RIK.gtf'

# Gene names expected as keys of the dictionaries built by the class under test.
GENE_KEYS = ['Rp1', '1700034P13Rik']

# Intron inclusion probabilities handed to BuildTranscriptStructure by the tests.
P_INTRON_0: float = 0
P_INTRON_0_2: float = 0.2
P_INTRON_1: float = 1
@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_NO_TITLE, GENE_COORDS, P_INTRON_0)
     ],
)
def test_csv_2_dict(test_input):
    """Checks that csv_2_dict mirrors the gene count file, with or without a title row.

    Args:
        test_input(tuple): (path to gene count csv, path to gtf, p_intron).
    """
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()
    # Read the file that was actually handed to the builder, so that the
    # variant without title row is really exercised.
    with open(test_input[0]) as csv_file:
        csv_lines = [line.strip() for line in csv_file if line.strip()]
    # The lines are stripped before the check: '5\n'.isnumeric() is False.
    if not csv_lines[0].split(',')[1].isnumeric():
        del csv_lines[0]  # Removes title.
    dict_lines = [','.join([gene, str(count)])
                  for gene, count in builder.gene_count_dict.items()]
    # Comparing the whole lists also detects missing or surplus dictionary entries.
    assert dict_lines == csv_lines
def test_gtf_2_dict():
    """Checks the nested dictionary that gtf_2_dict builds from the gtf fixture."""
    builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
    builder.gtf_2_dict()
    genes = builder.gene_sequences_dict

    assert len(genes) == 2  # Two genes read in the dictionary.
    for gene_key in GENE_KEYS[:2]:
        assert len(genes[gene_key]) == 5  # Five entries stored per gene.

    # The gene line has to be stored exactly as it appears in the gtf file.
    expected_rik_line = ('1\thavana\tgene\t9747648\t9791924\t.\t+\t.\tgene_id "ENSMUSG00000097893"; gene_version "8"; '
                         'gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA";\n')
    assert expected_rik_line == genes[GENE_KEYS[1]]['gene_line']

    with open(GENE_COORDS) as gtf:
        n_gtf_lines = len(gtf.readlines())
    # Every line that is neither a gene nor a transcript line (2 genes x 2 lines) is an exon.
    numb_exons_gtf = n_gtf_lines - 4
    numb_exons_dict = sum(len(genes[gene_key]['exon_seq']) for gene_key in GENE_KEYS)
    assert numb_exons_gtf == numb_exons_dict
@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_make_new_transcripts(test_input):
    """Checks number and uniformity of the transcripts for p_intron 0 and 1.

    Args:
        test_input(tuple): (path to gene count csv, path to gtf, p_intron).
    """
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
    numb_trans_csv = 10  # The csv fixture requests 5 transcripts for each of the 2 genes.
    numb_trans_dict = sum(
        sum(builder.gene_transcript_dict[gene_key].values()) for gene_key in GENE_KEYS
    )
    # Compares the generated total with the requested total
    # (the previous assertion compared the constant with itself).
    assert numb_trans_dict == numb_trans_csv
    for gene_key in GENE_KEYS:
        # With p_intron 0 or 1 all copies of a gene share one transcript ID.
        assert len(builder.gene_transcript_dict[gene_key]) == 1
@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_make_gtf_lines(test_input):
    """Checks that no generated gtf line has its start after its end.

    Args:
        test_input(tuple): (path to gene count csv, path to gtf, p_intron).
    """
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
    builder.make_gtf_info()
    assert builder.gtf_lines  # The loop below must not pass on an empty list.
    for line in builder.gtf_lines:
        columns = line.split('\t')
        # Compared as integers: as strings '10000' would sort before '9000'.
        assert int(columns[3]) <= int(columns[4])
@pytest.mark.parametrize(
    "test_input",
    [(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0),
     (TEST_CSV_TITLE, GENE_COORDS, P_INTRON_1)
     ],
)
def test_sort_gtf_lines(test_input):
    """Checks that sort_gtf_lines orders the genes by increasing start coordinate.

    Args:
        test_input(tuple): (path to gene count csv, path to gtf, p_intron).
    """
    builder = Gts.BuildTranscriptStructure(test_input[0], test_input[1], test_input[2])
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
    builder.make_gtf_info()

    def gene_starts():
        """Returns the start coordinates (int) of all gene lines in their current order."""
        starts = []
        for line in builder.gtf_lines:
            columns = line.split('\t')
            if columns[2] == 'gene':
                starts.append(int(columns[3]))  # Integers: avoids lexicographic comparison.
        return starts

    starts_before = gene_starts()  # Verifies that the function actually has to sort.
    for previous, following in zip(starts_before, starts_before[1:]):
        assert previous > following

    builder.sort_gtf_lines()
    starts_after = gene_starts()  # Verifies that the function sorted.
    assert len(starts_after) == len(starts_before)
    for previous, following in zip(starts_after, starts_after[1:]):
        assert previous < following

    sorted_once = list(builder.gtf_lines)
    builder.sort_gtf_lines()  # Sorting already sorted lines must not change them.
    assert builder.gtf_lines == sorted_once
def test_write_gtf():
    """Checks that write_gtf writes exactly the generated gtf lines to the output file."""
    import os
    import tempfile

    builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
    builder.generate_transcript_structure()
    with tempfile.TemporaryDirectory() as out_dir:  # Keeps the repository free of test output.
        out_path = os.path.join(out_dir, 'transcripts.gtf')
        builder.write_gtf(out_path)
        with open(out_path) as gtf:
            written = gtf.read()
    assert builder.gtf_lines  # Guards against trivially comparing two empty outputs.
    assert written == ''.join(builder.gtf_lines)
def test_write_csv():
    """Checks title row, gene names and total count of the csv file written by write_csv."""
    import csv
    import os
    import tempfile

    builder = Gts.BuildTranscriptStructure(TEST_CSV_TITLE, GENE_COORDS, P_INTRON_0)
    builder.csv_2_dict()  # Generates dictionary from gene count csv file.
    builder.gtf_2_dict()  # Generates dictionary from gtf file.
    builder.make_new_transcripts()  # Generates the differently spliced transcripts.
    with tempfile.TemporaryDirectory() as out_dir:  # Keeps the repository free of test output.
        out_path = os.path.join(out_dir, 'transcripts.csv')
        builder.write_csv(out_path)
        with open(out_path, newline='') as csv_file:
            rows = list(csv.reader(csv_file))
    assert rows[0] == ['Transcript_ID', 'Gene_ID', 'count']
    assert {row[1] for row in rows[1:]} == set(GENE_KEYS)
    # The csv fixture requests 5 transcripts for each of the 2 genes.
    assert sum(int(row[2]) for row in rows[1:]) == 10
File added
import random
import csv
import copy
class BuildTranscriptStructure:
    """Creates differently spliced transcripts.

    Args:
        input_gene_count(str): Path to csv file of type "geneID", number to sample.
        input_coordinates(str): Path to gtf file of relevant genes.
        p_intron(float): Probability to include each intron in the mRNA sequence.

    Attributes:
        gene_count_dict(dict): Dictionary of format {"gene_ID": number_of_samplings}
        gene_sequences_dict(dict): Nested dictionary with information about each gene.
            Format: {"gene_ID": {"gene_line": gene_gtf_line,
                                 "transcript_line": transcript_gtf_line,
                                 "exon_line": exemplary_exon_line,
                                 "strand_sense": True for '+' strands, False otherwise,
                                 "exon_seq": [[start_exon1, end_exon1],[start_exon2, end_exon2],...]
                                 }
                     }
            "exon_seq" is stored in increasing genomic order for both strand senses.
        gene_transcript_dict(dict): Nested dictionary with amount of each differently spliced transcript.
            Format: {"gene_ID": {"transcript_ID": n_copies}}
            The transcript ID is the gene name, a '-', and a binary code signifying whether a certain
            intron is included or not.
            E.g. gene with 4 exons:
            transcript code 001: The 1. and 2. introns are not included (spliced away), the 3. is included.
            The transcript will therefore have 3 exons (exon 3 and 4 are combined).
            For sequences with negative strand senses, the exon numbering is determining the direction the
            transcript code is to be applied, and not the occurrence in the gene sequence (inverted by
            convention).
            E.g. '01' means the intron between original (fully spliced) exon 1 and exon 2 was spliced away,
            but the intron between exon 2 and exon 3 is included in the transcript.
        gtf_lines(list): List with all newly created gtf lines.
    """

    def __init__(self,
                 input_gene_count: str,
                 input_coordinates: str,
                 p_intron: float,
                 ) -> None:
        """Class constructor. Stores the inputs; no file is read yet."""
        self.gene_count = input_gene_count
        self.input_coords = input_coordinates
        self.p_intron = p_intron
        self.gene_count_dict = {}
        self.gene_sequences_dict = {}
        self.gene_transcript_dict = {}
        self.gtf_lines = []

    def generate_transcript_structure(self) -> None:
        """Computes distribution and gene coordinates of differently spliced mRNA."""
        self.csv_2_dict()  # Generates dictionary from gene count csv file.
        self.gtf_2_dict()  # Generates dictionary from gtf file.
        self.make_new_transcripts()  # Generates the differently spliced transcripts.
        self.make_gtf_info()  # Builds the gtf file of all newly created transcripts.
        self.sort_gtf_lines()  # Sorts the gtf file by gene occurrence in sequence.

    @staticmethod
    def _split_attributes(attribute_field: str) -> list:
        """Splits the attribute column of a gtf line into stripped 'key "value"' entries."""
        return [entry.strip() for entry in attribute_field.split(';') if entry.strip()]

    @classmethod
    def _get_attribute(cls, attribute_field: str, key: str):
        """Returns the unquoted value of attribute `key`, or None if the attribute is absent."""
        for entry in cls._split_attributes(attribute_field):
            name, _, value = entry.partition(' ')
            if name == key:
                return value.strip().strip('"')
        return None

    @classmethod
    def _set_attribute(cls, attribute_field: str, key: str, value: str) -> str:
        """Returns the attribute column with `key` set to `value` (appended if it was absent).

        The column is rebuilt as 'key "value"; key "value";' and terminated with a newline.
        """
        new_entry = '{} "{}"'.format(key, value)
        entries = cls._split_attributes(attribute_field)
        for position, entry in enumerate(entries):
            if entry.partition(' ')[0] == key:
                entries[position] = new_entry
                break
        else:  # Attribute not present in the template line.
            entries.append(new_entry)
        return '; '.join(entries) + ';\n'

    def csv_2_dict(self) -> None:
        """Converts the csv file with gene count into a dictionary.

        A first row whose second field does not start with a digit is treated as title and
        skipped. Empty rows are ignored, and an empty file leaves the dictionary untouched.

        Raises:
            ValueError: If a count cannot be converted to an integer.
            IndexError: If a data row has less than two columns.
        """
        with open(self.gene_count, newline='') as g_c:
            rows = [row for row in csv.reader(g_c) if any(field.strip() for field in row)]
        if rows and (len(rows[0]) < 2 or not rows[0][1].strip()[:1].isnumeric()):
            del rows[0]  # Removes the first line if it is a title.
        for row in rows:
            self.gene_count_dict[row[0].strip()] = int(row[1])

    def gtf_2_dict(self) -> None:
        """Converts the gtf file into a nested dictionary.

        For every gene line, the following line is stored as transcript line and the
        uninterrupted run of exon lines after it as the exons of the gene. The scan stops at
        the first line that is not an exon, so only the exons listed directly after the
        first transcript line are read. Comment and blank lines are ignored. Genes are keyed
        by their gene_name attribute (gene_id if no gene_name is given).
        """
        with open(self.input_coords) as c_g:  # Reads coordinates from .gtf file.
            lines = [line for line in c_g if line.strip() and not line.startswith('#')]
        for gene_index, gene_line in enumerate(lines):
            line_entries = gene_line.split('\t')
            if line_entries[2] != 'gene':
                continue
            if gene_index + 2 >= len(lines):
                continue  # Gene without transcript and exon line at the end of the file.
            # Looks the name up by its key instead of relying on the attribute position.
            gene_name = self._get_attribute(line_entries[8], 'gene_name')
            if gene_name is None:
                gene_name = self._get_attribute(line_entries[8], 'gene_id')
            if gene_name is None:
                continue  # No identifier available to store the gene under.
            coordinates = []
            for exon_index in range(gene_index + 2, len(lines)):
                exon_entries = lines[exon_index].split('\t')
                if exon_entries[2] != 'exon':  # End of exon list of this gene is reached.
                    break
                coordinates.append([int(exon_entries[3]), int(exon_entries[4])])
            # Increasing genomic order for both strand senses. Sorting does not depend on
            # the strand column of whichever line happened to end the scan above.
            coordinates.sort()
            self.gene_sequences_dict[gene_name] = {
                'gene_line': gene_line,
                'transcript_line': lines[gene_index + 1],
                'exon_line': lines[gene_index + 2],  # Exemplary line of an exon.
                'strand_sense': line_entries[6] == '+',  # Strand sense taken from the gene line.
                'exon_seq': coordinates,
            }

    def make_new_transcripts(self) -> None:
        """ Generates the differently spliced transcripts.

        One random number is drawn per intron and copy; the intron is kept if the number is
        below p_intron. Genes with a count of zero get an empty transcript dictionary.

        Raises:
            KeyError: If a gene of the gene count file is not part of the gtf dictionary.
        """
        for gene, n_copies in self.gene_count_dict.items():
            if gene not in self.gene_sequences_dict:
                raise KeyError('Gene "{}" of the gene count file was not found in the gtf file.'.format(gene))
            n_introns = len(self.gene_sequences_dict[gene]['exon_seq']) - 1
            # Computes the intron splicing for each transcript.
            splice_codes = []
            for _ in range(n_copies):
                code = []
                for __ in range(n_introns):
                    # '1': intron not spliced away, '0': intron spliced away.
                    code.append('1' if random.random() < self.p_intron else '0')
                splice_codes.append(''.join(code))
            # Counts how often each transcript is in the list. Scanning from the end keeps
            # the order of the entries of the former pop()-based counting.
            transcript_numbers = {}
            for code in reversed(splice_codes):
                transcript_id = '-'.join([gene, code])
                transcript_numbers[transcript_id] = transcript_numbers.get(transcript_id, 0) + 1
            self.gene_transcript_dict[gene] = transcript_numbers

    def write_csv(self,
                  output_transcript_count: str
                  ) -> None:
        """ Writes a csv file containing the number of differently spliced transcripts.

        Args:
            output_transcript_count(str): Path and name of the output csv file: "transcript_ID", "gene_ID", count.
        """
        with open(output_transcript_count, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Transcript_ID', 'Gene_ID', 'count'])
            for gene, transcripts in self.gene_transcript_dict.items():
                for transcript_id, n_copies in transcripts.items():
                    writer.writerow([transcript_id, gene, n_copies])

    def make_gtf_info(self) -> None:
        """ Writes the lines of the new gtf file for the differently spliced transcripts.

        gtf_lines is rebuilt from scratch, so repeated calls do not duplicate lines. All exon
        lines of a gene are derived from its exemplary exon line; only feature, coordinates,
        exon_number and transcript_name are adapted.
        """
        self.gtf_lines = []
        for gene, transcripts in self.gene_transcript_dict.items():  # Iterates over all genes required.
            gene_info = self.gene_sequences_dict[gene]
            exon_seq = gene_info['exon_seq']
            sense = gene_info['strand_sense']
            gene_line = gene_info['gene_line']
            # A last line of the input file without newline would merge with the next line.
            self.gtf_lines.append(gene_line if gene_line.endswith('\n') else gene_line + '\n')
            for transcript_id in transcripts:  # Iterates over all occurring types of splicings.
                # Modifies the transcript line according to the splicing.
                transcript_line = gene_info['transcript_line'].split('\t')
                transcript_line[8] = self._set_attribute(transcript_line[8], 'transcript_name', transcript_id)
                self.gtf_lines.append('\t'.join(transcript_line))

                # Extracts the splicing code as int, in increasing genomic order: the code
                # follows the exon numbering, which is inverted on negative strands.
                code = [int(digit) for digit in transcript_id[len(gene) + 1:]]
                if not sense:
                    code.reverse()

                # Combines neighbouring exons separated by an included intron. As exon_seq is
                # in increasing order, only the end coordinate grows, on both strand senses.
                merged_exons = []
                if exon_seq:
                    start, end = exon_seq[0]
                    for included, (next_start, next_end) in zip(code, exon_seq[1:]):
                        if included:
                            end = next_end
                        else:
                            merged_exons.append((start, end))
                            start, end = next_start, next_end
                    merged_exons.append((start, end))

                numb_exons = len(merged_exons)
                exon_lines = []
                for exon, (start, end) in enumerate(merged_exons):
                    exon_line = gene_info['exon_line'].split('\t')  # Initializes exon line.
                    exon_line[2] = 'exon'
                    exon_line[3] = str(start)
                    exon_line[4] = str(end)
                    # Exon 1 is the first exon in reading direction of the strand.
                    exon_number = exon + 1 if sense else numb_exons - exon
                    attributes = self._set_attribute(exon_line[8], 'exon_number', str(exon_number))
                    exon_line[8] = self._set_attribute(attributes, 'transcript_name', transcript_id)
                    exon_lines.append('\t'.join(exon_line))
                if not sense:  # Negative strand sense need reversed order.
                    exon_lines.reverse()
                self.gtf_lines.extend(exon_lines)

    def sort_gtf_lines(self) -> None:
        """ Sorts the gtf lines by the position of the genes (increasing).

        All lines between two gene lines stay attached to the preceding gene line. The start
        coordinates are compared as integers; genes with identical start keep their order.

        Raises:
            ValueError: If the first gtf line is not a gene line.
        """
        gene_blocks = []  # [(start_gene(int), [gene_line, transcript_line, exon_line1, ...]), ...]
        for line in self.gtf_lines:
            line_content = line.split('\t')
            if line_content[2] == 'gene':  # This is the next gene line. Starts a new block.
                gene_blocks.append((int(line_content[3]), [line]))
            elif gene_blocks:
                gene_blocks[-1][1].append(line)  # Append all lines related to this gene.
            else:
                raise ValueError('The gtf lines have to start with a gene line.')
        gene_blocks.sort(key=lambda block: block[0])  # Stable sort by the start of the gene.
        self.gtf_lines = [line for _, block_lines in gene_blocks for line in block_lines]

    def write_gtf(self,
                  output_coords: str
                  ) -> None:
        """ Writes a gtf file with the information about the differently spliced transcripts.

        Args:
            output_coords(str): Path and name of the output gtf file with the information of all relevant transcripts.
        """
        with open(output_coords, 'w') as gtf_file:
            gtf_file.writelines(self.gtf_lines)
def main():
    """ Runs the whole pipeline on the example inputs and writes both output files."""
    # Input files. Alternatives for a single strand sense:
    # 'gene_count/Rik_5.csv' (+ sense) or 'gene_count/Rp1_5.csv' (- sense).
    gene_count = 'gene_count/Rik_5_Rp1_5.csv'  # Both strand senses combined
    coordinates_genes = 'gtf/coordinates.gtf'
    p_intron = 0.3

    # Output paths and names.
    name_csv_output = 'Outputs/csv_new.csv'
    name_gtf_output = 'Outputs/gtf_new.gtf'

    random.seed(10)  # Initializes seed for random functions for reproducibility.
    bts = BuildTranscriptStructure(gene_count, coordinates_genes, p_intron)
    # Reads both inputs, samples the transcripts, then builds and sorts the gtf lines.
    bts.generate_transcript_structure()
    bts.write_gtf(name_gtf_output)  # Writes the new gtf file.
    bts.write_csv(name_csv_output)  # Writes the new csv file with the count of the transcripts.


if __name__ == '__main__':
    main()
    print('process completed')
File added
Transcript_ID,Gene_ID,count
1700034P13Rik-010,1700034P13Rik,3
1700034P13Rik-100,1700034P13Rik,1
1700034P13Rik-000,1700034P13Rik,1
Rp1-111,Rp1,1
Rp1-000,Rp1,2
Rp1-010,Rp1,1
Rp1-001,Rp1,1
1 ensembl_havana gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-111"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1";
1 havana exon 4409170 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-111"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1";
1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4352202 4352837 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "4"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-000"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1";
1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4352202 4352081 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4290846 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-010"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana transcript 4290846 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1";
1 havana exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "1"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4352202 4352837 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "2"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana exon 4351910 4293012 . - . gene_id "ENSMUSG00000025900"; gene_version "12"; transcript_id "ENSMUST00000208793"; transcript_version "1"; exon_number "3"; gene_name "Rp1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Rp1-001"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSMUSE00001379779"; exon_version "1"; transcript_support_level "1";
1 havana gene 9747648 9791924 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA";
1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1";
1 havana exon 9747648 9748604 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9752449 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9791125 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-010"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1";
1 havana exon 9747648 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9791125 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-100"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana transcript 9747648 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; transcript_support_level "1";
1 havana exon 9747648 9748604 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "1"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9752449 9752564 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "2"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9789656 9789780 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "3"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
1 havana exon 9791125 9791922 . + . gene_id "ENSMUSG00000097893"; gene_version "8"; transcript_id "ENSMUST00000181821"; transcript_version "7"; exon_number "4"; gene_name "1700034P13Rik"; gene_source "havana"; gene_biotype "lncRNA"; transcript_name "1700034P13Rik-000"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSMUSE00001104564"; exon_version "1"; transcript_support_level "1";
# Transcript Structure root package.
# __version__ holds the version string of the package.
__version__ = "1.1.0"
File added
GeneID,count
1700034P13Rik,5
\ No newline at end of file
GeneID,count
1700034P13Rik,5
Rp1,5
\ No newline at end of file
GeneID,count
Rp1,5
\ No newline at end of file
File added
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File added
  • Author Developer

    Working on issue #2, generating transcript structure

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment