diff --git a/.gitignore b/.gitignore index 2e906cc871537e0816ae7fd4437ade59afa2986c..7a8baad3fbb9055363fc60eda6a2113c091f85f5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ # ignore ALL files in ANY directory named temp temp/ +__pycache__ +output_files \ No newline at end of file diff --git a/README.md b/README.md index 05f0a157c6478ab0f3b4dda1e979ed88b339850d..fee84f77ae53f93f3ef3466c1261a323e7f336a4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Transcript Sampler -This workflow sample representative transcripts per gene, in proportion to their relative abundance levels. Sampling is done by poisson sampling. +This workflow samples representative transcripts per gene, in proportion to their relative abundance levels. Sampling is done by Poisson sampling. **This workflow takes as input:** - Path to genome annotation file in gtf format @@ -15,10 +15,8 @@ This workflow sample representative transcripts per gene, in proportion to their **The workflow can be run via the command line as** - `python scripts/new-exe.py --annotation {gtf input file} --output_csv {output csv file} --transcript_number {number of transcripts} --output_gtf {output gtf file} --input_csv {input csv file}` - - Exemple : - - `python scripts\new_exe.py --annotation "input_files\test.gtf" --output_csv "output_files\output_csv.txt" --transcript_number 50 --output_gtf "output_files\output_gtf.gtf" --input_csv "input_files/expression.csv"` + `python transcript_sampler/new_exe.py --input_gtf={gtf input file} --input_csv={input csv file} --output_gtf={output gtf file} --output_csv={output csv file} --n_to_sample={number of transcripts}` + Example : + `python transcript_sampler/new_exe.py --input_gtf="input_files/test.gtf" --input_csv="input_files/expression.csv" --output_gtf="output_files/output.gtf" --output_csv="output_files/output.csv" --n_to_sample=100` diff --git a/images/Transcript_sampling__architecture.png b/images/Transcript_sampling__architecture.png deleted file mode 100644 
index 59dab4f65555d376d09385f55461e879d7b27527..0000000000000000000000000000000000000000 Binary files a/images/Transcript_sampling__architecture.png and /dev/null differ diff --git a/input_files/test.gtf b/input_files/test.gtf index bac42f56831fc4495d5f7680933fa3dc842b77e4..36d3fb5bce32177b4ac2512149b25571b43a7acf 100644 --- a/input_files/test.gtf +++ b/input_files/test.gtf @@ -1,183 +1,183 @@ -#!genome-build GRCh38.p13 -#!genome-version GRCh38 -#!genome-date 2013-12 -#!genome-build-accession GCA_000001405.28 -#!genebuild-last-updated 2022-04 -1 ensembl_havana gene 1471765 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; -1 ensembl_havana transcript 1471765 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; -1 ensembl_havana exon 1471765 14720800009 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003889014"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1471885 1472089 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana start_codon 1471885 1471887 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; -1 ensembl_havana exon 1477274 1477350 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003467707"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1477274 1477350 . + 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1478644 1478745 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003569130"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1478644 1478745 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1479049 1479108 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003608502"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1479049 1479108 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1480867 1480936 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003474888"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1480867 1480936 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1482138 1482303 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003654064"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1482138 1482303 . + 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1482545 1482614 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003510521"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1482545 1482614 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1485016 1485171 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003459370"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1485016 1485171 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1485782 1485838 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003655926"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1485782 1485838 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1486110 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003594545"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1486110 1486235 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1486544 1486668 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003892109"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1486544 1486668 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003689846"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1487863 1487914 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1489204 1489274 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003670332"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1489204 1489274 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003505365"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1490257 1490424 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1490563 1490671 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "15"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003497242"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1490563 1490671 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "15"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana exon 1495485 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003892597"; exon_version "1"; tag "basic"; -1 ensembl_havana CDS 1495485 1495814 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; -1 ensembl_havana stop_codon 1495815 1495817 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; -1 ensembl_havana five_prime_utr 1471765 1471884 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; -1 ensembl_havana three_prime_utr 1495818 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; -1 havana transcript 1478026 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; -1 havana exon 1478026 1478745 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001943609"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1479049 1479108 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003589422"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1480867 1480936 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003672769"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1482138 1482303 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003661157"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1482545 1482614 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003517812"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1485016 1485170001 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003542737"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1485782 14858380000 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003479480"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1486110 1486235000 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003503434"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1487863 1487914 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003528975"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003611023"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003653402"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003486643"; exon_version "1"; transcript_support_level "1"; -1 havana exon 1495485 1497848 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001863816"; exon_version "1"; transcript_support_level "1"; -1 havana transcript 1479049 1482662 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "5"; -1 havana exon 1479049 1479108 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003589422"; exon_version "1"; transcript_support_level "5"; -1 havana exon 1480867 1480936 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003672769"; exon_version "1"; transcript_support_level "5"; -1 havana exon 1482138 1482303 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003661157"; exon_version "1"; transcript_support_level "5"; -1 havana exon 1482545 1482662 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003614529"; exon_version "1"; transcript_support_level "5"; -1 havana transcript 1483485 1496202 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; -1 havana exon 1483485 1485171 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001893282"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1485782 1485838 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003479480"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1486110 1486235 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003503434"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003528975"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003611023"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1489692 1489811 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001885858"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003653402"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003486643"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1495485 1496202 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003689276"; exon_version "1"; transcript_support_level "2"; -1 havana transcript 1484569 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; -1 havana exon 1484569 1485171 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001844843"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1485782 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001818637"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1489204 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001832340"; exon_version "1"; transcript_support_level "2"; -1 havana exon 1495485 1496201 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001844973"; exon_version "1"; transcript_support_level "2"; -1 ensembl transcript 1471784 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1471784 1472089 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00001833190"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1471885 1472089 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl start_codon 1471885 1471887 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1477274 1477350 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003467707"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1477274 1477350 . + 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1480867 1480908 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003889337"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1480867 1480908 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1482266 1482303 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003889634"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1482266 1482303 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1482545 1482614 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003510521"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1482545 1482614 . 
+ 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1485016 1485171 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003459370"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1485016 1485171 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1485782 1485838 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003655926"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1485782 1485838 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1486110 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003594545"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1486110 1486235 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003662125"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl CDS 1486544 1486666 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003528975"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003611023"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003653402"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1490563 1490671 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003486643"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl exon 1495485 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00001844973"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl five_prime_utr 1471784 1471884 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1486667 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1487863 1487914 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 ensembl three_prime_utr 1495485 1496201 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; -1 havana gene 2212523 2220738 . + . gene_id "ENSG00000234396"; gene_version "3"; gene_source "havana"; gene_biotype "lncRNA"; -1 havana transcript 2212523 2220738 . + . gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; tag "basic"; transcript_support_level "3"; -1 havana exon 2212523 2212644 . + . gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; exon_number "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001603085"; exon_version "2"; tag "basic"; transcript_support_level "3"; -1 havana exon 2220535 2220738 . + . gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; exon_number "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001607640"; exon_version "2"; tag "basic"; transcript_support_level "3"; -1 havana gene 629062 629433 . + . gene_id "ENSG00000225972"; gene_version "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; -1 havana transcript 629062 629433 . + . 
gene_id "ENSG00000225972"; gene_version "1"; transcript_id "ENST00000416931"; transcript_version "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTND1P23-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; -1 havana exon 629062 629433 . + . gene_id "ENSG00000225972"; gene_version "1"; transcript_id "ENST00000416931"; transcript_version "1"; exon_number "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTND1P23-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001797039"; exon_version "1"; tag "basic"; transcript_support_level "NA"; -1 havana gene 8786211 8786913 . - . gene_id "ENSG00000224315"; gene_version "2"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; -1 havana transcript 8786211 8786913 . - . gene_id "ENSG00000224315"; gene_version "2"; transcript_id "ENST00000428803"; transcript_version "2"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "RPL7P7-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; tag "basic"; transcript_support_level "NA"; -1 havana exon 8786211 8786913 . - . gene_id "ENSG00000224315"; gene_version "2"; transcript_id "ENST00000428803"; transcript_version "2"; exon_number "1"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "RPL7P7-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00001776158"; exon_version "2"; tag "basic"; transcript_support_level "NA"; -1 havana gene 634376 634922 . + . gene_id "ENSG00000198744"; gene_version "5"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; -1 havana transcript 634376 634922 . + . 
gene_id "ENSG00000198744"; gene_version "5"; transcript_id "ENST00000416718"; transcript_version "2"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTCO3P12-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; -1 havana exon 634376 634922 . + . gene_id "ENSG00000198744"; gene_version "5"; transcript_id "ENST00000416718"; transcript_version "2"; exon_number "1"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTCO3P12-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001720008"; exon_version "2"; tag "basic"; transcript_support_level "NA"; -1 havana gene 182696 184174 . + . gene_id "ENSG00000279928"; gene_version "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; -1 havana transcript 182696 184174 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; -1 havana exon 182696 182746 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "1"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003759020"; exon_version "2"; tag "basic"; transcript_support_level "NA"; -1 havana exon 183132 183216 . + . 
gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003759581"; exon_version "2"; tag "basic"; transcript_support_level "NA"; -1 havana exon 183494 183571 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "3"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003804405"; exon_version "1"; tag "basic"; transcript_support_level "NA"; -1 havana exon 183740 183901 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "4"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003807458"; exon_version "1"; tag "basic"; transcript_support_level "NA"; -1 havana exon 183981 184174 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "5"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003760199"; exon_version "2"; tag "basic"; transcript_support_level "NA"; -1 havana gene 2581560 2584533 . + . gene_id "ENSG00000228037"; gene_version "1"; gene_source "havana"; gene_biotype "lncRNA"; -1 havana transcript 2581560 2584533 . + . 
gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; tag "basic"; transcript_support_level "5"; -1 havana exon 2581560 25816500000 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001795368"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 2583370 2583495 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001694676"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 2584125 2584533 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "3"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001601095"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 ensembl_havana gene 3069168 3438621 . + . gene_id "ENSG00000142611"; gene_version "17"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; -1 havana transcript 3069168 3434342 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; -1 havana exon 3069168 3069296 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002048533"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3069260 3069296 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana start_codon 3069260 3069262 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; -1 havana exon 3186125 3186474 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001754112"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3186125 3186474 . 
+ 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3244087 3244137 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "3"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003480863"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3244087 3244137 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "3"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3385149 3385286 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "4"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002034212"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3385149 3385286 . 
+ 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "4"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3396491 3396593 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700221"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3396491 3396593 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3402791 3402998 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003696962"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3402791 3402998 . 
+ 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3404739 3404886 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "7"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700688"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3404739 3404886 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "7"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3405495 3405648 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "8"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700645"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3405495 3405648 . 
+ 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "8"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3411384 3412800 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "9"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003695658"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3411384 3412800 . + 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "9"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3414560 3414647 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "10"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003701451"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3414560 3414647 . 
+ 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "10"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3417828 3417997 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "11"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003699052"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3417828 3417997 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "11"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3418667 3418744 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "12"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003698430"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3418667 3418744 . 
+ 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "12"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3425581 3425750 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "13"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003699796"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3425581 3425750 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "13"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3426051 3426225 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "14"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003701891"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3426051 3426225 . 
+ 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "14"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3430872 3431108 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "15"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003698226"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3430872 3431108 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "15"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana exon 3433677 3434342 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002081080"; exon_version "1"; tag "basic"; transcript_support_level "5"; -1 havana CDS 3433677 3433686 . 
+ 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; -1 havana stop_codon 3433687 3433689 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; -1 havana five_prime_utr 3069168 3069259 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; -1 havana three_prime_utr 3433690 3434342 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; -1 havana transcript 3069183 3186591 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; -1 havana exon 3069183 3069296 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003700259"; exon_version "1"; transcript_support_level "2"; -1 havana exon 3186125 3186591 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003695128"; exon_version "1"; transcript_support_level "2"; -1 havana transcript 3069197 3435421 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; tag "basic"; transcript_support_level "1"; -1 havana exon 3069197 3069296 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; exon_id "ENSE00001222906"; exon_version "5"; tag "basic"; transcript_support_level "1"; -1 havana CDS 3069260 3069296 . 
+ 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; protein_id "ENSP00000367643"; protein_version "2"; tag "basic"; transcript_support_level "1"; -1 havana start_codon 3069260 3069262 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; tag "basic"; transcript_support_level "1"; -1 havana exon 3186125 31864740000 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; exon_id "ENSE00001754112"; exon_version "1"; tag "basic"; transcript_support_level "1"; \ No newline at end of file +#!genome-build GRCh38.p13 +#!genome-version GRCh38 +#!genome-date 2013-12 +#!genome-build-accession GCA_000001405.28 +#!genebuild-last-updated 2022-04 +1 ensembl_havana gene 1471765 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 ensembl_havana transcript 1471765 1497848 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; +1 ensembl_havana exon 1471765 14720800009 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003889014"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1471885 1472089 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana start_codon 1471885 1471887 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; +1 ensembl_havana exon 1477274 1477350 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003467707"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1477274 1477350 . + 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1478644 1478745 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003569130"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1478644 1478745 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1479049 1479108 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003608502"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1479049 1479108 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1480867 1480936 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003474888"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1480867 1480936 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1482138 1482303 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003654064"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1482138 1482303 . + 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1482545 1482614 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003510521"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1482545 1482614 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1485016 1485171 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003459370"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1485016 1485171 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1485782 1485838 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003655926"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1485782 1485838 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1486110 1486235 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003594545"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1486110 1486235 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003892109"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1486544 1486668 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1487863 1487914 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003689846"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1487863 1487914 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003670332"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1489204 1489274 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1490257 1490424 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003505365"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1490257 1490424 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "15"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003497242"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1490563 1490671 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "15"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana exon 1495485 1497848 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003892597"; exon_version "1"; tag "basic"; +1 ensembl_havana CDS 1495485 1495814 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic"; +1 ensembl_havana stop_codon 1495815 1495817 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "16"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; +1 ensembl_havana five_prime_utr 1471765 1471884 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; +1 ensembl_havana three_prime_utr 1495818 1497848 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic"; +1 havana transcript 1478026 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "1"; +1 havana exon 1478026 1478745 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001943609"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1479049 1479108 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003589422"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1480867 1480936 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003672769"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1482138 1482303 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003661157"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1482545 1482614 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003517812"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1485016 1485170001 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003542737"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1485782 14858380000 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003479480"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1486110 1486235000 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003503434"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003528975"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003611023"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1490257 1490424 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003653402"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003486643"; exon_version "1"; transcript_support_level "1"; +1 havana exon 1495485 1497848 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000472194"; transcript_version "6"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001863816"; exon_version "1"; transcript_support_level "1"; +1 havana transcript 1479049 1482662 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "5"; +1 havana exon 1479049 1479108 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003589422"; exon_version "1"; transcript_support_level "5"; +1 havana exon 1480867 1480936 . 
+ . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003672769"; exon_version "1"; transcript_support_level "5"; +1 havana exon 1482138 1482303 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003661157"; exon_version "1"; transcript_support_level "5"; +1 havana exon 1482545 1482662 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000378736"; transcript_version "3"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003614529"; exon_version "1"; transcript_support_level "5"; +1 havana transcript 1483485 1496202 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; +1 havana exon 1483485 1485171 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001893282"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1485782 1485838 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003479480"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1486110 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003503434"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1487863 1487914 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003528975"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003611023"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1489692 1489811 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001885858"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003653402"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1490563 1490671 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003486643"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1495485 1496202 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000485748"; transcript_version "5"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-205"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003689276"; exon_version "1"; transcript_support_level "2"; +1 havana transcript 1484569 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; +1 havana exon 1484569 1485171 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001844843"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1485782 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001818637"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1486544 1486668 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003513162"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1489204 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001832340"; exon_version "1"; transcript_support_level "2"; +1 havana exon 1495485 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000474481"; transcript_version "1"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-204"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001844973"; exon_version "1"; transcript_support_level "2"; +1 ensembl transcript 1471784 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1471784 1472089 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00001833190"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1471885 1472089 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl start_codon 1471885 1471887 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1477274 1477350 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003467707"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1477274 1477350 . 
+ 2 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "2"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1480867 1480908 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003889337"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1480867 1480908 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "3"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1482266 1482303 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003889634"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1482266 1482303 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "4"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1482545 1482614 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003510521"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1482545 1482614 . + 1 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "5"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1485016 1485171 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003459370"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1485016 1485171 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "6"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1485782 1485838 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003655926"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1485782 1485838 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "7"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1486110 1486235 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003594545"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1486110 1486235 . 
+ 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1486544 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003662125"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl CDS 1486544 1486666 . + 0 gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "9"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSP00000311766"; protein_version "8"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "10"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003528975"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1489204 1489274 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "11"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003611023"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1490257 1490424 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "12"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003653402"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "13"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00003486643"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl exon 1495485 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; exon_number "14"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSE00001844973"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl five_prime_utr 1471784 1471884 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1486667 1486668 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1487863 1487914 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1489204 1489274 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1490257 1490424 . + . 
gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1490563 1490671 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 ensembl three_prime_utr 1495485 1496201 . + . gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000308647"; transcript_version "8"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "1 (assigned to previous version 7)"; +1 havana gene 2212523 2220738 . + . gene_id "ENSG00000234396"; gene_version "3"; gene_source "havana"; gene_biotype "lncRNA"; +1 havana transcript 2212523 2220738 . + . gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; tag "basic"; transcript_support_level "3"; +1 havana exon 2212523 2212644 . + . gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; exon_number "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001603085"; exon_version "2"; tag "basic"; transcript_support_level "3"; +1 havana exon 2220535 2220738 . + . 
gene_id "ENSG00000234396"; gene_version "3"; transcript_id "ENST00000442483"; transcript_version "2"; exon_number "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001607640"; exon_version "2"; tag "basic"; transcript_support_level "3"; +1 havana gene 629062 629433 . + . gene_id "ENSG00000225972"; gene_version "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; +1 havana transcript 629062 629433 . + . gene_id "ENSG00000225972"; gene_version "1"; transcript_id "ENST00000416931"; transcript_version "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTND1P23-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; +1 havana exon 629062 629433 . + . gene_id "ENSG00000225972"; gene_version "1"; transcript_id "ENST00000416931"; transcript_version "1"; exon_number "1"; gene_name "MTND1P23"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTND1P23-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001797039"; exon_version "1"; tag "basic"; transcript_support_level "NA"; +1 havana gene 8786211 8786913 . - . gene_id "ENSG00000224315"; gene_version "2"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; +1 havana transcript 8786211 8786913 . - . gene_id "ENSG00000224315"; gene_version "2"; transcript_id "ENST00000428803"; transcript_version "2"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "RPL7P7-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; tag "basic"; transcript_support_level "NA"; +1 havana exon 8786211 8786913 . - . 
gene_id "ENSG00000224315"; gene_version "2"; transcript_id "ENST00000428803"; transcript_version "2"; exon_number "1"; gene_name "RPL7P7"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "RPL7P7-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00001776158"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +1 havana gene 634376 634922 . + . gene_id "ENSG00000198744"; gene_version "5"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; +1 havana transcript 634376 634922 . + . gene_id "ENSG00000198744"; gene_version "5"; transcript_id "ENST00000416718"; transcript_version "2"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTCO3P12-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; +1 havana exon 634376 634922 . + . gene_id "ENSG00000198744"; gene_version "5"; transcript_id "ENST00000416718"; transcript_version "2"; exon_number "1"; gene_name "MTCO3P12"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "MTCO3P12-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001720008"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +1 havana gene 182696 184174 . + . gene_id "ENSG00000279928"; gene_version "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; +1 havana transcript 182696 184174 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA"; +1 havana exon 182696 182746 . + . 
gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "1"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003759020"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +1 havana exon 183132 183216 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "2"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003759581"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +1 havana exon 183494 183571 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "3"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003804405"; exon_version "1"; tag "basic"; transcript_support_level "NA"; +1 havana exon 183740 183901 . + . gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "4"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003807458"; exon_version "1"; tag "basic"; transcript_support_level "NA"; +1 havana exon 183981 184174 . + . 
gene_id "ENSG00000279928"; gene_version "2"; transcript_id "ENST00000624431"; transcript_version "2"; exon_number "5"; gene_name "DDX11L17"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "DDX11L17-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003760199"; exon_version "2"; tag "basic"; transcript_support_level "NA"; +1 havana gene 2581560 2584533 . + . gene_id "ENSG00000228037"; gene_version "1"; gene_source "havana"; gene_biotype "lncRNA"; +1 havana transcript 2581560 2584533 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; tag "basic"; transcript_support_level "5"; +1 havana exon 2581560 25816500000 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "1"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001795368"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 2583370 2583495 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "2"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001694676"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 2584125 2584533 . + . gene_id "ENSG00000228037"; gene_version "1"; transcript_id "ENST00000424215"; transcript_version "1"; exon_number "3"; gene_source "havana"; gene_biotype "lncRNA"; transcript_source "havana"; transcript_biotype "lncRNA"; exon_id "ENSE00001601095"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 ensembl_havana gene 3069168 3438621 . + . 
gene_id "ENSG00000142611"; gene_version "17"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 havana transcript 3069168 3434342 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; +1 havana exon 3069168 3069296 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002048533"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3069260 3069296 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana start_codon 3069260 3069262 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; +1 havana exon 3186125 3186474 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00001754112"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3186125 3186474 . + 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3244087 3244137 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "3"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003480863"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3244087 3244137 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "3"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3385149 3385286 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "4"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002034212"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3385149 3385286 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "4"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3396491 3396593 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700221"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3396491 3396593 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3402791 3402998 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003696962"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3402791 3402998 . + 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3404739 3404886 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "7"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700688"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3404739 3404886 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "7"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3405495 3405648 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "8"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003700645"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3405495 3405648 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "8"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3411384 3412800 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "9"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003695658"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3411384 3412800 . + 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "9"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3414560 3414647 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "10"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003701451"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3414560 3414647 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "10"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3417828 3417997 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "11"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003699052"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3417828 3417997 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "11"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3418667 3418744 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "12"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003698430"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3418667 3418744 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "12"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3425581 3425750 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "13"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003699796"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3425581 3425750 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "13"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3426051 3426225 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "14"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003701891"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3426051 3426225 . + 2 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "14"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3430872 3431108 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "15"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00003698226"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3430872 3431108 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "15"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana exon 3433677 3434342 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; exon_id "ENSE00002081080"; exon_version "1"; tag "basic"; transcript_support_level "5"; +1 havana CDS 3433677 3433686 . + 1 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; protein_id "ENSP00000426975"; protein_version "1"; tag "basic"; transcript_support_level "5"; +1 havana stop_codon 3433687 3433689 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; exon_number "16"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; +1 havana five_prime_utr 3069168 3069259 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; +1 havana three_prime_utr 3433690 3434342 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000511072"; transcript_version "5"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-206"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5"; +1 havana transcript 3069183 3186591 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; +1 havana exon 3069183 3069296 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003700259"; exon_version "1"; transcript_support_level "2"; +1 havana exon 3186125 3186591 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000607632"; transcript_version "1"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-210"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003695128"; exon_version "1"; transcript_support_level "2"; +1 havana transcript 3069197 3435421 . + . gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; tag "basic"; transcript_support_level "1"; +1 havana exon 3069197 3069296 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; exon_id "ENSE00001222906"; exon_version "5"; tag "basic"; transcript_support_level "1"; +1 havana CDS 3069260 3069296 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; protein_id "ENSP00000367643"; protein_version "2"; tag "basic"; transcript_support_level "1"; +1 havana start_codon 3069260 3069262 . + 0 gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "1"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; tag "basic"; transcript_support_level "1"; +1 havana exon 3186125 31864740000 . + . 
gene_id "ENSG00000142611"; gene_version "17"; transcript_id "ENST00000378391"; transcript_version "6"; exon_number "2"; gene_name "PRDM16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "PRDM16-203"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS44048"; exon_id "ENSE00001754112"; exon_version "1"; tag "basic"; transcript_support_level "1"; diff --git a/scripts/exon_length_filter.py b/scripts/exon_length_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..2aeb302af55566a5292f7283012bf21ce064d5e7 --- /dev/null +++ b/scripts/exon_length_filter.py @@ -0,0 +1,201 @@ +# Exon length filter # +"""Exon length filter +Version 2.1.0""" +# Called Packages # +import re +import os +import transcript_extractor as te + +python_version = "3.7.13" +module_list = [re, os] +modul_name_list = ["re", "os"] + +# Functions # + + +def exon_length_calculator(entry): + """This function finds the start and end cordinates of the + exon and uses them to calculate its length""" + try: + find_exon_coordinates = re.compile(r"\t\d{1,15}\t") + # this difines the pattern of the coordinates + try_find_start_coordinates = find_exon_coordinates.search(entry) + # this line findes the start coordinares based on the pattern + start_coordinates = int(try_find_start_coordinates[0].replace("\t", "")) + # this line removes the \t at the end and the start of the pattern and + # turn the string of the coordinates into intergers + final_index_start_coordinates = entry.find(try_find_start_coordinates[0])+len(try_find_start_coordinates[0])-1 + # this line determines the indes of the final digit + # of the start coordinates + sub_entry = entry[final_index_start_coordinates:] + # this lineused the index determin above a starting point + # for a new sub entry + try_find_end_coordinates = find_exon_coordinates.search(sub_entry) + end_coordinates = int(try_find_end_coordinates[0].replace("\t", "")) + # these two lines 
find the end coordinates and turn tham int an int + exon_length = end_coordinates-start_coordinates + # this line claculates the transcript length + except: + print("\n\nIn the following enty only one or no valid coordinates \ + could be found:\n",entry,"the value will be set to NA") + exon_length = "NA" + return exon_length + + +def exon_fider(entry): + """This funtion determines if a given entry belongs to an exon + Expected inputs: + entry: str #any enty of a gtf file""" + exon_test = entry.find(r"\texon\t") + # This line look for the entry exon in the file + if exon_test == -1: + try_exon_test = False + else: + try_exon_test = True + # The block above evaluates the results of the search for the wort exon + return try_exon_test + +def __longest_transcript_finder( + current_exon_length, + longest_transcript, + longest_transcript_ID, + old_transcript_ID + ): + """This funtion encapsulates an operation that has to be carried out + at several points in the exon_length_filter function and serves to + make that function more modular""" + if current_exon_length > longest_transcript: + # This condition updates the most promesing for + # beeing the representative transcript + longest_transcript = current_exon_length + longest_transcript_ID = old_transcript_ID + current_exon_length = 0 + return current_exon_length, longest_transcript, longest_transcript_ID + + +def _representative_transcript_csv( + representative_transcript, file_name = "test", deposit_pathway_name =os.getcwd() + ): + with open(os.path.join( + deposit_pathway_name, file_name+"_"+"representative_transcripts"+".csv" + ), "w", encoding="utf-8") as rt: + for i in representative_transcript: + transcript = representative_transcript[i] + new_entry = str(i)+","+transcript+"\n" + rt.write(new_entry) + + +def _exon_length_filter(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = 
{"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}): + """This funtion selects only the transcripts for a dictionary that have the longest total mRNA""" + bar,start_time = te.bar_builder(length_multiplyer = 3) + total_genes = len(gen_dict) + gens_done = 0 + + with open(os.path.join(source_pathway_name,file_name+".gtf"), 'r') as f: + + old_gen = str() + old_transcript_ID = str() + representative_transcript = dict() + representative_trasnscript_not_found = True + longest_transcript_ID = str() + current_exon_length = 0 + longest_transcript = 0 + percentage_done = 0 + + for entry in f: + + try: + corrent_gen = te.gene_ID_finder(entry) + except: + corrent_gen = old_gen + #The block above test if there is a gen name in the entry + if corrent_gen != old_gen: + representative_trasnscript_not_found = True + + #The block above determines if the Gen name is new and set the test + #representative_trasnscript_not_found back to true which is used to + #make the program faster if there is just one transcript for a given + #gen in the dict + if representative_trasnscript_not_found and corrent_gen != str(): + #print(corrent_gen) + #The conditon prvents serges if a representative transcript has + #all ready been chosen + if corrent_gen != old_gen: + current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) + representative_transcript[old_gen] = longest_transcript_ID + try: + del gen_dict[old_gen] + old_gen = corrent_gen + gens_done += 1 + corrent_percentage_done = (gens_done/total_genes)*100 + if corrent_percentage_done > percentage_done+10: + bar,start_time = te.bar_builder(percentage=percentage_done+10,length_multiplyer = 
3,start_time=start_time,bar =bar) + percentage_done = int(corrent_percentage_done) + + + except: + old_gen = corrent_gen + longest_transcript = 0 + #The block above adds the transcript of the last gen that + #had the longest exons into the representative transcripts dict + try: + #This try / except block test if the gen is in the input dictionary + transcript_IDs = gen_dict[corrent_gen] + if len(gen_dict[corrent_gen]) == 1: + #This conditions is a short cut for Genes that + #allready have a representative transcript + representative_transcript=gen_dict[corrent_gen[0]] + representative_trasnscript_not_found = False + continue + except: + continue + + try: + current_transcript_ID = te.transcript_ID_finder(entry) + except: + continue + #The block above searches for a transcript ID in the current entry + + if current_transcript_ID in transcript_IDs: + #This condition test if the Transcript is one of the + #candidates for representative transcripts + if current_transcript_ID != old_transcript_ID: + #This condition if the enty still belongs to the + #previous transcript and is triggers if that is not the case + current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) + try: + transcript_IDs.remove(old_transcript_ID) + old_transcript_ID = current_transcript_ID + except: + old_transcript_ID = current_transcript_ID + if exon_fider(entry): + exon_length = exon_length_calculator(entry) + current_exon_length += exon_length + else: + continue + current_exon_length,longest_transcript,longest_transcript_ID = __longest_transcript_finder(current_exon_length,longest_transcript,longest_transcript_ID,old_transcript_ID) + representative_transcript[old_gen] = longest_transcript_ID + del representative_transcript[str()] + te.bar_builder(100,length_multiplyer = 3,start_time=start_time,bar =bar) + return(representative_transcript) + +def exon_length_filter(file_name = 
"test",source_pathway_name = os.getcwd(),deposit_pathway_name =os.getcwd(),gen_dict = {"ENSG00000160072":["ENST00000673477","ENST00000472194","ENST00000378736","ENST00000308647","ENST00000442483"],"ENSG00000225972":["ENST00000416931"],"ENSG00000279928":["ENST00000624431","ENST00000424215"],"ENSG00000142611":["ENST00000378391","ENST00000607632","ENST00000511072"]}): + """This function filters a dictionary of genes and there transcripts by the length of there exons an selects the longes transcript for each gene and returns an dictionary {gene_ID : transcript_ID}. + Expected inputs: + file_name: str ; default = test #the name of the gft file you want to look at + source_pathway_name: str ; default = current work directory #path of the gtf file + deposit_pathway_name: str ; default = current work directory #path for files + gen_dict:dict{key == gene ID:[transcript IDs that belong to that gene]}""" + + print("Representative trascipts are filterd based on exon length please wait...") + source_pathway_name,deposit_pathway_name = te.__do_pathways_exist__(source_pathway_name,deposit_pathway_name) + representative_transcript = _exon_length_filter(file_name,source_pathway_name,deposit_pathway_name,gen_dict) + print("\nRepresentative transcripts collected") + return representative_transcript + + +if __name__ == "__main__": + # te.version_control(module_list,modul_name_list,python_version) + exon_length_filter() + +# This line allows the file to be executed on its own also from diff --git a/scripts/find_representative_transcripts.py b/scripts/find_representative_transcripts.py deleted file mode 100644 index ee0dbc62de1d64c33d49184d7310cde8509bbdb2..0000000000000000000000000000000000000000 --- a/scripts/find_representative_transcripts.py +++ /dev/null @@ -1,246 +0,0 @@ -#### Find representative transcripts #### -"""Version 1.1.1""" - -### Imports ### -import argparse - -### Functions ### - -def attributs_converter(attributs): - """ - This funtion converts the "unstrucktured" 
;-seperated part of he line into a list of identifyers and coresponding data the struckture of - which can be used ot find the data easyly e.g the index of the identifier transcrip_id + 1 will give the trasncript id of the current gene - Input: - attributs = str() #the unstrucktured part of the entry - Output: - attributs = list() # cleand list with the characterritsics discribed above - """ - attributs = attributs.replace("\"","") - attributs = attributs.replace(";","") - attributs = attributs.replace("\\n","") - attributs =attributs.split(" ") - - return(attributs) - -def find_in_attributs (attributs,look_for): - """ - This function finds a key word and used that to lokat the value of that key word e.g key = gene_id, value = 'ENSMUSG00002074970', - this works as they are next to each other in the attributs list. - Inputs: - sub_enty = list() - look_fore = str() #string of with the name of the key to look for - Output: - attributs[index] or NA = str() #NA is returned if the key was not found in the attributs - """ - try: - index = attributs.index(look_for)+1 - return attributs[index] - except: - #print("No",look_for,"in the entry the return was set to NA\n",attributs) - return "NA" - -def _re_format(rep_trans_dict): - """ - This function is ment to reformat dictionary of the representatice transcripts into an dictionary with only one entry per key - Input: - rep_trans_dict = {gene_id : [transcript_id , transcript_support_level , transcript_length]} - Output: - rep_transcripts = {gene_id : transcript_id} - """ - rep_transcripts = dict() - for gene_id in rep_trans_dict: - rep_transcripts[gene_id] = rep_trans_dict[gene_id][0] - - return rep_transcripts - - - -def get_rep_trans(file_name = "test"): - """ - This is the main function of this script it selects one representative transcrip per gene based on a gtf annotation file. 
- It does so be two criteria: first the transcript support level and it there are several transcript - of one gene that have the same trasncript_support_level it chooses the one that corresponds to the longest mRNA. - Input: - file_name = str() # name of the annotation file with or without the .gtf part - Output: - rep_transcripts = {gene_id : transcript_id} - """ - - #setting defoult variables - rep_trans = dict() - cur_gID = str() - cur_best_trans = [str(),100,0] # [transcript_id , transcript_support_level , transcript_length] - pot_best_trans = False - cur_tID = str() - ignor_trans = False - - with open (file_name,"r") as f: - for line in f: - entry = line.split("\t") - - #removes expected but unneeded entrys - exp_unneed = ["CDS","stop_codon","five_prime_utr","three_prime_utr","start_codon",'Selenocysteine'] - if len(entry) == 1 or entry[2] in exp_unneed: - continue - - #this function turns the less organized part of the entry into a reable list - attributs = attributs_converter(entry[8]) - #looking for and processing exons entrys - if entry[2] == "exon": - - #dicide if to contiune or not - if ignor_trans: - continue - elif cur_gID != attributs[1]: - raise ValueError("ERROR exon from an unexpected Gen") - continue - elif find_in_attributs (attributs,"transcript_id") != cur_tID: - raise ValueError("exon from an unexpected transcript") - continue - - #adding the length of the exon to the appropriat list and chacking for changes in best transcript - if pot_best_trans: - pot_best_trans[2]+= int(entry[4])-int(entry[3]) - if pot_best_trans[2] > cur_best_trans[2]: - cur_best_trans = pot_best_trans - pot_best_trans = False - else: - cur_best_trans[2]+= int(entry[4])-int(entry[3]) - - - - #looking for and processing transcript entrys - elif entry[2] == "transcript": - - #varryfi that the gen is correct - if cur_gID != attributs[1]: - raise ValueError("ERROR transcript from an unexpected Gen") - continue - - #finding the transcript id and the support level - cur_tID = 
find_in_attributs (attributs,"transcript_id") - t_supp_lvl = find_in_attributs (attributs,"transcript_support_level") - - #If there is no transcript support level or the level is given as NA it is nomed as 100. else the transcript support level is tunrn into int - if t_supp_lvl == "NA": - t_supp_lvl = 100 - else: - try: - t_supp_lvl = int(t_supp_lvl) - except: - t_supp_lvl = 100 - - - #decides if the transcript has potential to become the representative transcript - if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == "": - cur_best_trans = [cur_tID,t_supp_lvl,0] - pot_best_trans = False - ignor_trans = False - - elif t_supp_lvl == cur_best_trans[1]: - pot_best_trans = [cur_tID,t_supp_lvl,0] - else: - ignor_trans = True - - - #looking for and processing gene entrys - elif entry[2] == "gene": - - #updating rep_trans dict - if cur_gID not in rep_trans: - rep_trans[cur_gID] = cur_best_trans - else: - if rep_trans[cur_gID][1] > cur_best_trans[1]: - rep_trans[cur_gID] = cur_best_trans - elif rep_trans[cur_gID][1] == cur_best_trans[1] and rep_trans[cur_gID][2] < cur_best_trans[2]: - rep_trans[cur_gID] = cur_best_trans - - #updating cur_gID and resetting cur_best_trans - cur_gID = attributs[1] - cur_best_trans = [str(),100,0] - - #raises an error for unidentifyable entrys - else: - raise ValueError("This entry could not be identified\n",entry) - - #addding the final gene to the dictionary - if cur_gID not in rep_trans: - rep_trans[cur_gID] = cur_best_trans - else: - if rep_trans[cur_gID][1] > cur_best_trans[1]: - rep_trans[cur_gID] = cur_best_trans - elif rep_trans[cur_gID][1] == cur_best_trans[1] and rep_trans[cur_gID][2] < cur_best_trans[2]: - rep_trans[cur_gID] = cur_best_trans - - del rep_trans[""] - rep_transcripts = _re_format(rep_trans) - return(rep_transcripts ) - -def gtf_file_writer (original_file, output_file): - """ - this function writes the output GTF file - """ - output = [] - rep_transcript_dict = get_rep_trans(original_file) - - with 
open(original_file, 'r') as f: - for entry in f: - if entry[0] != '#': - attributes = attributs_converter(entry) - type_ = attributes[2] - if type_ == 'gene': - gene_id = find_in_attributs(attributes, 'gene_id') - output.append(entry) - else: - transcript_id = find_in_attributs(attributes, 'transcript_id') - try: - if rep_transcript_dict[gene_id] == transcript_id: - output.append(entry) - except: - print("error") - - with open(output_file, 'w') as last_file: - last_file.write(output) - -def _test(): - """ - This funtion is ment to be run for test - Output: - file with the dictionary generated based on the test file - """ - file_name = "test.gtf" - rt = get_rep_trans(file_name) - expected_result = {"ENSG00000160072":"ENST00000472194","ENSG00000234396":"ENST00000442483", - "ENSG00000225972":"ENST00000416931","ENSG00000224315":"ENST00000428803", - "ENSG00000198744":"ENST00000416718","ENSG00000279928":"ENST00000624431", - "ENSG00000228037":"ENST00000424215",'ENSG00000142611':'ENST00000378391'} - if rt != expected_result: - print("The test fail due to not yieding the same results") - print("The results the program got\n",rt) - print("The expected results\n",expected_result) - else: - print("The test was succses full") - -### Execution part ### -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="find_representativ_transcripts",formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-file_name", required=True, help="gtf file with genome annotation") - parser.add_argument("-t", required=False,default = False,help="to run the test input -t True") - args = parser.parse_args() - - #standadize the file_name inlude .gtf# - - file_name = args.file_name - i_gtf = file_name.find(".gtf") - if i_gtf == -1: - file_name += ".gtf" - - if args.t: - _test() - else: - get_rep_trans(file_name) - - - - - \ No newline at end of file diff --git a/scripts/match_reprtranscript_expressionlevel.py b/scripts/match_reprtranscript_expressionlevel.py 
deleted file mode 100644 index f7f3277c806b09131b79c2ad8f23781053a65881..0000000000000000000000000000000000000000 --- a/scripts/match_reprtranscript_expressionlevel.py +++ /dev/null @@ -1,182 +0,0 @@ -### Made by Hugo Gillet ### -import pandas as pd -from gtfparse import read_gtf - - -def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame: - - """Convert a dictionary of genes and their representative transcript into a dataframe - - Args: - dict_reprTrans (dict) : {'Gene':['transcriptA', 'transcriptB'], ...} - - Returns: - Pandas dataframe having Gene and transcript as columns - - Raises: - Only dict are allowed - Key should be strings - Value should be strings - - """ - pass - if not type(dict_reprTrans) is dict: - raise TypeError("Only dict are allowed") - if type(list(dict_reprTrans.keys())[0]) is not str: - raise TypeError("Key should be strings") - if type(list(dict_reprTrans.values())[0]) is not str: - raise TypeError("Values should be strings") - - df_reprTrans = pd.DataFrame.from_dict( - dict_reprTrans, orient="index", columns=["reprTranscript"] - ) - df_reprTrans = df_reprTrans.reset_index(level=0) - df_reprTrans.columns = ["Gene", "reprTrans"] - df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace( - r"\.[1-9]", "", regex=True - ) - return df_reprTrans - -def gene_and_transcript(gtf_file:str)-> pd.DataFrame: - """ - This function take a .gtf file and convert it into a - dataframe containing gene_id and their transcripts_id. - Args: - gtf_file (str) : path to the .gtf file - - Returns: - df_gtf (pd.DataFrame) : pandas dataframe containing having has columns - gene_id and their transcripts_id. 
- Raises : - None - - """ - df_gtf = read_gtf(gtf_file) - df_gtf = df_gtf.loc[df_gtf["feature"]=="transcript"] - df_gtf = df_gtf[["gene_id","transcript_id"]] - df_gtf = df_gtf.rename(columns={"gene_id":"Gene","transcript_id":"Transcript"}) - return df_gtf - - - -def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: - """Convert tsv or csv file into a pandas dataframe - - Args: - input_txt (str): csv or tsv file containing transcript expression level - - Returns: - df_gene (str): Pandas dataframe having transcript and expression level - as columns - - Raises: - None - """ - pass - df_input = pd.read_csv( - input_txt, - sep=r"[\t,]", - lineterminator="\n", - names=["Transcript", "Expression_level"], - engine="python", - ) - return df_input - - -def exprLevel_byGene( - df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame -) -> pd.DataFrame: - """find the gene of each transcipt given by the expression level csv/tsv file, - and summ expression level of all transcipts from the same gene. 
- - Args: - df_exprTranscript : pandas Dataframe containing transcript and their expression level, - generated by "tsv_or_csv_to_df" function - df_output_gtf_selection : pandas Dataframe containing genes and transcripts, - generated by "transcripts_by_gene_inDf" function - - Returns: - Pandas dataframe having gene and sum of its transcript expression level - - Raises: - None - """ - pass - df_merged = pd.merge( - df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript" - ) - df_sum = df_merged.groupby("Gene").sum( - "Expression_level" - ) - return df_sum - - -def match_byGene( - df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame -) -> pd.DataFrame: - """Find matching genes bewteen the 2 args - - Args: - df_reprTranscript : pandas Dataframe containing genes - and their representative transcript, generated by - "dict_reprTrans_to_df()" - df_expressionLevel_byGene : pandas Dataframe containing - genes and their expression level generated by - "transcript_by_gene_inDf()" - - Returns: - Pandas dataframe having representative trasncripts - and their expression level - - Raises: - None - """ - pass - df_merged = pd.merge( - df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene" - ) - df_clean = df_merged.dropna(axis=0) - df_clean = df_clean.loc[:, ["reprTrans", "Expression_level"]] - return df_clean - - - - - -### functions to run this part of the programm - - -def match_reprTranscript_expressionLevel( - exprTrans: str, dict_reprTrans: dict, gtf_file: str, -): - """Combine functions to replace transcripts from an expression level csv/tsv file - with representative transcripts - - Args: - exprTrans (str): csv or tsv file containing transcripts - and their expression level - dict_reprTrans (dict) : dict of genes and their - representative transcipt - intemediate_file (str) : txt file containing genes, transcript - and their expression level from the transkript_extractor function - output_path : path indicating were the tsv 
file should be written - - Returns: - tsv file of representative trasncripts and their expression level - - Raises: - None - """ - df_gene_transcript = gene_and_transcript(gtf_file) - df_exprTrans = tsv_or_csv_to_df(exprTrans) - df_reprTrans = dict_reprTrans_to_df(dict_reprTrans) - df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_gene_transcript) # error here - df_match = match_byGene(df_reprTrans, df_exprLevel_byGene) - df_match.rename(columns = {'reprTrans':'id', 'Expression_level':'level'}, inplace = True) - return df_match - - -# run the programm - -if __name__ == "__main__": - match_reprTranscript_expressionLevel() diff --git a/scripts/new_exe.py b/scripts/new_exe.py deleted file mode 100644 index ade521ceebe76d7bff2c424006c9d4624cf72199..0000000000000000000000000000000000000000 --- a/scripts/new_exe.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse -import time -import transcript_sampler as ts - -# exemple execution : python C:\...\final_exe.py --input_gtf "C:\...\input_files\test.gtf" --input_csv "C:\...\input_files\expression.csv" --output_gtf "C:\...\output\output_gtf.gtf" --output_csv "C:\...\ouput\output_gtf.gtf" --n_to_sample 100 - - -def exe(input_gtf, input_csv, output_gtf, output_csv, transcript_nr, input_free=True): - start = time.time() - dict_repr_trans = ts.get_rep_trans(input_gtf) - df_repr = ts.match_reprTranscript_expressionLevel( - dict_reprTrans=dict_repr_trans, exprTrans=input_csv, gtf_file=input_gtf - ) - print("Finiding match between representative transcripts and expression level file") - print("Poisson sampling of transcripts") - ts.transcript_sampling(transcript_nr, df_repr, output_csv) - print("output csv file ready") - print("writing output gtf file") - ts.gtf_file_writer(input_gtf, dict_repr_trans, output_gtf) - end = time.time() - print("\nScript executed in {} sec\n".format(end - start)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="transcript sampler", - 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--input_gtf", required=True, help="gtf file with genome annotation" - ) - parser.add_argument( - "--input_csv", - required=True, - help="csv or tsv file with transcript and their expression level ", - ) - parser.add_argument( - "--output_gtf", - required=True, - help="output path for the new gtf file of representative transcripts", - ) - parser.add_argument( - "--output_csv", - required=True, - help="output path for the new csv file of representative transcript and their sampled number", - ) - parser.add_argument( - "--n_to_sample", required=True, help="total number of transcripts to sample" - ) - args = parser.parse_args() - exe( - args.input_gtf, - args.input_csv, - args.output_gtf, - args.output_csv, - args.n_to_sample, - ) diff --git a/scripts/new_gtf_writer.py b/scripts/new_gtf_writer.py deleted file mode 100644 index 43e7ca63a607a8e789e32e64138126b6f556ed6b..0000000000000000000000000000000000000000 --- a/scripts/new_gtf_writer.py +++ /dev/null @@ -1,20 +0,0 @@ - -def gtf_file_writer (original_file, output_file): - output = [] - rep_transcript_dict = get_rep_trans(original_file) - - with open(original_file, 'r') as f: - for entry in f: - if entry[0] != '#': - attributes = attributs_converter(entry) - type_ = attributes[2] - if type_ == 'gene': - gene_id = find_in_attributs(attributes, 'gene_id') - output.append(entry) - if type_ != 'gene': - transcript_id = find_in_attributs(attributes, 'transcript_id') - if rep_transcript_dict[gene_id] == transcript_id: - output.append(entry) - - with open(output_file, 'w') as last_file: - last_file.write(output) \ No newline at end of file diff --git a/scripts/poisson_sampling.py b/scripts/poisson_sampling.py deleted file mode 100644 index fedd8e8fb18eac8352b437308e0e925e8a317630..0000000000000000000000000000000000000000 --- a/scripts/poisson_sampling.py +++ /dev/null @@ -1,57 +0,0 @@ -### Called Packages ### -import pandas as pd -import 
numpy as np -import argparse - -import transcript_extractor as te - -python_version = "3.7.13" -module_list =[pd,np,argparse] -modul_name_list = ["pd","np","argparse"] -### Functions ### - -''' -Sample transcript - -This part of the code does Poisson sampling proportionally to gene expression levels for each gene. - -input: total transcript number (int) - csv file with gene id and gene expression levels (columns named 'id' and 'level') - -output: csv file with gene id and count - gtf file with transcript samples -''' - - -def transcript_sampling(total_transcript_number, df_repr, output_csv): - #df = pd.read_csv(csv_file, sep="\t", lineterminator="\n", names=["id", "level"]) - df = df_repr # the function "match_reprTranscript_expressionLevel()" now directly output a dataframe - levels = [] - sums = df['level'].tolist() - total = sum(sums) - total_transcript_number=int(total_transcript_number) # I added this because writting a number in the terminal inputed a string - normalized = total_transcript_number/total - for expression_level in df['level']: - poisson_sampled = np.random.poisson(expression_level*normalized) - levels.append(poisson_sampled) - - transcript_numbers = pd.DataFrame({'id': df['id'],'count': levels}) - pd.DataFrame.to_csv(transcript_numbers, output_csv) - -if __name__ == '__main__': - #te.version_control(module_list,modul_name_list,python_version) - parser = argparse.ArgumentParser( - description="Transcript Poisson sampler, csv output", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument("--expression_level", required=True, help="csv file with expression level") - parser.add_argument("--output_csv", required=True, help="output csv file") - parser.add_argument("--input_csv", required=True, help="input csv file") - parser.add_argument("--transcript_number", required=True, help="total number of transcripts to sample") - args = parser.parse_args() - - - transcript_sampling(args.transcript_number, args.input_csv, 
args.output_csv, args.transcript_number) - - diff --git a/scripts/representative.py b/scripts/representative.py index 1228e9a5438ca0cd89b6e176e1f78f181895198c..589f4b4c473ea4787efbd589a1c932bc0bfb87da 100644 --- a/scripts/representative.py +++ b/scripts/representative.py @@ -1,91 +1,91 @@ -import pandas as pd -import os - ''' -This part of the code take as input a gtf modified file +This part of the code take as input a gtf modified file and return a dictionary of transcripts with best support level for each gene of the input - ''' +import pandas as pd +# import os - - -def import_gtfSelection_to_df(gtf_modified_file: str) -> pd.DataFrame: +def import_gtf_selection_to_df(gtf_modified_file: str) -> pd.DataFrame: """Import intermediate file from gtf and create a df Args: gtf_modified_file (str) : path to the intermediate file Returns: - Pandas dataframe having Gene, transcript + Pandas dataframe having Gene, transcript and support level as columns - + Raises: TypeError : Only str path is allowed - + """ - pass - if not type(gtf_modified_file) is str: - raise TypeError("Only str path is allowed") - df_input = pd.read_csv(gtf_modified_file, sep = '\t', lineterminator = '\n', -names = ["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] ) + if not isinstance(gtf_modified_file, str): + raise TypeError("Only str path is allowed") + df_input = pd.read_csv( + gtf_modified_file, sep='\t', lineterminator='\n', + names=["Gene_mixed", "Transcript", "Support_level", "Na1", "Na2"] + ) df_input["Support_level"] = df_input["Support_level"].replace(" ", "") - df_input["Gene"] = df_input["Gene_mixed"].str.extract('([A-Z]\w{0,})', expand=True) - df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract('(^\d)', expand=True) - df_clean = df_input.loc[:, ["Gene", "Transcript","Support_level"]] - df_clean["Gene"] = df_clean["Gene"].fillna(method = 'ffill') - df_clean = df_clean.dropna(axis = 0) + df_input["Gene"] = df_input["Gene_mixed"].str.extract( + r'([A-Z]\w{0,})', 
expand=True # noqa: W605 + ) + df_input["Transcript_number"] = df_input["Gene_mixed"].str.extract( + r'(^\d)', expand=True # noqa: W605 + ) + df_clean = df_input.loc[:, ["Gene", "Transcript", "Support_level"]] + df_clean["Gene"] = df_clean["Gene"].fillna(method='ffill') + df_clean = df_clean.dropna(axis=0) return df_clean - - -def representative_transcripts_inDict(df_gtfSelection: pd.DataFrame) -> pd.DataFrame: - """Return a dict containing for each gene transcripts +def representative_transcripts_in_dict( + df_gtf_selection: pd.DataFrame) -> pd.DataFrame: + """Return a dict containing for each gene transcripts with highest confidence level Args: - df_gtfSelection (str): Pandas dataframe having Gene, + df_gtf_selection (str): Pandas dataframe having Gene, transcript and support level as columns Returns: Dict {'Gene':['transcriptA', 'transcriptB'], ...} - + Raises: TypeError : Only pandas DataFrame is allowed """ - pass - - if not type(df_gtfSelection) is pd.DataFrame: + if not isinstance(df_gtf_selection, pd.DataFrame): raise TypeError("Only pandas DataFrame is allowed") - df_min = df_gtfSelection[df_gtfSelection["Support_level"]==df_gtfSelection.groupby("Gene")["Support_level"].transform(min)] - df_final = df_min.drop(columns = ["Support_level"]) - dict_representative_transcripts = df_final.groupby("Gene")["Transcript"].apply(list).to_dict() - return dict_representative_transcripts + df_min = df_gtf_selection[ + df_gtf_selection["Support_level"] == + df_gtf_selection.groupby("Gene")["Support_level"].transform(min) + ] + df_final = df_min.drop(columns=["Support_level"]) + dict_representative_transcripts = df_final.groupby("Gene")[ + "Transcript"].apply(list).to_dict() + return dict_representative_transcripts - -def find_repr_by_SupportLevel(intermediate_file: str) -> dict[str,str]: - """Combine functions import_gtfSelection_to_df() - and representative_transcripts_inDict() +def find_repr_by_support_level(intermediate_file: str) -> dict[str, str]: + """Combine 
functions import_gtf_selection_to_df() + and representative_transcripts_in_dict() Args: intermediate_file : path to the intermediate file Returns: Dict {'Gene':['transcriptA', 'transcriptB'], ...} - + Raises: None - + """ - pass - df_gtf = import_gtfSelection_to_df(intermediate_file) - dict_reprTrans = representative_transcripts_inDict(df_gtf) - return dict_reprTrans + df_gtf = import_gtf_selection_to_df(intermediate_file) + dict_repr_trans = representative_transcripts_in_dict(df_gtf) + return dict_repr_trans -if __name__ == "__main__": - find_repr_by_SupportLevel() +# if __name__ == "__main__": +# find_repr_by_support_level() diff --git a/scripts/transcript_extractor.py b/scripts/transcript_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..5c81bcd838d4dff57c44989ea07edc0a4ee2e3a7 --- /dev/null +++ b/scripts/transcript_extractor.py @@ -0,0 +1,329 @@ +#### Transcript extractor ##### +"""Transcript extractor +Version 1.2.0""" +### Called Packages ### +import re +import os +import time + +python_version = "3.7.13" +module_list =[re,os,time] +modul_name_list = ["re","os","time"] + +### Functions ### +def version_control(module_list,modul_name_list,python_version): + with open("required.txt","a") as req: + + for i in range(len(module_list)): + + try: + version = module_list[i].__version__ + entry = modul_name_list[i]+"\t"+str(version)+"\n" + req.write(entry) + except: + version = python_version + entry = modul_name_list[i]+"\t"+str(version)+"\n" + req.write(entry) + +def __parameter_editor(file_name,source_pathway_name,deposit_pathway_name): + """This function allows for changing the parameters after running the program""" + while True: + print("The program will run with the following parameters:\nFile name:\t\t",file_name,"\nSource pathway:\t",source_pathway_name,"\nDeposit pathway:\t",deposit_pathway_name,"\n") + parameter_conformation = input("To continue with these parameters input [continue or c] to change them input [edit]\n>") + if 
parameter_conformation == "continue"or parameter_conformation =="c": + break + elif parameter_conformation == "edit": + #edit the parameters + while True: + change_question = input("select the parameter you want to change [nfile/spath/dpath] or input [b] to go back\n>") + if change_question == "nfile": + #This condition allows the user to chenge the file name + file_name = input("Please input the new file name\n>") + break + elif change_question == "spath": + #This condition allows the user to change the source path + source_pathway_name = input("Please input the new source path\n>") + + does_source_pathway_exist = os.path.exists(source_pathway_name) + if does_source_pathway_exist: + break + else: + print("The new source pathway:",source_pathway_name,"does not exist\nThe source pathway was returned to default:",os.getcwd()) + source_pathway_name = os.getcwd() + elif change_question == "dpath": + #This condition allows the user to change output file location + deposit_pathway_name = input("Please input the new output file path name\n>") + does_deposit_pathway_exist = os.path.exists(deposit_pathway_name) + if does_deposit_pathway_exist: + break + else: + print("The new deposit pathway:",deposit_pathway_name,"does not existe\nThe deposit pathway was returnt to default:",source_pathway_name) + deposit_pathway_name = source_pathway_name + #The block above test if the new deposit pathway is valid + elif change_question == "b": + # This condition allows the user to return to the main loop + break + else: + #This condition covers all non valid inputs into the secund loop + print("The input",change_question,"is not valid. 
Please use one of the specified commands") + + else: + #This condition covers all non valid input for the main loop + print("The input",parameter_conformation,"is not valide please use one of the specified comands\n") + return(file_name,source_pathway_name,deposit_pathway_name) + + + + + + + +def __searche_for_preexisting_files(file_name,deposit_pathway_name = os.getcwd()): + """This function searches for preexisting files of the same name as the results file of the current program. It allows the user to choose to move on with the pre-existing file """ + File_of_same_name_found = False + generat_new_file = False + directory_content = os.listdir(deposit_pathway_name) + for file in directory_content: + if file == file_name: + while True: + File_found_input = input (file_name+" has allready been generated\nDo you want to generate a new one [y/n] \n>") + if File_found_input == "n": + File_of_same_name_found = True + break + elif File_found_input == "y": + generat_new_file = True + break + else: + print("Invalid input\nPlease press [y] if you want to generate a new file or [n] if you want to use the preexisting file") + break + else: + continue + if File_of_same_name_found: + print("No new file will be generated, the program can continue") + elif generat_new_file: + print("A new file will be generated please wait...\n") + else: + print("No pre-existing file of the relevant type has been found.\nA new file will be generated please wait...\n") + return(File_of_same_name_found) + +def bar_builder(percentage = 0,length_multiplyer = 2,start_time = time.time(),bar = str()): + """This function creates a loading bar that can load in 10% increments starting a 0% and ending at 100% + Expected inputs: + percentage: int between 0 and 100 in steps of 10; default = 0 #defines the current loading increment + length_multiplyer: int > 0 ; default = 2 #determiens the amount of symbols per loading increment + start_time: any int ; default= time.time() #for determening loading time + bar: 
str ; default = str()#input of the current bar status does not need to be defined if for the 0% increment + """ + if percentage == 100: + bar = bar.replace("-","#") + print("\r"+bar+"\t"+"100%\t\t"+str(int(time.time()-start_time))) + elif percentage > 0: + bar = bar.replace("-","#",length_multiplyer) + print("\r"+bar+"\t"+str(percentage)+"%", end='',flush=True) + elif percentage == 0: + bar = "["+"-"*length_multiplyer*10+"]" + print(bar+"\t", end='',flush=True) + return(bar,start_time) + +def __test_file_name(file_name,source_pathway_name = os.getcwd()): + """This function validates that the source file exists at the source path. It turns the file name input in a standardized format that can be used in the next steps""" + + directory_content = os.listdir(source_pathway_name) + + index_of_the_dot = file_name.rfind(".") + valide_source_file = False + validate_source_file = True + if index_of_the_dot ==-1: + file_name += ".gtf" + else: + source_file_typ = file_name[index_of_the_dot:] + not_a_file_type = re.compile(".\d{1,13}") + try_not_a_file_type = not_a_file_type.search(source_file_typ) + if source_file_typ == ".gtf": + file_name = file_name + elif try_not_a_file_type: + file_name += ".gtf" + else: + print("This program can not handle",source_file_typ,"files. 
\nplease use a .gtf file" ) + validate_source_file = False + #The block above tests if the file_name includes the file type and if no + #file type is found adds ".gtf" und if a non ".gtf" file is found gives an error + + if validate_source_file: + for file in directory_content: + if file == file_name: + valide_source_file = True + break + #The block above tests if a file on the given name is in the given directora + + if valide_source_file: + print("The file:",file_name,"has been found.\n") + else: + print("No .gtf file of the name",file_name,"has been found in this pathway") + #The bock above gives feed back regarding the results of the file test + + file_name = file_name.replace(".gtf","") + #This line normalizes the file name + return(valide_source_file,file_name) + +def __do_pathways_exist__(source_pathway_name,deposit_pathway_name): + """This funtion tests that the entered pathways actualy exist""" + does_source_pathway_exist = os.path.exists(source_pathway_name) + does_deposit_pathway_exist = os.path.exists(deposit_pathway_name) + #The Block above does the actual testing + if does_source_pathway_exist: + source_pathway_name = source_pathway_name + else: + print("The source pathway:",source_pathway_name,"has not been found\nThe source pathway was set to the default") + source_pathway_name = os.getcwd() + #The block above detail the possible reactions for the source pathe existing or not existing + if does_deposit_pathway_exist: + deposit_pathway_name = deposit_pathway_name + else: + print("The deposit pathway:",deposit_pathway_name,"has not been found\nThe deposit pathway was set to the default") + deposit_pathway_name = source_pathway_name + #The block above details the possible reactions for the deposit pathway existing or not existing + return(source_pathway_name,deposit_pathway_name) + +def gene_ID_finder(entry): + """This function is supposed to find the gene ID of a known gene entry + Expected inputs: + entry: str #a line from a gtf file that contains a 
gene ID""" + index_gene_id = entry.find("gene_id") + find_gene_id_name = re.compile("\"\S{1,25}\"") + sub_entry = entry[index_gene_id:] + try_find_gene_id_name = find_gene_id_name.search(sub_entry) + gene_ID = try_find_gene_id_name[0].replace("\"","") + return (gene_ID) + +def transcript_ID_finder (entry): + """This function is supposed to finde the transcript ID in a known transcript entry + Expected inputs: + entry: str #a line from a gtf file that contains a transcript ID""" + index_transcript_id = entry.find("transcript_id") + find_transcript_id_name = re.compile("\"\S{1,25}\"") + sub_entry = entry[index_transcript_id:] + try_find_transcript_id_name = find_transcript_id_name.search(sub_entry) + + try: + transcript_ID = try_find_transcript_id_name[0].replace("\"","") + except: + transcript_ID = "" + return (transcript_ID) + +def transcript_support_level_finder(entry): + """This function is supposed to find the transcript support level in a known transcript entry + Expected input: + entry: str #a line from a gtf file that be blongs to a transcript""" + transcript_support_level_start_ID = entry.find("transcript_support_level") + sub_entry = entry[transcript_support_level_start_ID:] + + try: + score_finder = re.compile("\W\w{1,16}\W{2}") + try_score_finder = score_finder.search(sub_entry) + Pre_score_1 = try_score_finder[0] + Pre_score_2 = Pre_score_1.replace("\"","") + Pre_score_2 = Pre_score_2.replace("(","") + transcript_support_level = Pre_score_2.replace(";","") + if "NA" in transcript_support_level: + transcript_support_level = 100 + #I changed This tell laura + + + except: + transcript_support_level = 100 + return (transcript_support_level) + + + + +def _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name): + """This function extracts the transcript number ,transcript ID, the transcript support level, the transcrip length and the line index from a gtf file of a given name and saves tham as a new file name given_name_intermediat_file.txt. 
+ Expected input: + file_name: str #the name of the gft file you want to look at without the .gtf part + source_pathway_name: str #path of the gtf file + deposit_pathway_name: str #path for saving the intermediat file""" + + with open(os.path.join(source_pathway_name,file_name+".gtf"), 'r') as f: + total_entrys =len(f.readlines()) + with open(os.path.join(source_pathway_name,file_name+".gtf"), 'r') as f: + current_entry = 0 + percentage_done = 0 + bar,start_time = bar_builder(length_multiplyer = 3) + + + Old_gen_ID = str() + #stand-in as the first couple entrys are not genes + with open(os.path.join(deposit_pathway_name,file_name+"_"+"intermediate_file"+".txt"),"w") as IMF: + transcript_number = 0 + for entry in f: + + + current_entry += 1 + current_percentage_done = 100* current_entry/total_entrys + if current_percentage_done > percentage_done +10: + bar,start_time = bar_builder(percentage=percentage_done+10,length_multiplyer = 3,start_time=start_time,bar =bar) + percentage_done = int(current_percentage_done) + + if "gene_id" in entry: + Gen_ID = gene_ID_finder(entry) + else: + Gen_ID = Old_gen_ID + + if Gen_ID != Old_gen_ID: + Gen_entry = ">"+ Gen_ID +"\n" + IMF.write(Gen_entry) + transcript_number = 0 + Old_gen_ID = Gen_ID + + if "\ttranscript\t" in entry: + transcript_number += 1 + Transcript_ID = transcript_ID_finder(entry) + #the function that determins the transcript ID is called + transcript_support_level = transcript_support_level_finder(entry) + #the function that determins the transcript support level is called + New_entry = str(transcript_number)+"\t"+str(Transcript_ID)+"\t"+str(transcript_support_level)+"\t"+"\t\n" + IMF.write(New_entry) + bar_builder(100,length_multiplyer = 3,start_time=start_time,bar =bar) + print("The transcripts have been collected") + + +def extract_transcript(file_name = "test",source_pathway_name = os.getcwd(),deposit_pathway_name = False,Input_free = False): + """ This it the overall exetutable funtion that will execute the 
transcript extraction process for a given file with all checks. + Expected input: + file_name: str ; default = test #the name of the gft file you want to look at + source_pathway_name: str ; default = current work directory #path of the gtf file + deposit_pathway_name: str ; default = source_pathway_name #path for saving the intermediat file + Outputs: + file_name: str + source_pathway_name: str + deposit_pathway_name: str + """ + + + if deposit_pathway_name == False: + deposit_pathway_name = source_pathway_name + if Input_free: + validated_file_name = __test_file_name(file_name,source_pathway_name) + file_name = validated_file_name[1] + _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name) + else: + file_name,source_pathway_name,deposit_pathway_name = __parameter_editor(file_name,source_pathway_name,deposit_pathway_name) + source_pathway_name,deposit_pathway_name =__do_pathways_exist__(source_pathway_name,deposit_pathway_name) + validated_file_name = __test_file_name(file_name,source_pathway_name) + file_name = validated_file_name[1] + if validated_file_name[0]: + if __searche_for_preexisting_files(file_name+"_intermediate_file.txt",deposit_pathway_name): + print("The transcripts has been collected\n") + else: + _transcript_extractor (file_name,source_pathway_name,deposit_pathway_name) + return(file_name,source_pathway_name,deposit_pathway_name) + +#### Dev part #### + +if __name__ == "__main__": + #version_control(module_list,modul_name_list,python_version) + extract_transcript() +#This line allows the file to be executed on its own also from + + diff --git a/scripts/transcript_sampler.py b/scripts/transcript_sampler.py deleted file mode 100644 index 7ee155eb551aaf688e82269cb4bfc65aba4e9882..0000000000000000000000000000000000000000 --- a/scripts/transcript_sampler.py +++ /dev/null @@ -1,426 +0,0 @@ -import pandas as pd -import numpy as np -from gtfparse import read_gtf - - -def attributs_converter(attributs): - """ - This funtion converts the 
"unstrucktured" ;-seperated part of he line into a list of identifyers and coresponding data the struckture of - which can be used ot find the data easyly e.g the index of the identifier transcrip_id + 1 will give the trasncript id of the current gene - Input: - attributs = str() #the unstrucktured part of the entry - Output: - attributs = list() # cleand list with the characterritsics discribed above - """ - attributs = attributs.replace('"', "") - attributs = attributs.replace(";", "") - attributs = attributs.replace("\\n", "") - attributs = attributs.split(" ") - - return attributs - - -def find_in_attributs(attributs, look_for): - """ - This function finds a key word and used that to lokat the value of that key word e.g key = gene_id, value = 'ENSMUSG00002074970', - this works as they are next to each other in the attributs list. - Inputs: - sub_enty = list() - look_fore = str() #string of with the name of the key to look for - Output: - attributs[index] or NA = str() #NA is returned if the key was not found in the attributs - """ - try: - index = attributs.index(look_for) + 1 - return attributs[index] - except: - # print("No",look_for,"in the entry the return was set to NA\n",attributs) - return "NA" - - -def _re_format(rep_trans_dict): - """ - This function is ment to reformat dictionary of the representatice transcripts into an dictionary with only one entry per key - Input: - rep_trans_dict = {gene_id : [transcript_id , transcript_support_level , transcript_length]} - Output: - rep_transcripts = {gene_id : transcript_id} - """ - rep_transcripts = dict() - for gene_id in rep_trans_dict: - rep_transcripts[gene_id] = rep_trans_dict[gene_id][0] - - return rep_transcripts - - -def get_rep_trans(file_name="test"): - """ - This is the main function of this script it selects one representative transcrip per gene based on a gtf annotation file. 
- It does so be two criteria: first the transcript support level and it there are several transcript - of one gene that have the same trasncript_support_level it chooses the one that corresponds to the longest mRNA. - Input: - file_name = str() # name of the annotation file with or without the .gtf part - Output: - rep_transcripts = {gene_id : transcript_id} - """ - - # setting defoult variables - rep_trans = dict() - cur_gID = str() - cur_best_trans = [ - str(), - 100, - 0, - ] # [transcript_id , transcript_support_level , transcript_length] - pot_best_trans = False - cur_tID = str() - ignor_trans = False - - with open(file_name, "r") as f: - for line in f: - entry = line.split("\t") - - # removes expected but unneeded entrys - exp_unneed = [ - "CDS", - "stop_codon", - "five_prime_utr", - "three_prime_utr", - "start_codon", - "Selenocysteine", - ] - if len(entry) == 1 or entry[2] in exp_unneed: - continue - - # this function turns the less organized part of the entry into a reable list - attributs = attributs_converter(entry[8]) - # looking for and processing exons entrys - if entry[2] == "exon": - - # dicide if to contiune or not - if ignor_trans: - continue - elif cur_gID != attributs[1]: - raise ValueError("ERROR exon from an unexpected Gen") - continue - elif find_in_attributs(attributs, "transcript_id") != cur_tID: - raise ValueError("exon from an unexpected transcript") - continue - - # adding the length of the exon to the appropriat list and chacking for changes in best transcript - if pot_best_trans: - pot_best_trans[2] += int(entry[4]) - int(entry[3]) - if pot_best_trans[2] > cur_best_trans[2]: - cur_best_trans = pot_best_trans - pot_best_trans = False - else: - cur_best_trans[2] += int(entry[4]) - int(entry[3]) - - # looking for and processing transcript entrys - elif entry[2] == "transcript": - - # varryfi that the gen is correct - if cur_gID != attributs[1]: - raise ValueError("ERROR transcript from an unexpected Gen") - continue - - # finding the 
transcript id and the support level - cur_tID = find_in_attributs(attributs, "transcript_id") - t_supp_lvl = find_in_attributs(attributs, "transcript_support_level") - - # If there is no transcript support level or the level is given as NA it is nomed as 100. else the transcript support level is tunrn into int - if t_supp_lvl == "NA": - t_supp_lvl = 100 - else: - try: - t_supp_lvl = int(t_supp_lvl) - except: - t_supp_lvl = 100 - - # decides if the transcript has potential to become the representative transcript - if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == "": - cur_best_trans = [cur_tID, t_supp_lvl, 0] - pot_best_trans = False - ignor_trans = False - - elif t_supp_lvl == cur_best_trans[1]: - pot_best_trans = [cur_tID, t_supp_lvl, 0] - else: - ignor_trans = True - - # looking for and processing gene entrys - elif entry[2] == "gene": - - # updating rep_trans dict - if cur_gID not in rep_trans: - rep_trans[cur_gID] = cur_best_trans - else: - if rep_trans[cur_gID][1] > cur_best_trans[1]: - rep_trans[cur_gID] = cur_best_trans - elif ( - rep_trans[cur_gID][1] == cur_best_trans[1] - and rep_trans[cur_gID][2] < cur_best_trans[2] - ): - rep_trans[cur_gID] = cur_best_trans - - # updating cur_gID and resetting cur_best_trans - cur_gID = attributs[1] - cur_best_trans = [str(), 100, 0] - - # raises an error for unidentifyable entrys - else: - raise ValueError("This entry could not be identified\n", entry) - - # addding the final gene to the dictionary - if cur_gID not in rep_trans: - rep_trans[cur_gID] = cur_best_trans - else: - if rep_trans[cur_gID][1] > cur_best_trans[1]: - rep_trans[cur_gID] = cur_best_trans - elif ( - rep_trans[cur_gID][1] == cur_best_trans[1] - and rep_trans[cur_gID][2] < cur_best_trans[2] - ): - rep_trans[cur_gID] = cur_best_trans - - del rep_trans[""] - rep_transcripts = _re_format(rep_trans) - return rep_transcripts - - -def _test(): - """ - This funtion is ment to be run for test - Output: - file with the dictionary generated based on 
the test file - """ - file_name = "test.gtf" - rt = get_rep_trans(file_name) - expected_result = { - "ENSG00000160072": "ENST00000472194", - "ENSG00000234396": "ENST00000442483", - "ENSG00000225972": "ENST00000416931", - "ENSG00000224315": "ENST00000428803", - "ENSG00000198744": "ENST00000416718", - "ENSG00000279928": "ENST00000624431", - "ENSG00000228037": "ENST00000424215", - "ENSG00000142611": "ENST00000378391", - } - if rt != expected_result: - print("The test fail due to not yieding the same results") - print("The results the program got\n", rt) - print("The expected results\n", expected_result) - else: - print("The test was succses full") - - -def gtf_file_writer(original_file, rep_transcript_dict, output_file): - """ - this function writes the output GTF file - """ - output = [] - - with open(original_file, "r") as f: - for line in f: - entry = line.split("\t") - if line[0] != "#": - attributes = attributs_converter(entry[8]) - type_ = entry[2] - else: - continue - if type_ == "gene": - gene_id = find_in_attributs(attributes, "gene_id") - output.append(line) - else: - transcript_id = find_in_attributs(attributes, "transcript_id") - if rep_transcript_dict[gene_id] == transcript_id: - output.append(line) - - with open(output_file, "w") as last_file: - for item in output: - last_file.write(item) - - -def gtf_to_df(gtf_file: str) -> pd.DataFrame: - """ - This function take a .gtf file and convert it into a - dataframe containing gene_id and their transcripts_id. - Args: - gtf_file (str) : path to the .gtf file - - Returns: - df_gtf (pd.DataFrame) : pandas dataframe containing columns - gene_id and their transcripts_id. 
- Raises : - None - - """ - df_gtf = read_gtf(gtf_file) - df_gtf = df_gtf.loc[df_gtf["feature"] == "transcript"] - df_gtf = df_gtf[["gene_id", "transcript_id"]] - df_gtf = df_gtf.rename(columns={"gene_id": "Gene", "transcript_id": "Transcript"}) - return df_gtf - - -def dict_reprTrans_to_df(dict_reprTrans: dict[str, str]) -> pd.DataFrame: - - """Convert a dictionary of genes and their representative transcript into a dataframe - - Args: - dict_reprTrans (dict) : {'Gene':['transcriptA', 'transcriptB'], ...} - - Returns: - Pandas dataframe having Gene and transcript as columns - - Raises: - Only dict are allowed - Key should be strings - Value should be strings - - """ - pass - if not type(dict_reprTrans) is dict: - raise TypeError("Only dict are allowed") - if type(list(dict_reprTrans.keys())[0]) is not str: - raise TypeError("Key should be strings") - if type(list(dict_reprTrans.values())[0]) is not str: - raise TypeError("Values should be strings") - - df_reprTrans = pd.DataFrame.from_dict( - dict_reprTrans, orient="index", columns=["reprTranscript"] - ) - df_reprTrans = df_reprTrans.reset_index(level=0) - df_reprTrans.columns = ["Gene", "reprTrans"] - df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace( - r"\.[1-9]", "", regex=True - ) - return df_reprTrans - - -def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: - """Convert tsv or csv file into a pandas dataframe - - Args: - input_txt (str): csv or tsv file containing transcript expression level - - Returns: - df_gene (str): Pandas dataframe having transcript and expression level - as columns - - Raises: - None - """ - pass - df_input = pd.read_csv( - input_txt, - sep=r"[\t,]", - lineterminator="\n", - names=["Transcript", "Expression_level"], - engine="python", - ) - return df_input - - -def exprLevel_byGene( - df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame -) -> pd.DataFrame: - """find the gene of each transcipt given by the expression level csv/tsv file, - and summ 
expression level of all transcipts from the same gene. - - Args: - df_exprTranscript : pandas Dataframe containing transcript and their expression level, - generated by "tsv_or_csv_to_df" function - df_output_gtf_selection : pandas Dataframe containing genes and transcripts, - generated by "transcripts_by_gene_inDf" function - - Returns: - Pandas dataframe having gene and sum of its transcript expression level - - Raises: - None - """ - pass - df_merged = pd.merge( - df_output_gtf_selection, df_exprTrasncript, how="inner", on="Transcript" - ) - df_sum = df_merged.groupby("Gene").sum("Expression_level") - return df_sum - - -def match_byGene( - df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame -) -> pd.DataFrame: - """Find matching genes bewteen the 2 args - - Args: - df_reprTranscript : pandas Dataframe containing genes - and their representative transcript, generated by - "dict_reprTrans_to_df()" - df_expressionLevel_byGene : pandas Dataframe containing - genes and their expression level generated by - "transcript_by_gene_inDf()" - - Returns: - Pandas dataframe having representative trasncripts - and their expression level - - Raises: - None - """ - pass - df_merged = pd.merge( - df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene" - ) - df_clean = df_merged.dropna(axis=0) - df_clean = df_clean.loc[:, ["reprTrans", "Expression_level"]] - return df_clean - - -### functions to run this part of the programm - - -def match_reprTranscript_expressionLevel( - exprTrans: str, dict_reprTrans: dict, gtf_file: str, -): - """Combine functions to replace transcripts from an expression level csv/tsv file - with representative transcripts - - Args: - exprTrans (str): csv or tsv file containing transcripts - and their expression level - dict_reprTrans (dict) : dict of genes and their - representative transcipt - intemediate_file (str) : txt file containing genes, transcript - and their expression level from the transkript_extractor function - 
output_path : path indicating were the tsv file should be written - - Returns: - tsv file of representative trasncripts and their expression level - - Raises: - None - """ - df_gene_transcript = gtf_to_df(gtf_file) - df_exprTrans = tsv_or_csv_to_df(exprTrans) - df_reprTrans = dict_reprTrans_to_df(dict_reprTrans) - df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_gene_transcript) - df_match = match_byGene(df_reprTrans, df_exprLevel_byGene) - df_match.rename( - columns={"reprTrans": "id", "Expression_level": "level"}, inplace=True - ) - return df_match - - -def transcript_sampling(total_transcript_number, df_repr, output_csv): - df = df_repr - levels = [] - sums = df["level"].tolist() - total = sum(sums) - total_transcript_number = int(total_transcript_number) - normalized = total_transcript_number / total - for expression_level in df["level"]: - poisson_sampled = np.random.poisson(expression_level * normalized) - levels.append(poisson_sampled) - - transcript_numbers = pd.DataFrame({"id": df["id"], "count": levels}) - pd.DataFrame.to_csv(transcript_numbers, output_csv) diff --git a/test/Test_representative_and_match/test_match.py b/test/Test_representative_and_match/test_match.py deleted file mode 100644 index c8b156cb24435bb24d9764dd50c19ac1fdde6086..0000000000000000000000000000000000000000 --- a/test/Test_representative_and_match/test_match.py +++ /dev/null @@ -1,207 +0,0 @@ -import pandas as pd -import json -import re -import match_reprtranscript_expressionlevel as match -import os -import pytest -import test_Functions as tFun -import numpy as np -import representative as repr -from pandas.testing import assert_frame_equal - -def test_dict_reprTrans_to_df(): - """ - This function test if a dict of {gene: representativeTranscript} - is converted in a dataframe in the right format - """ - dict_repr_test = {"ENSMUSG00000079415":"ENSMUST00000112933", -"ENSMUSG00000024691" : "ENSMUST00000025595", -"ENSMUSG00000063683": "ENSMUST00000119960"} - dict_mixed = 
{"a":2, "b":3} - str_random = "jflkajflkaelfha" - dict_int = {12:34, 13:66} - df = match.dict_reprTrans_to_df(dict_repr_test) - datatype={'Gene': np.dtype('O'), 'reprTrans': np.dtype('O')} - - with pytest.raises(TypeError, match=r"Only dict are allowed"): - match.dict_reprTrans_to_df(str_random) - with pytest.raises(TypeError, match=r"Key should be strings"): - match.dict_reprTrans_to_df(dict_int) - with pytest.raises(TypeError, match=r"Values should be strings"): - match.dict_reprTrans_to_df(dict_mixed) - assert tFun.column_number(df)==2, "number of columns is not equal to 2" - assert tFun.column_dType(df)==datatype, "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df).empty, "at least one row are duplicated " - assert tFun.NA_value(df) == 0, "at least one row contain NA values " - - -def test_txt_to_dict(): - path = tFun.find_path("test_dict_repr_trans.txt") - dico = match.txt_to_dict(path) - dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', -"ENSMUSG00000024691" : "ENSMUST00000025595", -"ENSMUSG00000063683": "ENSMUST00000119960"} - assert dico == dict_test - -def test_transcripts_by_gene_inDf(): - """ - This function test if a dataframe generated from - the intermediate file is converted in another - dataframe without the support level column. 
- """ - path = tFun.find_path_intermediateFile() - df = repr.import_gtfSelection_to_df(path) - df_gene = match.transcripts_by_gene_inDf(df) - datatype={'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} - assert tFun.column_number(df_gene)==2, "number of columns is not equal to 2" - assert tFun.column_dType(df_gene)==datatype, "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df_gene).empty, "at least one row are duplicated " - assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values " - - -def test_tsv_or_csv_to_df(): - """ - This function test if the function tsv_or_csv_to_df() cans take - csv and tsv file as input and return a pandas dataframe in the - right format - """ - path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv = match.tsv_or_csv_to_df(path_tsv) - path_csv = tFun.find_path(r"test_gene_exprL_csv.csv") - df_csv = match.tsv_or_csv_to_df(path_csv) - datatype ={'Transcript': np.dtype('O'), 'Expression_level': np.dtype('float64')} - assert tFun.column_number(df_tsv)==2, "number of columns is not equal to 2" - assert tFun.column_dType(df_tsv)==datatype, "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df_tsv).empty, "at least one row are duplicated " - assert tFun.NA_value(df_tsv) == 0, "at least one row contain NA values " - assert_frame_equal(df_tsv, df_csv), "csv and tsv import doesn't match" - - -def test_exprLevel_byGene(): - """ - This function test if the function exprLevel_byGene can find the gene of - each transcipt given by the expression level csv/tsv file and sum their - expression level - """ - path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) - - path_intermediate = tFun.find_path_intermediateFile() - df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) - df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - - df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript) - - datatype 
={'Expression_level': np.dtype('float64')} - assert tFun.column_number(df_exprLevel)==1, "number of columns is not equal to 1" - assert tFun.column_dType(df_exprLevel)==datatype, "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df_exprLevel).empty, "at least one row are duplicated " - assert tFun.NA_value(df_exprLevel) == 0, "at least one row contain NA values " - assert tFun.duplicated_index(df_exprLevel).empty, "at least one index element is duplicated" - -def test_match_byGene(): - """ - This function test if the function "match_byGene()" can - create a pandas dataframe matching representative transcript - and their expression level based on their gene in the - correct pandas dataframe format. - """ - - - dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', -"ENSMUSG00000024691" : "ENSMUST00000025595", -"ENSMUSG00000063683": "ENSMUST00000119960"} - df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test) - - - path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) - path_intermediate = tFun.find_path_intermediateFile() - df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) - df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript) - - df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel) - datatype = {'reprTrans': np.dtype('O'), 'Expression_level': np.dtype('float64')} - - assert tFun.column_number(df_match)==2, "number of columns is not equal to 2" - assert tFun.column_dType(df_match)==datatype, "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df_match).empty, "at least one row are duplicated " - assert tFun.NA_value(df_match) == 0, "at least one row contain NA values " - assert tFun.duplicated_index(df_match).empty, "at least one index element is duplicated" - -def test_output_tsv(): - """ - This function test if a tsv file is generated from a 
pandas - dataframe in the right format. - """ - - dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', -"ENSMUSG00000024691" : "ENSMUST00000025595", -"ENSMUSG00000063683": "ENSMUST00000119960"} - df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test) - - - path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) - path_intermediate = tFun.find_path_intermediateFile() - df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) - df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - - df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript) - - df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel) - - match.output_tsv(df_match) - - ref_path=tFun.find_path("test_ref_output.tsv") - output_path = tFun.find_output() - - with open(ref_path, 'r') as t1, open(output_path, 'r') as t2: - fileRef = t1.readlines() - fileOutput = t2.readlines() - - - assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file" - - -def test_match_reprTranscript_expressionLevel(): - """ - This function test that the right output is generated by the function - match_reprTranscript_expressionLevel() - """ - input_path = tFun.find_path("test_gene_exprL") - intermediate_path = tFun.find_path_intermediateFile() - dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', -"ENSMUSG00000024691" : "ENSMUST00000025595", -"ENSMUSG00000063683": "ENSMUST00000119960"} - - match.match_reprTranscript_expressionLevel(input_path, dict_repr_test, intermediate_path) - - ref_path=tFun.find_path("test_ref_output.tsv") - output_path = tFun.find_output() - - - with open(ref_path, 'r') as t1,\ - open(output_path, 'r') as t2,\ - open(input_path, 'r') as t3 : - fileRef = t1.readlines() - fileOutput = t2.readlines() - fileInput = t3.readlines() - - assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file" - assert sorted(fileRef) != 
sorted(fileInput), "the output does't match the expected tsv file" - - - - -test_dict_reprTrans_to_df() -test_txt_to_dict() -test_transcripts_by_gene_inDf() -test_tsv_or_csv_to_df() -test_exprLevel_byGene() -test_match_byGene() -test_output_tsv() -test_match_reprTranscript_expressionLevel() - -print("test_match is done ! No error was found") diff --git a/test/.gitkeep b/tests/__init__.py similarity index 100% rename from test/.gitkeep rename to tests/__init__.py diff --git a/test/Test_representative_and_match/.gitkeep b/tests/inputs/.gitkeep similarity index 100% rename from test/Test_representative_and_match/.gitkeep rename to tests/inputs/.gitkeep diff --git a/test/Test_representative_and_match/inputs/test_dict_repr_trans.txt b/tests/inputs/test_dict_repr_trans.txt similarity index 100% rename from test/Test_representative_and_match/inputs/test_dict_repr_trans.txt rename to tests/inputs/test_dict_repr_trans.txt diff --git a/test/Test_representative_and_match/inputs/test_gencode.vM31.annotation_intermediat_file.txt b/tests/inputs/test_gencode.vM31.annotation_intermediat_file.txt similarity index 100% rename from test/Test_representative_and_match/inputs/test_gencode.vM31.annotation_intermediat_file.txt rename to tests/inputs/test_gencode.vM31.annotation_intermediat_file.txt diff --git a/test/Test_representative_and_match/inputs/test_gene_exprL b/tests/inputs/test_gene_exprL similarity index 100% rename from test/Test_representative_and_match/inputs/test_gene_exprL rename to tests/inputs/test_gene_exprL diff --git a/test/Test_representative_and_match/inputs/test_gene_exprL_csv.csv b/tests/inputs/test_gene_exprL_csv.csv similarity index 100% rename from test/Test_representative_and_match/inputs/test_gene_exprL_csv.csv rename to tests/inputs/test_gene_exprL_csv.csv diff --git a/test/Test_representative_and_match/inputs/test_ref_output.tsv b/tests/inputs/test_ref_output.tsv similarity index 100% rename from test/Test_representative_and_match/inputs/test_ref_output.tsv 
rename to tests/inputs/test_ref_output.tsv diff --git a/test/Test_representative_and_match/test_Functions.py b/tests/test_Functions.py similarity index 71% rename from test/Test_representative_and_match/test_Functions.py rename to tests/test_Functions.py index 72a120d1d2e6967233abd0dc2bc14607a8ef40fe..fe51484c206908600360239917194fa455cf17c9 100644 --- a/test/Test_representative_and_match/test_Functions.py +++ b/tests/test_Functions.py @@ -2,32 +2,34 @@ import pandas as pd import numpy as np import os -def find_path(filename:str)->str: + +def find_path(filename: str) -> str: """Find the path to a file Args: name of a file Returns: - str path of a file - + str path of a file + Raises: None """ absolute_path = os.path.dirname(__file__) - test_file = "inputs\\" + str(filename) + test_file = "inputs/" + str(filename) full_path = os.path.join(absolute_path, test_file) return full_path + def find_output(): - """Find the path of the output file + """Find the path of the output file Args: name of a file Returns: - str path of a file - + str path of a file + Raises: None """ @@ -37,7 +39,7 @@ def find_output(): return full_path -def find_path_intermediateFile()->str: +def find_path_intermediateFile() -> str: """Find the path to gencode.vM31.annotation_intermediat_file.txt Args: @@ -45,77 +47,82 @@ def find_path_intermediateFile()->str: Returns: str path of gencode.vM31.annotation_intermediat_file.txt - + Raises: None - """ + """ absolute_path = os.path.dirname(__file__) - test_file = r"inputs\test_gencode.vM31.annotation_intermediat_file.txt" + test_file = r"inputs/test_gencode.vM31.annotation_intermediat_file.txt" full_path = os.path.join(absolute_path, test_file) return full_path -def column_number(df :pd.DataFrame)-> int: - """Return the number of column of a df +def column_number(df: pd.DataFrame) -> int: + + """Return the number of column of a df Args: dataframe Returns: int - + Raises: None - """ + """ length = len(df.columns) return length -def column_dType(df : 
pd.DataFrame) -> dict[str,np.dtype]: - """Return the type of each column of a df in a dict + +def column_dType(df: pd.DataFrame) -> dict[str, np.dtype]: + """Return the type of each column of a df in a dict Args: Pandas dataframe Returns: dict{column:np.dtype()} - + Raises: None - """ - dtype=df.dtypes.to_dict() + """ + dtype = df.dtypes.to_dict() return dtype + def duplicated_rows(df: pd.DataFrame) -> pd.DataFrame: - """Return the sum of duplicated rows in a df + """Return the sum of duplicated rows in a df Args: Pandas dataframe Returns: int - + Raises: None - """ + """ df_dupl = df[df.duplicated()] return df_dupl + def duplicated_index(df: pd.DataFrame) -> pd.DataFrame: - """Return the sum of duplicated index in a df + """Return the sum of duplicated index in a df Args: Pandas dataframe Returns: int - + Raises: None - """ + """ df_dupl = df[df.index.duplicated()] return df_dupl -def NA_value(df: pd.DataFrame) -> int: + +def NA_value(df: pd.DataFrame) -> int: """Return the sum of NA values in a df Args: @@ -123,10 +130,9 @@ def NA_value(df: pd.DataFrame) -> int: Returns: int - + Raises: None - """ + """ nNA = df.isna().sum().sum() return nNA - diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py new file mode 100644 index 0000000000000000000000000000000000000000..8e1b52c7e62e2cd3ea7f2320b62f551def3828be --- /dev/null +++ b/tests/test_match_reptrans_explvl.py @@ -0,0 +1,260 @@ +"""Tests for match representative transcript with expression level""" +import pytest +import pandas as pd +import numpy as np +from pandas.testing import assert_frame_equal +import tests.test_Functions as tFun +from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl as match + + +class TestMatchReptrans: + """Tests for match_reptrans_explvl.py""" + # def test_gtf_to_df(self): + # TO DO + + def test_dict_repr_trans_to_df(self): + """ + This function test if a dict of {gene: representativeTranscript} + is converted in a dataframe in the right 
format + """ + dict_repr_test = { + "ENSMUSG00000079415": "ENSMUST00000112933", + "ENSMUSG00000024691": "ENSMUST00000025595", + "ENSMUSG00000063683": "ENSMUST00000119960"} + dict_mixed = {"a": 2, "b": 3} + str_random = "jflkajflkaelfha" + dict_int = {12: 34, 13: 66} + data_frame = match.dict_repr_trans_to_df(dict_repr_test) + datatype = {'Gene': np.dtype('O'), 'reprTrans': np.dtype('O')} + + with pytest.raises(TypeError, match=r"Only dictionaries are allowed"): + match.dict_repr_trans_to_df(str_random) + with pytest.raises(TypeError, match=r"Keys should be strings"): + match.dict_repr_trans_to_df(dict_int) + with pytest.raises(TypeError, match=r"Values should be strings"): + match.dict_repr_trans_to_df(dict_mixed) + + assert tFun.column_number(data_frame) == 2, \ + "number of columns not equal to 2" + assert tFun.column_dType(data_frame) == datatype, \ + "at least one column has the wrong datatype" + assert tFun.duplicated_rows(data_frame).empty, \ + "at least one row is duplicated" + assert tFun.NA_value(data_frame) == 0, \ + "at least one row contain NA values" + + def test_tsv_or_csv_to_df(self): + """ + This function test if the function tsv_or_csv_to_df() can take + csv and tsv file as input and return a pandas dataframe in the + right format + """ + path_tsv = tFun.find_path(r"test_gene_exprL") + df_tsv = match.tsv_or_csv_to_df(path_tsv) + path_csv = tFun.find_path(r"test_gene_exprL_csv.csv") + df_csv = match.tsv_or_csv_to_df(path_csv) + datatype = {'Transcript': np.dtype('O'), + 'Expression_level': np.dtype('float64')} + + assert tFun.column_number(df_tsv) == 2, \ + "number of columns is not equal to 2" + assert tFun.column_dType(df_tsv) == datatype, \ + "at least one column has the wrong datatype" + assert tFun.duplicated_rows(df_tsv).empty, \ + "at least one row are duplicated " + assert tFun.NA_value(df_tsv) == 0, \ + "at least one row contain NA values" + assert_frame_equal(df_tsv, df_csv), \ + "csv and tsv import doesn't match" + + def 
test_expr_level_by_gene(self): + """ + This function test if the function expr_level_by_gene can find + the gene of each transcipt given by the expression level csv/tsv + file and sum their expression level + """ + path_tsv = tFun.find_path(r"test_gene_exprL") + df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) + df_gene_transcript = pd.DataFrame( + {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691', + 'ENSMUSG00000024691', 'ENSMUSG00000024691', + 'ENSMUSG00000079415', 'ENSMUSG00000063683', + 'ENSMUSG00000063683', 'ENSMUSG00000063683', + 'ENSMUSG00000063683', 'ENSMUSG00000063683'], + 'Transcript': ['ENSMUST00000139270', 'ENSMUST00000151307', + 'ENSMUST00000144662', 'ENSMUST00000025595', + 'ENSMUST00000112933', 'ENSMUST000000449762', + 'ENSMUST00000155846', 'ENSMUST00000157069', + 'ENSMUST00000119960', 'ENSMUST00000123173']} + ) + + df_exprLevel = match.expr_level_by_gene( + df_tsv_exprL, df_gene_transcript + ) + datatype = {'Gene': np.dtype('O'), + 'Expression_level': np.dtype('float64')} + + assert tFun.column_number(df_exprLevel) == 2, \ + "number of columns is not equal to 2" + assert tFun.column_dType(df_exprLevel) == datatype, \ + "at least one column has the wrong datatype" + assert tFun.duplicated_rows(df_exprLevel).empty, \ + "at least one row are duplicated " + assert tFun.NA_value(df_exprLevel) == 0, \ + "at least one row contain NA values " + assert tFun.duplicated_index(df_exprLevel).empty, \ + "at least one index element is duplicated" + + def test_match_by_gene(self): + """ + This function test if the function "match_by_gene()" can + create a pandas dataframe matching representative transcript + and their expression level based on their gene in the + correct pandas dataframe format. 
+ """ + + dict_repr_test = { + 'ENSMUSG00000079415': 'ENSMUST00000112933', + 'ENSMUSG00000024691': 'ENSMUST00000025595', + 'ENSMUSG00000063683': 'ENSMUST00000119960'} + df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test) + + path_tsv = tFun.find_path(r"test_gene_exprL") + df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) + df_gene_transcript = pd.DataFrame( + {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691', + 'ENSMUSG00000024691', 'ENSMUSG00000024691', + 'ENSMUSG00000079415', 'ENSMUSG00000063683', + 'ENSMUSG00000063683', 'ENSMUSG00000063683', + 'ENSMUSG00000063683', 'ENSMUSG00000063683'], + 'Transcript': ['ENSMUST00000139270', 'ENSMUST00000151307', + 'ENSMUST00000144662', 'ENSMUST00000025595', + 'ENSMUST00000112933', 'ENSMUST000000449762', + 'ENSMUST00000155846', 'ENSMUST00000157069', + 'ENSMUST00000119960', 'ENSMUST00000123173']} + ) + df_exprLevel = match.expr_level_by_gene( + df_tsv_exprL, df_gene_transcript) + + df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel) + datatype = { + 'reprTrans': np.dtype('O'), + 'Expression_level': np.dtype('float64')} + + assert tFun.column_number(df_match) == 2, \ + "number of columns is not equal to 2" + assert tFun.column_dType(df_match) == datatype, \ + "at least one column has the wrong datatype" + assert tFun.duplicated_rows(df_match).empty, \ + "at least one row are duplicated " + assert tFun.NA_value(df_match) == 0, \ + "at least one row contain NA values " + assert tFun.duplicated_index(df_match).empty, \ + "at least one index element is duplicated" + + def test_match_repr_transcript_expression_level(self): + """ + This function test that the right output is generated by the function + match_repr_transcript_expression_level() + """ + input_path = tFun.find_path("test_gene_exprL") + intermediate_path = tFun.find_path_intermediateFile() + dict_repr_test = { + 'ENSMUSG00000079415': 'ENSMUST00000112933', + "ENSMUSG00000024691": "ENSMUST00000025595", + "ENSMUSG00000063683": "ENSMUST00000119960"} + + 
match.match_repr_transcript_expression_level( + exprTrans=input_path, + dict_reprTrans=dict_repr_test, + gtf_file=intermediate_path) + + ref_path = tFun.find_path("test_ref_output.tsv") + output_path = tFun.find_output() + + with open(ref_path, 'r', encoding="utf-8") as t1,\ + open(output_path, 'r', encoding="utf-8") as t2,\ + open(input_path, 'r', encoding="utf-8") as t3: + fileRef = t1.readlines() + fileOutput = t2.readlines() + fileInput = t3.readlines() + + assert ( + sorted(fileRef) == sorted(fileOutput) + ), "the output does't match the expected tsv file" + assert ( + sorted(fileRef) != sorted(fileInput) + ), "the output does't match the expected tsv file" + + # def test_txt_to_dict(self): + # """This function tests if txt is convertod to dict""" + # path = tFun.find_path("test_dict_repr_trans.txt") + # dico = match.txt_to_dict(path) + # dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', + # "ENSMUSG00000024691": "ENSMUST00000025595", + # "ENSMUSG00000063683": "ENSMUST00000119960"} + # assert dico == dict_test + + # def test_transcripts_by_gene_inDf(): + # """ + # This function test if a dataframe generated from + # the intermediate file is converted in another + # dataframe without the support level column. + # """ + # path = tFun.find_path_intermediateFile() + # df = repr.import_gtfSelection_to_df(path) + # df_gene = match.transcripts_by_gene_inDf(df) + # datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} + # assert tFun.column_number(df_gene) == ( + # 2, "number of columns is not equal to 2") + # assert tFun.column_dType(df_gene) == ( + # datatype, "at least one column has the wrong datatype") + # assert tFun.duplicated_rows(df_gene).empty, \ + # "at least one row are duplicated" + # assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values" + + # def test_output_tsv(): + # """ + # This function test if a tsv file is generated from a pandas + # dataframe in the right format. 
+ # """ + + # dict_repr_test = { + # 'ENSMUSG00000079415': 'ENSMUST00000112933', + # "ENSMUSG00000024691": "ENSMUST00000025595", + # "ENSMUSG00000063683": "ENSMUST00000119960"} + # df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test) + + # path_tsv = tFun.find_path(r"test_gene_exprL") + # df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) + # path_intermediate = tFun.find_path_intermediateFile() + # df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) + # df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) + + # df_exprLevel = match.expr_level_by_gene(df_tsv_exprL, df_gene_transcript) + + # df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel) + + # match.output_tsv(df_match) + + # ref_path = tFun.find_path("test_ref_output.tsv") + # output_path = tFun.find_output() + + # with open(ref_path, 'r') as t1, open(output_path, 'r') as t2: + # fileRef = t1.readlines() + # fileOutput = t2.readlines() + + # assert ( + # sorted(fileRef) == sorted(fileOutput) + # ), "the output does't match the expected tsv file" + +# test_dict_repr_trans_to_df() +# test_txt_to_dict() +# test_transcripts_by_gene_inDf() +# test_tsv_or_csv_to_df() +# test_expr_level_by_gene() +# test_match_by_gene() +# test_output_tsv() +# test_match_repr_transcript_expression_level() + +# print("test_match is done ! 
No error was found") diff --git a/test/Test_representative_and_match/test_representative.py b/tests/test_representative.py similarity index 58% rename from test/Test_representative_and_match/test_representative.py rename to tests/test_representative.py index 4d000977434368393cedeb6ac0d4b93f30609ab7..4ee677838beb99ddfd671f21f7b8452102965c49 100644 --- a/test/Test_representative_and_match/test_representative.py +++ b/tests/test_representative.py @@ -1,6 +1,4 @@ import pytest -import pandas as pd -import datatest as dt import representative as repr import numpy as np import test_Functions as tFun @@ -14,71 +12,82 @@ def test_import_gtfSelection_to_df(): None Returns: - Assert results - + Assert results + Raises: None """ path = tFun.find_path_intermediateFile() - df = repr.import_gtfSelection_to_df(path) - datatype={'Gene': np.dtype('O'), 'Transcript': np.dtype('O'), 'Support_level': np.dtype('float64')} - assert tFun.column_number(df)==3, "number of columns is not equal to 3" - assert tFun.column_dType(df)==datatype, "at lease one column has the wrong datatype" + df = repr.import_gtfSelection_to_df(path) + datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O'), + 'Support_level': np.dtype('float64')} + assert tFun.column_number(df) == ( + 3, "number of columns is not equal to 3") + assert tFun.column_dType(df) == ( + datatype, "at lease one column has the wrong datatype") assert tFun.duplicated_rows(df).empty, "at lease one row are duplicated " assert tFun.NA_value(df) == 0, "at lease one row contain NA values " with pytest.raises(TypeError, match=r"Only str path is allowed"): repr.import_gtfSelection_to_df(123) -def test_representative_transcript_inDict(): +def test_representative_transcript_inDict(): """ Test if df generated by "import_gtfSelection_to_df()" output - a dict in the right format + a dict in the right format Args: Pandas dataframe with [Gene, Transcript, Support_level] as columns, validated with test_import_gtfSelection_to_df() Returns: - 
Assert results - + Assert results + Raises: None """ path = tFun.find_path_intermediateFile() - df = repr.import_gtfSelection_to_df(path) + df = repr.import_gtfSelection_to_df(path) dict_to_test = repr.representative_transcripts_inDict(df) - dict_expected = {'ENSMUSG00000024691': ['ENSMUST00000025595.5'], - 'ENSMUSG00000063683': ['ENSMUST00000044976.12', 'ENSMUST00000119960.2'], - 'ENSMUSG00000079415': ['ENSMUST00000112933.2']} + dict_expected = { + 'ENSMUSG00000024691': ['ENSMUST00000025595.5'], + 'ENSMUSG00000063683': ['ENSMUST00000044976.12', + 'ENSMUST00000119960.2'], + 'ENSMUSG00000079415': ['ENSMUST00000112933.2']} assert dict_to_test == dict_expected with pytest.raises(TypeError, match=r"Only pandas DataFrame is allowed"): repr.representative_transcripts_inDict(123) with pytest.raises(TypeError, match=r"Only pandas DataFrame is allowed"): repr.representative_transcripts_inDict("hello") with pytest.raises(TypeError, match=r"Only pandas DataFrame is allowed"): - repr.representative_transcripts_inDict(["hello","world",123]) + repr.representative_transcripts_inDict(["hello", "world", 123]) with pytest.raises(TypeError, match=r"Only pandas DataFrame is allowed"): - repr.representative_transcripts_inDict({"hello":"world", "bonjour":["le monde", 123]}) + repr.representative_transcripts_inDict({"hello": "world", + "bonjour": ["le monde", 123]}) + def test_find_repr_by_SupportLevel(): """ - Test if the correct dict is generated from gencode.vM31.annotation_intermediat_file.txt + Test if the correct dict is generated from + gencode.vM31.annotation_intermediat_file.txt Args: - None + None Returns: - Assert results - + Assert results + Raises: None """ path = tFun.find_path_intermediateFile() dict_to_test = repr.find_repr_by_SupportLevel(path) - dict_expected = {'ENSMUSG00000024691': ['ENSMUST00000025595.5'], - 'ENSMUSG00000063683': ['ENSMUST00000044976.12', 'ENSMUST00000119960.2'], - 'ENSMUSG00000079415': ['ENSMUST00000112933.2']} + dict_expected = { + 
'ENSMUSG00000024691': ['ENSMUST00000025595.5'], + 'ENSMUSG00000063683': ['ENSMUST00000044976.12', + 'ENSMUST00000119960.2'], + 'ENSMUSG00000079415': ['ENSMUST00000112933.2']} assert dict_to_test == dict_expected + test_representative_transcript_inDict() test_find_repr_by_SupportLevel() test_import_gtfSelection_to_df() diff --git a/test/Test_representative_and_match/inputs/.gitkeep b/transcript_sampler/__init__.py similarity index 100% rename from test/Test_representative_and_match/inputs/.gitkeep rename to transcript_sampler/__init__.py diff --git a/transcript_sampler/find_reptrans.py b/transcript_sampler/find_reptrans.py new file mode 100644 index 0000000000000000000000000000000000000000..6025e29cac6e6501e1b98f627ca4023e6b91128d --- /dev/null +++ b/transcript_sampler/find_reptrans.py @@ -0,0 +1,288 @@ +"""Find representative transcripts""" + +import logging + +LOG = logging.getLogger(__name__) + + +class FindRepTrans: + """Find representative transcripts.""" + + def __init__(self): + pass + + @staticmethod + def attributes_converter(attributes: str) -> list: + """ + This funtion converts the "unstructured" ;-seperated part of + the line into a list of identifiers and corresponding data, + the structure of which can be used ot find the data easily e.g + the index of the identifier transcript_id + 1 will give the + transcript id of the current gene. + Input: + attributes = str() # the unstructured part of the entry + Output: + attributes = list() # cleaned list with the \ + characteristics described above + """ + attributes = ( + attributes.replace('"', "") + .replace(";", "") + .replace("\\n", "") + .split(" ") + ) + return attributes + + @staticmethod + def find_in_attributes(attributes: list, look_for: str) -> str: + """ + This function finds a keyword and used that to locate the value of that + keyword e.g key = gene_id, value = 'ENSMUSG00002074970', + this works as they are next to each other in the attributes list. 
+ Inputs: + attributes = list() + look_for = str() # string of the name of the key to look for + Output: + attributes[index] or NA = str() # NA is returned if the key + was not found in the attributes + """ + if look_for in attributes: + index = attributes.index(look_for) + 1 + return attributes[index] + else: + LOG.warning('No %s in the entry, the return was set to NA', + look_for) + return "NA" + + @staticmethod + def reformat_reptrans(rep_trans_dict: dict) -> dict: + """ + This function is meant to reformat dictionary of the representative + transcripts into an dictionary with only one entry per key + Input: + rep_trans_dict = {gene_id : [ + transcript_id, transcript_support_level, transcript_length]} + Output: + rep_transcripts = {gene_id : transcript_id} + """ + rep_transcripts = {} + for gene_id in rep_trans_dict: + rep_transcripts[gene_id] = rep_trans_dict[gene_id][0] + + return rep_transcripts + + def get_rep_trans(self, file_name: str) -> dict: + """ + This is the main function of this script. It selects one + representative transcript per gene based on a GTF annotation file. + It does so by two criteria: the transcript support level and if + there are several transcripts of one gene that have the same + transcript_support_level, it chooses the one that corresponds + to the longest mRNA. + + Args: + file_name (str): Name of the annotation file with or without + the .gtf extension. + + Returns: + rep_transcripts (dict): Dictionary of gene_id to transcript_id + representing the selected representative transcripts. + + Raises: + ValueError: If an unexpected entry is encountered in the GTF file. 
+ """ + + # setting default variables + rep_transcripts = {} + cur_g_id = "" + # [transcript_id, transcript_support_level, transcript_length] + cur_best_trans = ["", 100, 0] + + with open(file_name, "r", encoding="utf-8") as file: + for line in file: + entry = line.split("\t") + + # removes expected but unneeded entries + if len(entry) == 1 or entry[2] in [ + "CDS", "stop_codon", + "five_prime_utr", "three_prime_utr", + "start_codon", "Selenocysteine" + ]: + continue + + # this function turns the less organized part of the entry + # into a readable list + attributes = self.attributes_converter(entry[8]) + + # looking for and processing exons entries + if entry[2] == "exon": + if cur_g_id != attributes[1]: + LOG.error() + raise ValueError("Exon from an unexpected gene") + elif ( + self.find_in_attributes( + attributes, "transcript_id" + ) != cur_tID + ): + LOG.error() + raise ValueError("Exon from an unexpected transcript") + + # adding the length of the exon to the appropriate list and + # checking for changes in best transcript + if pot_best_trans: + pot_best_trans[2] += int(entry[4]) - int(entry[3]) + if pot_best_trans[2] > cur_best_trans[2]: + cur_best_trans = pot_best_trans + pot_best_trans = False + else: + cur_best_trans[2] += int(entry[4]) - int(entry[3]) + + # looking for and processing transcript entries + elif entry[2] == "transcript": + # verify that the gen is correct + if cur_g_id != attributes[1]: + LOG.error() + raise ValueError("Transcript from an unexpected gene") + + # finding the transcript id and the support level + cur_tID = self.find_in_attributes( + attributes, "transcript_id" + ) + t_supp_lvl = self.find_in_attributes( + attributes, "transcript_support_level" + ) + + # If there is no transcript support level or the level is + # given as NA it is nomed as 100. 
else the transcript + # support level is turned into int + if t_supp_lvl == "NA": + t_supp_lvl = 100 + else: + if t_supp_lvl.isdigit(): + t_supp_lvl = int(t_supp_lvl) + else: + t_supp_lvl = 100 + + # decides if the transcript has potential to become the + # representative transcript + if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == "": + cur_best_trans = [cur_tID, t_supp_lvl, 0] + pot_best_trans = False + ignor_trans = False + elif t_supp_lvl == cur_best_trans[1]: + pot_best_trans = [cur_tID, t_supp_lvl, 0] + else: + ignor_trans = True + + # looking for and processing gene entries + elif entry[2] == "gene": + # updating rep_transcripts dict + if cur_g_id in rep_transcripts: + if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] + or (rep_transcripts[cur_g_id][1] == + cur_best_trans[1] + and rep_transcripts[cur_g_id][2] < + cur_best_trans[2])): + rep_transcripts[cur_g_id] = cur_best_trans + else: + rep_transcripts[cur_g_id] = cur_best_trans + + # updating cur_g_id and resetting cur_best_trans + cur_g_id = attributes[1] + cur_best_trans = ["", 100, 0] + + # raises an error for unidentifiable entries + else: + LOG.error() + raise ValueError("This entry could not be identified") + + # adding the final gene to the dictionary + if cur_g_id in rep_transcripts: + if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] + or (rep_transcripts[cur_g_id][1] == cur_best_trans[1] + and rep_transcripts[cur_g_id][2] < cur_best_trans[2])): + rep_transcripts[cur_g_id] = cur_best_trans + else: + rep_transcripts[cur_g_id] = cur_best_trans + + del rep_transcripts[""] + rep_transcripts = self.reformat_reptrans(rep_transcripts) + return rep_transcripts + + def gtf_file_writer(self, original_file: str, + rep_transcript_dict: dict, output_file: str): + """ + This function writes the output GTF file. 
+ """ + output = [] + + with open(original_file, "r", encoding="utf-8") as f: + for line in f: + if line.startswith("#"): + continue + + entry = line.split("\t") + attributes = self.attributes_converter(entry[8]) + feature_type = entry[2] + + if feature_type == "gene": + gene_id = self.find_in_attributes(attributes, "gene_id") + output.append(line) + else: + transcript_id = self.find_in_attributes( + attributes, "transcript_id" + ) + if gene_id in rep_transcript_dict and \ + rep_transcript_dict[gene_id] == transcript_id: + output.append(line) + + with open(output_file, "w", encoding="utf-8") as last_file: + last_file.writelines(output) + + +# def _test(): +# """ +# This funtion is meant to be run for test +# Output: +# file with the dictionary generated based on the test file +# """ +# file_name = "test.gtf" +# rt = get_rep_trans(file_name) +# expected_result = {"ENSG00000160072": "ENST00000472194", +# "ENSG00000234396": "ENST00000442483", +# "ENSG00000225972": "ENST00000416931", +# "ENSG00000224315": "ENST00000428803", +# "ENSG00000198744": "ENST00000416718", +# "ENSG00000279928": "ENST00000624431", +# "ENSG00000228037": "ENST00000424215", +# 'ENSG00000142611': 'ENST00000378391'} +# if rt != expected_result: +# print("The test failed due to not yielding the same results") +# print("The results the program got\n", rt) +# print("The expected results\n", expected_result) +# else: +# print("The test was successful") + + +# # Execution part # +# if __name__ == "__main__": +# parser = argparse.ArgumentParser( +# description="find_representativ_transcripts", +# formatter_class=argparse.ArgumentDefaultsHelpFormatter +# ) +# parser.add_argument("-file_name", required=True, +# help="gtf file with genome annotation") +# parser.add_argument("-t", required=False, default=False, +# help="to run the test input -t True") +# args = parser.parse_args() + +# # standadize the file_name inlude .gtf# +# file_name = args.file_name +# i_gtf = file_name.find(".gtf") +# if i_gtf == -1: +# 
file_name += ".gtf" + +# if args.t: +# _test() +# else: +# get_rep_trans(file_name) diff --git a/transcript_sampler/match_reptrans_explvl.py b/transcript_sampler/match_reptrans_explvl.py new file mode 100644 index 0000000000000000000000000000000000000000..e6d7a9a95bd8311ef0b6f8ab3fcfe627c1828df9 --- /dev/null +++ b/transcript_sampler/match_reptrans_explvl.py @@ -0,0 +1,345 @@ +"""Match representative transcript with expression level""" +# Made by Hugo Gillet # + +import logging +import pandas as pd +from gtfparse import read_gtf + +LOG = logging.getLogger(__name__) + + +class MatchReptransExplvl: + """Match representative transcript with expression level""" + def __init__(self): + pass + + @staticmethod + def gtf_to_df(gtf_file: str) -> pd.DataFrame: + """ + This function takes a .gtf file and converts it into a pandas DataFrame + containing gene_id and their transcript_id. + + Args: + gtf_file (str): Path to the .gtf file. + + Returns: + df_gtf (pd.DataFrame): Pandas DataFrame containing columns + 'Gene' and 'Transcript'. + + Raises: + None + """ + df_gtf = read_gtf(gtf_file,).to_pandas() + df_gtf = df_gtf[df_gtf["feature"] == "transcript"] + df_gtf = df_gtf[["gene_id", "transcript_id"]] + df_gtf = df_gtf.rename(columns={ + "gene_id": "Gene", "transcript_id": "Transcript" + }) + return df_gtf + + @staticmethod + def dict_repr_trans_to_df(dict_reprTrans: "dict[str, str]") -> pd.DataFrame: + """ + Convert a dictionary of genes and their representative transcript into a DataFrame. + + Args: + dict_reprTrans (dict): {'Gene': ['transcriptA', 'transcriptB'], ...} + + Returns: + Pandas DataFrame with 'Gene' and 'Transcript' as columns. + + Raises: + TypeError: Only dictionaries are allowed. + TypeError: Keys should be strings. + TypeError: Values should be strings. 
+ """ + if not isinstance(dict_reprTrans, dict): + LOG.error("Only dictionaries are allowed") + raise TypeError("Only dictionaries are allowed") + if not all(isinstance(key, str) for key in dict_reprTrans.keys()): + LOG.error("Keys should be strings") + raise TypeError("Keys should be strings") + if not all(isinstance(value, str) for value in dict_reprTrans.values()): + LOG.error("Values should be strings") + raise TypeError("Values should be strings") + + df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"]) + df_reprTrans = df_reprTrans.reset_index() + df_reprTrans.columns = ["Gene", "reprTrans"] + df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(r"\.[1-9]", "", regex=True) + + return df_reprTrans + + @staticmethod + def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: + """ + Convert a TSV or CSV file into a pandas DataFrame. + + Args: + input_txt (str): TSV or CSV file containing transcript expression levels. + + Returns: + df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and 'Expression_level' as columns. + + Raises: + None + """ + df_input = pd.read_csv( + input_txt, + sep=r"[\t,]", + lineterminator="\n", + names=["Transcript", "Expression_level"], + engine="python", + ) + return df_input + + @staticmethod + def expr_level_by_gene( + df_exprTranscript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame + ) -> pd.DataFrame: + """ + Find the gene of each transcript given by the expression level CSV/TSV file + and sum the expression level of all transcripts from the same gene. + + Args: + df_exprTranscript (pd.DataFrame): Pandas DataFrame containing transcripts and their expression levels, + generated by the "tsv_or_csv_to_df" function. + df_output_gtf_selection (pd.DataFrame): Pandas DataFrame containing genes and transcripts, + generated by the "transcripts_by_gene_inDf" function. + + Returns: + Pandas DataFrame having 'Gene' and sum of its transcript expression levels. 
+ + Raises: + None + """ + df_merged = pd.merge(df_output_gtf_selection, df_exprTranscript, how="inner", on="Transcript") + df_sum = df_merged.groupby("Gene")["Expression_level"].sum().reset_index() + return df_sum + + @staticmethod + def match_by_gene( + df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame + ) -> pd.DataFrame: + """ + Find matching genes between the two DataFrames. + + Args: + df_reprTranscript (pd.DataFrame): Pandas DataFrame containing genes and their representative transcripts, + generated by the "dict_repr_trans_to_df()" function. + df_expressionLevel_byGene (pd.DataFrame): Pandas DataFrame containing genes and their expression levels, + generated by the "transcript_by_gene_inDf()" function. + + Returns: + Pandas DataFrame having representative transcripts and their expression levels. + + Raises: + None + """ + df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene") + df_clean = df_merged.loc[:, ["reprTrans", "Expression_level"]] + return df_clean + + def match_repr_transcript_expression_level( + self, exprTrans: str, dict_reprTrans: dict, gtf_file: str, + ): + """ + Combine functions to replace transcripts from an expression level CSV/TSV file with representative transcripts. + + Args: + exprTrans (str): CSV or TSV file containing transcripts and their expression level. + dict_reprTrans (dict): Dictionary of genes and their representative transcripts. + gtf_file (str): Path to the GTF file. + + Returns: + Pandas DataFrame of representative transcripts and their expression level. 
+ + Raises: + None + """ + df_gene_transcript = self.gtf_to_df(gtf_file) + df_exprTrans = self.tsv_or_csv_to_df(exprTrans) + df_reprTrans = self.dict_repr_trans_to_df(dict_reprTrans) + df_expr_level_by_gene = self.expr_level_by_gene(df_exprTrans, df_gene_transcript) + df_match = self.match_by_gene(df_reprTrans, df_expr_level_by_gene) + df_match.rename(columns={"reprTrans": "id", "Expression_level": "level"}, inplace=True) + return df_match + + + +# def dict_repr_trans_to_df(dict_reprTrans: "dict[str, str]") -> pd.DataFrame: + +# """Convert a dictionary of genes and their representative +# transcript into a dataframe + +# Args: +# dict_reprTrans (dict): {'Gene':['transcriptA', 'transcriptB'], ...} + +# Returns: +# Pandas dataframe having Gene and transcript as columns + +# Raises: +# Only dict are allowed +# Key should be strings +# Value should be strings + +# """ +# pass +# if not type(dict_reprTrans) is dict: +# raise TypeError("Only dict are allowed") +# if type(list(dict_reprTrans.keys())[0]) is not str: +# raise TypeError("Key should be strings") +# if type(list(dict_reprTrans.values())[0]) is not str: +# raise TypeError("Values should be strings") + +# df_reprTrans = pd.DataFrame.from_dict( +# dict_reprTrans, orient="index", columns=["reprTranscript"] +# ) +# df_reprTrans = df_reprTrans.reset_index(level=0) +# df_reprTrans.columns = ["Gene", "reprTrans"] +# df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace( +# r"\.[1-9]", "", regex=True +# ) +# return df_reprTrans + + +# def gene_and_transcript(gtf_file: str) -> pd.DataFrame: +# """ +# This function take a .gtf file and convert it into a +# dataframe containing gene_id and their transcripts_id. +# Args: +# gtf_file(str) : path to the .gtf file + +# Returns: +# df_gtf(pd.DataFrame): pandas df containing having has columns +# gene_id and their transcripts_id. 
+# Raises: +# None +# """ +# df_gtf = read_gtf(gtf_file) +# df_gtf = df_gtf.loc[df_gtf["feature"] == "transcript"] +# df_gtf = df_gtf[["gene_id", "transcript_id"]] +# df_gtf = df_gtf.rename(columns={"gene_id": "Gene", +# "transcript_id": "Transcript"}) +# return df_gtf + + +# def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: +# """Convert tsv or csv file into a pandas dataframe + +# Args: +# input_txt (str): csv or tsv file containing transcript exp level + +# Returns: +# df_gene (str): Pandas dataframe having transcript and exp level +# as columns + +# Raises: +# None +# """ +# pass +# df_input = pd.read_csv( +# input_txt, +# sep=r"[\t,]", +# lineterminator="\n", +# names=["Transcript", "Expression_level"], +# engine="python", +# ) +# return df_input + + +# def expr_level_by_gene( +# df_exprTrasncript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame +# ) -> pd.DataFrame: +# """find the gene of each transcipt given by the expression level csv/tsv +# file, and summ expression level of all transcipts from the same gene. 
+ +# Args: +# df_exprTranscript: pandas df containing transcript and +# their exp level generated by "tsv_or_csv_to_df" function +# df_output_gtf_selection : pandas df containing genes and +# transcripts, generated by "transcripts_by_gene_inDf" function + +# Returns: +# Pandas dataframe having gene and sum of its transcript exp level + +# Raises: +# None +# """ +# pass +# df_merged = pd.merge( +# df_output_gtf_selection, df_exprTrasncript, +# how="inner", on="Transcript" +# ) +# df_sum = df_merged.groupby("Gene").sum( +# "Expression_level" +# ) +# return df_sum + + +# def match_by_gene( +# df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame +# ) -> pd.DataFrame: +# """Find matching genes bewteen the 2 args + +# Args: +# df_reprTranscript : pandas Dataframe containing genes +# and their representative transcript, generated by +# "dict_repr_trans_to_df()" +# df_expressionLevel_byGene : pandas Dataframe containing +# genes and their expression level generated by +# "transcript_by_gene_inDf()" + +# Returns: +# Pandas dataframe having representative trasncripts +# and their expression level + +# Raises: +# None +# """ +# pass +# df_merged = pd.merge( +# df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene" +# ) +# df_clean = df_merged.dropna(axis=0) +# df_clean = df_clean.loc[:, ["reprTrans", "Expression_level"]] +# return df_clean + + +# # functions to run this part of the programm +# def match_repr_transcript_expression_level( +# exprTrans: str, dict_reprTrans: dict, gtf_file: str, +# ): +# """Combine functions to replace transcripts from an exp level csv/tsv file +# with representative transcripts + +# Args: +# exprTrans (str): csv or tsv file containing transcripts +# and their expression level +# dict_reprTrans (dict) : dict of genes and their +# representative transcipt +# intemediate_file (str) : txt file containing genes, transcript +# and their expression level from the transkript_extractor function +# output_path : path 
indicating were the tsv file should be written + +# Returns: +# tsv file of representative trasncripts and their expression level + +# Raises: +# None +# """ +# df_gene_transcript = gene_and_transcript(gtf_file) +# df_exprTrans = tsv_or_csv_to_df(exprTrans) +# df_reprTrans = dict_repr_trans_to_df(dict_reprTrans) +# df_expr_level_by_gene = expr_level_by_gene( +# df_exprTrans, df_gene_transcript +# ) # error here +# df_match = match_by_gene(df_reprTrans, df_expr_level_by_gene) +# df_match.rename(columns={'reprTrans': 'id', 'Expression_level': 'level'}, +# inplace=True) +# return df_match + + +# # run the program +# if __name__ == "__main__": +# match_repr_transcript_expression_level() diff --git a/transcript_sampler/new_exe.py b/transcript_sampler/new_exe.py new file mode 100644 index 0000000000000000000000000000000000000000..d96f6136bf181b6c98078bb94a222893ab6e1ef5 --- /dev/null +++ b/transcript_sampler/new_exe.py @@ -0,0 +1,78 @@ +"""This module executes the transcript_sampler""" +import argparse +import time +import logging +logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', + level=logging.INFO, + ) +from find_reptrans import FindRepTrans # pylint: disable=E0401,C0413 +from match_reptrans_explvl import MatchReptransExplvl # pylint: disable=E0401,C0413 +from poisson_sampling import SampleTranscript # pylint: disable=E0401,C0413 + +find_rep_trans = FindRepTrans() +match_reptrs_explvl = MatchReptransExplvl() +poisson_sample = SampleTranscript() + +LOG = logging.getLogger(__name__) + + +def exe(input_gtf, input_csv, output_gtf, output_csv, transcript_nr): + """Execute transcript sampler.""" + start = time.time() + LOG.info("Started transcript sampler...") + dict_repr_trans = find_rep_trans.get_rep_trans(input_gtf) + df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( + dict_reprTrans=dict_repr_trans, exprTrans=input_csv, gtf_file=input_gtf + ) + LOG.info( + "Finding match between representative 
transcripts \ + and expression level file" + ) + LOG.info("Poisson sampling of transcripts") + poisson_sample.transcript_sampling(transcript_nr, df_repr, output_csv) + LOG.info("Output CSV file ready") + + LOG.info("Writing output GTF file") + find_rep_trans.gtf_file_writer(input_gtf, dict_repr_trans, output_gtf) + + end = time.time() + LOG.info("Script executed in %s sec", (end - start)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Transcript sampler", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input_gtf", required=True, + help="GTF file with genome annotation" + ) + parser.add_argument( + "--input_csv", required=True, + help="CSV or TSV file with transcripts and their expression level" + ) + parser.add_argument( + "--output_gtf", required=True, + help="Output path for the new GTF file of representative transcripts" + ) + parser.add_argument( + "--output_csv", required=True, + help="Output path for the new CSV file of representative transcripts \ + and their sampled number" + ) + parser.add_argument( + "--n_to_sample", required=True, + help="Total number of transcripts to sample" + ) + args = parser.parse_args() + print(args) + + exe( + args.input_gtf, + args.input_csv, + args.output_gtf, + args.output_csv, + args.n_to_sample, + ) diff --git a/transcript_sampler/poisson_sampling.py b/transcript_sampler/poisson_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..6c586aca38c5cc56664d7d8d0344a7cb5b3d6048 --- /dev/null +++ b/transcript_sampler/poisson_sampling.py @@ -0,0 +1,75 @@ +"""Sample transcripts by Poisson-sampling""" + +import pandas as pd +import numpy as np + + +class SampleTranscript: + ''' + Sample transcript + + This part of the code does Poisson sampling proportionally + to gene expression levels for each gene. 
+ + input: total transcript number (int) + csv file with gene id and gene expression levels + (columns named 'id' and 'level') + + output: csv file with gene id and count + gtf file with transcript samples + ''' + @staticmethod + def transcript_sampling(total_transcript_number, df_repr, output_csv): + """Samples transcript based on Poisson-sampling""" + total = df_repr["level"].sum() + total_transcript_number = int(total_transcript_number) + normalized = total_transcript_number / total + levels = np.random.poisson(df_repr["level"] * normalized) + transcript_numbers = pd.DataFrame({ + "id": df_repr["id"], "count": levels + }) + transcript_numbers.to_csv(output_csv, index=False) + + +# python_version = "3.7.13" +# module_list = [pd, np, argparse] +# modul_name_list = ["pd", "np", "argparse"] + +# def transcript_sampling(total_transcript_number, df_repr, output_csv): +# # df = pd.read_csv( +# # csv_file, sep="\t", lineterminator="\n", names=["id", "level"]) +# # the function match_reprTranscript_expressionLevel() now outputs a df +# df = df_repr +# levels = [] +# sums = df['level'].tolist() +# total = sum(sums) +# # I added this because writting a number in the terminal inputed a string +# total_transcript_number = int(total_transcript_number) +# normalized = total_transcript_number/total +# for expression_level in df['level']: +# poisson_sampled = np.random.poisson(expression_level*normalized) +# levels.append(poisson_sampled) + +# transcript_numbers = pd.DataFrame({'id': df['id'], 'count': levels}) +# pd.DataFrame.to_csv(transcript_numbers, output_csv) + + +# if __name__ == '__main__': +# # te.version_control(module_list,modul_name_list,python_version) +# parser = argparse.ArgumentParser( +# description="Transcript Poisson sampler, csv output", +# formatter_class=argparse.ArgumentDefaultsHelpFormatter +# ) + +# parser.add_argument("--expression_level", required=True, +# help="csv file with expression level") +# parser.add_argument("--output_csv", required=True, +# 
help="output csv file") +# parser.add_argument("--input_csv", required=True, +# help="input csv file") +# parser.add_argument("--transcript_number", required=True, +# help="total number of transcripts to sample") +# args = parser.parse_args() + +# transcript_sampling(args.transcript_number, args.input_csv, +# args.output_csv, args.transcript_number) diff --git a/scripts/transcript_sampler.ipynb b/transcript_sampler/transcript_sampler_org.ipynb similarity index 100% rename from scripts/transcript_sampler.ipynb rename to transcript_sampler/transcript_sampler_org.ipynb diff --git a/transcript_sampler/transcript_sampler_org.py b/transcript_sampler/transcript_sampler_org.py new file mode 100644 index 0000000000000000000000000000000000000000..923cbf81367972b0c39fd7631243aac39a9322b3 --- /dev/null +++ b/transcript_sampler/transcript_sampler_org.py @@ -0,0 +1,400 @@ +import pandas as pd +import numpy as np +import logging +from gtfparse import read_gtf + +LOG = logging.getLogger(__name__) + +def attributes_converter(attributes: str) -> list: + """ + This funtion converts the "unstructured" ;-seperated part of he line into + a list of identifiers and corresponding data, the structure of + which can be used ot find the data easily e.g the index of the identifier + transcript_id + 1 will give the transcript id of the current gene + Input: + attributes = str() # the unstructured part of the entry + Output: + attributes = list() # cleaned list with the characteristics described + """ + attributes = ( + attributes.replace('"', "") + .replace(";", "") + .replace("\\n", "") + .split(" ") + ) + return attributes + + +def find_in_attributes(attributes: list, look_for: str) -> str: + """ + This function finds a keyword and used that to locate the value of that + keyword e.g key = gene_id, value = 'ENSMUSG00002074970', + this works as they are next to each other in the attributes list. 
def find_in_attributes(attributes: list, look_for: str) -> str:
    """
    Return the element that directly follows ``look_for`` in ``attributes``.

    Args:
        attributes (list): Flat list in which the wanted value is stored
            right after its key.
        look_for (str): Name of the key to look for.

    Returns:
        str: The element following the first occurrence of ``look_for``,
            or "NA" (after logging a warning) when the key is absent.
    """
    if look_for in attributes:
        index = attributes.index(look_for) + 1
        return attributes[index]
    else:
        LOG.warning(f'No {look_for} in the entry, the return was set to NA')
        return "NA"


def _re_format(rep_trans_dict: dict) -> dict:
    """
    Reduce the representative-transcript records to their transcript ids.

    Args:
        rep_trans_dict (dict): {gene_id: [transcript_id,
            transcript_support_level, transcript_length]}

    Returns:
        dict: {gene_id: transcript_id}
    """
    return {gene_id: record[0] for gene_id, record in rep_trans_dict.items()}


# Feature types that are expected in the annotation but not needed to pick
# a representative transcript.
_SKIPPED_FEATURES = frozenset({
    "CDS",
    "stop_codon",
    "five_prime_utr",
    "three_prime_utr",
    "start_codon",
    "Selenocysteine",
})


def _keep_better_transcript(rep_transcripts: dict, gene_id: str, candidate: list) -> None:
    """
    Store ``candidate`` for ``gene_id`` unless the stored record is at least as good.

    A record is [transcript_id, transcript_support_level, transcript_length];
    a lower support level wins, and on equal levels the larger length wins.
    """
    stored = rep_transcripts.get(gene_id)
    if (
        stored is None
        or stored[1] > candidate[1]
        or (stored[1] == candidate[1] and stored[2] < candidate[2])
    ):
        rep_transcripts[gene_id] = candidate


def get_rep_trans(file_name: str = "test.gtf") -> dict:
    """
    Select one representative transcript per gene from a GTF annotation file.

    Transcripts are ranked by transcript support level (lower is better; a
    missing or non-numeric level counts as 100). Among transcripts of one
    gene that share the best level, the one with the largest summed exon
    length is chosen.

    Args:
        file_name (str): Path of the annotation file; it is opened as given.

    Returns:
        rep_transcripts (dict): {gene_id: transcript_id} for every gene entry
            that was read.

    Raises:
        ValueError: If an exon or transcript does not belong to the gene
            (or transcript) currently being processed, or if the feature
            type of an entry is not recognised.
    """
    rep_transcripts = {}
    cur_gID = ""
    cur_tID = ""
    # [transcript_id, transcript_support_level, transcript_length]
    cur_best_trans = ["", 100, 0]
    # Candidate with the same support level as the current best (or False),
    # and whether the exons of the current transcript can be skipped.
    # Both are initialised here so that an exon appearing before any
    # transcript is reported as a ValueError instead of failing on an
    # unassigned local.
    pot_best_trans = False
    ignor_trans = False

    with open(file_name, "r") as f:
        for line in f:
            entry = line.split("\t")

            # comment/blank lines have a single field; the listed feature
            # types are expected but not needed
            if len(entry) == 1 or entry[2] in _SKIPPED_FEATURES:
                continue

            # turns the free-form attribute column into a list
            attributes = attributes_converter(entry[8])
            feature = entry[2]

            if feature == "exon":
                if ignor_trans:
                    continue
                # attributes[1] is compared with the current gene id;
                # presumably it holds the gene_id value - confirm against
                # attributes_converter
                if cur_gID != attributes[1]:
                    LOG.error("Exon from an unexpected gene")
                    raise ValueError("Exon from an unexpected gene")
                if find_in_attributes(attributes, "transcript_id") != cur_tID:
                    LOG.error("Exon from an unexpected transcript")
                    raise ValueError("Exon from an unexpected transcript")

                # NOTE(review): GTF coordinates are 1-based and inclusive,
                # so the true exon length would be end - start + 1. The
                # original arithmetic is kept so that existing selections
                # do not change silently - confirm which is intended.
                exon_length = int(entry[4]) - int(entry[3])
                if pot_best_trans:
                    pot_best_trans[2] += exon_length
                    # the candidate takes over as soon as it is longer
                    if pot_best_trans[2] > cur_best_trans[2]:
                        cur_best_trans = pot_best_trans
                        pot_best_trans = False
                else:
                    cur_best_trans[2] += exon_length

            elif feature == "transcript":
                if cur_gID != attributes[1]:
                    LOG.error("Transcript from an unexpected gene")
                    raise ValueError("Transcript from an unexpected gene")

                cur_tID = find_in_attributes(attributes, "transcript_id")
                t_supp_lvl = find_in_attributes(attributes, "transcript_support_level")
                # "NA" and any other non-numeric level rank last
                t_supp_lvl = int(t_supp_lvl) if t_supp_lvl.isdigit() else 100

                if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == "":
                    cur_best_trans = [cur_tID, t_supp_lvl, 0]
                    pot_best_trans = False
                    ignor_trans = False
                elif t_supp_lvl == cur_best_trans[1]:
                    pot_best_trans = [cur_tID, t_supp_lvl, 0]
                    # the exons of this candidate must be counted even if
                    # the previous transcript was being ignored
                    ignor_trans = False
                else:
                    ignor_trans = True

            elif feature == "gene":
                # the previous gene is finished: record its best transcript
                _keep_better_transcript(rep_transcripts, cur_gID, cur_best_trans)
                cur_gID = attributes[1]
                cur_best_trans = ["", 100, 0]

            else:
                LOG.error("This entry could not be identified")
                raise ValueError("This entry could not be identified")

    # adding the final gene to the dictionary
    _keep_better_transcript(rep_transcripts, cur_gID, cur_best_trans)

    # drop the placeholder stored under the initial empty gene id
    rep_transcripts.pop("", None)
    return _re_format(rep_transcripts)


def _test():
    """
    Run get_rep_trans on "test.gtf" and print whether the result matches
    the expected gene-to-transcript mapping.
    """
    file_name = "test.gtf"
    rt = get_rep_trans(file_name)
    expected_result = {
        "ENSG00000160072": "ENST00000472194",
        "ENSG00000234396": "ENST00000442483",
        "ENSG00000225972": "ENST00000416931",
        "ENSG00000224315": "ENST00000428803",
        "ENSG00000198744": "ENST00000416718",
        "ENSG00000279928": "ENST00000624431",
        "ENSG00000228037": "ENST00000424215",
        "ENSG00000142611": "ENST00000378391",
    }
    if rt != expected_result:
        print("The test fail due to not yieding the same results")
        print("The results the program got\n", rt)
        print("The expected results\n", expected_result)
    else:
        print("The test was succsesfull")


def gtf_file_writer(original_file: str, rep_transcript_dict: dict, output_file: str):
    """
    Write a GTF file restricted to the representative transcripts.

    Every gene line of ``original_file`` is copied. Any other line is copied
    only when its transcript_id is the representative transcript of the most
    recently read gene. Lines starting with "#" and lines with fewer than
    nine tab-separated fields are skipped.

    Args:
        original_file (str): Path of the GTF file to read.
        rep_transcript_dict (dict): {gene_id: transcript_id}
        output_file (str): Path of the GTF file to write.
    """
    output = []
    # stays None until the first gene line, so records that precede any
    # gene are dropped instead of failing on an unassigned local
    gene_id = None

    with open(original_file, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue

            entry = line.split("\t")
            # blank or truncated lines have no attribute column
            if len(entry) < 9:
                continue

            attributes = attributes_converter(entry[8])

            if entry[2] == "gene":
                gene_id = find_in_attributes(attributes, "gene_id")
                output.append(line)
            else:
                transcript_id = find_in_attributes(attributes, "transcript_id")
                if (
                    gene_id in rep_transcript_dict
                    and rep_transcript_dict[gene_id] == transcript_id
                ):
                    output.append(line)

    with open(output_file, "w") as last_file:
        last_file.writelines(output)


def gtf_to_df(gtf_file: str) -> pd.DataFrame:
    """
    Read a .gtf file and return the gene id of every transcript entry.

    Args:
        gtf_file (str): Path to the .gtf file.

    Returns:
        df_gtf (pd.DataFrame): Pandas DataFrame with the columns 'Gene' and
            'Transcript', one row per entry whose feature is "transcript".
    """
    df_all = read_gtf(gtf_file,).to_pandas()
    df_transcripts = df_all[df_all["feature"] == "transcript"]
    return df_transcripts[["gene_id", "transcript_id"]].rename(
        columns={"gene_id": "Gene", "transcript_id": "Transcript"}
    )


def dict_reprTrans_to_df(dict_reprTrans: "dict[str, str]") -> pd.DataFrame:
    """
    Convert a dictionary of genes and their representative transcript into a DataFrame.

    A trailing version suffix (a dot followed by digits, e.g. ".12") is
    removed from every transcript id.

    Args:
        dict_reprTrans (dict): {'geneA': 'transcriptA', ...}

    Returns:
        Pandas DataFrame with 'Gene' and 'reprTrans' as columns.

    Raises:
        TypeError: Only dictionaries are allowed.
        TypeError: Keys should be strings.
        TypeError: Values should be strings.
    """
    if not isinstance(dict_reprTrans, dict):
        LOG.error("Only dictionaries are allowed")
        raise TypeError("Only dictionaries are allowed")
    if not all(isinstance(key, str) for key in dict_reprTrans):
        LOG.error("Keys should be strings")
        raise TypeError("Keys should be strings")
    if not all(isinstance(value, str) for value in dict_reprTrans.values()):
        LOG.error("Values should be strings")
        raise TypeError("Values should be strings")

    df_reprTrans = pd.DataFrame.from_dict(
        dict_reprTrans, orient="index", columns=["reprTranscript"]
    )
    df_reprTrans = df_reprTrans.reset_index()
    df_reprTrans.columns = ["Gene", "reprTrans"]
    # anchored and multi-digit, so "X.12" becomes "X" and not "X2"
    df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(
        r"\.\d+$", "", regex=True
    )

    return df_reprTrans


def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
    """
    Convert a TSV or CSV file into a pandas DataFrame.

    Fields may be separated by a tab or a comma; the file is read without a
    header row.

    Args:
        input_txt (str): TSV or CSV file containing transcript expression levels.

    Returns:
        df_input (pd.DataFrame): Pandas DataFrame with 'Transcript' and
            'Expression_level' as columns.
    """
    return pd.read_csv(
        input_txt,
        sep=r"[\t,]",
        lineterminator="\n",
        names=["Transcript", "Expression_level"],
        engine="python",
    )


def exprLevel_byGene(
    df_exprTranscript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame
) -> pd.DataFrame:
    """
    Sum the expression levels of all transcripts that belong to the same gene.

    Transcripts that appear in only one of the two DataFrames are dropped
    (inner join on 'Transcript').

    Args:
        df_exprTranscript (pd.DataFrame): Columns 'Transcript' and
            'Expression_level', as returned by "tsv_or_csv_to_df".
        df_output_gtf_selection (pd.DataFrame): Columns 'Gene' and
            'Transcript', as returned by "gtf_to_df".

    Returns:
        Pandas DataFrame with 'Gene' and the summed 'Expression_level'.
    """
    df_merged = pd.merge(
        df_output_gtf_selection, df_exprTranscript, how="inner", on="Transcript"
    )
    return df_merged.groupby("Gene")["Expression_level"].sum().reset_index()


def match_byGene(
    df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame
) -> pd.DataFrame:
    """
    Attach the per-gene expression level to each representative transcript.

    Genes that appear in only one of the two DataFrames are dropped
    (inner join on 'Gene').

    Args:
        df_reprTranscript (pd.DataFrame): Columns 'Gene' and 'reprTrans',
            as returned by "dict_reprTrans_to_df".
        df_expressionLevel_byGene (pd.DataFrame): Columns 'Gene' and
            'Expression_level', as returned by "exprLevel_byGene".

    Returns:
        Pandas DataFrame with the columns 'reprTrans' and 'Expression_level'.
    """
    df_merged = pd.merge(
        df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene"
    )
    return df_merged.loc[:, ["reprTrans", "Expression_level"]]


# functions to run this part of the program


def match_reprTranscript_expressionLevel(
    exprTrans: str, dict_reprTrans: dict, gtf_file: str,
):
    """
    Replace the transcripts of an expression level file with representative transcripts.

    Args:
        exprTrans (str): CSV or TSV file containing transcripts and their expression level.
        dict_reprTrans (dict): Dictionary of genes and their representative transcripts.
        gtf_file (str): Path to the GTF file.

    Returns:
        Pandas DataFrame with the columns 'id' (representative transcript)
        and 'level' (summed expression level of its gene).
    """
    df_gene_transcript = gtf_to_df(gtf_file)
    df_exprTrans = tsv_or_csv_to_df(exprTrans)
    df_reprTrans = dict_reprTrans_to_df(dict_reprTrans)
    df_exprLevel_byGene = exprLevel_byGene(df_exprTrans, df_gene_transcript)
    df_match = match_byGene(df_reprTrans, df_exprLevel_byGene)
    return df_match.rename(columns={"reprTrans": "id", "Expression_level": "level"})


def transcript_sampling(total_transcript_number, df_repr, output_csv):
    """
    Draw Poisson-distributed transcript counts and write them to a CSV file.

    The levels are scaled so that they sum to ``total_transcript_number``
    and are then used as the Poisson rates.

    Args:
        total_transcript_number: Target total number of transcripts;
            converted with int().
        df_repr (pd.DataFrame): DataFrame with the columns 'id' and 'level'.
        output_csv: Destination passed to DataFrame.to_csv; the output has
            the columns 'id' and 'count'.

    Raises:
        ValueError: If ``df_repr`` has rows but the sum of 'level' is not
            positive, because the levels cannot be normalised.
    """
    total = df_repr["level"].sum()
    total_transcript_number = int(total_transcript_number)
    if not df_repr.empty and not total > 0:
        raise ValueError(
            "Cannot sample transcripts: the total expression level must be positive"
        )
    # an empty input has nothing to scale; avoid dividing by a zero total
    normalized = total_transcript_number / total if total > 0 else 0.0
    levels = np.random.poisson(df_repr["level"] * normalized)
    transcript_numbers = pd.DataFrame({"id": df_repr["id"], "count": levels})
    transcript_numbers.to_csv(output_csv, index=False)