From 2c8e894b9ea6847c7600080b3d13de6a886822e8 Mon Sep 17 00:00:00 2001 From: Larissa Glass <larissa.glass@unibas.ch> Date: Tue, 8 Nov 2022 10:41:02 +0000 Subject: [PATCH] Resolve "Tests" --- README.md | 11 ++- tests/__init__.py | 1 + tests/resources/Annotation1.gtf | 5 ++ tests/resources/Annotations2.gtf | 12 +++ tests/resources/Transcript1.csv | 7 ++ tests/resources/Transcript2.tsv | 7 ++ tests/test_main.py | 135 +++++++++++++++++++++++++++++++ 7 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 tests/__init__.py create mode 100644 tests/resources/Annotation1.gtf create mode 100644 tests/resources/Annotations2.gtf create mode 100644 tests/resources/Transcript1.csv create mode 100644 tests/resources/Transcript2.tsv create mode 100644 tests/test_main.py diff --git a/README.md b/README.md index 0a814af..d29a0a8 100644 --- a/README.md +++ b/README.md @@ -39,4 +39,13 @@ To generate the sampled transcripts, run transcript-generator --transcripts <transcripts_file> --annotation <annotations_file> --prob_inclusion=<probability_inclusion> ``` -where the transcripts file should be csv-formatted, the annotation file gtf-formatted and the inclusion probability for introns a float in the range [0,1]. \ No newline at end of file +where the transcripts file should be csv-formatted, the annotation file gtf-formatted and the inclusion probability for introns a float in the range [0,1]. + + +# Development + +To perform all tests, make sure your environment corresponds to the `environment.yml` file and run + +``` +pytest tests +``` \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8c26b48 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the package tsg.""" diff --git a/tests/resources/Annotation1.gtf b/tests/resources/Annotation1.gtf new file mode 100644 index 0000000..c794aaa --- /dev/null +++ b/tests/resources/Annotation1.gtf @@ -0,0 +1,5 @@ +1 havana gene 1 100 . + . gene_id "GENE1"; +1 havana transcript 1 100 . + . gene_id "GENE1"; transcript_id "TRANSCRIPT1"; transcript_support_level "1"; +1 havana exon 1 20 . + . gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "1"; exon_id "ENSE00002234944"; transcript_support_level "1"; +1 havana exon 50 70 . + . gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "2"; exon_id "ENSE00003582793"; transcript_support_level "1"; +1 havana exon 80 100 . + . gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "3"; exon_id "ENSE00002312635"; transcript_support_level "1"; diff --git a/tests/resources/Annotations2.gtf b/tests/resources/Annotations2.gtf new file mode 100644 index 0000000..d7f079e --- /dev/null +++ b/tests/resources/Annotations2.gtf @@ -0,0 +1,12 @@ +1 havana transcript 1000 2000 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; transcript_support_level "1"; +1 havana exon 1980 2000 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "ENSE00001890219"; transcript_support_level "1"; +1 havana exon 1900 1950 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "2"; exon_id "ENSE00003507205"; transcript_support_level "1"; +1 havana exon 1800 1850 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "3"; exon_id "ENSE00003477500"; transcript_support_level "1"; +1 havana exon 1700 1750 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "4"; exon_id "ENSE00003565697"; transcript_support_level "1"; +1 havana exon 1600 1650 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "5"; exon_id "ENSE00003475637"; transcript_support_level "1"; +1 havana exon 1500 1550 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "6"; exon_id "ENSE00003502542"; transcript_support_level "1"; +1 havana exon 1400 1450 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "7"; exon_id "ENSE00003553898"; transcript_support_level "1"; +1 havana exon 1300 1350 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "8"; exon_id "ENSE00003621279"; transcript_support_level "1"; +1 havana exon 1200 1250 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "9"; exon_id "ENSE00002030414"; transcript_support_level "1"; +1 havana exon 1100 1150 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "10"; exon_id "ENSE00001935574"; transcript_support_level "1"; +1 havana exon 1000 1050 . - . gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "11"; exon_id "ENSE00001843071"; transcript_support_level "1"; diff --git a/tests/resources/Transcript1.csv b/tests/resources/Transcript1.csv new file mode 100644 index 0000000..6635236 --- /dev/null +++ b/tests/resources/Transcript1.csv @@ -0,0 +1,7 @@ +TRANSCRIPT1,92 +TRANSCRIPT2,13 +TRANSCRIPT3,73 +TRANSCRIPT4,83 +TRANSCRIPT5,32 +TRANSCRIPT6,136 +TRANSCRIPT7,36 \ No newline at end of file diff --git a/tests/resources/Transcript2.tsv b/tests/resources/Transcript2.tsv new file mode 100644 index 0000000..06b14c4 --- /dev/null +++ b/tests/resources/Transcript2.tsv @@ -0,0 +1,7 @@ +TRANSCRIPT1 92 +TRANSCRIPT2 13 +TRANSCRIPT3 73 +TRANSCRIPT4 83 +TRANSCRIPT5 32 +TRANSCRIPT6 136 +TRANSCRIPT7 36 \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..48b9b55 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,135 @@ +"""Tests for main module""" + +import numpy as np +import pandas as pd +import pytest +from tsg.main import Gtf, TranscriptGenerator, dict_to_str, str_to_dict + + +class TestFreeTextParsing: + """Test if free text dictionary is correctly parsed.""" + + def test_str2dict(self): + res = str_to_dict( + 'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";' + ) + + assert res == { + "gene_id": "GENE2", + "transcript_id": "TRANSCRIPT2", + "exon_number": "1", + "exon_id": "EXON1", + } + + def test_dict2str(self): + res = dict_to_str( + { + "gene_id": "GENE2", + "transcript_id": "TRANSCRIPT2", + "exon_number": "1", + "exon_id": "EXON1", + } + ) + print(res) + assert ( + res + == 'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";' + ) + + +class TestGtf: + "Test if Gtf class works correctly." + cols = [ + "seqname", + "source", + "feature", + "start", + "end", + "score", + "strand", + "frame", + "free_text", + ] + + def test_init(self): + annotations = Gtf() + annotations.read_file("tests/resources/Annotation1.gtf") + + assert annotations.parsed == False + assert annotations.original_columns == self.cols + assert annotations.free_text_columns == [] + + def test_parsed(self): + annotations = Gtf() + annotations.read_file("tests/resources/Annotation1.gtf") + annotations.parse_free_text() + + assert annotations.parsed == True + assert set(annotations.free_text_columns) == set( + [ + "gene_id", + "transcript_id", + "exon_number", + "exon_id", + "transcript_support_level", + ] + ) + assert set(annotations.original_columns) == set( + ["seqname", "source", "feature", "start", "end", "score", "strand", "frame"] + ) + + +class TestTranscriptGenerator: + cols = [ + "start", + "end", + "strand", + "transcript_id", + ] + + df1 = pd.DataFrame( + { + "start": [1, 50, 80], + "end": [20, 70, 100], + "strand": ["+", "+", "+"], + "exon_id": ["EXON1", "EXON2", "EXON3"], + } + ) + df2 = pd.DataFrame(columns=["start", "end", "strand"]) + + def test_init(self): + transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.05) + + assert transcripts.strand == "+" + + def test_init_2(self): + with pytest.raises(AssertionError): + transcripts = TranscriptGenerator("TRANSCRIPT2", 3, self.df2, 0.05) + + def test_init_3(self): + with pytest.raises(AssertionError): + transcripts = TranscriptGenerator("TRANSCRIPT1", 0, self.df1, 0.05) + + def test_inclusions(self): + transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5) + res = transcripts._get_inclusions() + + assert res.shape == (3, 3) + + def test_unique_inclusions(self): + transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5) + res1, res2, res3 = transcripts._get_unique_inclusions() + + def test_get_df(self): + inclusions = [False, True, False] + expected_end = pd.Series([20, 79, 100], name="end") + transcript_id = "TRANSCRIPT1_1" + + transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5) + res = transcripts._get_df(inclusions, transcript_id) + + assert res["transcript_id"].unique().item() == "TRANSCRIPT1_1" + assert res["strand"].unique().item() == "+" + assert res["exon_id"].tolist() == ["EXON1", "EXON2_1", "EXON3"] + pd.testing.assert_series_equal(res["start"], self.df1["start"]) + pd.testing.assert_series_equal(res["end"], expected_end) -- GitLab