From 2c8e894b9ea6847c7600080b3d13de6a886822e8 Mon Sep 17 00:00:00 2001
From: Larissa Glass <larissa.glass@unibas.ch>
Date: Tue, 8 Nov 2022 10:41:02 +0000
Subject: [PATCH] Resolve "Tests"

---
 README.md                        |  11 ++-
 tests/__init__.py                |   1 +
 tests/resources/Annotation1.gtf  |   5 ++
 tests/resources/Annotations2.gtf |  12 +++
 tests/resources/Transcript1.csv  |   7 ++
 tests/resources/Transcript2.tsv  |   7 ++
 tests/test_main.py               | 135 +++++++++++++++++++++++++++++++
 7 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/resources/Annotation1.gtf
 create mode 100644 tests/resources/Annotations2.gtf
 create mode 100644 tests/resources/Transcript1.csv
 create mode 100644 tests/resources/Transcript2.tsv
 create mode 100644 tests/test_main.py

diff --git a/README.md b/README.md
index 0a814af..d29a0a8 100644
--- a/README.md
+++ b/README.md
@@ -39,4 +39,13 @@ To generate the sampled transcripts, run
 transcript-generator --transcripts <transcripts_file> --annotation <annotations_file> --prob_inclusion=<probability_inclusion>
 ```
 
-where the transcripts file should be csv-formatted, the annotation file gtf-formatted and the inclusion probability for introns a float in the range [0,1].
\ No newline at end of file
+where the transcripts file should be CSV-formatted, the annotation file GTF-formatted, and the inclusion probability for introns a float in the range [0,1].
+
+
+# Development
+
+To run all tests, make sure your environment matches the `environment.yml` file, then run
+
+```
+pytest tests
+```
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..8c26b48
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for the package tsg."""
diff --git a/tests/resources/Annotation1.gtf b/tests/resources/Annotation1.gtf
new file mode 100644
index 0000000..c794aaa
--- /dev/null
+++ b/tests/resources/Annotation1.gtf
@@ -0,0 +1,5 @@
+1	havana	gene	1	100	.	+	.	gene_id "GENE1";
+1	havana	transcript	1	100	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; transcript_support_level "1";
+1	havana	exon	1	20	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "1"; exon_id "ENSE00002234944"; transcript_support_level "1";
+1	havana	exon	50	70	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "2"; exon_id "ENSE00003582793"; transcript_support_level "1";
+1	havana	exon	80	100	.	+	.	gene_id "GENE1"; transcript_id "TRANSCRIPT1"; exon_number "3"; exon_id "ENSE00002312635"; transcript_support_level "1";
diff --git a/tests/resources/Annotations2.gtf b/tests/resources/Annotations2.gtf
new file mode 100644
index 0000000..d7f079e
--- /dev/null
+++ b/tests/resources/Annotations2.gtf
@@ -0,0 +1,12 @@
+1	havana	transcript	1000	2000	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; transcript_support_level "1";
+1	havana	exon	1980	2000	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "ENSE00001890219"; transcript_support_level "1";
+1	havana	exon	1900	1950	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "2"; exon_id "ENSE00003507205"; transcript_support_level "1";
+1	havana	exon	1800	1850	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "3"; exon_id "ENSE00003477500"; transcript_support_level "1";
+1	havana	exon	1700	1750	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "4"; exon_id "ENSE00003565697"; transcript_support_level "1";
+1	havana	exon	1600	1650	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "5"; exon_id "ENSE00003475637"; transcript_support_level "1";
+1	havana	exon	1500	1550	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "6"; exon_id "ENSE00003502542"; transcript_support_level "1";
+1	havana	exon	1400	1450	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "7"; exon_id "ENSE00003553898"; transcript_support_level "1";
+1	havana	exon	1300	1350	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "8"; exon_id "ENSE00003621279"; transcript_support_level "1";
+1	havana	exon	1200	1250	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "9"; exon_id "ENSE00002030414"; transcript_support_level "1";
+1	havana	exon	1100	1150	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "10"; exon_id "ENSE00001935574"; transcript_support_level "1";
+1	havana	exon	1000	1050	.	-	.	gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "11"; exon_id "ENSE00001843071"; transcript_support_level "1";
diff --git a/tests/resources/Transcript1.csv b/tests/resources/Transcript1.csv
new file mode 100644
index 0000000..6635236
--- /dev/null
+++ b/tests/resources/Transcript1.csv
@@ -0,0 +1,7 @@
+TRANSCRIPT1,92
+TRANSCRIPT2,13
+TRANSCRIPT3,73
+TRANSCRIPT4,83
+TRANSCRIPT5,32
+TRANSCRIPT6,136
+TRANSCRIPT7,36
\ No newline at end of file
diff --git a/tests/resources/Transcript2.tsv b/tests/resources/Transcript2.tsv
new file mode 100644
index 0000000..06b14c4
--- /dev/null
+++ b/tests/resources/Transcript2.tsv
@@ -0,0 +1,7 @@
+TRANSCRIPT1	92
+TRANSCRIPT2	13
+TRANSCRIPT3	73
+TRANSCRIPT4	83
+TRANSCRIPT5	32
+TRANSCRIPT6	136
+TRANSCRIPT7	36
\ No newline at end of file
diff --git a/tests/test_main.py b/tests/test_main.py
new file mode 100644
index 0000000..48b9b55
--- /dev/null
+++ b/tests/test_main.py
@@ -0,0 +1,135 @@
+"""Tests for the main module."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from tsg.main import Gtf, TranscriptGenerator, dict_to_str, str_to_dict
+
+
+class TestFreeTextParsing:
+    """Test if free text dictionary is correctly parsed."""
+
+    def test_str2dict(self):  # free-text string -> dict
+        res = str_to_dict(
+            'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
+        )
+
+        assert res == {  # every value is expected as a string, including exon_number
+            "gene_id": "GENE2",
+            "transcript_id": "TRANSCRIPT2",
+            "exon_number": "1",
+            "exon_id": "EXON1",
+        }
+
+    def test_dict2str(self):  # reverse direction: the same dict must give back the same string
+        res = dict_to_str(
+            {
+                "gene_id": "GENE2",
+                "transcript_id": "TRANSCRIPT2",
+                "exon_number": "1",
+                "exon_id": "EXON1",
+            }
+        )
+        print(res)  # NOTE(review): debug output, not needed for the assertion below
+        assert (
+            res
+            == 'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
+        )
+
+
+class TestGtf:
+    "Test if Gtf class works correctly."
+    cols = [  # the nine column names expected directly after reading a file
+        "seqname",
+        "source",
+        "feature",
+        "start",
+        "end",
+        "score",
+        "strand",
+        "frame",
+        "free_text",
+    ]
+
+    def test_init(self):  # state after read_file() only; parse_free_text() is not called
+        annotations = Gtf()
+        annotations.read_file("tests/resources/Annotation1.gtf")  # path is relative to the repository root
+
+        assert annotations.parsed == False
+        assert annotations.original_columns == self.cols
+        assert annotations.free_text_columns == []  # nothing extracted yet
+
+    def test_parsed(self):  # state after read_file() followed by parse_free_text()
+        annotations = Gtf()
+        annotations.read_file("tests/resources/Annotation1.gtf")
+        annotations.parse_free_text()
+
+        assert annotations.parsed == True
+        assert set(annotations.free_text_columns) == set(  # compared as sets, so column order is not checked
+            [
+                "gene_id",
+                "transcript_id",
+                "exon_number",
+                "exon_id",
+                "transcript_support_level",
+            ]
+        )
+        assert set(annotations.original_columns) == set(  # "free_text" is no longer expected here
+            ["seqname", "source", "feature", "start", "end", "score", "strand", "frame"]
+        )
+
+
+class TestTranscriptGenerator:  # tests for TranscriptGenerator construction and its private helpers
+    cols = [  # NOTE(review): not referenced by any test in this class
+        "start",
+        "end",
+        "strand",
+        "transcript_id",
+    ]
+
+    df1 = pd.DataFrame(  # three "+"-strand exons; same coordinates as tests/resources/Annotation1.gtf
+        {
+            "start": [1, 50, 80],
+            "end": [20, 70, 100],
+            "strand": ["+", "+", "+"],
+            "exon_id": ["EXON1", "EXON2", "EXON3"],
+        }
+    )
+    df2 = pd.DataFrame(columns=["start", "end", "strand"])  # column headers only, no rows
+
+    def test_init(self):
+        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.05)
+
+        assert transcripts.strand == "+"  # matches the strand column of df1
+
+    def test_init_2(self):  # an exon table without rows must raise AssertionError
+        with pytest.raises(AssertionError):
+            transcripts = TranscriptGenerator("TRANSCRIPT2", 3, self.df2, 0.05)
+
+    def test_init_3(self):  # 0 as second argument must raise AssertionError (presumably the transcript count - confirm)
+        with pytest.raises(AssertionError):
+            transcripts = TranscriptGenerator("TRANSCRIPT1", 0, self.df1, 0.05)
+
+    def test_inclusions(self):
+        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
+        res = transcripts._get_inclusions()
+
+        assert res.shape == (3, 3)  # only the shape is checked, not the values
+
+    def test_unique_inclusions(self):  # NOTE(review): no assertions; only checks that three values unpack
+        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
+        res1, res2, res3 = transcripts._get_unique_inclusions()
+
+    def test_get_df(self):
+        inclusions = [False, True, False]  # only the second entry is switched on
+        expected_end = pd.Series([20, 79, 100], name="end")  # exon 2 end moves from 70 to 79, one before exon 3's start (80)
+        transcript_id = "TRANSCRIPT1_1"
+
+        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
+        res = transcripts._get_df(inclusions, transcript_id)
+
+        assert res["transcript_id"].unique().item() == "TRANSCRIPT1_1"
+        assert res["strand"].unique().item() == "+"
+        assert res["exon_id"].tolist() == ["EXON1", "EXON2_1", "EXON3"]  # only the extended exon gets a "_1" suffix
+        pd.testing.assert_series_equal(res["start"], self.df1["start"])  # start coordinates are unchanged
+        pd.testing.assert_series_equal(res["end"], expected_end)
-- 
GitLab