# test_main.py

"""Tests for main module"""

import numpy as np
import pandas as pd
import pytest
from tsg.main import Gtf, TranscriptGenerator, dict_to_str, str_to_dict


class TestFreeTextParsing:
    """Test if free text dictionary is correctly parsed."""

    def test_str2dict(self):
        """Expect ``key "value";`` pairs to be parsed into a flat dict of strings."""
        res = str_to_dict(
            'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
        )

        assert res == {
            "gene_id": "GENE2",
            "transcript_id": "TRANSCRIPT2",
            "exon_number": "1",
            "exon_id": "EXON1",
        }

    def test_dict2str(self):
        """Expect a dict to be serialized as ``key "value";`` pairs in insertion order."""
        res = dict_to_str(
            {
                "gene_id": "GENE2",
                "transcript_id": "TRANSCRIPT2",
                "exon_number": "1",
                "exon_id": "EXON1",
            }
        )

        # On failure pytest already shows both operands of the comparison,
        # so no debugging print is needed here.
        assert (
            res
            == 'gene_id "GENE2"; transcript_id "TRANSCRIPT2"; exon_number "1"; exon_id "EXON1";'
        )


class TestGtf:
    """Test if Gtf class works correctly."""

    # Column names expected in ``original_columns`` right after reading a file,
    # i.e. before the free-text column has been parsed.
    cols = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "free_text",
    ]

    def test_init(self):
        """A freshly read file is unparsed and exposes no free-text columns."""
        annotations = Gtf()
        # NOTE(review): the path is relative, so this presumably requires
        # pytest to be started from the repository root - confirm.
        annotations.read_file("tests/resources/Annotation1.gtf")

        assert not annotations.parsed
        assert annotations.original_columns == self.cols
        assert annotations.free_text_columns == []

    def test_parsed(self):
        """After parsing, free-text keys are listed separately from the original columns."""
        annotations = Gtf()
        annotations.read_file("tests/resources/Annotation1.gtf")
        annotations.parse_free_text()

        assert annotations.parsed
        # Column order is not checked here, only membership.
        assert set(annotations.free_text_columns) == {
            "gene_id",
            "transcript_id",
            "exon_number",
            "exon_id",
            "transcript_support_level",
        }
        # "free_text" itself is expected to be gone once it has been parsed.
        assert set(annotations.original_columns) == {
            "seqname",
            "source",
            "feature",
            "start",
            "end",
            "score",
            "strand",
            "frame",
        }


class TestTranscriptGenerator:
    """Test if TranscriptGenerator class works correctly."""

    # NOTE(review): not referenced by any test visible in this class; kept in
    # case other code relies on it.
    cols = [
        "start",
        "end",
        "strand",
        "transcript_id",
    ]

    # Three exons on the "+" strand, used as the valid input frame.
    df1 = pd.DataFrame(
        {
            "start": [1, 50, 80],
            "end": [20, 70, 100],
            "strand": ["+", "+", "+"],
            "exon_id": ["EXON1", "EXON2", "EXON3"],
        }
    )
    # Frame with the right columns but no rows, used as invalid input.
    df2 = pd.DataFrame(columns=["start", "end", "strand"])

    def test_init(self):
        """The strand is taken over from the input frame."""
        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.05)

        assert transcripts.strand == "+"

    def test_init_2(self):
        """Constructing from an empty frame raises AssertionError."""
        with pytest.raises(AssertionError):
            TranscriptGenerator("TRANSCRIPT2", 3, self.df2, 0.05)

    def test_init_3(self):
        """Passing 0 as the second argument raises AssertionError."""
        with pytest.raises(AssertionError):
            TranscriptGenerator("TRANSCRIPT1", 0, self.df1, 0.05)

    def test_inclusions(self):
        """The inclusion array has shape (3, 3) for second argument 3 and three exons."""
        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
        res = transcripts._get_inclusions()

        assert res.shape == (3, 3)

    def test_unique_inclusions(self):
        """``_get_unique_inclusions`` runs and returns exactly three values."""
        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
        # NOTE(review): there are no assertions on the returned values; the
        # unpacking only fails if the call does not yield three items.
        _res1, _res2, _res3 = transcripts._get_unique_inclusions()

    def test_get_df(self):
        """Flagging the second exon renames it and extends its end to 79."""
        inclusions = [False, True, False]
        # 79 is one position before the start (80) of the following exon.
        expected_end = pd.Series([20, 79, 100], name="end")
        transcript_id = "TRANSCRIPT1_1"

        transcripts = TranscriptGenerator("TRANSCRIPT1", 3, self.df1, 0.5)
        res = transcripts._get_df(inclusions, transcript_id)

        assert res["transcript_id"].unique().item() == "TRANSCRIPT1_1"
        assert res["strand"].unique().item() == "+"
        assert res["exon_id"].tolist() == ["EXON1", "EXON2_1", "EXON3"]
        # Start coordinates must be unchanged from the input frame.
        pd.testing.assert_series_equal(res["start"], self.df1["start"])
        pd.testing.assert_series_equal(res["end"], expected_end)