feat: add and update tests, ci, env

1569975c · Mate Balajti · 5a912247 · 1569975c · 1569975c · 1569975c
Commit 1569975c authored 1 year ago by Mate Balajti
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -29,6 +29,6 @@ lint-test-job:   # This job also runs in the test stage.
    - pip install -r requirements.txt
    - pip install -r requirements_dev.txt
    - pip install -e .
-    # - flake8 --docstring-convention google transcript_sampler/ tests/
+    - flake8 --docstring-convention google transcript_sampler/ tests/
-    # - pylint transcript_sampler/ tests/
+    - pylint transcript_sampler/ tests/
-    # - mypy transcript_sampler/
+    - mypy transcript_sampler/ tests/
\ No newline at end of file
--- a/environment.yml
+++ b/environment.yml
+name: scrna-seq-sim
+channels:
+  - defaults
+  - bioconda
+  - conda-forge
+dependencies:
+  - argparse
+  - biopython>=1.78
+  - black
+  - coverage
+  - flake8
+  - flake8-docstrings
+  - gtfparse
+  - polars==0.16.17
+  - mypy
+  - numpy>=1.23.3
+  - pylint
+  - pytest
+  - nextflow
+  - pandas>=1.4.4
+  - pip>=20.2.3
+  - python>=3.6, <=3.10
+  - pip:
+    - -e .
--- a/requirements.txt
+++ b/requirements.txt
 argparse
 biopython
 gtfparse
-numpy >= 1.23.3
+numpy>=1.23.3
-pandas >= 1.4.4
+pandas>=1.4.4
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
 """Set up project."""
 from pathlib import Path
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages  # type: ignore
 project_root_dir = Path(__file__).parent.resolve()
 with open(project_root_dir / "requirements.txt",

--- a/tests/__init__.py
+++ b/tests/__init__.py
+"""Initialize tests."""
--- a/tests/input_files/expression.csv
+++ b/tests/input_files/expression.csv
--- a/tests/input_files/test.gtf
+++ b/tests/input_files/test.gtf
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
 """Tests functions."""
 import os
-import pandas as pd
+import pandas as pd  # type: ignore
 import numpy as np
@@ -36,7 +36,7 @@ def find_output():
        None
    """
    absolute_path = os.path.dirname(__file__)
-    test_file = "ReprTrans_ExpressionLevel.tsv"
+    test_file = "inputs/test_ref_output.tsv"
    full_path = os.path.join(absolute_path, test_file)
    return full_path

--- a/tests/test_match_reptrans_explvl.py
+++ b/tests/test_match_reptrans_explvl.py
 """Tests for match representative transcript with expression level."""
 import pytest
-import pandas as pd
+import pandas as pd  # type: ignore
 import numpy as np
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal  # type: ignore
-import tests.test_functions as tFun
+from transcript_sampler.match_reptrans_explvl import (
-from transcript_sampler.match_reptrans_explvl import \
    MatchReptransExplvl as match
+)
+import tests.test_functions as tFun
 class TestMatchReptrans:
    """Tests for match_reptrans_explvl.py."""
-    # def test_gtf_to_df(self):
-    # TO DO
    def test_dict_repr_trans_to_df(self):
        """Test dict_repr_trans_to_df() function.
@@ -44,7 +42,7 @@ class TestMatchReptrans:
        assert tFun.duplicated_rows(data_frame).empty, \
            "at least one row is duplicated"
        assert tFun.na_value(data_frame) == 0, \
-            "at least one row contain NA values"
+            "at least one row contains NA values"
    def test_tsv_or_csv_to_df(self):
        """Test tsv_or_csv_to_df() function.
@@ -65,9 +63,9 @@ class TestMatchReptrans:
        assert tFun.column_d_type(df_tsv) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(df_tsv).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
        assert tFun.na_value(df_tsv) == 0, \
-            "at least one row contain NA values"
+            "at least one row contains NA values"
        assert assert_frame_equal(df_tsv, df_csv) is None, \
            "csv and tsv import doesn't match"
@@ -75,7 +73,7 @@ class TestMatchReptrans:
        """Test expr_level_by_gene() function.
        This function tests if the function expr_level_by_gene can find
-        the gene of each transcipt given by the expression level csv/tsv
+        the gene of each transcript given by the expression level csv/tsv
        file and sum their expression levels
        """
        path_tsv = tFun.find_path(r"test_gene_exprL")
@@ -104,9 +102,9 @@ class TestMatchReptrans:
        assert tFun.column_d_type(df_exp_lvl) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(df_exp_lvl).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
        assert tFun.na_value(df_exp_lvl) == 0, \
-            "at least one row contain NA values "
+            "at least one row contains NA values"
        assert tFun.duplicated_index(df_exp_lvl).empty, \
            "at least one index element is duplicated"
@@ -151,9 +149,9 @@ class TestMatchReptrans:
        assert tFun.column_d_type(df_match) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(df_match).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
        assert tFun.na_value(df_match) == 0, \
-            "at least one row contain NA values "
+            "at least one row contains NA values"
        assert tFun.duplicated_index(df_match).empty, \
            "at least one index element is duplicated"
@@ -164,104 +162,37 @@ class TestMatchReptrans:
        function match_repr_transcript_expression_level().
        """
        input_path = tFun.find_path("test_gene_exprL")
-        intermediate_path = tFun.find_path_intermediate_file()
+        gtf_file = tFun.find_path("test.gtf")
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            "ENSMUSG00000024691": "ENSMUST00000025595",
            "ENSMUSG00000063683": "ENSMUST00000119960"}
-        match.match_repr_transcript_expression_level(
+        # Create an instance of MatchReptransExplvl
-            self,
+        match_instance = match()
-            exprTrans=input_path,
-            dict_reprTrans=dict_repr_test,
+        df_result = match_instance.match_repr_transcript_expression_level(
-            gtf_file=intermediate_path
+            expr_trans=input_path,
-            )
+            dict_repr_trans=dict_repr_test,
+            gtf_file=gtf_file
+        )
        ref_path = tFun.find_path("test_ref_output.tsv")
        output_path = tFun.find_output()
-        with open(ref_path, 'r', encoding="utf-8") as t1,\
+        with open(
-            open(output_path, 'r', encoding="utf-8") as t2,\
+            ref_path, 'r', encoding="utf-8"
-            open(input_path, 'r', encoding="utf-8") as t3:
+        ) as test_file_1, open(
-            fileRef = t1.readlines()
+            output_path, 'r', encoding="utf-8"
-            fileOutput = t2.readlines()
+        ) as test_file_2:
-            fileInput = t3.readlines()
+            file_ref = test_file_1.readlines()
+            file_output = test_file_2.readlines()
        assert (
-            sorted(fileRef) == sorted(fileOutput)
+            sorted(file_ref) == sorted(file_output)
-            ), "the output does't match the expected tsv file"
+        ), "the output doesn't match the expected tsv file"
        assert (
-            sorted(fileRef) != sorted(fileInput)
+            sorted(file_ref) != sorted(
-            ), "the output does't match the expected tsv file"
+                df_result.to_csv(index=False).splitlines()
-    def test_txt_to_dict(self):
-        """This function tests if txt is convertod to dict"""
-        path = tFun.find_path("test_dict_repr_trans.txt")
-        dico = match.txt_to_dict(path)
-        dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
-                    "ENSMUSG00000024691": "ENSMUST00000025595",
-                    "ENSMUSG00000063683": "ENSMUST00000119960"}
-        assert dico == dict_test
-    def test_transcripts_by_gene_inDf():
-        """
-        This function test if a dataframe generated from
-        the intermediate file is converted in another
-        dataframe without the support level column.
-        """
-        path = tFun.find_path_intermediate_file()
-        df = repr.import_gtfSelection_to_df(path)
-        df_gene = match.transcripts_by_gene_inDf(df)
-        datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
-        assert tFun.column_number(df_gene) == (
-            2, "number of columns is not equal to 2")
-        assert tFun.column_d_type(df_gene) == (
-            datatype, "at least one column has the wrong datatype")
-        assert tFun.duplicated_rows(df_gene).empty, \
-            "at least one row are duplicated"
-        assert tFun.na_value(df_gene) == 0, \
-            "at least one row contain NA values"
-    def test_output_tsv():
-        """Test if a tsv file is generated from a df in the right format."""
-        dict_repr_test = {
-            'ENSMUSG00000079415': 'ENSMUST00000112933',
-            "ENSMUSG00000024691": "ENSMUST00000025595",
-            "ENSMUSG00000063683": "ENSMUST00000119960"}
-        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
-        path_tsv = tFun.find_path(r"test_gene_exprL")
-        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
-        path_intermediate = tFun.find_path_intermediate_file()
-        df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
-        df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
-        df_exp_lvl = match.expr_level_by_gene(
-            df_tsv_exp_lvl, df_gene_transcript
            )
+        ), "the output doesn't match the expected tsv file"
-        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
-        match.output_tsv(df_match)
-        ref_path = tFun.find_path("test_ref_output.tsv")
-        output_path = tFun.find_output()
-        with open(ref_path, 'r', encoding="utf-8") as t1, open(output_path, 'r') as t2:
-            fileRef = t1.readlines()
-            fileOutput = t2.readlines()
-        assert (
-            sorted(fileRef) == sorted(fileOutput)
-            ), "the output does't match the expected tsv file"
-# test_dict_repr_trans_to_df()
-# test_txt_to_dict()
-# test_transcripts_by_gene_inDf()
-# test_tsv_or_csv_to_df()
-# test_expr_level_by_gene()
-# test_match_by_gene()
-# test_output_tsv()
-# test_match_repr_transcript_expression_level()
-# print("test_match is done ! No error was found")