From 1569975c75b472fc0debdb0eadd28e46b5585a14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 19 Sep 2023 11:38:10 +0200
Subject: [PATCH] feat: add and update tests, ci, env

---
 .gitlab-ci.yml                               |   6 +-
 environment.yml                              |  24 ++++
 requirements.txt                             |   4 +-
 setup.py                                     |   2 +-
 tests/__init__.py                            |   1 +
 tests/{input_files => inputs}/expression.csv |   0
 tests/{input_files => inputs}/test.gtf       |   0
 tests/test_functions.py                      |   4 +-
 tests/test_match_reptrans_explvl.py          | 137 +++++--------------
 9 files changed, 67 insertions(+), 111 deletions(-)
 create mode 100644 environment.yml
 rename tests/{input_files => inputs}/expression.csv (100%)
 rename tests/{input_files => inputs}/test.gtf (100%)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c4402dd..e34cd6f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -29,6 +29,6 @@ lint-test-job:   # This job also runs in the test stage.
     - pip install -r requirements.txt
     - pip install -r requirements_dev.txt
     - pip install -e .
-    # - flake8 --docstring-convention google transcript_sampler/ tests/
-    # - pylint transcript_sampler/ tests/
-    # - mypy transcript_sampler/
\ No newline at end of file
+    - flake8 --docstring-convention google transcript_sampler/ tests/
+    - pylint transcript_sampler/ tests/
+    - mypy transcript_sampler/ tests/
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..bb7989a
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,24 @@
+name: scrna-seq-sim
+channels:
+  - defaults
+  - bioconda
+  - conda-forge
+dependencies:
+  - argparse
+  - biopython>=1.78
+  - black
+  - coverage
+  - flake8
+  - flake8-docstrings
+  - gtfparse
+  - polars==0.16.17
+  - mypy
+  - numpy>=1.23.3
+  - pylint
+  - pytest
+  - nextflow
+  - pandas>=1.4.4
+  - pip>=20.2.3
+  - python>=3.6, <=3.10
+  - pip:
+    - -e .
diff --git a/requirements.txt b/requirements.txt
index 98d4d62..782fb4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 argparse
 biopython
 gtfparse
-numpy >= 1.23.3
-pandas >= 1.4.4
\ No newline at end of file
+numpy>=1.23.3
+pandas>=1.4.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 9542e8a..721659b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 """Set up project."""
 from pathlib import Path
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages  # type: ignore
 
 project_root_dir = Path(__file__).parent.resolve()
 with open(project_root_dir / "requirements.txt",
diff --git a/tests/__init__.py b/tests/__init__.py
index e69de29..b3d9b62 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+"""Initialize tests."""
diff --git a/tests/input_files/expression.csv b/tests/inputs/expression.csv
similarity index 100%
rename from tests/input_files/expression.csv
rename to tests/inputs/expression.csv
diff --git a/tests/input_files/test.gtf b/tests/inputs/test.gtf
similarity index 100%
rename from tests/input_files/test.gtf
rename to tests/inputs/test.gtf
diff --git a/tests/test_functions.py b/tests/test_functions.py
index a0f2f1a..a81b075 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -1,6 +1,6 @@
 """Tests functions."""
 import os
-import pandas as pd
+import pandas as pd  # type: ignore
 import numpy as np
 
 
@@ -36,7 +36,7 @@ def find_output():
         None
     """
     absolute_path = os.path.dirname(__file__)
-    test_file = "ReprTrans_ExpressionLevel.tsv"
+    test_file = "inputs/test_ref_output.tsv"
     full_path = os.path.join(absolute_path, test_file)
     return full_path
 
diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py
index abef23b..495155d 100644
--- a/tests/test_match_reptrans_explvl.py
+++ b/tests/test_match_reptrans_explvl.py
@@ -1,19 +1,17 @@
 """Tests for match representative transcript with expression level."""
 import pytest
-import pandas as pd
+import pandas as pd  # type: ignore
 import numpy as np
-from pandas.testing import assert_frame_equal
-import tests.test_functions as tFun
-from transcript_sampler.match_reptrans_explvl import \
+from pandas.testing import assert_frame_equal  # type: ignore
+from transcript_sampler.match_reptrans_explvl import (
     MatchReptransExplvl as match
+)
+import tests.test_functions as tFun
 
 
 class TestMatchReptrans:
     """Tests for match_reptrans_explvl.py."""
 
-    # def test_gtf_to_df(self):
-    # TO DO
-
     def test_dict_repr_trans_to_df(self):
         """Test dict_repr_trans_to_df() function.
 
@@ -44,7 +42,7 @@ class TestMatchReptrans:
         assert tFun.duplicated_rows(data_frame).empty, \
             "at least one row is duplicated"
         assert tFun.na_value(data_frame) == 0, \
-            "at least one row contain NA values"
+            "at least one row contains NA values"
 
     def test_tsv_or_csv_to_df(self):
         """Test tsv_or_csv_to_df() function.
@@ -65,9 +63,9 @@ class TestMatchReptrans:
         assert tFun.column_d_type(df_tsv) == datatype, \
             "at least one column has the wrong datatype"
         assert tFun.duplicated_rows(df_tsv).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
         assert tFun.na_value(df_tsv) == 0, \
-            "at least one row contain NA values"
+            "at least one row contains NA values"
         assert assert_frame_equal(df_tsv, df_csv) is None, \
             "csv and tsv import doesn't match"
 
@@ -75,7 +73,7 @@ class TestMatchReptrans:
         """Test expr_level_by_gene() function.
 
         This function test if the function expr_level_by_gene can find
-        the gene of each transcipt given by the expression level csv/tsv
+        the gene of each transcript given by the expression level csv/tsv
         file and sum their expression level
         """
         path_tsv = tFun.find_path(r"test_gene_exprL")
@@ -104,9 +102,9 @@ class TestMatchReptrans:
         assert tFun.column_d_type(df_exp_lvl) == datatype, \
             "at least one column has the wrong datatype"
         assert tFun.duplicated_rows(df_exp_lvl).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
         assert tFun.na_value(df_exp_lvl) == 0, \
-            "at least one row contain NA values "
+            "at least one row contains NA values"
         assert tFun.duplicated_index(df_exp_lvl).empty, \
             "at least one index element is duplicated"
 
@@ -151,9 +149,9 @@ class TestMatchReptrans:
         assert tFun.column_d_type(df_match) == datatype, \
             "at least one column has the wrong datatype"
         assert tFun.duplicated_rows(df_match).empty, \
-            "at least one row are duplicated "
+            "at least one row is duplicated"
         assert tFun.na_value(df_match) == 0, \
-            "at least one row contain NA values "
+            "at least one row contains NA values"
         assert tFun.duplicated_index(df_match).empty, \
             "at least one index element is duplicated"
 
@@ -164,104 +162,37 @@ class TestMatchReptrans:
         function match_repr_transcript_expression_level().
         """
         input_path = tFun.find_path("test_gene_exprL")
-        intermediate_path = tFun.find_path_intermediate_file()
+        gtf_file = tFun.find_path("test.gtf")
         dict_repr_test = {
             'ENSMUSG00000079415': 'ENSMUST00000112933',
             "ENSMUSG00000024691": "ENSMUST00000025595",
             "ENSMUSG00000063683": "ENSMUST00000119960"}
 
-        match.match_repr_transcript_expression_level(
-            self,
-            exprTrans=input_path,
-            dict_reprTrans=dict_repr_test,
-            gtf_file=intermediate_path
-            )
+        # Create an instance of MatchReptransExplvl
+        match_instance = match()
+
+        df_result = match_instance.match_repr_transcript_expression_level(
+            expr_trans=input_path,
+            dict_repr_trans=dict_repr_test,
+            gtf_file=gtf_file
+        )
 
         ref_path = tFun.find_path("test_ref_output.tsv")
         output_path = tFun.find_output()
 
-        with open(ref_path, 'r', encoding="utf-8") as t1,\
-            open(output_path, 'r', encoding="utf-8") as t2,\
-            open(input_path, 'r', encoding="utf-8") as t3:
-            fileRef = t1.readlines()
-            fileOutput = t2.readlines()
-            fileInput = t3.readlines()
+        with open(
+            ref_path, 'r', encoding="utf-8"
+        ) as test_file_1, open(
+            output_path, 'r', encoding="utf-8"
+        ) as test_file_2:
+            file_ref = test_file_1.readlines()
+            file_output = test_file_2.readlines()
 
         assert (
-            sorted(fileRef) == sorted(fileOutput)
-            ), "the output does't match the expected tsv file"
+            sorted(file_ref) == sorted(file_output)
+        ), "the output doesn't match the expected tsv file"
         assert (
-            sorted(fileRef) != sorted(fileInput)
-            ), "the output does't match the expected tsv file"
-
-    def test_txt_to_dict(self):
-        """This function tests if txt is convertod to dict"""
-        path = tFun.find_path("test_dict_repr_trans.txt")
-        dico = match.txt_to_dict(path)
-        dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
-                    "ENSMUSG00000024691": "ENSMUST00000025595",
-                    "ENSMUSG00000063683": "ENSMUST00000119960"}
-        assert dico == dict_test
-
-    def test_transcripts_by_gene_inDf():
-        """
-        This function test if a dataframe generated from
-        the intermediate file is converted in another
-        dataframe without the support level column.
-        """
-        path = tFun.find_path_intermediate_file()
-        df = repr.import_gtfSelection_to_df(path)
-        df_gene = match.transcripts_by_gene_inDf(df)
-        datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
-        assert tFun.column_number(df_gene) == (
-            2, "number of columns is not equal to 2")
-        assert tFun.column_d_type(df_gene) == (
-            datatype, "at least one column has the wrong datatype")
-        assert tFun.duplicated_rows(df_gene).empty, \
-            "at least one row are duplicated"
-        assert tFun.na_value(df_gene) == 0, \
-            "at least one row contain NA values"
-
-    def test_output_tsv():
-        """Test if a tsv file is generated from a df in the right format."""
-        dict_repr_test = {
-            'ENSMUSG00000079415': 'ENSMUST00000112933',
-            "ENSMUSG00000024691": "ENSMUST00000025595",
-            "ENSMUSG00000063683": "ENSMUST00000119960"}
-        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
-
-        path_tsv = tFun.find_path(r"test_gene_exprL")
-        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
-        path_intermediate = tFun.find_path_intermediate_file()
-        df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
-        df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
-
-        df_exp_lvl = match.expr_level_by_gene(
-            df_tsv_exp_lvl, df_gene_transcript
+            sorted(file_ref) != sorted(
+                df_result.to_csv(index=False).splitlines()
             )
-
-        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
-
-        match.output_tsv(df_match)
-
-        ref_path = tFun.find_path("test_ref_output.tsv")
-        output_path = tFun.find_output()
-
-        with open(ref_path, 'r', encoding="utf-8") as t1, open(output_path, 'r') as t2:
-            fileRef = t1.readlines()
-            fileOutput = t2.readlines()
-
-        assert (
-            sorted(fileRef) == sorted(fileOutput)
-            ), "the output does't match the expected tsv file"
-
-# test_dict_repr_trans_to_df()
-# test_txt_to_dict()
-# test_transcripts_by_gene_inDf()
-# test_tsv_or_csv_to_df()
-# test_expr_level_by_gene()
-# test_match_by_gene()
-# test_output_tsv()
-# test_match_repr_transcript_expression_level()
-
-# print("test_match is done ! No error was found")
+        ), "the output unexpectedly matches the reference tsv file"
-- 
GitLab