From 1569975c75b472fc0debdb0eadd28e46b5585a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch> Date: Tue, 19 Sep 2023 11:38:10 +0200 Subject: [PATCH] feat: add and update tests, ci, env --- .gitlab-ci.yml | 6 +- environment.yml | 24 ++++ requirements.txt | 4 +- setup.py | 2 +- tests/__init__.py | 1 + tests/{input_files => inputs}/expression.csv | 0 tests/{input_files => inputs}/test.gtf | 0 tests/test_functions.py | 4 +- tests/test_match_reptrans_explvl.py | 137 +++++-------------- 9 files changed, 67 insertions(+), 111 deletions(-) create mode 100644 environment.yml rename tests/{input_files => inputs}/expression.csv (100%) rename tests/{input_files => inputs}/test.gtf (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c4402dd..e34cd6f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,6 +29,6 @@ lint-test-job: # This job also runs in the test stage. - pip install -r requirements.txt - pip install -r requirements_dev.txt - pip install -e . - # - flake8 --docstring-convention google transcript_sampler/ tests/ - # - pylint transcript_sampler/ tests/ - # - mypy transcript_sampler/ \ No newline at end of file + - flake8 --docstring-convention google transcript_sampler/ tests/ + - pylint transcript_sampler/ tests/ + - mypy transcript_sampler/ tests/ \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..bb7989a --- /dev/null +++ b/environment.yml @@ -0,0 +1,24 @@ +name: scrna-seq-sim +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - argparse + - biopython>=1.78 + - black + - coverage + - flake8 + - flake8-docstrings + - gtfparse + - polars==0.16.17 + - mypy + - numpy>=1.23.3 + - pylint + - pytest + - nextflow + - pandas>=1.4.4 + - pip>=20.2.3 + - python>=3.6, <=3.10 + - pip: + - -e . diff --git a/requirements.txt b/requirements.txt index 98d4d62..782fb4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ argparse biopython gtfparse -numpy >= 1.23.3 -pandas >= 1.4.4 \ No newline at end of file +numpy>=1.23.3 +pandas>=1.4.4 \ No newline at end of file diff --git a/setup.py b/setup.py index 9542e8a..721659b 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ """Set up project.""" from pathlib import Path -from setuptools import setup, find_packages +from setuptools import setup, find_packages # type: ignore project_root_dir = Path(__file__).parent.resolve() with open(project_root_dir / "requirements.txt", diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..b3d9b62 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Initialize tests.""" diff --git a/tests/input_files/expression.csv b/tests/inputs/expression.csv similarity index 100% rename from tests/input_files/expression.csv rename to tests/inputs/expression.csv diff --git a/tests/input_files/test.gtf b/tests/inputs/test.gtf similarity index 100% rename from tests/input_files/test.gtf rename to tests/inputs/test.gtf diff --git a/tests/test_functions.py b/tests/test_functions.py index a0f2f1a..a81b075 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,6 +1,6 @@ """Tests functions.""" import os -import pandas as pd +import pandas as pd # type: ignore import numpy as np @@ -36,7 +36,7 @@ def find_output(): None """ absolute_path = os.path.dirname(__file__) - test_file = "ReprTrans_ExpressionLevel.tsv" + test_file = "inputs/test_ref_output.tsv" full_path = os.path.join(absolute_path, test_file) return full_path diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py index abef23b..495155d 100644 --- a/tests/test_match_reptrans_explvl.py +++ b/tests/test_match_reptrans_explvl.py @@ -1,19 +1,17 @@ """Tests for match representative transcript with expression level.""" import pytest -import pandas as pd +import pandas as pd # type: ignore import numpy as np -from pandas.testing import assert_frame_equal -import tests.test_functions as tFun -from transcript_sampler.match_reptrans_explvl import \ +from pandas.testing import assert_frame_equal # type: ignore +from transcript_sampler.match_reptrans_explvl import ( MatchReptransExplvl as match +) +import tests.test_functions as tFun class TestMatchReptrans: """Tests for match_reptrans_explvl.py.""" - # def test_gtf_to_df(self): - # TO DO - def test_dict_repr_trans_to_df(self): """Test dict_repr_trans_to_df() function. @@ -44,7 +42,7 @@ class TestMatchReptrans: assert tFun.duplicated_rows(data_frame).empty, \ "at least one row is duplicated" assert tFun.na_value(data_frame) == 0, \ - "at least one row contain NA values" + "at least one row contains NA values" def test_tsv_or_csv_to_df(self): """Test tsv_or_csv_to_df() function. @@ -65,9 +63,9 @@ class TestMatchReptrans: assert tFun.column_d_type(df_tsv) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(df_tsv).empty, \ - "at least one row are duplicated " + "at least one row is duplicated" assert tFun.na_value(df_tsv) == 0, \ - "at least one row contain NA values" + "at least one row contains NA values" assert assert_frame_equal(df_tsv, df_csv) is None, \ "csv and tsv import doesn't match" @@ -75,7 +73,7 @@ class TestMatchReptrans: """Test expr_level_by_gene() function. This function test if the function expr_level_by_gene can find - the gene of each transcipt given by the expression level csv/tsv + the gene of each transcript given by the expression level csv/tsv file and sum their expression level """ path_tsv = tFun.find_path(r"test_gene_exprL") @@ -104,9 +102,9 @@ class TestMatchReptrans: assert tFun.column_d_type(df_exp_lvl) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(df_exp_lvl).empty, \ - "at least one row are duplicated " + "at least one row is duplicated" assert tFun.na_value(df_exp_lvl) == 0, \ - "at least one row contain NA values " + "at least one row contains NA values" assert tFun.duplicated_index(df_exp_lvl).empty, \ "at least one index element is duplicated" @@ -151,9 +149,9 @@ class TestMatchReptrans: assert tFun.column_d_type(df_match) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(df_match).empty, \ - "at least one row are duplicated " + "at least one row is duplicated" assert tFun.na_value(df_match) == 0, \ - "at least one row contain NA values " + "at least one row contains NA values" assert tFun.duplicated_index(df_match).empty, \ "at least one index element is duplicated" @@ -164,104 +162,37 @@ class TestMatchReptrans: function match_repr_transcript_expression_level(). """ input_path = tFun.find_path("test_gene_exprL") - intermediate_path = tFun.find_path_intermediate_file() + gtf_file = tFun.find_path("test.gtf") dict_repr_test = { 'ENSMUSG00000079415': 'ENSMUST00000112933', "ENSMUSG00000024691": "ENSMUST00000025595", "ENSMUSG00000063683": "ENSMUST00000119960"} - match.match_repr_transcript_expression_level( - self, - exprTrans=input_path, - dict_reprTrans=dict_repr_test, - gtf_file=intermediate_path - ) + # Create an instance of MatchReptransExplvl + match_instance = match() + + df_result = match_instance.match_repr_transcript_expression_level( + expr_trans=input_path, + dict_repr_trans=dict_repr_test, + gtf_file=gtf_file + ) ref_path = tFun.find_path("test_ref_output.tsv") output_path = tFun.find_output() - with open(ref_path, 'r', encoding="utf-8") as t1,\ - open(output_path, 'r', encoding="utf-8") as t2,\ - open(input_path, 'r', encoding="utf-8") as t3: - fileRef = t1.readlines() - fileOutput = t2.readlines() - fileInput = t3.readlines() + with open( + ref_path, 'r', encoding="utf-8" + ) as test_file_1, open( + output_path, 'r', encoding="utf-8" + ) as test_file_2: + file_ref = test_file_1.readlines() + file_output = test_file_2.readlines() assert ( - sorted(fileRef) == sorted(fileOutput) - ), "the output does't match the expected tsv file" + sorted(file_ref) == sorted(file_output) + ), "the output doesn't match the expected tsv file" assert ( - sorted(fileRef) != sorted(fileInput) - ), "the output does't match the expected tsv file" - - def test_txt_to_dict(self): - """This function tests if txt is convertod to dict""" - path = tFun.find_path("test_dict_repr_trans.txt") - dico = match.txt_to_dict(path) - dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', - "ENSMUSG00000024691": "ENSMUST00000025595", - "ENSMUSG00000063683": "ENSMUST00000119960"} - assert dico == dict_test - - def test_transcripts_by_gene_inDf(): - """ - This function test if a dataframe generated from - the intermediate file is converted in another - dataframe without the support level column. - """ - path = tFun.find_path_intermediate_file() - df = repr.import_gtfSelection_to_df(path) - df_gene = match.transcripts_by_gene_inDf(df) - datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} - assert tFun.column_number(df_gene) == ( - 2, "number of columns is not equal to 2") - assert tFun.column_d_type(df_gene) == ( - datatype, "at least one column has the wrong datatype") - assert tFun.duplicated_rows(df_gene).empty, \ - "at least one row are duplicated" - assert tFun.na_value(df_gene) == 0, \ - "at least one row contain NA values" - - def test_output_tsv(): - """Test if a tsv file is generated from a df in the right format.""" - dict_repr_test = { - 'ENSMUSG00000079415': 'ENSMUST00000112933', - "ENSMUSG00000024691": "ENSMUST00000025595", - "ENSMUSG00000063683": "ENSMUST00000119960"} - df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test) - - path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) - path_intermediate = tFun.find_path_intermediate_file() - df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) - df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - - df_exp_lvl = match.expr_level_by_gene( - df_tsv_exp_lvl, df_gene_transcript + sorted(file_ref) != sorted( + df_result.to_csv(index=False).splitlines() ) - - df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl) - - match.output_tsv(df_match) - - ref_path = tFun.find_path("test_ref_output.tsv") - output_path = tFun.find_output() - - with open(ref_path, 'r', encoding="utf-8") as t1, open(output_path, 'r') as t2: - fileRef = t1.readlines() - fileOutput = t2.readlines() - - assert ( - sorted(fileRef) == sorted(fileOutput) - ), "the output does't match the expected tsv file" - -# test_dict_repr_trans_to_df() -# test_txt_to_dict() -# test_transcripts_by_gene_inDf() -# test_tsv_or_csv_to_df() -# test_expr_level_by_gene() -# test_match_by_gene() -# test_output_tsv() -# test_match_repr_transcript_expression_level() - -# print("test_match is done ! No error was found") + ), "the output doesn't match the expected tsv file" -- GitLab