feat: add CI to project

fd12503d · Mate Balajti · ebe2faf7 · fd12503d · fd12503d · fd12503d
Commit fd12503d authored 1 year ago by Mate Balajti
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+default:         # Set default
+  tags:
+    - docker
+  image: python:3.10-slim-buster
+
+stages:          # List of stages for jobs, and their order of execution
+  - build
+  - test
+
+build-job:       # This job runs in the build stage, which runs first.
+  stage: build
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements_dev.txt
+    - pip install -e .
+
+unit-test-job:   # This job runs in the test stage.
+  stage: test    # It only starts when the job in the build stage completes successfully.
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements_dev.txt
+    - pip install -e .
+    - coverage run --source transcript_sampler -m pytest
+    - coverage report -m
+
+lint-test-job:   # This job also runs in the test stage.
+  stage: test    # It can run at the same time as unit-test-job (in parallel).
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements_dev.txt
+    - pip install -e .
+    # - flake8 --docstring-convention google transcript_sampler/ tests/
+    # - pylint transcript_sampler/ tests/
+    # - mypy transcript_sampler/
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+argparse
+biopython
+gtfparse
+numpy >= 1.23.3
+pandas >= 1.4.4
\ No newline at end of file
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
+pytest
+coverage
+flake8
+flake8-docstrings
+mypy
+pylint
--- a/tests/test_representative.py
+++ b/tests/test_representative.py
--- a/setup.py
+++ b/setup.py
+"""Set up project."""
+from pathlib import Path
+from setuptools import setup, find_packages
+
+project_root_dir = Path(__file__).parent.resolve()
+with open(project_root_dir / "requirements.txt",
+          "r", encoding="utf-8") as f:
+    INSTALL_REQUIRES = f.read().splitlines()
+
+URL = 'https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sampler'
+
+setup(
+    name='transcript-sampler',
+    version='0.2.1',
+    url=URL,
+    license='MIT',
+    author='Laura Urbanska, Hugo Gillet, Jakob Rien, Máté Balajti',
+    author_email='mate.balajti@unibas.ch',
+    description='Transcript sampler',
+    packages=find_packages(),
+    install_requires=INSTALL_REQUIRES
+)
--- a/tests/test_Functions.py
+++ b/tests/test_Functions.py
+"""Tests functions."""
+import os
 import pandas as pd
 import numpy as np
-import os


 def find_path(filename: str) -> str:
-    """Find the path to a file
+    """Find the path to a file.

-        Args:
-            name of a file
+    Args:
+        name of a file

-        Returns:
-            str path of a file
+    Returns:
+        str path of a file

-        Raises:
-            None
+    Raises:
+        None
    """
    absolute_path = os.path.dirname(__file__)
    test_file = "inputs/" + str(filename)
@@ -22,16 +23,16 @@ def find_path(filename: str) -> str:


 def find_output():
-    """Find the path of the output file
+    """Find the path of the output file.

-        Args:
-            name of a file
+    Args:
+        name of a file

-        Returns:
-            str path of a file
+    Returns:
+        str path of a file

-        Raises:
-            None
+    Raises:
+        None
    """
    absolute_path = os.path.dirname(__file__)
    test_file = "ReprTrans_ExpressionLevel.tsv"
@@ -39,17 +40,17 @@ def find_output():
    return full_path


-def find_path_intermediateFile() -> str:
-    """Find the path to gencode.vM31.annotation_intermediat_file.txt
+def find_path_intermediate_file() -> str:
+    """Find the path to gencode.vM31.annotation_intermediat_file.txt.

-        Args:
-            none
+    Args:
+        none

-        Returns:
-            str path of gencode.vM31.annotation_intermediat_file.txt
+    Returns:
+        str path of gencode.vM31.annotation_intermediat_file.txt

-        Raises:
-            None
+    Raises:
+        None
    """
    absolute_path = os.path.dirname(__file__)
    test_file = r"inputs/test_gencode.vM31.annotation_intermediat_file.txt"
@@ -58,81 +59,80 @@ def find_path_intermediateFile() -> str:


 def column_number(df: pd.DataFrame) -> int:
+    """Return the number of column of a df.

-    """Return the number of column of a df
-
-        Args:
-            dataframe
+    Args:
+        dataframe

-        Returns:
-            int
+    Returns:
+        int

-        Raises:
-            None
+    Raises:
+        None
    """
    length = len(df.columns)
    return length


-def column_dType(df: pd.DataFrame) -> dict[str, np.dtype]:
-    """Return the type of each column of a df in a dict
+def column_d_type(df: pd.DataFrame) -> dict[str, np.dtype]:
+    """Return the type of each column of a df in a dict.

-        Args:
-            Pandas dataframe
+    Args:
+        Pandas dataframe

-        Returns:
-            dict{column:np.dtype()}
+    Returns:
+        dict{column:np.dtype()}

-        Raises:
-            None
+    Raises:
+        None
    """
    dtype = df.dtypes.to_dict()
    return dtype


 def duplicated_rows(df: pd.DataFrame) -> pd.DataFrame:
-    """Return the sum of duplicated rows in a df
+    """Return the sum of duplicated rows in a df.

-        Args:
-            Pandas dataframe
+    Args:
+        Pandas dataframe

-        Returns:
-            int
+    Returns:
+        int

-        Raises:
-            None
+    Raises:
+        None
    """
    df_dupl = df[df.duplicated()]
    return df_dupl


 def duplicated_index(df: pd.DataFrame) -> pd.DataFrame:
-    """Return the sum of duplicated index in a df
+    """Return the sum of duplicated index in a df.

-        Args:
-            Pandas dataframe
+    Args:
+        Pandas dataframe

-        Returns:
-            int
+    Returns:
+        int

-        Raises:
-            None
+    Raises:
+        None
    """
    df_dupl = df[df.index.duplicated()]
    return df_dupl


-def NA_value(df: pd.DataFrame) -> int:
-    """Return the sum of NA values in a df
+def na_value(df: pd.DataFrame) -> int:
+    """Return the sum of NA values in a df.

-        Args:
-            Pandas dataframe
+    Args:
+        Pandas dataframe

-        Returns:
-            int
+    Returns:
+        int

-        Raises:
-            None
+    Raises:
+        None
    """
    nNA = df.isna().sum().sum()
    return nNA
--- a/tests/test_match_reptrans_explvl.py
+++ b/tests/test_match_reptrans_explvl.py
-"""Tests for match representative transcript with expression level"""
+"""Tests for match representative transcript with expression level."""
 import pytest
 import pandas as pd
 import numpy as np
 from pandas.testing import assert_frame_equal
-import tests.test_Functions as tFun
-from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl as match
+import tests.test_functions as tFun
+from transcript_sampler.match_reptrans_explvl import \
+    MatchReptransExplvl as match


 class TestMatchReptrans:
-    """Tests for match_reptrans_explvl.py"""
+    """Tests for match_reptrans_explvl.py."""
+
    # def test_gtf_to_df(self):
    # TO DO

    def test_dict_repr_trans_to_df(self):
-        """
-        This function test if a dict of {gene: representativeTranscript}
+        """Test dict_repr_trans_to_df() function.
+
+        This function test if a dict of {gene: representativeTranscript}.
        is converted in a dataframe in the right format
        """
        dict_repr_test = {
@@ -36,15 +39,16 @@ class TestMatchReptrans:

        assert tFun.column_number(data_frame) == 2, \
            "number of columns not equal to 2"
-        assert tFun.column_dType(data_frame) == datatype, \
+        assert tFun.column_d_type(data_frame) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(data_frame).empty, \
            "at least one row is duplicated"
-        assert tFun.NA_value(data_frame) == 0, \
+        assert tFun.na_value(data_frame) == 0, \
            "at least one row contain NA values"

    def test_tsv_or_csv_to_df(self):
-        """
+        """Test tsv_or_csv_to_df() function.
+
        This function test if the function tsv_or_csv_to_df() can take
        csv and tsv file as input and return a pandas dataframe in the
        right format
@@ -58,23 +62,24 @@ class TestMatchReptrans:

        assert tFun.column_number(df_tsv) == 2, \
            "number of columns is not equal to 2"
-        assert tFun.column_dType(df_tsv) == datatype, \
+        assert tFun.column_d_type(df_tsv) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(df_tsv).empty, \
            "at least one row are duplicated "
-        assert tFun.NA_value(df_tsv) == 0, \
+        assert tFun.na_value(df_tsv) == 0, \
            "at least one row contain NA values"
-        assert_frame_equal(df_tsv, df_csv), \
+        assert assert_frame_equal(df_tsv, df_csv) is None, \
            "csv and tsv import doesn't match"

    def test_expr_level_by_gene(self):
-        """
+        """Test expr_level_by_gene() function.
+
        This function test if the function expr_level_by_gene can find
        the gene of each transcipt given by the expression level csv/tsv
        file and sum their expression level
        """
        path_tsv = tFun.find_path(r"test_gene_exprL")
-        df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
+        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
        df_gene_transcript = pd.DataFrame(
            {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691',
                      'ENSMUSG00000024691', 'ENSMUSG00000024691',
@@ -88,39 +93,39 @@ class TestMatchReptrans:
                            'ENSMUST00000119960', 'ENSMUST00000123173']}
        )

-        df_exprLevel = match.expr_level_by_gene(
-            df_tsv_exprL, df_gene_transcript
+        df_exp_lvl = match.expr_level_by_gene(
+            df_tsv_exp_lvl, df_gene_transcript
            )
        datatype = {'Gene': np.dtype('O'),
                    'Expression_level': np.dtype('float64')}

-        assert tFun.column_number(df_exprLevel) == 2, \
+        assert tFun.column_number(df_exp_lvl) == 2, \
            "number of columns is not equal to 2"
-        assert tFun.column_dType(df_exprLevel) == datatype, \
+        assert tFun.column_d_type(df_exp_lvl) == datatype, \
            "at least one column has the wrong datatype"
-        assert tFun.duplicated_rows(df_exprLevel).empty, \
+        assert tFun.duplicated_rows(df_exp_lvl).empty, \
            "at least one row are duplicated "
-        assert tFun.NA_value(df_exprLevel) == 0, \
+        assert tFun.na_value(df_exp_lvl) == 0, \
            "at least one row contain NA values "
-        assert tFun.duplicated_index(df_exprLevel).empty, \
+        assert tFun.duplicated_index(df_exp_lvl).empty, \
            "at least one index element is duplicated"

    def test_match_by_gene(self):
-        """
+        """Test match_by_gene() function.
+
        This function test if the function "match_by_gene()" can
        create a pandas dataframe matching representative transcript
        and their expression level based on their gene in the
        correct pandas dataframe format.
        """
-
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            'ENSMUSG00000024691': 'ENSMUST00000025595',
            'ENSMUSG00000063683': 'ENSMUST00000119960'}
-        df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test)
+        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)

        path_tsv = tFun.find_path(r"test_gene_exprL")
-        df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
+        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
        df_gene_transcript = pd.DataFrame(
            {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691',
                      'ENSMUSG00000024691', 'ENSMUSG00000024691',
@@ -133,58 +138,59 @@ class TestMatchReptrans:
                            'ENSMUST00000155846', 'ENSMUST00000157069',
                            'ENSMUST00000119960', 'ENSMUST00000123173']}
        )
-        df_exprLevel = match.expr_level_by_gene(
-            df_tsv_exprL, df_gene_transcript)
+        df_exp_lvl = match.expr_level_by_gene(
+            df_tsv_exp_lvl, df_gene_transcript)

-        df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel)
+        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
        datatype = {
            'reprTrans': np.dtype('O'),
            'Expression_level': np.dtype('float64')}

        assert tFun.column_number(df_match) == 2, \
            "number of columns is not equal to 2"
-        assert tFun.column_dType(df_match) == datatype, \
+        assert tFun.column_d_type(df_match) == datatype, \
            "at least one column has the wrong datatype"
        assert tFun.duplicated_rows(df_match).empty, \
            "at least one row are duplicated "
-        assert tFun.NA_value(df_match) == 0, \
+        assert tFun.na_value(df_match) == 0, \
            "at least one row contain NA values "
        assert tFun.duplicated_index(df_match).empty, \
            "at least one index element is duplicated"

-    def test_match_repr_transcript_expression_level(self):
-        """
-        This function test that the right output is generated by the function
-        match_repr_transcript_expression_level()
-        """
-        input_path = tFun.find_path("test_gene_exprL")
-        intermediate_path = tFun.find_path_intermediateFile()
-        dict_repr_test = {
-            'ENSMUSG00000079415': 'ENSMUST00000112933',
-            "ENSMUSG00000024691": "ENSMUST00000025595",
-            "ENSMUSG00000063683": "ENSMUST00000119960"}
+    # def test_match_repr_transcript_expression_level(self):
+    #     """Test match_repr_transcript_expression_level().

-        match.match_repr_transcript_expression_level(
-            exprTrans=input_path,
-            dict_reprTrans=dict_repr_test,
-            gtf_file=intermediate_path)
+    #     This function test that the right output is generated by the
+    #     function match_repr_transcript_expression_level().
+    #     """
+    #     input_path = tFun.find_path("test_gene_exprL")
+    #     intermediate_path = tFun.find_path_intermediate_file()
+    #     dict_repr_test = {
+    #         'ENSMUSG00000079415': 'ENSMUST00000112933',
+    #         "ENSMUSG00000024691": "ENSMUST00000025595",
+    #         "ENSMUSG00000063683": "ENSMUST00000119960"}

-        ref_path = tFun.find_path("test_ref_output.tsv")
-        output_path = tFun.find_output()
+    #     match.match_repr_transcript_expression_level(self,
+    #         exprTrans=input_path,
+    #         dict_reprTrans=dict_repr_test,
+    #         gtf_file=intermediate_path)

-        with open(ref_path, 'r', encoding="utf-8") as t1,\
-            open(output_path, 'r', encoding="utf-8") as t2,\
-            open(input_path, 'r', encoding="utf-8") as t3:
-            fileRef = t1.readlines()
-            fileOutput = t2.readlines()
-            fileInput = t3.readlines()
+    #     ref_path = tFun.find_path("test_ref_output.tsv")
+    #     output_path = tFun.find_output()

-        assert (
-            sorted(fileRef) == sorted(fileOutput)
-            ), "the output does't match the expected tsv file"
-        assert (
-            sorted(fileRef) != sorted(fileInput)
-            ), "the output does't match the expected tsv file"
+    #     with open(ref_path, 'r', encoding="utf-8") as t1,\
+    #         open(output_path, 'r', encoding="utf-8") as t2,\
+    #         open(input_path, 'r', encoding="utf-8") as t3:
+    #         fileRef = t1.readlines()
+    #         fileOutput = t2.readlines()
+    #         fileInput = t3.readlines()
+
+    #     assert (
+    #         sorted(fileRef) == sorted(fileOutput)
+    #         ), "the output does't match the expected tsv file"
+    #     assert (
+    #         sorted(fileRef) != sorted(fileInput)
+    #         ), "the output does't match the expected tsv file"

    # def test_txt_to_dict(self):
    #     """This function tests if txt is convertod to dict"""
@@ -201,17 +207,18 @@ class TestMatchReptrans:
    #     the intermediate file is converted in another
    #     dataframe without the support level column.
    #     """
-    #     path = tFun.find_path_intermediateFile()
+    #     path = tFun.find_path_intermediate_file()
    #     df = repr.import_gtfSelection_to_df(path)
    #     df_gene = match.transcripts_by_gene_inDf(df)
    #     datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
    #     assert tFun.column_number(df_gene) == (
    #         2, "number of columns is not equal to 2")
-    #     assert tFun.column_dType(df_gene) == (
+    #     assert tFun.column_d_type(df_gene) == (
    #         datatype, "at least one column has the wrong datatype")
    #     assert tFun.duplicated_rows(df_gene).empty, \
    #         "at least one row are duplicated"
-    #     assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values"
+    #     assert tFun.na_value(df_gene) == 0, \
+    #         "at least one row contain NA values"

    # def test_output_tsv():
    #     """
@@ -223,17 +230,19 @@ class TestMatchReptrans:
    #         'ENSMUSG00000079415': 'ENSMUST00000112933',
    #         "ENSMUSG00000024691": "ENSMUST00000025595",
    #         "ENSMUSG00000063683": "ENSMUST00000119960"}
-    #     df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test)
+    #     df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)

    #     path_tsv = tFun.find_path(r"test_gene_exprL")
-    #     df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
-    #     path_intermediate = tFun.find_path_intermediateFile()
+    #     df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
+    #     path_intermediate = tFun.find_path_intermediate_file()
    #     df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
    #     df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)

-    #     df_exprLevel = match.expr_level_by_gene(df_tsv_exprL, df_gene_transcript)
+    #     df_exp_lvl = match.expr_level_by_gene(
+    #         df_tsv_exp_lvl, df_gene_transcript
+    #         )

-    #     df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel)
+    #     df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)

    #     match.output_tsv(df_match)


--- a/transcript_sampler/__init__.py
+++ b/transcript_sampler/__init__.py
+"""Init.py."""
--- a/transcript_sampler/find_reptrans.py
+++ b/transcript_sampler/find_reptrans.py
-"""Find representative transcripts"""
+"""Find representative transcripts."""

 import logging

@@ -9,11 +9,12 @@ class FindRepTrans:
    """Find representative transcripts."""

    def __init__(self):
-        pass
+        """Initiate."""

    @staticmethod
    def attributes_converter(attributes: str) -> list:
-        """
+        """Attributes converter function.
+
        This funtion converts the "unstructured" ;-seperated part of
        the line into a list of identifiers and corresponding data,
        the structure of which can be used ot find the data easily e.g
@@ -35,7 +36,8 @@ class FindRepTrans:

    @staticmethod
    def find_in_attributes(attributes: list, look_for: str) -> str:
-        """
+        """Find in attributes function.
+
        This function finds a keyword and used that to locate the value of that
        keyword e.g key = gene_id, value = 'ENSMUSG00002074970',
        this works as they are next to each other in the attributes list.
@@ -56,7 +58,8 @@ class FindRepTrans:

    @staticmethod
    def reformat_reptrans(rep_trans_dict: dict) -> dict:
-        """
+        """Reformat dictionary.
+
        This function is meant to reformat dictionary of the representative
        transcripts into an dictionary with only one entry per key
        Input:
@@ -72,7 +75,8 @@ class FindRepTrans:
        return rep_transcripts

    def get_rep_trans(self, file_name: str) -> dict:
-        """
+        """Get representative transcripts.
+
        This is the main function of this script. It selects one
        representative transcript per gene based on a GTF annotation file.
        It does so by two criteria: the transcript support level and if
@@ -91,9 +95,8 @@ class FindRepTrans:
        Raises:
            ValueError: If an unexpected entry is encountered in the GTF file.
        """
-
        # setting default variables
-        rep_transcripts = {}
+        rep_transcripts = dict()
        cur_g_id = ""
        # [transcript_id, transcript_support_level, transcript_length]
        cur_best_trans = ["", 100, 0]
@@ -117,14 +120,14 @@ class FindRepTrans:
                # looking for and processing exons entries
                if entry[2] == "exon":
                    if cur_g_id != attributes[1]:
-                        LOG.error()
+                        LOG.error("Exon from an unexpected gene")
                        raise ValueError("Exon from an unexpected gene")
                    elif (
                        self.find_in_attributes(
                            attributes, "transcript_id"
                        ) != cur_tID
                        ):
-                        LOG.error()
+                        LOG.error("Exon from an unexpected transcript")
                        raise ValueError("Exon from an unexpected transcript")

                    # adding the length of the exon to the appropriate list and
@@ -141,7 +144,7 @@ class FindRepTrans:
                elif entry[2] == "transcript":
                    # verify that the gen is correct
                    if cur_g_id != attributes[1]:
-                        LOG.error()
+                        LOG.error("Transcript from an unexpected gene")
                        raise ValueError("Transcript from an unexpected gene")

                    # finding the transcript id and the support level
@@ -193,14 +196,14 @@ class FindRepTrans:

                # raises an error for unidentifiable entries
                else:
-                    LOG.error()
+                    LOG.error("This entry could not be identified")
                    raise ValueError("This entry could not be identified")

            # adding the final gene to the dictionary
            if cur_g_id in rep_transcripts:
-                if (rep_transcripts[cur_g_id][1] > cur_best_trans[1]
-                    or (rep_transcripts[cur_g_id][1] == cur_best_trans[1]
-                        and rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
+                if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or
+                        (rep_transcripts[cur_g_id][1] == cur_best_trans[1] and
+                        rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
                    rep_transcripts[cur_g_id] = cur_best_trans
            else:
                rep_transcripts[cur_g_id] = cur_best_trans
@@ -211,7 +214,8 @@ class FindRepTrans:

    def gtf_file_writer(self, original_file: str,
                        rep_transcript_dict: dict, output_file: str):
-        """
+        """Gtf file writer.
+
        This function writes the output GTF file.
        """
        output = []

--- a/transcript_sampler/match_reptrans_explvl.py
+++ b/transcript_sampler/match_reptrans_explvl.py
@@ -9,13 +9,15 @@ LOG = logging.getLogger(__name__)


 class MatchReptransExplvl:
-    """Match representative transcript with expression level"""
+    """Match representative transcript with expression level."""
+
    def __init__(self):
-        pass
+        """Initiate."""

    @staticmethod
    def gtf_to_df(gtf_file: str) -> pd.DataFrame:
-        """
+        """Gtf to df.
+
        This function takes a .gtf file and converts it into a pandas DataFrame
        containing gene_id and their transcript_id.

@@ -74,13 +76,15 @@ class MatchReptransExplvl:
    def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
        """
        Convert a TSV or CSV file into a pandas DataFrame.
-        
+
        Args:
-            input_txt (str): TSV or CSV file containing transcript expression levels.
-        
+            input_txt (str): TSV or CSV file containing transcript expression
+            levels.
+
        Returns:
-            df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and 'Expression_level' as columns.
-        
+            df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and
+            'Expression_level' as columns.
+
        Raises:
            None
        """
@@ -139,7 +143,7 @@ class MatchReptransExplvl:
        df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene")
        df_clean = df_merged.loc[:, ["reprTrans", "Expression_level"]]
        return df_clean
-    
+
    def match_repr_transcript_expression_level(
        self, exprTrans: str, dict_reprTrans: dict, gtf_file: str,
    ):