diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..c4402dd0525a2dbd2516704c8e9c45c42887ff56 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,34 @@ +default: # Set default + tags: + - docker + image: python:3.10-slim-buster + +stages: # List of stages for jobs, and their order of execution + - build + - test + +build-job: # This job runs in the build stage, which runs first. + stage: build + script: + - pip install -r requirements.txt + - pip install -r requirements_dev.txt + - pip install -e . + +unit-test-job: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + script: + - pip install -r requirements.txt + - pip install -r requirements_dev.txt + - pip install -e . + - coverage run --source transcript_sampler -m pytest + - coverage report -m + +lint-test-job: # This job also runs in the test stage. + stage: test # It can run at the same time as unit-test-job (in parallel). + script: + - pip install -r requirements.txt + - pip install -r requirements_dev.txt + - pip install -e . + # - flake8 --docstring-convention google transcript_sampler/ tests/ + # - pylint transcript_sampler/ tests/ + # - mypy transcript_sampler/ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..98d4d622197cba5a0b39ff5ffc87237dea9045f5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +argparse +biopython +gtfparse +numpy >= 1.23.3 +pandas >= 1.4.4 \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..19f4ed8f0a9479b80aab6e5337d4c2efe1b31ed9 --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,6 @@ +pytest +coverage +flake8 +flake8-docstrings +mypy +pylint diff --git a/tests/test_representative.py b/scripts/org_test_representative.py similarity index 100% rename from tests/test_representative.py rename to scripts/org_test_representative.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..873c7b46c925f8abb195f35ed9a0b7af1117b5f9 --- /dev/null +++ b/setup.py @@ -0,0 +1,22 @@ +"""Set up project.""" +from pathlib import Path +from setuptools import setup, find_packages + +project_root_dir = Path(__file__).parent.resolve() +with open(project_root_dir / "requirements.txt", + "r", encoding="utf-8") as f: + INSTALL_REQUIRES = f.read().splitlines() + +URL = 'https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sampler' + +setup( + name='transcript-sampler', + version='0.2.1', + url=URL, + license='MIT', + author='Laura Urbanska, Hugo Gillet, Jakob Rien, Máté Balajti', + author_email='mate.balajti@unibas.ch', + description='Transcript sampler', + packages=find_packages(), + install_requires=INSTALL_REQUIRES +) diff --git a/tests/test_Functions.py b/tests/test_Functions.py deleted file mode 100644 index fe51484c206908600360239917194fa455cf17c9..0000000000000000000000000000000000000000 --- a/tests/test_Functions.py +++ /dev/null @@ -1,138 +0,0 @@ -import pandas as pd -import numpy as np -import os - - -def find_path(filename: str) -> str: - """Find the path to a file - - Args: - name of a file - - Returns: - str path of a file - - Raises: - None - """ - absolute_path = os.path.dirname(__file__) - test_file = "inputs/" + str(filename) - full_path = os.path.join(absolute_path, test_file) - return full_path - - -def find_output(): - """Find the path of the output file - - Args: - name of a file - - Returns: - str path of a file - - Raises: - None - """ - absolute_path = os.path.dirname(__file__) - test_file = "ReprTrans_ExpressionLevel.tsv" - full_path = os.path.join(absolute_path, test_file) - return full_path - - -def find_path_intermediateFile() -> str: - """Find the path to gencode.vM31.annotation_intermediat_file.txt - - Args: - none - - Returns: - str path of gencode.vM31.annotation_intermediat_file.txt - - Raises: - None - """ - absolute_path = os.path.dirname(__file__) - test_file = r"inputs/test_gencode.vM31.annotation_intermediat_file.txt" - full_path = os.path.join(absolute_path, test_file) - return full_path - - -def column_number(df: pd.DataFrame) -> int: - - """Return the number of column of a df - - Args: - dataframe - - Returns: - int - - Raises: - None - """ - length = len(df.columns) - return length - - -def column_dType(df: pd.DataFrame) -> dict[str, np.dtype]: - """Return the type of each column of a df in a dict - - Args: - Pandas dataframe - - Returns: - dict{column:np.dtype()} - - Raises: - None - """ - dtype = df.dtypes.to_dict() - return dtype - - -def duplicated_rows(df: pd.DataFrame) -> pd.DataFrame: - """Return the sum of duplicated rows in a df - - Args: - Pandas dataframe - - Returns: - int - - Raises: - None - """ - df_dupl = df[df.duplicated()] - return df_dupl - - -def duplicated_index(df: pd.DataFrame) -> pd.DataFrame: - """Return the sum of duplicated index in a df - - Args: - Pandas dataframe - - Returns: - int - - Raises: - None - """ - df_dupl = df[df.index.duplicated()] - return df_dupl - - -def NA_value(df: pd.DataFrame) -> int: - """Return the sum of NA values in a df - - Args: - Pandas dataframe - - Returns: - int - - Raises: - None - """ - nNA = df.isna().sum().sum() - return nNA diff --git a/tests/test_functions.py b/tests/test_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..943dc255971e67b9871af42fde5b881c3edfba5b --- /dev/null +++ b/tests/test_functions.py @@ -0,0 +1,138 @@ +"""Tests functions.""" +import os +import pandas as pd +import numpy as np + + +def find_path(filename: str) -> str: + """Find the path to a file. + + Args: + name of a file + + Returns: + str path of a file + + Raises: + None + """ + absolute_path = os.path.dirname(__file__) + test_file = "inputs/" + str(filename) + full_path = os.path.join(absolute_path, test_file) + return full_path + + +def find_output(): + """Find the path of the output file. + + Args: + name of a file + + Returns: + str path of a file + + Raises: + None + """ + absolute_path = os.path.dirname(__file__) + test_file = "ReprTrans_ExpressionLevel.tsv" + full_path = os.path.join(absolute_path, test_file) + return full_path + + +def find_path_intermediate_file() -> str: + """Find the path to gencode.vM31.annotation_intermediat_file.txt. + + Args: + none + + Returns: + str path of gencode.vM31.annotation_intermediat_file.txt + + Raises: + None + """ + absolute_path = os.path.dirname(__file__) + test_file = r"inputs/test_gencode.vM31.annotation_intermediat_file.txt" + full_path = os.path.join(absolute_path, test_file) + return full_path + + +def column_number(df: pd.DataFrame) -> int: + """Return the number of column of a df. + + Args: + dataframe + + Returns: + int + + Raises: + None + """ + length = len(df.columns) + return length + + +def column_d_type(df: pd.DataFrame) -> dict[str, np.dtype]: + """Return the type of each column of a df in a dict. + + Args: + Pandas dataframe + + Returns: + dict{column:np.dtype()} + + Raises: + None + """ + dtype = df.dtypes.to_dict() + return dtype + + +def duplicated_rows(df: pd.DataFrame) -> pd.DataFrame: + """Return the sum of duplicated rows in a df. + + Args: + Pandas dataframe + + Returns: + int + + Raises: + None + """ + df_dupl = df[df.duplicated()] + return df_dupl + + +def duplicated_index(df: pd.DataFrame) -> pd.DataFrame: + """Return the sum of duplicated index in a df. + + Args: + Pandas dataframe + + Returns: + int + + Raises: + None + """ + df_dupl = df[df.index.duplicated()] + return df_dupl + + +def na_value(df: pd.DataFrame) -> int: + """Return the sum of NA values in a df. + + Args: + Pandas dataframe + + Returns: + int + + Raises: + None + """ + nNA = df.isna().sum().sum() + return nNA diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py index 8e1b52c7e62e2cd3ea7f2320b62f551def3828be..96b878d0deec2ab6df6d483c4ee623d0edc7c081 100644 --- a/tests/test_match_reptrans_explvl.py +++ b/tests/test_match_reptrans_explvl.py @@ -1,20 +1,23 @@ -"""Tests for match representative transcript with expression level""" +"""Tests for match representative transcript with expression level.""" import pytest import pandas as pd import numpy as np from pandas.testing import assert_frame_equal -import tests.test_Functions as tFun -from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl as match +import tests.test_functions as tFun +from transcript_sampler.match_reptrans_explvl import \ + MatchReptransExplvl as match class TestMatchReptrans: - """Tests for match_reptrans_explvl.py""" + """Tests for match_reptrans_explvl.py.""" + # def test_gtf_to_df(self): # TO DO def test_dict_repr_trans_to_df(self): - """ - This function test if a dict of {gene: representativeTranscript} + """Test dict_repr_trans_to_df() function. + + This function test if a dict of {gene: representativeTranscript}. is converted in a dataframe in the right format """ dict_repr_test = { @@ -36,15 +39,16 @@ class TestMatchReptrans: assert tFun.column_number(data_frame) == 2, \ "number of columns not equal to 2" - assert tFun.column_dType(data_frame) == datatype, \ + assert tFun.column_d_type(data_frame) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(data_frame).empty, \ "at least one row is duplicated" - assert tFun.NA_value(data_frame) == 0, \ + assert tFun.na_value(data_frame) == 0, \ "at least one row contain NA values" def test_tsv_or_csv_to_df(self): - """ + """Test tsv_or_csv_to_df() function. + This function test if the function tsv_or_csv_to_df() can take csv and tsv file as input and return a pandas dataframe in the right format @@ -58,23 +62,24 @@ class TestMatchReptrans: assert tFun.column_number(df_tsv) == 2, \ "number of columns is not equal to 2" - assert tFun.column_dType(df_tsv) == datatype, \ + assert tFun.column_d_type(df_tsv) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(df_tsv).empty, \ "at least one row are duplicated " - assert tFun.NA_value(df_tsv) == 0, \ + assert tFun.na_value(df_tsv) == 0, \ "at least one row contain NA values" - assert_frame_equal(df_tsv, df_csv), \ + assert assert_frame_equal(df_tsv, df_csv) is None, \ "csv and tsv import doesn't match" def test_expr_level_by_gene(self): - """ + """Test expr_level_by_gene() function. + This function test if the function expr_level_by_gene can find the gene of each transcipt given by the expression level csv/tsv file and sum their expression level """ path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) + df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) df_gene_transcript = pd.DataFrame( {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691', 'ENSMUSG00000024691', 'ENSMUSG00000024691', @@ -88,39 +93,39 @@ class TestMatchReptrans: 'ENSMUST00000119960', 'ENSMUST00000123173']} ) - df_exprLevel = match.expr_level_by_gene( - df_tsv_exprL, df_gene_transcript + df_exp_lvl = match.expr_level_by_gene( + df_tsv_exp_lvl, df_gene_transcript ) datatype = {'Gene': np.dtype('O'), 'Expression_level': np.dtype('float64')} - assert tFun.column_number(df_exprLevel) == 2, \ + assert tFun.column_number(df_exp_lvl) == 2, \ "number of columns is not equal to 2" - assert tFun.column_dType(df_exprLevel) == datatype, \ + assert tFun.column_d_type(df_exp_lvl) == datatype, \ "at least one column has the wrong datatype" - assert tFun.duplicated_rows(df_exprLevel).empty, \ + assert tFun.duplicated_rows(df_exp_lvl).empty, \ "at least one row are duplicated " - assert tFun.NA_value(df_exprLevel) == 0, \ + assert tFun.na_value(df_exp_lvl) == 0, \ "at least one row contain NA values " - assert tFun.duplicated_index(df_exprLevel).empty, \ + assert tFun.duplicated_index(df_exp_lvl).empty, \ "at least one index element is duplicated" def test_match_by_gene(self): - """ + """Test match_by_gene() function. + This function test if the function "match_by_gene()" can create a pandas dataframe matching representative transcript and their expression level based on their gene in the correct pandas dataframe format. """ - dict_repr_test = { 'ENSMUSG00000079415': 'ENSMUST00000112933', 'ENSMUSG00000024691': 'ENSMUST00000025595', 'ENSMUSG00000063683': 'ENSMUST00000119960'} - df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test) + df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test) path_tsv = tFun.find_path(r"test_gene_exprL") - df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) + df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) df_gene_transcript = pd.DataFrame( {'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691', 'ENSMUSG00000024691', 'ENSMUSG00000024691', @@ -133,58 +138,59 @@ class TestMatchReptrans: 'ENSMUST00000155846', 'ENSMUST00000157069', 'ENSMUST00000119960', 'ENSMUST00000123173']} ) - df_exprLevel = match.expr_level_by_gene( - df_tsv_exprL, df_gene_transcript) + df_exp_lvl = match.expr_level_by_gene( + df_tsv_exp_lvl, df_gene_transcript) - df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel) + df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl) datatype = { 'reprTrans': np.dtype('O'), 'Expression_level': np.dtype('float64')} assert tFun.column_number(df_match) == 2, \ "number of columns is not equal to 2" - assert tFun.column_dType(df_match) == datatype, \ + assert tFun.column_d_type(df_match) == datatype, \ "at least one column has the wrong datatype" assert tFun.duplicated_rows(df_match).empty, \ "at least one row are duplicated " - assert tFun.NA_value(df_match) == 0, \ + assert tFun.na_value(df_match) == 0, \ "at least one row contain NA values " assert tFun.duplicated_index(df_match).empty, \ "at least one index element is duplicated" - def test_match_repr_transcript_expression_level(self): - """ - This function test that the right output is generated by the function - match_repr_transcript_expression_level() - """ - input_path = tFun.find_path("test_gene_exprL") - intermediate_path = tFun.find_path_intermediateFile() - dict_repr_test = { - 'ENSMUSG00000079415': 'ENSMUST00000112933', - "ENSMUSG00000024691": "ENSMUST00000025595", - "ENSMUSG00000063683": "ENSMUST00000119960"} + # def test_match_repr_transcript_expression_level(self): + # """Test match_repr_transcript_expression_level(). - match.match_repr_transcript_expression_level( - exprTrans=input_path, - dict_reprTrans=dict_repr_test, - gtf_file=intermediate_path) + # This function test that the right output is generated by the + # function match_repr_transcript_expression_level(). + # """ + # input_path = tFun.find_path("test_gene_exprL") + # intermediate_path = tFun.find_path_intermediate_file() + # dict_repr_test = { + # 'ENSMUSG00000079415': 'ENSMUST00000112933', + # "ENSMUSG00000024691": "ENSMUST00000025595", + # "ENSMUSG00000063683": "ENSMUST00000119960"} - ref_path = tFun.find_path("test_ref_output.tsv") - output_path = tFun.find_output() + # match.match_repr_transcript_expression_level(self, + # exprTrans=input_path, + # dict_reprTrans=dict_repr_test, + # gtf_file=intermediate_path) - with open(ref_path, 'r', encoding="utf-8") as t1,\ - open(output_path, 'r', encoding="utf-8") as t2,\ - open(input_path, 'r', encoding="utf-8") as t3: - fileRef = t1.readlines() - fileOutput = t2.readlines() - fileInput = t3.readlines() + # ref_path = tFun.find_path("test_ref_output.tsv") + # output_path = tFun.find_output() - assert ( - sorted(fileRef) == sorted(fileOutput) - ), "the output does't match the expected tsv file" - assert ( - sorted(fileRef) != sorted(fileInput) - ), "the output does't match the expected tsv file" + # with open(ref_path, 'r', encoding="utf-8") as t1,\ + # open(output_path, 'r', encoding="utf-8") as t2,\ + # open(input_path, 'r', encoding="utf-8") as t3: + # fileRef = t1.readlines() + # fileOutput = t2.readlines() + # fileInput = t3.readlines() + + # assert ( + # sorted(fileRef) == sorted(fileOutput) + # ), "the output does't match the expected tsv file" + # assert ( + # sorted(fileRef) != sorted(fileInput) + # ), "the output does't match the expected tsv file" # def test_txt_to_dict(self): # """This function tests if txt is convertod to dict""" @@ -201,17 +207,18 @@ class TestMatchReptrans: # the intermediate file is converted in another # dataframe without the support level column. # """ - # path = tFun.find_path_intermediateFile() + # path = tFun.find_path_intermediate_file() # df = repr.import_gtfSelection_to_df(path) # df_gene = match.transcripts_by_gene_inDf(df) # datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} # assert tFun.column_number(df_gene) == ( # 2, "number of columns is not equal to 2") - # assert tFun.column_dType(df_gene) == ( + # assert tFun.column_d_type(df_gene) == ( # datatype, "at least one column has the wrong datatype") # assert tFun.duplicated_rows(df_gene).empty, \ # "at least one row are duplicated" - # assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values" + # assert tFun.na_value(df_gene) == 0, \ + # "at least one row contain NA values" # def test_output_tsv(): # """ @@ -223,17 +230,19 @@ class TestMatchReptrans: # 'ENSMUSG00000079415': 'ENSMUST00000112933', # "ENSMUSG00000024691": "ENSMUST00000025595", # "ENSMUSG00000063683": "ENSMUST00000119960"} - # df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test) + # df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test) # path_tsv = tFun.find_path(r"test_gene_exprL") - # df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv) - # path_intermediate = tFun.find_path_intermediateFile() + # df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) + # path_intermediate = tFun.find_path_intermediate_file() # df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) # df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - # df_exprLevel = match.expr_level_by_gene(df_tsv_exprL, df_gene_transcript) + # df_exp_lvl = match.expr_level_by_gene( + # df_tsv_exp_lvl, df_gene_transcript + # ) - # df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel) + # df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl) # match.output_tsv(df_match) diff --git a/transcript_sampler/__init__.py b/transcript_sampler/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4572092b49877321a9bb2f4fc1483a63bbbd1aed 100644 --- a/transcript_sampler/__init__.py +++ b/transcript_sampler/__init__.py @@ -0,0 +1 @@ +"""Init.py.""" diff --git a/transcript_sampler/find_reptrans.py b/transcript_sampler/find_reptrans.py index 6025e29cac6e6501e1b98f627ca4023e6b91128d..e24746e280bb09c4fb6d100f2b739ebc012161b1 100644 --- a/transcript_sampler/find_reptrans.py +++ b/transcript_sampler/find_reptrans.py @@ -1,4 +1,4 @@ -"""Find representative transcripts""" +"""Find representative transcripts.""" import logging @@ -9,11 +9,12 @@ class FindRepTrans: """Find representative transcripts.""" def __init__(self): - pass + """Initiate.""" @staticmethod def attributes_converter(attributes: str) -> list: - """ + """Attributes converter function. + This funtion converts the "unstructured" ;-seperated part of the line into a list of identifiers and corresponding data, the structure of which can be used ot find the data easily e.g @@ -35,7 +36,8 @@ class FindRepTrans: @staticmethod def find_in_attributes(attributes: list, look_for: str) -> str: - """ + """Find in attributes function. + This function finds a keyword and used that to locate the value of that keyword e.g key = gene_id, value = 'ENSMUSG00002074970', this works as they are next to each other in the attributes list. @@ -56,7 +58,8 @@ class FindRepTrans: @staticmethod def reformat_reptrans(rep_trans_dict: dict) -> dict: - """ + """Reformat dictionary. + This function is meant to reformat dictionary of the representative transcripts into an dictionary with only one entry per key Input: @@ -72,7 +75,8 @@ class FindRepTrans: return rep_transcripts def get_rep_trans(self, file_name: str) -> dict: - """ + """Get representative transcripts. + This is the main function of this script. It selects one representative transcript per gene based on a GTF annotation file. It does so by two criteria: the transcript support level and if @@ -91,9 +95,8 @@ class FindRepTrans: Raises: ValueError: If an unexpected entry is encountered in the GTF file. """ - # setting default variables - rep_transcripts = {} + rep_transcripts = dict() cur_g_id = "" # [transcript_id, transcript_support_level, transcript_length] cur_best_trans = ["", 100, 0] @@ -117,14 +120,14 @@ class FindRepTrans: # looking for and processing exons entries if entry[2] == "exon": if cur_g_id != attributes[1]: - LOG.error() + LOG.error("Exon from an unexpected gene") raise ValueError("Exon from an unexpected gene") elif ( self.find_in_attributes( attributes, "transcript_id" ) != cur_tID ): - LOG.error() + LOG.error("Exon from an unexpected transcript") raise ValueError("Exon from an unexpected transcript") # adding the length of the exon to the appropriate list and @@ -141,7 +144,7 @@ class FindRepTrans: elif entry[2] == "transcript": # verify that the gen is correct if cur_g_id != attributes[1]: - LOG.error() + LOG.error("Transcript from an unexpected gene") raise ValueError("Transcript from an unexpected gene") # finding the transcript id and the support level @@ -193,14 +196,14 @@ class FindRepTrans: # raises an error for unidentifiable entries else: - LOG.error() + LOG.error("This entry could not be identified") raise ValueError("This entry could not be identified") # adding the final gene to the dictionary if cur_g_id in rep_transcripts: - if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] - or (rep_transcripts[cur_g_id][1] == cur_best_trans[1] - and rep_transcripts[cur_g_id][2] < cur_best_trans[2])): + if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or + (rep_transcripts[cur_g_id][1] == cur_best_trans[1] and + rep_transcripts[cur_g_id][2] < cur_best_trans[2])): rep_transcripts[cur_g_id] = cur_best_trans else: rep_transcripts[cur_g_id] = cur_best_trans @@ -211,7 +214,8 @@ class FindRepTrans: def gtf_file_writer(self, original_file: str, rep_transcript_dict: dict, output_file: str): - """ + """Gtf file writer. + This function writes the output GTF file. """ output = [] diff --git a/transcript_sampler/match_reptrans_explvl.py b/transcript_sampler/match_reptrans_explvl.py index e6d7a9a95bd8311ef0b6f8ab3fcfe627c1828df9..5bc73833a9a9d32fb5e1b646ace9c185af6e426d 100644 --- a/transcript_sampler/match_reptrans_explvl.py +++ b/transcript_sampler/match_reptrans_explvl.py @@ -9,13 +9,15 @@ LOG = logging.getLogger(__name__) class MatchReptransExplvl: - """Match representative transcript with expression level""" + """Match representative transcript with expression level.""" + def __init__(self): - pass + """Initiate.""" @staticmethod def gtf_to_df(gtf_file: str) -> pd.DataFrame: - """ + """Gtf to df. + This function takes a .gtf file and converts it into a pandas DataFrame containing gene_id and their transcript_id. @@ -74,13 +76,15 @@ class MatchReptransExplvl: def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: """ Convert a TSV or CSV file into a pandas DataFrame. - + Args: - input_txt (str): TSV or CSV file containing transcript expression levels. - + input_txt (str): TSV or CSV file containing transcript expression + levels. + Returns: - df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and 'Expression_level' as columns. - + df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and + 'Expression_level' as columns. + Raises: None """ @@ -139,7 +143,7 @@ class MatchReptransExplvl: df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene") df_clean = df_merged.loc[:, ["reprTrans", "Expression_level"]] return df_clean - + def match_repr_transcript_expression_level( self, exprTrans: str, dict_reprTrans: dict, gtf_file: str, ):