From 60430a41bb1a48ad0b1d4f9ef97486b0e1f41c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch> Date: Mon, 4 Sep 2023 14:31:23 +0200 Subject: [PATCH] refactor: update gitignore, requirements, cli --- .gitignore | 69 ++++++++- requirements.txt | 1 - tests/input_files/expression.csv | 8 +- tests/test_match_reptrans_explvl.py | 200 +++++++++++++------------ transcript_sampler/cli.py | 34 ++--- transcript_sampler/poisson_sampling.py | 13 +- 6 files changed, 191 insertions(+), 134 deletions(-) diff --git a/.gitignore b/.gitignore index 2a680d3..e6b679b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,67 @@ -# ignore ALL .log files -*.log +# Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode -# ignore ALL files in ANY directory named temp -temp/ -__pycache__ -output_files +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide +.vscode + +# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode + +__pycache__/ *_cache *egg-info/ .coverage build/ -*/play.py \ No newline at end of file +*/play.py +*.log +temp/ +output_files diff --git a/requirements.txt b/requirements.txt index 3fd96b3..98d4d62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ argparse biopython gtfparse -polars == 0.16.17 numpy >= 1.23.3 pandas >= 1.4.4 \ No newline at end of file diff --git a/tests/input_files/expression.csv b/tests/input_files/expression.csv index d6fc944..f4ba134 100644 --- a/tests/input_files/expression.csv +++ b/tests/input_files/expression.csv @@ -1,4 +1,4 @@ -ENST00000472194 0.8914783511010855 -ENST00000308647 1.0887715239511602 -ENST00000442483 0.8381441606416928 -ENST00000511072 0.9145581387636652 +ENST00000472194,0.8914783511010855 +ENST00000308647,1.0887715239511602 +ENST00000442483,0.8381441606416928 +ENST00000511072,0.9145581387636652 diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py index 96b878d..ae3bb89 100644 --- a/tests/test_match_reptrans_explvl.py +++ b/tests/test_match_reptrans_explvl.py @@ -157,105 +157,107 @@ class TestMatchReptrans: assert tFun.duplicated_index(df_match).empty, \ "at least one index element is duplicated" - # def test_match_repr_transcript_expression_level(self): - # """Test match_repr_transcript_expression_level(). - - # This function test that the right output is generated by the - # function match_repr_transcript_expression_level(). - # """ - # input_path = tFun.find_path("test_gene_exprL") - # intermediate_path = tFun.find_path_intermediate_file() - # dict_repr_test = { - # 'ENSMUSG00000079415': 'ENSMUST00000112933', - # "ENSMUSG00000024691": "ENSMUST00000025595", - # "ENSMUSG00000063683": "ENSMUST00000119960"} - - # match.match_repr_transcript_expression_level(self, - # exprTrans=input_path, - # dict_reprTrans=dict_repr_test, - # gtf_file=intermediate_path) - - # ref_path = tFun.find_path("test_ref_output.tsv") - # output_path = tFun.find_output() - - # with open(ref_path, 'r', encoding="utf-8") as t1,\ - # open(output_path, 'r', encoding="utf-8") as t2,\ - # open(input_path, 'r', encoding="utf-8") as t3: - # fileRef = t1.readlines() - # fileOutput = t2.readlines() - # fileInput = t3.readlines() - - # assert ( - # sorted(fileRef) == sorted(fileOutput) - # ), "the output does't match the expected tsv file" - # assert ( - # sorted(fileRef) != sorted(fileInput) - # ), "the output does't match the expected tsv file" - - # def test_txt_to_dict(self): - # """This function tests if txt is convertod to dict""" - # path = tFun.find_path("test_dict_repr_trans.txt") - # dico = match.txt_to_dict(path) - # dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', - # "ENSMUSG00000024691": "ENSMUST00000025595", - # "ENSMUSG00000063683": "ENSMUST00000119960"} - # assert dico == dict_test - - # def test_transcripts_by_gene_inDf(): - # """ - # This function test if a dataframe generated from - # the intermediate file is converted in another - # dataframe without the support level column. - # """ - # path = tFun.find_path_intermediate_file() - # df = repr.import_gtfSelection_to_df(path) - # df_gene = match.transcripts_by_gene_inDf(df) - # datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} - # assert tFun.column_number(df_gene) == ( - # 2, "number of columns is not equal to 2") - # assert tFun.column_d_type(df_gene) == ( - # datatype, "at least one column has the wrong datatype") - # assert tFun.duplicated_rows(df_gene).empty, \ - # "at least one row are duplicated" - # assert tFun.na_value(df_gene) == 0, \ - # "at least one row contain NA values" - - # def test_output_tsv(): - # """ - # This function test if a tsv file is generated from a pandas - # dataframe in the right format. - # """ - - # dict_repr_test = { - # 'ENSMUSG00000079415': 'ENSMUST00000112933', - # "ENSMUSG00000024691": "ENSMUST00000025595", - # "ENSMUSG00000063683": "ENSMUST00000119960"} - # df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test) - - # path_tsv = tFun.find_path(r"test_gene_exprL") - # df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) - # path_intermediate = tFun.find_path_intermediate_file() - # df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) - # df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) - - # df_exp_lvl = match.expr_level_by_gene( - # df_tsv_exp_lvl, df_gene_transcript - # ) - - # df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl) - - # match.output_tsv(df_match) - - # ref_path = tFun.find_path("test_ref_output.tsv") - # output_path = tFun.find_output() - - # with open(ref_path, 'r') as t1, open(output_path, 'r') as t2: - # fileRef = t1.readlines() - # fileOutput = t2.readlines() - - # assert ( - # sorted(fileRef) == sorted(fileOutput) - # ), "the output does't match the expected tsv file" + def test_match_repr_transcript_expression_level(self): + """Test match_repr_transcript_expression_level(). + + This function test that the right output is generated by the + function match_repr_transcript_expression_level(). + """ + input_path = tFun.find_path("test_gene_exprL") + intermediate_path = tFun.find_path_intermediate_file() + dict_repr_test = { + 'ENSMUSG00000079415': 'ENSMUST00000112933', + "ENSMUSG00000024691": "ENSMUST00000025595", + "ENSMUSG00000063683": "ENSMUST00000119960"} + + match.match_repr_transcript_expression_level( + self, + exprTrans=input_path, + dict_reprTrans=dict_repr_test, + gtf_file=intermediate_path + ) + + ref_path = tFun.find_path("test_ref_output.tsv") + output_path = tFun.find_output() + + with open(ref_path, 'r', encoding="utf-8") as t1,\ + open(output_path, 'r', encoding="utf-8") as t2,\ + open(input_path, 'r', encoding="utf-8") as t3: + fileRef = t1.readlines() + fileOutput = t2.readlines() + fileInput = t3.readlines() + + assert ( + sorted(fileRef) == sorted(fileOutput) + ), "the output does't match the expected tsv file" + assert ( + sorted(fileRef) != sorted(fileInput) + ), "the output does't match the expected tsv file" + + def test_txt_to_dict(self): + """This function tests if txt is convertod to dict""" + path = tFun.find_path("test_dict_repr_trans.txt") + dico = match.txt_to_dict(path) + dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', + "ENSMUSG00000024691": "ENSMUST00000025595", + "ENSMUSG00000063683": "ENSMUST00000119960"} + assert dico == dict_test + + def test_transcripts_by_gene_inDf(): + """ + This function test if a dataframe generated from + the intermediate file is converted in another + dataframe without the support level column. + """ + path = tFun.find_path_intermediate_file() + df = repr.import_gtfSelection_to_df(path) + df_gene = match.transcripts_by_gene_inDf(df) + datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')} + assert tFun.column_number(df_gene) == ( + 2, "number of columns is not equal to 2") + assert tFun.column_d_type(df_gene) == ( + datatype, "at least one column has the wrong datatype") + assert tFun.duplicated_rows(df_gene).empty, \ + "at least one row are duplicated" + assert tFun.na_value(df_gene) == 0, \ + "at least one row contain NA values" + + def test_output_tsv(): + """ + This function test if a tsv file is generated from a pandas + dataframe in the right format. + """ + + dict_repr_test = { + 'ENSMUSG00000079415': 'ENSMUST00000112933', + "ENSMUSG00000024691": "ENSMUST00000025595", + "ENSMUSG00000063683": "ENSMUST00000119960"} + df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test) + + path_tsv = tFun.find_path(r"test_gene_exprL") + df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv) + path_intermediate = tFun.find_path_intermediate_file() + df_intermediate = repr.import_gtfSelection_to_df(path_intermediate) + df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate) + + df_exp_lvl = match.expr_level_by_gene( + df_tsv_exp_lvl, df_gene_transcript + ) + + df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl) + + match.output_tsv(df_match) + + ref_path = tFun.find_path("test_ref_output.tsv") + output_path = tFun.find_output() + + with open(ref_path, 'r') as t1, open(output_path, 'r') as t2: + fileRef = t1.readlines() + fileOutput = t2.readlines() + + assert ( + sorted(fileRef) == sorted(fileOutput) + ), "the output does't match the expected tsv file" # test_dict_repr_trans_to_df() # test_txt_to_dict() diff --git a/transcript_sampler/cli.py b/transcript_sampler/cli.py index 8b65042..f4d2788 100644 --- a/transcript_sampler/cli.py +++ b/transcript_sampler/cli.py @@ -25,31 +25,31 @@ def main(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( - "--input_gtf", required=True, default=None, - help="GTF file with genome annotation" + "-ic", "--input_csv", required=True, default=None, + help="CSV or TSV file with transcripts and their expression level" ) parser.add_argument( - "--input_csv", required=True, default=None, - help="CSV or TSV file with transcripts and their expression level" + "-ig", "--input_gtf", required=True, default=None, + help="GTF file with genome annotation" ) parser.add_argument( - "--output_gtf", required=True, default=None, - help="Output path for the new GTF file of representative transcripts" + "-oc", "--output_csv", required=True, default=None, + help="Output path for the new CSV file of representative transcripts " + "and their sampled number" ) parser.add_argument( - "--output_csv", required=True, default=None, - help="Output path for the new CSV file of representative transcripts \ - and their sampled number" + "-og", "--output_gtf", required=True, default=None, + help="Output path for the new GTF file of representative transcripts" ) parser.add_argument( - "--n_to_sample", required=True, default=None, + "-n", "--n_to_sample", required=True, default=None, help="Total number of transcripts to sample" ) args = parser.parse_args() log = logging.getLogger("main") start = time.time() - log.info("Started transcript sampler...") + log.info("Started transcript sampler.") dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf) df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( dict_reprTrans=dict_repr_trans, @@ -57,20 +57,20 @@ def main(): gtf_file=args.input_gtf ) log.info( - "Finding match between representative transcripts \ - and expression level file" + "Finding match between representative transcripts " + "and expression level file..." ) - log.info("Poisson sampling of transcripts") + log.info("Poisson sampling of transcripts...") poisson_sample.transcript_sampling( args.n_to_sample, df_repr, args.output_csv) - log.info("Output CSV file ready") + log.info("Output CSV file ready.") - log.info("Writing output GTF file") + log.info("Writing output GTF file...") find_rep_trans.gtf_file_writer( args.input_gtf, dict_repr_trans, args.output_gtf) end = time.time() - log.info("Script executed in %s sec", (end - start)) + log.info("Script executed in %s sec.", round(end - start, 2)) if __name__ == "__main__": diff --git a/transcript_sampler/poisson_sampling.py b/transcript_sampler/poisson_sampling.py index 6c586ac..f86e2bb 100644 --- a/transcript_sampler/poisson_sampling.py +++ b/transcript_sampler/poisson_sampling.py @@ -1,12 +1,12 @@ -"""Sample transcripts by Poisson-sampling""" +"""Sample transcripts by Poisson-sampling.""" import pandas as pd import numpy as np +# pylint: disable=R0903 class SampleTranscript: - ''' - Sample transcript + """Sample transcript. This part of the code does Poisson sampling proportionally to gene expression levels for each gene. @@ -17,10 +17,11 @@ class SampleTranscript: output: csv file with gene id and count gtf file with transcript samples - ''' + """ + @staticmethod def transcript_sampling(total_transcript_number, df_repr, output_csv): - """Samples transcript based on Poisson-sampling""" + """Sample transcript based on Poisson-sampling.""" total = df_repr["level"].sum() total_transcript_number = int(total_transcript_number) normalized = total_transcript_number / total @@ -28,7 +29,7 @@ class SampleTranscript: transcript_numbers = pd.DataFrame({ "id": df_repr["id"], "count": levels }) - transcript_numbers.to_csv(output_csv, index=False) + transcript_numbers.to_csv(output_csv, index=False, header=False) # python_version = "3.7.13" -- GitLab