From 60430a41bb1a48ad0b1d4f9ef97486b0e1f41c09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Mon, 4 Sep 2023 14:31:23 +0200
Subject: [PATCH] refactor: update gitignore, requirements, cli

---
 .gitignore                             |  69 ++++++++-
 requirements.txt                       |   1 -
 tests/input_files/expression.csv       |   8 +-
 tests/test_match_reptrans_explvl.py    | 200 +++++++++++++------------
 transcript_sampler/cli.py              |  34 ++---
 transcript_sampler/poisson_sampling.py |  13 +-
 6 files changed, 191 insertions(+), 134 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2a680d3..e6b679b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,67 @@
-# ignore ALL .log files
-*.log
+# Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode
 
-# ignore ALL files in ANY directory named temp
-temp/ 
-__pycache__
-output_files
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+.vscode
+
+# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+
+__pycache__/
 *_cache
 *egg-info/
 .coverage
 build/
-*/play.py
\ No newline at end of file
+*/play.py
+*.log
+temp/
+output_files
diff --git a/requirements.txt b/requirements.txt
index 3fd96b3..98d4d62 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 argparse
 biopython
 gtfparse
-polars == 0.16.17
 numpy >= 1.23.3
 pandas >= 1.4.4
\ No newline at end of file
diff --git a/tests/input_files/expression.csv b/tests/input_files/expression.csv
index d6fc944..f4ba134 100644
--- a/tests/input_files/expression.csv
+++ b/tests/input_files/expression.csv
@@ -1,4 +1,4 @@
-ENST00000472194	0.8914783511010855
-ENST00000308647	1.0887715239511602
-ENST00000442483	0.8381441606416928
-ENST00000511072	0.9145581387636652
+ENST00000472194,0.8914783511010855
+ENST00000308647,1.0887715239511602
+ENST00000442483,0.8381441606416928
+ENST00000511072,0.9145581387636652
diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py
index 96b878d..ae3bb89 100644
--- a/tests/test_match_reptrans_explvl.py
+++ b/tests/test_match_reptrans_explvl.py
@@ -157,105 +157,107 @@ class TestMatchReptrans:
         assert tFun.duplicated_index(df_match).empty, \
             "at least one index element is duplicated"
 
-    # def test_match_repr_transcript_expression_level(self):
-    #     """Test match_repr_transcript_expression_level().
-
-    #     This function test that the right output is generated by the
-    #     function match_repr_transcript_expression_level().
-    #     """
-    #     input_path = tFun.find_path("test_gene_exprL")
-    #     intermediate_path = tFun.find_path_intermediate_file()
-    #     dict_repr_test = {
-    #         'ENSMUSG00000079415': 'ENSMUST00000112933',
-    #         "ENSMUSG00000024691": "ENSMUST00000025595",
-    #         "ENSMUSG00000063683": "ENSMUST00000119960"}
-
-    #     match.match_repr_transcript_expression_level(self,
-    #         exprTrans=input_path,
-    #         dict_reprTrans=dict_repr_test,
-    #         gtf_file=intermediate_path)
-
-    #     ref_path = tFun.find_path("test_ref_output.tsv")
-    #     output_path = tFun.find_output()
-
-    #     with open(ref_path, 'r', encoding="utf-8") as t1,\
-    #         open(output_path, 'r', encoding="utf-8") as t2,\
-    #         open(input_path, 'r', encoding="utf-8") as t3:
-    #         fileRef = t1.readlines()
-    #         fileOutput = t2.readlines()
-    #         fileInput = t3.readlines()
-
-    #     assert (
-    #         sorted(fileRef) == sorted(fileOutput)
-    #         ), "the output does't match the expected tsv file"
-    #     assert (
-    #         sorted(fileRef) != sorted(fileInput)
-    #         ), "the output does't match the expected tsv file"
-
-    # def test_txt_to_dict(self):
-    #     """This function tests if txt is convertod to dict"""
-    #     path = tFun.find_path("test_dict_repr_trans.txt")
-    #     dico = match.txt_to_dict(path)
-    #     dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
-    #                 "ENSMUSG00000024691": "ENSMUST00000025595",
-    #                 "ENSMUSG00000063683": "ENSMUST00000119960"}
-    #     assert dico == dict_test
-
-    # def test_transcripts_by_gene_inDf():
-    #     """
-    #     This function test if a dataframe generated from
-    #     the intermediate file is converted in another
-    #     dataframe without the support level column.
-    #     """
-    #     path = tFun.find_path_intermediate_file()
-    #     df = repr.import_gtfSelection_to_df(path)
-    #     df_gene = match.transcripts_by_gene_inDf(df)
-    #     datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
-    #     assert tFun.column_number(df_gene) == (
-    #         2, "number of columns is not equal to 2")
-    #     assert tFun.column_d_type(df_gene) == (
-    #         datatype, "at least one column has the wrong datatype")
-    #     assert tFun.duplicated_rows(df_gene).empty, \
-    #         "at least one row are duplicated"
-    #     assert tFun.na_value(df_gene) == 0, \
-    #         "at least one row contain NA values"
-
-    # def test_output_tsv():
-    #     """
-    #     This function test if a tsv file is generated from a pandas
-    #     dataframe in the right format.
-    #     """
-
-    #     dict_repr_test = {
-    #         'ENSMUSG00000079415': 'ENSMUST00000112933',
-    #         "ENSMUSG00000024691": "ENSMUST00000025595",
-    #         "ENSMUSG00000063683": "ENSMUST00000119960"}
-    #     df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
-
-    #     path_tsv = tFun.find_path(r"test_gene_exprL")
-    #     df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
-    #     path_intermediate = tFun.find_path_intermediate_file()
-    #     df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
-    #     df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
-
-    #     df_exp_lvl = match.expr_level_by_gene(
-    #         df_tsv_exp_lvl, df_gene_transcript
-    #         )
-
-    #     df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
-
-    #     match.output_tsv(df_match)
-
-    #     ref_path = tFun.find_path("test_ref_output.tsv")
-    #     output_path = tFun.find_output()
-
-    #     with open(ref_path, 'r') as t1, open(output_path, 'r') as t2:
-    #         fileRef = t1.readlines()
-    #         fileOutput = t2.readlines()
-
-    #     assert (
-    #         sorted(fileRef) == sorted(fileOutput)
-    #         ), "the output does't match the expected tsv file"
+    def test_match_repr_transcript_expression_level(self):
+        """Test match_repr_transcript_expression_level().
+
+        Compare the file found by tFun.find_output() with the reference
+        tsv; the reference must also differ from the input file.
+        """
+        input_path = tFun.find_path("test_gene_exprL")
+        intermediate_path = tFun.find_path_intermediate_file()
+        dict_repr_test = {
+            'ENSMUSG00000079415': 'ENSMUST00000112933',
+            "ENSMUSG00000024691": "ENSMUST00000025595",
+            "ENSMUSG00000063683": "ENSMUST00000119960"}
+
+        match.match_repr_transcript_expression_level(
+            self,
+            exprTrans=input_path,
+            dict_reprTrans=dict_repr_test,
+            gtf_file=intermediate_path
+            )
+
+        ref_path = tFun.find_path("test_ref_output.tsv")
+        output_path = tFun.find_output()
+
+        with open(ref_path, 'r', encoding="utf-8") as t1,\
+            open(output_path, 'r', encoding="utf-8") as t2,\
+            open(input_path, 'r', encoding="utf-8") as t3:
+            fileRef = t1.readlines()
+            fileOutput = t2.readlines()
+            fileInput = t3.readlines()
+
+        assert (
+            sorted(fileRef) == sorted(fileOutput)
+            ), "the output does't match the expected tsv file"
+        assert (
+            sorted(fileRef) != sorted(fileInput)
+            ), "the expected tsv file is identical to the input file"
+
+    def test_txt_to_dict(self):
+        """Test that txt_to_dict() converts a txt file to a dict."""
+        path = tFun.find_path("test_dict_repr_trans.txt")
+        dico = match.txt_to_dict(path)
+        dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
+                    "ENSMUSG00000024691": "ENSMUST00000025595",
+                    "ENSMUSG00000063683": "ENSMUST00000119960"}
+        assert dico == dict_test
+
+    def test_transcripts_by_gene_inDf(self):
+        """Test transcripts_by_gene_inDf().
+
+        Check that the dataframe built from the intermediate file is
+        converted to a two-column (Gene, Transcript) dataframe.
+        """
+        path = tFun.find_path_intermediate_file()
+        df = repr.import_gtfSelection_to_df(path)
+        df_gene = match.transcripts_by_gene_inDf(df)
+        datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
+        assert tFun.column_number(df_gene) == 2, \
+            "number of columns is not equal to 2"
+        assert tFun.column_d_type(df_gene) == datatype, \
+            "at least one column has the wrong datatype"
+        assert tFun.duplicated_rows(df_gene).empty, \
+            "at least one row are duplicated"
+        assert tFun.na_value(df_gene) == 0, \
+            "at least one row contain NA values"
+
+    def test_output_tsv(self):
+        """Test output_tsv().
+
+        Check that the tsv file written for the matched dataframe has
+        the same lines as the reference tsv file.
+        """
+        dict_repr_test = {
+            'ENSMUSG00000079415': 'ENSMUST00000112933',
+            "ENSMUSG00000024691": "ENSMUST00000025595",
+            "ENSMUSG00000063683": "ENSMUST00000119960"}
+        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
+
+        path_tsv = tFun.find_path(r"test_gene_exprL")
+        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
+        path_intermediate = tFun.find_path_intermediate_file()
+        df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
+        df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
+
+        df_exp_lvl = match.expr_level_by_gene(
+            df_tsv_exp_lvl, df_gene_transcript)
+
+        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
+
+        match.output_tsv(df_match)
+
+        ref_path = tFun.find_path("test_ref_output.tsv")
+        output_path = tFun.find_output()
+
+        with open(ref_path, 'r', encoding="utf-8") as t1, \
+                open(output_path, 'r', encoding="utf-8") as t2:
+            fileRef = t1.readlines()
+            fileOutput = t2.readlines()
+
+        assert (
+            sorted(fileRef) == sorted(fileOutput)
+            ), "the output does't match the expected tsv file"
 
 # test_dict_repr_trans_to_df()
 # test_txt_to_dict()
diff --git a/transcript_sampler/cli.py b/transcript_sampler/cli.py
index 8b65042..f4d2788 100644
--- a/transcript_sampler/cli.py
+++ b/transcript_sampler/cli.py
@@ -25,31 +25,31 @@ def main():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     parser.add_argument(
-        "--input_gtf", required=True, default=None,
-        help="GTF file with genome annotation"
+        "-ic", "--input_csv", required=True, default=None,
+        help="CSV or TSV file with transcripts and their expression level"
         )
     parser.add_argument(
-        "--input_csv", required=True, default=None,
-        help="CSV or TSV file with transcripts and their expression level"
+        "-ig", "--input_gtf", required=True, default=None,
+        help="GTF file with genome annotation"
         )
     parser.add_argument(
-        "--output_gtf", required=True, default=None,
-        help="Output path for the new GTF file of representative transcripts"
+        "-oc", "--output_csv", required=True, default=None,
+        help="Output path for the new CSV file of representative transcripts "
+             "and their sampled number"
         )
     parser.add_argument(
-        "--output_csv", required=True, default=None,
-        help="Output path for the new CSV file of representative transcripts \
-            and their sampled number"
+        "-og", "--output_gtf", required=True, default=None,
+        help="Output path for the new GTF file of representative transcripts"
         )
     parser.add_argument(
-        "--n_to_sample", required=True, default=None,
+        "-n", "--n_to_sample", required=True, default=None,
         help="Total number of transcripts to sample"
         )
     args = parser.parse_args()
 
     log = logging.getLogger("main")
     start = time.time()
-    log.info("Started transcript sampler...")
+    log.info("Started transcript sampler.")
     dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf)
     df_repr = match_reptrs_explvl.match_repr_transcript_expression_level(
         dict_reprTrans=dict_repr_trans,
@@ -57,20 +57,20 @@ def main():
         gtf_file=args.input_gtf
         )
     log.info(
-        "Finding match between representative transcripts \
-            and expression level file"
+        "Finding match between representative transcripts "
+        "and expression level file..."
         )
-    log.info("Poisson sampling of transcripts")
+    log.info("Poisson sampling of transcripts...")
     poisson_sample.transcript_sampling(
         args.n_to_sample, df_repr, args.output_csv)
-    log.info("Output CSV file ready")
+    log.info("Output CSV file ready.")
 
-    log.info("Writing output GTF file")
+    log.info("Writing output GTF file...")
     find_rep_trans.gtf_file_writer(
         args.input_gtf, dict_repr_trans, args.output_gtf)
 
     end = time.time()
-    log.info("Script executed in %s sec", (end - start))
+    log.info("Script executed in %s sec.", round(end - start, 2))
 
 
 if __name__ == "__main__":
diff --git a/transcript_sampler/poisson_sampling.py b/transcript_sampler/poisson_sampling.py
index 6c586ac..f86e2bb 100644
--- a/transcript_sampler/poisson_sampling.py
+++ b/transcript_sampler/poisson_sampling.py
@@ -1,12 +1,12 @@
-"""Sample transcripts by Poisson-sampling"""
+"""Sample transcripts by Poisson-sampling."""
 
 import pandas as pd
 import numpy as np
 
 
+# pylint: disable=R0903
 class SampleTranscript:
-    '''
-    Sample transcript
+    """Sample transcript.
 
     This part of the code does Poisson sampling proportionally
     to gene expression levels for each gene.
@@ -17,10 +17,11 @@ class SampleTranscript:
 
     output: csv file with gene id and count
             gtf file with transcript samples
-    '''
+    """
+
     @staticmethod
     def transcript_sampling(total_transcript_number, df_repr, output_csv):
-        """Samples transcript based on Poisson-sampling"""
+        """Sample transcript based on Poisson-sampling."""
         total = df_repr["level"].sum()
         total_transcript_number = int(total_transcript_number)
         normalized = total_transcript_number / total
@@ -28,7 +29,7 @@ class SampleTranscript:
         transcript_numbers = pd.DataFrame({
             "id": df_repr["id"], "count": levels
             })
-        transcript_numbers.to_csv(output_csv, index=False)
+        transcript_numbers.to_csv(output_csv, index=False, header=False)
 
 
 # python_version = "3.7.13"
-- 
GitLab