Commit 60430a41 authored by Mate Balajti

refactor: update gitignore, requirements, cli

parent 3a28cd35
1 merge request: !7 feat: add tests
Pipeline #17349 failed

.gitignore (the hand-written ignore rules are replaced by the generated macOS/VS Code template; the project-specific entries move to the end):

-# ignore ALL .log files
-*.log
-# ignore ALL files in ANY directory named temp
-temp/
-__pycache__
-output_files
+# Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+# Local History for Visual Studio Code
+.history/
+# Built Visual Studio Code Extensions
+*.vsix
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+.vscode
+# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+__pycache__/
 *_cache
 *egg-info/
 .coverage
 build/
 */play.py
+*.log
+temp/
+output_files

requirements (the polars pin is the line this diff touches):

argparse
biopython
gtfparse
polars == 0.16.17
numpy >= 1.23.3
pandas >= 1.4.4

Transcript expression test data (values switch from whitespace-separated to comma-separated):

-ENST00000472194 0.8914783511010855
-ENST00000308647 1.0887715239511602
-ENST00000442483 0.8381441606416928
-ENST00000511072 0.9145581387636652
+ENST00000472194,0.8914783511010855
+ENST00000308647,1.0887715239511602
+ENST00000442483,0.8381441606416928
+ENST00000511072,0.9145581387636652
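
The tests below read this kind of file through match.tsv_or_csv_to_df, whose implementation is not part of this diff. A minimal sketch of a loader that accepts either separator, with a hypothetical function name and the id/level column names used elsewhere in the repository:

# Illustrative sketch only; the real tsv_or_csv_to_df is not shown in this commit.
import pandas as pd


def load_expression_table(path):
    """Read a two-column transcript/expression file, comma- or whitespace-separated."""
    with open(path, "r", encoding="utf-8") as handle:
        first_line = handle.readline()
    # Pick the separator from the first line: comma for CSV, any whitespace otherwise.
    sep = "," if "," in first_line else r"\s+"
    return pd.read_csv(path, sep=sep, header=None,
                       names=["id", "level"], engine="python")

Either fixture format above would then yield the same two-column table.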

Test module for matching representative transcripts to expression levels (new version shown; the previously commented-out tests are re-enabled):

@@ -157,105 +157,107 @@ class TestMatchReptrans:
        assert tFun.duplicated_index(df_match).empty, \
            "at least one index element is duplicated"

    def test_match_repr_transcript_expression_level(self):
        """Test match_repr_transcript_expression_level().

        This function tests that the right output is generated by the
        function match_repr_transcript_expression_level().
        """
        input_path = tFun.find_path("test_gene_exprL")
        intermediate_path = tFun.find_path_intermediate_file()
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            "ENSMUSG00000024691": "ENSMUST00000025595",
            "ENSMUSG00000063683": "ENSMUST00000119960"}
        match.match_repr_transcript_expression_level(
            self,
            exprTrans=input_path,
            dict_reprTrans=dict_repr_test,
            gtf_file=intermediate_path
        )

        ref_path = tFun.find_path("test_ref_output.tsv")
        output_path = tFun.find_output()

        with open(ref_path, 'r', encoding="utf-8") as t1,\
                open(output_path, 'r', encoding="utf-8") as t2,\
                open(input_path, 'r', encoding="utf-8") as t3:
            fileRef = t1.readlines()
            fileOutput = t2.readlines()
            fileInput = t3.readlines()

        assert (
            sorted(fileRef) == sorted(fileOutput)
        ), "the output doesn't match the expected tsv file"
        assert (
            sorted(fileRef) != sorted(fileInput)
        ), "the output should differ from the input file"

    def test_txt_to_dict(self):
        """This function tests if a txt file is converted to a dict."""
        path = tFun.find_path("test_dict_repr_trans.txt")
        dico = match.txt_to_dict(path)
        dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
                     "ENSMUSG00000024691": "ENSMUST00000025595",
                     "ENSMUSG00000063683": "ENSMUST00000119960"}
        assert dico == dict_test

    def test_transcripts_by_gene_inDf(self):
        """
        This function tests if a dataframe generated from
        the intermediate file is converted into another
        dataframe without the support level column.
        """
        path = tFun.find_path_intermediate_file()
        df = repr.import_gtfSelection_to_df(path)
        df_gene = match.transcripts_by_gene_inDf(df)
        datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
        assert tFun.column_number(df_gene) == (
            2, "number of columns is not equal to 2")
        assert tFun.column_d_type(df_gene) == (
            datatype, "at least one column has the wrong datatype")
        assert tFun.duplicated_rows(df_gene).empty, \
            "at least one row is duplicated"
        assert tFun.na_value(df_gene) == 0, \
            "at least one row contains NA values"

    def test_output_tsv(self):
        """
        This function tests if a tsv file is generated from a pandas
        dataframe in the right format.
        """
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            "ENSMUSG00000024691": "ENSMUST00000025595",
            "ENSMUSG00000063683": "ENSMUST00000119960"}
        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)

        path_tsv = tFun.find_path(r"test_gene_exprL")
        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
        path_intermediate = tFun.find_path_intermediate_file()
        df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
        df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)

        df_exp_lvl = match.expr_level_by_gene(
            df_tsv_exp_lvl, df_gene_transcript
        )

        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
        match.output_tsv(df_match)

        ref_path = tFun.find_path("test_ref_output.tsv")
        output_path = tFun.find_output()

        with open(ref_path, 'r') as t1, open(output_path, 'r') as t2:
            fileRef = t1.readlines()
            fileOutput = t2.readlines()

        assert (
            sorted(fileRef) == sorted(fileOutput)
        ), "the output doesn't match the expected tsv file"

    # test_dict_repr_trans_to_df()
    # test_txt_to_dict()

CLI entry point (new version shown; the options gain short flags and the CSV input now comes first):

@@ -25,31 +25,31 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-ic", "--input_csv", required=True, default=None,
        help="CSV or TSV file with transcripts and their expression level"
    )
    parser.add_argument(
        "-ig", "--input_gtf", required=True, default=None,
        help="GTF file with genome annotation"
    )
    parser.add_argument(
        "-oc", "--output_csv", required=True, default=None,
        help="Output path for the new CSV file of representative transcripts "
             "and their sampled number"
    )
    parser.add_argument(
        "-og", "--output_gtf", required=True, default=None,
        help="Output path for the new GTF file of representative transcripts"
    )
    parser.add_argument(
        "-n", "--n_to_sample", required=True, default=None,
        help="Total number of transcripts to sample"
    )
    args = parser.parse_args()

    log = logging.getLogger("main")
    start = time.time()
    log.info("Started transcript sampler.")

    dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf)
    df_repr = match_reptrs_explvl.match_repr_transcript_expression_level(
        dict_reprTrans=dict_repr_trans,
@@ -57,20 +57,20 @@ def main():
        gtf_file=args.input_gtf
    )
    log.info(
        "Finding match between representative transcripts "
        "and expression level file..."
    )
    log.info("Poisson sampling of transcripts...")
    poisson_sample.transcript_sampling(
        args.n_to_sample, df_repr, args.output_csv)
    log.info("Output CSV file ready.")
    log.info("Writing output GTF file...")
    find_rep_trans.gtf_file_writer(
        args.input_gtf, dict_repr_trans, args.output_gtf)
    end = time.time()
    log.info("Script executed in %s sec.", round(end - start, 2))


if __name__ == "__main__":
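
With the renamed options, a call to the sampler could look roughly like the following; only the flags and their meanings come from this commit, while the script name and file paths are placeholders:

# Hypothetical invocation; "cli.py" and the file names are placeholders.
import subprocess

subprocess.run(
    [
        "python", "cli.py",
        "-ic", "expression_levels.csv",    # transcripts and their expression level (CSV or TSV)
        "-ig", "annotation.gtf",           # genome annotation
        "-oc", "sampled_transcripts.csv",  # representative transcripts and sampled numbers
        "-og", "representative.gtf",       # GTF of representative transcripts
        "-n", "1000",                      # total number of transcripts to sample
    ],
    check=True,
)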
"""Sample transcripts by Poisson-sampling""" """Sample transcripts by Poisson-sampling."""
import pandas as pd import pandas as pd
import numpy as np import numpy as np
# pylint: disable=R0903
class SampleTranscript: class SampleTranscript:
''' """Sample transcript.
Sample transcript
This part of the code does Poisson sampling proportionally This part of the code does Poisson sampling proportionally
to gene expression levels for each gene. to gene expression levels for each gene.
...@@ -17,10 +17,11 @@ class SampleTranscript: ...@@ -17,10 +17,11 @@ class SampleTranscript:
output: csv file with gene id and count output: csv file with gene id and count
gtf file with transcript samples gtf file with transcript samples
''' """
@staticmethod @staticmethod
def transcript_sampling(total_transcript_number, df_repr, output_csv): def transcript_sampling(total_transcript_number, df_repr, output_csv):
"""Samples transcript based on Poisson-sampling""" """Sample transcript based on Poisson-sampling."""
total = df_repr["level"].sum() total = df_repr["level"].sum()
total_transcript_number = int(total_transcript_number) total_transcript_number = int(total_transcript_number)
normalized = total_transcript_number / total normalized = total_transcript_number / total
...@@ -28,7 +29,7 @@ class SampleTranscript: ...@@ -28,7 +29,7 @@ class SampleTranscript:
transcript_numbers = pd.DataFrame({ transcript_numbers = pd.DataFrame({
"id": df_repr["id"], "count": levels "id": df_repr["id"], "count": levels
}) })
transcript_numbers.to_csv(output_csv, index=False) transcript_numbers.to_csv(output_csv, index=False, header=False)
# python_version = "3.7.13" # python_version = "3.7.13"
......
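
The line that actually draws the Poisson counts falls outside the hunks shown above. A minimal sketch of the full step, assuming the hidden line is a plain numpy Poisson draw on the normalized expression levels (the exact call is not visible in this diff):

import numpy as np
import pandas as pd


def poisson_sample_counts(df_repr, total_transcript_number):
    """Draw per-transcript counts proportional to expression level (sketch)."""
    total = df_repr["level"].sum()
    # Scale levels so the expected total equals the requested number of transcripts.
    normalized = int(total_transcript_number) / total
    levels = np.random.poisson(df_repr["level"] * normalized)
    return pd.DataFrame({"id": df_repr["id"], "count": levels})


# Example: with levels 3.0 and 1.0 and 100 requested transcripts,
# the counts average roughly 75 and 25.
# demo = pd.DataFrame({"id": ["ENST01", "ENST02"], "level": [3.0, 1.0]})
# print(poisson_sample_counts(demo, 100))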