diff --git a/.gitignore b/.gitignore index 7a8baad3fbb9055363fc60eda6a2113c091f85f5..2a680d3b0a4bafbaa25ed4d8d76d524aafe15891 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,9 @@ # ignore ALL files in ANY directory named temp temp/ __pycache__ -output_files \ No newline at end of file +output_files +*_cache +*egg-info/ +.coverage +build/ +*/play.py \ No newline at end of file diff --git a/README.md b/README.md index fee84f77ae53f93f3ef3466c1261a323e7f336a4..75d56db07b37124880f302961414fd32aedcf86d 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,50 @@ # Transcript Sampler +## Overview This workflow samples representative transcripts per gene, in proportion to their relative abundance levels. Sampling is done by Poisson sampling. -**This workflow takes as input:** - - Path to genome annotation file in gtf format - - Integer of number of transcripts to sample - - Path to csv or tsv file with transcript IDs and expression levels - - Path to output sample gtf file - - Path to output sample transcript IDs and counts +This workflow takes as input: +- Path to genome annotation file in gtf format +- Path to csv or tsv file with transcript IDs and expression levels +- Path to output sample gtf file +- Path to output sample transcript IDs and counts +- Integer of number of transcripts to sample - **The outputs are :** - - trancript sample gtf file - - csv file containing sample transcript IDs and counts. +The outputs are: +- transcript sample gtf file +- csv file containing sample transcript IDs and counts. - **The workflow can be run via the command line as** - - `python transcript_sampler/new_exe.py --input_gtf={gtf input file} --input_csv={input csv file} --output_gtf={output gtf file} --output_csv={output csv file} --n_to_sample={number of transcripts}` +## Installation from GitHub +Transcript sampler requires Python 3.9 or later. 
+ +Install Transcript sampler from GitHub using: + +``` +git clone https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sampler.git +cd transcript-sampler +pip install . +``` + +## Usage +``` +usage: transcript-sampler [-h] --input_gtf INPUT_GTF --input_csv INPUT_CSV --output_gtf OUTPUT_GTF --output_csv OUTPUT_CSV --n_to_sample N_TO_SAMPLE + +Transcript sampler + +options: + -h, --help show this help message and exit + --input_gtf INPUT_GTF + GTF file with genome annotation (default: None) + --input_csv INPUT_CSV + CSV or TSV file with transcripts and their expression level (default: None) + --output_gtf OUTPUT_GTF + Output path for the new GTF file of representative transcripts (default: None) + --output_csv OUTPUT_CSV + Output path for the new CSV file of representative transcripts and their sampled number (default: None) + --n_to_sample N_TO_SAMPLE + Total number of transcripts to sample (default: None) +``` - Example : +Example: - `python transcript_sampler/new_exe.py --input_gtf="input_files/test.gtf" --input_csv="input_files/expression.csv" --output_gtf="output_files/output.gtf" --output_csv="output_files/output.csv" --n_to_sample=100` +`transcript-sampler --input_gtf="tests/input_files/test.gtf" --input_csv="tests/input_files/expression.csv" --output_gtf="output_files/output.gtf" --output_csv="output_files/output.csv" --n_to_sample=100` diff --git a/requirements.txt b/requirements.txt index 98d4d622197cba5a0b39ff5ffc87237dea9045f5..3fd96b317696344e9d5c3dcf7d66d92d45f05421 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ argparse biopython gtfparse +polars == 0.16.17 numpy >= 1.23.3 pandas >= 1.4.4 \ No newline at end of file diff --git a/setup.py b/setup.py index 873c7b46c925f8abb195f35ed9a0b7af1117b5f9..9542e8a5d2e5e83f2e3f1ce44a26a9bfc796731e 100644 --- a/setup.py +++ b/setup.py @@ -18,5 +18,8 @@ setup( author_email='mate.balajti@unibas.ch', description='Transcript sampler', packages=find_packages(), - 
install_requires=INSTALL_REQUIRES + install_requires=INSTALL_REQUIRES, + entry_points={ + 'console_scripts': ['transcript-sampler=transcript_sampler.cli:main'] + } ) diff --git a/input_files/expression.csv b/tests/input_files/expression.csv similarity index 100% rename from input_files/expression.csv rename to tests/input_files/expression.csv diff --git a/input_files/test.gtf b/tests/input_files/test.gtf similarity index 100% rename from input_files/test.gtf rename to tests/input_files/test.gtf diff --git a/transcript_sampler/__init__.py b/transcript_sampler/__init__.py index 4572092b49877321a9bb2f4fc1483a63bbbd1aed..bb7d5f3186cb2e516714c06c9a5e8d2b57696586 100644 --- a/transcript_sampler/__init__.py +++ b/transcript_sampler/__init__.py @@ -1 +1 @@ -"""Init.py.""" +"""Initialise package.""" diff --git a/transcript_sampler/cli.py b/transcript_sampler/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..8b650422e98331948f550776373767617ac2a161 --- /dev/null +++ b/transcript_sampler/cli.py @@ -0,0 +1,82 @@ +"""This module executes the transcript_sampler.""" +import argparse +import time +import logging + +logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s \ + (module "%(module)s")', + level=logging.INFO, + ) + +from transcript_sampler.find_reptrans import FindRepTrans # noqa: E402,E501 # pylint:disable=wrong-import-position +from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl # noqa: E402,E501 # pylint:disable=wrong-import-position +from transcript_sampler.poisson_sampling import SampleTranscript # noqa: E402,E501 # pylint:disable=wrong-import-position + +find_rep_trans = FindRepTrans() +match_reptrs_explvl = MatchReptransExplvl() +poisson_sample = SampleTranscript() + + +def main(): + """Execute transcript sampler.""" + parser = argparse.ArgumentParser( + description="Transcript sampler", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input_gtf", required=True, 
default=None, + help="GTF file with genome annotation" + ) + parser.add_argument( + "--input_csv", required=True, default=None, + help="CSV or TSV file with transcripts and their expression level" + ) + parser.add_argument( + "--output_gtf", required=True, default=None, + help="Output path for the new GTF file of representative transcripts" + ) + parser.add_argument( + "--output_csv", required=True, default=None, + help="Output path for the new CSV file of representative transcripts \ + and their sampled number" + ) + parser.add_argument( + "--n_to_sample", required=True, default=None, + help="Total number of transcripts to sample" + ) + args = parser.parse_args() + + log = logging.getLogger("main") + start = time.time() + log.info("Started transcript sampler...") + dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf) + df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( + dict_reprTrans=dict_repr_trans, + exprTrans=args.input_csv, + gtf_file=args.input_gtf + ) + log.info( + "Finding match between representative transcripts \ + and expression level file" + ) + log.info("Poisson sampling of transcripts") + poisson_sample.transcript_sampling( + args.n_to_sample, df_repr, args.output_csv) + log.info("Output CSV file ready") + + log.info("Writing output GTF file") + find_rep_trans.gtf_file_writer( + args.input_gtf, dict_repr_trans, args.output_gtf) + + end = time.time() + log.info("Script executed in %s sec", (end - start)) + + +if __name__ == "__main__": + logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s \ + (module "%(module)s")', + level=logging.INFO, + ) + main() diff --git a/images/.gitkeep b/transcript_sampler/images/.gitkeep similarity index 100% rename from images/.gitkeep rename to transcript_sampler/images/.gitkeep diff --git a/images/screenshot_git_tutorial_1_hGillet.png b/transcript_sampler/images/screenshot_git_tutorial_1_hGillet.png similarity index 100% rename from 
images/screenshot_git_tutorial_1_hGillet.png rename to transcript_sampler/images/screenshot_git_tutorial_1_hGillet.png diff --git a/images/screenshot_git_tutorial_2_hGillet.png b/transcript_sampler/images/screenshot_git_tutorial_2_hGillet.png similarity index 100% rename from images/screenshot_git_tutorial_2_hGillet.png rename to transcript_sampler/images/screenshot_git_tutorial_2_hGillet.png diff --git a/images/screenshot_markdown_tutorial_hGillet.png b/transcript_sampler/images/screenshot_markdown_tutorial_hGillet.png similarity index 100% rename from images/screenshot_markdown_tutorial_hGillet.png rename to transcript_sampler/images/screenshot_markdown_tutorial_hGillet.png diff --git a/transcript_sampler/new_exe.py b/transcript_sampler/new_exe.py deleted file mode 100644 index d96f6136bf181b6c98078bb94a222893ab6e1ef5..0000000000000000000000000000000000000000 --- a/transcript_sampler/new_exe.py +++ /dev/null @@ -1,78 +0,0 @@ -"""This module executes the transcript_sampler""" -import argparse -import time -import logging -logging.basicConfig( - format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', - level=logging.INFO, - ) -from find_reptrans import FindRepTrans # pylint: disable=E0401,C0413 -from match_reptrans_explvl import MatchReptransExplvl # pylint: disable=E0401,C0413 -from poisson_sampling import SampleTranscript # pylint: disable=E0401,C0413 - -find_rep_trans = FindRepTrans() -match_reptrs_explvl = MatchReptransExplvl() -poisson_sample = SampleTranscript() - -LOG = logging.getLogger(__name__) - - -def exe(input_gtf, input_csv, output_gtf, output_csv, transcript_nr): - """Execute transcript sampler.""" - start = time.time() - LOG.info("Started transcript sampler...") - dict_repr_trans = find_rep_trans.get_rep_trans(input_gtf) - df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( - dict_reprTrans=dict_repr_trans, exprTrans=input_csv, gtf_file=input_gtf - ) - LOG.info( - "Finding match between representative transcripts \ 
- and expression level file" - ) - LOG.info("Poisson sampling of transcripts") - poisson_sample.transcript_sampling(transcript_nr, df_repr, output_csv) - LOG.info("Output CSV file ready") - - LOG.info("Writing output GTF file") - find_rep_trans.gtf_file_writer(input_gtf, dict_repr_trans, output_gtf) - - end = time.time() - LOG.info("Script executed in %s sec", (end - start)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Transcript sampler", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--input_gtf", required=True, - help="GTF file with genome annotation" - ) - parser.add_argument( - "--input_csv", required=True, - help="CSV or TSV file with transcripts and their expression level" - ) - parser.add_argument( - "--output_gtf", required=True, - help="Output path for the new GTF file of representative transcripts" - ) - parser.add_argument( - "--output_csv", required=True, - help="Output path for the new CSV file of representative transcripts \ - and their sampled number" - ) - parser.add_argument( - "--n_to_sample", required=True, - help="Total number of transcripts to sample" - ) - args = parser.parse_args() - print(args) - - exe( - args.input_gtf, - args.input_csv, - args.output_gtf, - args.output_csv, - args.n_to_sample, - ) diff --git a/scripts/exon_length_filter.py b/transcript_sampler/obsolete_scripts/exon_length_filter.py similarity index 100% rename from scripts/exon_length_filter.py rename to transcript_sampler/obsolete_scripts/exon_length_filter.py diff --git a/scripts/org_test_representative.py b/transcript_sampler/obsolete_scripts/org_test_representative.py similarity index 100% rename from scripts/org_test_representative.py rename to transcript_sampler/obsolete_scripts/org_test_representative.py diff --git a/scripts/representative.py b/transcript_sampler/obsolete_scripts/representative.py similarity index 100% rename from scripts/representative.py rename to 
transcript_sampler/obsolete_scripts/representative.py diff --git a/scripts/transcript_extractor.py b/transcript_sampler/obsolete_scripts/transcript_extractor.py similarity index 100% rename from scripts/transcript_extractor.py rename to transcript_sampler/obsolete_scripts/transcript_extractor.py