diff --git a/README.md b/README.md index fee84f77ae53f93f3ef3466c1261a323e7f336a4..75d56db07b37124880f302961414fd32aedcf86d 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,50 @@ # Transcript Sampler +## Overview This workflow samples representative transcripts per gene, in proportion to their relative abundance levels. Sampling is done by Poisson sampling. -**This workflow takes as input:** - - Path to genome annotation file in gtf format - - Integer of number of transcripts to sample - - Path to csv or tsv file with transcript IDs and expression levels - - Path to output sample gtf file - - Path to output sample transcript IDs and counts +This workflow takes as input: +- Path to genome annotation file in gtf format +- Path to csv or tsv file with transcript IDs and expression levels +- Path to output sample gtf file +- Path to output sample transcript IDs and counts +- Integer of number of transcripts to sample - **The outputs are :** - - trancript sample gtf file - - csv file containing sample transcript IDs and counts. +The outputs are: +- transcript sample gtf file +- csv file containing sample transcript IDs and counts. - **The workflow can be run via the command line as** - - `python transcript_sampler/new_exe.py --input_gtf={gtf input file} --input_csv={input csv file} --output_gtf={output gtf file} --output_csv={output csv file} --n_to_sample={number of transcripts}` +## Installation from github +Transcript sampler requires Python 3.9 or later. + +Install Transcript sampler from Github using: + +``` +git clone https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sampler.git +cd transcript-sampler +pip install . 
+``` + +## Usage +``` +usage: transcript-sampler [-h] --input_gtf INPUT_GTF --input_csv INPUT_CSV --output_gtf OUTPUT_GTF --output_csv OUTPUT_CSV --n_to_sample N_TO_SAMPLE + +Transcript sampler + +options: + -h, --help show this help message and exit + --input_gtf INPUT_GTF + GTF file with genome annotation (default: None) + --input_csv INPUT_CSV + CSV or TSV file with transcripts and their expression level (default: None) + --output_gtf OUTPUT_GTF + Output path for the new GTF file of representative transcripts (default: None) + --output_csv OUTPUT_CSV + Output path for the new CSV file of representative transcripts and their sampled number (default: None) + --n_to_sample N_TO_SAMPLE + Total number of transcripts to sample (default: None) +``` - Example : +Example: - `python transcript_sampler/new_exe.py --input_gtf="input_files/test.gtf" --input_csv="input_files/expression.csv" --output_gtf="output_files/output.gtf" --output_csv="output_files/output.csv" --n_to_sample=100` +`transcript-sampler --input_gtf="tests/input_files/test.gtf" --input_csv="tests/input_files/expression.csv" --output_gtf="output_files/output.gtf" --output_csv="output_files/output.csv" --n_to_sample=100` diff --git a/requirements.txt b/requirements.txt index 98d4d622197cba5a0b39ff5ffc87237dea9045f5..3fd96b317696344e9d5c3dcf7d66d92d45f05421 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ argparse biopython gtfparse +polars == 0.16.17 numpy >= 1.23.3 pandas >= 1.4.4 \ No newline at end of file diff --git a/setup.py b/setup.py index 873c7b46c925f8abb195f35ed9a0b7af1117b5f9..9542e8a5d2e5e83f2e3f1ce44a26a9bfc796731e 100644 --- a/setup.py +++ b/setup.py @@ -18,5 +18,8 @@ setup( author_email='mate.balajti@unibas.ch', description='Transcript sampler', packages=find_packages(), - install_requires=INSTALL_REQUIRES + install_requires=INSTALL_REQUIRES, + entry_points={ + 'console_scripts': ['transcript-sampler=transcript_sampler.cli:main'] + } ) diff --git 
a/transcript_sampler/cli.py b/transcript_sampler/cli.py index f2d67d0ee27756d0768524a508eac7cf441cb4ec..8b650422e98331948f550776373767617ac2a161 100644 --- a/transcript_sampler/cli.py +++ b/transcript_sampler/cli.py @@ -2,92 +2,81 @@ import argparse import time import logging -from find_reptrans import FindRepTrans -from match_reptrans_explvl import MatchReptransExplvl -from poisson_sampling import SampleTranscript - -find_rep_trans = FindRepTrans() -match_reptrs_explvl = MatchReptransExplvl() -poisson_sample = SampleTranscript() logging.basicConfig( format='[%(asctime)s: %(levelname)s] %(message)s \ (module "%(module)s")', level=logging.INFO, -) -LOG = logging.getLogger("main") - - -def main(args: argparse.Namespace): - """Execute transcript sampler.""" - start = time.time() - LOG.info("Started transcript sampler...") - dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf) - df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( - dict_reprTrans=dict_repr_trans, - exprTrans=args.input_csv, - gtf_file=args.input_gtf - ) - LOG.info( - "Finding match between representative transcripts \ - and expression level file" - ) - LOG.info("Poisson sampling of transcripts") - poisson_sample.transcript_sampling( - args.transcript_nr, df_repr, args.output_csv) - LOG.info("Output CSV file ready") - - LOG.info("Writing output GTF file") - find_rep_trans.gtf_file_writer( - args.input_gtf, dict_repr_trans, args.output_gtf) + ) - end = time.time() - LOG.info("Script executed in %s sec", (end - start)) +from transcript_sampler.find_reptrans import FindRepTrans # noqa: E402,E501 # pylint:disable=wrong-import-position +from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl # noqa: E402,E501 # pylint:disable=wrong-import-position +from transcript_sampler.poisson_sampling import SampleTranscript # noqa: E402,E501 # pylint:disable=wrong-import-position +find_rep_trans = FindRepTrans() +match_reptrs_explvl = MatchReptransExplvl() +poisson_sample = 
SampleTranscript() -def parse_arguments() -> argparse.Namespace: - """Request parameters from user on CLI. - Returns: - argparse.Namespace: object of arguments from CLI. - """ +def main(): + """Parse command-line arguments and run the transcript sampler.""" parser = argparse.ArgumentParser( description="Transcript sampler", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( - "--input_gtf", required=True, + "--input_gtf", required=True, default=None, help="GTF file with genome annotation" ) parser.add_argument( - "--input_csv", required=True, + "--input_csv", required=True, default=None, help="CSV or TSV file with transcripts and their expression level" ) parser.add_argument( - "--output_gtf", required=True, + "--output_gtf", required=True, default=None, help="Output path for the new GTF file of representative transcripts" ) parser.add_argument( - "--output_csv", required=True, + "--output_csv", required=True, default=None, help="Output path for the new CSV file of representative transcripts \ and their sampled number" ) parser.add_argument( - "--n_to_sample", required=True, + "--n_to_sample", required=True, default=None, type=int, help="Total number of transcripts to sample" ) args = parser.parse_args() - return args + log = logging.getLogger("main") + start = time.time() + log.info("Started transcript sampler...") + dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf) + df_repr = match_reptrs_explvl.match_repr_transcript_expression_level( + dict_reprTrans=dict_repr_trans, + exprTrans=args.input_csv, + gtf_file=args.input_gtf + ) + log.info( + "Finding match between representative transcripts \ + and expression level file" + ) + log.info("Poisson sampling of transcripts") + poisson_sample.transcript_sampling( + args.n_to_sample, df_repr, args.output_csv) + log.info("Output CSV file ready") + log.info("Writing output GTF file") + find_rep_trans.gtf_file_writer( + args.input_gtf, dict_repr_trans, args.output_gtf) -if __name__ == '__main__': + end = time.time() + 
log.info("Script executed in %s sec", (end - start)) + + +if __name__ == "__main__": logging.basicConfig( format='[%(asctime)s: %(levelname)s] %(message)s \ (module "%(module)s")', level=logging.INFO, - ) - logger = logging.getLogger(__name__) - - arguments = parse_arguments() - main(arguments) + ) + main()