From 32c933a310da5bd8c1e9e76bfa61823829de72a6 Mon Sep 17 00:00:00 2001 From: Maciej Bak <bakma@kb-rgmz01-pdl04.pz.unibas.ch> Date: Mon, 23 Aug 2021 13:22:42 +0200 Subject: [PATCH] refactor: remove everything labkey-related --- README.md | 51 -- install/environment.dev.yml | 1 - scripts/labkey_api.py | 27 - scripts/prepare_inputs.dict.tsv | 51 -- scripts/prepare_inputs.py | 665 ------------------ scripts/requirements.txt | 3 - .../expected_output.md5 | 2 - .../test.sh | 55 -- .../expected_output.md5 | 2 - .../input_table.tsv | 3 - .../test_scripts_prepare_inputs_table/test.sh | 45 -- 11 files changed, 905 deletions(-) delete mode 100755 scripts/labkey_api.py delete mode 100644 scripts/prepare_inputs.dict.tsv delete mode 100755 scripts/prepare_inputs.py delete mode 100644 scripts/requirements.txt delete mode 100644 tests/test_scripts_prepare_inputs_labkey/expected_output.md5 delete mode 100755 tests/test_scripts_prepare_inputs_labkey/test.sh delete mode 100644 tests/test_scripts_prepare_inputs_table/expected_output.md5 delete mode 100644 tests/test_scripts_prepare_inputs_table/input_table.tsv delete mode 100755 tests/test_scripts_prepare_inputs_table/test.sh diff --git a/README.md b/README.md index 6bd20fa..5e3b047 100644 --- a/README.md +++ b/README.md @@ -218,56 +218,6 @@ your run. bash run.sh ``` -### Configuring workflow runs via LabKey tables - -Our lab stores metadata for sequencing samples in a locally deployed -[LabKey][labkey] instance. This repository provides two scripts that give -programmatic access to the LabKey data table and convert it to the -corresponding workflow inputs (`samples.tsv` and `config.yaml`), respectively. -As such, these scripts largely automate step 3. of the above instructions. -However, as these scripts were written specifically for the needs of our lab, -they are likely not directly usable or, at least, will require considerable -modification for other setups (e.g., different LabKey table structure). -Nevertheless, they can serve as an example for interfacing between LabKey and -your workflow. - -> **NOTE:** All of the below steps assume that your current working directory -> is the repository's root directory. - -1. The scripts have additional dependencies that can be installed with: - - ```bash - pip install -r scripts/requirements.txt - ``` - -2. In order to gain programmatic access to LabKey via its API, a credential -file is required. Create it with the following command after replacing the -placeholder values with your real credentials (talk to your LabKey manager if -you do not have these): - - ```bash - cat << EOF | ( umask 0377; cat >> ${HOME}/.netrc; ) - machine <remote-instance-of-labkey-server> - login <user-email> - password <user-password> - EOF - ``` - -3. Generate the workflow configuration with the following command, after -replacing the placeholders with the appropriate values (check out the -help screen with option '--help' for further options and information): - - ```bash - python scripts/prepare_inputs.py \ - --labkey-domain="my.labkey.service.io" - --labkey-domain="/my/project/path" - --input-to-output-mapping="scripts/prepare_inputs.dict.tsv" \ - --resources-dir="/path/to/my/genome/resources" \ - --output-table="config/my_run/samples.tsv" \ - --config_file="config/my_run/config.yaml" \ - <table_name> - ``` - #### Additional information The metadata field names in the LabKey instance and those in the parameters @@ -328,7 +278,6 @@ Contaminant sequences | contaminant_seqs [conda]: <https://docs.conda.io/projects/conda/en/latest/index.html> [profiles]: <https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles> -[labkey]: <https://www.labkey.com/> [miniconda-installation]: <https://docs.conda.io/en/latest/miniconda.html> [rule-graph]: images/rule_graph.svg [snakemake]: <https://snakemake.readthedocs.io/en/stable/> diff --git a/install/environment.dev.yml b/install/environment.dev.yml index 3fa2e72..4ec8e7d 100644 --- a/install/environment.dev.yml +++ b/install/environment.dev.yml @@ -10,5 +10,4 @@ dependencies: - pip: - pandas==1.0.1 - biopython==1.76 - - labkey==1.2.0 diff --git a/scripts/labkey_api.py b/scripts/labkey_api.py deleted file mode 100755 index 759acb4..0000000 --- a/scripts/labkey_api.py +++ /dev/null @@ -1,27 +0,0 @@ -# This script targets the client api version 0.4.0 and later - -# -# Check the page: https://github.com/LabKey/labkey-api-python/blob/master/samples/query_examples.py -# for example about filtering in queries. -# A starting point to investigate further is here: -# https://www.labkey.org/download/clientapi_docs/javascript-api/symbols/LABKEY.Query.Filter.html - -import labkey -import pandas as pd -import sys - -# for convenience, load QueryFilter explicitly (avoids long lines in filter definitions) -from labkey.query import QueryFilter - -if __name__ == "__main__": - # These are values of variables for which the script works - # project_name = "TEST_ABOERSCH" - # query_name = "RNA_Seq_data_template" - project_name = sys.argv[1] - query_name = sys.argv[2] - server_context = labkey.utils.create_server_context('labkey.scicore.unibas.ch', '/Zavolan Group/'+project_name, 'labkey', use_ssl=True) - schema_name = "lists" - results = labkey.query.select_rows(server_context,schema_name,query_name) - table_of_data = pd.DataFrame(results["rows"]) - print(table_of_data) - diff --git a/scripts/prepare_inputs.dict.tsv b/scripts/prepare_inputs.dict.tsv deleted file mode 100644 index ab2f741..0000000 --- a/scripts/prepare_inputs.dict.tsv +++ /dev/null @@ -1,51 +0,0 @@ -labkey snakemake -Entry_Date entry_date -Path_Fastq_Files fastq_path -Condition_Name condition -Sample_Name sample_name -Single_Paired seqmode -Mate1_File fq1 -Mate2_File fq2 -Mate1_Direction mate1_direction -Mate2_Direction mate2_direction -Mate1_5p_Adapter fq1_5p -Mate1_3p_Adapter fq1_3p -Mate2_5p_Adapter fq2_5p -Mate2_3p_Adapter fq2_3p -Fragment_Length_Mean mean -Fragment_Length_SD sd -Quality_Control_Flag quality_control_flag -Checksum_Raw_FASTQ_Mate1 mate1_checksum -Checksum_Raw_FASTQ_Mate2 mate2_checksum -File_Name_Metadata_File metadata -Name_Quality_Control_File_Mate1 mate1_quality -Name_Quality_Control_File_Mate2 mate2_quality -Organism organism -TaxonID taxon_id -Strain_Isolate_Breed_Ecotype strain_name -Strain_Isolate_Breed_Ecotype_ID strain_id -Biomaterial_Provider biomaterial_provider -Source_Tissue_Name source_name -Tissue_Code tissue_code -Additional_Tissue_Description tissue_description -Genotype_Short_Name genotype_name -Genotype_Description genotype_description -Disease_Short_Name disease_name -Disease_Description disease_description -Treatment_Short_Name treatment -Treatment_Description treatment_description -Gender gender -Age age -Developmental_Stage development_stage -Passage_Number passage_number -Sample_Preparation_Date sample_prep_date -Prepared_By prepared_by -Documentation documentation -Protocol_File protocol_file -Sequencing_Date seq_date -Sequencing_Instrument seq_instrument -Library_preparation_kit library_kit -Cycles cycles -Molecule molecule -Contaminant_Sequences contaminant_seqs -BioAnalyzer_File bioanalyser_file diff --git a/scripts/prepare_inputs.py b/scripts/prepare_inputs.py deleted file mode 100755 index bc164aa..0000000 --- a/scripts/prepare_inputs.py +++ /dev/null @@ -1,665 +0,0 @@ -#!/usr/bin/env python3 - -"""Create input table and config for ZARP.""" - -import argparse -from functools import partial -import gzip -import logging -import math -import os -import sys -from typing import Tuple - -from Bio import SeqIO -import labkey -import pandas as pd - -logger = logging.getLogger(__name__) - - -def parse_cli_args() -> argparse.Namespace: - """ - Parses command line arguments. - - :returns: parsed CLI arguments - """ - parser = argparse.ArgumentParser( - description=__doc__, - ) - - parser.add_argument( - "table", - type=str, - default=None, - help="either local file path of input table *or* name of table on " - "LabKey instance (see 'LabKey API' options below)", - metavar="TABLE", - ) - - api = parser.add_argument_group("LabKey API") - api.add_argument( - "--labkey-domain", - type=str, - default=None, - help="domain of LabKey instance to query; required for obtaining " - "input table via LabKey API", - metavar="STR", - ) - api.add_argument( - "--labkey-path", - type=str, - default=None, - help="path to LabKey container that includes specified input table; " - "required for obtaining input table via LabKey API", - metavar="STR", - ) - - io = parser.add_argument_group("input/output") - io.add_argument( - "--input-to-output-mapping", - type=argparse.FileType('r'), - default=os.path.join( - os.path.dirname(__file__), - 'prepare_inputs.dict.tsv', - ), - help="lookup table with mappings from input (LabKey or LabKey-like) " - "to output (Snakemake) table; default: '%(default)s'", - metavar="FILE", - ) - io.add_argument( - "--resources-dir", - type=str, - default=os.getcwd(), - help="path containing the genome resources for all organisms " - "(default: %(default)s)", - metavar="DIR", - ) - io.add_argument( - "--output-table", - type=argparse.FileType('w'), - default="samples.tsv", - help="output sample table for use in ZARP (default: %(default)s)", - metavar="FILE", - ) - io.add_argument( - "--config-file", - type=argparse.FileType('w'), - default="config.yaml", - help="output Snakemake configuration file for use in ZARP (default: " - "%(default)s)", - metavar="FILE", - ) - io.add_argument( - "--output-dir", - type=str, - default=os.getcwd(), - help="directory to which ZARP results and logs are to be written " - "(default: %(default)s)", - metavar="DIR", - ) - parser.add_argument( - "--no-process-paths", - action="store_true", - default=False, - help="do not attempt to create absolute paths in output files", - ) - - behavior = parser.add_argument_group("workflow behavior") - behavior.add_argument( - "--trim-polya", - type=int, - choices=[True, False], - default=True, - help="cutadapt: trim poly(A) tails option (default: %(default)s)", - ) - behavior.add_argument( - "--multimappers", - type=int, - default=100, - help="STAR: number of multimappers to report (default: %(default)s)", - metavar='INT', - ) - behavior.add_argument( - "--soft-clip", - type=str, - default="EndToEnd", - help="STAR: soft-clipping option (default: %(default)s)", - choices=['EndToEnd', 'Local'], - ) - behavior.add_argument( - "--pass-mode", - type=str, - default="None", - help="STAR: 2-pass mode option (default: %(default)s)", - choices=["None", "Basic"], - ) - behavior.add_argument( - "--libtype", - type=str, - default="", - help="Salmon library type (default: %(default)s). Leave empty to infer one of 'SF', 'SR', 'ISF', 'ISR'." - "Warning: If value is provided by user, it will be applied to ALL samples. If the table contains samples from different sequencing modes this might cause errors in zarp.", - metavar="STR", - choices=["", "SF", "SR", "ISF", "ISR", "OSF", "OSR", "MSF", "MSR"] - ) - - report = parser.add_argument_group("report") - report.add_argument( - "--description", - type=str, - default="N/A", - help="short description to be added to the report (default: " - "%(default)s)", - metavar="STR", - ) - report.add_argument( - "--logo", - type=argparse.FileType('r'), - default=None, - help="path to image file to be added to the report (default: " - "%(default)s)", - metavar="FILE", - ) - report.add_argument( - "--url", - type=str, - default="N/A", - help="contact URL to be added to the report (default: %(default)s)", - metavar="STR", - ) - - parser.add_argument( - "-v", "--verbose", - action="store_true", - default=False, - help="print log messages to STDERR", - ) - parser.add_argument( - "--debug", - action="store_true", - default=False, - help="print log and debug messages to STDERR", - ) - - args = parser.parse_args() - - if args.logo: - args.logo.close() - args.logo = args.logo.name - else: - args.logo = "" - - if (args.labkey_domain and not args.labkey_path) or \ - (args.labkey_path and not args.labkey_domain): - parser.print_help() - sys.exit( - "\n[ERROR] Either none or both of '--labkey-domain' and " - "'--labkey-path' are required." - ) - return args - - -def setup_logging( - logger: logging.Logger, - verbose: bool = False, - debug: bool = False, -) -> None: - """ - Configure logger. - - :param logger: the `logging.Logger` object to configure - :param verbose: whether `logging.INFO` messages shall be logged - :param debug: whether `logging.DEBUG` messages shall be logged - - :returns: None - :raises ?: TODO - """ - if debug: - logger.setLevel(logging.DEBUG) - elif verbose: - logger.setLevel(logging.INFO) - else: - logger.setLevel(logging.WARNING) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter( - "[%(asctime)-15s: %(levelname)-8s @ %(funcName)s] %(message)s" - )) - logger.addHandler(handler) - - -def fetch_labkey_table( - domain: str, - container_path: str, - query_name: str, - context_path: str = "labkey", - schema_name: str = "lists", -) -> pd.DataFrame: - """ - Export LabKey table as Pandas data frame. - - :param domain: domain of LabKey instance - :param container_path: path to LabKey container that includes the table of - interest - :param query_name: name of LabKey table to export - :context_path: required by API; usage unclear TODO - :schema_name: required by API; usage unclear TODO - - :returns: Pandas data frame - :raises ?: TODO - """ - server_context = labkey.utils.create_server_context( - domain=domain, - container_path=container_path, - context_path=context_path, - use_ssl=True, - ) - results = labkey.query.select_rows( - server_context=server_context, - schema_name=schema_name, - query_name=query_name, - ) - input_table = pd.DataFrame(results["rows"]) - return input_table - - -def get_read_length(file: str) -> int: - """ - Returns read length of first entry of gzipped FASTQ file. - - :param file: path to gzipped FASTQ file - - :returns: read length - :raises FileNotFoundError: file does not exist - :raises IsADirectoryError: file is a directory - :raises OSError: file is not gzipped - :raises PermissionError: file cannot be read - :raises ValueError: not a valid FASTQ file - """ - with gzip.open(file, "rt") as handle: - return len(next(SeqIO.parse(handle, "fastq"))) - - -def kmer_from_read_length( - length: int, - k_max: int = 31, - k_min: int = 11, -) -> int: - """ - Given a read length, returns appropriate kmer parameter size for Salmon - (https://salmon.readthedocs.io/) or similar k-mer-based quantification - tools. - - References for implementation: - https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode - https://groups.google.com/d/msg/sailfish-users/fphjX7OIGzY/bMBwlCaZAgAJ - - :param length: length of read in nucleotides - :param k_max: maximum allowed k-mer size - :param k_min: minimum allowed k-mer size - - :returns: k_max for l > 2 * k_max, or else the maximum of k and k_min, - where k is biggest odd integer that fulfills k < l / 2 - """ - k = k_max - if length < 2 * k_max + 1: - # ensure kmer is smaller than half of read length - k = math.floor((length - 1) / 2) - # ensure kmer is odd - if not k % 2: - k -= 1 - if k < k_min: - k = k_min - return k - - -def get_libtype(directionality: str, seqmode: str) -> str: - """ - Returns libtype (https://salmon.readthedocs.io/en/latest/library_type.html) given strings indicating the - "directionality", and sequencing mode of a sequencing library, respectively. - - :param directionality: direction in which library was sequenced (one of - "SENSE" and "ANTISENSE") - - :param seqmode: sequencing mode(one of - "pe" and "se") - - :returns: salmon code (one of 'SF', 'SR', 'ISF', 'ISR') for specified directionality; - """ - - if seqmode == "pe": - option = "I" - else: - option = "" - - if directionality == "SENSE": - option += "SF" - elif directionality == "ANTISENSE": - option += "SR" - else: - logger.error( - f"[ERROR] Can't infer library type." - f"Make sure directionality and sequencing mode are specified correctly." - ) - sys.exit("Execution aborted.") - - return option - - -def get_polya_adapter_seqs(directionality: str) -> Tuple[str, str]: - """ - Returns repeat oligomers for detecting and trimming of poly(A) signals from - a sequencing library, given a string indicating the library's - "directionality". - - :param directionality: direction in which library was sequenced (one of - "SENSE" and "ANTISENSE") - - :returns: tuple of two 15-mers to be used to detect and trim poly(A) - signals from the 3' and 5' ends of the reads of sequencing library, - respectively - """ - if directionality == 'SENSE': - three = 'AAAAAAAAAAAAAAA' - five = 'XXXXXXXXXXXXXXX' - elif directionality == 'ANTISENSE': - three = 'XXXXXXXXXXXXXXX' - five = 'TTTTTTTTTTTTTTT' - else: - three = 'XXXXXXXXXXXXXXX' - five = 'XXXXXXXXXXXXXXX' - return (three, five) - - -def expand_path( - *args: str, - anchor: str = os.getcwd(), - expand: bool = True, - no_abs: bool = False, -) -> str: - """ - Constructs absolute path. - - Not tested with symbolic links. - - :param args: path fragments which will be joined to the anchor from left - to right - :param anchor: path relative to which the path fragments in *args shall - be interpreted; can be absolute or relative; in the latter case, it is - interpreted relative to the current working directory; if path - fragments evaluate to absolute path (either before or after expansion), - the path will be returned without considering the anchor - :param expand: whether environment variables and user directories (e.g, - `~`) shall be expanded - :param join_only: path fragments in args are joined, but no further - processing is done - - :returns: absolute path - """ - suffix = os.path.join(*args) - if no_abs: - return suffix - if os.path.isabs(suffix): - return os.path.normpath(suffix) - if expand: - suffix = os.path.expanduser( - os.path.expandvars( - suffix - ) - ) - if os.path.isabs(suffix): - return os.path.normpath(suffix) - anchor = os.path.expanduser( - os.path.expandvars( - anchor - ) - ) - path = os.path.join(anchor, suffix) - return os.path.normpath(path) - - -def main(args): - """ - Create input table and config for ZARP. - """ - setup_logging( - logger=logger, - verbose=args.verbose, - debug=args.debug, - ) - - # get input table from LabKey or CLI - if args.labkey_domain: - logger.info( - f"Fetching input table from LabKey instance " - "'{args.labkey_domain}'..." - ) - input_table = fetch_labkey_table( - domain=args.labkey_domain, - container_path=args.labkey_path, - query_name=args.table, - ) - labkey_table = expand_path( - '.'.join([args.output_table.name, "labkey"]) - ) - input_table.to_csv( - labkey_table, - sep='\t', - index=False, - ) - from_api = True - else: - logger.info(f"Reading input table from file '{args.table}'...") - input_table = pd.read_csv( - args.table, - header=0, - sep='\t', - index_col=None, - comment='#', - engine='python', - ) - from_api = False - - # get LabKey to Snakemake sample table field mappings - input_dict = pd.read_csv( - args.input_to_output_mapping, - header=0, - sep='\t', - index_col=None, - comment='#', - engine='python', - ) - args.input_to_output_mapping.close() - input_dict.set_index('snakemake', inplace=True, drop=True) - - # create Snakemake table - logger.info("Creating Snakemake input table...") - snakemake_table = pd.DataFrame() - - for index, row in input_table.iterrows(): - - # extract data from LabKey-like table - lk_sample_name = row[input_dict.loc['sample_name', 'labkey']] - lk_condition = row[input_dict.loc['condition', 'labkey']] - lk_seqmode = row[input_dict.loc['seqmode', 'labkey']] - lk_fastq_path = row[input_dict.loc['fastq_path', 'labkey']] - lk_fq1 = row[input_dict.loc['fq1', 'labkey']] - lk_fq2 = row[input_dict.loc['fq2', 'labkey']] - lk_fq1_3p = row[input_dict.loc['fq1_3p', 'labkey']] - lk_fq1_5p = row[input_dict.loc['fq1_5p', 'labkey']] - lk_fq2_3p = row[input_dict.loc['fq2_3p', 'labkey']] - lk_fq2_5p = row[input_dict.loc['fq2_5p', 'labkey']] - lk_organism = row[input_dict.loc['organism', 'labkey']] - lk_sd = row[input_dict.loc['sd', 'labkey']] - lk_mean = row[input_dict.loc['mean', 'labkey']] - lk_mate1_direction = row[input_dict.loc['mate1_direction', 'labkey']] - lk_mate2_direction = row[input_dict.loc['mate2_direction', 'labkey']] - - # extract, infer or convert to Snakemake input format - if from_api and not os.path.isabs(lk_fastq_path): - anchor = os.getcwd() - logger.warning( - f"[WARNING] Don't know how to interpret relative paths " - "inside LabKey table. Trying with current working directory " - f"'{anchor}' as an anchor, but it may be better to use" - "absolute paths wherever possible..." - ) - else: - anchor = os.path.abspath(os.path.dirname(args.table)) - sample = "_".join([lk_sample_name, lk_condition]) - if lk_seqmode == 'PAIRED': - seqmode = 'pe' - fq2 = expand_path( - lk_fastq_path, - lk_fq2, - anchor=anchor, - ) - elif lk_seqmode == 'SINGLE': - seqmode = 'se' - fq2 = "XXXXXXXXXXXXXXX" - else: - logger.error( - f"[ERROR] Illegal sequencing mode '{lk_seqmode}' in row " - f"{index+1}." - ) - sys.exit("Execution aborted.") - fq1 = expand_path( - lk_fastq_path, - lk_fq1, - anchor=anchor, - ) - read_length = get_read_length(fq1) - index_size = read_length - 1 - kmer = kmer_from_read_length(read_length) - fq1_3p = lk_fq1_3p - fq1_5p = lk_fq1_5p - fq2_3p = lk_fq2_3p - fq2_5p = lk_fq2_5p - organism = lk_organism.replace(' ', '_').lower() - gtf = expand_path( - args.resources_dir, - organism, - 'annotation.gtf', - ) - genome = expand_path( - args.resources_dir, - organism, - 'genome.fa', - ) - sd = lk_sd - mean = lk_mean - fq1_polya_3p, fq1_polya_5p = get_polya_adapter_seqs(lk_mate1_direction) - fq2_polya_3p, fq2_polya_5p = get_polya_adapter_seqs(lk_mate2_direction) - - # construct row in Snakemake input table - snakemake_table.loc[index, 'sample'] = sample - snakemake_table.loc[index, 'seqmode'] = seqmode - snakemake_table.loc[index, 'fq1'] = fq1 - snakemake_table.loc[index, 'fq2'] = fq2 - snakemake_table.loc[index, 'index_size'] = index_size - snakemake_table.loc[index, 'kmer'] = kmer - snakemake_table.loc[index, 'fq1_3p'] = fq1_3p - snakemake_table.loc[index, 'fq1_5p'] = fq1_5p - snakemake_table.loc[index, 'fq2_3p'] = fq2_3p - snakemake_table.loc[index, 'fq2_5p'] = fq2_5p - snakemake_table.loc[index, 'organism'] = organism - snakemake_table.loc[index, 'gtf'] = gtf - snakemake_table.loc[index, 'genome'] = genome - snakemake_table.loc[index, 'sd'] = sd - snakemake_table.loc[index, 'mean'] = mean - - # add CLI argument-dependent parameters - snakemake_table.loc[index, 'multimappers'] = args.multimappers - snakemake_table.loc[index, 'soft_clip'] = args.soft_clip - snakemake_table.loc[index, 'pass_mode'] = args.pass_mode - - if not args.libtype: - snakemake_table.loc[index, 'libtype'] = get_libtype(lk_mate1_direction, seqmode) - elif args.libtype in ['SF', 'SR', 'ISF', 'ISR', 'OSF', 'OSR', 'MSF', 'MSR']: - snakemake_table.loc[index, 'libtype'] = args.libtype - logger.warning( - f"Library type {args.libtype} set for sample {sample}." - ) - - if args.trim_polya is True: - snakemake_table.loc[index, 'fq1_polya_3p'] = fq1_polya_3p - snakemake_table.loc[index, 'fq1_polya_5p'] = fq1_polya_5p - snakemake_table.loc[index, 'fq2_polya_3p'] = fq2_polya_3p - snakemake_table.loc[index, 'fq2_polya_5p'] = fq2_polya_5p - - # adjust sample table format - snakemake_table.fillna('XXXXXXXXXXXXXXX', inplace=True) - snakemake_table = snakemake_table.astype( - { - "sd": int, - "mean": int, - "multimappers": int, - "kmer": int, - "index_size": int, - } - ) - - # write Snakemake sample table - logger.info("Writing Snakemake input table...") - snakemake_table.to_csv( - args.output_table, - sep='\t', - header=True, - index=False) - args.output_table.close() - - # compile entries for Snakemake config file - logger.info("Creating Snakemake config file...") - results_dir = expand_path( - args.output_dir, - "results", - ) - log_dir = expand_path( - args.output_dir, - "logs", - ) - kallisto_indexes = expand_path( - results_dir, - "kallisto_indexes", - ) - salmon_indexes = expand_path( - results_dir, - "salmon_indexes", - ) - star_indexes = expand_path( - results_dir, - "star_indexes", - ) - alfa_indexes = expand_path( - results_dir, - "alfa_indexes", - ) - - # write Snakemake config file - logger.info("Writing Snakemake config file...") - config_file_content = f'''--- - samples: "{expand_path(args.output_table.name)}" - output_dir: "{results_dir}" - log_dir: "{log_dir}" - kallisto_indexes: "{kallisto_indexes}" - salmon_indexes: "{salmon_indexes}" - star_indexes: "{star_indexes}" - alfa_indexes: "{alfa_indexes}" - report_description: "{args.description}" - report_logo: "{args.logo}" - report_url: "{args.url}" -... -''' - args.config_file.write(config_file_content) - args.config_file.close() - - -if __name__ == '__main__': - args = parse_cli_args() - - # Set default according to CLI arg - expand_path = partial(expand_path, no_abs=args.no_process_paths) # type: ignore - - main(args) - logger.info("Program completed successfully.") - sys.exit(0) diff --git a/scripts/requirements.txt b/scripts/requirements.txt deleted file mode 100644 index ba30bb9..0000000 --- a/scripts/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -biopython==1.76 -labkey==1.2.0 -pandas==0.25.3 diff --git a/tests/test_scripts_prepare_inputs_labkey/expected_output.md5 b/tests/test_scripts_prepare_inputs_labkey/expected_output.md5 deleted file mode 100644 index a94e93e..0000000 --- a/tests/test_scripts_prepare_inputs_labkey/expected_output.md5 +++ /dev/null @@ -1,2 +0,0 @@ -aa583b9bad45eeb520d9d624cca0af78 samples.tsv -c4cda83b069eb7ccb16547e1a9cdb34a config.yaml \ No newline at end of file diff --git a/tests/test_scripts_prepare_inputs_labkey/test.sh b/tests/test_scripts_prepare_inputs_labkey/test.sh deleted file mode 100755 index dea4515..0000000 --- a/tests/test_scripts_prepare_inputs_labkey/test.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -# Scripts requires environment variables 'LABKEY_HOST', 'LABKEY_USER' and -# 'LABKEY_PASS' to be set with the appropriate values - -# Tear down test environment -cleanup () { - rc=$? - rm -rf ${HOME}/.netrc - rm -rf .snakemake/ - rm -rf config.yaml - rm -rf samples.tsv.labkey - rm -rf samples.tsv - cd $user_dir - echo "Exit status: $rc" -} -trap cleanup EXIT - -# Set up test environment -set -eo pipefail # ensures that script exits at first command that exits with non-zero status -set -u # ensures that script exits when unset variables are used -set -x # facilitates debugging by printing out executed commands -user_dir=$PWD -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" -cd $script_dir -cat << EOF | ( umask 0377; cat >> ${HOME}/.netrc; ) -machine ${LABKEY_HOST} -login ${LABKEY_USER} -password ${LABKEY_PASS} -EOF - -# Run tests -python "../../scripts/prepare_inputs.py" \ - --labkey-domain="${LABKEY_HOST}" \ - --labkey-path="/Zavolan Group/TEST_LABKEY" \ - --input-to-output-mapping="../../scripts/prepare_inputs.dict.tsv" \ - --resources-dir="../input_files" \ - --output-table="samples.tsv" \ - --config-file="config.yaml" \ - --multimappers='10' \ - --logo="../../images/logo.128px.png" \ - --debug \ - "RNA_Seq_data_template_raw" - -# Check if dry run completes -snakemake \ - --snakefile="../../Snakefile" \ - --configfile="config.yaml" \ - --dryrun \ - --verbose - -#md5sum --check "expected_output.md5" -# MD5 sums obtained with command: -# md5sum config.yaml samples.tsv > expected_output.md5 -md5sum config.yaml samples.tsv diff --git a/tests/test_scripts_prepare_inputs_table/expected_output.md5 b/tests/test_scripts_prepare_inputs_table/expected_output.md5 deleted file mode 100644 index aca34c0..0000000 --- a/tests/test_scripts_prepare_inputs_table/expected_output.md5 +++ /dev/null @@ -1,2 +0,0 @@ -40bd0f0fcecdd0d9bc932f63c2811478 config.yaml -d8fb1773e3b83b6fab0a0d44c9fa71e6 samples.tsv \ No newline at end of file diff --git a/tests/test_scripts_prepare_inputs_table/input_table.tsv b/tests/test_scripts_prepare_inputs_table/input_table.tsv deleted file mode 100644 index 10b0244..0000000 --- a/tests/test_scripts_prepare_inputs_table/input_table.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Mate2_5p_Adapter Condition_Name Name_Quality_Control_File_Mate1 Disease_Short_Name Single_Paired Gender Entry_Date Disease_Description Strain_Isolate_Breed_Ecotype Genotype_Description Mate1_File Source_Tissue_Name Developmental_Stage Mate1_Direction Quality_Control_Flag Genotype_Short_Name Strain_Isolate_Breed_Ecotype_ID Fragment_Length_Mean Organism Contaminant_Sequences TaxonID Documentation Prepared_By _labkeyurl_Entry_Date Molecule Mate2_Direction Library_preparation_kit Checksum_Raw_FASTQ_Mate1 Cycles Fragment_Length_SD Sample_Name Passage_Number Mate1_5p_Adapter Mate2_3p_Adapter Path_Fastq_Files Mate1_3p_Adapter Treatment_Short_Name Age Sequencing_Date Checksum_Raw_FASTQ_Mate2 Biomaterial_Provider Treatment_Description Sample_Preparation_Date BioAnalyzer_File Sequencing_Instrument Additional_Tissue_Description Protocol_File Name_Quality_Control_File_Mate2 Tissue_Code File_Name_Metadata_File Mate2_File - synthetic_10_reads_paired xxx xxx PAIRED xxx Fri Dec 20 00:00:00 CET 2019 xxx xxx xxx synthetic.mate_1.fastq.gz xxx xxx SENSE xxx xxx xxx 250.0 Homo sapiens xxx 9606 xxx xxx /labkey/Zavolan%20Group/Test_labkey/list-details.view?listId=9&pk=../input_files/project1 xxx ANTISENSE xxx xxx xxx 100.0 synthetic_10_reads_paired xxx AGATCGGAAGAGCGT ../input_files/project1 AGATCGGAAGAGCACA xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx synthetic.mate_2.fastq.gz - synthetic_10_reads_mate_1 xxx xxx SINGLE xxx Fri Dec 20 00:00:00 CET 2019 xxx xxx xxx synthetic.mate_1.fastq.gz xxx xxx SENSE xxx xxx xxx 250.0 Homo sapiens xxx 9606 xxx xxx /labkey/Zavolan%20Group/Test_labkey/list-details.view?listId=9&pk=../input_files/project2 xxx xxx xxx xxx 100.0 synthetic_10_reads_mate_1 xxx ../input_files/project2 AGATCGGAAGAGCACA xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/test_scripts_prepare_inputs_table/test.sh b/tests/test_scripts_prepare_inputs_table/test.sh deleted file mode 100755 index 3c57024..0000000 --- a/tests/test_scripts_prepare_inputs_table/test.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -# Tear down test environment -cleanup () { - rc=$? - rm -rf .snakemake/ - rm -rf config.yaml - rm -rf samples.tsv - rm -rf logs - cd $user_dir - echo "Exit status: $rc" -} -trap cleanup EXIT - -# Set up test environment -set -eo pipefail # ensures that script exits at first command that exits with non-zero status -set -u # ensures that script exits when unset variables are used -set -x # facilitates debugging by printing out executed commands -user_dir=$PWD -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" -cd $script_dir/ - -# Run tests -python "../../scripts/prepare_inputs.py" \ - --input-to-output-mapping="../../scripts/prepare_inputs.dict.tsv" \ - --resources-dir="../input_files" \ - --output-table="samples.tsv" \ - --config-file="config.yaml" \ - --multimappers='10' \ - --logo="../../images/logo.128px.png" \ - --output-dir="" \ - --no-process-paths \ - "input_table.tsv" - - -# Check if dry run completes -snakemake \ - --snakefile="../../workflow/Snakefile" \ - --configfile="config.yaml" \ - --dryrun \ - --verbose - -md5sum --check "expected_output.md5" -# MD5 sums obtained with command: -# md5sum config.yaml samples.tsv > expected_output.md5 -- GitLab