From d35d5500dac8d03419508931b4537002e756037a Mon Sep 17 00:00:00 2001 From: BIOPZ-Herrmann Christina <christina.herrmann@unibas.ch> Date: Thu, 15 Apr 2021 18:03:11 +0000 Subject: [PATCH] feat: enable user to configure CLI params per rule --- Snakefile | 494 ++++++++++++++++---- pipeline_documentation.md | 108 ++--- tests/input_files/config.mutliple_lanes.yml | 3 +- tests/input_files/config.yaml | 3 +- tests/input_files/rule_config.yaml | 148 ++++++ workflow/rules/paired_end.snakefile.smk | 128 +++-- workflow/rules/single_end.snakefile.smk | 119 ++++- workflow/scripts/zarp_multiqc_config.py | 6 +- 8 files changed, 810 insertions(+), 199 deletions(-) create mode 100644 tests/input_files/rule_config.yaml diff --git a/Snakefile b/Snakefile index f0087cb..2f5842a 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,11 @@ import os import pandas as pd import shutil +import yaml +from shlex import quote +from typing import Tuple +## Preparations # Get sample table samples_table = pd.read_csv( config['samples'], @@ -13,8 +17,33 @@ samples_table = pd.read_csv( sep="\t", ) +# Parse YAML rule config file +if 'rule_config' in config and config['rule_config']: + try: + with open(config['rule_config']) as _file: + rule_config = yaml.safe_load(_file) + logger.info(f"Loaded rule_config from {config['rule_config']}.") + except FileNotFoundError: + logger.error(f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! ") + raise +else: + rule_config = {} + logger.warning(f"No rule config specified: using default values for all tools.") + +# Create dir for cluster logs, if applicable +if cluster_config: + os.makedirs( + os.path.join( + os.getcwd(), + os.path.dirname(cluster_config['__default__']['out']), + ), + exist_ok=True) + + +## Function definitions def get_sample(column_id, search_id=None, search_value=None): + """ Get relevant per sample information from samples table""" if search_id: if search_id == 'index': return str(samples_table[column_id][samples_table.index == search_value][0]) @@ -22,22 +51,59 @@ def get_sample(column_id, search_id=None, search_value=None): return str(samples_table[column_id][samples_table[search_id] == search_value][0]) else: return str(samples_table[column_id][0]) -# Global config -localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config -if cluster_config: - os.makedirs( - os.path.join( - os.getcwd(), - os.path.dirname(cluster_config['__default__']['out']), - ), - exist_ok=True) +def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()): + """Get rule specific parameters from rule_config file""" + + # If rule config file not present, emtpy string will be returned + if not rule_config: + logger.info(f"No rule config specified: using default values for all tools.") + return '' + # Same if current rule not specified in rule config + if current_rule not in rule_config or not rule_config[current_rule]: + logger.info(f"No additional parameters for rule {current_rule} specified: using default settings.") + return '' + + # Subset only section for current rule + rule_config = rule_config[current_rule] + + # Build list of parameters and values + params_vals = [] + for param, val in rule_config.items(): + # Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table + if param in immutable: + raise ValueError( + f"The following parameter in rule {current_rule} is critical for the pipeline to " + f"function as expected and cannot be modified: {param}" + ) + # Accept only strings; this prevents unintended results potentially + # arising from users entering reserved YAML keywords or nested + # structures (lists, dictionaries) + if isinstance(val, str): + params_vals.append(str(param)) + # Do not include a value for flags (signified by empty strings) + if val: + params_vals.append(val) + else: + raise ValueError( + "Only string values allowed for tool parameters: Found type " + f"'{type(val).__name__}' for value of parameter '{param}'" + ) + # Return quoted string + add_params = ' '.join(quote(item) for item in params_vals) + logger.info(f"User specified additional parameters for rule {current_rule}:\n {add_params}") + return add_params + + +# Global config +localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config # Include subworkflows include: os.path.join("workflow", "rules", "paired_end.snakefile.smk") include: os.path.join("workflow", "rules", "single_end.snakefile.smk") + rule finish: """ Rule for collecting outputs @@ -80,6 +146,8 @@ rule finish: "summary_kallisto", "genes_tpm.tsv") + +current_rule = 'start' rule start: ''' Get samples @@ -104,12 +172,12 @@ rule start: config["log_dir"], "samples", "{sample}", - "start_{sample}.{mate}.stderr.log"), + current_rule + "_{sample}.{mate}.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "start_{sample}.{mate}.stdout.log") + current_rule + "_{sample}.{mate}.stdout.log") singularity: "docker://bash:5.0.16" @@ -119,6 +187,7 @@ rule start: 1> {log.stdout} 2> {log.stderr} " +current_rule = 'fastqc' rule fastqc: ''' A quality control tool for high throughput sequence data @@ -139,6 +208,15 @@ rule fastqc: "{sample}", "fastqc", "{mate}")) + + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--outdir', + ) + ) threads: 2 @@ -150,19 +228,23 @@ rule fastqc: config["log_dir"], "samples", "{sample}", - "fastqc_{mate}.stderr.log"), + current_rule + "_{mate}.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "fastqc_{mate}.stdout.log") + current_rule + "_{mate}.stdout.log") shell: "(mkdir -p {output.outdir}; \ - fastqc --outdir {output.outdir} --threads {threads} {input.reads}) \ + fastqc --outdir {output.outdir} \ + --threads {threads} \ + {params.additional_params} \ + {input.reads}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'create_index_star' rule create_index_star: """ Create index for STAR alignments @@ -205,7 +287,19 @@ rule create_index_star: "{organism}", "{index_size}", "STAR_index/STAR_"), - sjdbOverhang = "{index_size}" + sjdbOverhang = "{index_size}", + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--runMode', + '--sjdbOverhang', + '--genomeDir', + '--genomeFastaFiles', + '--outFileNamePrefix', + '--sjdbGTFfile', + ) + ) singularity: "docker://zavolab/star:2.7.3a-slim" @@ -215,10 +309,10 @@ rule create_index_star: log: stderr = os.path.join( config['log_dir'], - "{organism}_{index_size}_create_index_star.stderr.log"), + current_rule + "_{organism}_{index_size}.stderr.log"), stdout = os.path.join( config['log_dir'], - "{organism}_{index_size}_create_index_star.stdout.log") + current_rule + "_{organism}_{index_size}.stdout.log") shell: "(mkdir -p {params.output_dir}; \ @@ -231,9 +325,11 @@ rule create_index_star: --runThreadN {threads} \ --outFileNamePrefix {params.outFileNamePrefix} \ --sjdbGTFfile {input.gtf}) \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'extract_transcriptome' rule extract_transcriptome: """ Create transcriptome from genome and gene annotations @@ -256,13 +352,23 @@ rule extract_transcriptome: "{organism}", "transcriptome.fa")) + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-w', + '-g', + ) + ) + log: stderr = os.path.join( config['log_dir'], - "{organism}_extract_transcriptome.log"), + current_rule + "_{organism}.log"), stdout = os.path.join( config['log_dir'], - "{organism}_extract_transcriptome.log") + current_rule + "_{organism}.log") singularity: "docker://zavolab/gffread:0.11.7-slim" @@ -270,10 +376,13 @@ rule extract_transcriptome: shell: "(gffread \ -w {output.transcriptome} \ - -g {input.genome} {input.gtf}) \ + -g {input.genome} \ + {params.additional_params} \ + {input.gtf}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'concatenate_transcriptome_and_genome' rule concatenate_transcriptome_and_genome: """ Concatenate genome and transcriptome @@ -304,7 +413,7 @@ rule concatenate_transcriptome_and_genome: log: stderr = os.path.join( config['log_dir'], - "{organism}_concatenate_transcriptome_and_genome.stderr.log") + current_rule + "_{organism}.stderr.log") shell: "(cat {input.transcriptome} {input.genome} \ @@ -312,6 +421,7 @@ rule concatenate_transcriptome_and_genome: 2> {log.stderr}" +current_rule = 'create_index_salmon' rule create_index_salmon: """ Create index for Salmon quantification @@ -339,7 +449,17 @@ rule create_index_salmon: "salmon.idx")) params: - kmerLen = "{kmer}" + kmerLen = "{kmer}", + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--transcripts', + '--decoys', + '--index', + '--kmerLen', + ) + ) singularity: "docker://zavolab/salmon:1.1.0-slim" @@ -347,10 +467,10 @@ rule create_index_salmon: log: stderr = os.path.join( config['log_dir'], - "{organism}_{kmer}_create_index_salmon.stderr.log"), + current_rule + "_{organism}_{kmer}.stderr.log"), stdout = os.path.join( config['log_dir'], - "{organism}_{kmer}_create_index_salmon.stdout.log") + current_rule + "_{organism}_{kmer}.stdout.log") threads: 8 @@ -361,9 +481,11 @@ rule create_index_salmon: --index {output.index} \ --kmerLen {params.kmerLen} \ --threads {threads}) \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'create_index_kallisto' rule create_index_kallisto: """ Create index for Kallisto quantification @@ -384,7 +506,14 @@ rule create_index_kallisto: params: output_dir = os.path.join( config['kallisto_indexes'], - "{organism}") + "{organism}"), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-i', + ) + ) singularity: "docker://zavolab/kallisto:0.46.1-slim" @@ -392,18 +521,22 @@ rule create_index_kallisto: log: stderr = os.path.join( config['log_dir'], - "{organism}_create_index_kallisto.stderr.log"), + current_rule + "_{organism}.stderr.log"), stdout = os.path.join( config['log_dir'], - "{organism}_create_index_kallisto.stdout.log") + current_rule + "_{organism}.stdout.log") shell: "(mkdir -p {params.output_dir}; \ chmod -R 777 {params.output_dir}; \ - kallisto index -i {output.index} {input.transcriptome}) \ + kallisto index \ + {params.additional_params} \ + -i {output.index} \ + {input.transcriptome}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'extract_transcripts_as_bed12' rule extract_transcripts_as_bed12: """ Convert transcripts to BED12 format @@ -417,6 +550,16 @@ rule extract_transcripts_as_bed12: config['output_dir'], "full_transcripts_protein_coding.bed")) + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--gtf', + '--bed12', + ) + ) + singularity: "docker://zavolab/zgtf:0.1" @@ -425,18 +568,20 @@ rule extract_transcripts_as_bed12: log: stdout = os.path.join( config['log_dir'], - "extract_transcripts_as_bed12.stdout.log"), + current_rule + ".stdout.log"), stderr = os.path.join( config['log_dir'], - "extract_transcripts_as_bed12.stderr.log") + current_rule + ".stderr.log") shell: "(gtf2bed12 \ --gtf {input.gtf} \ --bed12 {output.bed12}); \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'index_genomic_alignment_samtools' rule index_genomic_alignment_samtools: ''' Index genome bamfile using samtools @@ -456,6 +601,13 @@ rule index_genomic_alignment_samtools: "map_genome", "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai") + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=() + ) + singularity: "docker://zavolab/samtools:1.10-slim" @@ -466,18 +618,21 @@ rule index_genomic_alignment_samtools: config["log_dir"], "samples", "{sample}", - "index_genomic_alignment_samtools.{seqmode}.stderr.log"), + current_rule + ".{seqmode}.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "index_genomic_alignment_samtools.{seqmode}.stdout.log") + current_rule + ".{seqmode}.stdout.log") shell: - "(samtools index {input.bam} {output.bai};) \ + "(samtools index \ + {params.additional_params} \ + {input.bam} {output.bai};) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'calculate_TIN_scores' rule calculate_TIN_scores: """ Calculate transcript integrity (TIN) score @@ -522,14 +677,23 @@ rule calculate_TIN_scores: "TIN_score.tsv")) params: - sample = "{sample}" + sample = "{sample}", + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-i', + '-r', + '--names', + ) + ) log: stderr = os.path.join( config['log_dir'], "samples", "{sample}", - "calculate_TIN_scores.log") + current_rule + ".log") threads: 8 @@ -540,11 +704,12 @@ rule calculate_TIN_scores: "(tin_score_calculation.py \ -i {input.bam} \ -r {input.transcripts_bed12} \ - -c 0 \ --names {params.sample} \ + {params.additional_params} \ > {output.TIN_score};) 2> {log.stderr}" +current_rule = 'salmon_quantmerge_genes' rule salmon_quantmerge_genes: ''' Merge gene quantifications into a single file @@ -589,15 +754,27 @@ rule salmon_quantmerge_genes: sample_name_list = expand( "{sample}", sample=pd.unique(samples_table.index.values)), - salmon_merge_on = "{salmon_merge_on}" + salmon_merge_on = "{salmon_merge_on}", + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--quants', + '--genes', + '--transcripts', + '--names', + '--column', + '--output', + ) + ) log: stderr = os.path.join( config["log_dir"], - "salmon_quantmerge_genes_{salmon_merge_on}.stderr.log"), + current_rule + "_{salmon_merge_on}.stderr.log"), stdout = os.path.join( config["log_dir"], - "salmon_quantmerge_genes_{salmon_merge_on}.stdout.log") + current_rule + "_{salmon_merge_on}.stdout.log") threads: 1 @@ -611,9 +788,11 @@ rule salmon_quantmerge_genes: --names {params.sample_name_list} \ --column {params.salmon_merge_on} \ --output {output.salmon_out};) \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'salmon_quantmerge_transcripts' rule salmon_quantmerge_transcripts: ''' Merge transcript quantifications into a single file @@ -655,19 +834,30 @@ rule salmon_quantmerge_transcripts: search_id='index', search_value=i) for i in pd.unique(samples_table.index.values)]), - sample_name_list = expand( "{sample}", sample=pd.unique(samples_table.index.values)), - salmon_merge_on = "{salmon_merge_on}" + salmon_merge_on = "{salmon_merge_on}", + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--quants', + '--genes', + '--transcripts', + '--names', + '--column', + '--output', + ) + ) log: stderr = os.path.join( config["log_dir"], - "salmon_quantmerge_transcripts_{salmon_merge_on}.stderr.log"), + current_rule + "_{salmon_merge_on}.stderr.log"), stdout = os.path.join( config["log_dir"], - "salmon_quantmerge_transcripts_{salmon_merge_on}.stdout.log") + current_rule + "_{salmon_merge_on}.stdout.log") threads: 1 @@ -680,9 +870,11 @@ rule salmon_quantmerge_transcripts: --names {params.sample_name_list} \ --column {params.salmon_merge_on} \ --output {output.salmon_out}) \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule= 'kallisto_merge_genes' rule kallisto_merge_genes: ''' Merge gene quantifications into single file @@ -729,14 +921,25 @@ rule kallisto_merge_genes: sample_name_list = ','.join(expand( "{sample}", sample=pd.unique(samples_table.index.values))), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--input', + '--names', + '--txOut', + '--anno', + '--output', + ) + ) log: stderr = os.path.join( config["log_dir"], - "kallisto_merge_genes.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], - "kallisto_merge_genes.stdout.log") + current_rule + ".stdout.log") threads: 1 @@ -750,10 +953,11 @@ rule kallisto_merge_genes: --txOut FALSE \ --anno {input.gtf} \ --output {params.dir_out} \ - --verbose) \ + {params.additional_params} ) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'kallisto_merge_transcripts' rule kallisto_merge_transcripts: ''' Merge transcript quantifications into a single files @@ -799,14 +1003,24 @@ rule kallisto_merge_transcripts: sample_name_list = ','.join(expand( "{sample}", sample=pd.unique(samples_table.index.values))), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--input', + '--names', + '--txOut', + '--output', + ) + ) log: stderr = os.path.join( config["log_dir"], - "kallisto_merge_transcripts.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], - "kallisto_merge_transcripts.stdout.log") + current_rule + ".stdout.log") threads: 1 @@ -818,10 +1032,11 @@ rule kallisto_merge_transcripts: --input {params.tables} \ --names {params.sample_name_list} \ --output {params.dir_out} \ - --verbose) \ + {params.additional_params}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'pca_salmon' rule pca_salmon: input: tpm = os.path.join( @@ -836,13 +1051,23 @@ rule pca_salmon: "zpca", "pca_salmon_{molecule}")) + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--tpm', + '--out', + ) + ) + log: stderr = os.path.join( config["log_dir"], - "pca_salmon_{molecule}.stderr.log"), + current_rule + "_{molecule}.stderr.log"), stdout = os.path.join( config["log_dir"], - "pca_salmon_{molecule}.stdout.log") + current_rule + "_{molecule}.stdout.log") threads: 1 @@ -853,10 +1078,11 @@ rule pca_salmon: "(zpca-tpm \ --tpm {input.tpm} \ --out {output.out} \ - --verbose) \ + {params.additional_params}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'pca_kallisto' rule pca_kallisto: input: tpm = os.path.join( @@ -871,13 +1097,23 @@ rule pca_kallisto: "zpca", "pca_kallisto_{molecule}")) + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--tpm', + '--out', + ) + ) + log: stderr = os.path.join( config["log_dir"], - "pca_kallisto_{molecule}.stderr.log"), + current_rule + "_{molecule}.stderr.log"), stdout = os.path.join( config["log_dir"], - "pca_kallisto_{molecule}.stdout.log") + current_rule + "_{molecule}.stdout.log") threads: 1 @@ -888,10 +1124,11 @@ rule pca_kallisto: "(zpca-tpm \ --tpm {input.tpm} \ --out {output.out} \ - --verbose) \ + {params.additional_params}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'star_rpm' rule star_rpm: ''' Create stranded bedgraph coverage with STARs RPM normalisation @@ -958,7 +1195,17 @@ rule star_rpm: prefix = lambda wildcards, output: os.path.join( os.path.dirname(output.str1), - str(wildcards.sample) + "_") + str(wildcards.sample) + "_"), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--runMode', + '--inputBAMfile', + '--outWigType', + '--outFileNamePrefix', + ) + ) singularity: "docker://zavolab/star:2.7.3a-slim" @@ -968,12 +1215,12 @@ rule star_rpm: config["log_dir"], "samples", "{sample}", - "star_rpm.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "star_rpm.stdout.log") + current_rule + ".stdout.log") threads: 4 @@ -986,9 +1233,11 @@ rule star_rpm: --inputBAMfile {input.bam} \ --outWigType bedGraph \ --outFileNamePrefix {params.prefix}) \ + {params.additional_params} \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'rename_star_rpm_for_alfa' rule rename_star_rpm_for_alfa: input: plus = lambda wildcards: @@ -1041,12 +1290,12 @@ rule rename_star_rpm_for_alfa: config["log_dir"], "samples", "{sample}", - "rename_star_rpm_for_alfa__{unique}.stderr.log"), + current_rule + "_{unique}.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "rename_star_rpm_for_alfa__{unique}.stdout.log") + current_rule + "_{unique}.stdout.log") singularity: "docker://bash:5.0.16" @@ -1057,6 +1306,7 @@ rule rename_star_rpm_for_alfa: 1>{log.stdout} 2>{log.stderr}" +current_rule = 'generate_alfa_index' rule generate_alfa_index: ''' Generate ALFA index files from sorted GTF file ''' input: @@ -1065,7 +1315,6 @@ rule generate_alfa_index: 'gtf', search_id='organism', search_value=wildcards.organism)), - chr_len = os.path.join( config["star_indexes"], "{organism}", @@ -1090,7 +1339,17 @@ rule generate_alfa_index: params: genome_index = "sorted_genes", out_dir = lambda wildcards, output: - os.path.dirname(output.index_stranded) + os.path.dirname(output.index_stranded), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-a', + '-g', + '--chr_len', + '-o', + ) + ) threads: 4 @@ -1100,16 +1359,19 @@ rule generate_alfa_index: log: os.path.join( config["log_dir"], - "{organism}_{index_size}_generate_alfa_index.log") + current_rule + "_{organism}_{index_size}.log") shell: "(alfa -a {input.gtf} \ -g {params.genome_index} \ --chr_len {input.chr_len} \ -p {threads} \ - -o {params.out_dir}) &> {log}" + -o {params.out_dir} \ + {params.additional_params}) \ + &> {log}" +current_rule = 'alfa_qc' rule alfa_qc: ''' Run ALFA from stranded bedgraph files @@ -1169,21 +1431,30 @@ rule alfa_qc: params: out_dir = lambda wildcards, output: os.path.dirname(output.biotypes), + genome_index = lambda wildcards, input: + os.path.abspath( + os.path.join( + os.path.dirname(input.gtf), + "sorted_genes")), plus = lambda wildcards, input: os.path.basename(input.plus), minus = lambda wildcards, input: os.path.basename(input.minus), + name = "{sample}", alfa_orientation = lambda wildcards: get_sample( 'alfa_directionality', search_id='index', search_value=wildcards.sample), - genome_index = lambda wildcards, input: - os.path.abspath( - os.path.join( - os.path.dirname(input.gtf), - "sorted_genes")), - name = "{sample}" + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-g', + '--bedgraph', + '-s', + ) + ) singularity: "docker://zavolab/alfa:1.1.1-slim" @@ -1193,16 +1464,19 @@ rule alfa_qc: config["log_dir"], "samples", "{sample}", - "alfa_qc.{unique}.log") + current_rule + ".{unique}.log") shell: "(cd {params.out_dir}; \ alfa \ -g {params.genome_index} \ --bedgraph {params.plus} {params.minus} {params.name} \ - -s {params.alfa_orientation}) &> {log}" + -s {params.alfa_orientation} \ + {params.additional_params}) \ + &> {log}" +current_rule = 'prepare_multiqc_config' rule prepare_multiqc_config: ''' Prepare config for the MultiQC @@ -1222,25 +1496,36 @@ rule prepare_multiqc_config: params: logo_path = config['report_logo'], multiqc_intro_text = config['report_description'], - url = config['report_url'] + url = config['report_url'], + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--config', + '--intro-text', + '--custom-logo', + '--url', + ) + ) log: stderr = os.path.join( config["log_dir"], - "prepare_multiqc_config.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], - "prepare_multiqc_config.stdout.log") + current_rule + ".stdout.log") shell: "(python {input.script} \ --config {output.multiqc_config} \ --intro-text '{params.multiqc_intro_text}' \ --custom-logo {params.logo_path} \ - --url '{params.url}') \ + --url '{params.url}' \ + {params.additional_params}) \ 1> {log.stdout} 2> {log.stderr}" - +current_rule = 'multiqc_report' rule multiqc_report: ''' Create report with MultiQC @@ -1326,15 +1611,23 @@ rule multiqc_report: params: results_dir = os.path.join( config["output_dir"]), - log_dir = config["log_dir"] + log_dir = config["log_dir"], + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--outdir', + '--config', + ) + ) log: stderr = os.path.join( config["log_dir"], - "multiqc_report.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], - "multiqc_report.stdout.log") + current_rule + ".stdout.log") singularity: "docker://zavolab/multiqc-plugins:1.0.0" @@ -1343,11 +1636,12 @@ rule multiqc_report: "(multiqc \ --outdir {output.multiqc_report} \ --config {input.multiqc_config} \ + {params.additional_params} \ {params.results_dir} \ {params.log_dir};) \ 1> {log.stdout} 2> {log.stderr}" - +current_rule = 'sort_bed_4_big' rule sort_bed_4_big: ''' sort bedGraphs in order to work with bedGraphtobigWig @@ -1370,6 +1664,15 @@ rule sort_bed_4_big: "{unique}", "{sample}_{unique}_{strand}.sorted.bg")) + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-i', + ) + ) + singularity: "docker://cjh4zavolab/bedtools:2.27" @@ -1378,13 +1681,16 @@ rule sort_bed_4_big: config["log_dir"], "samples", "{sample}", - "sort_bg_{unique}_{strand}.stderr.log") + current_rule + "_{unique}_{strand}.stderr.log") shell: "(sortBed \ - -i {input.bg} \ - > {output.sorted_bg};) 2> {log.stderr}" + -i {input.bg} \ + {params.additional_params} \ + > {output.sorted_bg};) 2> {log.stderr}" + +current_rule = 'prepare_bigWig' rule prepare_bigWig: ''' bedGraphtobigWig, for viewing in genome browsers @@ -1420,6 +1726,13 @@ rule prepare_bigWig: "{unique}", "{sample}_{unique}_{strand}.bw") + params: + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=() + ) + singularity: "docker://zavolab/bedgraphtobigwig:4-slim" @@ -1428,17 +1741,18 @@ rule prepare_bigWig: config["log_dir"], "samples", "{sample}", - "bigwig_{unique}_{strand}.stderr.log"), + current_rule + "_{unique}_{strand}.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "bigwig_{unique}_{strand}.stdout.log") + current_rule + "_{unique}_{strand}.stdout.log") shell: "(bedGraphToBigWig \ - {input.sorted_bg} \ - {input.chr_sizes} \ - {output.bigWig};) \ - 1> {log.stdout} 2> {log.stderr}" + {params.additional_params} \ + {input.sorted_bg} \ + {input.chr_sizes} \ + {output.bigWig};) \ + 1> {log.stdout} 2> {log.stderr}" diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 04a62fc..5f96d48 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -104,7 +104,7 @@ sample | Descriptive sample name | `str` seqmode | Required for various steps of the workflow. One of `pe` (for paired-end libraries) or `se` (for single-end libraries). | `str` fq1 | Path of library file in `.fastq.gz` format (or mate 1 read file for paired-end libraries) | `str` index_size | Required for [STAR](#third-party-software-used). Ideally the maximum read length minus 1 (`max(ReadLength)-1`). Values lower than maximum read length may result in lower mapping accuracy, while higher values may result in longer processing times. | `int` -kmer | Required for [Salmon](#third-party-software-used). Default value of 31 usually works fine for reads of 75 bp or longer. Consider using lower values of poor mapping is observed. | `int` +kmer | Required for [Salmon](#third-party-software-used). Default value of 31 usually works fine for reads of 75 bp or longer. Consider using lower values if poor mapping is observed. | `int` fq2 | Path of mate 2 read file in `.fastq.gz` format. Value ignored for for single-end libraries. | `str` fq1_3p | Required for [Cutadapt](#third-party-software-used). 3' adapter of mate 1. Use value such as `XXXXXXXXXXXXXXX` if no adapter present or if no trimming is desired. | `str` fq1_5p | Required for [Cutadapt](#third-party-software-used). 5' adapter of mate 1. Use value such as `XXXXXXXXXXXXXXX` if no adapter present or if no trimming is desired. | `str` @@ -156,9 +156,8 @@ Create index for [**STAR**](#third-party-software-used) short read aligner. - Genome sequence file (`.fasta`) - Gene annotation file (`.gtf`) - **Parameters** - - `--sjdbOverhang`: maximum read length - 1; lower values may reduce accuracy, - higher values may increase STAR runtime; specify in sample table column - `index_size` + - **samples.tsv** + - `--sjdbOverhang`: maximum read length - 1; lower values may reduce accuracy, higher values may increase STAR runtime; specify in sample table column `index_size` - **Output** - STAR index; used in [**map_genome_star**](#map_genome_star) - Index includes files: @@ -215,7 +214,8 @@ Create index for [**Salmon**](#third-party-software-used) quantification. - Chromosome name list `chrName.txt`; from [**create_index_star**](#create_index_star) - **Parameters** - - `--kmerLen`: k-mer length; specify in sample table column `kmer` + - **samples.tsv** + - `--kmerLen`: k-mer length; specify in sample table column `kmer` - **Output** - Salmon index; used in [**quantification_salmon**](#quantification_salmon) @@ -354,11 +354,13 @@ Calculates the Transcript Integrity Number (TIN) for each transcript with [**index_genomic_alignment_samtools**](#index_genomic_alignment_samtools) - Transcript annotations file (12-column `.bed`); from [**extract_transcripts_as_bed12**](#extract_transcripts_as_bed12) +- **Parameters** + - **rule_config.yaml** + - `-c 0`: minimum number of read mapped to a transcript (default 10) - **Output** - TIN score table (custom `tsv`); used in [**merge_TIN_scores**](#merge_tin_scores) -- **Non-configurable & non-default** - - `-c 0`: minimum number of read mapped to a transcript + #### `salmon_quantmerge_genes` @@ -470,8 +472,8 @@ Annotate alignments with [**ALFA**](#third-party-software-used). [**rename_star_rpm_for_alfa**](#rename_star_rpm_for_alfa) - ALFA index, stranded; from [**generate_alfa_index**](#generate_alfa_index) - **Parameters** - - `-s`: library orientation; specified by user in sample table column - `kallisto_directionality` + - **samples.tsv** + - `-s`: library orientation; specified by user in sample table column `kallisto_directionality` - **Output** - Figures for biotypes and feature categories (`.pdf`) - Feature counts table (custom `.tsv`); used in @@ -486,6 +488,11 @@ Prepare config file for [**MultiQC**](#third-party-software-used). - **Input** - Directories created during [**prepare_files_for_report**](#prepare_files_for_report) +- **Parameters** + All parameters for this rule have to be specified in main `config.yaml` + - `--intro-text` + - `--custom-logo` + - `--url` - **Output** - Config file (`.yaml`); used in [**multiqc_report**](#multiqc_report) @@ -529,6 +536,8 @@ Target rule as required by [Snakemake][docs-snakemake-target-rule]. - Coverage files, one per strand and sample (`.bw`); used in [**prepare_bigWig**](#prepare_bigwig) + + ### Sequencing mode-specific > Steps described here have two variants, one with the specified names for @@ -544,16 +553,19 @@ Remove adapter sequences from reads with - **Input** - Reads file (`.fastq.gz`); from [**start**](#start) - **Parameters** - - Adapters to be removed; specify in sample table columns `fq1_3p`, `fq1_5p`, + - **samples.tsv** + - Adapters to be removed; specify in sample table columns `fq1_3p`, `fq1_5p`, `fq2_3p`, `fq2_5p` + - **rule_config.yaml:** + - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) + - `-n 2`: search for all the given adapter sequences repeatedly, either until + no adapter match was found or until 2 rounds have been performed. (default 1) + - **Output** - Reads file (`.fastq.gz`); used in [**remove_polya_cutadapt**](#remove_polya_cutadapt) -- **Non-configurable & non-default** - - `-j 8`: use 8 threads - - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) - - `-n 2`: search for all the given adapter sequences repeatedly, either until - no adapter match was found or until 2 rounds have been performed. (default 1) + + #### `remove_polya_cutadapt` @@ -564,17 +576,18 @@ Remove poly(A) tails from reads with - Reads file (`.fastq.gz`); from [**remove_adapters_cutadapt**](#remove_adapters_cutadapt) - **Parameters** - - Poly(A) stretches to be removed; specify in sample table columns `fq1_polya` - and `fq2_polya` + - **samples.tsv** + - Poly(A) stretches to be removed; specify in sample table columns `fq1_polya` and `fq2_polya` + - **rule_config.yaml** + - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) + - `-O 1`: minimal overlap of 1 (default: 3) - **Output** - Reads file (`.fastq.gz`); used in [**genome_quantification_kallisto**](#genome_quantification_kallisto), [**map_genome_star**](#map_genome_star) and [**quantification_salmon**](#quantification_salmon) -- **Non-configurable & non-default** - - `-j 8`: use 8 threads - - `-m 10`: Discard processed reads that are shorter than 10 - - `-O 1`: minimal overlap of 1 (default: 3) + + #### `map_genome_star` @@ -586,15 +599,13 @@ Align short reads to reference genome and/or transcriptome with [**remove_polya_cutadapt**](#remove_polya_cutadapt) - Index; from [**create_index_star**](#create_index_star) - **Parameters** - - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; - if exceeded, read is considered unmapped; specify in sample table column - `multimappers` - - `--alignEndsType`: one of `Local` (standard local alignment with - soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do - not soft-clip); specify in sample table column `soft_clip` - - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass - mapping, with all 1st-pass junctions inserted into the genome indices on - the fly); specify in sample table column `pass_mode` + - **samples.tsv** + - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers` + - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip` + - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode` + - **rule_config.yaml** + - `--outFilterMultimapScoreRange=0`: the score range below the maximum score for multimapping alignments (default 1) + - `--outFilterType=BySJout`: reduces the number of ”spurious” junctions - **Output** - Aligned reads file (`.bam`); used in [**calculate_TIN_scores**](#calculate_TIN_scores), @@ -602,12 +613,9 @@ Align short reads to reference genome and/or transcriptome with and [**star_rpm**](#star_rpm) - STAR log file - **Non-configurable & non-default** - - `--outFilterMultimapScoreRange=0`: the score range below the maximum score - for multimapping alignments (default 1) - `--outSAMattributes=All`: NH HI AS nM NM MD jM jI MC ch - `--outStd=BAM_SortedByCoordinate`: which output will be directed to `STDOUT` (default 'Log') - `--outSAMtype=BAM SortedByCoordinate`: type of SAM/BAM output (default SAM) - - `--outFilterType=BySJout`: reduces the number of ”spurious” junctions - `--outSAMattrRGline`: ID:rnaseq_pipeline SM: *sampleID* #### `quantification_salmon` @@ -621,12 +629,15 @@ Estimate transcript- and gene-level expression with - Filtered annotation file (`.gtf`) - Index; from [**create_index_salmon**](#create_index_salmon) - **Parameters** - - `libType`: see [Salmon manual][docs-salmon] for allowed values; specify in - sample table column `libtype` - - `--fldMean`: mean of distribution of fragment lengths; specify in sample - table column `mean` **(single-end only)** - - `--fldSD`: standard deviation of distribution of fragment lengths; specify - in sample table column `sd` **(single-end only)** + - **samples.tsv** + - `libType`: see [Salmon manual][docs-salmon] for allowed values; specify in sample table column `libtype` + - `--fldMean`: mean of distribution of fragment lengths; specify in sample table column `mean` **(single-end only)** + - `--fldSD`: standard deviation of distribution of fragment lengths; specify in sample table column `sd` **(single-end only)** + - **rule_config.yaml** + - `--seqBias`: [correct for sequence specific + biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias) + - `--validateMappings`: enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings). + - `--writeUnmappedNames`: write out the names of reads (or mates in paired-end reads) that do not map to the transcriptome. For paired-end this gives flags that indicate how a read failed to map **(paired-end only)** - **Output** - Gene expression table (`quant.sf`); used in [**salmon_quantmerge_genes**](#salmon_quantmerge_genes) @@ -634,14 +645,7 @@ Estimate transcript- and gene-level expression with [**salmon_quantmerge_transcripts**](#salmon_quantmerge_transcripts) - `meta_info.json` - `flenDist.txt` -- **Non-configurable & non-default** - - `--seqBias`: [correct for sequence specific - biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias) - - `--validateMappings`: enables selective alignment of the sequencing reads - when mapping them to the transcriptome; this can improve both the - sensitivity and specificity of mapping and, as a result, can [improve - quantification - accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings). + #### `genome_quantification_kallisto` @@ -653,16 +657,16 @@ Generate pseudoalignments of reads to transcripts with [**remove_polya_cutadapt**](#remove_polya_cutadapt) - Index; from [**create_index_kallisto**](#create_index_kallisto) - **Parameters** - - `directionality`; specify in sample table column `kallisto_directionality` - - `-l`: mean of distribution of fragment lengths; specify in sample table - column `mean` **(single-end only)** - - `-s`: standard deviation of distribution of fragment lengths; specify in - sample table column `sd` **(single-end only)** + - **samples.tsv** + - `directionality`; specify in sample table column `kallisto_directionality` + - `-l`: mean of distribution of fragment lengths; specify in sample table column `mean` **(single-end only)** + - `-s`: standard deviation of distribution of fragment lengths; specify in sample table column `sd` **(single-end only)** - **Output** - Pseudoalignments file (`.sam`) and - abundance (`.h5`) used in [**kallisto_merge_genes**](#kallisto_merge_genes) - **Non-configurable & non-default** + - `--single`: Quantify single-end reads **(single-end only)** - `--pseudobam`: Save pseudoalignments to transcriptome to BAM file [code-alfa]: <https://github.com/biocompibens/ALFA> diff --git a/tests/input_files/config.mutliple_lanes.yml b/tests/input_files/config.mutliple_lanes.yml index 8cfbb38..3d04a01 100644 --- a/tests/input_files/config.mutliple_lanes.yml +++ b/tests/input_files/config.mutliple_lanes.yml @@ -1,5 +1,6 @@ --- samples: "../input_files/samples.multiple_lanes.tsv" + rule_config: "../input_files/rule_config.yaml" output_dir: "results" log_dir: "logs" kallisto_indexes: "results/kallisto_indexes" @@ -9,4 +10,4 @@ report_description: "No description provided by user" report_logo: "../../images/logo.128px.png" report_url: "https://zavolan.biozentrum.unibas.ch/" -... \ No newline at end of file +... diff --git a/tests/input_files/config.yaml b/tests/input_files/config.yaml index a7e59f6..853a8a7 100644 --- a/tests/input_files/config.yaml +++ b/tests/input_files/config.yaml @@ -1,5 +1,6 @@ --- samples: "../input_files/samples.tsv" + rule_config: "../input_files/rule_config.yaml" output_dir: "results" log_dir: "logs" kallisto_indexes: "results/kallisto_indexes" @@ -9,4 +10,4 @@ report_description: "No description provided by user" report_logo: "../../images/logo.128px.png" report_url: "https://zavolan.biozentrum.unibas.ch/" -... \ No newline at end of file +... diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml new file mode 100644 index 0000000..2cdec07 --- /dev/null +++ b/tests/input_files/rule_config.yaml @@ -0,0 +1,148 @@ +############################################################################# +# +# __________________________________________________________________ +# | WARNING: ONLY CHANGE THIS FILE IF YOU KNOW WHAT YOU'RE DOING!!! | +# | ZARP DOES NOT GUARANTEE SENSIBLE RESULTS IF PARAMETERS | +# | ARE CHANGED HERE. | +# |__________________________________________________________________| +# +# RULE CONFIGURATION +# +# For RUN SPECIFIC PARAMETERS (sample specific parameters have to be +# defined in the samples table!) +# +# Specify path to this file in main config.yaml under key 'rule_config' +# +# One top-level keyword per RULE (not per tool, as one tool might be used +# with different settings by more than one rule) +# +# Parameters have to be specified exactly like they have to appear on the +# command line call (e.g. -n or --name) +# +# All values need to be QUOTED STRINGS; to specify flags (i.e., parameters +# without values), specify an empty string as value. +# +# Note: number of threads has to be set in the respective Snakefile +# +############################################################################# + + +# Specify parameters for individual rules: + +################################################ +# MAIN SNAKEFILE / SEQUENCING-MODE INDEPENDENT # +################################################ + +#start: No parameters to change here + +fastqc: + +create_index_star: + +extract_transcriptome: + +#concatenate_transcriptome_and_genome: No parameters to change here + +create_index_salmon: + +create_index_kallisto: + +extract_transcripts_as_bed12: + +index_genomic_alignment_samtools: + +calculate_TIN_scores: + # Minimum number of reads mapped to a transcript (default 10, ZARP recommends 0) + -c: '0' + +salmon_quantmerge_genes: + +salmon_quantmerge_transcripts: + +kallisto_merge_genes: + --verbose: '' + +kallisto_merge_transcripts: + --verbose: '' + +pca_salmon: + --verbose: '' + +pca_kallisto: + --verbose: '' + +star_rpm: + +# rename_star_rpm_for_alfa: No parameters to change here + +generate_alfa_index: + +alfa_qc: + +prepare_multiqc_config: + +multiqc_report: + +sort_bed_4_big: + +prepare_bigWig: + +########################################## +# SEQUENCING-MODE SPECIFIC # +# single-end: rule name without prefix, # +# paired-end: rule name with prefix 'pe' # +########################################## + +remove_adapters_cutadapt: + # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2) + -n: '2' + # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + -m: '10' + +pe_remove_adapters_cutadapt: + # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2) + -n: '2' + # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + -m: '10' + +remove_polya_cutadapt: + # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + -m: '10' + # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As) + -O: '1' + +pe_remove_polya_cutadapt: + # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + -m: '10' + # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As) + -O: '1' + +map_genome_star: + # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0) + --outFilterMultimapScoreRange: '0' + # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions ) + --outFilterType: 'BySJout' + +pe_map_genome_star: + # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0) + --outFilterMultimapScoreRange: '0' + # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions ) + --outFilterType: 'BySJout' + +quantification_salmon: + # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias + --seqBias: '' + # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings) + --validateMappings: '' + +pe_quantification_salmon: + # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias + --seqBias: '' + # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings) + --validateMappings: '' + # write out the names of reads (or mates in paired-end reads) that do not map to the transcriptome. For paired-end this gives flags that indicate how a read failed to map + --writeUnmappedNames: '' + +genome_quantification_kallisto: + +pe_genome_quantification_kallisto: diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 41c87eb..3f7834c 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -1,3 +1,4 @@ +current_rule = 'pe_remove_adapters_cutadapt' rule pe_remove_adapters_cutadapt: ''' Remove adapters @@ -37,7 +38,19 @@ rule pe_remove_adapters_cutadapt: adapter_3_mate2 = lambda wildcards: get_sample('fq2_3p', search_id='index', search_value=wildcards.sample), adapter_5_mate2 = lambda wildcards: - get_sample('fq2_5p', search_id='index', search_value=wildcards.sample) + get_sample('fq2_5p', search_id='index', search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-a', + '-A', + '-g', + '-G', + '-o', + '-p', + ) + ) singularity: "docker://zavolab/cutadapt:1.16-slim" @@ -49,22 +62,21 @@ rule pe_remove_adapters_cutadapt: config["log_dir"], "samples", "{sample}", - "remove_adapters_cutadapt.pe.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "remove_adapters_cutadapt.pe.stdout.log") + current_rule + ".stdout.log") shell: "(cutadapt \ -j {threads} \ - -m 10 \ - -n 2 \ -a {params.adapter_3_mate1} \ -g {params.adapter_5_mate1} \ -A {params.adapter_3_mate2} \ -G {params.adapter_5_mate2} \ + {params.additional_params} \ -o {output.reads1} \ -p {output.reads2} \ {input.reads1} \ @@ -72,6 +84,7 @@ rule pe_remove_adapters_cutadapt: 1> {log.stdout} 2>{log.stderr}" +current_rule = 'pe_remove_polya_cutadapt' rule pe_remove_polya_cutadapt: ''' Remove polyA tails @@ -120,7 +133,19 @@ rule pe_remove_polya_cutadapt: get_sample( 'fq2_polya_5p', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-a', + '-A', + '-g', + '-G', + '-o', + '-p', + ) + ) singularity: "docker://zavolab/cutadapt:1.16-slim" @@ -132,22 +157,21 @@ rule pe_remove_polya_cutadapt: config["log_dir"], "samples", "{sample}", - "remove_polya_cutadapt.pe.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "remove_polya_cutadapt.pe.stdout.log") + current_rule + ".stdout.log") shell: "(cutadapt \ -j {threads} \ - -m 10 \ - -O 1 \ -a {params.polya_3_mate1} \ -g {params.polya_5_mate1} \ -A {params.polya_3_mate2} \ -G {params.polya_5_mate2} \ + {params.additional_params} \ -o {output.reads1} \ -p {output.reads2} \ {input.reads1} \ @@ -155,6 +179,7 @@ rule pe_remove_polya_cutadapt: 1> {log.stdout} 2>{log.stderr}" +current_rule = 'pe_map_genome_star' rule pe_map_genome_star: ''' Map to genome using STAR @@ -171,8 +196,8 @@ rule pe_map_genome_star: 'index_size', search_id='index', search_value=wildcards.sample), - "STAR_index", - "chrNameLength.txt"), + "STAR_index", + "chrNameLength.txt"), reads1 = os.path.join( config["output_dir"], "samples", @@ -213,7 +238,7 @@ rule pe_map_genome_star: 'index_size', search_id='index', search_value=wildcards.sample), - "STAR_index")), + "STAR_index")), outFileNamePrefix = os.path.join( config["output_dir"], "samples", @@ -235,6 +260,23 @@ rule pe_map_genome_star: 'pass_mode', search_id='index', search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--twopassMode', + '--genomeDir', + '--readFilesIn', + '--readFilesCommand', + '--outFilterMultimapNmax', + '--outFileNamePrefix', + '--outSAMattributes', + '--outStd', + '--outSAMtype', + '--outSAMattrRGline', + '--alignEndsType', + ) + ) singularity: "docker://zavolab/star:2.7.3a-slim" @@ -246,7 +288,7 @@ rule pe_map_genome_star: config["log_dir"], "samples", "{sample}", - "map_genome_star.pe.stderr.log") + current_rule + ".stderr.log") shell: "(STAR \ @@ -256,17 +298,18 @@ rule pe_map_genome_star: --readFilesIn {input.reads1} {input.reads2} \ --readFilesCommand zcat \ --outFilterMultimapNmax {params.multimappers} \ - --outFilterMultimapScoreRange 0 \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ - --outFilterType BySJout \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} > {output.bam};) \ + --alignEndsType {params.soft_clip} \ + {params.additional_params} \ + > {output.bam};) \ 2> {log.stderr}" +current_rule = 'pe_quantification_salmon' rule pe_quantification_salmon: ''' Quantification at transcript and gene level using Salmon @@ -298,7 +341,7 @@ rule pe_quantification_salmon: 'kmer', search_id='index', search_value=wildcards.sample), - "salmon.idx") + "salmon.idx") output: gn_estimates = os.path.join( @@ -340,19 +383,33 @@ rule pe_quantification_salmon: get_sample( 'libtype', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--libType', + '--fldMean', + '--fldSD', + '--index', + '--geneMap', + '-1', + '-2', + '-o', + ) + ) log: stderr = os.path.join( config["log_dir"], "samples", "{sample}", - "genome_quantification_salmon.pe.stderr.log"), + current_rule + ".stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "genome_quantification_salmon.pe.stdout.log"), + current_rule + ".stdout.log"), threads: 6 @@ -362,10 +419,8 @@ rule pe_quantification_salmon: shell: "(salmon quant \ --libType {params.libType} \ - --seqBias \ - --validateMappings \ --threads {threads} \ - --writeUnmappedNames \ + {params.additional_params} \ --index {input.index} \ --geneMap {input.gtf} \ -1 {input.reads1} \ @@ -374,6 +429,7 @@ rule pe_quantification_salmon: ) 1> {log.stdout} 2> {log.stderr}" +current_rule = 'pe_genome_quantification_kallisto' rule pe_genome_quantification_kallisto: ''' Quantification at transcript and gene level using Kallisto @@ -396,7 +452,7 @@ rule pe_genome_quantification_kallisto: 'organism', search_id='index', search_value=wildcards.sample), - "kallisto.idx") + "kallisto.idx") output: pseudoalignment = os.path.join( @@ -424,7 +480,22 @@ rule pe_genome_quantification_kallisto: get_sample( 'kallisto_directionality', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--single', + '-i', + '-o', + '-l', + '-s', + '--pseudobam', + '--fr-stranded', + '--rf-stranded', + ) + ) + singularity: "docker://zavolab/kallisto:0.46.1-slim" @@ -436,15 +507,16 @@ rule pe_genome_quantification_kallisto: config["log_dir"], "samples", "{sample}", - "genome_quantification_kallisto.pe.stderr.log") + current_rule + ".stderr.log") shell: "(kallisto quant \ -i {input.index} \ -o {params.output_dir} \ - --pseudobam \ -t {threads} \ {params.directionality}-stranded \ + {params.additional_params} \ + --pseudobam \ {input.reads1} {input.reads2} > {output.pseudoalignment}) \ 2> {log.stderr}" diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index 9563e06..337ae75 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -1,3 +1,4 @@ +current_rule = 'remove_adapters_cutadapt' rule remove_adapters_cutadapt: ''' Remove adapters @@ -27,7 +28,19 @@ rule remove_adapters_cutadapt: get_sample( 'fq1_5p', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-a', + '-A', + '-g', + '-G', + '-o', + '-p', + ) + ) singularity: "docker://zavolab/cutadapt:1.16-slim" @@ -39,24 +52,24 @@ rule remove_adapters_cutadapt: config["log_dir"], "samples", "{sample}", - "remove_adapters_cutadapt.se.stderr.log"), + current_rule + ".se.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "remove_adapters_cutadapt.se.stdout.log") + current_rule + ".se.stdout.log") shell: "(cutadapt \ -j {threads} \ - -m 10 \ - -n 2 \ -a {params.adapters_3} \ -g {params.adapters_5} \ + {params.additional_params} \ -o {output.reads} \ {input.reads}) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'remove_polya_cutadapt' rule remove_polya_cutadapt: ''' Remove ployA tails @@ -85,7 +98,19 @@ rule remove_polya_cutadapt: get_sample( 'fq1_polya_5p', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '-a', + '-A', + '-g', + '-G', + '-o', + '-p', + ) + ) singularity: "docker://zavolab/cutadapt:1.16-slim" @@ -97,25 +122,25 @@ rule remove_polya_cutadapt: config["log_dir"], "samples", "{sample}", - "remove_polya_cutadapt.se.stderr.log"), + current_rule + ".se.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "remove_polya_cutadapt.se.stdout.log") + current_rule + ".se.stdout.log") shell: "(cutadapt \ -j {threads} \ - -O 1 \ - -m 10 \ -a {params.polya_3} \ -g {params.polya_5} \ + {params.additional_params} \ -o {output.reads} \ {input.reads};) \ 1> {log.stdout} 2> {log.stderr}" +current_rule = 'map_genome_star' rule map_genome_star: ''' Map to genome using STAR @@ -178,7 +203,24 @@ rule map_genome_star: get_sample( 'pass_mode', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--twopassMode', + '--genomeDir', + '--readFilesIn', + '--readFilesCommand', + '--outFilterMultimapNmax', + '--outFileNamePrefix', + '--outSAMattributes', + '--outStd', + '--outSAMtype', + '--outSAMattrRGline', + '--alignEndsType', + ) + ) singularity: "docker://zavolab/star:2.7.3a-slim" @@ -190,7 +232,7 @@ rule map_genome_star: config["log_dir"], "samples", "{sample}", - "map_genome_star.se.stderr.log") + current_rule + ".se.stderr.log") shell: "(STAR \ @@ -200,17 +242,18 @@ rule map_genome_star: --readFilesIn {input.reads} \ --readFilesCommand zcat \ --outFilterMultimapNmax {params.multimappers} \ - --outFilterMultimapScoreRange 0 \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ - --outFilterType BySJout \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} > {output.bam};) \ + --alignEndsType {params.soft_clip} \ + {params.additional_params} \ + > {output.bam};) \ 2> {log.stderr}" +current_rule = 'quantification_salmon' rule quantification_salmon: ''' Quantification at transcript and gene level using Salmon @@ -289,18 +332,31 @@ rule quantification_salmon: get_sample( 'sd', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--libType', + '--fldMean', + '--fldSD', + '--index', + '--geneMap', + '--unmatedReads', + '-o', + ) + ) log: stderr = os.path.join( config["log_dir"], "samples", "{sample}", - "quantification_salmon.se.stderr.log"), + current_rule + ".se.stderr.log"), stdout = os.path.join( config["log_dir"], "samples", "{sample}", - "quantification_salmon.se.stdout.log") + current_rule + ".se.stdout.log") threads: 12 @@ -310,11 +366,10 @@ rule quantification_salmon: shell: "(salmon quant \ --libType {params.libType} \ - --seqBias \ - --validateMappings \ --threads {threads} \ --fldMean {params.fraglen} \ --fldSD {params.fragsd} \ + {params.additional_params} \ --index {input.index} \ --geneMap {input.gtf} \ --unmatedReads {input.reads} \ @@ -322,6 +377,7 @@ rule quantification_salmon: 1> {log.stdout} 2> {log.stderr}" +current_rule = 'genome_quantification_kallisto' rule genome_quantification_kallisto: ''' Quantification at transcript and gene level using Kallisto @@ -377,7 +433,21 @@ rule genome_quantification_kallisto: get_sample( 'kallisto_directionality', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample), + additional_params = parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + '--single', + '-i', + '-o', + '-l', + '-s', + '--pseudobam', + '--fr-stranded', + '--rf-stranded', + ) + ) threads: 8 @@ -386,21 +456,22 @@ rule genome_quantification_kallisto: config["log_dir"], "samples", "{sample}", - "genome_quantification_kallisto.se.stderr.log") + current_rule + ".se.stderr.log") singularity: "docker://zavolab/kallisto:0.46.1-slim" shell: "(kallisto quant \ + --single \ -i {input.index} \ -o {params.output_dir} \ - --single \ -l {params.fraglen} \ -s {params.fragsd} \ - --pseudobam \ -t {threads} \ {params.directionality}-stranded \ + {params.additional_params} \ + --pseudobam \ {input.reads} > {output.pseudoalignment};) \ 2> {log.stderr}" diff --git a/workflow/scripts/zarp_multiqc_config.py b/workflow/scripts/zarp_multiqc_config.py index 0b72f85..40a469e 100644 --- a/workflow/scripts/zarp_multiqc_config.py +++ b/workflow/scripts/zarp_multiqc_config.py @@ -98,12 +98,12 @@ module_order: - cutadapt: name: "Cutadapt: adapter removal" path_filters: - - "*/*/remove_adapters_cutadapt*.stdout.log" + - "*/*/*remove_adapters_cutadapt*.stdout.log" - cutadapt: name: "Cutadapt: polyA tails removal" path_filters: - - "*/*/remove_polya_cutadapt*.stdout.log" + - "*/*/*remove_polya_cutadapt*.stdout.log" - star: path_filters: @@ -123,7 +123,7 @@ module_order: - kallisto: path_filters: - - "*/*/genome_quantification_kallisto*.stderr.log" + - "*/*/*genome_quantification_kallisto*.stderr.log" fn_clean_exts: - '.fq1' -- GitLab