From 6034ec2f6188d5af74fcbb780b4b6d449e47f05d Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch> Date: Mon, 23 Aug 2021 07:34:24 +0000 Subject: [PATCH] Remove sample specific options and use them as rule specific. --- pipeline_documentation.md | 10 +++------- tests/input_files/rule_config.yaml | 12 +++++++++++ tests/input_files/samples.multiple_lanes.tsv | 10 +++++----- tests/input_files/samples.tsv | 6 +++--- tests/input_files/samples_alfa.tsv | 10 +++++----- workflow/rules/paired_end.snakefile.smk | 21 -------------------- workflow/rules/single_end.snakefile.smk | 21 -------------------- 7 files changed, 28 insertions(+), 62 deletions(-) diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 070b366..58b2e8a 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -116,9 +116,6 @@ gtf_filtered | Required for [Salmon](#third-party-software-used). Path to filter genome | Required for [STAR](#third-party-software-used). Path to genome `.fa` file. File needs to be in subdirectory corresponding to `organism` field. Example: `/path/to/GRCh38/genome.fa` | `str` sd | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated standard deviation of fragment length distribution. Can be assessed from, e.g., BioAnalyzer profiles. Value ignored for paired-end libraries. | `int` mean | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated mean of fragment length distribution. Can be assessed, e.g., from BioAnalyzer profiles. Value ignored for paired-end libraries. | `int` -multimappers | Required for [STAR](#third-party-software-used). Maximum number of multiple alignments allowed for a read; if exceeded, the read is considered unmapped. | `int` -soft_clip | Required for [STAR](#third-party-software-used). One of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip). | `str` -pass_mode | Required for [STAR](#third-party-software-used). One of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly). | `str` libtype | Required for [Salmon](#third-party-software-used), and, after internal conversion, for [kallisto](#third-party-software-used) and [ALFA](#third-party-software-used) . See [Salmon manual][docs-salmon] for allowed values. **WARNING**: do *NOT* use `A` to automatically infer the salmon library type, this will cause kallisto and ALFA to fail. | `str` fq1_polya3p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 3' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str` fq1_polya5p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 5' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str` @@ -612,13 +609,12 @@ Align short reads to reference genome and/or transcriptome with [**remove_polya_cutadapt**](#remove_polya_cutadapt) - Index; from [**create_index_star**](#create_index_star) - **Parameters** - - **samples.tsv** - - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers` - - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip` - - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode` - **rule_config.yaml** - `--outFilterMultimapScoreRange=0`: the score range below the maximum score for multimapping alignments (default 1) - `--outFilterType=BySJout`: reduces the number of â€spurious†junctions + - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip` + - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode` + - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers` - **Output** - Aligned reads file (`.bam`); used in [**calculate_TIN_scores**](#calculate_TIN_scores), diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml index a42874f..d45c335 100644 --- a/tests/input_files/rule_config.yaml +++ b/tests/input_files/rule_config.yaml @@ -160,6 +160,12 @@ map_genome_star: # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this # reduces the number of spurious junctions ) --outFilterType: 'BySJout' + # type of read ends alignment: force end-to-end read alignment, do not soft-clip + --alignEndsType: 'EndToEnd' + # extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass + --twopassMode: Basic + # alignments (all of them) will be output only if the read maps to no more loci than 10 + --outFilterMultimapNmax: '10' pe_map_genome_star: # The score range below the maximum score for multimapping alignments @@ -169,6 +175,12 @@ pe_map_genome_star: # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this # reduces the number of spurious junctions ) --outFilterType: 'BySJout' + # type of read ends alignment: force end-to-end read alignment, do not soft-clip + --alignEndsType: 'EndToEnd' + # extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass + --twopassMode: Basic + # alignments (all of them) will be output only if the read maps to no more loci than 10 + --outFilterMultimapNmax: '10' quantification_salmon: # Correct for sequence specific biases; cf. diff --git a/tests/input_files/samples.multiple_lanes.tsv b/tests/input_files/samples.multiple_lanes.tsv index 6b32ab8..26ab518 100644 --- a/tests/input_files/samples.multiple_lanes.tsv +++ b/tests/input_files/samples.multiple_lanes.tsv @@ -1,5 +1,5 @@ -sample seqmode fq1 fq2 index_size kmer fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p fq2_polya_3p fq2_polya_5p -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane1/synthetic_split_lane1.mate_1.fastq.gz ../input_files/pe_lane1/synthetic_split_lane1.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane2/synthetic_split_lane2.mate_1.fastq.gz ../input_files/pe_lane2/synthetic_split_lane2.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane1/synthetic_split_lane1.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane2/synthetic_split_lane2.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX +sample seqmode fq1 fq2 index_size kmer fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean libtype fq1_polya_3p fq1_polya_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane1/synthetic_split_lane1.mate_1.fastq.gz ../input_files/pe_lane1/synthetic_split_lane1.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane2/synthetic_split_lane2.mate_1.fastq.gz ../input_files/pe_lane2/synthetic_split_lane2.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane1/synthetic_split_lane1.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane2/synthetic_split_lane2.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX diff --git a/tests/input_files/samples.tsv b/tests/input_files/samples.tsv index 58ac18f..25ce3ab 100644 --- a/tests/input_files/samples.tsv +++ b/tests/input_files/samples.tsv @@ -1,3 +1,3 @@ -sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX +sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean libtype fq1_polya_3p fq1_polya_5p fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 ISF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 SF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX \ No newline at end of file diff --git a/tests/input_files/samples_alfa.tsv b/tests/input_files/samples_alfa.tsv index 9ef1c3c..bf40271 100644 --- a/tests/input_files/samples_alfa.tsv +++ b/tests/input_files/samples_alfa.tsv @@ -1,5 +1,5 @@ -sample seqmode fq1 index_size kmer fq2 fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya fq2_polya -paired_end_R1_on_plus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX GATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT -paired_end_R1_on_plus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT -paired_end_R1_on_minus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT -paired_end_R1_on_minus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +sample seqmode fq1 index_size kmer fq2 fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean libtype fq1_polya fq2_polya +paired_end_R1_on_plus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX GATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_plus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_minus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_minus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 081fa53..5cf75ef 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -256,36 +256,18 @@ rule pe_map_genome_star: "{sample}", "map_genome", "{sample}.pe."), - multimappers = lambda wildcards: - get_sample( - 'multimappers', - search_id='index', - search_value=wildcards.sample), - soft_clip = lambda wildcards: - get_sample( - 'soft_clip', - search_id='index', - search_value=wildcards.sample), - pass_mode = lambda wildcards: - get_sample( - 'pass_mode', - search_id='index', - search_value=wildcards.sample), additional_params = parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--twopassMode', '--genomeDir', '--readFilesIn', '--readFilesCommand', - '--outFilterMultimapNmax', '--outFileNamePrefix', '--outSAMattributes', '--outStd', '--outSAMtype', '--outSAMattrRGline', - '--alignEndsType', ) ) @@ -306,18 +288,15 @@ rule pe_map_genome_star: shell: "(STAR \ - --twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads1} {input.reads2} \ --readFilesCommand zcat \ - --outFilterMultimapNmax {params.multimappers} \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} \ {params.additional_params} \ > {output.bam};) \ 2> {log.stderr}" diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index c64d71d..4e9269b 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -200,36 +200,18 @@ rule map_genome_star: "{sample}", "map_genome", "{sample}.se."), - multimappers = lambda wildcards: - get_sample( - 'multimappers', - search_id='index', - search_value=wildcards.sample), - soft_clip = lambda wildcards: - get_sample( - 'soft_clip', - search_id='index', - search_value=wildcards.sample), - pass_mode = lambda wildcards: - get_sample( - 'pass_mode', - search_id='index', - search_value=wildcards.sample), additional_params = parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--twopassMode', '--genomeDir', '--readFilesIn', '--readFilesCommand', - '--outFilterMultimapNmax', '--outFileNamePrefix', '--outSAMattributes', '--outStd', '--outSAMtype', '--outSAMattrRGline', - '--alignEndsType', ) ) @@ -250,18 +232,15 @@ rule map_genome_star: shell: "(STAR \ - --twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads} \ --readFilesCommand zcat \ - --outFilterMultimapNmax {params.multimappers} \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} \ {params.additional_params} \ > {output.bam};) \ 2> {log.stderr}" -- GitLab