From 2a9347408c8b8bbf83017fddca06fb641bfc710d Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch> Date: Wed, 14 Jul 2021 16:29:55 +0200 Subject: [PATCH] Remove sample specific options and use them as rule specific. Partially solves issue #158. The features softclip, passmode, multimappers which were related to STAR are relocated to the rule config and given a default value. The documentation is updated and these options are removed from the immutable lists in the corresponding rules. --- pipeline_documentation.md | 10 +++------- tests/input_files/rule_config.yaml | 12 ++++++++++++ tests/input_files/samples.tsv | 6 +++--- workflow/rules/paired_end.snakefile.smk | 21 --------------------- workflow/rules/single_end.snakefile.smk | 21 --------------------- 5 files changed, 18 insertions(+), 52 deletions(-) diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 5f96d48..88e0689 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -116,9 +116,6 @@ gtf_filtered | Required for [Salmon](#third-party-software-used). Path to filter genome | Required for [STAR](#third-party-software-used). Path to genome `.fa` file. File needs to be in subdirectory corresponding to `organism` field. Example: `/path/to/GRCh38/genome.fa` | `str` sd | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated standard deviation of fragment length distribution. Can be assessed from, e.g., BioAnalyzer profiles. Value ignored for paired-end libraries. | `int` mean | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated mean of fragment length distribution. Can be assessed, e.g., from BioAnalyzer profiles. Value ignored for paired-end libraries. | `int` -multimappers | Required for [STAR](#third-party-software-used). Maximum number of multiple alignments allowed for a read; if exceeded, the read is considered unmapped. | `int` -soft_clip | Required for [STAR](#third-party-software-used). One of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip). | `str` -pass_mode | Required for [STAR](#third-party-software-used). One of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly). | `str` libtype | Required for [Salmon](#third-party-software-used). See [Salmon manual][docs-salmon] for allowed values. If in doubt, enter `A` to automatically infer the library type. | `str` kallisto_directionality | Required for [kallisto](#third-party-software-used) and [ALFA](#third-party-software-used). One of `--fr-stranded` (strand-specific reads, first read forward) and `--rf-stranded` (strand-specific reads, first read reverse) | `str` fq1_polya3p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 3' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str` @@ -599,13 +596,12 @@ Align short reads to reference genome and/or transcriptome with [**remove_polya_cutadapt**](#remove_polya_cutadapt) - Index; from [**create_index_star**](#create_index_star) - **Parameters** - - **samples.tsv** - - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers` - - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip` - - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode` - **rule_config.yaml** - `--outFilterMultimapScoreRange=0`: the score range below the maximum score for multimapping alignments (default 1) - `--outFilterType=BySJout`: reduces the number of â€spurious†junctions + - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers` + - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip` + - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode` - **Output** - Aligned reads file (`.bam`); used in [**calculate_TIN_scores**](#calculate_TIN_scores), diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml index 2cdec07..5999ddb 100644 --- a/tests/input_files/rule_config.yaml +++ b/tests/input_files/rule_config.yaml @@ -122,12 +122,24 @@ map_genome_star: --outFilterMultimapScoreRange: '0' # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of â€spurious†junctions ) --outFilterType: 'BySJout' + # type of read ends alignment: force end-to-end read alignment, do not soft-clip + --alignEndsType: 'EndToEnd' + # extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass + --twopassMode: Basic + # alignments (all of them) will be output only if the read maps to no more loci than 10 + --outFilterMultimapNmax: '10' pe_map_genome_star: # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0) --outFilterMultimapScoreRange: '0' # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of â€spurious†junctions ) --outFilterType: 'BySJout' + # type of read ends alignment: force end-to-end read alignment, do not soft-clip + --alignEndsType: 'EndToEnd' + # extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass + --twopassMode: Basic + # alignments (all of them) will be output only if the read maps to no more loci than 10 + --outFilterMultimapNmax: '10' quantification_salmon: # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias diff --git a/tests/input_files/samples.tsv b/tests/input_files/samples.tsv index 4b26302..d74233f 100644 --- a/tests/input_files/samples.tsv +++ b/tests/input_files/samples.tsv @@ -1,3 +1,3 @@ -sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p kallisto_directionality alfa_directionality alfa_plus alfa_minus fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX +sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean libtype fq1_polya_3p fq1_polya_5p kallisto_directionality alfa_directionality alfa_plus alfa_minus fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 37c5546..1d03822 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -251,36 +251,18 @@ rule pe_map_genome_star: "{sample}", "map_genome", "{sample}.pe."), - multimappers = lambda wildcards: - get_sample( - 'multimappers', - search_id='index', - search_value=wildcards.sample), - soft_clip = lambda wildcards: - get_sample( - 'soft_clip', - search_id='index', - search_value=wildcards.sample), - pass_mode = lambda wildcards: - get_sample( - 'pass_mode', - search_id='index', - search_value=wildcards.sample), additional_params = parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--twopassMode', '--genomeDir', '--readFilesIn', '--readFilesCommand', - '--outFilterMultimapNmax', '--outFileNamePrefix', '--outSAMattributes', '--outStd', '--outSAMtype', '--outSAMattrRGline', - '--alignEndsType', ) ) @@ -301,18 +283,15 @@ rule pe_map_genome_star: shell: "(STAR \ - --twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads1} {input.reads2} \ --readFilesCommand zcat \ - --outFilterMultimapNmax {params.multimappers} \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} \ {params.additional_params} \ > {output.bam};) \ 2> {log.stderr}" diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index e270aba..6898c13 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -195,36 +195,18 @@ rule map_genome_star: "{sample}", "map_genome", "{sample}.se."), - multimappers = lambda wildcards: - get_sample( - 'multimappers', - search_id='index', - search_value=wildcards.sample), - soft_clip = lambda wildcards: - get_sample( - 'soft_clip', - search_id='index', - search_value=wildcards.sample), - pass_mode = lambda wildcards: - get_sample( - 'pass_mode', - search_id='index', - search_value=wildcards.sample), additional_params = parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--twopassMode', '--genomeDir', '--readFilesIn', '--readFilesCommand', - '--outFilterMultimapNmax', '--outFileNamePrefix', '--outSAMattributes', '--outStd', '--outSAMtype', '--outSAMattrRGline', - '--alignEndsType', ) ) @@ -245,18 +227,15 @@ rule map_genome_star: shell: "(STAR \ - --twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads} \ --readFilesCommand zcat \ - --outFilterMultimapNmax {params.multimappers} \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} \ {params.additional_params} \ > {output.bam};) \ 2> {log.stderr}" -- GitLab