diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 5f96d48eba52621b0962c5e8ce9c88392894086b..bbef50f59eeb32d5da838b9d80b0bc91c6e18d49 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -557,7 +557,14 @@ Remove adapter sequences from reads with - Adapters to be removed; specify in sample table columns `fq1_3p`, `fq1_5p`, `fq2_3p`, `fq2_5p` - **rule_config.yaml:** - - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) + - `-m 10`: Discard processed reads that are shorter than 10 nt. If + specified in `rule_config.yaml`, it will override ZARP's default value of + `m=1` for this parameter. Note that this is different from `cutadapt`'s + default behavior (`m=0`), which leads to empty reads being retained, + causing problems in downstream applications in ZARP. We thus strongly + recommend to **not** set the value of `m` to `0`! Refer to `cutadapt`'s + [documentation][docs-cutadapt-m] for more information on the `m` + parameter. - `-n 2`: search for all the given adapter sequences repeatedly, either until no adapter match was found or until 2 rounds have been performed. (default 1) @@ -579,8 +586,15 @@ Remove poly(A) tails from reads with - **samples.tsv** - Poly(A) stretches to be removed; specify in sample table columns `fq1_polya` and `fq2_polya` - **rule_config.yaml** - - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) - - `-O 1`: minimal overlap of 1 (default: 3) + - `-m 10`: Discard processed reads that are shorter than 10 nt. If + specified in `rule_config.yaml`, it will override ZARP's default value of + `m=1` for this parameter. Note that this is different from `cutadapt`'s + default behavior (`m=0`), which leads to empty reads being retained, + causing problems in downstream applications in ZARP. We thus strongly + recommend to **not** set the value of `m` to `0`! Refer to `cutadapt`'s + [documentation][docs-cutadapt-m] for more information on the `m` + parameter. + - `-O 1`: minimal overlap of 1 (default: 3) - **Output** - Reads file (`.fastq.gz`); used in [**genome_quantification_kallisto**](#genome_quantification_kallisto), @@ -690,6 +704,7 @@ Generate pseudoalignments of reads to transcripts with [docs-bedgraphtobigwig]: <http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/> [docs-bedtools]: <https://bedtools.readthedocs.io/en/latest/> [docs-cutadapt]: <https://cutadapt.readthedocs.io/en/stable/> +[docs-cutadapt-m]: <https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads> [docs-gffread]: <http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread> [docs-fastqc]: <http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/> [docs-imagemagick]: <https://imagemagick.org/> diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml index 2cdec07f884f51f1267c421236ce45b39b45cf09..a42874f3fdd09019cca8e9e5da6e3a0eeeb4ea4c 100644 --- a/tests/input_files/rule_config.yaml +++ b/tests/input_files/rule_config.yaml @@ -1,22 +1,22 @@ ############################################################################# -# +# # __________________________________________________________________ # | WARNING: ONLY CHANGE THIS FILE IF YOU KNOW WHAT YOU'RE DOING!!! | # | ZARP DOES NOT GUARANTEE SENSIBLE RESULTS IF PARAMETERS | # | ARE CHANGED HERE. | # |__________________________________________________________________| -# -# RULE CONFIGURATION +# +# RULE CONFIGURATION # # For RUN SPECIFIC PARAMETERS (sample specific parameters have to be # defined in the samples table!) # # Specify path to this file in main config.yaml under key 'rule_config' # -# One top-level keyword per RULE (not per tool, as one tool might be used +# One top-level keyword per RULE (not per tool, as one tool might be used # with different settings by more than one rule) # -# Parameters have to be specified exactly like they have to appear on the +# Parameters have to be specified exactly like they have to appear on the # command line call (e.g. -n or --name) # # All values need to be QUOTED STRINGS; to specify flags (i.e., parameters @@ -33,7 +33,7 @@ # MAIN SNAKEFILE / SEQUENCING-MODE INDEPENDENT # ################################################ -#start: No parameters to change here +# start: No parameters to change here fastqc: @@ -41,7 +41,7 @@ create_index_star: extract_transcriptome: -#concatenate_transcriptome_and_genome: No parameters to change here +# concatenate_transcriptome_and_genome: No parameters to change here create_index_salmon: @@ -52,7 +52,8 @@ extract_transcripts_as_bed12: index_genomic_alignment_samtools: calculate_TIN_scores: - # Minimum number of reads mapped to a transcript (default 10, ZARP recommends 0) + # Minimum number of reads mapped to a transcript (default 10, ZARP + # recommends 0) -c: '0' salmon_quantmerge_genes: @@ -94,53 +95,103 @@ prepare_bigWig: ########################################## remove_adapters_cutadapt: - # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2) + # Search for all the given adapter sequences repeatedly, either until no + # adapter match was found or until n rounds have been performed (default 1, + # ZARP recommends 2) -n: '2' - # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + # Discard processed reads that are shorter than m; note that cutadapt uses + # a default value of m=0, causing reads without any nucleotides remaining + # after proessing to be retained; as "empty reads" will cause errors in + # downstream applications in ZARP, we have changed the default to m=1, + # meaning that only read fragments of at least 1 nt will be retained after + # processing. The default will be overridden by the value specified here, + # but for the reason stated above, we strongly recommend NOT to set m=0; + # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads -m: '10' pe_remove_adapters_cutadapt: - # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2) + # Search for all the given adapter sequences repeatedly, either until no + # adapter match was found or until n rounds have been performed (default 1, + # ZARP recommends 2) -n: '2' - # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + # Discard processed reads that are shorter than m; note that cutadapt uses + # a default value of m=0, causing reads without any nucleotides remaining + # after proessing to be retained; as "empty reads" will cause errors in + # downstream applications in ZARP, we have changed the default to m=1, + # meaning that only read fragments of at least 1 nt will be retained after + # processing. The default will be overridden by the value specified here, + # but for the reason stated above, we strongly recommend NOT to set m=0; + # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads -m: '10' remove_polya_cutadapt: - # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + # Discard processed reads that are shorter than m; note that cutadapt uses + # a default value of m=0, causing reads without any nucleotides remaining + # after proessing to be retained; as "empty reads" will cause errors in + # downstream applications in ZARP, we have changed the default to m=1, + # meaning that only read fragments of at least 1 nt will be retained after + # processing. The default will be overridden by the value specified here, + # but for the reason stated above, we strongly recommend NOT to set m=0; + # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads -m: '10' - # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As) - -O: '1' + # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in + # order to remove all 3' As) + -O: '1' pe_remove_polya_cutadapt: - # Discard processed reads that are shorter than 10 (default 0, ZARP strongly recommends m > 0, because empty reads might cause problems in downstream programs) + # Discard processed reads that are shorter than m; note that cutadapt uses + # a default value of m=0, causing reads without any nucleotides remaining + # after proessing to be retained; as "empty reads" will cause errors in + # downstream applications in ZARP, we have changed the default to m=1, + # meaning that only read fragments of at least 1 nt will be retained after + # processing. The default will be overridden by the value specified here, + # but for the reason stated above, we strongly recommend NOT to set m=0; + # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads -m: '10' - # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As) - -O: '1' + # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in + # order to remove all 3' As) + -O: '1' map_genome_star: - # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0) + # The score range below the maximum score for multimapping alignments + # (default 1, ZARP recommends 0) --outFilterMultimapScoreRange: '0' - # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of â€spurious†junctions ) + # Keep only those reads that contain junctions that passed filtering into + # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this + # reduces the number of spurious junctions ) --outFilterType: 'BySJout' pe_map_genome_star: - # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0) + # The score range below the maximum score for multimapping alignments + # (default 1, ZARP recommends 0) --outFilterMultimapScoreRange: '0' - # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of â€spurious†junctions ) + # Keep only those reads that contain junctions that passed filtering into + # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this + # reduces the number of spurious junctions ) --outFilterType: 'BySJout' quantification_salmon: - # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias + # Correct for sequence specific biases; cf. + # https://salmon.readthedocs.io/en/latest/salmon.html#seqbias --seqBias: '' - # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings) + # Enable selective alignment of the sequencing reads when mapping them to + # the transcriptome; this can improve both the sensitivity and specificity + # of mapping and, as a result, can improve quantification accuracy; cf. + # https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings --validateMappings: '' pe_quantification_salmon: - # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias + # Correct for sequence specific biases, cf. + # https://salmon.readthedocs.io/en/latest/salmon.html#seqbias --seqBias: '' - # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings) + # Enable selective alignment of the sequencing reads when mapping them to + # the transcriptome; this can improve both the sensitivity and specificity + # of mapping and, as a result, can improve quantification accuracy; cf. + # https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings --validateMappings: '' - # write out the names of reads (or mates in paired-end reads) that do not map to the transcriptome. For paired-end this gives flags that indicate how a read failed to map + # Write out the names of reads (or mates in paired-end reads) that do not + # map to the transcriptome. For paired-end libraries this gives flags that + # indicate how a read failed to map --writeUnmappedNames: '' genome_quantification_kallisto: diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 37c55460935a5bc1c86716f3ad69efc96e7e5fde..76a88f1dbb6f691f438136a088b73f374920d4dd 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -79,6 +79,7 @@ rule pe_remove_adapters_cutadapt: -g {params.adapter_5_mate1} \ -A {params.adapter_3_mate2} \ -G {params.adapter_5_mate2} \ + -m 1 \ {params.additional_params} \ -o {output.reads1} \ -p {output.reads2} \ @@ -177,6 +178,7 @@ rule pe_remove_polya_cutadapt: -g {params.polya_5_mate1} \ -A {params.polya_3_mate2} \ -G {params.polya_5_mate2} \ + -m 1 \ {params.additional_params} \ -o {output.reads1} \ -p {output.reads2} \ diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index e270aba84b8a70d69780aa5f31891b991dc8f54f..cd7bb1da424cce1066ebc4e6531dd3ea48204c91 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -66,6 +66,7 @@ rule remove_adapters_cutadapt: -j {threads} \ -a {params.adapters_3} \ -g {params.adapters_5} \ + -m 1 \ {params.additional_params} \ -o {output.reads} \ {input.reads}) \ @@ -140,6 +141,7 @@ rule remove_polya_cutadapt: -j {threads} \ -a {params.polya_3} \ -g {params.polya_5} \ + -m 1 \ {params.additional_params} \ -o {output.reads} \ {input.reads};) \