rephrased and reformatted

aae455f7 · Alex Kanitz · 838ead97 · aae455f7 · aae455f7
Unverified Commit aae455f7 authored 3 years ago by Alex Kanitz
--- a/pipeline_documentation.md
+++ b/pipeline_documentation.md
@@ -557,7 +557,14 @@ Remove adapter sequences from reads with
    - Adapters to be removed; specify in sample table columns `fq1_3p`, `fq1_5p`,
    `fq2_3p`, `fq2_5p`
  - **rule_config.yaml:**
-    - `-m 10`: Discard processed reads that are shorter than 10 nt (cutadapt's default is 0 (see [cutadapt docs][docs-cutadapt-m]), which will keep reads even if they are empty; Because empty reads will cause problems in downstream programs, -m=1 (keep reads only if at least 1nt long) is hardcoded in the snakefile). That value will be overwritten by the value specified in `rule_config.yaml`)
+    - `-m 10`: Discard processed reads that are shorter than 10 nt. If
+    specified in `rule_config.yaml`, it will override ZARP's default value of
+    `m=1` for this parameter. Note that this is different from `cutadapt`'s
+    default behavior (`m=0`), which leads to empty reads being retained,
+    causing problems in downstream applications in ZARP. We thus strongly
+    recommend **not** setting the value of `m` to `0`! Refer to `cutadapt`'s
+    [documentation][docs-cutadapt-m] for more information on the `m`
+    parameter.
    - `-n 2`: search for all the given adapter sequences repeatedly, either until
    no adapter match was found or until 2 rounds have been performed. (default 1)
@@ -579,8 +586,15 @@ Remove poly(A) tails from reads with
  - **samples.tsv**
    - Poly(A) stretches to be removed; specify in sample table columns `fq1_polya` and `fq2_polya`
  - **rule_config.yaml**
-    - `-m 10`: Discard processed reads that are shorter than 10 nt (cutadapt's default is 0 (see [cutadapt docs][docs-cutadapt-m]), which will keep reads even if they are empty; Because empty reads will cause problems in downstream programs, -m=1 (keep reads only if at least 1nt long) is hardcoded in the snakefile). That value will be overwritten by the value specified in `rule_config.yaml`)
+    - `-m 10`: Discard processed reads that are shorter than 10 nt. If
-      - `-O 1`: minimal overlap of 1 (default: 3)
+    specified in `rule_config.yaml`, it will override ZARP's default value of
+    `m=1` for this parameter. Note that this is different from `cutadapt`'s
+    default behavior (`m=0`), which leads to empty reads being retained,
+    causing problems in downstream applications in ZARP. We thus strongly
+    recommend **not** setting the value of `m` to `0`! Refer to `cutadapt`'s
+    [documentation][docs-cutadapt-m] for more information on the `m`
+    parameter.
+    - `-O 1`: minimal overlap of 1 (default: 3)
 - **Output**
  - Reads file (`.fastq.gz`); used in
    [**genome_quantification_kallisto**](#genome_quantification_kallisto),

--- a/tests/input_files/rule_config.yaml
+++ b/tests/input_files/rule_config.yaml
 #############################################################################
-# 
+#
 #   __________________________________________________________________
 #  | WARNING: ONLY CHANGE THIS FILE IF YOU KNOW WHAT YOU'RE DOING!!!  |
 #  |          ZARP DOES NOT GUARANTEE SENSIBLE RESULTS IF PARAMETERS  |
 #  |          ARE CHANGED HERE.                                       |
 #  |__________________________________________________________________|
-# 
+#
-# RULE CONFIGURATION 
+# RULE CONFIGURATION
 #
 # For RUN SPECIFIC PARAMETERS (sample specific parameters have to be
 # defined in the samples table!)
 #
 # Specify path to this file in main config.yaml under key 'rule_config'
 #
-# One top-level keyword per RULE (not per tool, as one tool might be used 
+# One top-level keyword per RULE (not per tool, as one tool might be used
 # with different settings by more than one rule)
 #
-# Parameters have to be specified exactly like they have to appear on the 
+# Parameters have to be specified exactly like they have to appear on the
 # command line call (e.g. -n or --name)
 #
 # All values need to be QUOTED STRINGS; to specify flags (i.e., parameters
@@ -33,7 +33,7 @@
 # MAIN SNAKEFILE / SEQUENCING-MODE INDEPENDENT #
 ################################################
-#start: No parameters to change here
+# start: No parameters to change here
 fastqc:
@@ -41,7 +41,7 @@ create_index_star:
 extract_transcriptome:
-#concatenate_transcriptome_and_genome: No parameters to change here
+# concatenate_transcriptome_and_genome: No parameters to change here
 create_index_salmon:
@@ -52,7 +52,8 @@ extract_transcripts_as_bed12:
 index_genomic_alignment_samtools:
 calculate_TIN_scores:
-    # Minimum number of reads mapped to a transcript (default 10, ZARP recommends 0)
+    # Minimum number of reads mapped to a transcript (default 10, ZARP
+    # recommends 0)
    -c: '0'
 salmon_quantmerge_genes:
@@ -94,57 +95,103 @@ prepare_bigWig:
 ##########################################
 remove_adapters_cutadapt:
-    # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2)
+    # Search for all the given adapter sequences repeatedly, either until no
+    # adapter match was found or until n rounds have been performed (default 1,
+    # ZARP recommends 2)
    -n: '2'
-    # Discard processed reads that are shorter than m (cutadapt's default is 0, keeping reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
+    # Discard processed reads that are shorter than m; note that cutadapt uses
-    # Hardcoded to -m=1 (Keep reads only if at least 1nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. keep reads only if they are at least 10nt long!)
+    # a default value of m=0, causing reads without any nucleotides remaining
+    # after processing to be retained; as "empty reads" will cause errors in
+    # downstream applications in ZARP, we have changed the default to m=1,
+    # meaning that only read fragments of at least 1 nt will be retained after
+    # processing. The default will be overridden by the value specified here,
+    # but for the reason stated above, we strongly recommend NOT to set m=0;
+    # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads
    -m: '10'
 pe_remove_adapters_cutadapt:
-    # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2)
+    # Search for all the given adapter sequences repeatedly, either until no
+    # adapter match was found or until n rounds have been performed (default 1,
+    # ZARP recommends 2)
    -n: '2'
-    # Discard processed reads that are shorter than m (cutadapt's default is 0, keeping reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
+    # Discard processed reads that are shorter than m; note that cutadapt uses
-    # Hardcoded to -m=1 (Keep reads only if at least 1nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. keep reads only if they are at least 10nt long!)
+    # a default value of m=0, causing reads without any nucleotides remaining
+    # after processing to be retained; as "empty reads" will cause errors in
+    # downstream applications in ZARP, we have changed the default to m=1,
+    # meaning that only read fragments of at least 1 nt will be retained after
+    # processing. The default will be overridden by the value specified here,
+    # but for the reason stated above, we strongly recommend NOT to set m=0;
+    # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads
    -m: '10'
 remove_polya_cutadapt:
-    # Discard processed reads that are shorter than m (cutadapt's default is 0, keeping reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
+    # Discard processed reads that are shorter than m; note that cutadapt uses
-    # Hardcoded to -m=1 (Keep reads only if at least 1nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. keep reads only if they are at least 10nt long!)
+    # a default value of m=0, causing reads without any nucleotides remaining
+    # after processing to be retained; as "empty reads" will cause errors in
+    # downstream applications in ZARP, we have changed the default to m=1,
+    # meaning that only read fragments of at least 1 nt will be retained after
+    # processing. The default will be overridden by the value specified here,
+    # but for the reason stated above, we strongly recommend NOT to set m=0;
+    # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads
    -m: '10'
-    # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As)
+    # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in
-    -O: '1' 
+    # order to remove all 3' As)
+    -O: '1'
 pe_remove_polya_cutadapt:
-    # Discard processed reads that are shorter than m (cutadapt's default is 0, keeping reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
+    # Discard processed reads that are shorter than m; note that cutadapt uses
-    # Hardcoded to -m=1 (Keep reads only if at least 1nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. keep reads only if they are at least 10nt long!)
+    # a default value of m=0, causing reads without any nucleotides remaining
+    # after processing to be retained; as "empty reads" will cause errors in
+    # downstream applications in ZARP, we have changed the default to m=1,
+    # meaning that only read fragments of at least 1 nt will be retained after
+    # processing. The default will be overridden by the value specified here,
+    # but for the reason stated above, we strongly recommend NOT to set m=0;
+    # cf. https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads
    -m: '10'
-    # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As)
+    # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in
-    -O: '1' 
+    # order to remove all 3' As)
+    -O: '1'
 map_genome_star:
-    # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0)
+    # The score range below the maximum score for multimapping alignments
+    # (default 1, ZARP recommends 0)
    --outFilterMultimapScoreRange: '0'
-    # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions )
+    # Keep only those reads that contain junctions that passed filtering into
+    # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this
+    # reduces the number of spurious junctions)
    --outFilterType: 'BySJout'
 pe_map_genome_star:
-    # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0)
+    # The score range below the maximum score for multimapping alignments
+    # (default 1, ZARP recommends 0)
    --outFilterMultimapScoreRange: '0'
-    # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions )
+    # Keep only those reads that contain junctions that passed filtering into
+    # "SJ.out.tab" (default 'Normal', ZARP recommends 'BySJout', as this
+    # reduces the number of spurious junctions)
    --outFilterType: 'BySJout'
 quantification_salmon:
-    # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias
+    # Correct for sequence specific biases; cf.
+    # https://salmon.readthedocs.io/en/latest/salmon.html#seqbias
    --seqBias: ''
-    # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings)
+    # Enable selective alignment of the sequencing reads when mapping them to
+    # the transcriptome; this can improve both the sensitivity and specificity
+    # of mapping and, as a result, can improve quantification accuracy; cf.
+    # https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings
    --validateMappings: ''
 pe_quantification_salmon:
-    # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias
+    # Correct for sequence specific biases; cf.
+    # https://salmon.readthedocs.io/en/latest/salmon.html#seqbias
    --seqBias: ''
-    # enables selective alignment of the sequencing reads when mapping them to the transcriptome; this can improve both the sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings)
+    # Enable selective alignment of the sequencing reads when mapping them to
+    # the transcriptome; this can improve both the sensitivity and specificity
+    # of mapping and, as a result, can improve quantification accuracy; cf.
+    # https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings
    --validateMappings: ''
-    # write out the names of reads (or mates in paired-end reads) that do not map to the transcriptome. For paired-end this gives flags that indicate how a read failed to map
+    # Write out the names of reads (or mates in paired-end reads) that do not
+    # map to the transcriptome. For paired-end libraries this gives flags that
+    # indicate how a read failed to map
    --writeUnmappedNames: ''
 genome_quantification_kallisto: