From 838ead9789c302a84df9d7fb87a84b466bcc2cc6 Mon Sep 17 00:00:00 2001
From: CJ Herrmann <christina.herrmann@unibas.ch>
Date: Wed, 14 Jul 2021 09:21:28 +0200
Subject: [PATCH] improved documentation

---
 pipeline_documentation.md          |  5 +++--
 tests/input_files/rule_config.yaml | 16 ++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/pipeline_documentation.md b/pipeline_documentation.md
index e7a4be1..9d1ef4b 100644
--- a/pipeline_documentation.md
+++ b/pipeline_documentation.md
@@ -557,7 +557,7 @@ Remove adapter sequences from reads with
     - Adapters to be removed; specify in sample table columns `fq1_3p`, `fq1_5p`,
     `fq2_3p`, `fq2_5p`
   - **rule_config.yaml:**
-    - `-m 10`: Discard processed reads that are shorter than 10 (default 0; Because empty reads will cause problems in downstream programs, -m=1 is hardcoded in the snakefile. That value will be overwritten by the value specified in `rule_config.yaml`)
+    - `-m 10`: Discard processed reads that are shorter than 10 nt. The cutadapt default is 0 (see [cutadapt docs][docs-cutadapt-m]), which keeps reads even if they are empty. Because empty reads will cause problems in downstream programs, -m=1 (keep reads only if they are at least 1 nt long) is hardcoded in the snakefile; that value will be overwritten by the value specified in `rule_config.yaml`.
     - `-n 2`: search for all the given adapter sequences repeatedly, either until
     no adapter match was found or until 2 rounds have been performed. (default 1)
 
@@ -579,7 +579,7 @@ Remove poly(A) tails from reads with
   - **samples.tsv**
     - Poly(A) stretches to be removed; specify in sample table columns `fq1_polya` and `fq2_polya`
   - **rule_config.yaml**
-    - `-m 10`: Discard processed reads that are shorter than 10 (default 0; Because empty reads will cause problems in downstream programs, -m=1 is hardcoded in the snakefile. That value will be overwritten by the value specified in `rule_config.yaml`)
+    - `-m 10`: Discard processed reads that are shorter than 10 nt. The cutadapt default is 0 (see [cutadapt docs][docs-cutadapt-m]), which keeps reads even if they are empty. Because empty reads will cause problems in downstream programs, -m=1 (keep reads only if they are at least 1 nt long) is hardcoded in the snakefile; that value will be overwritten by the value specified in `rule_config.yaml`.
       - `-O 1`: minimal overlap of 1 (default: 3)
 - **Output**
   - Reads file (`.fastq.gz`); used in
@@ -690,6 +690,7 @@ Generate pseudoalignments of reads to transcripts with
 [docs-bedgraphtobigwig]: <http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/>
 [docs-bedtools]: <https://bedtools.readthedocs.io/en/latest/>
 [docs-cutadapt]: <https://cutadapt.readthedocs.io/en/stable/>
+[docs-cutadapt-m]: <https://cutadapt.readthedocs.io/en/stable/guide.html#filtering-reads>
 [docs-gffread]: <http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread>
 [docs-fastqc]: <http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/>
 [docs-imagemagick]: <https://imagemagick.org/>
diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml
index c7f1242..25e933b 100644
--- a/tests/input_files/rule_config.yaml
+++ b/tests/input_files/rule_config.yaml
@@ -96,27 +96,27 @@ prepare_bigWig:
 remove_adapters_cutadapt:
     # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2)
     -n: '2'
-    # Discard processed reads that are shorter than m (default 0, ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
-    # Hardcoded to -m=1 in both snakefiles; that value will be overwritten by the -m value specified here!)
+    # Discard processed reads that are shorter than m (cutadapt's default is 0, which keeps reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs.
+    # Hardcoded to -m=1 (keep reads only if at least 1 nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. -m=10 keeps reads only if they are at least 10 nt long!)
     -m: '10'
 
 pe_remove_adapters_cutadapt:
     # search for all the given adapter sequences repeatedly, either until no adapter match was found or until n rounds have been performed. (default 1, ZARP recommends 2)
     -n: '2'
-    # Discard processed reads that are shorter than m (default 0, ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
-    # Hardcoded to -m=1 in both snakefiles; that value will be overwritten by the -m value specified here!)
+    # Discard processed reads that are shorter than m (cutadapt's default is 0, which keeps reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs.
+    # Hardcoded to -m=1 (keep reads only if at least 1 nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. -m=10 keeps reads only if they are at least 10 nt long!)
     -m: '10'
 
 remove_polya_cutadapt:
-    # Discard processed reads that are shorter than m (default 0, ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
-    # Hardcoded to -m=1 in both snakefiles; that value will be overwritten by the -m value specified here!)
+    # Discard processed reads that are shorter than m (cutadapt's default is 0, which keeps reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs.
+    # Hardcoded to -m=1 (keep reads only if at least 1 nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. -m=10 keeps reads only if they are at least 10 nt long!)
     -m: '10'
     # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As)
     -O: '1' 
 
 pe_remove_polya_cutadapt:
-    # Discard processed reads that are shorter than m (default 0, ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs; 
-    # Hardcoded to -m=1 in both snakefiles; that value will be overwritten by the -m value specified here!)
+    # Discard processed reads that are shorter than m (cutadapt's default is 0, which keeps reads even if they are empty. ZARP strongly recommends m > 0, because empty reads will cause problems in downstream programs.
+    # Hardcoded to -m=1 (keep reads only if at least 1 nt is left) in both snakefiles; that value will be overwritten by the -m value specified here, e.g. -m=10 keeps reads only if they are at least 10 nt long!)
     -m: '10'
     # Minimal overlap of adapter and read (default 3, ZARP recommends 1 in order to remove all 3' As)
     -O: '1' 
-- 
GitLab