From bab8f25ab1484accbf2dfa32a032395c7ef3dd32 Mon Sep 17 00:00:00 2001
From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch>
Date: Tue, 17 Mar 2020 11:44:51 +0100
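Subject: [PATCH] Fix cutadapt overtrimming

Adapter removal (single- and paired-end): lower -n from 3 to 2; the
single-end rule additionally drops -O 1.
Poly(A) removal (single- and paired-end): lower -n from 2 to 1 and drop
--match-read-wildcards and -q 6; the paired-end rule gains -O 1.
Paired-end cutadapt rules: switch --pair-filter from "both" to "any".
pe_fastqc: use the zavolab/fastqc:0.11.9 image instead of 0.11.9-slim,
and move the closing parenthesis of the subshell after the final ';' so
that the stdout/stderr redirections attach to the subshell.
Update the expected md5 sums of the affected single-end integration
test outputs.

---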
 .../test_integration_workflow/expected_output.md5 | 14 +++++++-------
 workflow/rules/paired_end.snakefile.smk           | 15 +++++++--------
 workflow/rules/single_end.snakefile.smk           |  7 ++-----
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/tests/test_integration_workflow/expected_output.md5 b/tests/test_integration_workflow/expected_output.md5
index 4feedc0..c017ef0 100644
--- a/tests/test_integration_workflow/expected_output.md5
+++ b/tests/test_integration_workflow/expected_output.md5
@@ -54,8 +54,8 @@ c77480e0235761f2d7f80dbceb2e2806  results/paired_end/synthetic_10_reads_paired_s
 92bcd0592d22a6a58d0360fc76103e56  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/observed_bias
 92bcd0592d22a6a58d0360fc76103e56  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/observed_bias_3p
 d41d8cd98f00b204e9800998ecf8427e  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/unmapped_names.txt
-12ac6d56ed50ab74ce16a4d618612847  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_adapters_mate1.fastq
-12ac6d56ed50ab74ce16a4d618612847  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_polya_mate1.fastq
+500dd49da40b16799aba62aa5cf239ba  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_adapters_mate1.fastq
+500dd49da40b16799aba62aa5cf239ba  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_polya_mate1.fastq
 d41d8cd98f00b204e9800998ecf8427e  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_SJ.out.tab
 6c5d2ffd046e24384a7557aa9be0fdfd  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc_data.txt
 c0df759ceab72ea4b1a560f991fe6497  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc.fo
@@ -70,7 +70,7 @@ e4c1a39967ec9547a2e4c71c97982ee0  results/single_end/synthetic_10_reads_mate_1_s
 b28aac49f537b8cba364b6422458ad28  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_tile_quality.png
 5b950b5dfe3c7407e9aac153db330a38  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/sequence_length_distribution.png
 50a9b89a9f1da2c438cb0041b64faa0e  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/abundance.tsv
-3a727fbf59b74a85e1738b0eb3404a73  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/pseudoalignments.bam
+fd8242418230a4edb33350be2e4f1d78  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/pseudoalignments.bam
 d41d8cd98f00b204e9800998ecf8427e  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.kallisto.pseudo.sam
 e72f5d798c99272f8c0166dc77247db1  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/lib_format_counts.json
 989d6ee63b728fced9ec0249735ab83d  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/ambig_info.tsv
@@ -83,15 +83,15 @@ c266d31e0a2ad84975cb9de335891e64  results/paired_end/synthetic_10_reads_paired_s
 0139e75ddbfe6eb081c2c2d9b9108ab4  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/STAR_coverage/synthetic_10_reads_paired_synthetic_10_reads_paired_Signal.Unique.str1.out.bg
 c266d31e0a2ad84975cb9de335891e64  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/STAR_coverage/synthetic_10_reads_paired_synthetic_10_reads_paired_Signal.Unique.str2.out.bg
 ea91b4f85622561158bff2f7c9c312b3  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.str1.out.bg
-ede14ac41c10067838f375106fce4852  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.str2.out.bg
+bcccf679a8c083d01527514c9f5680a0  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.str2.out.bg
 ea91b4f85622561158bff2f7c9c312b3  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.Unique.str1.out.bg
-ede14ac41c10067838f375106fce4852  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.Unique.str2.out.bg
+bcccf679a8c083d01527514c9f5680a0  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/STAR_coverage/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.Unique.str2.out.bg
 3ce47cb1d62482c5d62337751d7e8552  results/transcriptome/homo_sapiens/transcriptome.fa
 6b44c507f0a1c9f7369db0bb1deef0fd  results/alfa_indexes/homo_sapiens/75/ALFA/sorted_genes.stranded.ALFA_index
 2caebc23faf78fdbbbdbb118d28bd6b5  results/alfa_indexes/homo_sapiens/75/ALFA/sorted_genes.unstranded.ALFA_index
 c1254a0bae19ac3ffc39f73099ffcf2b  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/ALFA/synthetic_10_reads_paired_synthetic_10_reads_paired.ALFA_feature_counts.tsv
 c266d31e0a2ad84975cb9de335891e64  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/ALFA/synthetic_10_reads_paired_synthetic_10_reads_paired_Signal.UniqueMultiple.out.minus.bg
 0139e75ddbfe6eb081c2c2d9b9108ab4  results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/ALFA/synthetic_10_reads_paired_synthetic_10_reads_paired_Signal.UniqueMultiple.out.plus.bg
-a9fdb9b135132dda339b85346525c9c5  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.ALFA_feature_counts.tsv
-ede14ac41c10067838f375106fce4852  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.out.minus.bg
+c1254a0bae19ac3ffc39f73099ffcf2b  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.ALFA_feature_counts.tsv
+bcccf679a8c083d01527514c9f5680a0  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.out.minus.bg
 ea91b4f85622561158bff2f7c9c312b3  results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Signal.UniqueMultiple.out.plus.bg
\ No newline at end of file
diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk
index 096f895..d49d55f 100644
--- a/workflow/rules/paired_end.snakefile.smk
+++ b/workflow/rules/paired_end.snakefile.smk
@@ -24,7 +24,7 @@ rule pe_fastqc:
     threads: 2
 
     singularity:
-        "docker://zavolab/fastqc:0.11.9-slim"
+        "docker://zavolab/fastqc:0.11.9"
 
     log:
         stderr = os.path.join(
@@ -42,7 +42,7 @@ rule pe_fastqc:
         "(mkdir -p {output.outdir1}; \
         mkdir -p {output.outdir2}; \
         fastqc --outdir {output.outdir1} {input.reads1}; \
-        fastqc --outdir {output.outdir2} {input.reads2}); \
+        fastqc --outdir {output.outdir2} {input.reads2};) \
         1> {log.stdout} 2> {log.stderr}"
 
 
@@ -99,9 +99,9 @@ rule pe_remove_adapters_cutadapt:
         "(cutadapt \
         -e 0.1 \
         -j {threads} \
-        --pair-filter=both \
+        --pair-filter=any \
         -m 10 \
-        -n 3 \
+        -n 2 \
         -a {params.adapter_3_mate1} \
         -g {params.adapter_5_mate1} \
         -A {params.adapter_3_mate2} \
@@ -166,13 +166,12 @@ rule pe_remove_polya_cutadapt:
 
     shell:
         "(cutadapt \
-        --match-read-wildcards \
         -j {threads} \
-        --pair-filter=both \
+        --pair-filter=any \
         -m 10 \
-        -n 2 \
+        -n 1 \
         -e 0.1 \
-        -q 6 \
+        -O 1 \
         -a {params.polya_3_mate1} \
         -A {params.polya_3_mate2} \
         -o {output.reads1} \
diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk
index 5f9372b..3857baa 100644
--- a/workflow/rules/single_end.snakefile.smk
+++ b/workflow/rules/single_end.snakefile.smk
@@ -78,10 +78,9 @@ rule remove_adapters_cutadapt:
     shell:
         "(cutadapt \
         -e 0.1 \
-        -O 1 \
         -j {threads} \
         -m 10 \
-        -n 3 \
+        -n 2 \
         -a {params.adapters_3} \
         -g {params.adapters_5} \
         -o {output.reads} \
@@ -130,12 +129,10 @@ rule remove_polya_cutadapt:
 
     shell:
         "(cutadapt \
-        --match-read-wildcards \
         -j {threads} \
-        -n 2 \
+        -n 1 \
         -e 0.1 \
         -O 1 \
-        -q 6 \
         -m 10  \
         -a {params.polya_3} \
         -o {output.reads} \
-- 
GitLab