Commit 5c7fb77c authored by Ticlla Ccenhua Monica Roxana
add rules for pre-processing of fastq files and to merge fastqs from the same sample

parent e17c37c4
......@@ -15,9 +15,11 @@ rawQC:
samplerate: 0.1
preprocess:
trim_adapters:
adapter: 'CTGTCTCTTATACACATCT'
min_length: 50
trim_tails: 1
filter_human:
# Activate 'low-memory' mode for bbmap
# -------------------------------------
# value is either t (true) or f (false)
......
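For orientation, the sketch below shows how these re-nested keys resolve once the config is loaded; the rules further down read them via config['preprocess']['trim_adapters'][...] and config['preprocess']['filter_human'][...]. It assumes a YAML file with exactly the nesting shown above; the file name config.yaml and the explicit loading step are illustrative, since Snakemake normally loads the config itself.

# Minimal sketch of how the re-nested config keys resolve once the file is loaded.
# Assumes a YAML file with exactly the nesting shown above; the file name
# "config.yaml" and the explicit yaml.safe_load() call are illustrative only
# (in the workflow, Snakemake populates `config` itself).
import yaml

with open("config.yaml", encoding="utf-8") as handle:
    config = yaml.safe_load(handle)

adapter    = config["preprocess"]["trim_adapters"]["adapter"]         # 'CTGTCTCTTATACACATCT'
min_length = config["preprocess"]["trim_adapters"]["min_length"]      # 50
trim_tails = config["preprocess"]["trim_adapters"]["trim_tails"]      # 1
low_mem    = config["preprocess"]["filter_human"]["bbmap_usemodulo"]  # 't' or 'f'

print(adapter, min_length, trim_tails, low_mem)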
......@@ -9,4 +9,24 @@ datasets from human microbiomes.
Modules
-------
**rawQC**
runs FastQC on a random sample of R1 reads from the paired fastq-format files.
**preprocess**
FastQC only performs a quality check; no QC processing is done. The **preprocess**
rule runs a multi-step pre-processing of the paired fastq files, which includes:
- adapter trimming with fastp: a quality check is performed and both paired fastq files are processed as follows:
  + remove adapters: here we provide the Nextera XT adapters
  + base correction in overlapped regions
  + trimming of the last base in read 1
  + discarding of reads shorter than a minimum length, after trimming
  + a report with quality checks, before and after processing
- removal of reads derived from human DNA
- removal of duplicated reads
**merge_samples**
merges the pre-processed fastq files corresponding to the same sample into a single pair of fastq files.
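The rules in this commit write each pre-processing stage to its own directory, with an alphabetical prefix encoding the order of the steps. The sketch below summarises the R1 output locations as they can be read off the rules further down; the fastq suffix written by the adapter-trimming stage (_atrimmed) is not visible in this diff and is therefore left out, and the dict itself is purely illustrative.

# Illustrative summary (not part of the workflow): per-stage output locations for the
# forward (R1) reads, as read off the rules in this commit. {dataset}, {fastq_file}
# and {sample} are the Snakemake wildcards used below.
PREPROCESS_STAGES = {
    "a: trim_adapters":   "{dataset}_atrimmed/  (fastq suffix not shown in this diff)",
    "b: filter_human":    "{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz",
    "c: dedupe":          "{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz",
    "d: trim3end_dedupe": "{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz",
    "merge_samples":      "{dataset}_merged/{sample}-R1.fastq.gz",
}
for stage, location in PREPROCESS_STAGES.items():
    print(stage, "->", location)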
'''
Author: Monica R. Ticlla
Affiliation(s): SIB, SwissTPH, UNIBAS
Description: after pre-processing of paired fastq files from paired-end shotgun
DNA sequencing of metagenomic samples, fastq files corresponding to the same
sample are merged into a single pair of fastq files.
'''
localrules:
multiqc_merged_list_files,
multiqc_merged
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
###**THESE TARGET FILES ARE THE FINAL, CLEAN FASTQS**###
# concatenate cleaned, deduplicated and trimmed fastqs from the same sample
rule concatenate_fastqs:
input:
# R1 (forward) files of every pre-processed unit belonging to this sample and dataset
sample_fwds = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
for ix,value in enumerate(SAMPLES) if (value == wildcards.sample) and (DATASETS[ix] == wildcards.dataset)],
# R2 (reverse) files of every pre-processed unit belonging to this sample and dataset
sample_revs = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R2.clean.nodup.fastp.fastq.gz'.format(
PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
for ix,value in enumerate(SAMPLES) if (value==wildcards.sample) and (DATASETS[ix]==wildcards.dataset)]
output:
sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
wildcard_constraints:
sample = r'\w+'
group: 'preprocess'
shell:
'''
cat {input.sample_fwds} > {output.sample_fwd}
cat {input.sample_revs} > {output.sample_rev}
'''
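The input functions above collect, for a given dataset/sample pair, every pre-processed fastq unit of that sample across runs and lanes. The sketch below shows how the forward-read comprehension expands; the parallel lists DATASETS, SAMPLES, RUNS and LANES are filled with made-up values here, since their real definitions are not part of this diff.

# Illustrative sketch (not part of the rules file): how the sample_fwds input
# function of concatenate_fastqs expands. The parallel lists below are made-up
# examples; in the workflow they are defined elsewhere and are not shown in this diff.
PRE_PROC_DIR = "preprocessed"
DATASETS = ["dsA", "dsA", "dsB"]
SAMPLES  = ["s1", "s1", "s2"]
RUNS     = ["run1", "run2", "run1"]
LANES    = ["L001", "L001", "L002"]

def sample_fwds(dataset, sample):
    # keep only the units of this sample within this dataset, across runs and lanes
    return ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
                PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
            for ix, value in enumerate(SAMPLES)
            if value == sample and DATASETS[ix] == dataset]

print(sample_fwds("dsA", "s1"))
# ['preprocessed/dsA_dfinaltrim/s1-run1_L001-R1.clean.nodup.fastp.fastq.gz',
#  'preprocessed/dsA_dfinaltrim/s1-run2_L001-R1.clean.nodup.fastp.fastq.gz']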
# Final quality check with Fastp, but no further QC processing
rule fastp_concatenated:
input:
sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
output:
report1 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.html',
report2 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.json'
wildcard_constraints:
sample = r'\w+'
threads: cpus_avail
group: 'preprocess'
singularity: singularity_img
shell:
'''
fastp \
-A \
--in1 {input.sample_fwd} --in2 {input.sample_rev} \
--html {output.report1} --json {output.report2} \
--thread {threads}
'''
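This rule runs fastp purely to produce per-sample reports (no output fastq files are declared), and the JSON report is what the MultiQC rules below consume. The sketch below pulls a single summary figure out of such a report; the nested key names are assumed to follow the usual fastp JSON layout and are not guaranteed by this workflow.

# Illustrative sketch: pulling one number out of a per-sample fastp JSON report
# produced by the rule above. The nested key names ("summary" -> "before_filtering"
# -> "total_reads") follow the usual fastp report layout but are an assumption here,
# hence the defensive .get() calls.
import json

def total_reads(fastp_json_path):
    with open(fastp_json_path, encoding="utf-8") as handle:
        report = json.load(handle)
    return report.get("summary", {}).get("before_filtering", {}).get("total_reads")

# Hypothetical path following the MERGE_DIR naming scheme used above:
# print(total_reads("merged/dsA_merged/s1.fastp.json"))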
rule multiqc_merged_list_files:
input:
sample_fastp_report = lambda wildcards: ['{}/{}_merged/{}.fastp.json'.format(
MERGE_DIR, wildcards.dataset, value) for ix,value in enumerate(SAMPLES)
if DATASETS[ix]==wildcards.dataset]
output:
multiqc_input_list = MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
run:
import os
try:
os.makedirs(os.path.dirname(output.multiqc_input_list))
except OSError:
pass
with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
for item in set(input.sample_fastp_report):
out.write("%s\n" % item)
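For reference, the run block above amounts to the standalone helper sketched below: create the report directory if needed and write the de-duplicated set of fastp JSON paths, one per line, for multiqc --file-list. The helper is illustrative only.

# Illustrative sketch (not part of the rules file): the run block above as a plain
# function. os.makedirs(..., exist_ok=True) is an equivalent form of the
# try/except used in the rule when the directory already exists.
import os

def write_multiqc_input_list(report_paths, list_file):
    os.makedirs(os.path.dirname(list_file), exist_ok=True)
    with open(list_file, mode="w", encoding="utf-8") as out:
        for item in set(report_paths):
            out.write("%s\n" % item)

# Hypothetical usage:
# write_multiqc_input_list(["dsA_merged/s1.fastp.json", "dsA_merged/s1.fastp.json"],
#                          "reports/dsA_multiqc_inputs.txt")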
rule multiqc_merged:
input:
MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
output:
multiqc_report=report(MERGE_REPORT + '/{dataset}_multiqc.html',
category='merge_samples')
singularity: singularity_img
shell:
'''
multiqc -f --file-list {input} --filename {output.multiqc_report}
'''
......@@ -5,14 +5,14 @@ Description: rules for pre-processing of paired-end shotgun DNA metagenomic
sequencing data.
'''
localrules:
multiqc_preprocess_listing_files,
multiqc_preprocess_list_files,
multiqc_preprocess
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
BBMAP_REF = config['preprocess']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
##----------------------------------------------------------------------------##
## Rules with target files
......@@ -21,12 +21,17 @@ BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
# FastQC only performs a quality check but no QC processing is done.
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
# - remove adapters: here we provide the Nextera XT adapters,
# check this page from Illumina to know which adapters to remove:
# - fastp's default quality filtering
# - remove adapters: enabled by default, fastp detects adapters by per-read overlap
#   analysis (it seeks the overlap of each read pair). If fastp fails to find an
#   overlap, the adapter sequences provided here are used (e.g. the Nextera XT adapters).
# Check this page from Illumina to know which adapters to remove:
# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
# - base correction in overlapped regions
# - trimming of the last base in read 1
# - trimming of the last base(s) of every read pair (this step precedes adapter removal)
# - discard reads shorter than a minimum length, after trimming
# WARNING: cutting by quality score is not done at this stage because it interferes
# with the deduplication step
# It provides a report with quality checks, before and after processing
rule trim_adapters:
input:
......@@ -45,8 +50,9 @@ rule trim_adapters:
LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
params:
fastp_dir = PRE_PROC_DIR,
adapter = config['preprocess']['adapter'],
min_length = config['preprocess']['min_length']
adapter = config['preprocess']['trim_adapters']['adapter'],
min_length = config['preprocess']['trim_adapters']['min_length'],
trim_tails = config['preprocess']['trim_adapters']['trim_tails']
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -54,18 +60,18 @@ rule trim_adapters:
'''
echo "Running trim_adapters with {threads} cores."
mkdir -p {params.fastp_dir}
fastp \
(fastp \
--adapter_sequence {params.adapter} \
--adapter_sequence_r2 {params.adapter} \
--detect_adapter_for_pe \
--correction \
--overrepresentation_analysis \
--length_required {params.min_length} \
--trim_tail1 1 \
--trim_tail1 {params.trim_tails} \
--trim_tail2 {params.trim_tails} \
--in1 {input.fwd} --in2 {input.rev} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
--thread {threads}
--thread {threads}) &>{log}
'''
# After pre-processing with fastp, human reads have to be removed
#
......@@ -84,8 +90,8 @@ rule filter_human:
log:
LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
params:
low_mem = config['preprocess']['bbmap_usemodulo'],
mem_gb = config['preprocess']['bbmap_mem']
low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
threads:cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -93,7 +99,7 @@ rule filter_human:
'''
echo "Running filter_human with {threads} cores."
mkdir -p {PRE_PROC_DIR}
bbmap.sh \
(bbmap.sh \
usemodulo={params.low_mem} \
minid=0.95 \
maxindel=3 \
......@@ -112,7 +118,7 @@ rule filter_human:
scafstats={output.bbmap_scafstats} \
statsfile={output.bbmap_refstats} \
unpigz=t pigz=t -Xmx{params.mem_gb}g \
threads={threads}
threads={threads}) &>{log}
'''
# This is an intensive job
......@@ -125,7 +131,7 @@ rule index_human_ref:
log:
LOGS_DIR + '/preprocess/ref_indexing.log'
params:
usemodulo=config['preprocess']['bbmap_usemodulo']
usemodulo=config['preprocess']['filter_human']['bbmap_usemodulo']
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -134,13 +140,13 @@ rule index_human_ref:
echo "Running index_human_ref with {threads} cores."
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
bbmap.sh \
(bbmap.sh \
ref={input.reference} \
usemodulo={params.usemodulo} \
path={output} \
threads={threads}
threads={threads}) &>{log}
'''
#
# Deduplication step with BBTools' dedupe.sh
rule dedupe:
input:
fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
......@@ -156,7 +162,7 @@ rule dedupe:
group: 'preprocess'
shell:
'''
dedupe.sh \
(dedupe.sh \
in1={input.fwd_clean} in2={input.rev_clean} \
out=stdout.fq \
outd={output.fastq_duplicates} \
......@@ -165,43 +171,45 @@ rule dedupe:
int=t in=stdin.fq \
out1={output.fwd_clean_dedup} \
out2={output.rev_clean_dedup} \
threads={threads}
threads={threads}) &>{log}
'''
# After quality filtering and removal of adapters, human reads, and duplicates,
# the reads' 3'end are quality trimmed
# After removal of adapters, human reads, and duplicates,
# the reads' 3' ends are quality-trimmed with fastp
# Notice that quality filtering is disabled because it was done by rule trim_adapters
rule trim3end_dedupe:
input:
fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
params:
fastp_dir = PRE_PROC_DIR,
min_length = config['preprocess']['min_length']
min_length = config['preprocess']['trim_adapters']['min_length']
output:
fwd_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
rev_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
report1 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
report2 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
log:
# per-unit log for the final 3'-end trimming step (path follows the other pre-processing rules)
LOGS_DIR + '/preprocess/{dataset}_dfinaltrim/{fastq_file}.log'
threads: int(cpus_avail/4)
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
shell:
'''
fastp \
(fastp \
-Q \
--overrepresentation_analysis \
--length_required {params.min_length} \
--cut_tail -W 4 -M 20 \
--in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
--thread {threads}
--thread {threads}) &>{log}
'''
rule multiqc_preprocess_listing_files:
rule multiqc_preprocess_list_files:
input:
#
atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
PRE_PROC_DIR, value, FASTQS[ix])
for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
#atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
#PRE_PROC_DIR, value, FASTQS[ix])
#for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
#
dfinaltrim = lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(
PRE_PROC_DIR, value, FASTQS[ix])
......@@ -216,7 +224,7 @@ rule multiqc_preprocess_listing_files:
pass
with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
for item in input.atrimmed+input.dfinaltrim:
for item in input.dfinaltrim:
out.write("%s\n" % item)
rule multiqc_preprocess:
......