Skip to content
Snippets Groups Projects
Commit a1c1f04b authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

add target rule "preprocess" for pre-processing of paired fastq files

parent 2ca82660
Branches
No related tags found
No related merge requests found
......@@ -10,7 +10,7 @@ import os
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
print('CPUs available: {}'.format(cpus_avail))
##----------------------------------------------------------------------------##
## Working directory
##----------------------------------------------------------------------------##
......@@ -62,9 +62,13 @@ RESULTS_DIR = config['PREFIX_DIR'] +'/results'
LOGS_DIR = config['PREFIX_DIR'] +'/logs'
REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
RAW_QC_DIR = RESULTS_DIR + '/rawQC'
RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
PRE_PROC_DIR = RESULTS_DIR + '/preprocessed'
PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed'
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
......@@ -91,4 +95,13 @@ rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
# Target rule: requesting the per-dataset MultiQC report pulls the whole
# pre-processing chain defined in rules/preprocess.smk.
rule preprocess:
    input:
        expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
#rule merge_samples:
# input:
# expand()
include: "rules/rawQC.smk"
include: "rules/preprocess.smk"
......@@ -13,4 +13,26 @@ SAMPLE_UNITS:
#-------------------------------------------------------------------------------
rawQC:
samplerate: 0.1
preprocess:
adapter: 'CTGTCTCTTATACACATCT'
min_length: 50
# Activate 'low-memory' mode for bbmap
# -------------------------------------
# value is either t (true) or f (false)
# BBMap normally uses roughly 6 bytes per reference base. With low-memory mode
# (triggered by the flag “usemodulo”), BBMap will use approximately 3 bytes
# per base, with a slight reduction in sensitivity
bbmap_usemodulo: t
bbmap_mem: 10
# PATH to reference (e.g. human) genome
# -----------------------------------
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
# How the human genome was masked is explained in this SEQanswers entry
# http://seqanswers.com/forums/archive/index.php/t-42552.html
bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
#bbmap_reference: 'external/reference_test_2.fa.gz'
bbmap_ref_dir: 'external/ref_indexed'
'''
Author: Monica R. Ticlla
Affiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for pre-processing of paired-end shotgun DNA metagenomic
sequencing.
'''
# Lightweight bookkeeping rules run on the submit host, not the cluster.
localrules:
    multiqc_preprocess_listing_files,
    multiqc_preprocess
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
# Singularity image providing the tools used below (fastp, bbmap, dedupe, multiqc).
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
# Host (e.g. human) reference genome and the directory holding its pre-built
# BBMap index (created by rule index_human_ref); both come from the config.
BBMAP_REF = config['preprocess']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
# FastQC only performs a quality check but no QC processing is done.
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
# - remove adapters: here we provide the Nextera XT adapters,
# check this page from Illumina to know which adapters to remove:
# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
# - base correction in overlapped regions
# - trimming of the last base in read 1
# - discard reads shorter than a minimum length, after trimming
# It provides a report with quality check, before and after processing
# Step (a): adapter removal and basic QC with fastp.
# The input lambdas map the {fastq_file} wildcard back to its dataset
# sub-directory by scanning the parallel lists FASTQS/DATASETSX
# (defined in the main Snakefile — assumed index-aligned; TODO confirm).
rule trim_adapters:
    input:
        fwd=lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                               for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz',
        rev=lambda wildcards: ['{}/{}/{}-R2.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                               for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R2.fastq.gz'
    output:
        # Trimmed fastqs are temp(): removed once downstream rules consume them.
        fwd_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz'),
        rev_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'),
        # fastp's before/after QC reports; the .json is later fed to MultiQC.
        report1=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.html',
        report2=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.json'
    log:
        LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
    params:
        fastp_dir=PRE_PROC_DIR,
        adapter = config['preprocess']['adapter'],
        min_length = config['preprocess']['min_length']
    # A quarter of the cores, so several fastp jobs can run concurrently.
    threads: int(cpus_avail/4)
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running trim_adapters with {threads} cores."
        [ ! -d {params.fastp_dir} ] && mkdir {params.fastp_dir}
        fastp \
        --adapter_sequence {params.adapter} \
        --adapter_sequence_r2 {params.adapter} \
        --detect_adapter_for_pe \
        --correction \
        --overrepresentation_analysis \
        --length_required {params.min_length} \
        --trim_tail1 1 \
        --in1 {input.fwd} --in2 {input.rev} \
        --out1 {output.fwd_tr} --out2 {output.rev_tr} \
        --html {output.report1} --json {output.report2} \
        --thread {threads}
        '''
# After pre-processing with fastp, human reads have to be removed
#
# Step (b): decontamination — map the fastp-trimmed pairs against the masked
# host genome with BBMap and keep only the unmapped ("clean") reads
# (outu1/outu2 below receive reads that did NOT match the reference).
rule filter_human:
    input:
        # Directory with the pre-built BBMap index (rule index_human_ref).
        human_ref=BBMAP_REF_DIR,
        fwd_tr=rules.trim_adapters.output.fwd_tr,
        rev_tr=rules.trim_adapters.output.rev_tr
        #fwd_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz',
        #rev_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'
    output:
        fwd_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz'),
        rev_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz'),
        # Per-scaffold and per-reference mapping statistics from BBMap.
        bbmap_scafstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.scafstats',
        bbmap_refstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.stats'
    log:
        LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
    params:
        # usemodulo=t roughly halves index memory at a slight sensitivity cost
        # (see config); mem_gb caps the JVM heap via -Xmx.
        low_mem = config['preprocess']['bbmap_usemodulo'],
        mem_gb = config['preprocess']['bbmap_mem']
    threads:cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running filter_human with {threads} cores."
        [ ! -d {PRE_PROC_DIR} ] && mkdir {PRE_PROC_DIR}
        bbmap.sh \
        usemodulo={params.low_mem} \
        minid=0.95 \
        maxindel=3 \
        bwr=0.16 \
        bw=12 \
        minhits=2 \
        quickmatch \
        fast \
        qtrim=rl \
        trimq=10 \
        untrim \
        path={input.human_ref} \
        in1={input.fwd_tr} in2={input.rev_tr} \
        outu1={output.fwd_clean} outu2={output.rev_clean} \
        sortscafs=t \
        scafstats={output.bbmap_scafstats} \
        statsfile={output.bbmap_refstats} \
        unpigz=t pigz=t -Xmx{params.mem_gb}g \
        threads={threads}
        '''
# This is an intensive job
#
# One-off, resource-intensive job: build the BBMap index of the host genome.
# The resulting directory is consumed by rule filter_human via path=.
rule index_human_ref:
    input:
        reference = BBMAP_REF
    output:
        # directory() output: BBMap writes its index files under this path.
        directory(BBMAP_REF_DIR)
    log:
        LOGS_DIR + '/preprocess/ref_indexing.log'
    params:
        # Must match the usemodulo setting used at mapping time (filter_human).
        usemodulo=config['preprocess']['bbmap_usemodulo']
    threads: cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running index_human_ref with {threads} cores."
        # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from
        # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
        bbmap.sh \
        ref={input.reference} \
        usemodulo={params.usemodulo} \
        path={output} \
        threads={threads}
        '''
#
# Step (c): remove near-duplicate read pairs. dedupe.sh emits an interleaved
# stream on stdout which reformat.sh (int=t) splits back into R1/R2 files.
rule dedupe:
    input:
        fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
        rev_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz',
    output:
        fwd_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz'),
        rev_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'),
        # The discarded duplicate reads are kept for inspection (outd=).
        fastq_duplicates = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}.clean.duplicates.fastq.gz'
    log:
        LOGS_DIR + '/preprocess/{dataset}_cdedupe/{fastq_file}.log'
    threads:cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        dedupe.sh \
        in1={input.fwd_clean} in2={input.rev_clean} \
        out=stdout.fq \
        outd={output.fastq_duplicates} \
        ac=f minidentity=99 | \
        reformat.sh \
        int=t in=stdin.fq \
        out1={output.fwd_clean_dedup} \
        out2={output.rev_clean_dedup} \
        threads={threads}
        '''
# After quality filtering and removal of adapters, human reads, and duplicates,
# the reads' 3'end are quality trimmed
# Step (d), final: 3'-end quality trimming with fastp's sliding window
# (--cut_tail, window 4, mean quality 20). -Q disables fastp's own per-read
# quality filtering; reads shorter than min_length after trimming are dropped.
rule trim3end_dedupe:
    input:
        fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
        rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
    params:
        fastp_dir=PRE_PROC_DIR,
        min_length = config['preprocess']['min_length']
    output:
        # Final pre-processed fastqs (not temp(): these are the kept results).
        fwd_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
        rev_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
        # The .json report is aggregated by MultiQC downstream.
        report1=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
        report2=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
    threads: int(cpus_avail/4)
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        fastp \
        -Q \
        --length_required {params.min_length} \
        --cut_tail -W 4 -M 20 \
        --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
        --out1 {output.fwd_tr} --out2 {output.rev_tr} \
        --html {output.report1} --json {output.report2} \
        --thread {threads}
        '''
# Collect, per dataset, the fastp JSON reports from the first (atrimmed) and
# last (dfinaltrim) steps into a plain-text file list for MultiQC's
# --file-list option. The lambdas pair DATASETSX/FASTQS by index (these
# parallel lists come from the main Snakefile).
rule multiqc_preprocess_listing_files:
    input:
        # fastp reports from adapter trimming (step a)
        atrimmed=lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix])
                                    for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
        # fastp reports from final 3'-end trimming (step d)
        dfinaltrim=lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix])
                                      for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
    output:
        multiqc_input_list = PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
    run:
        import os
        # exist_ok=True only ignores "directory already exists"; the previous
        # `except OSError: pass` also silently swallowed real failures such as
        # permission errors, which then surfaced later as a confusing open() error.
        os.makedirs(os.path.dirname(output.multiqc_input_list), exist_ok=True)
        with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
            for item in input.atrimmed+input.dfinaltrim:
                out.write("%s\n" % item)
# Aggregate the per-dataset fastp reports (before/after pre-processing) into a
# single MultiQC HTML, reading the inputs from the file list built above.
rule multiqc_preprocess:
    input:
        PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
    output:
        # report() marks the file for inclusion in Snakemake's --report output.
        multiqc_report=report(PRE_PROC_REPORT+'/{dataset}_multiqc.html',
                              category='preprocess')
    singularity: singularity_img
    shell:
        '''
        multiqc -f --file-list {input} --filename {output.multiqc_report}
        '''
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment