diff --git a/config.yaml b/config.yaml
index c10ada30c483be85535a0bd7a7833d9cbe2c051e..7f7cfc2cc7fc62cb03bc8610da3152b92e004356 100644
--- a/config.yaml
+++ b/config.yaml
@@ -15,9 +15,11 @@ rawQC:
     samplerate: 0.1
 preprocess:
+  trim_adapters:
     adapter: 'CTGTCTCTTATACACATCT'
     min_length: 50
-
+    trim_tails: 1
+  filter_human:
     # Activate 'low-memory' mode for bbmap
     # -------------------------------------
     # value is either t (true) or f (false)
diff --git a/report/MetagenomicSnake.rst b/report/MetagenomicSnake.rst
index 514a64112b95e77d210782054b7d7f88b75d616c..07a341e0fe856c5899b983bbe6d7737fa8e0143d 100644
--- a/report/MetagenomicSnake.rst
+++ b/report/MetagenomicSnake.rst
@@ -9,4 +9,24 @@ datasets from human microbiomes.
 
 Modules
 -------
- rawQC
+**rawQC**
+    runs FastQC on a random sample of R1 reads from the paired fastq-format files.
+
+**preprocess**
+    FastQC only performs a quality check; no QC processing is done. The **preprocess**
+    rule runs a multi-step pre-processing of the paired fastq files, which includes:
+
+    - adapter trimming with "fastp": a quality check is performed and both paired
+      fastq files are processed as follows:
+
+      + remove adapters: here we provide the Nextera XT adapters
+      + base correction in overlapped regions
+      + trimming of the last base(s) of both reads
+      + discard reads shorter than a minimum length, after trimming
+      + a report with the quality check, before and after processing
+    - removal of reads derived from human DNA
+    - removal of duplicated reads
+
+**merge_samples**
+    merges the fastq files corresponding to the same sample into a single pair
+    of fastq files, after pre-processing of the paired fastq files.
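The config change above nests the former flat `preprocess` keys into two sub-sections, `trim_adapters` and `filter_human`, so every consumer of these keys must now use the nested paths (see the rules/preprocess.smk changes below). A minimal sketch of the new access pattern, assuming PyYAML is available; the bbmap values are not shown in the diff and are only read here, not asserted:

    import yaml

    # Load the workflow configuration the same way Snakemake's configfile does
    with open('config.yaml') as fh:
        config = yaml.safe_load(fh)

    # trim_adapters settings moved under their own sub-section:
    adapter    = config['preprocess']['trim_adapters']['adapter']     # 'CTGTCTCTTATACACATCT'
    min_length = config['preprocess']['trim_adapters']['min_length']  # 50
    trim_tails = config['preprocess']['trim_adapters']['trim_tails']  # 1

    # bbmap settings for human-read removal now live under 'filter_human':
    low_mem = config['preprocess']['filter_human']['bbmap_usemodulo']
    ref     = config['preprocess']['filter_human']['bbmap_reference']

Grouping the keys by the rule that consumes them keeps config.yaml aligned with the rule names used in the workflow.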
diff --git a/rules/merge_samples.smk b/rules/merge_samples.smk
new file mode 100644
index 0000000000000000000000000000000000000000..d138546f3040a9e625eafc26d86cec7caf82b669
--- /dev/null
+++ b/rules/merge_samples.smk
@@ -0,0 +1,93 @@
+'''
+Author: Monica R. Ticlla
+Affiliation(s): SIB, SwissTPH, UNIBAS
+Description: after pre-processing of the paired fastq files from paired-end
+shotgun DNA sequencing of metagenomic samples, the fastq files corresponding
+to the same sample are merged into a single pair of fastq files.
+'''
+localrules:
+    multiqc_merged_list_files,
+    multiqc_merged
+
+##----------------------------------------------------------------------------##
+## Local variables
+##----------------------------------------------------------------------------##
+singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
+
+##----------------------------------------------------------------------------##
+## Rules with target files
+##----------------------------------------------------------------------------##
+
+###** THESE TARGET FILES ARE THE FINAL, CLEAN FASTQ FILES **###
+# Concatenate the cleaned, deduplicated and trimmed fastqs belonging to the
+# same sample
+rule concatenate_fastqs:
+    input:
+        # forward (R1) fastqs of all sequencing units of this sample
+        sample_fwds = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
+            PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
+            for ix, value in enumerate(SAMPLES)
+            if (value == wildcards.sample) and (DATASETS[ix] == wildcards.dataset)],
+        # reverse (R2) fastqs of all sequencing units of this sample
+        sample_revs = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R2.clean.nodup.fastp.fastq.gz'.format(
+            PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
+            for ix, value in enumerate(SAMPLES)
+            if (value == wildcards.sample) and (DATASETS[ix] == wildcards.dataset)]
+    output:
+        sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
+        sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
+    wildcard_constraints:
+        sample = '\w+'
+    group: 'preprocess'
+    shell:
+        '''
+        cat {input.sample_fwds} > {output.sample_fwd}
+        cat {input.sample_revs} > {output.sample_rev}
+        '''
+
+# Final quality check with fastp; no further QC processing is done (-A disables
+# adapter trimming, and no processed fastqs are written)
+rule fastp_concatenated:
+    input:
+        sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
+        sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
+    output:
+        report1 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.html',
+        report2 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.json'
+    wildcard_constraints:
+        sample = '\w+'
+    threads: cpus_avail
+    group: 'preprocess'
+    singularity: singularity_img
+    shell:
+        '''
+        fastp \
+            -A \
+            --in1 {input.sample_fwd} --in2 {input.sample_rev} \
+            --html {output.report1} --json {output.report2} \
+            --thread {threads}
+        '''
+
+rule multiqc_merged_list_files:
+    input:
+        sample_fastp_report = lambda wildcards: ['{}/{}_merged/{}.fastp.json'.format(
+            MERGE_DIR, wildcards.dataset, value) for ix, value in enumerate(SAMPLES)
+            if DATASETS[ix] == wildcards.dataset]
+    output:
+        multiqc_input_list = MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
+    run:
+        import os
+        # create the report directory if it does not exist yet
+        try:
+            os.makedirs(os.path.dirname(output.multiqc_input_list))
+        except OSError:
+            pass
+
+        # write each fastp report path once (set() removes repeated units)
+        with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
+            for item in set(input.sample_fastp_report):
+                out.write("%s\n" % item)
+
+rule multiqc_merged:
+    input:
+        MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
+    output:
+        multiqc_report = report(MERGE_REPORT + '/{dataset}_multiqc.html',
+                                category='merge_samples')
+    singularity: singularity_img
+    shell:
+        '''
+        multiqc -f --file-list {input} --filename {output.multiqc_report}
+        '''
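The input lambdas in concatenate_fastqs above select, from the workflow's parallel per-unit lists (DATASETS, SAMPLES, RUNS, LANES), the pre-processed fastqs of every sequencing unit belonging to one sample of one dataset. A standalone sketch of that selection logic, with made-up list contents:

    # Sketch of the sample_fwds input lambda; all values below are illustrative
    PRE_PROC_DIR = 'preprocessed'
    DATASETS = ['dsA', 'dsA', 'dsB']
    SAMPLES  = ['s1',  's1',  's2']
    RUNS     = ['run1', 'run2', 'run1']
    LANES    = ['L001', 'L001', 'L001']

    def sample_fwds(dataset, sample):
        # one path per sequencing unit (run/lane) of the requested sample
        return ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
                    PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
                for ix, value in enumerate(SAMPLES)
                if value == sample and DATASETS[ix] == dataset]

    print(sample_fwds('dsA', 's1'))
    # ['preprocessed/dsA_dfinaltrim/s1-run1_L001-R1.clean.nodup.fastp.fastq.gz',
    #  'preprocessed/dsA_dfinaltrim/s1-run2_L001-R1.clean.nodup.fastp.fastq.gz']

Plain `cat` is sufficient for the merge because a concatenation of gzip streams is itself a valid gzip stream, so the merged file remains a readable fastq.gz.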
diff --git a/rules/preprocess.smk b/rules/preprocess.smk
index 0124820009e0688eb82e70a22451985173074410..4efe2c384db30f33b3d9a47349e31766660066cc 100644
--- a/rules/preprocess.smk
+++ b/rules/preprocess.smk
@@ -5,14 +5,14 @@ Description: rules for pre-processing of paired-end shotgun
 DNA metagenomic sequencing.
 '''
 localrules:
-    multiqc_preprocess_listing_files,
+    multiqc_preprocess_list_files,
     multiqc_preprocess
 
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
 singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
 
-BBMAP_REF = config['preprocess']['bbmap_reference']
-BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
+BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
+BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
 
 ##----------------------------------------------------------------------------##
 ## Rules with target files
@@ -21,12 +21,17 @@ BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
 # FastQC only performs a quality check but no QC processing is done.
 # With Fastp a quality check is performed and both paired fastq files
 # are processed as follows:
-# - remove adapters: here we provide the Nextera XT adapters,
-#   check this page from Illumina to know which adapters to remove:
+# - fastp's default quality filtering
+# - removal of adapters: enabled by default, fastp detects adapters by per-read overlap
+#   analysis (it looks for the overlap of each read pair). In case fastp fails to find
+#   an overlap, the adapter sequences provided here are used (e.g. the Nextera XT adapters).
+#   Check this page from Illumina to know which adapters to remove:
 #   https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
 # - base correction in overlapped regions
-# - trimming of the last base in read 1
+# - trimming of the last base(s) of every read (this step precedes adapter removal)
 # - discard reads shorter than a minimum length, after trimming
+# WARNING: cutting by quality score is not done at this stage because it interferes with
+# the deduplication step (reads trimmed differently would no longer be exact duplicates)
 # It provides a report with quality check, before and after processing
 rule trim_adapters:
     input:
@@ -45,8 +50,9 @@ rule trim_adapters:
         LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
     params:
         fastp_dir = PRE_PROC_DIR,
-        adapter = config['preprocess']['adapter'],
-        min_length = config['preprocess']['min_length']
+        adapter = config['preprocess']['trim_adapters']['adapter'],
+        min_length = config['preprocess']['trim_adapters']['min_length'],
+        trim_tails = config['preprocess']['trim_adapters']['trim_tails']
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
@@ -54,18 +60,18 @@ rule trim_adapters:
         '''
         echo "Running trim_adapters with {threads} cores."
         [ ! -d {params.fastp_dir} ] && mkdir {params.fastp_dir}
-        fastp \
+        (fastp \
             --adapter_sequence {params.adapter} \
             --adapter_sequence_r2 {params.adapter} \
             --detect_adapter_for_pe \
             --correction \
-            --overrepresentation_analysis \
             --length_required {params.min_length} \
-            --trim_tail1 1 \
+            --trim_tail1 {params.trim_tails} \
+            --trim_tail2 {params.trim_tails} \
             --in1 {input.fwd} --in2 {input.rev} \
             --out1 {output.fwd_tr} --out2 {output.rev_tr} \
             --html {output.report1} --json {output.report2} \
-            --thread {threads}
+            --thread {threads}) &>{log}
         '''
 # After pre-processing with fastp, human reads have to be removed
 #
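A toy illustration of the WARNING added in the comment block above: the dedupe rule (further down) recognizes duplicates by sequence identity, so cutting low-quality 3' bases before deduplication can make otherwise identical reads differ and escape removal. Nothing here is workflow code:

    # Toy example (not part of the workflow)
    r1 = 'ACGTACGTACGT'      # a read
    r2 = 'ACGTACGTACGT'      # an exact duplicate of r1
    r2_trimmed = r2[:-2]     # r2 after trimming two low-quality 3' bases

    print(r1 == r2)          # True  -> recognized as duplicates
    print(r1 == r2_trimmed)  # False -> the duplicate would be missed

This is why quality-score cutting is deferred to trim3end_dedupe, which runs after the dedupe rule.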
@@ -84,8 +90,8 @@ rule filter_human:
     log:
         LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
     params:
-        low_mem = config['preprocess']['bbmap_usemodulo'],
-        mem_gb = config['preprocess']['bbmap_mem']
+        low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
+        mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads:cpus_avail
     singularity: singularity_img
     group: 'preprocess'
@@ -93,7 +99,7 @@ rule filter_human:
         '''
         echo "Running filter_human with {threads} cores."
         [ ! -d {PRE_PROC_DIR} ] && mkdir {PRE_PROC_DIR}
-        bbmap.sh \
+        (bbmap.sh \
             usemodulo={params.low_mem} \
             minid=0.95 \
             maxindel=3 \
@@ -112,7 +118,7 @@
             scafstats={output.bbmap_scafstats} \
             statsfile={output.bbmap_refstats} \
             unpigz=t pigz=t -Xmx{params.mem_gb}g \
-            threads={threads}
+            threads={threads}) &>{log}
         '''
 
 # This is an intensive job
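The recurring shell change in this file wraps each command in a subshell and redirects its combined stdout/stderr to the rule's log, i.e. `(cmd ...) &>{log}`; the subshell makes the redirection cover the whole command group, including piped pipelines such as the dedupe.sh call below. A sketch of what the wrapped filter_human call might expand to once Snakemake substitutes the placeholders (memory, thread count, and log path are made up; the bbmap options are the ones visible in the hunks above):

    # Hypothetical rendering of the wrapped filter_human command
    params = {'low_mem': 't', 'mem_gb': 23}
    log = 'logs/preprocess/dsA_bfiltered/unit1.log'

    cmd = ('(bbmap.sh usemodulo={low_mem} minid=0.95 maxindel=3 '
           '-Xmx{mem_gb}g threads=8) &>{log}').format(log=log, **params)
    print(cmd)
    # (bbmap.sh usemodulo=t minid=0.95 maxindel=3 -Xmx23g threads=8) &>logs/preprocess/dsA_bfiltered/unit1.log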
@@ -125,7 +131,7 @@ rule index_human_ref:
     log:
         LOGS_DIR + '/preprocess/ref_indexing.log'
     params:
-        usemodulo=config['preprocess']['bbmap_usemodulo']
+        usemodulo=config['preprocess']['filter_human']['bbmap_usemodulo']
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
@@ -134,13 +140,13 @@
         echo "Running index_human_ref with {threads} cores."
         # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from
         # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
-        bbmap.sh \
+        (bbmap.sh \
             ref={input.reference} \
             usemodulo={params.usemodulo} \
             path={output} \
-            threads={threads}
+            threads={threads}) &>{log}
         '''
-#
+# Deduplication step with BBTools' dedupe.sh
 rule dedupe:
     input:
         fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
         rev_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz'
@@ -156,7 +162,7 @@
     group: 'preprocess'
     shell:
         '''
-        dedupe.sh \
+        (dedupe.sh \
             in1={input.fwd_clean} in2={input.rev_clean} \
             out=stdout.fq \
             outd={output.fastq_duplicates} \
@@ -165,43 +171,45 @@
             int=t in=stdin.fq \
             out1={output.fwd_clean_dedup} \
             out2={output.rev_clean_dedup} \
-            threads={threads}
+            threads={threads}) &>{log}
         '''
-# After quality filtering and removal of adapters, human reads, and duplicates,
-# the reads' 3'end are quality trimmed
+# After removal of adapters, human reads, and duplicates, the reads' 3' ends are
+# quality-trimmed with fastp. Note that quality filtering is disabled (-Q) because
+# it was already applied by rule trim_adapters
 rule trim3end_dedupe:
     input:
         fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
         rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
     params:
         fastp_dir = PRE_PROC_DIR,
-        min_length = config['preprocess']['min_length']
+        min_length = config['preprocess']['trim_adapters']['min_length']
     output:
         fwd_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
         rev_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
         report1 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
         report2 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
-    threads: int(cpus_avail/4)
+    threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
     shell:
         '''
-        fastp \
+        (fastp \
             -Q \
+            --overrepresentation_analysis \
             --length_required {params.min_length} \
             --cut_tail -W 4 -M 20 \
             --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
             --out1 {output.fwd_tr} --out2 {output.rev_tr} \
             --html {output.report1} --json {output.report2} \
-            --thread {threads}
+            --thread {threads}) &>{log}
         '''
 
-rule multiqc_preprocess_listing_files:
+rule multiqc_preprocess_list_files:
     input:
         #
-        atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
-            PRE_PROC_DIR, value, FASTQS[ix])
-            for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
+        #atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
+        #    PRE_PROC_DIR, value, FASTQS[ix])
+        #    for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
         #
         dfinaltrim = lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(
             PRE_PROC_DIR, value, FASTQS[ix])
@@ -216,7 +224,7 @@ rule multiqc_preprocess_listing_files:
             pass
 
         with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
-            for item in input.atrimmed+input.dfinaltrim:
+            for item in input.dfinaltrim:
                 out.write("%s\n" % item)
 
 rule multiqc_preprocess:
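Both multiqc_*_list_files rules share the same run-block pattern: create the report directory, then write a de-duplicated list of fastp JSON reports for `multiqc --file-list`. A compact variant of that block as a plain function (the function name and example paths are hypothetical); `os.makedirs(..., exist_ok=True)` replaces the try/except, and sorting the set keeps the list order deterministic across runs:

    import os

    def write_file_list(paths, list_path):
        # create the report directory if needed (replaces the try/except OSError)
        os.makedirs(os.path.dirname(list_path), exist_ok=True)
        with open(list_path, mode='w', encoding='utf-8') as out:
            # set() de-duplicates; sorted() makes the output order stable
            for item in sorted(set(paths)):
                out.write('%s\n' % item)

    write_file_list(['a.fastp.json', 'a.fastp.json', 'b.fastp.json'],
                    'reports/dsA_multiqc_inputs.txt')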