diff --git a/Snakefile b/Snakefile index 3d3055ae4ecc16dc7766029b89c56be4e06fed40..903c813a6b3045ba7de6b2bcbab55cd4b2985734 100644 --- a/Snakefile +++ b/Snakefile @@ -10,7 +10,7 @@ import os ##----------------------------------------------------------------------------## from multiprocessing import cpu_count cpus_avail = cpu_count() - +print('CPUs available: {}'.format(cpus_avail)) ##----------------------------------------------------------------------------## ## Working directory ##----------------------------------------------------------------------------## @@ -62,9 +62,13 @@ RESULTS_DIR = config['PREFIX_DIR'] +'/results' LOGS_DIR = config['PREFIX_DIR'] +'/logs' REPORTS_DIR = config['PREFIX_DIR'] +'/reports' + RAW_QC_DIR = RESULTS_DIR + '/rawQC' RAW_QC_REPORT = REPORTS_DIR + '/rawQC' +PRE_PROC_DIR = RESULTS_DIR + '/preprocessed' +PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed' + ##----------------------------------------------------------------------------## ## Fastq files to be processed ##----------------------------------------------------------------------------## @@ -91,4 +95,13 @@ rule rawQC: input: expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS)) +rule preprocess: + input: + expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS)) + +#rule merge_samples: +# input: +# expand() + include: "rules/rawQC.smk" +include: "rules/preprocess.smk" diff --git a/config.yaml b/config.yaml index c618aa20b39e403178384f91e7ac6307a9fe2586..c10ada30c483be85535a0bd7a7833d9cbe2c051e 100644 --- a/config.yaml +++ b/config.yaml @@ -13,4 +13,26 @@ SAMPLE_UNITS: #------------------------------------------------------------------------------- rawQC: samplerate: 0.1 + preprocess: + adapter: 'CTGTCTCTTATACACATCT' + min_length: 50 + + # Activate 'low-memory' mode for bbmap + # ------------------------------------- + # value is either t (true) or f (false) + # BBMap normally uses roughly 6 bytes per reference base. 
With low-memory mode + (triggered by the flag "usemodulo"), BBMap will use approximately 3 bytes + per base, with a slight reduction in sensitivity + bbmap_usemodulo: t + bbmap_mem: 10 + + # PATH to reference (e.g. human) genome + # ----------------------------------- + # Masked hg19 kindly provided by Brian Brushnell and manually downloaded from + # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk + # How the human genome was masked is explained in this SEQanswers entry + # http://seqanswers.com/forums/archive/index.php/t-42552.html + bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz' + #bbmap_reference: 'external/reference_test_2.fa.gz' + bbmap_ref_dir: 'external/ref_indexed' diff --git a/rules/preprocess.smk b/rules/preprocess.smk new file mode 100644 index 0000000000000000000000000000000000000000..4fd7c7dec23dd4c2f17c30fa4ecda7ef25aaad3b --- /dev/null +++ b/rules/preprocess.smk @@ -0,0 +1,230 @@ +''' +Author: Monica R. Ticlla +Affiliation(s): SIB, SwissTPH, UNIBAS +Description: rules for pre-processing of paired-end shotgun DNA metagenomic +sequencing. +''' +localrules: + multiqc_preprocess_listing_files, + multiqc_preprocess +##----------------------------------------------------------------------------## +## Local variables +##----------------------------------------------------------------------------## +singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1' +BBMAP_REF = config['preprocess']['bbmap_reference'] +BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir'] + +##----------------------------------------------------------------------------## +## Rules with target files +##----------------------------------------------------------------------------## + +# FastQC only performs a quality check but no QC processing is done. 
+# With Fastp a quality check is performed and both paired fastq files +# are processed as follows: +# - remove adapters: here we provide the Nextera XT adapters, +# check this page from Illumina to know which adapters to remove: +# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html +# - base correction in overlapped regions +# - trimming of the last base in read 1 +# - discard reads shorter than a minimum length, after trimming +# It provides a report with quality check, before and after processing +rule trim_adapters: + input: + fwd=lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value) + for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file], + #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz', + rev=lambda wildcards: ['{}/{}/{}-R2.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value) + for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file], + #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R2.fastq.gz' + output: + fwd_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz'), + rev_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'), + report1=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.html', + report2=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.json' + log: + LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log' + params: + fastp_dir=PRE_PROC_DIR, + adapter = config['preprocess']['adapter'], + min_length = config['preprocess']['min_length'] + threads: int(cpus_avail/4) + singularity: singularity_img + group: 'preprocess' + shell: + ''' + echo "Running trim_adapters with {threads} cores." + [ ! 
-d {params.fastp_dir} ] && mkdir {params.fastp_dir} + fastp \ + --adapter_sequence {params.adapter} \ + --adapter_sequence_r2 {params.adapter} \ + --detect_adapter_for_pe \ + --correction \ + --overrepresentation_analysis \ + --length_required {params.min_length} \ + --trim_tail1 1 \ + --in1 {input.fwd} --in2 {input.rev} \ + --out1 {output.fwd_tr} --out2 {output.rev_tr} \ + --html {output.report1} --json {output.report2} \ + --thread {threads} + ''' +# After pre-processing with fastp, human reads have to be removed +# +rule filter_human: + input: + human_ref=BBMAP_REF_DIR, + fwd_tr=rules.trim_adapters.output.fwd_tr, + rev_tr=rules.trim_adapters.output.rev_tr + #fwd_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz', + #rev_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz' + output: + fwd_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz'), + rev_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz'), + bbmap_scafstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.scafstats', + bbmap_refstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.stats' + log: + LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log' + params: + low_mem = config['preprocess']['bbmap_usemodulo'], + mem_gb = config['preprocess']['bbmap_mem'] + threads:cpus_avail + singularity: singularity_img + group: 'preprocess' + shell: + ''' + echo "Running filter_human with {threads} cores." + [ ! 
-d {PRE_PROC_DIR} ] && mkdir {PRE_PROC_DIR} + bbmap.sh \ + usemodulo={params.low_mem} \ + minid=0.95 \ + maxindel=3 \ + bwr=0.16 \ + bw=12 \ + minhits=2 \ + quickmatch \ + fast \ + qtrim=rl \ + trimq=10 \ + untrim \ + path={input.human_ref} \ + in1={input.fwd_tr} in2={input.rev_tr} \ + outu1={output.fwd_clean} outu2={output.rev_clean} \ + sortscafs=t \ + scafstats={output.bbmap_scafstats} \ + statsfile={output.bbmap_refstats} \ + unpigz=t pigz=t -Xmx{params.mem_gb}g \ + threads={threads} + ''' + +# This is an intensive job +# +rule index_human_ref: + input: + reference = BBMAP_REF + output: + directory(BBMAP_REF_DIR) + log: + LOGS_DIR + '/preprocess/ref_indexing.log' + params: + usemodulo=config['preprocess']['bbmap_usemodulo'] + threads: cpus_avail + singularity: singularity_img + group: 'preprocess' + shell: + ''' + echo "Running index_human_ref with {threads} cores." + # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from + # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk + bbmap.sh \ + ref={input.reference} \ + usemodulo={params.usemodulo} \ + path={output} \ + threads={threads} + ''' +# +rule dedupe: + input: + fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz', + rev_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz', + output: + fwd_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz'), + rev_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'), + fastq_duplicates = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}.clean.duplicates.fastq.gz' + log: + LOGS_DIR + '/preprocess/{dataset}_cdedupe/{fastq_file}.log' + threads:cpus_avail + singularity: singularity_img + group: 'preprocess' + shell: + ''' + dedupe.sh \ + in1={input.fwd_clean} in2={input.rev_clean} \ + out=stdout.fq \ + outd={output.fastq_duplicates} \ + ac=f minidentity=99 | \ + reformat.sh \ + int=t in=stdin.fq \ + 
out1={output.fwd_clean_dedup} \ + out2={output.rev_clean_dedup} \ + threads={threads} + ''' +# After quality filtering and removal of adapters, human reads, and duplicates, +# the reads' 3'end are quality trimmed +rule trim3end_dedupe: + input: + fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz', + rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz' + params: + fastp_dir=PRE_PROC_DIR, + min_length = config['preprocess']['min_length'] + output: + fwd_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz', + rev_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz', + report1=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html', + report2=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json' + threads: int(cpus_avail/4) + singularity: singularity_img + group: 'preprocess' + shell: + ''' + fastp \ + -Q \ + --length_required {params.min_length} \ + --cut_tail -W 4 -M 20 \ + --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \ + --out1 {output.fwd_tr} --out2 {output.rev_tr} \ + --html {output.report1} --json {output.report2} \ + --thread {threads} + ''' + +rule multiqc_preprocess_listing_files: + input: + # + atrimmed=lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix]) + for ix,value in enumerate(DATASETSX) if value==wildcards.dataset], + # + dfinaltrim=lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix]) + for ix,value in enumerate(DATASETSX) if value==wildcards.dataset] + output: + multiqc_input_list = PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt' + run: + import os + try: + os.makedirs(os.path.dirname(output.multiqc_input_list)) + except OSError: + pass + + with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out: + for item in input.atrimmed+input.dfinaltrim: + out.write("%s\n" % 
item) + +rule multiqc_preprocess: + input: + PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt' + output: + multiqc_report=report(PRE_PROC_REPORT+'/{dataset}_multiqc.html', + category='preprocess') + singularity: singularity_img + shell: + ''' + multiqc -f --file-list {input} --filename {output.multiqc_report} + '''