Skip to content
Snippets Groups Projects
Commit a1c1f04b authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

add target rule "preprocess" for pre-processing of paired fastq files

parent 2ca82660
Branches
No related tags found
No related merge requests found
......@@ -10,7 +10,7 @@ import os
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
print('CPUs available: {}'.format(cpus_avail))
##----------------------------------------------------------------------------##
## Working directory
##----------------------------------------------------------------------------##
......@@ -62,9 +62,13 @@ RESULTS_DIR = config['PREFIX_DIR'] +'/results'
LOGS_DIR = config['PREFIX_DIR'] +'/logs'
REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
RAW_QC_DIR = RESULTS_DIR + '/rawQC'
RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
PRE_PROC_DIR = RESULTS_DIR + '/preprocessed'
PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed'
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
......@@ -91,4 +95,13 @@ rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
# Target rule: requesting the per-dataset MultiQC report pulls the whole
# pre-processing chain defined in rules/preprocess.smk.
rule preprocess:
    input:
        expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
#rule merge_samples:
# input:
# expand()
include: "rules/rawQC.smk"
include: "rules/preprocess.smk"
......@@ -13,4 +13,26 @@ SAMPLE_UNITS:
#-------------------------------------------------------------------------------
rawQC:
samplerate: 0.1
preprocess:
adapter: 'CTGTCTCTTATACACATCT'
min_length: 50
# Activate 'low-memory' mode for bbmap
# -------------------------------------
# value is either t (true) or f (false)
# BBMap normally uses roughly 6 bytes per reference base. With low-memory mode
# (triggered by the flag “usemodulo”), BBMap will use approximately 3 bytes
# per base, with a slight reduction in sensitivity
bbmap_usemodulo: t
bbmap_mem: 10
# PATH to reference (e.g. human) genome
# -----------------------------------
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
# How the human genome was masked is explained in this SEQanswers entry
# http://seqanswers.com/forums/archive/index.php/t-42552.html
bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
#bbmap_reference: 'external/reference_test_2.fa.gz'
bbmap_ref_dir: 'external/ref_indexed'
'''
Author: Monica R. Ticlla
Affiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for pre-processing of paired-end shotgun DNA metagenomic
sequencing.
'''
# Lightweight bookkeeping rules run on the submit host, not the cluster.
localrules:
    multiqc_preprocess_listing_files,
    multiqc_preprocess
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
# Singularity image providing the tools used below (fastp, bbmap, dedupe, multiqc).
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
# Host (e.g. human) reference genome and the directory holding its pre-built
# BBMap index (created by rule index_human_ref); both come from the config.
BBMAP_REF = config['preprocess']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
# FastQC only performs a quality check but no QC processing is done.
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
# - remove adapters: here we provide the Nextera XT adapters,
# check this page from Illumina to know which adapters to remove:
# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
# - base correction in overlapped regions
# - trimming of the last base in read 1
# - discard reads shorter than a minimum length, after trimming
# It provides a report with quality check, before and after processing
# Step (a): adapter removal and basic QC with fastp.
# The input lambdas map the {fastq_file} wildcard back to its dataset
# sub-directory by scanning the parallel lists FASTQS/DATASETSX
# (defined in the main Snakefile — assumed index-aligned; TODO confirm).
rule trim_adapters:
    input:
        fwd=lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                               for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz',
        rev=lambda wildcards: ['{}/{}/{}-R2.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                               for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R2.fastq.gz'
    output:
        # Trimmed fastqs are temp(): removed once downstream rules consume them.
        fwd_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz'),
        rev_tr=temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'),
        # fastp's before/after QC reports; the .json is later fed to MultiQC.
        report1=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.html',
        report2=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.json'
    log:
        LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
    params:
        fastp_dir=PRE_PROC_DIR,
        adapter = config['preprocess']['adapter'],
        min_length = config['preprocess']['min_length']
    # A quarter of the cores, so several fastp jobs can run concurrently.
    threads: int(cpus_avail/4)
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running trim_adapters with {threads} cores."
        [ ! -d {params.fastp_dir} ] && mkdir {params.fastp_dir}
        fastp \
        --adapter_sequence {params.adapter} \
        --adapter_sequence_r2 {params.adapter} \
        --detect_adapter_for_pe \
        --correction \
        --overrepresentation_analysis \
        --length_required {params.min_length} \
        --trim_tail1 1 \
        --in1 {input.fwd} --in2 {input.rev} \
        --out1 {output.fwd_tr} --out2 {output.rev_tr} \
        --html {output.report1} --json {output.report2} \
        --thread {threads}
        '''
# After pre-processing with fastp, human reads have to be removed
#
# Step (b): decontamination — map the fastp-trimmed pairs against the masked
# host genome with BBMap and keep only the unmapped ("clean") reads
# (outu1/outu2 below receive reads that did NOT match the reference).
rule filter_human:
    input:
        # Directory with the pre-built BBMap index (rule index_human_ref).
        human_ref=BBMAP_REF_DIR,
        fwd_tr=rules.trim_adapters.output.fwd_tr,
        rev_tr=rules.trim_adapters.output.rev_tr
        #fwd_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz',
        #rev_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'
    output:
        fwd_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz'),
        rev_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz'),
        # Per-scaffold and per-reference mapping statistics from BBMap.
        bbmap_scafstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.scafstats',
        bbmap_refstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.stats'
    log:
        LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
    params:
        # usemodulo=t roughly halves index memory at a slight sensitivity cost
        # (see config); mem_gb caps the JVM heap via -Xmx.
        low_mem = config['preprocess']['bbmap_usemodulo'],
        mem_gb = config['preprocess']['bbmap_mem']
    threads:cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running filter_human with {threads} cores."
        [ ! -d {PRE_PROC_DIR} ] && mkdir {PRE_PROC_DIR}
        bbmap.sh \
        usemodulo={params.low_mem} \
        minid=0.95 \
        maxindel=3 \
        bwr=0.16 \
        bw=12 \
        minhits=2 \
        quickmatch \
        fast \
        qtrim=rl \
        trimq=10 \
        untrim \
        path={input.human_ref} \
        in1={input.fwd_tr} in2={input.rev_tr} \
        outu1={output.fwd_clean} outu2={output.rev_clean} \
        sortscafs=t \
        scafstats={output.bbmap_scafstats} \
        statsfile={output.bbmap_refstats} \
        unpigz=t pigz=t -Xmx{params.mem_gb}g \
        threads={threads}
        '''
# This is an intensive job
#
# One-off, resource-intensive job: build the BBMap index of the host genome.
# The resulting directory is consumed by rule filter_human via path=.
rule index_human_ref:
    input:
        reference = BBMAP_REF
    output:
        # directory() output: BBMap writes its index files under this path.
        directory(BBMAP_REF_DIR)
    log:
        LOGS_DIR + '/preprocess/ref_indexing.log'
    params:
        # Must match the usemodulo setting used at mapping time (filter_human).
        usemodulo=config['preprocess']['bbmap_usemodulo']
    threads: cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        echo "Running index_human_ref with {threads} cores."
        # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from
        # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
        bbmap.sh \
        ref={input.reference} \
        usemodulo={params.usemodulo} \
        path={output} \
        threads={threads}
        '''
#
# Step (c): remove near-duplicate read pairs. dedupe.sh emits an interleaved
# stream on stdout which reformat.sh (int=t) splits back into R1/R2 files.
rule dedupe:
    input:
        fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
        rev_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz',
    output:
        fwd_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz'),
        rev_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'),
        # The discarded duplicate reads are kept for inspection (outd=).
        fastq_duplicates = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}.clean.duplicates.fastq.gz'
    log:
        LOGS_DIR + '/preprocess/{dataset}_cdedupe/{fastq_file}.log'
    threads:cpus_avail
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        dedupe.sh \
        in1={input.fwd_clean} in2={input.rev_clean} \
        out=stdout.fq \
        outd={output.fastq_duplicates} \
        ac=f minidentity=99 | \
        reformat.sh \
        int=t in=stdin.fq \
        out1={output.fwd_clean_dedup} \
        out2={output.rev_clean_dedup} \
        threads={threads}
        '''
# After quality filtering and removal of adapters, human reads, and duplicates,
# the reads' 3'end are quality trimmed
# Step (d), final: 3'-end quality trimming with fastp's sliding window
# (--cut_tail, window 4, mean quality 20). -Q disables fastp's own per-read
# quality filtering; reads shorter than min_length after trimming are dropped.
rule trim3end_dedupe:
    input:
        fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
        rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
    params:
        fastp_dir=PRE_PROC_DIR,
        min_length = config['preprocess']['min_length']
    output:
        # Final pre-processed fastqs (not temp(): these are the kept results).
        fwd_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
        rev_tr=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
        # The .json report is aggregated by MultiQC downstream.
        report1=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
        report2=PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
    threads: int(cpus_avail/4)
    singularity: singularity_img
    group: 'preprocess'
    shell:
        '''
        fastp \
        -Q \
        --length_required {params.min_length} \
        --cut_tail -W 4 -M 20 \
        --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
        --out1 {output.fwd_tr} --out2 {output.rev_tr} \
        --html {output.report1} --json {output.report2} \
        --thread {threads}
        '''
# Collect, per dataset, the fastp JSON reports from the first (atrimmed) and
# last (dfinaltrim) steps into a plain-text file list for MultiQC's
# --file-list option. The lambdas pair DATASETSX/FASTQS by index (these
# parallel lists come from the main Snakefile).
rule multiqc_preprocess_listing_files:
    input:
        # fastp reports from adapter trimming (step a)
        atrimmed=lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix])
                                    for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
        # fastp reports from final 3'-end trimming (step d)
        dfinaltrim=lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(PRE_PROC_DIR, value, FASTQS[ix])
                                      for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
    output:
        multiqc_input_list = PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
    run:
        import os
        # exist_ok=True only ignores "directory already exists"; the previous
        # `except OSError: pass` also silently swallowed real failures such as
        # permission errors, which then surfaced later as a confusing open() error.
        os.makedirs(os.path.dirname(output.multiqc_input_list), exist_ok=True)
        with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
            for item in input.atrimmed+input.dfinaltrim:
                out.write("%s\n" % item)
# Aggregate the per-dataset fastp reports (before/after pre-processing) into a
# single MultiQC HTML, reading the inputs from the file list built above.
rule multiqc_preprocess:
    input:
        PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
    output:
        # report() marks the file for inclusion in Snakemake's --report output.
        multiqc_report=report(PRE_PROC_REPORT+'/{dataset}_multiqc.html',
                              category='preprocess')
    singularity: singularity_img
    shell:
        '''
        multiqc -f --file-list {input} --filename {output.multiqc_report}
        '''
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment