Commit 5c7fb77c authored by Ticlla Ccenhua Monica Roxana
add rules for pre-processing of fastq files and to merge fastqs from the same sample

parent e17c37c4
......@@ -15,9 +15,11 @@ rawQC:
samplerate: 0.1
preprocess:
trim_adapters:
adapter: 'CTGTCTCTTATACACATCT'
min_length: 50
trim_tails: 1
filter_human:
# Activate 'low-memory' mode for bbmap
# -------------------------------------
# value is either t (true) or f (false)
......
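For orientation, the sketch below shows how these re-nested keys resolve once the config is loaded; the rules further down read them via config['preprocess']['trim_adapters'][...] and config['preprocess']['filter_human'][...]. It assumes a YAML file with exactly the nesting shown above; the file name config.yaml and the explicit loading step are illustrative, since Snakemake normally loads the config itself.

# Minimal sketch of how the re-nested config keys resolve once the file is loaded.
# Assumes a YAML file with exactly the nesting shown above; the file name
# "config.yaml" and the explicit yaml.safe_load() call are illustrative only
# (in the workflow, Snakemake populates `config` itself).
import yaml

with open("config.yaml", encoding="utf-8") as handle:
    config = yaml.safe_load(handle)

adapter    = config["preprocess"]["trim_adapters"]["adapter"]         # 'CTGTCTCTTATACACATCT'
min_length = config["preprocess"]["trim_adapters"]["min_length"]      # 50
trim_tails = config["preprocess"]["trim_adapters"]["trim_tails"]      # 1
low_mem    = config["preprocess"]["filter_human"]["bbmap_usemodulo"]  # 't' or 'f'

print(adapter, min_length, trim_tails, low_mem)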
......@@ -9,4 +9,24 @@ datasets from human microbiomes.
Modules
-------
**rawQC**
runs FastQC on a random sample of R1 reads from the paired fastq-format files.
**preprocess**
FastQC only performs a quality check; no QC processing is done. The **preprocess**
rule runs a multi-step pre-processing of the paired fastq files, which includes:
- adapter trimming with fastp: a quality check is performed and both paired fastq files are processed as follows:
  + remove adapters: here we provide the Nextera XT adapters
  + base correction in overlapped regions
  + trimming of the last base in read 1
  + discarding of reads shorter than a minimum length, after trimming
  + a report with quality checks, before and after processing
- removal of reads derived from human DNA
- removal of duplicated reads
**merge_samples**
merges the pre-processed fastq files corresponding to the same sample into a single pair of fastq files.
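The rules in this commit write each pre-processing stage to its own directory, with an alphabetical prefix encoding the order of the steps. The sketch below summarises the R1 output locations as they can be read off the rules further down; the fastq suffix written by the adapter-trimming stage (_atrimmed) is not visible in this diff and is therefore left out, and the dict itself is purely illustrative.

# Illustrative summary (not part of the workflow): per-stage output locations for the
# forward (R1) reads, as read off the rules in this commit. {dataset}, {fastq_file}
# and {sample} are the Snakemake wildcards used below.
PREPROCESS_STAGES = {
    "a: trim_adapters":   "{dataset}_atrimmed/  (fastq suffix not shown in this diff)",
    "b: filter_human":    "{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz",
    "c: dedupe":          "{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz",
    "d: trim3end_dedupe": "{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz",
    "merge_samples":      "{dataset}_merged/{sample}-R1.fastq.gz",
}
for stage, location in PREPROCESS_STAGES.items():
    print(stage, "->", location)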
'''
Author: Monica R. Ticlla
Affiliation(s): SIB, SwissTPH, UNIBAS
Description: after pre-processing of paired fastq files from paired-end shotgun
DNA sequencing of metagenomic samples, fastq files corresponding to the same
sample are merged into a single pair of fastq files.
'''
localrules:
multiqc_merged_list_files,
multiqc_merged
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
###**THESE TARGET FILES ARE THE FINAL, CLEAN FASTQS**###
# concatenate cleaned, deduplicated and trimmed fastqs from the same sample
rule concatenate_fastqs:
input:
# R1 (forward) files of every pre-processed unit belonging to this sample and dataset
sample_fwds = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
for ix,value in enumerate(SAMPLES) if (value == wildcards.sample) and (DATASETS[ix] == wildcards.dataset)],
# R2 (reverse) files of every pre-processed unit belonging to this sample and dataset
sample_revs = lambda wildcards: ['{}/{}_dfinaltrim/{}-{}_{}-R2.clean.nodup.fastp.fastq.gz'.format(
PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
for ix,value in enumerate(SAMPLES) if (value==wildcards.sample) and (DATASETS[ix]==wildcards.dataset)]
output:
sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
wildcard_constraints:
sample = r'\w+'
group: 'preprocess'
shell:
'''
cat {input.sample_fwds} > {output.sample_fwd}
cat {input.sample_revs} > {output.sample_rev}
'''
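The input functions above collect, for a given dataset/sample pair, every pre-processed fastq unit of that sample across runs and lanes. The sketch below shows how the forward-read comprehension expands; the parallel lists DATASETS, SAMPLES, RUNS and LANES are filled with made-up values here, since their real definitions are not part of this diff.

# Illustrative sketch (not part of the rules file): how the sample_fwds input
# function of concatenate_fastqs expands. The parallel lists below are made-up
# examples; in the workflow they are defined elsewhere and are not shown in this diff.
PRE_PROC_DIR = "preprocessed"
DATASETS = ["dsA", "dsA", "dsB"]
SAMPLES  = ["s1", "s1", "s2"]
RUNS     = ["run1", "run2", "run1"]
LANES    = ["L001", "L001", "L002"]

def sample_fwds(dataset, sample):
    # keep only the units of this sample within this dataset, across runs and lanes
    return ['{}/{}_dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
                PRE_PROC_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
            for ix, value in enumerate(SAMPLES)
            if value == sample and DATASETS[ix] == dataset]

print(sample_fwds("dsA", "s1"))
# ['preprocessed/dsA_dfinaltrim/s1-run1_L001-R1.clean.nodup.fastp.fastq.gz',
#  'preprocessed/dsA_dfinaltrim/s1-run2_L001-R1.clean.nodup.fastp.fastq.gz']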
# Final quality check with Fastp, but no further QC processing
rule fastp_concatenated:
input:
sample_fwd = MERGE_DIR + '/{dataset}_merged/{sample}-R1.fastq.gz',
sample_rev = MERGE_DIR + '/{dataset}_merged/{sample}-R2.fastq.gz'
output:
report1 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.html',
report2 = MERGE_DIR + '/{dataset}_merged/{sample}.fastp.json'
wildcard_constraints:
sample = r'\w+'
threads: cpus_avail
group: 'preprocess'
singularity: singularity_img
shell:
'''
fastp \
-A \
--in1 {input.sample_fwd} --in2 {input.sample_rev} \
--html {output.report1} --json {output.report2} \
--thread {threads}
'''
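This rule runs fastp purely to produce per-sample reports (no output fastq files are declared), and the JSON report is what the MultiQC rules below consume. The sketch below pulls a single summary figure out of such a report; the nested key names are assumed to follow the usual fastp JSON layout and are not guaranteed by this workflow.

# Illustrative sketch: pulling one number out of a per-sample fastp JSON report
# produced by the rule above. The nested key names ("summary" -> "before_filtering"
# -> "total_reads") follow the usual fastp report layout but are an assumption here,
# hence the defensive .get() calls.
import json

def total_reads(fastp_json_path):
    with open(fastp_json_path, encoding="utf-8") as handle:
        report = json.load(handle)
    return report.get("summary", {}).get("before_filtering", {}).get("total_reads")

# Hypothetical path following the MERGE_DIR naming scheme used above:
# print(total_reads("merged/dsA_merged/s1.fastp.json"))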
rule multiqc_merged_list_files:
input:
sample_fastp_report = lambda wildcards: ['{}/{}_merged/{}.fastp.json'.format(
MERGE_DIR, wildcards.dataset, value) for ix,value in enumerate(SAMPLES)
if DATASETS[ix]==wildcards.dataset]
output:
multiqc_input_list = MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
run:
import os
try:
os.makedirs(os.path.dirname(output.multiqc_input_list))
except OSError:
pass
with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
for item in set(input.sample_fastp_report):
out.write("%s\n" % item)
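For reference, the run block above amounts to the standalone helper sketched below: create the report directory if needed and write the de-duplicated set of fastp JSON paths, one per line, for multiqc --file-list. The helper is illustrative only.

# Illustrative sketch (not part of the rules file): the run block above as a plain
# function. os.makedirs(..., exist_ok=True) is an equivalent form of the
# try/except used in the rule when the directory already exists.
import os

def write_multiqc_input_list(report_paths, list_file):
    os.makedirs(os.path.dirname(list_file), exist_ok=True)
    with open(list_file, mode="w", encoding="utf-8") as out:
        for item in set(report_paths):
            out.write("%s\n" % item)

# Hypothetical usage:
# write_multiqc_input_list(["dsA_merged/s1.fastp.json", "dsA_merged/s1.fastp.json"],
#                          "reports/dsA_multiqc_inputs.txt")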
rule multiqc_merged:
input:
MERGE_REPORT + '/{dataset}_multiqc_inputs.txt'
output:
multiqc_report=report(MERGE_REPORT + '/{dataset}_multiqc.html',
category='merge_samples')
singularity: singularity_img
shell:
'''
multiqc -f --file-list {input} --filename {output.multiqc_report}
'''
......@@ -5,14 +5,14 @@ Description: rules for pre-processing of paired-end shotgun DNA metagenomic
sequencing data.
'''
localrules:
multiqc_preprocess_listing_files,
multiqc_preprocess_list_files,
multiqc_preprocess
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
BBMAP_REF = config['preprocess']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
##----------------------------------------------------------------------------##
## Rules with target files
......@@ -21,12 +21,17 @@ BBMAP_REF_DIR = config['preprocess']['bbmap_ref_dir']
# FastQC only performs a quality check but no QC processing is done.
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
# - remove adapters: here we provide the Nextera XT adapters,
# check this page from Illumina to know which adapters to remove:
# - fastp's default quality filtering
# - remove adapters: enabled by default, fastp detects adapters by per-read overlap
#   analysis (it seeks the overlap of each read pair). If fastp fails to find an
#   overlap, the adapter sequences provided here are used (e.g. the Nextera XT adapters).
# Check this page from Illumina to know which adapters to remove:
# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
# - base correction in overlapped regions
# - trimming of the last base in read 1
# - trimming of the last base(s) of every read pair (this step precedes adapter removal)
# - discard reads shorter than a minimum length, after trimming
# WARNING: cutting by quality score is not done at this stage because it interferes
# with the deduplication step
# It provides a report with quality checks, before and after processing
rule trim_adapters:
input:
......@@ -45,8 +50,9 @@ rule trim_adapters:
LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
params:
fastp_dir = PRE_PROC_DIR,
adapter = config['preprocess']['adapter'],
min_length = config['preprocess']['min_length']
adapter = config['preprocess']['trim_adapters']['adapter'],
min_length = config['preprocess']['trim_adapters']['min_length'],
trim_tails = config['preprocess']['trim_adapters']['trim_tails']
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -54,18 +60,18 @@ rule trim_adapters:
'''
echo "Running trim_adapters with {threads} cores."
mkdir -p {params.fastp_dir}
fastp \
(fastp \
--adapter_sequence {params.adapter} \
--adapter_sequence_r2 {params.adapter} \
--detect_adapter_for_pe \
--correction \
--overrepresentation_analysis \
--length_required {params.min_length} \
--trim_tail1 1 \
--trim_tail1 {params.trim_tails} \
--trim_tail2 {params.trim_tails} \
--in1 {input.fwd} --in2 {input.rev} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
--thread {threads}
--thread {threads}) &>{log}
'''
# After pre-processing with fastp, human reads have to be removed
#
......@@ -84,8 +90,8 @@ rule filter_human:
log:
LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
params:
low_mem = config['preprocess']['bbmap_usemodulo'],
mem_gb = config['preprocess']['bbmap_mem']
low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
threads:cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -93,7 +99,7 @@ rule filter_human:
'''
echo "Running filter_human with {threads} cores."
mkdir -p {PRE_PROC_DIR}
bbmap.sh \
(bbmap.sh \
usemodulo={params.low_mem} \
minid=0.95 \
maxindel=3 \
......@@ -112,7 +118,7 @@ rule filter_human:
scafstats={output.bbmap_scafstats} \
statsfile={output.bbmap_refstats} \
unpigz=t pigz=t -Xmx{params.mem_gb}g \
threads={threads}
threads={threads}) &>{log}
'''
# This is an intensive job
......@@ -125,7 +131,7 @@ rule index_human_ref:
log:
LOGS_DIR + '/preprocess/ref_indexing.log'
params:
usemodulo=config['preprocess']['bbmap_usemodulo']
usemodulo=config['preprocess']['filter_human']['bbmap_usemodulo']
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
......@@ -134,13 +140,13 @@ rule index_human_ref:
echo "Running index_human_ref with {threads} cores."
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
bbmap.sh \
(bbmap.sh \
ref={input.reference} \
usemodulo={params.usemodulo} \
path={output} \
threads={threads}
threads={threads}) &>{log}
'''
#
# Deduplication step with BBTools' dedupe.sh
rule dedupe:
input:
fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
......@@ -156,7 +162,7 @@ rule dedupe:
group: 'preprocess'
shell:
'''
dedupe.sh \
(dedupe.sh \
in1={input.fwd_clean} in2={input.rev_clean} \
out=stdout.fq \
outd={output.fastq_duplicates} \
......@@ -165,43 +171,45 @@ rule dedupe:
int=t in=stdin.fq \
out1={output.fwd_clean_dedup} \
out2={output.rev_clean_dedup} \
threads={threads}
threads={threads}) &>{log}
'''
# After quality filtering and removal of adapters, human reads, and duplicates,
# the reads' 3'end are quality trimmed
# After removal of adapters, human reads, and duplicates,
# the reads' 3' ends are quality-trimmed with fastp
# Notice that quality filtering is disabled because it was done by rule trim_adapters
rule trim3end_dedupe:
input:
fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
params:
fastp_dir = PRE_PROC_DIR,
min_length = config['preprocess']['min_length']
min_length = config['preprocess']['trim_adapters']['min_length']
output:
fwd_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
rev_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
report1 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
report2 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
log:
# per-unit log for the final 3'-end trimming step (path follows the other pre-processing rules)
LOGS_DIR + '/preprocess/{dataset}_dfinaltrim/{fastq_file}.log'
threads: int(cpus_avail/4)
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
shell:
'''
fastp \
(fastp \
-Q \
--overrepresentation_analysis \
--length_required {params.min_length} \
--cut_tail -W 4 -M 20 \
--in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
--thread {threads}
--thread {threads}) &>{log}
'''
rule multiqc_preprocess_listing_files:
rule multiqc_preprocess_list_files:
input:
#
atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
PRE_PROC_DIR, value, FASTQS[ix])
for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
#atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
#PRE_PROC_DIR, value, FASTQS[ix])
#for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
#
dfinaltrim = lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(
PRE_PROC_DIR, value, FASTQS[ix])
......@@ -216,7 +224,7 @@ rule multiqc_preprocess_listing_files:
pass
with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
for item in input.atrimmed+input.dfinaltrim:
for item in input.dfinaltrim:
out.write("%s\n" % item)
rule multiqc_preprocess:
......