Commit bc357ae8 authored by Ticlla Ccenhua Monica Roxana

update rules/preprocess.smk for better compatibility when running in cluster mode

parent 418e318e
@@ -20,7 +20,8 @@ localrules:
mqc_trim_3end_list_files,
mqc_trim_adapters,
mqc_filter_human,
-multiqc_trim_3end
+multiqc_trim_3end,
+summarize_preQC
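# note: being listed in localrules means summarize_preQC executes in the main
# Snakemake process on the submit host instead of being dispatched to the
# cluster scheduler -- consistent with the cluster-mode aim of this commit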
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
@@ -77,7 +78,10 @@ def list_concatenated_r2_fastqs(wildcards):
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
#
-# - default fastp's quality filtering
+# - fastp's default quality filtering by:
+# - limiting the N base number (-n)
+# - percentage of unqualified bases (-u)
+# - average quality score (-e, default 0 means no requirement)
# - remove adapters: enabled by default, fastp detects adapters by
# per-read overlap analysis (it looks for the overlap of each read pair).
# If fastp fails to find an overlap, it uses the provided adapter
@@ -87,7 +91,7 @@ def list_concatenated_r2_fastqs(wildcards):
# - base correction in overlapped regions
# - trimming of the last base(s) for every read pair (this step precedes
# adapter removal)
-# - discard reads shorter than a minimum length, after trimming
+# - length filtering: discard reads shorter than a minimum length, after trimming
#
# WARNING: cutting by quality score is not done at this stage because it
# interferes with deduplication step (rule dedupe).
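For orientation, a minimal sketch of the same filtering with the flags named above spelled out (file names are placeholders; -n 5 and -u 40 are fastp's documented defaults, and -e 0 disables the average-quality requirement):

    fastp -n 5 -u 40 -e 0 \
        --in1 sample-R1.fastq.gz --in2 sample-R2.fastq.gz \
        --out1 sample-R1.fastp.fastq.gz --out2 sample-R2.fastp.fastq.gz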
@@ -135,8 +139,8 @@ rule trim_adapters:
rule filter_human:
input:
human_ref = BBMAP_REF_DIR,
-fwd_tr = rules.trim_adapters.output.fwd_tr,
-rev_tr = rules.trim_adapters.output.rev_tr
+fwd_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz',
+rev_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz',
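# note: these literal paths resolve to the same files the replaced
# rules.trim_adapters.output references pointed at; spelling them out removes
# the dependence on rule-definition order, presumably helping the cluster setup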
output:
fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
@@ -147,6 +151,9 @@
params:
low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
+resources:
+# in minutes
+runtime = lambda wildcards, attempt: 120*attempt
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
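Snakemake passes its retry counter to resource callables: attempt is 1 on the first run and increments each time a failed job is automatically restarted, so a job killed for exceeding its walltime requests more time on resubmission. A usage sketch (the profile name is a placeholder):

    # attempt 1 -> runtime=120, attempt 2 -> 240, attempt 3 -> 360 (minutes)
    snakemake --profile my_cluster_profile --restart-times 2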
@@ -192,7 +199,6 @@ rule index_human_ref:
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
threads: cpus_avail
singularity: singularity_img
-group: 'preprocess'
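# note: rules sharing a `group` are bundled into one cluster job; taking
# index_human_ref out of 'preprocess' lets this memory-heavy indexing step be
# scheduled as its own job (a guess at the motivation, in line with the commit message)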
message: "Running index_human_ref with {threads} cores."
shell:
'''
@@ -215,6 +221,9 @@ rule dedupe:
log:
OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
threads: cpus_avail
+params:
+dd_mem_gb = (config['preprocess']['filter_human']['bbmap_mem']//3)*2,
+rf_mem_gb = config['preprocess']['filter_human']['bbmap_mem']//4
singularity: singularity_img
group: 'preprocess'
message: "Running dedupe with {threads} cores."
@@ -224,16 +233,18 @@
in1={input.fwd_clean} in2={input.rev_clean} \
out=stdout.fq \
outd={output.fastq_duplicates} \
-ac=f minidentity=99 | \
+ac=f minidentity=99 \
+-Xmx{params.dd_mem_gb}g | \
reformat.sh \
int=t in=stdin.fq \
out1={output.fwd_clean_dedup} \
out2={output.rev_clean_dedup} \
-threads={threads}) &>{log}
+threads={threads} \
+-Xmx{params.rf_mem_gb}g) &>{log}
'''
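The two heap sizes exist because dedupe.sh and reformat.sh run simultaneously on either side of the pipe, so their JVMs must fit together inside the single bbmap_mem reservation (integer division keeps -Xmx a whole number of gigabytes). A worked example, assuming bbmap_mem is set to 48 (GB) in the config:

    dd_mem_gb = (48 // 3) * 2   # 32 GB heap for dedupe.sh
    rf_mem_gb = 48 // 4         # 12 GB heap for reformat.sh
    # 32 + 12 = 44 GB in flight, leaving headroom under the 48 GB reserved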
# After removal of adapters, human reads, and duplicates,
-# the reads' 3'end are quality trimmed with fastp
-# Notice that quality filtering is disabled because it was done by rule trim_adapters
+# the reads' 3' ends are quality trimmed (cut by quality score) with fastp
+# Notice that adapter- and quality-filtering are disabled because they were done by rule trim_adapters
rule trim_3end:
input:
fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
@@ -254,17 +265,18 @@
shell:
'''
(fastp \
--Q \
--overrepresentation_analysis \
+--disable_quality_filtering \
+--disable_adapter_trimming \
--length_required {params.min_length} \
--cut_tail -W 4 -M 20 \
--in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
---thread {threads}) &2>{log}
+--thread {threads}) &>{log}
'''
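Two details in this hunk are easy to miss. First, -W 4 -M 20 parameterize the --cut_tail sliding window: a 4-base window moves in from the 3' end and trailing bases are dropped while the window's mean quality is below Q20. The long-form equivalent (same semantics, remaining arguments elided):

    fastp --cut_tail --cut_window_size 4 --cut_mean_quality 20 ...

Second, the redirection fix is functional, not cosmetic: in bash, `(...) &2>{log}` backgrounds the subshell (so the rule could return before fastp finishes) and then runs an empty command that merely truncates the log, whereas `(...) &>{log}` sends both stdout and stderr of the pipeline to the log as intended.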
###**THESE TARGET FILES ARE THE FINAL CLEAN**###
-# concatenate adapter-trimmed, cleaned, deduplicated and quality-trimmed fastqs from the same samples
+# concatenate quality/length-filtered, adapter-trimmed, cleaned, deduplicated and
+# quality-trimmed fastqs from the same samples
rule concatenate_fastqs:
input:
#
@@ -280,7 +292,6 @@ rule concatenate_fastqs:
sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
wildcard_constraints:
sample = r'\w+'
-group: 'preprocess'
message: "Running concatenate_fastqs ..."
shell:
'''
@@ -295,7 +306,7 @@ rule check_concatenation:
output:
concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
'{dataset}_concatenation_all.done'
-conda:'../envs/rawQC.yaml'
+#conda:'../envs/rawQC.yaml'
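# note: `run:` blocks execute inside Snakemake's own Python process, and
# per-rule conda environments only apply to shell/script/wrapper rules,
# which is presumably why the directive is commented out here and below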
run:
import os
from pathlib import Path
@@ -390,7 +401,7 @@ rule mqc_filter_human:
'/{dataset}/preQC/multiqc/bfiltered' +
'/{dataset}_bfiltered_stats.tsv',
category='preQC_step2:filter_human')
-conda:'../envs/rawQC.yaml'
+#conda:'../envs/rawQC.yaml'
run:
from utils import summarize_filter_human_step
mqc_stats_data = summarize_filter_human_step('{}'.format(input))
@@ -450,7 +461,7 @@ rule summarize_preQC:
'/{dataset}/preQC/summary_stats'+
'/{dataset}_preqc_samples_pct_barchart.svg',
category='preQC:summaries')
-conda:'../envs/rawQC.yaml'
+#conda:'../envs/rawQC.yaml'
run:
from utils import summarize_preqc
from utils import plot_preqc_summary
@@ -161,7 +161,7 @@ def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottom_spacing = top_lim/1.8
-rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+rgrids_positions = np.array([sample_run_lane['total_reads'].max()/i for i in [10, 4, 2, 1]]) + bottom_spacing
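# e.g. with total_reads.max() == 12e6 the grid rings land at 1.2M, 3M, 6M and
# 12M reads (each shifted up by bottom_spacing, which the labels below subtract again)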
rgrids_positions_labels = [' {}M'.format(nr)
for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
else: