Commit bc357ae8 authored by Ticlla Ccenhua Monica Roxana

update rules/preprocess.smk for better compatibility when running in cluster mode

parent 418e318e
@@ -20,7 +20,8 @@ localrules:
     mqc_trim_3end_list_files,
     mqc_trim_adapters,
     mqc_filter_human,
-    multiqc_trim_3end
+    multiqc_trim_3end,
+    summarize_preQC
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
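A note on this change: in cluster mode, rules listed under localrules are executed directly on the submission host rather than being dispatched as cluster jobs, which is the right place for quick aggregation and plotting steps such as summarize_preQC. A minimal sketch of the pattern, using names from this workflow:

    # Sketch: names listed under localrules run on the submission host,
    # in Snakemake's own process, instead of being queued on the cluster.
    localrules:
        multiqc_trim_3end,
        summarize_preQC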
@@ -77,7 +78,10 @@ def list_concatenated_r2_fastqs(wildcards):
 # With Fastp a quality check is performed and both paired fastq files
 # are processed as follows:
 #
-# - default fastp's quality filtering
+# - fastp's default quality filtering, by:
+#   - limiting the number of N bases (-n)
+#   - limiting the percentage of unqualified bases (-u)
+#   - requiring a minimum average quality score (-e; the default of 0 means no requirement)
 # - remove adapters: enabled by default, fastp detects adapters by
 #   per-read overlap analysis (it seeks the overlap of each read pair).
 #   If fastp fails to find an overlap, it uses the provided adapter
@@ -87,7 +91,7 @@ def list_concatenated_r2_fastqs(wildcards):
 # - base correction in overlapped regions
 # - trimming of the last base(s) of every read pair (this step precedes
 #   adapter removal)
-# - discard reads shorter than a minimum length, after trimming
+# - length filtering: discard reads shorter than a minimum length, after trimming
 #
 # WARNING: cutting by quality score is not done at this stage because it
 # interferes with the deduplication step (rule dedupe).
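For reference, the three default filters named in the new comment correspond to documented fastp options; the sketch below spells them out with their default values (the file names are illustrative, not this workflow's paths):

    # -n / --n_base_limit 5: discard a read with more than 5 N bases
    # -u / --unqualified_percent_limit 40: discard a read if more than 40%
    #      of its bases fall below the qualified-quality threshold
    # -e / --average_qual 0: 0 means no mean-quality requirement
    fastp -n 5 -u 40 -e 0 \
        --in1 sample-R1.fastq.gz --in2 sample-R2.fastq.gz \
        --out1 sample-R1.filtered.fastq.gz --out2 sample-R2.filtered.fastq.gz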
@@ -135,8 +139,8 @@ rule trim_adapters:
 rule filter_human:
     input:
         human_ref = BBMAP_REF_DIR,
-        fwd_tr = rules.trim_adapters.output.fwd_tr,
-        rev_tr = rules.trim_adapters.output.rev_tr
+        fwd_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz',
+        rev_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz',
     output:
         fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
         rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
@@ -147,6 +151,9 @@ rule filter_human:
     params:
         low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
+    resources:
+        # in minutes
+        runtime = lambda wildcards, attempt: 120*attempt
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
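The new resources block is the retry-friendly way to request wall time on a cluster: Snakemake evaluates the lambda with the current attempt number, so each resubmission asks for more minutes. A sketch of how it plays out:

    # How Snakemake evaluates the new resource on successive attempts:
    runtime = lambda wildcards, attempt: 120 * attempt
    # attempt=1 -> 120 min, attempt=2 -> 240 min, attempt=3 -> 360 min

For this to kick in, retries must be enabled on the command line, e.g. snakemake --restart-times 2, so a job killed at its time limit is resubmitted with a larger request.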
@@ -192,7 +199,6 @@ rule index_human_ref:
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads: cpus_avail
     singularity: singularity_img
-    group: 'preprocess'
     message: "Running index_human_ref with {threads} cores."
     shell:
         '''
@@ -215,6 +221,9 @@ rule dedupe:
     log:
         OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
     threads: cpus_avail
+    params:
+        dd_mem_gb = (config['preprocess']['filter_human']['bbmap_mem']//3)*2,
+        rf_mem_gb = config['preprocess']['filter_human']['bbmap_mem']//4
     singularity: singularity_img
     group: 'preprocess'
     message: "Running dedupe with {threads} cores."
@@ -224,16 +233,18 @@ rule dedupe:
             in1={input.fwd_clean} in2={input.rev_clean} \
             out=stdout.fq \
             outd={output.fastq_duplicates} \
-            ac=f minidentity=99 | \
+            ac=f minidentity=99 \
+            -Xmx{params.dd_mem_gb}g | \
             reformat.sh \
             int=t in=stdin.fq \
             out1={output.fwd_clean_dedup} \
             out2={output.rev_clean_dedup} \
-            threads={threads}) &>{log}
+            threads={threads} \
+            -Xmx{params.rf_mem_gb}g) &>{log}
         '''
 # After removal of adapters, human reads, and duplicates,
-# the reads' 3'end are quality trimmed with fastp
-# Notice that quality filtering is disabled because it was done by rule trim_adapters
+# the reads' 3' ends are quality trimmed (cut by quality score) with fastp.
+# Note that adapter trimming and quality filtering are disabled here, because both were already done by rule trim_adapters.
 rule trim_3end:
     input:
         fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
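The two new params split the configured BBMap memory between the two JVMs that run concurrently in the dedupe pipe: roughly two thirds for dedupe.sh and one quarter for reformat.sh, leaving headroom so both fit inside the job's allocation. Since Python 3's / returns a float (an invalid value for Java's -Xmx flag), the params above use integer division. A worked example, assuming a hypothetical bbmap_mem of 48 in the config:

    # Illustrative value; the real one comes from
    # config['preprocess']['filter_human']['bbmap_mem'].
    bbmap_mem = 48
    dd_mem_gb = (bbmap_mem // 3) * 2  # 32 GB for the dedupe.sh JVM
    rf_mem_gb = bbmap_mem // 4        # 12 GB for the reformat.sh JVM
    assert dd_mem_gb + rf_mem_gb <= bbmap_mem  # both JVMs fit in the allocation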
@@ -254,17 +265,18 @@ rule trim_3end:
     shell:
         '''
         (fastp \
-        -Q \
-        --overrepresentation_analysis \
+        --disable_quality_filtering \
+        --disable_adapter_trimming \
         --length_required {params.min_length} \
         --cut_tail -W 4 -M 20 \
         --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
         --out1 {output.fwd_tr} --out2 {output.rev_tr} \
         --html {output.report1} --json {output.report2} \
-        --thread {threads}) &2>{log}
+        --thread {threads}) &>{log}
         '''
 ###**THESE TARGET FILES ARE THE FINAL CLEAN**###
-# concatenate adapter-trimmed, cleaned,deduplicated and quality-trimmed fastqs from the same samples
+# concatenate quality/length-filtered, adapter-trimmed, cleaned, deduplicated and
+# quality-trimmed fastqs from the same samples
 rule concatenate_fastqs:
     input:
         #
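In the new fastp call, --cut_tail -W 4 -M 20 slides a 4-base window in from the 3' end and trims while the window's mean quality is below 20. A loose Python sketch of the idea (an approximation, not fastp's exact implementation):

    def cut_tail(quals, window=4, mean_q=20):
        """Return how many bases survive fastp-style 3'-end quality trimming."""
        end = len(quals)
        while end >= window:
            if sum(quals[end - window:end]) / window >= mean_q:
                break  # the current window passes the threshold; stop trimming
            end -= 1   # drop the 3'-most base and slide the window inward
        return end

    # A read whose tail degrades gets shortened; a low base can survive
    # if the window containing it still averages >= 20.
    print(cut_tail([30, 30, 30, 30, 2, 2, 2, 2]))  # -> 5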
@@ -280,7 +292,6 @@ rule concatenate_fastqs:
         sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
     wildcard_constraints:
         sample = '\w+'
-    group: 'preprocess'
     message: "Running concatenate_fastqs ..."
     shell:
         '''
@@ -295,7 +306,7 @@ rule check_concatenation:
     output:
         concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
                                    '{dataset}_concatenation_all.done'
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         import os
         from pathlib import Path
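Commenting out the conda directives in these run:-based rules is consistent with Snakemake's behavior: conda: environments are not applied to run: blocks, which execute inside Snakemake's own Python process. If an environment were really needed, one alternative is to move the body into an external script; a hedged sketch, with a hypothetical script path and an illustrative output:

    rule check_concatenation:
        output:
            touch('summary_stats/concatenation_all.done')  # illustrative output
        conda: '../envs/rawQC.yaml'
        # script: runs inside the conda environment; run: blocks cannot.
        script: '../scripts/check_concatenation.py'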
@@ -390,7 +401,7 @@ rule mqc_filter_human:
                '/{dataset}/preQC/multiqc/bfiltered' +
                '/{dataset}_bfiltered_stats.tsv',
                category='preQC_step2:filter_human')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_filter_human_step
         mqc_stats_data = summarize_filter_human_step('{}'.format(input))
@@ -450,7 +461,7 @@ rule summarize_preQC:
                '/{dataset}/preQC/summary_stats'+
                '/{dataset}_preqc_samples_pct_barchart.svg',
                category='preQC:summaries')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_preqc
         from utils import plot_preqc_summary
...
@@ -161,7 +161,7 @@ def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
         # This uses the given y-units of bottom spacing for each bar.
         # You may increase this number a bit to have more spacing between bars and text.
         bottom_spacing = top_lim/1.8
-        rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+        rgrids_positions = np.array([sample_run_lane['total_reads'].max()/i for i in [10,4,2,1]]) + bottom_spacing
         rgrids_positions_labels = [' {}M'.format(nr)
                                    for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
     else:
...
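This change moves the radial grid lines from data-dependent positions (rows 3, 5 and 7 of describe(), i.e. the minimum, median and maximum) to fixed fractions (1/10, 1/4, 1/2 and all) of the largest read count, so the outer ring always lands on the tallest bar. Note that adding a scalar to a plain Python list raises a TypeError, hence the np.array(...) wrapper in the hunk above. A small self-contained check with illustrative numbers:

    import numpy as np

    # Hypothetical values; the real ones come from the preQC summary table.
    total_reads_max = 12_000_000
    bottom_spacing = 1_500_000
    rgrids_positions = np.array([total_reads_max / i for i in (10, 4, 2, 1)]) + bottom_spacing
    labels = [' {}M'.format(np.round((p - bottom_spacing) / 1e6, 3))
              for p in rgrids_positions]
    print(labels)  # [' 1.2M', ' 3.0M', ' 6.0M', ' 12.0M']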