Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MetagenomicSnake
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
TBRU
MetagenomicSnake
Commits
bc357ae8
Commit
bc357ae8
authored
5 years ago
by
Ticlla Ccenhua Monica Roxana
Browse files
Options
Downloads
Patches
Plain Diff
update rules/preprocess.smk for better compatibility when running in cluster mode
parent
418e318e
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
rules/preprocess.smk
+29
-18
29 additions, 18 deletions
rules/preprocess.smk
rules/utils.py
+1
-1
1 addition, 1 deletion
rules/utils.py
with
30 additions
and
19 deletions
rules/preprocess.smk
+
29
−
18
View file @
bc357ae8
...
...
@@ -20,7 +20,8 @@ localrules:
mqc_trim_3end_list_files,
mqc_trim_adapters,
mqc_filter_human,
multiqc_trim_3end
multiqc_trim_3end,
summarize_preQC
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
...
...
@@ -77,7 +78,10 @@ def list_concatenated_r2_fastqs(wildcards):
# With Fastp a quality check is performed and both paired fastq files
# are processed as follows:
#
# - default fastp's quality filtering
# - default fastp's quality filtering by:
# - limiting the N gase number (-n)
# - percentage of unqualified bases (-u)
# - average quality score (-e, default 0 means no requirement)
# - remove adapters: enabled by default, fastp detects adapters by
# per-read overlap analysis (seeks for the overlap of each read pair).
# If fastp fails to find an overlap, It usess the provided the adapter
...
...
@@ -87,7 +91,7 @@ def list_concatenated_r2_fastqs(wildcards):
# - base correction in overlapped regions
# - trimming of the last base(s) for every read pair(this step preceeds
# adapter removal)
# - discard reads shorter than a minimum length, after trimming
# - length filtering: discard reads shorter than a minimum length, after trimming
#
# WARNING: cutting by quality score is not done at this stage because it
# interferes with deduplication step (rule dedupe).
...
...
@@ -135,8 +139,8 @@ rule trim_adapters:
rule filter_human:
input:
human_ref = BBMAP_REF_DIR,
fwd_tr = rules.trim_adapters.output.fwd_tr,
rev_tr = rules.trim_adapters.output.rev_tr
fwd_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz',
rev_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz',
output:
fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
...
...
@@ -147,6 +151,9 @@ rule filter_human:
params:
low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
resources:
# in minutes
runtime = lambda wildcards, attempt: 120*attempt
threads:cpus_avail
singularity: singularity_img
group: 'preprocess'
...
...
@@ -192,7 +199,6 @@ rule index_human_ref:
mem_gb = config['preprocess']['filter_human']['bbmap_mem']
threads: cpus_avail
singularity: singularity_img
group: 'preprocess'
message: "Running index_human_ref with {threads} cores."
shell:
'''
...
...
@@ -215,6 +221,9 @@ rule dedupe:
log:
OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
threads:cpus_avail
params:
dd_mem_gb = (config['preprocess']['filter_human']['bbmap_mem']/3)*2,
rf_mem_gb = config['preprocess']['filter_human']['bbmap_mem']/4
singularity: singularity_img
group: 'preprocess'
message: "Running dedupe with {threads} cores."
...
...
@@ -224,16 +233,18 @@ rule dedupe:
in1={input.fwd_clean} in2={input.rev_clean} \
out=stdout.fq \
outd={output.fastq_duplicates} \
ac=f minidentity=99 | \
ac=f minidentity=99 \
-Xmx{params.dd_mem_gb}g| \
reformat.sh \
int=t in=stdin.fq \
out1={output.fwd_clean_dedup} \
out2={output.rev_clean_dedup} \
threads={threads}) &>{log}
threads={threads} \
-Xmx{params.rf_mem_gb}g) &>{log}
'''
# After removal of adapters, human reads, and duplicates,
# the reads' 3'end are quality trimmed with fastp
# Notice that quality filtering is disabled because it was done by rule trim_adapters
# the reads' 3'end are quality trimmed (cut by quality score) with fastp
# Notice that adapter- and quality-filtering are disabled because it was done by rule trim_adapters
rule trim_3end:
input:
fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
...
...
@@ -254,17 +265,18 @@ rule trim_3end:
shell:
'''
(fastp \
-Q \
--overrepresentation_analysis \
--disable_quality_filtering \
--disable_adapter_trimming \
--length_required {params.min_length} \
--cut_tail -W 4 -M 20 \
--in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
--out1 {output.fwd_tr} --out2 {output.rev_tr} \
--html {output.report1} --json {output.report2} \
--thread {threads}) &2>{log}
--thread {threads}) &>{log}
'''
###**THESE TARGET FILES ARE THE FINAL CLEAN**###
# concatenate adapter-trimmed, cleaned,deduplicated and quality-trimmed fastqs from the same samples
# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and
# quality-trimmed fastqs from the same samples
rule concatenate_fastqs:
input:
#
...
...
@@ -280,7 +292,6 @@ rule concatenate_fastqs:
sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
wildcard_constraints:
sample = '\w+'
group: 'preprocess'
message: "Running concatenate_fastqs ..."
shell:
'''
...
...
@@ -295,7 +306,7 @@ rule check_concatenation:
output:
concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
'{dataset}_concatenation_all.done'
conda:'../envs/rawQC.yaml'
#
conda:'../envs/rawQC.yaml'
run:
import os
from pathlib import Path
...
...
@@ -390,7 +401,7 @@ rule mqc_filter_human:
'/{dataset}/preQC/multiqc/bfiltered' +
'/{dataset}_bfiltered_stats.tsv',
category='preQC_step2:filter_human')
conda:'../envs/rawQC.yaml'
#
conda:'../envs/rawQC.yaml'
run:
from utils import summarize_filter_human_step
mqc_stats_data = summarize_filter_human_step('{}'.format(input))
...
...
@@ -450,7 +461,7 @@ rule summarize_preQC:
'/{dataset}/preQC/summary_stats'+
'/{dataset}_preqc_samples_pct_barchart.svg',
category='preQC:summaries')
conda:'../envs/rawQC.yaml'
#
conda:'../envs/rawQC.yaml'
run:
from utils import summarize_preqc
from utils import plot_preqc_summary
...
...
This diff is collapsed.
Click to expand it.
rules/utils.py
+
1
−
1
View file @
bc357ae8
...
...
@@ -161,7 +161,7 @@ def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottom_spacing = top_lim/1.8
rgrids_positions = sample_run_lane['total_reads'].describe()[[3, 5, 7]].values + bottom_spacing
rgrids_positions = [sample_run_lane['total_reads'].max()/(i) for i in [10, 4, 2, 1]] + bottom_spacing
rgrids_positions_labels = ['{}M'.format(nr) for ix, nr in enumerate(np.round((rgrids_positions - bottom_spacing)/1000000, decimals=3))]
else:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment