diff --git a/rules/preprocess.smk b/rules/preprocess.smk
index b52bbf408674fe7e585d03c2851c7f0baae299b1..7d0f1a5abd388a4e4f700ebcbbf8b8f686f71249 100644
--- a/rules/preprocess.smk
+++ b/rules/preprocess.smk
@@ -20,7 +20,8 @@ localrules:
     mqc_trim_3end_list_files,
     mqc_trim_adapters,
     mqc_filter_human,
-    multiqc_trim_3end
+    multiqc_trim_3end,
+    summarize_preQC
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
@@ -77,7 +78,10 @@ def list_concatenated_r2_fastqs(wildcards):
 # With Fastp a quality check is performed and both paired fastq files
 # are processed as follows:
 #
-# - default fastp's quality filtering
+# - default fastp's quality filtering by:
+#   - limiting the N base number (-n)
+#   - percentage of unqualified bases (-u)
+#   - average quality score (-e, default 0 means no requirement)
 # - remove adapters: enabled by default, fastp detects adapters by
 #   per-read overlap analysis (seeks for the overlap of each read pair).
 #   If fastp fails to find an overlap, It usess the provided the adapter
@@ -87,7 +91,7 @@ def list_concatenated_r2_fastqs(wildcards):
 # - base correction in overlapped regions
 # - trimming of the last base(s) for every read pair(this step preceeds
 #   adapter removal)
-# - discard reads shorter than a minimum length, after trimming
+# - length filtering: discard reads shorter than a minimum length, after trimming
 #
 # WARNING: cutting by quality score is not done at this stage because it
 # interferes with deduplication step (rule dedupe).
@@ -135,8 +139,8 @@ rule trim_adapters:
 rule filter_human:
     input:
         human_ref = BBMAP_REF_DIR,
-        fwd_tr = rules.trim_adapters.output.fwd_tr,
-        rev_tr = rules.trim_adapters.output.rev_tr
+        fwd_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz',
+        rev_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz',
     output:
         fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
         rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
@@ -147,6 +151,9 @@ rule filter_human:
     params:
         low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
+    resources:
+        # in minutes
+        runtime = lambda wildcards, attempt: 120*attempt
     threads:cpus_avail
     singularity: singularity_img
     group: 'preprocess'
@@ -192,7 +199,6 @@ rule index_human_ref:
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads: cpus_avail
     singularity: singularity_img
-    group: 'preprocess'
     message: "Running index_human_ref with {threads} cores."
     shell:
         '''
@@ -215,6 +221,9 @@ rule dedupe:
     log:
         OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
     threads:cpus_avail
+    params:
+        dd_mem_gb = (config['preprocess']['filter_human']['bbmap_mem']//3)*2,
+        rf_mem_gb = config['preprocess']['filter_human']['bbmap_mem']//4
     singularity: singularity_img
     group: 'preprocess'
     message: "Running dedupe with {threads} cores."
@@ -224,16 +233,18 @@ rule dedupe:
         in1={input.fwd_clean} in2={input.rev_clean} \
         out=stdout.fq \
         outd={output.fastq_duplicates} \
-        ac=f minidentity=99 | \
+        ac=f minidentity=99 \
+        -Xmx{params.dd_mem_gb}g | \
         reformat.sh \
         int=t in=stdin.fq \
         out1={output.fwd_clean_dedup} \
         out2={output.rev_clean_dedup} \
-        threads={threads}) &>{log}
+        threads={threads} \
+        -Xmx{params.rf_mem_gb}g) &>{log}
         '''
 # After removal of adapters, human reads, and duplicates,
-# the reads' 3'end are quality trimmed with fastp
-# Notice that quality filtering is disabled because it was done by rule trim_adapters
+# the reads' 3' ends are quality trimmed (cut by quality score) with fastp
+# Notice that adapter trimming and quality filtering are disabled because they were already done by rule trim_adapters
 rule trim_3end:
     input:
         fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
@@ -254,17 +265,18 @@ rule trim_3end:
     shell:
         '''
         (fastp \
-        -Q \
-        --overrepresentation_analysis \
+        --disable_quality_filtering \
+        --disable_adapter_trimming \
         --length_required {params.min_length} \
         --cut_tail -W 4 -M 20 \
         --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
         --out1 {output.fwd_tr} --out2 {output.rev_tr} \
         --html {output.report1} --json {output.report2} \
-        --thread {threads}) &2>{log}
+        --thread {threads}) &>{log}
         '''
 ###**THESE TARGET FILES ARE THE FINAL CLEAN**###
-# concatenate adapter-trimmed, cleaned,deduplicated and quality-trimmed fastqs from the same samples
+# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and
+# quality-trimmed fastqs from the same samples
 rule concatenate_fastqs:
     input:
         #
@@ -280,7 +292,6 @@ rule concatenate_fastqs:
         sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
     wildcard_constraints:
         sample = '\w+'
-    group: 'preprocess'
     message: "Running concatenate_fastqs ..."
     shell:
         '''
@@ -295,7 +306,7 @@ rule check_concatenation:
     output:
         concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
                                    '{dataset}_concatenation_all.done'
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         import os
         from pathlib import Path
@@ -390,7 +401,7 @@ rule mqc_filter_human:
                 '/{dataset}/preQC/multiqc/bfiltered' +
                 '/{dataset}_bfiltered_stats.tsv',
                 category='preQC_step2:filter_human')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_filter_human_step
         mqc_stats_data = summarize_filter_human_step('{}'.format(input))
@@ -450,7 +461,7 @@ rule summarize_preQC:
                 '/{dataset}/preQC/summary_stats'+
                 '/{dataset}_preqc_samples_pct_barchart.svg',
                 category='preQC:summaries')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_preqc
         from utils import plot_preqc_summary
diff --git a/rules/utils.py b/rules/utils.py
index fa43fddc08f4f0387751a1119e8206fc4658cd92..072471831d929f9082c55c44251adb7af01752c2 100644
--- a/rules/utils.py
+++ b/rules/utils.py
@@ -161,7 +161,7 @@ def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
         # This uses the given y-units of bottom spacing for each bar
         # You may increase this number a bit to have more spacing between bars and text.
         bottom_spacing = top_lim/1.8
-        rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+        rgrids_positions = np.array([sample_run_lane['total_reads'].max()/i for i in [10,4,2,1]]) + bottom_spacing
         rgrids_positions_labels = [' {}M'.format(nr) for ix,nr in
                                    enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
     else:
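
For reference, a minimal Python sketch (not part of the patch) of what the new dedupe memory params and the attempt-scaled runtime resource evaluate to. The bbmap_mem value below is hypothetical; the real value comes from the pipeline's config.

# Sketch only: evaluates the new params/resources callables outside Snakemake.
config = {'preprocess': {'filter_human': {'bbmap_mem': 24}}}  # hypothetical GB value

bbmap_mem = config['preprocess']['filter_human']['bbmap_mem']
dd_mem_gb = (bbmap_mem // 3) * 2   # Java heap for dedupe.sh   -> 16
rf_mem_gb = bbmap_mem // 4         # Java heap for reformat.sh ->  6
# Integer division keeps the values whole, so the rendered -Xmx16g / -Xmx6g
# flags remain valid JVM arguments.

# Snakemake increments `attempt` on each retry (--retries/--restart-times),
# so the requested wall time grows linearly across attempts.
runtime = lambda wildcards, attempt: 120 * attempt   # minutes
print(dd_mem_gb, rf_mem_gb, [runtime(None, a) for a in (1, 2, 3)])
# 16 6 [120, 240, 360]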
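
Similarly, a small sketch of the changed rgrids_positions line in rules/utils.py: the radial grid lines are now placed at fixed fractions (1/10, 1/4, 1/2, 1/1) of the largest read count instead of at the min/median/max positions picked by describe()[[3,5,7]]. The read counts below are made up, and top_lim is assumed here to track the largest bar.

# Sketch only: reproduces the new radial-grid computation with made-up read counts.
import numpy as np
import pandas as pd

sample_run_lane = pd.DataFrame({'total_reads': [12_400_000, 8_900_000, 21_300_000]})
top_lim = sample_run_lane['total_reads'].max()   # assumption for this sketch
bottom_spacing = top_lim / 1.8

# Grid lines at 10%, 25%, 50% and 100% of the largest read count,
# shifted up by the same bottom_spacing the bars use.
rgrids_positions = np.array([sample_run_lane['total_reads'].max() / i
                             for i in [10, 4, 2, 1]]) + bottom_spacing
labels = [' {}M'.format(nr)
          for nr in np.round((rgrids_positions - bottom_spacing) / 1000000, decimals=3)]
print(labels)   # approximately [' 2.13M', ' 5.325M', ' 10.65M', ' 21.3M']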