Commit bc357ae8 authored by Ticlla Ccenhua Monica Roxana

update rules/preprocess.smk for better compatibility when running in cluster mode

parent 418e318e
@@ -20,7 +20,8 @@ localrules:
     mqc_trim_3end_list_files,
     mqc_trim_adapters,
     mqc_filter_human,
-    multiqc_trim_3end
+    multiqc_trim_3end,
+    summarize_preQC
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
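A note on this change: in cluster mode, rules listed under localrules are executed directly on the submission host rather than being dispatched as cluster jobs, which is the right place for quick aggregation and plotting steps such as summarize_preQC. A minimal sketch of the pattern, using names from this workflow:

    # Sketch: names listed under localrules run on the submission host,
    # in Snakemake's own process, instead of being queued on the cluster.
    localrules:
        multiqc_trim_3end,
        summarize_preQC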
@@ -77,7 +78,10 @@ def list_concatenated_r2_fastqs(wildcards):
 # With Fastp a quality check is performed and both paired fastq files
 # are processed as follows:
 #
-# - default fastp's quality filtering
+# - fastp's default quality filtering, by:
+#   - limiting the number of N bases (-n)
+#   - limiting the percentage of unqualified bases (-u)
+#   - requiring a minimum average quality score (-e; the default of 0 means no requirement)
 # - remove adapters: enabled by default, fastp detects adapters by
 #   per-read overlap analysis (it seeks the overlap of each read pair).
 #   If fastp fails to find an overlap, it uses the provided adapter
@@ -87,7 +91,7 @@ def list_concatenated_r2_fastqs(wildcards):
 # - base correction in overlapped regions
 # - trimming of the last base(s) of every read pair (this step precedes
 #   adapter removal)
-# - discard reads shorter than a minimum length, after trimming
+# - length filtering: discard reads shorter than a minimum length, after trimming
 #
 # WARNING: cutting by quality score is not done at this stage because it
 # interferes with the deduplication step (rule dedupe).
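For reference, the three default filters named in the new comment correspond to documented fastp options; the sketch below spells them out with their default values (the file names are illustrative, not this workflow's paths):

    # -n / --n_base_limit 5: discard a read with more than 5 N bases
    # -u / --unqualified_percent_limit 40: discard a read if more than 40%
    #      of its bases fall below the qualified-quality threshold
    # -e / --average_qual 0: 0 means no mean-quality requirement
    fastp -n 5 -u 40 -e 0 \
        --in1 sample-R1.fastq.gz --in2 sample-R2.fastq.gz \
        --out1 sample-R1.filtered.fastq.gz --out2 sample-R2.filtered.fastq.gz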
@@ -135,8 +139,8 @@ rule trim_adapters:
 rule filter_human:
     input:
         human_ref = BBMAP_REF_DIR,
-        fwd_tr = rules.trim_adapters.output.fwd_tr,
-        rev_tr = rules.trim_adapters.output.rev_tr
+        fwd_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz',
+        rev_tr = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz',
     output:
         fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
         rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
@@ -147,6 +151,9 @@ rule filter_human:
     params:
         low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
+    resources:
+        # in minutes
+        runtime = lambda wildcards, attempt: 120*attempt
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
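The new resources block is the retry-friendly way to request wall time on a cluster: Snakemake evaluates the lambda with the current attempt number, so each resubmission asks for more minutes. A sketch of how it plays out:

    # How Snakemake evaluates the new resource on successive attempts:
    runtime = lambda wildcards, attempt: 120 * attempt
    # attempt=1 -> 120 min, attempt=2 -> 240 min, attempt=3 -> 360 min

For this to kick in, retries must be enabled on the command line, e.g. snakemake --restart-times 2, so a job killed at its time limit is resubmitted with a larger request.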
@@ -192,7 +199,6 @@ rule index_human_ref:
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads: cpus_avail
     singularity: singularity_img
-    group: 'preprocess'
     message: "Running index_human_ref with {threads} cores."
     shell:
         '''
@@ -215,6 +221,9 @@ rule dedupe:
     log:
         OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
     threads: cpus_avail
+    params:
+        dd_mem_gb = (config['preprocess']['filter_human']['bbmap_mem']//3)*2,
+        rf_mem_gb = config['preprocess']['filter_human']['bbmap_mem']//4
     singularity: singularity_img
     group: 'preprocess'
     message: "Running dedupe with {threads} cores."
@@ -224,16 +233,18 @@ rule dedupe:
             in1={input.fwd_clean} in2={input.rev_clean} \
             out=stdout.fq \
             outd={output.fastq_duplicates} \
-            ac=f minidentity=99 | \
+            ac=f minidentity=99 \
+            -Xmx{params.dd_mem_gb}g | \
             reformat.sh \
             int=t in=stdin.fq \
             out1={output.fwd_clean_dedup} \
             out2={output.rev_clean_dedup} \
-            threads={threads}) &>{log}
+            threads={threads} \
+            -Xmx{params.rf_mem_gb}g) &>{log}
         '''
 # After removal of adapters, human reads, and duplicates,
-# the reads' 3'end are quality trimmed with fastp
-# Notice that quality filtering is disabled because it was done by rule trim_adapters
+# the reads' 3' ends are quality trimmed (cut by quality score) with fastp.
+# Note that adapter trimming and quality filtering are disabled here, because both were already done by rule trim_adapters.
 rule trim_3end:
     input:
         fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
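The two new params split the configured BBMap memory between the two JVMs that run concurrently in the dedupe pipe: roughly two thirds for dedupe.sh and one quarter for reformat.sh, leaving headroom so both fit inside the job's allocation. Since Python 3's / returns a float (an invalid value for Java's -Xmx flag), the params above use integer division. A worked example, assuming a hypothetical bbmap_mem of 48 in the config:

    # Illustrative value; the real one comes from
    # config['preprocess']['filter_human']['bbmap_mem'].
    bbmap_mem = 48
    dd_mem_gb = (bbmap_mem // 3) * 2  # 32 GB for the dedupe.sh JVM
    rf_mem_gb = bbmap_mem // 4        # 12 GB for the reformat.sh JVM
    assert dd_mem_gb + rf_mem_gb <= bbmap_mem  # both JVMs fit in the allocation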
@@ -254,17 +265,18 @@ rule trim_3end:
     shell:
         '''
         (fastp \
-        -Q \
-        --overrepresentation_analysis \
+        --disable_quality_filtering \
+        --disable_adapter_trimming \
         --length_required {params.min_length} \
         --cut_tail -W 4 -M 20 \
         --in1 {input.fwd_clean_dedup} --in2 {input.rev_clean_dedup} \
         --out1 {output.fwd_tr} --out2 {output.rev_tr} \
         --html {output.report1} --json {output.report2} \
-        --thread {threads}) &2>{log}
+        --thread {threads}) &>{log}
         '''
 ###**THESE TARGET FILES ARE THE FINAL CLEAN**###
-# concatenate adapter-trimmed, cleaned,deduplicated and quality-trimmed fastqs from the same samples
+# concatenate quality/length-filtered, adapter-trimmed, cleaned, deduplicated and
+# quality-trimmed fastqs from the same samples
 rule concatenate_fastqs:
     input:
         #
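In the new fastp call, --cut_tail -W 4 -M 20 slides a 4-base window in from the 3' end and trims while the window's mean quality is below 20. A loose Python sketch of the idea (an approximation, not fastp's exact implementation):

    def cut_tail(quals, window=4, mean_q=20):
        """Return how many bases survive fastp-style 3'-end quality trimming."""
        end = len(quals)
        while end >= window:
            if sum(quals[end - window:end]) / window >= mean_q:
                break  # the current window passes the threshold; stop trimming
            end -= 1   # drop the 3'-most base and slide the window inward
        return end

    # A read whose tail degrades gets shortened; a low base can survive
    # if the window containing it still averages >= 20.
    print(cut_tail([30, 30, 30, 30, 2, 2, 2, 2]))  # -> 5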
@@ -280,7 +292,6 @@ rule concatenate_fastqs:
         sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
     wildcard_constraints:
         sample = '\w+'
-    group: 'preprocess'
     message: "Running concatenate_fastqs ..."
     shell:
         '''
@@ -295,7 +306,7 @@ rule check_concatenation:
     output:
         concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
                                    '{dataset}_concatenation_all.done'
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         import os
         from pathlib import Path
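Commenting out the conda directives in these run:-based rules is consistent with Snakemake's behavior: conda: environments are not applied to run: blocks, which execute inside Snakemake's own Python process. If an environment were really needed, one alternative is to move the body into an external script; a hedged sketch, with a hypothetical script path and an illustrative output:

    rule check_concatenation:
        output:
            touch('summary_stats/concatenation_all.done')  # illustrative output
        conda: '../envs/rawQC.yaml'
        # script: runs inside the conda environment; run: blocks cannot.
        script: '../scripts/check_concatenation.py'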
@@ -390,7 +401,7 @@ rule mqc_filter_human:
                '/{dataset}/preQC/multiqc/bfiltered' +
                '/{dataset}_bfiltered_stats.tsv',
                category='preQC_step2:filter_human')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_filter_human_step
         mqc_stats_data = summarize_filter_human_step('{}'.format(input))
@@ -450,7 +461,7 @@ rule summarize_preQC:
                '/{dataset}/preQC/summary_stats'+
                '/{dataset}_preqc_samples_pct_barchart.svg',
                category='preQC:summaries')
-    conda:'../envs/rawQC.yaml'
+    #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_preqc
         from utils import plot_preqc_summary
...
@@ -161,7 +161,7 @@ def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
         # This uses the given y-units of bottom spacing for each bar.
         # You may increase this number a bit to have more spacing between bars and text.
         bottom_spacing = top_lim/1.8
-        rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+        rgrids_positions = np.array([sample_run_lane['total_reads'].max()/i for i in [10,4,2,1]]) + bottom_spacing
         rgrids_positions_labels = [' {}M'.format(nr)
                                    for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
     else:
...
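This change moves the radial grid lines from data-dependent positions (rows 3, 5 and 7 of describe(), i.e. the minimum, median and maximum) to fixed fractions (1/10, 1/4, 1/2 and all) of the largest read count, so the outer ring always lands on the tallest bar. Note that adding a scalar to a plain Python list raises a TypeError, hence the np.array(...) wrapper in the hunk above. A small self-contained check with illustrative numbers:

    import numpy as np

    # Hypothetical values; the real ones come from the preQC summary table.
    total_reads_max = 12_000_000
    bottom_spacing = 1_500_000
    rgrids_positions = np.array([total_reads_max / i for i in (10, 4, 2, 1)]) + bottom_spacing
    labels = [' {}M'.format(np.round((p - bottom_spacing) / 1e6, 3))
              for p in rgrids_positions]
    print(labels)  # [' 1.2M', ' 3.0M', ' 6.0M', ' 12.0M']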