diff --git a/Snakefile b/Snakefile
index b25709b8fc379b66b2f0427263ac14c1a3855a1a..fb7a30eece936a2ff38a17c96ec6221b713b7cec 100644
--- a/Snakefile
+++ b/Snakefile
@@ -10,9 +10,9 @@ import os
 ##----------------------------------------------------------------------------##
 from multiprocessing import cpu_count
 cpus_avail = cpu_count()
-print('CPUs available: {}'.format(cpus_avail))
+print('CPUs available: {}'.format(cpus_avail), file=sys.stderr)
 ##----------------------------------------------------------------------------##
-## Working directory
+## Re-set working directory
 ##----------------------------------------------------------------------------##
 # Set working directory
 workdir_path = os.path.dirname(os.path.abspath(__name__))
@@ -20,67 +20,67 @@ workflow_path = workflow.basedir
 if workdir_path == workflow_path:
     message = "Working directory was not specified!"+\
-              "MetagenomicSnake assumes ./data, relative to current directory, "+\
-              "as your working directory ..."
-    print(message)
+              " Then, MetagenomicSnake assumes ./data, relative to the current directory, "+\
+              "as your working directory ..."
+    print(message, file=sys.stderr)
     if os.path.exists(workflow_path+'/data'):
         workdir_path = workflow_path+'/data'
         workdir: workdir_path
-        print("Working directory:{}".format(workdir_path))
+        print("Working directory:{}".format(workdir_path), file=sys.stderr)
     else:
-        print("... Folder ./data not found in current directory...")
-        print("... instead, setting current directory as working directory ...")
-        print("Working directory:{}".format(workdir_path))
+        message = "... Folder ./data not found in current directory..."+\
+                  "... instead, setting current directory as working directory ..."+\
+                  "Working directory:{}".format(workdir_path)
+        print(message, file=sys.stderr)
 else:
-    print("Working directory:{}".format(workdir_path))
+    print("Working directory:{}".format(workdir_path), file=sys.stderr)
 ##----------------------------------------------------------------------------##
 ## Configuration of MetagenomicSnake
 ##----------------------------------------------------------------------------##
 try:
     configfile_path = config['configfile_path']
-    print("Configuration file: {}".format(configfile_path))
+    print("Configuration file: {}".format(configfile_path), file=sys.stderr)
 except:
-    print("Configuration file config.yaml not especified at execution!")
+    print("Configuration file config.yaml not specified at execution!", file=sys.stderr)
     try:
-        print("... Trying working directory ...")
+        print("... Trying working directory ...", file=sys.stderr)
         configfile_path = "config.yaml"
         configfile: configfile_path
-        print("... Configuration file: {}".format(configfile_path))
+        print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
     except:
-        print("... config.yaml not found in working directory ...")
-        print("... Loading default config.yaml provided with MetagenomicSnake...")
+        message = "... config.yaml not found in working directory ..."+\
+                  "... Loading default config.yaml provided with MetagenomicSnake..."
+        print(message, file=sys.stderr)
         configfile_path = workflow_path + "/config.yaml"
         configfile: configfile_path
-        print("... Configuration file: {}".format(configfile_path))
+        print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
 ##----------------------------------------------------------------------------##
 ## Define paths
 ##----------------------------------------------------------------------------##
-RAW_FASTQ_DIR = 'raw/fastq'
-RESULTS_DIR = config['PREFIX_DIR'] +'/results'
-LOGS_DIR = config['PREFIX_DIR'] +'/logs'
-REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
+RAW_FASTQ_DIR = config['RAW_DIR']
+OUT_DIR = config['OUT_DIR']
+#RESULTS_DIR = config['OUT_DIR'] +'/results'
+#LOGS_DIR = config['OUT_DIR'] +'/logs'
+#REPORTS_DIR = config['OUT_DIR'] +'/reports'
 
-RAW_QC_DIR = RESULTS_DIR + '/rawQC'
-RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
-
-PRE_PROC_DIR = RESULTS_DIR + '/preprocessed'
-PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed'
-
-#MERGE_DIR = RESULTS_DIR + '/merged'
-#MERGE_REPORT = REPORTS_DIR + '/merged'
 ##----------------------------------------------------------------------------##
 ## Fastq files to be processed
 ##----------------------------------------------------------------------------##
 if config['SAMPLE_UNITS']['auto']:
-    (DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz')
-    (DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz')
+    (DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{sample}-{run}_{lane}-R1.fastq.gz')
+    (DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{fastq_file}-R1.fastq.gz')
 else:
     # TODO:
     pass
-
+##----------------------------------------------------------------------------##
+## Local rules
+##----------------------------------------------------------------------------##
+localrules:
+    rawQC,
+    preQC
 ##----------------------------------------------------------------------------##
 ## Run entire workflow
 ##----------------------------------------------------------------------------##
@@ -95,21 +95,54 @@ rule all:
 ##----------------------------------------------------------------------------##
 rule rawQC:
     input:
-        expand(RAW_QC_REPORT + '/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_units_stats.txt', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
-        expand(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
-rule preprocess:
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
+
+rule preQC:
     input:
-        expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_summary.tsv', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_summary.tsv', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_barchart.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_barchart.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_pct_barchart.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_pct_barchart.svg', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_concatenation_all.done', dataset=set(DATASETS))
+
+##----------------------------------------------------------------------------##
+## Rules to make reports
+##----------------------------------------------------------------------------##
+rule rawQC_make_report:
+    output:
+        temp(touch(OUT_DIR+'/logs/rawQC_make_report.done'))
+    log:
+        OUT_DIR+'/logs/rawQC_make_report.log'
+    params:
+        report_path = OUT_DIR+'/rawQC_report.html',
+        workdir = workdir_path,
+        workflow_dir = workflow_path
+    shell:
+        '''
+        (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile rawQC) &>{log}
+        '''
 
-#rule merge_samples:
-#    input:
-#        expand(MERGE_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
+rule preQC_make_report:
+    output:
+        temp(touch(OUT_DIR+'/logs/preQC_make_report.done'))
+    log:
+        OUT_DIR+'/logs/preQC_make_report.log'
+    params:
+        report_path = OUT_DIR+'/preQC_report.html',
+        workdir = workdir_path,
+        workflow_dir = workflow_path
+    shell:
+        '''
+        (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile preQC) &>{log}
+        '''
 
 include: "rules/rawQC.smk"
 include: "rules/preprocess.smk"
-#include: "rules/merge_samples.smk"
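Note on the SAMPLE_UNITS auto-detection in the Snakefile above: glob_wildcards() now infers DATASETS, SAMPLES, RUNS and LANES from the new '{dataset}/fastq/{sample}-{run}_{lane}-R1.fastq.gz' layout. A rough stand-alone sketch of that decomposition (the regex is an approximation, since Snakemake wildcards default to the greedier '.+'; the path is hypothetical):

    import re

    # Approximation of the Snakefile's glob_wildcards() pattern
    unit_re = re.compile(r'(?P<dataset>[^/]+)/fastq/'
                         r'(?P<sample>[^-/]+)-(?P<run>[^_/]+)_(?P<lane>[^-/]+)-R1\.fastq\.gz$')

    m = unit_re.search('raw/DS01/fastq/sampleA-run1_L001-R1.fastq.gz')
    if m:
        print(m.group('dataset'), m.group('sample'), m.group('run'), m.group('lane'))
        # -> DS01 sampleA run1 L001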
diff --git a/config.yaml b/config.yaml
index 7f7cfc2cc7fc62cb03bc8610da3152b92e004356..ca6405ed84e68deaa17d87a8dff9ca32615ee89f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -2,8 +2,21 @@
 # In case of sample based data, it should be complemented by a samples.tsv file that contains
 # one row per sample. It can be parsed easily via pandas.
 
+# Set PATHS
+# WARNING:
+# MetagenomicSnake uses singularity containers to run the workflow.
+# Bind access to those paths if needed.
+# Singularity's --bind/-B option can be specified multiple times,
+# or a comma-delimited string of bind path specifications can be used.
+#
+# PATH to config file for SnakeMake
+# configfile_path: ''
+# Absolute PATH to folder where to find fastq files
+RAW_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/raw'
+# Absolute PATH to folder where to place output files
+OUT_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/interim/MetaSnk'
 # PATH to folder where MetagenomicSnake will store results
-PREFIX_DIR: 'MetagenomicSnake_results'
+# PREFIX: 'MetaSnk'
 
 # SAMPLE_UNITS:
@@ -31,10 +44,10 @@ preprocess:
 
   # PATH to reference(e.g human) genome
   # -----------------------------------
-  # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from
+  # Masked hg19 kindly provided by Brian Bushnell and
+  # manually downloaded from
   # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
   # How the human genome was masked is explained in this SEQanswers entry
   # http://seqanswers.com/forums/archive/index.php/t-42552.html
-  bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
-  #bbmap_reference: 'external/reference_test_2.fa.gz'
-  bbmap_ref_dir: 'external/ref_indexed'
+  bbmap_reference: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
+  bbmap_ref_dir: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/ref_bbsplit'
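Since RAW_DIR and OUT_DIR are now expected to be absolute paths that must also be bind-mounted into the Singularity containers, a small pre-flight check along these lines can catch misconfiguration early. This is a hypothetical helper, not part of the patch, and assumes PyYAML is available:

    import os
    import yaml  # PyYAML, assumed installed

    with open('config.yaml') as fh:
        cfg = yaml.safe_load(fh)

    for key in ('RAW_DIR', 'OUT_DIR'):
        path = cfg.get(key)
        # Both keys should hold absolute paths to existing directories
        if not (path and os.path.isabs(path) and os.path.isdir(path)):
            raise ValueError('{} must be an absolute path to an existing '
                             'directory, got {!r}'.format(key, path))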
diff --git a/report/MetagenomicSnake.rst b/report/MetagenomicSnake.rst
index 07a341e0fe856c5899b983bbe6d7737fa8e0143d..bbdbb3e86c1b495f4e2128dbf9e5339b43c53bb1 100644
--- a/report/MetagenomicSnake.rst
+++ b/report/MetagenomicSnake.rst
@@ -1,4 +1,4 @@
-Workflow version 0.1
+Workflow version 0.2
 
 MetagenomicSnake
 ================
@@ -12,21 +12,22 @@ Modules
 **rawQC**
     runs FastQC on a random sample of R1 reads from the paired fastq-format files.
 
-**preprocess**
-    FastQC only performs a quality check but no QC processing is done. The **preprocess**
+**preQC**
+    FastQC only performs a quality check but no QC processing is done. The **preQC**
     rule runs a multi-step pre-processing of the paired fastq files, includes:
 
-    - Adapter-trimming with "fastp"
-
-
-
-    - with Fastp a quality check is performed and both paired fastq files are processed as follows:
+    - **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check
+      and both paired fastq files are processed as follows\:
+
+      remove adapters: here we provide the Nextera XT adapters,
+      base correction in overlapped regions
+      trimming of the last base in read 1
+      discard reads shorter than a minimum length, after trimming
+      a report with quality check, before and after processing
 
-    - removal of reads derived from human DNA
-    - removal of duplicated reads
+    - **filter_human**: removal of reads derived from human DNA with BBTools' bbsplit_
+    - **dedupe**: removal of duplicated reads with BBTools' dedupe
+    - **trim_3end**: 3\'-end quality trimming with "fastp"
+    - **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
+    - **summarize_preQC**: creates summarizing tables and plots
 
-**merge_samples**
-    merges fastq files corresponding to the same sample into a single pair of fastq files, after pre-processing of paired fastq files.
+.. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288
\ No newline at end of file
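The renamed preQC module documented above maps onto the lettered output folders used by rules/preprocess.smk below (atrimmed, bfiltered, cdedupe, dfinaltrim, emerged), where the letters encode the step order. A sketch of the per-unit file flow, with a hypothetical unit name:

    # One sequencing unit (sample-run_lane) walks through the preQC steps as:
    unit, sample = 'sampleA-run1_L001', 'sampleA'
    flow = [
        ('atrimmed',   unit + '-R1.fastp.fastq.gz'),             # trim_adapters (fastp)
        ('bfiltered',  unit + '-R1.clean.fastq.gz'),             # filter_human (bbsplit)
        ('cdedupe',    unit + '-R1.clean.nodup.fastq.gz'),       # dedupe
        ('dfinaltrim', unit + '-R1.clean.nodup.fastp.fastq.gz'), # trim_3end (fastp)
        ('emerged',    sample + '-R1.fastq.gz'),                 # concatenate_fastqs
    ]
    for step_dir, fastq in flow:
        print('OUT_DIR/DS01/preQC/{}/{}'.format(step_dir, fastq))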
diff --git a/rules/preprocess.smk b/rules/preprocess.smk
index 4efe2c384db30f33b3d9a47349e31766660066cc..b52bbf408674fe7e585d03c2851c7f0baae299b1 100644
--- a/rules/preprocess.smk
+++ b/rules/preprocess.smk
@@ -4,9 +4,23 @@ Afiliation(s): SIB, SwissTPH, UNIBAS
 Description: rules for pre-processing of paired-end shotgun DNA metagenomic sequencing.
 '''
+
+##----------------------------------------------------------------------------##
+## Import modules
+##----------------------------------------------------------------------------##
+#from utils import summarize_filter_human_step
+
+##----------------------------------------------------------------------------##
+## Local rules
+##----------------------------------------------------------------------------##
 localrules:
-    multiqc_preprocess_list_files,
-    multiqc_preprocess
+    check_concatenation,
+    mqc_trim_adapters_list_files,
+    mqc_filter_human_list_files,
+    mqc_trim_3end_list_files,
+    mqc_trim_adapters,
+    mqc_filter_human,
+    multiqc_trim_3end
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
@@ -14,6 +28,47 @@ singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
 BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
 BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
 
+##----------------------------------------------------------------------------##
+## Helper functions
+##----------------------------------------------------------------------------##
+# Helper functions to create list of files for MultiQC
+
+def mqc_trim_adapters_inputs(wildcards):
+    list_files = ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR,
+                                                              value,
+                                                              FASTQS[ix])
+                  for ix,value in enumerate(DATASETSX)
+                  if value==wildcards.dataset]
+    return(list_files)
+
+def mqc_filter_human_inputs(wildcards):
+    list_files = ['{}/{}/preQC/bfiltered/{}.clean.stats'.format(OUT_DIR,
+                                                                value,
+                                                                FASTQS[ix])
+                  for ix,value in enumerate(DATASETSX)
+                  if value==wildcards.dataset]
+    return(list_files)
+
+def mqc_trim_3end_inputs(wildcards):
+    list_files = ['{}/{}/preQC/dfinaltrim/{}.clean.nodup.fastp.json'.format(OUT_DIR,
+                                                                            value,
+                                                                            FASTQS[ix])
+                  for ix,value in enumerate(DATASETSX)
+                  if value==wildcards.dataset]
+    return(list_files)
+
+def list_concatenated_r1_fastqs(wildcards):
+    list_r1 = ['{}/{}/preQC/emerged/{}-R1.fastq.gz'.format(OUT_DIR,
+                                                           wildcards.dataset,
+                                                           value)
+               for ix,value in enumerate(SAMPLES) if DATASETS[ix]==wildcards.dataset]
+    return(list_r1)
+def list_concatenated_r2_fastqs(wildcards):
+    list_r2 = ['{}/{}/preQC/emerged/{}-R2.fastq.gz'.format(OUT_DIR,
+                                                           wildcards.dataset,
+                                                           value)
+               for ix,value in enumerate(SAMPLES) if DATASETS[ix]==wildcards.dataset]
+    return(list_r2)
 ##----------------------------------------------------------------------------##
 ## Rules with target files
 ##----------------------------------------------------------------------------##
@@ -21,45 +76,47 @@ BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
 # FastQC only performs a quality check but no QC processing is done.
 # With Fastp a quality check is performed and both paired fastq files
 # are processed as follows:
+#
 # - default fastp's quality filtering
-# - remove adapters: enabled by default, fastp detects adapters by per-read overlap
-#   analysis (seeks for the overlap of each read pair). If fastp fails to find an
-#   overlap, here we provide the adapter sequences (e.g Nextera XT adapters).
+# - remove adapters: enabled by default, fastp detects adapters by
+#   per-read overlap analysis (seeks for the overlap of each read pair).
+#   If fastp fails to find an overlap, it uses the provided adapter
+#   sequences (e.g Nextera XT adapters).
 # Check this page from Illumina to know which adapters to remove:
 # https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
 # - base correction in overlapped regions
-# - trimming of the last base(s) for every read pair( this step preceeds adapter removal)
+# - trimming of the last base(s) for every read pair (this step precedes
+#   adapter removal)
 # - discard reads shorter than a minimum length, after trimming
-# WARNING: cutting by quality score is not done at this stage because it interferes with
-# deduplication step
-# It provides a report with quality check, before and after processing
+#
+# WARNING: cutting by quality score is not done at this stage because it
+# interferes with the deduplication step (rule dedupe).
+#
+# Fastp provides a report with quality check, before and after processing
 rule trim_adapters:
     input:
-        fwd = lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
+        fwd = lambda wildcards: ['{}/{}/fastq/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                                  for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
-        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz',
-        rev = lambda wildcards: ['{}/{}/{}-R2.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
-                                 for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file],
-        #RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R2.fastq.gz'
+        rev = lambda wildcards: ['{}/{}/fastq/{}-R2.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
+                                 for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file]
     output:
-        fwd_tr = temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz'),
-        rev_tr = temp(PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'),
-        report1 = PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.html',
-        report2 = PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}.fastp.json'
+        fwd_tr = temp(OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R1.fastp.fastq.gz'),
+        rev_tr = temp(OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}-R2.fastp.fastq.gz'),
+        report1 = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}.fastp.html',
+        report2 = OUT_DIR+'/{dataset}/preQC/atrimmed/{fastq_file}.fastp.json'
     log:
-        LOGS_DIR + '/preprocess/{dataset}_atrimmed/{fastq_file}.log'
+        OUT_DIR + '/{dataset}/preQC/logs/atrimmed/{fastq_file}.log'
     params:
-        fastp_dir = PRE_PROC_DIR,
+        #fastp_dir = PRE_PROC_DIR,
         adapter = config['preprocess']['trim_adapters']['adapter'],
         min_length = config['preprocess']['trim_adapters']['min_length'],
         trim_tails = config['preprocess']['trim_adapters']['trim_tails']
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
+    message: "Running trim_adapters with {threads} cores."
     shell:
         '''
-        echo "Running trim_adapters with {threads} cores."
-        [ ! -d {params.fastp_dir} ] && mkdir {params.fastp_dir}
         (fastp \
         --adapter_sequence {params.adapter} \
         --adapter_sequence_r2 {params.adapter} \
@@ -80,26 +137,23 @@ rule filter_human:
         human_ref = BBMAP_REF_DIR,
         fwd_tr = rules.trim_adapters.output.fwd_tr,
         rev_tr = rules.trim_adapters.output.rev_tr
-        #fwd_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R1.fastp.fastq.gz',
-        #rev_tr=PRE_PROC_DIR+'/{dataset}_atrimmed/{fastq_file}-R2.fastp.fastq.gz'
     output:
-        fwd_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz'),
-        rev_clean = temp(PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz'),
-        bbmap_scafstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.scafstats',
-        bbmap_refstats = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}.clean.stats'
+        fwd_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz'),
+        rev_clean = temp(OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz'),
+        bbmap_scafstats = OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}.clean.scafstats',
+        bbmap_refstats = OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}.clean.stats'
     log:
-        LOGS_DIR + '/preprocess/{dataset}_bfiltered/{fastq_file}.log'
+        OUT_DIR + '/{dataset}/preQC/logs/bfiltered/{fastq_file}.log'
     params:
         low_mem = config['preprocess']['filter_human']['bbmap_usemodulo'],
         mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads:cpus_avail
     singularity: singularity_img
     group: 'preprocess'
+    message: "Running filter_human with {threads} cores."
     shell:
         '''
-        echo "Running filter_human with {threads} cores."
-        [ ! -d {PRE_PROC_DIR} ] && mkdir {PRE_PROC_DIR}
-        (bbmap.sh \
+        (bbsplit.sh \
         usemodulo={params.low_mem} \
         minid=0.95 \
         maxindel=3 \
@@ -116,7 +170,7 @@ rule filter_human:
         outu1={output.fwd_clean} outu2={output.rev_clean} \
         sortscafs=t \
         scafstats={output.bbmap_scafstats} \
-        statsfile={output.bbmap_refstats} \
+        refstats={output.bbmap_refstats} \
         unpigz=t pigz=t -Xmx{params.mem_gb}g \
         threads={threads}) &>{log}
         '''
@@ -125,22 +179,25 @@ rule filter_human:
 
 #
 rule index_human_ref:
     input:
+        # Masked hg19 kindly provided by Brian Bushnell and
+        # manually downloaded from
+        # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
         reference = BBMAP_REF
     output:
         directory(BBMAP_REF_DIR)
     log:
-        LOGS_DIR + '/preprocess/ref_indexing.log'
+        OUT_DIR + '/logs/ref_indexing.log'
    params:
-        usemodulo=config['preprocess']['filter_human']['bbmap_usemodulo']
+        usemodulo=config['preprocess']['filter_human']['bbmap_usemodulo'],
+        mem_gb = config['preprocess']['filter_human']['bbmap_mem']
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
+    message: "Running index_human_ref with {threads} cores."
     shell:
         '''
-        echo "Running index_human_ref with {threads} cores."
-        # Masked hg19 kindly provided by Brian Brushnell and manually dowloaded from
-        # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
-        (bbmap.sh \
+        (bbsplit.sh \
+        -Xmx{params.mem_gb}g \
         ref={input.reference} \
         usemodulo={params.usemodulo} \
         path={output} \
@@ -149,17 +206,18 @@ rule index_human_ref:
 
 # Deduplication step with BBTool dedupe.sh
 rule dedupe:
     input:
-        fwd_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R1.clean.fastq.gz',
-        rev_clean = PRE_PROC_DIR+'/{dataset}_bfiltered/{fastq_file}-R2.clean.fastq.gz',
+        fwd_clean = OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R1.clean.fastq.gz',
+        rev_clean = OUT_DIR+'/{dataset}/preQC/bfiltered/{fastq_file}-R2.clean.fastq.gz',
     output:
-        fwd_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz'),
-        rev_clean_dedup = temp(PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'),
-        fastq_duplicates = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}.clean.duplicates.fastq.gz'
+        fwd_clean_dedup = temp(OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz'),
+        rev_clean_dedup = temp(OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'),
+        fastq_duplicates = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}.clean.duplicates.fastq.gz'
     log:
-        LOGS_DIR + '/preprocess/{dataset}_cdedupe/{fastq_file}.log'
+        OUT_DIR + '/{dataset}/preQC/logs/cdedupe/{fastq_file}.log'
     threads:cpus_avail
     singularity: singularity_img
     group: 'preprocess'
+    message: "Running dedupe with {threads} cores."
     shell:
         '''
         (dedupe.sh \
@@ -176,21 +234,23 @@ rule dedupe:
 
 # After removal of adapters, human reads, and duplicates,
 # the reads' 3'end are quality trimmed with fastp
 # Notice that quality filtering is disabled because it was done by rule trim_adapters
-rule trim3end_dedupe:
+rule trim_3end:
     input:
-        fwd_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
-        rev_clean_dedup = PRE_PROC_DIR+'/{dataset}_cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
+        fwd_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R1.clean.nodup.fastq.gz',
+        rev_clean_dedup = OUT_DIR+'/{dataset}/preQC/cdedupe/{fastq_file}-R2.clean.nodup.fastq.gz'
     params:
-        fastp_dir = PRE_PROC_DIR,
         min_length = config['preprocess']['trim_adapters']['min_length']
     output:
-        fwd_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz',
-        rev_tr = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz',
-        report1 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
-        report2 = PRE_PROC_DIR+'/{dataset}_dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
+        fwd_tr = temp(OUT_DIR+'/{dataset}/preQC/dfinaltrim/{fastq_file}-R1.clean.nodup.fastp.fastq.gz'),
+        rev_tr = temp(OUT_DIR+'/{dataset}/preQC/dfinaltrim/{fastq_file}-R2.clean.nodup.fastp.fastq.gz'),
+        report1 = OUT_DIR+'/{dataset}/preQC/dfinaltrim/{fastq_file}.clean.nodup.fastp.html',
+        report2 = OUT_DIR+'/{dataset}/preQC/dfinaltrim/{fastq_file}.clean.nodup.fastp.json'
+    log:
+        OUT_DIR + '/{dataset}/preQC/logs/dfinaltrim/{fastq_file}.log'
     threads: cpus_avail
     singularity: singularity_img
     group: 'preprocess'
+    message: "Running trim_3end with {threads} cores ..."
     shell:
         '''
         (fastp \
@@ -203,38 +263,234 @@ rule trim_3end:
             --html {output.report1} --json {output.report2} \
-            --thread {threads}) &2>{log}
+            --thread {threads}) &> {log}
         '''
-
-rule multiqc_preprocess_list_files:
+###**THESE TARGET FILES ARE THE FINAL CLEAN**###
+# concatenate adapter-trimmed, cleaned, deduplicated and quality-trimmed fastqs from the same samples
+rule concatenate_fastqs:
     input:
         #
-        #atrimmed = lambda wildcards: ['{}/{}_atrimmed/{}.fastp.json'.format(
-        #PRE_PROC_DIR, value, FASTQS[ix])
-        #for ix,value in enumerate(DATASETSX) if value==wildcards.dataset],
+        sample_fwds = lambda wildcards: ['{}/{}/preQC/dfinaltrim/{}-{}_{}-R1.clean.nodup.fastp.fastq.gz'.format(
+                                         OUT_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
+                                         for ix,value in enumerate(SAMPLES) if (value == wildcards.sample) and (DATASETS[ix] == wildcards.dataset)],
         #
-        dfinaltrim = lambda wildcards: ['{}/{}_dfinaltrim/{}.clean.nodup.fastp.json'.format(
-                                        PRE_PROC_DIR, value, FASTQS[ix])
-                                        for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
+        sample_revs = lambda wildcards: ['{}/{}/preQC/dfinaltrim/{}-{}_{}-R2.clean.nodup.fastp.fastq.gz'.format(
+                                         OUT_DIR, DATASETS[ix], value, RUNS[ix], LANES[ix])
+                                         for ix,value in enumerate(SAMPLES) if (value==wildcards.sample) and (DATASETS[ix]==wildcards.dataset)]
+    output:
+        sample_fwd = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R1.fastq.gz'),
+        sample_rev = protected(OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz')
+    wildcard_constraints:
+        sample = r'\w+'
+    group: 'preprocess'
+    message: "Running concatenate_fastqs ..."
+    shell:
+        '''
+        cat {input.sample_fwds} > {output.sample_fwd}
+        cat {input.sample_revs} > {output.sample_rev}
+        '''
+# Check that for each sample, fastq files were concatenated
+rule check_concatenation:
+    input:
+        inputs_r1 = list_concatenated_r1_fastqs,
+        inputs_r2 = list_concatenated_r2_fastqs
+    output:
+        concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
+                                   '{dataset}_concatenation_all.done'
+    run:
+        import os
+        from pathlib import Path
+
+        try:
+            sample_fastqs_exist = [os.path.exists(r1_file) and os.path.exists(r2_file)
+                                   for r1_file, r2_file in zip(input.inputs_r1, input.inputs_r2)]
+            if all(sample_fastqs_exist):
+                Path(output.concatenated_fastqs_list).touch()
+        except (TypeError, ValueError) as e:
+            print(e)
+
+# MultiQC/multi-file reports
+#----------------------------
+
+# List files to include in MultiQC/multi-file reports
+
+#
+rule mqc_trim_adapters_list_files:
+    input:
+        #
+        inputs_list = mqc_trim_adapters_inputs
+    output:
+        multiqc_inputs_file = OUT_DIR +'/{dataset}/preQC/multiqc/atrimmed/multiqc_inputs.txt'
+    run:
+        import os
+        try:
+            os.makedirs(os.path.dirname(output.multiqc_inputs_file))
+        except OSError:
+            pass
+
+        with open(output.multiqc_inputs_file, mode='w', encoding='utf-8') as out:
+            for item in input.inputs_list:
+                out.write("%s\n" % item)
+#
+rule mqc_filter_human_list_files:
+    input:
+        inputs_list = mqc_filter_human_inputs
+    output:
+        multiqc_inputs_file = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered/multiqc_inputs.txt'
+    run:
+        import os
+        try:
+            os.makedirs(os.path.dirname(output.multiqc_inputs_file))
+        except OSError:
+            pass
+
+        with open(output.multiqc_inputs_file, mode='w', encoding='utf-8') as out:
+            for item in input.inputs_list:
+                out.write("%s\n" % item)
+#
+rule mqc_trim_3end_list_files:
+    input:
+        inputs_list = mqc_trim_3end_inputs
     output:
-        multiqc_input_list = PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
+        multiqc_inputs_file = OUT_DIR + '/{dataset}/preQC/multiqc/dfinaltrim/multiqc_inputs.txt'
     run:
         import os
         try:
-            os.makedirs(os.path.dirname(output.multiqc_input_list))
+            os.makedirs(os.path.dirname(output.multiqc_inputs_file))
         except OSError:
             pass
 
-        with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
-            for item in input.dfinaltrim:
+        with open(output.multiqc_inputs_file, mode='w', encoding='utf-8') as out:
+            for item in input.inputs_list:
                 out.write("%s\n" % item)
 
-rule multiqc_preprocess:
+# Run MultiQC on all pre-processed reports
+#
+rule mqc_trim_adapters:
     input:
-        PRE_PROC_REPORT+'/{dataset}_multiqc_inputs.txt'
+        OUT_DIR +'/{dataset}/preQC/multiqc/atrimmed/multiqc_inputs.txt'
     output:
-        multiqc_report = report(PRE_PROC_REPORT+'/{dataset}_multiqc.html',
-                                category='preprocess')
+        multiqc_data_dir = OUT_DIR +
+                           '/{dataset}/preQC/multiqc/atrimmed'+
+                           '/{dataset}_atrimmed_mqc_data/multiqc_data.json',
+        multiqc_report = report(OUT_DIR +
+                                '/{dataset}/preQC/multiqc/atrimmed' +
+                                '/{dataset}_atrimmed_mqc.html',
+                                category='preQC_step1:trim_adapters')
     singularity: singularity_img
     shell:
         '''
         multiqc -f --file-list {input} --filename {output.multiqc_report}
         '''
+#
+rule mqc_filter_human:
+    input:
+        OUT_DIR +'/{dataset}/preQC/multiqc/bfiltered/multiqc_inputs.txt'
+    output:
+        mqc_report = report(OUT_DIR +
+                            '/{dataset}/preQC/multiqc/bfiltered' +
+                            '/{dataset}_bfiltered_stats.tsv',
+                            category='preQC_step2:filter_human')
+    run:
+        from utils import summarize_filter_human_step
+        mqc_stats_data = summarize_filter_human_step('{}'.format(input))
+        mqc_stats_data.to_csv(output.mqc_report, sep='\t', header=True,
+                              index=True, index_label='Unit')
+#
+rule multiqc_trim_3end:
+    input:
+        OUT_DIR+'/{dataset}/preQC/multiqc/dfinaltrim/multiqc_inputs.txt'
+    output:
+        multiqc_fastqc = OUT_DIR +
+                         '/{dataset}/preQC/multiqc/dfinaltrim'+
+                         '/{dataset}_dfinaltrim_mqc_data/multiqc_data.json',
+        multiqc_report = report(OUT_DIR +
+                                '/{dataset}/preQC/multiqc/dfinaltrim'+
+                                '/{dataset}_dfinaltrim_mqc.html',
+                                category='preQC_step4:trim_3end')
+    singularity: singularity_img
+    shell:
+        '''
+        multiqc -f --file-list {input} --filename {output.multiqc_report}
+        '''
+#
+# Create summarizing tables and plots
+rule summarize_preQC:
+    input:
+        trim_adapters_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/atrimmed'+
+                                 '/{dataset}_atrimmed_mqc_data/multiqc_data.json',
+        filter_human_mqc_stats = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered' +
+                                 '/{dataset}_bfiltered_stats.tsv',
+        trim3end_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/dfinaltrim'+
+                            '/{dataset}_dfinaltrim_mqc_data/multiqc_data.json'
+    log:
+        OUT_DIR + '/{dataset}/preQC/logs/summarize_preqc.log'
+    output:
+        units_summary = report(OUT_DIR +
+                               '/{dataset}/preQC/summary_stats' +
+                               '/{dataset}_preqc_units_summary.tsv',
+                               category='preQC:summaries'),
+        samples_summary = report(OUT_DIR +
+                                 '/{dataset}/preQC/summary_stats' +
+                                 '/{dataset}_preqc_samples_summary.tsv',
+                                 category='preQC:summaries'),
+        units_summary_plot = report(OUT_DIR +
+                                    '/{dataset}/preQC/summary_stats' +
+                                    '/{dataset}_preqc_units_barchart.svg',
+                                    category='preQC:summaries'),
+        samples_summary_plot = report(OUT_DIR +
+                                      '/{dataset}/preQC/summary_stats' +
+                                      '/{dataset}_preqc_samples_barchart.svg',
+                                      category='preQC:summaries'),
+        units_summary_pct_plot = report(OUT_DIR +
+                                        '/{dataset}/preQC/summary_stats' +
+                                        '/{dataset}_preqc_units_pct_barchart.svg',
+                                        category='preQC:summaries'),
+        samples_summary_pct_plot = report(OUT_DIR +
+                                          '/{dataset}/preQC/summary_stats'+
+                                          '/{dataset}_preqc_samples_pct_barchart.svg',
+                                          category='preQC:summaries')
+    run:
+        from utils import summarize_preqc
+        from utils import plot_preqc_summary
+        import sys
+
+        sys.stdout = open(log[0], 'w')
+
+        try:
+            trim_adapters_mqc_json = '{}'.format(input.trim_adapters_mqc_json)
+            filter_human_mqc_stats = '{}'.format(input.filter_human_mqc_stats)
+            trim3end_mqc_json = '{}'.format(input.trim3end_mqc_json)
+            # units summary
+            units_summary_df = summarize_preqc(trim_adapters_mqc_json, filter_human_mqc_stats,
+                                               trim3end_mqc_json)
+        except (TypeError, ValueError) as e:
+            print(e)
+        else:
+            units_summary_df.to_csv(output.units_summary, sep='\t', header=True, index=True,
+                                    index_label='Unit')
+            # samples summary
+            samples_summary_df = units_summary_df.groupby(['Sample']).agg('sum')
+            pct_columns = ['trim_adapters_pct_disc', 'filter_human_pct_disc', 'dedupe_pct_disc',
+                           'trim_3end_pct_disc', 'total_pct_disc', 'total_pct_kept']
+            samples_summary_df[pct_columns] = units_summary_df[['Sample']+pct_columns].groupby(['Sample']).agg('mean')
+            samples_summary_df.to_csv(output.samples_summary, sep='\t', header=True, index=True,
+                                      index_label='Sample')
+            # radial barcharts
+            #
+            fig1 = plot_preqc_summary(units_summary_df, by='units', plot_type='raw')
+            fig1.savefig(output.units_summary_plot, bbox_inches='tight', format='svg')
+            #
+            fig2 = plot_preqc_summary(units_summary_df, by='units', plot_type='pct')
+            fig2.savefig(output.units_summary_pct_plot, bbox_inches='tight', format='svg')
+            #
+            fig3 = plot_preqc_summary(units_summary_df, by='samples', plot_type='raw')
+            fig3.savefig(output.samples_summary_plot, bbox_inches='tight', format='svg')
+            #
+            fig4 = plot_preqc_summary(units_summary_df, by='samples', plot_type='pct')
+            fig4.savefig(output.samples_summary_pct_plot, bbox_inches='tight', format='svg')
+        finally:
+            sys.stdout.close()
+
\ No newline at end of file
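A toy illustration of how the helper functions at the top of rules/preprocess.smk resolve per-dataset input lists (the wildcard lists are made up; in the workflow they come from glob_wildcards and wildcards.dataset):

    OUT_DIR = '/out'
    DATASETSX = ['DS01', 'DS01', 'DS02']
    FASTQS = ['s1-run1_L001', 's2-run1_L001', 's3-run1_L001']

    def mqc_trim_adapters_inputs(dataset):
        # Same list comprehension as the Snakemake helper, minus the wildcards object
        return ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR, value, FASTQS[ix])
                for ix, value in enumerate(DATASETSX) if value == dataset]

    print(mqc_trim_adapters_inputs('DS01'))
    # ['/out/DS01/preQC/atrimmed/s1-run1_L001.fastp.json',
    #  '/out/DS01/preQC/atrimmed/s2-run1_L001.fastp.json']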
diff --git a/rules/rawQC.smk b/rules/rawQC.smk
index d064965d3c6b9beee41b8cdce767de9167243aa9..3ba8d34e0481b713a0b4a2f091fa9e0de420a5ae 100644
--- a/rules/rawQC.smk
+++ b/rules/rawQC.smk
@@ -4,6 +4,7 @@ Afiliation(s): SIB, SwissTPH, UNIBAS
 Description: rules for QC and pre-processing of paired-end shotgun DNA metagenomic sequencing.
 '''
+
 localrules:
     multiqc_raw_list_files,
     multiqc_raw
@@ -20,17 +21,16 @@ singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
 # for R1. 'R1' was removed from names in html and zip outputs
 rule fastqc_raw:
     input:
-        #'data/raw/fastq/{dataset}/{fastq_file}-R1.fastq.gz'
-        lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
+        lambda wildcards: ['{}/{}/fastq/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
                            for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file]
     params:
-        fastqc_dir = RAW_QC_DIR,
+        fastqc_dir = OUT_DIR,
         samplerate = config['rawQC']['samplerate']
     log:
-        LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log'
+        OUT_DIR+'/{dataset}/rawQC/logs/{fastq_file}_fastqc.log'
     output:
-        fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html',
-        fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip'
+        fastqc_html = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.html',
+        fastqc_zip = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.zip'
     threads: cpus_avail
     singularity: singularity_img
     message:
@@ -39,7 +39,7 @@
         '''
         # Take a random sample of reads and process them with fastQC
         (reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \
-        fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \
+        fastqc -o {params.fastqc_dir}/{wildcards.dataset}/rawQC/fastqc -f fastq \
         -t {threads} stdin:{wildcards.fastq_file}) 2> {log}
         '''
@@ -47,10 +47,11 @@
 rule multiqc_raw_list_files:
     input:
         #all fastqc zip files per dataset
-        fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, value, FASTQS[ix])
+        fastqcs=lambda wildcards: ['{}/{}/rawQC/fastqc/{}_fastqc.zip'.format(OUT_DIR, value, FASTQS[ix])
                                    for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
     output:
-        multiqc_input_list = RAW_QC_REPORT+'/{dataset}_multiqc_inputs.txt'
+        #multiqc_input_list = RAW_QC_REPORT+'/{dataset}/multiqc_inputs.txt'
+        multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
     run:
         import os
         try:
@@ -65,11 +66,14 @@
 # Run MultiQC on all FastQC reports
 rule multiqc_raw:
     input:
-        RAW_QC_REPORT + '/{dataset}_multiqc_inputs.txt'
+        multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
+    params:
+        multiqc_dir = OUT_DIR+'/{dataset}/rawQC/multiqc'
     output:
-        multiqc_fastqc = RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt',
-        multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html',
-                                category='rawQC', caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
+        multiqc_fastqc = OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt',
+        multiqc_report = report(OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc.html',
+                                category='RawQC',
+                                caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
     singularity:singularity_img
     shell:
         '''
@@ -78,23 +82,31 @@
 # Creates summarizing tables and plots
 rule summarize_raw:
     input:
-        RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt'
-    log:LOGS_DIR+'/raw_qc/{dataset}_summarize_raw.log'
+        OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt'
+    log:
+        OUT_DIR+'/{dataset}/rawQC/logs/summarize_raw.log'
     output:
-        units_stats = report(RAW_QC_REPORT + '/{dataset}_units_stats.txt',
-                             category='rawQC', caption='../report/rawQC/rawQC_units_stats.rst'),
-        samples_stats = report(RAW_QC_REPORT + '/{dataset}_samples_stats.txt',
-                               category='rawQC', caption='../report/rawQC/rawQC_samples_stats.rst'),
-        run_lane_stats = report(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt',
-                                category='rawQC', caption='../report/rawQC/rawQC_run_lane_stats.rst'),
-        units_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg',
-                                       category='rawQC', caption='../report/rawQC/rawQC_units_read_dist.rst'),
-        samples_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg',
-                                         category='rawQC', caption='../report/rawQC/rawQC_samples_read_dist.rst'),
-        samples_read_pairs_dispersion_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg',
-                                                    category='rawQC', caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
-        fastqc_tests_status_plot = report(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg',
-                                          category='rawQC', caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
+        units_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt',
+                             category='RawQC',
+                             caption='../report/rawQC/rawQC_units_stats.rst'),
+        samples_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt',
+                               category='RawQC',
+                               caption='../report/rawQC/rawQC_samples_stats.rst'),
+        run_lane_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt',
+                                category='RawQC',
+                                caption='../report/rawQC/rawQC_run_lane_stats.rst'),
+        units_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg',
+                                       category='RawQC',
+                                       caption='../report/rawQC/rawQC_units_read_dist.rst'),
+        samples_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg',
+                                         category='RawQC',
+                                         caption='../report/rawQC/rawQC_samples_read_dist.rst'),
+        samples_read_pairs_dispersion_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg',
+                                                    category='RawQC',
+                                                    caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
+        fastqc_tests_status_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg',
+                                          category='RawQC',
+                                          caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
     conda:'../envs/rawQC.yaml'
     script:
         '../scripts/rawQC_summarize.py'
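The utilities added below read MultiQC's machine-readable output rather than the HTML report. read_multiqc_fastp() assumes a multiqc_data.json whose per-unit fastp stats sit under report_general_stats_data; a hedged sketch of that shape (the two key names are the ones the code consumes, the unit name and numbers are invented):

    import pandas as pd

    multiqc_data = {
        'report_general_stats_data': [
            {'s1-run1_L001': {'before_filtering_total_reads': 1000000,
                              'after_filtering_total_reads': 950000}},
        ],
    }
    df = pd.DataFrame.from_dict(multiqc_data['report_general_stats_data'][0],
                                orient='index')
    print(df)
    # one row per unit, one column per fastp metric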
diff --git a/rules/utils.py b/rules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa43fddc08f4f0387751a1119e8206fc4658cd92
--- /dev/null
+++ b/rules/utils.py
@@ -0,0 +1,270 @@
+import json
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rc
+
+def read_multiqc_fastp(mqc_fastp_json):
+    '''Parses the json file generated by MultiQC into a pandas DataFrame of fastp results.
+    '''
+    mqc_fastp_json_file = mqc_fastp_json
+    with open(mqc_fastp_json_file, 'r') as my_file:
+        json_data = my_file.read()
+    #json_obj = json.loads(json_data)
+    #multiqc_fastp_data = json_obj['report_general_stats_data'][0]
+    multiqc_fastp_data = json.loads(json_data)['report_general_stats_data'][0]
+    multiqc_fastp_data_df = pd.DataFrame.from_dict(multiqc_fastp_data,
+                                                   orient='index')
+    return(multiqc_fastp_data_df)
+
+def summarize_filter_human_step(mqc_filter_human_list):
+    '''Parses all the stats files included in mqc_filter_human_list and
+    creates a single merged table report.
+
+    Args:
+        mqc_filter_human_list: FilePath to the file containing a list of stats files
+    Returns:
+        A pandas.DataFrame
+    '''
+    try:
+        with open(mqc_filter_human_list, 'r') as my_file:
+            print(mqc_filter_human_list)
+            bfiltered_files = [f.strip() for f in my_file.readlines()]
+    except (TypeError, ValueError, IOError, EOFError) as e:
+        print(e)
+    else:
+        header = list()
+        multi_bfiltered_data = dict()
+        for ix, bfiltered_file in enumerate(bfiltered_files):
+            unit_name = bfiltered_file.split('/')[-1].split('.')[0]
+            with open(bfiltered_file) as f:
+                lines = f.read().split('\n')[:-1]
+                if ix==0:
+                    header = lines[0].split('\t')
+                    multi_bfiltered_data[unit_name] = lines[1].split('\t')
+                else:
+                    multi_bfiltered_data[unit_name] = lines[1].split('\t')
+        header[0] = 'ref'
+        header[1] = 'pct_unambiguousReads'
+        header[3] = 'pct_ambiguousReads'
+
+        multi_bfiltered_stats = pd.DataFrame.from_dict(multi_bfiltered_data,
+                                                       orient='index', columns=header)
+        return(multi_bfiltered_stats)
+
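A worked example of the read accounting that summarize_preqc() (next) derives from the three reports. The numbers are invented; dedupe losses are inferred by difference because dedupe itself contributes no parsed counts here:

    total          = 1000  # trim_adapters: before_filtering_total_reads
    after_adapters =  900  # trim_adapters: after_filtering_total_reads
    human          =   50  # filter_human: assignedReads (mapped to masked hg19)
    before_3end    =  800  # trim_3end: before_filtering_total_reads
    after_3end     =  780  # trim_3end: after_filtering_total_reads

    trim_adapters_disc = total - after_adapters                     # 100
    dedupe_disc = total - trim_adapters_disc - human - before_3end  # 50
    trim_3end_disc = before_3end - after_3end                       # 20
    total_disc = total - after_3end                                 # 220
    assert trim_adapters_disc + human + dedupe_disc + trim_3end_disc == total_disc
    print(100 * after_3end / total)  # total_pct_kept -> 78.0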
+def summarize_preqc(mqc_trimadapters_fastps_json, mqc_filter_human_stats, mqc_trim3end_fastps_json):
+    """Summarizes all the steps in the preQC workflow.
+
+    Args:
+        mqc_trimadapters_fastps_json: FilePath to the MultiQC json report from step trim_adapters
+        mqc_filter_human_stats: FilePath to the tsv report from step filter_human
+        mqc_trim3end_fastps_json: FilePath to the MultiQC json report from step trim_3end
+    Returns:
+        A pandas.DataFrame
+    """
+    try:
+        trim_adapters_mqc_data = read_multiqc_fastp(mqc_trimadapters_fastps_json)
+        filter_human_mqc_data = pd.read_csv(mqc_filter_human_stats, sep='\t', header=0, index_col=0)
+        trim3end_mqc_data = read_multiqc_fastp(mqc_trim3end_fastps_json)
+    except (ValueError,TypeError) as e:
+        print(e)
+    else:
+        trim_adapters_mqc_tmp = pd.DataFrame()
+        trim_adapters_mqc_tmp['total_reads'] = trim_adapters_mqc_data['before_filtering_total_reads']
+        trim_adapters_mqc_tmp['trim_adapters_disc_reads'] = trim_adapters_mqc_tmp['total_reads'] - \
+                                                            trim_adapters_mqc_data['after_filtering_total_reads']
+
+        filter_human_mqc_tmp = pd.DataFrame()
+        filter_human_mqc_tmp['filter_human_disc_reads'] = filter_human_mqc_data['assignedReads'].astype('int')
+
+        trim3end_mqc_tmp = pd.DataFrame()
+        trim3end_mqc_tmp['trim_3end_disc_reads'] = trim3end_mqc_data['before_filtering_total_reads'] - \
+                                                   trim3end_mqc_data['after_filtering_total_reads']
+
+        tmp_df = pd.concat([trim_adapters_mqc_tmp, filter_human_mqc_tmp, trim3end_mqc_tmp], axis=1, join='outer', sort=True)
+        tmp_df.loc[trim3end_mqc_data.index,'dedupe_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
+                                                                  tmp_df.loc[trim3end_mqc_data.index,'trim_adapters_disc_reads'] - \
+                                                                  tmp_df.loc[trim3end_mqc_data.index,'filter_human_disc_reads'] - \
+                                                                  trim3end_mqc_data['before_filtering_total_reads'].values
+        tmp_df.loc[trim3end_mqc_data.index,'total_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
+                                                                 trim3end_mqc_data['after_filtering_total_reads']
+
+        tmp_df.loc[trim3end_mqc_data.index,'total_kept_reads'] = trim3end_mqc_data['after_filtering_total_reads']
+
+        # Adding columns with percentages of discarded reads in each step
+        tmp_df['trim_adapters_pct_disc'] = tmp_df['trim_adapters_disc_reads'] / tmp_df['total_reads']*100
+        tmp_df['filter_human_pct_disc'] = tmp_df['filter_human_disc_reads'] / tmp_df['total_reads']*100
+        tmp_df['dedupe_pct_disc'] = tmp_df['dedupe_disc_reads'] / tmp_df['total_reads']*100
+        tmp_df['trim_3end_pct_disc'] = tmp_df['trim_3end_disc_reads'] / tmp_df['total_reads']*100
+        tmp_df['total_pct_disc'] = tmp_df['total_disc_reads'] / tmp_df['total_reads']*100
+        tmp_df['total_pct_kept'] = tmp_df['total_kept_reads'] / tmp_df['total_reads']*100
+
+        # Re-order columns
+        tmp_columns = ['total_reads','trim_adapters_disc_reads','filter_human_disc_reads','dedupe_disc_reads','trim_3end_disc_reads'] + \
+                      tmp_df.columns.tolist()[5:]
+        tmp_df = tmp_df[tmp_columns]
+
+        # Add sample, run, lane columns
+        sample_run_lane = pd.DataFrame.from_dict({unit:{'Sample':unit.split('-')[0],
+                                                        'Run':unit.split('-')[1].split('_')[0],
+                                                        'Lane':unit.split('-')[1].split('_')[1]
+                                                        }
+                                                  for unit in tmp_df.index
+                                                  },
+                                                 orient='index'
+                                                 )
+
+        sample_run_lane = pd.concat([sample_run_lane, tmp_df], axis=1, join='outer', sort=False)
+        return(sample_run_lane)
+
+def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
+    """Creates a radial bar chart from the dataframe generated by summarize_preqc.
+
+    Args:
+        preqc_summary_df: pd.DataFrame object generated by summarize_preqc()
+        by: either 'units' or 'samples'. It specifies how to aggregate the data.
+        plot_type: either 'raw' or 'pct'. If 'pct' a percent stacked barchart is returned.
+    Returns:
+        matplotlib.figure.Figure
+    """
+    import inspect
+    by_options = ['units','samples']
+    type_options = ['raw','pct']
+
+    frame = inspect.currentframe()
+    args_type = ['pandas.DataFrame', by_options, type_options]
+    UNDEFINED = object()
+    def print_message(frame, args_type):
+        args, _, _, values = inspect.getargvalues(frame)
+        print('{}()'.format(inspect.getframeinfo(frame)[2]))
+        print('Args:')
+        for ix,arg in enumerate(args):
+            print('    {}: {}'.format(arg, str(args_type[ix])))
+
+    if (not isinstance(preqc_summary_df, pd.DataFrame)) or (by not in by_options) or (plot_type not in type_options):
+        print('Invalid arguments')
+        print_message(frame,args_type)
+        return(UNDEFINED)
+    else:
+        try:
+            if plot_type == 'raw':
+                columns_to_plot = ['trim_adapters_disc_reads',
+                                   'filter_human_disc_reads',
+                                   'dedupe_disc_reads',
+                                   'trim_3end_disc_reads',
+                                   'total_kept_reads']
+                sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
+                if by == 'samples':
+                    sample_run_lane = sample_run_lane.groupby(['Sample']).agg('sum')
+                bottom_lim = 0.0
+                top_lim = sample_run_lane['total_reads'].max()
+                # This uses the given y-units of bottom spacing for each bar
+                # You may increase this number a bit to have more spacing between bars and text.
+                bottom_spacing = top_lim/1.8
+                rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+                rgrids_positions_labels = ['  {}M'.format(nr)
+                                           for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
+            else:
+                columns_to_plot = ['trim_adapters_pct_disc',
+                                   'filter_human_pct_disc',
+                                   'dedupe_pct_disc',
+                                   'trim_3end_pct_disc',
+                                   'total_pct_kept']
+                sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
+                if by == 'samples':
+                    sample_run_lane = sample_run_lane.groupby(['Sample']).agg('mean')
+                bottom_lim = 0.0
+                top_lim = 100
+                bottom_spacing = top_lim/1.8
+                rgrids_values = [25,50,75,100]
+                rgrids_positions = np.add(rgrids_values,bottom_spacing)
+                rgrids_positions_labels = ['  {}%'.format(nr) for nr in rgrids_values]
+
+            sample_run_lane.sort_values(by=['total_reads'], inplace=True)
+            # Colors for each column (nested bars inside each bar)
+            barColors = ['#fdb863', '#e08214', '#b35806', '#542788','#8073ac'][::-1]
+        except (TypeError, ValueError) as e:
+            print(e)
+        else:
+            # This variable was used for testing purposes to see how fold increases
+            # of the datapoints affects the plot
+            lenMult = 1
+            # Nr of datapoints to plot
+            nr_h_bars = len(sample_run_lane)*lenMult
+            # This sets the number of x-positions left empty at the beginning
+            # so that enough space is left for y-labels
+            nr_empty_bars = int(np.ceil(nr_h_bars/40))
+            empty_bars = [0]*nr_empty_bars
+            empty_bars_labels = ['']*len(empty_bars)
+            # re-set the number of x-positions (data points) to plot
+            nr_h_bars = nr_h_bars+nr_empty_bars
+
+            # The positions of the bars on the x-axis
+            # Each position corresponds to a radian degree in a unit circumference (radius is 1)
+            # The circumference (2pi) is divided into nr_h_bars sections
+            r = np.arange(0,2*np.pi,2*np.pi/nr_h_bars) # for radial barchart
+            r = r[0:nr_h_bars]
+            # Bar width
+            barWidth = (2*np.pi)/(nr_h_bars)*0.9 # for radial barchart
+
+            # Define the columns to use for the stacked bars,
+            # values of each column are plot on top of one another
+            barColumns = columns_to_plot[::-1]
+
+            # This uses the given y-units of bottom spacing for each bar
+            # You may increase this number a bit to have more spacing between bars and text.
+            bottomBars = [bottom_spacing]*nr_h_bars
+
+            # Lets set the figure size depending on the amount of data points to plot
+            # Labels until 1000 datapoints are readable
+            # Consider adding additional elif scenarios to increase size for larger datasets
+            if nr_h_bars <=200:
+                fig = plt.figure(figsize=(10, 10))
+                font_size = 7.5
+            elif nr_h_bars <= 500:
+                fig = plt.figure(figsize=(20, 20))
+                font_size = 'x-small'
+            else:
+                fig = plt.figure(figsize=(28, 28))
+                font_size = 6
+
+            # Here we do the actual plotting
+            # ------------------------------
+            # Values for each column for a datapoint (i.e sequencing unit, or sample)
+            # are plot on top of one another
+            ax = fig.add_axes([0.1,0.1,0.75,0.75], projection='polar')
+            for ix,column in enumerate(barColumns):
+                bar_values=empty_bars+sample_run_lane[column].tolist()*lenMult
+                bars=ax.bar(r, bar_values, bottom=bottomBars,
+                            color=barColors[ix], edgecolor='white', width=barWidth, label=column)
+                bottomBars = np.add(bottomBars,bar_values).tolist()
+
+            # Here we add annotations (e.g labels, legend)
+            # --------------------------------------------
+            rotations = np.rad2deg(r)
+            y0,y1 = ax.get_ylim()
+            bar_labels = empty_bars_labels + sample_run_lane.index.tolist()*lenMult
+            for x, y_pos, rotation, label in zip(r, bottomBars, rotations, bar_labels):
+                offset = (y_pos)/(y1-y0)
+                lab = ax.text(0, 0, label, fontsize=font_size, transform=None, ha='center', va='center', fontstretch='ultra-condensed')
+                renderer = ax.figure.canvas.get_renderer()
+                bbox = lab.get_window_extent(renderer=renderer)
+                invb = ax.transData.inverted().transform([[0,0],[bbox.width,0] ])
+                lab.set_position((x,offset+(invb[1][0]-invb[0][0])/2.*2.7) )
+                lab.set_transform(ax.get_xaxis_transform())
+                lab.set_rotation(rotation)
+            #rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
+            #rgrids_positions_labels = ['  {}M'.format(nr)
+            #                           for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
+            ax.set_rgrids(rgrids_positions,
+                          labels=rgrids_positions_labels,
+                          angle=np.rad2deg(r[int(nr_empty_bars/2-1)]),
+                          family='serif', ha='left', va='center', weight='bold',fontstretch='ultra-condensed', fontsize=8.5)
+            ax.legend(bbox_to_anchor=(0.5, 0.5), loc='center', frameon=False, title='MetaSnk: preQC')
+            # A few extra edits for aesthetics
+            # -------------------------------
+            ax.xaxis.grid(False)
+            ax.spines["polar"].set_visible(False)
+            ax.set_xticklabels([])
+
+            return(fig)
\ No newline at end of file
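Finally, a hedged usage sketch of the two entry points above, mirroring what the summarize_preQC rule's run block does. The file paths are hypothetical, and the import assumes the script is run from the rules/ directory (or with rules/ on sys.path):

    from utils import summarize_preqc, plot_preqc_summary

    units_df = summarize_preqc('atrimmed/DS01_atrimmed_mqc_data/multiqc_data.json',
                               'bfiltered/DS01_bfiltered_stats.tsv',
                               'dfinaltrim/DS01_dfinaltrim_mqc_data/multiqc_data.json')
    fig = plot_preqc_summary(units_df, by='units', plot_type='pct')
    fig.savefig('DS01_preqc_units_pct_barchart.svg', bbox_inches='tight', format='svg')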