Commit 33007fc2 authored by Ticlla Ccenhua Monica Roxana

add module preQC, rules for pre-processing paired fastqs

parent 5c7fb77c
@@ -10,9 +10,9 @@ import os
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
print('CPUs available: {}'.format(cpus_avail))
print('CPUs available: {}'.format(cpus_avail), file=sys.stderr)
##----------------------------------------------------------------------------##
## Working directory
## Re-set working directory
##----------------------------------------------------------------------------##
# Set working directory
workdir_path = os.path.dirname(os.path.abspath(__name__))
@@ -20,67 +20,67 @@ workflow_path = workflow.basedir
if workdir_path == workflow_path:
message = "Working directory was not specified!"+\
"MetagenomicSnake assumes ./data, relative to current directory, "+\
"as your working directory ..."
print(message)
"Then, MetagenomicSnake assumes ./data, relative to current directory, "+\
"as your working directory ..."
print(message, file=sys.stderr)
if os.path.exists(workflow_path+'/data'):
workdir_path = workflow_path+'/data'
workdir: workdir_path
print("Working directory:{}".format(workdir_path))
print("Working directory:{}".format(workdir_path), file=sys.stderr)
else:
print("... Folder ./data not found in current directory...")
print("... instead, setting current directory as working directory ...")
print("Working directory:{}".format(workdir_path))
message = "... Folder ./data not found in current directory..."+\
"... instead, setting current directory as working directory ..."+\
"Working directory:{}".format(workdir_path)
print(message, file=sys.stderr)
else:
print("Working directory:{}".format(workdir_path))
print("Working directory:{}".format(workdir_path), file=sys.stderr)
##----------------------------------------------------------------------------##
## Configuration of MetagenomicSnake
##----------------------------------------------------------------------------##
try:
configfile_path = config['configfile_path']
print("Configuration file: {}".format(configfile_path))
print("Configuration file: {}".format(configfile_path), file=sys.stderr)
except:
print("Configuration file config.yaml not especified at execution!")
print("Configuration file config.yaml not especified at execution!", file=sys.stderr)
try:
print("... Trying working directory ...")
print("... Trying working directory ...", file=sys.stderr)
configfile_path = "config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
except:
print("... config.yaml not found in working directory ...")
print("... Loading default config.yaml provided with MetagenomicSnake...")
message = "... config.yaml not found in working directory ..."+\
"... Loading default config.yaml provided with MetagenomicSnake..."
print(message, file=sys.stderr)
configfile_path = workflow_path + "/config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
##----------------------------------------------------------------------------##
## Define paths
##----------------------------------------------------------------------------##
RAW_FASTQ_DIR = 'raw/fastq'
RESULTS_DIR = config['PREFIX_DIR'] +'/results'
LOGS_DIR = config['PREFIX_DIR'] +'/logs'
REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
RAW_FASTQ_DIR = config['RAW_DIR']
OUT_DIR = config['OUT_DIR']
#RESULTS_DIR = config['OUT_DIR'] +'/results'
#LOGS_DIR = config['OUT_DIR'] +'/logs'
#REPORTS_DIR = config['OUT_DIR'] +'/reports'
RAW_QC_DIR = RESULTS_DIR + '/rawQC'
RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
PRE_PROC_DIR = RESULTS_DIR + '/preprocessed'
PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed'
#MERGE_DIR = RESULTS_DIR + '/merged'
#MERGE_REPORT = REPORTS_DIR + '/merged'
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
if config['SAMPLE_UNITS']['auto']:
(DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz')
(DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz')
(DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{sample}-{run}_{lane}-R1.fastq.gz')
(DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{fastq_file}-R1.fastq.gz')
else:
# TODO:
pass
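# Note: the auto-globbing above assumes paired fastq files laid out as
#   <RAW_DIR>/<dataset>/fastq/<sample>-<run>_<lane>-R1.fastq.gz (and -R2 likewise)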
##----------------------------------------------------------------------------##
## Local rules
##----------------------------------------------------------------------------##
localrules:
rawQC,
preQC
##----------------------------------------------------------------------------##
## Run entire workflow
##----------------------------------------------------------------------------##
@@ -95,21 +95,54 @@ rule all:
##----------------------------------------------------------------------------##
rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_units_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
rule preprocess:
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
rule preQC:
input:
expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_summary.tsv', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_summary.tsv', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_pct_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_pct_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_concatenation_all.done', dataset=set(DATASETS))
##----------------------------------------------------------------------------##
## Rules to make reports
##----------------------------------------------------------------------------##
rule rawQC_make_report:
output:
temp(touch(OUT_DIR+'/logs/rawQC_make_report.done'))
log:
OUT_DIR+'/logs/rawQC_make_report.log'
params:
report_path = OUT_DIR+'/rawQC_report.html',
workdir = workdir_path,
workflow_dir = workflow_path
shell:
'''
(snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile rawQC) &>{log}
'''
#rule merge_samples:
# input:
# expand(MERGE_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
rule preQC_make_report:
output:
temp(touch(OUT_DIR+'/logs/preQC_make_report.done'))
log:
OUT_DIR+'/logs/preQC_make_report.log'
params:
report_path = OUT_DIR+'/preQC_report.html',
workdir = workdir_path,
workflow_dir = workflow_path
shell:
'''
(snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile preQC) &>{log}
'''
include: "rules/rawQC.smk"
include: "rules/preprocess.smk"
#include: "rules/merge_samples.smk"
@@ -2,8 +2,21 @@
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
# Set PATHS
# WARNING:
# MetagenomicSnake uses singularity containers to run the workflow.
# Bind-mount those paths into the containers if needed.
# Singularity's --bind/-B option can be specified multiple times,
# or a comma-delimited string of bind path specifications can be used.
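# Example (hypothetical paths, adjust to your system), passed at execution time:
#   snakemake --use-singularity --singularity-args '-B /my/raw/dir,/my/out/dir'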
#
# PATH to config file for SnakeMake
# configfile_path: ''
# Absolute PATH to folder where to find fastq files
RAW_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/raw'
# Absolute PATH to folder where to place output files
OUT_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/interim/MetaSnk'
# PATH to folder where MetagenomicSnake will store results
PREFIX_DIR: 'MetagenomicSnake_results'
# PREFIX: 'MetaSnk'
#
SAMPLE_UNITS:
@@ -31,10 +44,10 @@ preprocess:
# PATH to reference(e.g human) genome
# -----------------------------------
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# Masked hg19 kindly provided by Brian Bushnell and
# manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
# How the human genome was masked is explained in this SEQanswers entry
# http://seqanswers.com/forums/archive/index.php/t-42552.html
bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
#bbmap_reference: 'external/reference_test_2.fa.gz'
bbmap_ref_dir: 'external/ref_indexed'
bbmap_reference: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
bbmap_ref_dir: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/ref_bbsplit'
Workflow version 0.1
Workflow version 0.2
MetagenomicSnake
================
@@ -12,21 +12,22 @@ Modules
**rawQC**
runs FastQC on a random sample of R1 reads from the paired fastq-format files.
**preprocess**
FastQC only performs a quality check but no QC processing is done. The **preprocess**
**preQC**
FastQC only performs a quality check but no QC processing is done. The **preQC**
rule runs a multi-step pre-processing of the paired fastq files, which includes:
- Adapter-trimming with "fastp"
-
- with Fastp a quality check is performed and both paired fastq files are processed as follows:
- **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check
and both paired fastq files are processed as follows\:
+ remove adapters: here we provide the Nextera XT adapters,
+ base correction in overlapped regions
+ trimming of the last base in read 1
+ discard reads shorter than a minimum length, after trimming
+ a report with quality check, before and after processing
- removal of reads derived from human DNA
- removal of duplicated reads
- **filter_human**: removal of reads derived from human DNA with BBTools' bbsplit_
- **dedupe**: removal of duplicated reads with BBTools' dedupe
- **trim_3end**: 3\'-end quality trimming with "fastp"
- **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
- **summarize_preQC**: creates summarizing tables and plots
**merge_samples**
merges fastq files corresponding to the same sample into a single pair of fastq files, after pre-processing of paired fastq files.
.. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288
\ No newline at end of file
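
A minimal invocation sketch (paths are placeholders), mirroring the report rules
in the Snakefile::

    snakemake --directory=/path/to/data -s Snakefile --use-singularity preQC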
This diff is collapsed.
@@ -4,6 +4,7 @@ Affiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for QC and pre-processing of paired-end shotgun DNA
metagenomic sequencing.
'''
localrules:
multiqc_raw_list_files,
multiqc_raw
@@ -20,17 +21,16 @@ singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
# for R1. 'R1' was removed from names in html and zip outputs
rule fastqc_raw:
input:
#'data/raw/fastq/{dataset}/{fastq_file}-R1.fastq.gz'
lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
lambda wildcards: ['{}/{}/fastq/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file]
params:
fastqc_dir = RAW_QC_DIR,
fastqc_dir = OUT_DIR,
samplerate = config['rawQC']['samplerate']
log:
LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log'
OUT_DIR+'/{dataset}/rawQC/logs/{fastq_file}_fastqc.log'
output:
fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html',
fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip'
fastqc_html = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.html',
fastqc_zip = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.zip'
threads: cpus_avail
singularity: singularity_img
message:
@@ -39,7 +39,7 @@ rule fastqc_raw:
'''
# Take a random sample of reads and process them with fastQC
(reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \
fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \
fastqc -o {params.fastqc_dir}/{wildcards.dataset}/rawQC/fastqc -f fastq \
-t {threads} stdin:{wildcards.fastq_file}) 2> {log}
'''
@@ -47,10 +47,11 @@ rule fastqc_raw:
rule multiqc_raw_list_files:
input:
#all fastqc zip files per dataset
fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, value, FASTQS[ix])
fastqcs=lambda wildcards: ['{}/{}/rawQC/fastqc/{}_fastqc.zip'.format(OUT_DIR, value, FASTQS[ix])
for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
output:
multiqc_input_list = RAW_QC_REPORT+'/{dataset}_multiqc_inputs.txt'
#multiqc_input_list = RAW_QC_REPORT+'/{dataset}/multiqc_inputs.txt'
multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
run:
import os
try:
@@ -65,11 +66,14 @@ rule multiqc_raw_list_files:
# Run MultiQC on all FastQC reports
rule multiqc_raw:
input:
RAW_QC_REPORT + '/{dataset}_multiqc_inputs.txt'
multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
params:
multiqc_dir = OUT_DIR+'/{dataset}/rawQC/multiqc'
output:
multiqc_fastqc = RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt',
multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html',
category='rawQC', caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
multiqc_fastqc = OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt',
multiqc_report = report(OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc.html',
category='RawQC',
caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
singularity:singularity_img
shell:
'''
@@ -78,23 +82,31 @@ rule multiqc_raw:
# Creates summarizing tables and plots
rule summarize_raw:
input:
RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt'
log:LOGS_DIR+'/raw_qc/{dataset}_summarize_raw.log'
OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt'
log:
OUT_DIR+'/{dataset}/rawQC/logs/summarize_raw.log'
output:
units_stats = report(RAW_QC_REPORT + '/{dataset}_units_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_units_stats.rst'),
samples_stats = report(RAW_QC_REPORT + '/{dataset}_samples_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_samples_stats.rst'),
run_lane_stats = report(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_run_lane_stats.rst'),
units_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg',
category='rawQC', caption='../report/rawQC/rawQC_units_read_dist.rst'),
samples_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg',
category='rawQC', caption='../report/rawQC/rawQC_samples_read_dist.rst'),
samples_read_pairs_dispersion_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg',
category='rawQC', caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
fastqc_tests_status_plot = report(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg',
category='rawQC', caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
units_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_units_stats.rst'),
samples_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_samples_stats.rst'),
run_lane_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_run_lane_stats.rst'),
units_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg',
category='RawQC',
caption='../report/rawQC/rawQC_units_read_dist.rst'),
samples_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg',
category='RawQC',
caption='../report/rawQC/rawQC_samples_read_dist.rst'),
samples_read_pairs_dispersion_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg',
category='RawQC',
caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
fastqc_tests_status_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg',
category='RawQC',
caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
conda:'../envs/rawQC.yaml'
script:
'../scripts/rawQC_summarize.py'
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
def read_multiqc_fastp(mqc_fastp_json):
'''Parses the JSON file generated by MultiQC and returns the fastp results as a pandas DataFrame (used downstream for the tsv report).
'''
mqc_fastp_json_file = mqc_fastp_json
with open(mqc_fastp_json_file, 'r') as my_file:
json_data = my_file.read()
#json_obj = json.loads(json_data)
#multiqc_fastp_data = json_obj['report_general_stats_data'][0]
multiqc_fastp_data = json.loads(json_data)['report_general_stats_data'][0]
multiqc_fastp_data_df = pd.DataFrame.from_dict(multiqc_fastp_data,
orient='index')
return(multiqc_fastp_data_df)
def summarize_filter_human_step(mqc_filter_human_list):
'''Parses all the stats files included in mqc_filter_human_list and
creates a single merged table report.
Args:
mqc_filter_human_list: FilePath to the file containing a list of stats files
Returns:
A pandas.DataFrame
'''
try:
with open(mqc_filter_human_list, 'r') as my_file:
bfiltered_files = [f.strip() for f in my_file.readlines()]
except (TypeError, ValueError, IOError, EOFError) as e:
print(e)
else:
header = list()
multi_bfiltered_data = dict()
for ix, bfiltered_file in enumerate(bfiltered_files):
unit_name = bfiltered_file.split('/')[-1].split('.')[0]
with open(bfiltered_file) as f:
lines = f.read().split('\n')[:-1]
if ix==0:
header = lines[0].split('\t')
multi_bfiltered_data[unit_name] = lines[1].split('\t')
else:
multi_bfiltered_data[unit_name] = lines[1].split('\t')
header[0] = 'ref'
header[1] = 'pct_unambiguousReads'
header[3] = 'pct_ambiguousReads'
multi_bfiltered_stats = pd.DataFrame.from_dict(multi_bfiltered_data,
orient='index', columns=header)
return(multi_bfiltered_stats)
def summarize_preqc(mqc_trimadapters_fastps_json, mqc_filter_human_stats, mqc_trim3end_fastps_json):
"""Summarizes all the steps in the preQC workflow.
Args:
mqc_trimadapters_fastps_json: FilePath to the MultiQC JSON report from step trim_adapters
mqc_filter_human_stats: FilePath to the tsv report from step filter_human
mqc_trim3end_fastps_json: FilePath to the MultiQC JSON report from step trim_3end
Returns:
A pandas.DataFrame
"""
try:
trim_adapters_mqc_data = read_multiqc_fastp(mqc_trimadapters_fastps_json)
filter_human_mqc_data = pd.read_csv(mqc_filter_human_stats, sep='\t', header=0, index_col=0)
trim3end_mqc_data = read_multiqc_fastp(mqc_trim3end_fastps_json)
except (ValueError,TypeError) as e:
print(e)
else:
trim_adapters_mqc_tmp = pd.DataFrame()
trim_adapters_mqc_tmp['total_reads'] = trim_adapters_mqc_data['before_filtering_total_reads']
trim_adapters_mqc_tmp['trim_adapters_disc_reads'] = trim_adapters_mqc_tmp['total_reads'] - \
trim_adapters_mqc_data['after_filtering_total_reads']
filter_human_mqc_tmp = pd.DataFrame()
filter_human_mqc_tmp['filter_human_disc_reads'] = filter_human_mqc_data['assignedReads'].astype('int')
trim3end_mqc_tmp = pd.DataFrame()
trim3end_mqc_tmp['trim_3end_disc_reads'] = trim3end_mqc_data['before_filtering_total_reads'] - \
trim3end_mqc_data['after_filtering_total_reads']
tmp_df = pd.concat([trim_adapters_mqc_tmp, filter_human_mqc_tmp, trim3end_mqc_tmp], axis=1, join='outer', sort=True)
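# Reads surviving dedupe are exactly the reads entering trim_3end, so dedupe's
# discarded count = total - trim_adapters discards - filter_human discards - trim_3end input reads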
tmp_df.loc[trim3end_mqc_data.index,'dedupe_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
tmp_df.loc[trim3end_mqc_data.index,'trim_adapters_disc_reads'] - \
tmp_df.loc[trim3end_mqc_data.index,'filter_human_disc_reads'] - \
trim3end_mqc_data['before_filtering_total_reads'].values
tmp_df.loc[trim3end_mqc_data.index,'total_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
trim3end_mqc_data['after_filtering_total_reads']
tmp_df.loc[trim3end_mqc_data.index,'total_kept_reads'] = trim3end_mqc_data['after_filtering_total_reads']
# Adding columns with percentages of discarded reads in each step
tmp_df['trim_adapters_pct_disc'] = tmp_df['trim_adapters_disc_reads'] / tmp_df['total_reads']*100
tmp_df['filter_human_pct_disc'] = tmp_df['filter_human_disc_reads'] / tmp_df['total_reads']*100
tmp_df['dedupe_pct_disc'] = tmp_df['dedupe_disc_reads'] / tmp_df['total_reads']*100
tmp_df['trim_3end_pct_disc'] = tmp_df['trim_3end_disc_reads'] / tmp_df['total_reads']*100
tmp_df['total_pct_disc'] = tmp_df['total_disc_reads'] / tmp_df['total_reads']*100
tmp_df['total_pct_kept'] = tmp_df['total_kept_reads'] / tmp_df['total_reads']*100
# Re-order columns
tmp_columns = ['total_reads','trim_adapters_disc_reads','filter_human_disc_reads','dedupe_disc_reads','trim_3end_disc_reads'] + \
tmp_df.columns.tolist()[5:]
tmp_df = tmp_df[tmp_columns]
# Add sample, run, lane columns
sample_run_lane = pd.DataFrame.from_dict({unit:{'Sample':unit.split('-')[0],
'Run':unit.split('-')[1].split('_')[0],
'Lane':unit.split('-')[1].split('_')[1]
}
for unit in tmp_df.index
},
orient='index'
)
sample_run_lane = pd.concat([sample_run_lane, tmp_df], axis=1, join='outer', sort=False)
return(sample_run_lane)
def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
"""Creates a radial bar chart from the dataframe generted by summarize_preqc.
Args:
preqc_summary_df: pd.DataFrame object generated by summarize_preqc()
by: either 'units' or 'samples'. it specifies how to aggregate the data.
plot_type: either 'raw' or 'pct'. If 'pct' a percent stacked barchar is returned.
Returns:
matplotlib.figure.Figure
"""
import inspect
by_options = ['units','samples']
type_options = ['raw','pct']
frame = inspect.currentframe()
args_type = ['pandas.DataFrame', by_options, type_options]
UNDEFINED = object()
def print_message(frame, args_type):
args, _, _, values = inspect.getargvalues(frame)
print('{}()'.format(inspect.getframeinfo(frame)[2]))
print('Args:')
for ix,arg in enumerate(args):
print(' {}: {}'.format(arg, str(args_type[ix])))
if (not isinstance(preqc_summary_df, pd.DataFrame)) or (by not in by_options) or (plot_type not in type_options):
print('Invalid arguments')
print_message(frame,args_type)
return(UNDEFINED)
else:
try:
if plot_type == 'raw':
columns_to_plot = ['trim_adapters_disc_reads',
'filter_human_disc_reads',
'dedupe_disc_reads',
'trim_3end_disc_reads',
'total_kept_reads']
sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
if by == 'samples':
sample_run_lane = sample_run_lane.groupby(['Sample']).agg('sum')
bottom_lim = 0.0
top_lim = sample_run_lane['total_reads'].max()
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottom_spacing = top_lim/1.8
rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
rgrids_positions_labels = [' {}M'.format(nr)
for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
else:
columns_to_plot = ['trim_adapters_pct_disc',
'filter_human_pct_disc',
'dedupe_pct_disc',
'trim_3end_pct_disc',
'total_pct_kept']
sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
if by == 'samples':
sample_run_lane = sample_run_lane.groupby(['Sample']).agg('mean')
bottom_lim = 0.0
top_lim = 100
bottom_spacing = top_lim/1.8
rgrids_values = [25,50,75,100]
rgrids_positions = np.add(rgrids_values,bottom_spacing)
rgrids_positions_labels = [' {}%'.format(nr) for nr in rgrids_values]
sample_run_lane.sort_values(by=['total_reads'], inplace=True)
# Colors for each column (nested bars inside each bar)
barColors = ['#fdb863', '#e08214', '#b35806', '#542788','#8073ac'][::-1]
except (TypeError, ValueError) as e:
print(e)
else:
# This variable was used for testing purposes to see how fold increases
# of the datapoints affect the plot
lenMult = 1
# Nr of datapoints to plot
nr_h_bars = len(sample_run_lane)*lenMult
# This sets the number of x-positions left empty at the beginning
# so that enough space is left for y-labels
nr_empty_bars = int(np.ceil(nr_h_bars/40))
empty_bars = [0]*nr_empty_bars
empty_bars_labels = ['']*len(empty_bars)
# re-set the number of x-positions (data points) to plot
nr_h_bars = nr_h_bars+nr_empty_bars
# The positions of the bars on the x-axis
# Each position corresponds to an angle (in radians) on a unit circumference (radius is 1)
# The circumference (2pi) is divided into nr_h_bars sections
r= np.arange(0,2*np.pi,2*np.pi/nr_h_bars) # for radial barchart
r= r[0:nr_h_bars]
# Bar width
barWidth = (2*np.pi)/(nr_h_bars)*0.9 # for radial barchart
# Define the columns to use for the stacked bars,
# values of each column are plotted on top of one another
barColumns = columns_to_plot[::-1]
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottomBars = [bottom_spacing]*nr_h_bars
# Let's set the figure size depending on the number of data points to plot
# Labels remain readable up to 1000 datapoints
# Consider adding additional elif scenarios to increase size for larger datasets
if nr_h_bars <=200:
fig = plt.figure(figsize=(10, 10))
font_size = 7.5
elif nr_h_bars <= 500:
fig = plt.figure(figsize=(20, 20))
font_size = 'x-small'
else:
fig = plt.figure(figsize=(28, 28))
font_size = 6
# Here we do the actual plotting
# ------------------------------
# Values for each column for a datapoint (i.e. sequencing unit, or sample)
# are plotted on top of one another
ax = fig.add_axes([0.1,0.1,0.75,0.75], projection='polar')
for ix,column in enumerate(barColumns):
bar_values=empty_bars+sample_run_lane[column].tolist()*lenMult
bars=ax.bar(r, bar_values, bottom=bottomBars,
color=barColors[ix], edgecolor='white', width=barWidth, label=column)
bottomBars = np.add(bottomBars,bar_values).tolist()
# Here we add annotations (e.g labels, legend)
# --------------------------------------------
rotations = np.rad2deg(r)
y0,y1 = ax.get_ylim()
bar_labels= empty_bars_labels + sample_run_lane.index.tolist()*lenMult
for x, y_pos, rotation, label in zip(r, bottomBars, rotations, bar_labels):
offset = (y_pos)/(y1-y0)
lab = ax.text(0, 0, label, fontsize=font_size, transform=None, ha='center', va='center', fontstretch='ultra-condensed')
renderer = ax.figure.canvas.get_renderer()
bbox = lab.get_window_extent(renderer=renderer)
invb = ax.transData.inverted().transform([[0,0],[bbox.width,0] ])
lab.set_position((x,offset+(invb[1][0]-invb[0][0])/2.*2.7) )
lab.set_transform(ax.get_xaxis_transform())
lab.set_rotation(rotation)
#rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
#rgrids_positions_labels = [' {}M'.format(nr)
# for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
ax.set_rgrids(rgrids_positions,
labels=rgrids_positions_labels,
angle=np.rad2deg(r[int(nr_empty_bars/2-1)]),
family='serif', ha='left', va='center', weight='bold',fontstretch='ultra-condensed', fontsize=8.5)
ax.legend(bbox_to_anchor=(0.5, 0.5), loc='center', frameon=False, title='MetaSnk: preQC')
# A few extra edits for aesthetics
# -------------------------------
ax.xaxis.grid(False)
ax.spines["polar"].set_visible(False)
ax.set_xticklabels([])
return(fig)
\ No newline at end of file
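
A hypothetical usage sketch for the helpers above (file names are placeholders
for the MultiQC/bbsplit outputs produced by the preQC rules):

preqc_df = summarize_preqc('trim_adapters_multiqc.json',
                           'filter_human_stats.tsv',
                           'trim_3end_multiqc.json')
fig = plot_preqc_summary(preqc_df, by='samples', plot_type='pct')
fig.savefig('preqc_samples_pct_barchart.svg')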