Commit 33007fc2 authored by Ticlla Ccenhua Monica Roxana

add module preQC, rules for pre-processing paired fastqs

parent 5c7fb77c
@@ -10,9 +10,9 @@ import os
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
print('CPUs available: {}'.format(cpus_avail))
print('CPUs available: {}'.format(cpus_avail), file=sys.stderr)
##----------------------------------------------------------------------------##
## Working directory
## Re-set working directory
##----------------------------------------------------------------------------##
# Set working directory
workdir_path = os.path.dirname(os.path.abspath(__name__))
@@ -20,67 +20,67 @@ workflow_path = workflow.basedir
if workdir_path == workflow_path:
message = "Working directory was not specified!"+\
"MetagenomicSnake assumes ./data, relative to current directory, "+\
"as your working directory ..."
print(message)
"Then, MetagenomicSnake assumes ./data, relative to current directory, "+\
"as your working directory ..."
print(message, file=sys.stderr)
if os.path.exists(workflow_path+'/data'):
workdir_path = workflow_path+'/data'
workdir: workdir_path
print("Working directory:{}".format(workdir_path))
print("Working directory:{}".format(workdir_path), file=sys.stderr)
else:
print("... Folder ./data not found in current directory...")
print("... instead, setting current directory as working directory ...")
print("Working directory:{}".format(workdir_path))
message = "... Folder ./data not found in current directory..."+\
"... instead, setting current directory as working directory ..."+\
"Working directory:{}".format(workdir_path)
print(message, file=sys.stderr)
else:
print("Working directory:{}".format(workdir_path))
print("Working directory:{}".format(workdir_path), file=sys.stderr)
##----------------------------------------------------------------------------##
## Configuration of MetagenomicSnake
##----------------------------------------------------------------------------##
try:
configfile_path = config['configfile_path']
print("Configuration file: {}".format(configfile_path))
print("Configuration file: {}".format(configfile_path), file=sys.stderr)
except:
print("Configuration file config.yaml not especified at execution!")
print("Configuration file config.yaml not especified at execution!", file=sys.stderr)
try:
print("... Trying working directory ...")
print("... Trying working directory ...", file=sys.stderr)
configfile_path = "config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
except:
print("... config.yaml not found in working directory ...")
print("... Loading default config.yaml provided with MetagenomicSnake...")
message = "... config.yaml not found in working directory ..."+\
"... Loading default config.yaml provided with MetagenomicSnake..."
print(message, file=sys.stderr)
configfile_path = workflow_path + "/config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
print("... Configuration file: {}".format(configfile_path), file=sys.stderr)
##----------------------------------------------------------------------------##
## Define paths
##----------------------------------------------------------------------------##
RAW_FASTQ_DIR = 'raw/fastq'
RESULTS_DIR = config['PREFIX_DIR'] +'/results'
LOGS_DIR = config['PREFIX_DIR'] +'/logs'
REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
RAW_FASTQ_DIR = config['RAW_DIR']
OUT_DIR = config['OUT_DIR']
#RESULTS_DIR = config['OUT_DIR'] +'/results'
#LOGS_DIR = config['OUT_DIR'] +'/logs'
#REPORTS_DIR = config['OUT_DIR'] +'/reports'
RAW_QC_DIR = RESULTS_DIR + '/rawQC'
RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
PRE_PROC_DIR = RESULTS_DIR + '/preprocessed'
PRE_PROC_REPORT = REPORTS_DIR + '/preprocessed'
#MERGE_DIR = RESULTS_DIR + '/merged'
#MERGE_REPORT = REPORTS_DIR + '/merged'
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
if config['SAMPLE_UNITS']['auto']:
(DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz')
(DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz')
(DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{sample}-{run}_{lane}-R1.fastq.gz')
(DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/fastq/{fastq_file}-R1.fastq.gz')
else:
# TODO:
pass
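# Note: the auto-globbing above assumes paired fastq files laid out as
#   <RAW_DIR>/<dataset>/fastq/<sample>-<run>_<lane>-R1.fastq.gz (and -R2 likewise)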
##----------------------------------------------------------------------------##
## Local rules
##----------------------------------------------------------------------------##
localrules:
rawQC,
preQC
##----------------------------------------------------------------------------##
## Run entire workflow
##----------------------------------------------------------------------------##
@@ -95,21 +95,54 @@ rule all:
##----------------------------------------------------------------------------##
rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_units_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
expand(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
rule preprocess:
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg', dataset=set(DATASETS))
rule preQC:
input:
expand(PRE_PROC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_summary.tsv', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_summary.tsv', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_units_pct_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_preqc_samples_pct_barchart.svg', dataset=set(DATASETS)),
expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_concatenation_all.done', dataset=set(DATASETS))
##----------------------------------------------------------------------------##
## Rules to make reports
##----------------------------------------------------------------------------##
rule rawQC_make_report:
output:
temp(touch(OUT_DIR+'/logs/rawQC_make_report.done'))
log:
OUT_DIR+'/logs/rawQC_make_report.log'
params:
report_path = OUT_DIR+'/rawQC_report.html',
workdir = workdir_path,
workflow_dir = workflow_path
shell:
'''
(snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile rawQC) &>{log}
'''
#rule merge_samples:
# input:
# expand(MERGE_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
rule preQC_make_report:
output:
temp(touch(OUT_DIR+'/logs/preQC_make_report.done'))
log:
OUT_DIR+'/logs/preQC_make_report.log'
params:
report_path = OUT_DIR+'/preQC_report.html',
workdir = workdir_path,
workflow_dir = workflow_path
shell:
'''
(snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile preQC) &>{log}
'''
include: "rules/rawQC.smk"
include: "rules/preprocess.smk"
#include: "rules/merge_samples.smk"
@@ -2,8 +2,21 @@
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
# Set PATHS
# WARNING:
# MetagenomicSnake uses singularity containers to run the workflow.
# Bind-mount those paths into the containers if needed.
# Singularity's --bind/-B option can be specified multiple times,
# or a comma-delimited string of bind path specifications can be used.
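# Example (hypothetical paths, adjust to your system), passed at execution time:
#   snakemake --use-singularity --singularity-args '-B /my/raw/dir,/my/out/dir'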
#
# PATH to config file for SnakeMake
# configfile_path: ''
# Absolute PATH to folder where to find fastq files
RAW_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/raw'
# Absolute PATH to folder where to place output files
OUT_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/interim/MetaSnk'
# PATH to folder where MetagenomicSnake will store results
PREFIX_DIR: 'MetagenomicSnake_results'
# PREFIX: 'MetaSnk'
#
SAMPLE_UNITS:
@@ -31,10 +44,10 @@ preprocess:
# PATH to reference(e.g human) genome
# -----------------------------------
# Masked hg19 kindly provided by Brian Bushnell and manually downloaded from
# Masked hg19 kindly provided by Brian Bushnell and
# manually downloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
# How the human genome was masked is explained in this SEQanswers entry
# http://seqanswers.com/forums/archive/index.php/t-42552.html
bbmap_reference: 'external/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
#bbmap_reference: 'external/reference_test_2.fa.gz'
bbmap_ref_dir: 'external/ref_indexed'
bbmap_reference: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
bbmap_ref_dir: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/ref_bbsplit'
Workflow version 0.1
Workflow version 0.2
MetagenomicSnake
================
@@ -12,21 +12,22 @@ Modules
**rawQC**
runs FastQC on a random sample of R1 reads from the paired fastq-format files.
**preprocess**
FastQC only performs a quality check but no QC processing is done. The **preprocess**
**preQC**
FastQC only performs a quality check but no QC processing is done. The **preQC**
rule runs a multi-step pre-processing of the paired fastq files, which includes:
- Adapter-trimming with "fastp"
-
- with Fastp a quality check is performed and both paired fastq files are processed as follows:
- **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check
and both paired fastq files are processed as follows\:
+ remove adapters: here we provide the Nextera XT adapters,
+ base correction in overlapped regions
+ trimming of the last base in read 1
+ discard reads shorter than a minimum length, after trimming
+ a report with quality check, before and after processing
- removal of reads derived from human DNA
- removal of duplicated reads
- **filter_human**: removal of reads derived from human DNA with BBTools' bbsplit_
- **dedupe**: removal of duplicated reads with BBTools' dedupe
- **trim_3end**: 3\'-end quality trimming with "fastp"
- **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
- **summarize_preQC**: creates summarizing tables and plots
**merge_samples**
merges fastq files corresponding to the same sample into a single pair of fastq files, after pre-processing of paired fastq files.
.. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288
\ No newline at end of file
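
A minimal invocation sketch (paths are placeholders), mirroring the report rules
in the Snakefile::

    snakemake --directory=/path/to/data -s Snakefile --use-singularity preQC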
This diff is collapsed.
@@ -4,6 +4,7 @@ Affiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for QC and pre-processing of paired-end shotgun DNA
metagenomic sequencing.
'''
localrules:
multiqc_raw_list_files,
multiqc_raw
@@ -20,17 +21,16 @@ singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
# for R1. 'R1' was removed from names in html and zip outputs
rule fastqc_raw:
input:
#'data/raw/fastq/{dataset}/{fastq_file}-R1.fastq.gz'
lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
lambda wildcards: ['{}/{}/fastq/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file]
params:
fastqc_dir = RAW_QC_DIR,
fastqc_dir = OUT_DIR,
samplerate = config['rawQC']['samplerate']
log:
LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log'
OUT_DIR+'/{dataset}/rawQC/logs/{fastq_file}_fastqc.log'
output:
fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html',
fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip'
fastqc_html = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.html',
fastqc_zip = OUT_DIR+'/{dataset}/rawQC/fastqc/{fastq_file}_fastqc.zip'
threads: cpus_avail
singularity: singularity_img
message:
@@ -39,7 +39,7 @@ rule fastqc_raw:
'''
# Take a random sample of reads and process them with fastQC
(reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \
fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \
fastqc -o {params.fastqc_dir}/{wildcards.dataset}/rawQC/fastqc -f fastq \
-t {threads} stdin:{wildcards.fastq_file}) 2> {log}
'''
@@ -47,10 +47,11 @@ rule fastqc_raw:
rule multiqc_raw_list_files:
input:
#all fastqc zip files per dataset
fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, value, FASTQS[ix])
fastqcs=lambda wildcards: ['{}/{}/rawQC/fastqc/{}_fastqc.zip'.format(OUT_DIR, value, FASTQS[ix])
for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
output:
multiqc_input_list = RAW_QC_REPORT+'/{dataset}_multiqc_inputs.txt'
#multiqc_input_list = RAW_QC_REPORT+'/{dataset}/multiqc_inputs.txt'
multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
run:
import os
try:
@@ -65,11 +66,14 @@ rule multiqc_raw_list_files:
# Run MultiQC on all FastQC reports
rule multiqc_raw:
input:
RAW_QC_REPORT + '/{dataset}_multiqc_inputs.txt'
multiqc_input_list = OUT_DIR+'/{dataset}/rawQC/multiqc_inputs.txt'
params:
multiqc_dir = OUT_DIR+'/{dataset}/rawQC/multiqc'
output:
multiqc_fastqc = RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt',
multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html',
category='rawQC', caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
multiqc_fastqc = OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt',
multiqc_report = report(OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc.html',
category='RawQC',
caption='../report/rawQC/rawQC_multiqc_fastqc.rst')
singularity:singularity_img
shell:
'''
@@ -78,23 +82,31 @@ rule multiqc_raw:
# Creates summarizing tables and plots
rule summarize_raw:
input:
RAW_QC_REPORT + '/{dataset}_multiqc_data/multiqc_fastqc.txt'
log:LOGS_DIR+'/raw_qc/{dataset}_summarize_raw.log'
OUT_DIR + '/{dataset}/rawQC/multiqc/{dataset}_multiqc_data/multiqc_fastqc.txt'
log:
OUT_DIR+'/{dataset}/rawQC/logs/summarize_raw.log'
output:
units_stats = report(RAW_QC_REPORT + '/{dataset}_units_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_units_stats.rst'),
samples_stats = report(RAW_QC_REPORT + '/{dataset}_samples_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_samples_stats.rst'),
run_lane_stats = report(RAW_QC_REPORT + '/{dataset}_run_lane_stats.txt',
category='rawQC', caption='../report/rawQC/rawQC_run_lane_stats.rst'),
units_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_units_read_dist.svg',
category='rawQC', caption='../report/rawQC/rawQC_units_read_dist.rst'),
samples_read_pairs_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dist.svg',
category='rawQC', caption='../report/rawQC/rawQC_samples_read_dist.rst'),
samples_read_pairs_dispersion_plot = report(RAW_QC_REPORT + '/{dataset}_samples_read_dispersion.svg',
category='rawQC', caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
fastqc_tests_status_plot = report(RAW_QC_REPORT + '/{dataset}_fastqc_tests_status.svg',
category='rawQC', caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
units_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_units_stats.rst'),
samples_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_samples_stats.rst'),
run_lane_stats = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_run_lane_stats.txt',
category='RawQC',
caption='../report/rawQC/rawQC_run_lane_stats.rst'),
units_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_units_read_dist.svg',
category='RawQC',
caption='../report/rawQC/rawQC_units_read_dist.rst'),
samples_read_pairs_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dist.svg',
category='RawQC',
caption='../report/rawQC/rawQC_samples_read_dist.rst'),
samples_read_pairs_dispersion_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_read_dispersion.svg',
category='RawQC',
caption='../report/rawQC/rawQC_samples_reads_dispersion.rst'),
fastqc_tests_status_plot = report(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_fastqc_tests_status.svg',
category='RawQC',
caption='../report/rawQC/rawQC_fastqc_tests_status.rst')
conda:'../envs/rawQC.yaml'
script:
'../scripts/rawQC_summarize.py'
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
def read_multiqc_fastp(mqc_fastp_json):
'''Parses the JSON file generated by MultiQC and returns the fastp results as a pandas DataFrame (used downstream for the tsv report).
'''
mqc_fastp_json_file = mqc_fastp_json
with open(mqc_fastp_json_file, 'r') as my_file:
json_data = my_file.read()
#json_obj = json.loads(json_data)
#multiqc_fastp_data = json_obj['report_general_stats_data'][0]
multiqc_fastp_data = json.loads(json_data)['report_general_stats_data'][0]
multiqc_fastp_data_df = pd.DataFrame.from_dict(multiqc_fastp_data,
orient='index')
return(multiqc_fastp_data_df)
def summarize_filter_human_step(mqc_filter_human_list):
'''Parses all the stats files included in mqc_filter_human_list and
creates a single merged table report.
Args:
mqc_filter_human_list: FilePath to the file containing a list of stats files
Returns:
A pandas.DataFrame
'''
try:
with open(mqc_filter_human_list, 'r') as my_file:
bfiltered_files = [f.strip() for f in my_file.readlines()]
except (TypeError, ValueError, IOError, EOFError) as e:
print(e)
else:
header = list()
multi_bfiltered_data = dict()
for ix, bfiltered_file in enumerate(bfiltered_files):
unit_name = bfiltered_file.split('/')[-1].split('.')[0]
with open(bfiltered_file) as f:
lines = f.read().split('\n')[:-1]
if ix==0:
header = lines[0].split('\t')
multi_bfiltered_data[unit_name] = lines[1].split('\t')
else:
multi_bfiltered_data[unit_name] = lines[1].split('\t')
header[0] = 'ref'
header[1] = 'pct_unambiguousReads'
header[3] = 'pct_ambiguousReads'
multi_bfiltered_stats = pd.DataFrame.from_dict(multi_bfiltered_data,
orient='index', columns=header)
return(multi_bfiltered_stats)
def summarize_preqc(mqc_trimadapters_fastps_json, mqc_filter_human_stats, mqc_trim3end_fastps_json):
"""Summarizes all the steps in the preQC workflow.
Args:
mqc_trimadapters_fastps_json: FilePath to the MultiQC JSON report from step trim_adapters
mqc_filter_human_stats: FilePath to the tsv report from step filter_human
mqc_trim3end_fastps_json: FilePath to the MultiQC JSON report from step trim_3end
Returns:
A pandas.DataFrame
"""
try:
trim_adapters_mqc_data = read_multiqc_fastp(mqc_trimadapters_fastps_json)
filter_human_mqc_data = pd.read_csv(mqc_filter_human_stats, sep='\t', header=0, index_col=0)
trim3end_mqc_data = read_multiqc_fastp(mqc_trim3end_fastps_json)
except (ValueError,TypeError) as e:
print(e)
else:
trim_adapters_mqc_tmp = pd.DataFrame()
trim_adapters_mqc_tmp['total_reads'] = trim_adapters_mqc_data['before_filtering_total_reads']
trim_adapters_mqc_tmp['trim_adapters_disc_reads'] = trim_adapters_mqc_tmp['total_reads'] - \
trim_adapters_mqc_data['after_filtering_total_reads']
filter_human_mqc_tmp = pd.DataFrame()
filter_human_mqc_tmp['filter_human_disc_reads'] = filter_human_mqc_data['assignedReads'].astype('int')
trim3end_mqc_tmp = pd.DataFrame()
trim3end_mqc_tmp['trim_3end_disc_reads'] = trim3end_mqc_data['before_filtering_total_reads'] - \
trim3end_mqc_data['after_filtering_total_reads']
tmp_df = pd.concat([trim_adapters_mqc_tmp, filter_human_mqc_tmp, trim3end_mqc_tmp], axis=1, join='outer', sort=True)
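# Reads surviving dedupe are exactly the reads entering trim_3end, so dedupe's
# discarded count = total - trim_adapters discards - filter_human discards - trim_3end input reads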
tmp_df.loc[trim3end_mqc_data.index,'dedupe_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
tmp_df.loc[trim3end_mqc_data.index,'trim_adapters_disc_reads'] - \
tmp_df.loc[trim3end_mqc_data.index,'filter_human_disc_reads'] - \
trim3end_mqc_data['before_filtering_total_reads'].values
tmp_df.loc[trim3end_mqc_data.index,'total_disc_reads'] = tmp_df.loc[trim3end_mqc_data.index,'total_reads'] - \
trim3end_mqc_data['after_filtering_total_reads']
tmp_df.loc[trim3end_mqc_data.index,'total_kept_reads'] = trim3end_mqc_data['after_filtering_total_reads']
# Adding columns with percentages of discarded reads in each step
tmp_df['trim_adapters_pct_disc'] = tmp_df['trim_adapters_disc_reads'] / tmp_df['total_reads']*100
tmp_df['filter_human_pct_disc'] = tmp_df['filter_human_disc_reads'] / tmp_df['total_reads']*100
tmp_df['dedupe_pct_disc'] = tmp_df['dedupe_disc_reads'] / tmp_df['total_reads']*100
tmp_df['trim_3end_pct_disc'] = tmp_df['trim_3end_disc_reads'] / tmp_df['total_reads']*100
tmp_df['total_pct_disc'] = tmp_df['total_disc_reads'] / tmp_df['total_reads']*100
tmp_df['total_pct_kept'] = tmp_df['total_kept_reads'] / tmp_df['total_reads']*100
# Re-order columns
tmp_columns = ['total_reads','trim_adapters_disc_reads','filter_human_disc_reads','dedupe_disc_reads','trim_3end_disc_reads'] + \
tmp_df.columns.tolist()[5:]
tmp_df = tmp_df[tmp_columns]
# Add sample, run, lane columns
sample_run_lane = pd.DataFrame.from_dict({unit:{'Sample':unit.split('-')[0],
'Run':unit.split('-')[1].split('_')[0],
'Lane':unit.split('-')[1].split('_')[1]
}
for unit in tmp_df.index
},
orient='index'
)
sample_run_lane = pd.concat([sample_run_lane, tmp_df], axis=1, join='outer', sort=False)
return(sample_run_lane)
def plot_preqc_summary(preqc_summary_df, by='units', plot_type='raw'):
"""Creates a radial bar chart from the dataframe generted by summarize_preqc.
Args:
preqc_summary_df: pd.DataFrame object generated by summarize_preqc()
by: either 'units' or 'samples'. it specifies how to aggregate the data.
plot_type: either 'raw' or 'pct'. If 'pct' a percent stacked barchar is returned.
Returns:
matplotlib.figure.Figure
"""
import inspect
by_options = ['units','samples']
type_options = ['raw','pct']
frame = inspect.currentframe()
args_type = ['pandas.DataFrame', by_options, type_options]
UNDEFINED = object()
def print_message(frame, args_type):
args, _, _, values = inspect.getargvalues(frame)
print('{}()'.format(inspect.getframeinfo(frame)[2]))
print('Args:')
for ix,arg in enumerate(args):
print(' {}: {}'.format(arg, str(args_type[ix])))
if (not isinstance(preqc_summary_df, pd.DataFrame)) or (by not in by_options) or (plot_type not in type_options):
print('Invalid arguments')
print_message(frame,args_type)
return(UNDEFINED)
else:
try:
if plot_type == 'raw':
columns_to_plot = ['trim_adapters_disc_reads',
'filter_human_disc_reads',
'dedupe_disc_reads',
'trim_3end_disc_reads',
'total_kept_reads']
sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
if by == 'samples':
sample_run_lane = sample_run_lane.groupby(['Sample']).agg('sum')
bottom_lim = 0.0
top_lim = sample_run_lane['total_reads'].max()
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottom_spacing = top_lim/1.8
rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
rgrids_positions_labels = [' {}M'.format(nr)
for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
else:
columns_to_plot = ['trim_adapters_pct_disc',
'filter_human_pct_disc',
'dedupe_pct_disc',
'trim_3end_pct_disc',
'total_pct_kept']
sample_run_lane = preqc_summary_df[['total_reads','Sample']+columns_to_plot].copy()
if by == 'samples':
sample_run_lane = sample_run_lane.groupby(['Sample']).agg('mean')
bottom_lim = 0.0
top_lim = 100
bottom_spacing = top_lim/1.8
rgrids_values = [25,50,75,100]
rgrids_positions = np.add(rgrids_values,bottom_spacing)
rgrids_positions_labels = [' {}%'.format(nr) for nr in rgrids_values]
sample_run_lane.sort_values(by=['total_reads'], inplace=True)
# Colors for each column (nested bars inside each bar)
barColors = ['#fdb863', '#e08214', '#b35806', '#542788','#8073ac'][::-1]
except (TypeError, ValueError) as e:
print(e)
else:
# This variable was used for testing purposes to see how fold increases
# of the datapoints affect the plot
lenMult = 1
# Nr of datapoints to plot
nr_h_bars = len(sample_run_lane)*lenMult
# This sets the number of x-positions left empty at the beginning
# so that enough space is left for y-labels
nr_empty_bars = int(np.ceil(nr_h_bars/40))
empty_bars = [0]*nr_empty_bars
empty_bars_labels = ['']*len(empty_bars)
# re-set the number of x-positions (data points) to plot
nr_h_bars = nr_h_bars+nr_empty_bars
# The positions of the bars on the x-axis
# Each position corresponds to an angle (in radians) on a unit circumference (radius is 1)
# The circumference (2pi) is divided into nr_h_bars sections
r= np.arange(0,2*np.pi,2*np.pi/nr_h_bars) # for radial barchart
r= r[0:nr_h_bars]
# Bar width
barWidth = (2*np.pi)/(nr_h_bars)*0.9 # for radial barchart
# Define the columns to use for the stacked bars,
# values of each column are plotted on top of one another
barColumns = columns_to_plot[::-1]
# This uses the given y-units of bottom spacing for each bar
# You may increase this number a bit to have more spacing between bars and text.
bottomBars = [bottom_spacing]*nr_h_bars
# Let's set the figure size depending on the number of data points to plot
# Labels remain readable up to 1000 datapoints
# Consider adding additional elif scenarios to increase size for larger datasets
if nr_h_bars <=200:
fig = plt.figure(figsize=(10, 10))
font_size = 7.5
elif nr_h_bars <= 500:
fig = plt.figure(figsize=(20, 20))
font_size = 'x-small'
else:
fig = plt.figure(figsize=(28, 28))
font_size = 6
# Here we do the actual plotting
# ------------------------------
# Values for each column for a datapoint (i.e. sequencing unit, or sample)
# are plotted on top of one another
ax = fig.add_axes([0.1,0.1,0.75,0.75], projection='polar')
for ix,column in enumerate(barColumns):
bar_values=empty_bars+sample_run_lane[column].tolist()*lenMult
bars=ax.bar(r, bar_values, bottom=bottomBars,
color=barColors[ix], edgecolor='white', width=barWidth, label=column)
bottomBars = np.add(bottomBars,bar_values).tolist()
# Here we add annotations (e.g labels, legend)
# --------------------------------------------
rotations = np.rad2deg(r)
y0,y1 = ax.get_ylim()
bar_labels= empty_bars_labels + sample_run_lane.index.tolist()*lenMult
for x, y_pos, rotation, label in zip(r, bottomBars, rotations, bar_labels):
offset = (y_pos)/(y1-y0)
lab = ax.text(0, 0, label, fontsize=font_size, transform=None, ha='center', va='center', fontstretch='ultra-condensed')
renderer = ax.figure.canvas.get_renderer()
bbox = lab.get_window_extent(renderer=renderer)
invb = ax.transData.inverted().transform([[0,0],[bbox.width,0] ])
lab.set_position((x,offset+(invb[1][0]-invb[0][0])/2.*2.7) )
lab.set_transform(ax.get_xaxis_transform())
lab.set_rotation(rotation)
#rgrids_positions = sample_run_lane['total_reads'].describe()[[3,5,7]].values + bottom_spacing
#rgrids_positions_labels = [' {}M'.format(nr)
# for ix,nr in enumerate(np.round((rgrids_positions-bottom_spacing)/1000000, decimals=3))]
ax.set_rgrids(rgrids_positions,
labels=rgrids_positions_labels,
angle=np.rad2deg(r[int(nr_empty_bars/2-1)]),
family='serif', ha='left', va='center', weight='bold',fontstretch='ultra-condensed', fontsize=8.5)
ax.legend(bbox_to_anchor=(0.5, 0.5), loc='center', frameon=False, title='MetaSnk: preQC')
# A few extra edits for aesthetics
# -------------------------------
ax.xaxis.grid(False)
ax.spines["polar"].set_visible(False)
ax.set_xticklabels([])
return(fig)
\ No newline at end of file
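
A hypothetical usage sketch for the helpers above (file names are placeholders
for the MultiQC/bbsplit outputs produced by the preQC rules):

preqc_df = summarize_preqc('trim_adapters_multiqc.json',
                           'filter_human_stats.tsv',
                           'trim_3end_multiqc.json')
fig = plot_preqc_summary(preqc_df, by='samples', plot_type='pct')
fig.savefig('preqc_samples_pct_barchart.svg')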