Skip to content
Snippets Groups Projects
Commit 2ca82660 authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

add target rule rawQC to perform quality check of raw paired fastq files.

parent e4d76cea
Branches
No related tags found
No related merge requests found
......@@ -5,7 +5,7 @@
## Description
MetagenomicSnake is a new Snakemake workflow for analysis of metagenomic datasets from human microbiomes.
MetagenomicSnake is a Snakemake workflow for the analysis of metagenomic datasets from human microbiomes.
## Authors
......
# The main entry point of your workflow.
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
# After configuring, running snakemake -n in a clone of this repository should
# successfully execute a dry-run of the workflow.
report: "report/MetagenomicSnake.rst"
configfile: "config.yaml"
report: "report/workflow.rst"
import os
##----------------------------------------------------------------------------##
## Check available resources
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
# Allow users to fix the underlying OS via singularity.
singularity: "docker://continuumio/miniconda3"
##----------------------------------------------------------------------------##
## Working directory
##----------------------------------------------------------------------------##
# Set working directory
workdir_path = os.path.dirname(os.path.abspath(__name__))
workflow_path = workflow.basedir
if workdir_path == workflow_path:
message = "Working directory was not specified!"+\
"MetagenomicSnake assumes ./data, relative to current directory, "+\
"as your working directory ..."
print(message)
if os.path.exists(workflow_path+'/data'):
workdir_path = workflow_path+'/data'
workdir: workdir_path
print("Working directory:{}".format(workdir_path))
else:
print("... Folder ./data not found in current directory...")
print("... instead, setting current directory as working directory ...")
print("Working directory:{}".format(workdir_path))
else:
print("Working directory:{}".format(workdir_path))
##----------------------------------------------------------------------------##
## Configuration of MetagenomicSnake
##----------------------------------------------------------------------------##
try:
configfile_path = config['configfile_path']
print("Configuration file: {}".format(configfile_path))
except:
print("Configuration file config.yaml not especified at execution!")
try:
print("... Trying working directory ...")
configfile_path = "config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
except:
print("... config.yaml not found in working directory ...")
print("... Loading default config.yaml provided with MetagenomicSnake...")
configfile_path = workflow_path + "/config.yaml"
configfile: configfile_path
print("... Configuration file: {}".format(configfile_path))
##----------------------------------------------------------------------------##
## Define paths
##----------------------------------------------------------------------------##
RAW_FASTQ_DIR = 'raw/fastq'
RESULTS_DIR = config['PREFIX_DIR'] +'/results'
LOGS_DIR = config['PREFIX_DIR'] +'/logs'
REPORTS_DIR = config['PREFIX_DIR'] +'/reports'
RAW_QC_DIR = RESULTS_DIR + '/rawQC'
RAW_QC_REPORT = REPORTS_DIR + '/rawQC'
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
if config['SAMPLE_UNITS']['auto']:
(DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz')
(DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz')
else:
# TODO:
pass
##----------------------------------------------------------------------------##
## Run entire workflow
##----------------------------------------------------------------------------##
rule all:
input:
# The first rule should define the default target files
# Subsequent target rules can be specified below. They should start with all_*.
# Subsequent target rules can be specified below. They should start with
# all_*.
##----------------------------------------------------------------------------##
## Modules
##----------------------------------------------------------------------------##
rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
include: "rules/other.smk"
include: "rules/rawQC.smk"
# This file should contain everything to configure the workflow on a global scale.
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
# PATH to folder where MetagenomicSnake will store results
PREFIX_DIR: 'MetagenomicSnake_results'
#
SAMPLE_UNITS:
auto: true
#-------------------------------------------------------------------------------
# Modules
#-------------------------------------------------------------------------------
rawQC:
samplerate: 0.1
preprocess:
Workflow version 0.1
MetagenomicSnake
================
MetagenomicSnake is a Snakemake workflow for the analysis of metagenomic
datasets from human microbiomes.
----
Modules
-------
rawQC
'''
Author: Monica R. Ticlla
Afiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for QC and pre-processing of paired-end shotgun DNA
metagenomic sequencing.
'''
localrules:
multiqc_raw_listing_files,
multiqc_raw
##---------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
# This rule only processess a subset of reads (10%) per fastq file, and only
# for R1. 'R1' was removed from names in html and zip outputs
rule fastqc_raw:
input:
#'data/raw/fastq/{dataset}/{fastq_file}-R1.fastq.gz'
lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value)
for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file]
params:
fastqc_dir = RAW_QC_DIR,
samplerate = config['rawQC']['samplerate']
log:
LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log'
output:
fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html',
fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip'
threads: cpus_avail
singularity: singularity_img
shell:
'''
# checks if output dir exist, created otherwise
[ ! -d {params.fastqc_dir}/{wildcards.dataset} ] && \
mkdir {params.fastqc_dir}/{wildcards.dataset}
# Take a random sample of reads (1%) and process them with fastQC
(reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \
fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \
-t {threads} stdin:{wildcards.fastq_file}) 2> {log}
'''
# List files for MultiQC
rule multiqc_raw_listing_files:
input:
#all fastqc zip files in a dataset
fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, value, FASTQS[ix])
for ix,value in enumerate(DATASETSX) if value==wildcards.dataset]
output:
multiqc_input_list = RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt'
run:
import os
try:
os.makedirs(os.path.dirname(output.multiqc_input_list))
except OSError:
pass
with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
for item in input.fastqcs:
out.write("%s\n" % item)
#
rule multiqc_raw:
input:
RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt'
output:
multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html',
category='rawQC')
singularity:singularity_img
shell:
'''
multiqc --file-list {input} --filename {output.multiqc_report}
'''
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment