diff --git a/README.md b/README.md index 5716c3c9d47efdb0fa0256027972b6a91bceffd4..409479e1986ae6f00bed015ea33e57d2cbaddfa5 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ## Description -MetagenomicSnake is a new Snakemake workflow for analysis of metagenomic datasets from human microbiomes. +MetagenomicSnake is a Snakemake workflow for the analysis of metagenomic datasets from human microbiomes. ## Authors diff --git a/Snakefile b/Snakefile index c7b5bbb4dec0e2ce432b1e5adfc3b2a0e9bed979..3d3055ae4ecc16dc7766029b89c56be4e06fed40 100644 --- a/Snakefile +++ b/Snakefile @@ -1,18 +1,94 @@ # The main entry point of your workflow. -# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow. +# After configuring, running snakemake -n in a clone of this repository should +# successfully execute a dry-run of the workflow. +report: "report/MetagenomicSnake.rst" -configfile: "config.yaml" -report: "report/workflow.rst" +import os +##----------------------------------------------------------------------------## +## Check available resources +##----------------------------------------------------------------------------## +from multiprocessing import cpu_count +cpus_avail = cpu_count() -# Allow users to fix the underlying OS via singularity. -singularity: "docker://continuumio/miniconda3" +##----------------------------------------------------------------------------## +## Working directory +##----------------------------------------------------------------------------## +# Set working directory +workdir_path = os.path.dirname(os.path.abspath(__name__)) +workflow_path = workflow.basedir +if workdir_path == workflow_path: + message = "Working directory was not specified!"+\ + "MetagenomicSnake assumes ./data, relative to current directory, "+\ + "as your working directory ..." + print(message) + if os.path.exists(workflow_path+'/data'): + workdir_path = workflow_path+'/data' + workdir: workdir_path + print("Working directory:{}".format(workdir_path)) + else: + print("... Folder ./data not found in current directory...") + print("... instead, setting current directory as working directory ...") + print("Working directory:{}".format(workdir_path)) +else: + print("Working directory:{}".format(workdir_path)) +##----------------------------------------------------------------------------## +## Configuration of MetagenomicSnake +##----------------------------------------------------------------------------## +try: + configfile_path = config['configfile_path'] + print("Configuration file: {}".format(configfile_path)) +except: + print("Configuration file config.yaml not especified at execution!") + try: + print("... Trying working directory ...") + configfile_path = "config.yaml" + configfile: configfile_path + print("... Configuration file: {}".format(configfile_path)) + except: + print("... config.yaml not found in working directory ...") + print("... Loading default config.yaml provided with MetagenomicSnake...") + configfile_path = workflow_path + "/config.yaml" + configfile: configfile_path + print("... Configuration file: {}".format(configfile_path)) + +##----------------------------------------------------------------------------## +## Define paths +##----------------------------------------------------------------------------## +RAW_FASTQ_DIR = 'raw/fastq' +RESULTS_DIR = config['PREFIX_DIR'] +'/results' +LOGS_DIR = config['PREFIX_DIR'] +'/logs' +REPORTS_DIR = config['PREFIX_DIR'] +'/reports' + +RAW_QC_DIR = RESULTS_DIR + '/rawQC' +RAW_QC_REPORT = REPORTS_DIR + '/rawQC' + +##----------------------------------------------------------------------------## +## Fastq files to be processed +##----------------------------------------------------------------------------## +if config['SAMPLE_UNITS']['auto']: + (DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz') + (DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz') +else: + # TODO: + pass + +##----------------------------------------------------------------------------## +## Run entire workflow +##----------------------------------------------------------------------------## rule all: input: # The first rule should define the default target files - # Subsequent target rules can be specified below. They should start with all_*. + # Subsequent target rules can be specified below. They should start with + # all_*. +##----------------------------------------------------------------------------## +## Modules +##----------------------------------------------------------------------------## +rule rawQC: + input: + expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS)) -include: "rules/other.smk" +include: "rules/rawQC.smk" diff --git a/config.yaml b/config.yaml index d73d255977455dcc21b13496ff4ceee656cf1c0d..c618aa20b39e403178384f91e7ac6307a9fe2586 100644 --- a/config.yaml +++ b/config.yaml @@ -1,3 +1,16 @@ # This file should contain everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. + +# PATH to folder where MetagenomicSnake will store results +PREFIX_DIR: 'MetagenomicSnake_results' + +# +SAMPLE_UNITS: + auto: true +#------------------------------------------------------------------------------- +# Modules +#------------------------------------------------------------------------------- +rawQC: + samplerate: 0.1 +preprocess: diff --git a/report/MetagenomicSnake.rst b/report/MetagenomicSnake.rst new file mode 100644 index 0000000000000000000000000000000000000000..514a64112b95e77d210782054b7d7f88b75d616c --- /dev/null +++ b/report/MetagenomicSnake.rst @@ -0,0 +1,12 @@ +Workflow version 0.1 + +MetagenomicSnake +================ +MetagenomicSnake is a Snakemake workflow for the analysis of metagenomic +datasets from human microbiomes. + +---- + +Modules +------- + rawQC diff --git a/rules/rawQC.smk b/rules/rawQC.smk new file mode 100644 index 0000000000000000000000000000000000000000..5bb0bcb0e869cfc7612a5fcbdfb8495b6247fa67 --- /dev/null +++ b/rules/rawQC.smk @@ -0,0 +1,76 @@ +''' +Author: Monica R. Ticlla +Afiliation(s): SIB, SwissTPH, UNIBAS +Description: rules for QC and pre-processing of paired-end shotgun DNA +metagenomic sequencing. +''' +localrules: + multiqc_raw_listing_files, + multiqc_raw + +##---------------------------------------------------------------------------## +## Local variables +##----------------------------------------------------------------------------## +singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1' + +##----------------------------------------------------------------------------## +## Rules with target files +##----------------------------------------------------------------------------## +# This rule only processess a subset of reads (10%) per fastq file, and only +# for R1. 'R1' was removed from names in html and zip outputs +rule fastqc_raw: + input: + #'data/raw/fastq/{dataset}/{fastq_file}-R1.fastq.gz' + lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, DATASETSX[ix], value) + for ix,value in enumerate(FASTQS) if value==wildcards.fastq_file] + params: + fastqc_dir = RAW_QC_DIR, + samplerate = config['rawQC']['samplerate'] + log: + LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log' + output: + fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html', + fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip' + threads: cpus_avail + singularity: singularity_img + shell: + ''' + # checks if output dir exist, created otherwise + [ ! -d {params.fastqc_dir}/{wildcards.dataset} ] && \ + mkdir {params.fastqc_dir}/{wildcards.dataset} + # Take a random sample of reads (1%) and process them with fastQC + (reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \ + fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \ + -t {threads} stdin:{wildcards.fastq_file}) 2> {log} + ''' + +# List files for MultiQC +rule multiqc_raw_listing_files: + input: + #all fastqc zip files in a dataset + fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, value, FASTQS[ix]) + for ix,value in enumerate(DATASETSX) if value==wildcards.dataset] + output: + multiqc_input_list = RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt' + run: + import os + try: + os.makedirs(os.path.dirname(output.multiqc_input_list)) + except OSError: + pass + + with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out: + for item in input.fastqcs: + out.write("%s\n" % item) +# +rule multiqc_raw: + input: + RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt' + output: + multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html', + category='rawQC') + singularity:singularity_img + shell: + ''' + multiqc --file-list {input} --filename {output.multiqc_report} + '''