Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MetagenomicSnake
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
TBRU
MetagenomicSnake
Commits
2ca82660
Commit
2ca82660
authored
5 years ago
by
Ticlla Ccenhua Monica Roxana
Browse files
Options
Downloads
Patches
Plain Diff
add target rule rawQC to perform quality check of raw paired fastq files.
parent
e4d76cea
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
README.md
+1
-1
1 addition, 1 deletion
README.md
Snakefile
+83
-7
83 additions, 7 deletions
Snakefile
config.yaml
+13
-0
13 additions, 0 deletions
config.yaml
report/MetagenomicSnake.rst
+12
-0
12 additions, 0 deletions
report/MetagenomicSnake.rst
rules/rawQC.smk
+76
-0
76 additions, 0 deletions
rules/rawQC.smk
with
185 additions
and
8 deletions
README.md
+
1
−
1
View file @
2ca82660
...
...
@@ -5,7 +5,7 @@
## Description
MetagenomicSnake is a
new
Snakemake workflow for analysis of metagenomic datasets from human microbiomes.
MetagenomicSnake is a Snakemake workflow for
the
analysis of metagenomic datasets from human microbiomes.
## Authors
...
...
This diff is collapsed.
Click to expand it.
Snakefile
+
83
−
7
View file @
2ca82660
# The main entry point of your workflow.
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
# After configuring, running snakemake -n in a clone of this repository should
# successfully execute a dry-run of the workflow.
report: "report/MetagenomicSnake.rst"
configfile: "config.yaml"
report: "report/workflow.rst"
import os
##----------------------------------------------------------------------------##
## Check available resources
##----------------------------------------------------------------------------##
from multiprocessing import cpu_count
cpus_avail = cpu_count()
# Allow users to fix the underlying OS via singularity.
singularity: "docker://continuumio/miniconda3"
##----------------------------------------------------------------------------##
## Working directory
##----------------------------------------------------------------------------##
# Set working directory
# The directory Snakemake was started in is the working directory. When that
# directory is the one holding this Snakefile (i.e. no separate working
# directory was chosen), ./data is used instead, provided it exists.
workdir_path = os.getcwd()
workflow_path = workflow.basedir
if workdir_path == workflow_path:
    # Adjacent string pieces need an explicit separating space, otherwise the
    # two sentences run together in the printed message.
    message = ("Working directory was not specified! "
               "MetagenomicSnake assumes ./data, relative to current directory, "
               "as your working directory ...")
    print(message)
    data_path = os.path.join(workflow_path, 'data')
    # isdir (not exists): a regular file called "data" cannot be a workdir.
    if os.path.isdir(data_path):
        workdir_path = data_path
        workdir: workdir_path
    else:
        print("... Folder ./data not found in current directory...")
        print("... instead, setting current directory as working directory ...")
# Reported once, whichever branch decided the final value.
print("Working directory:{}".format(workdir_path))
##----------------------------------------------------------------------------##
## Configuration of MetagenomicSnake
##----------------------------------------------------------------------------##
# Decide which configuration file to report/load, in order of preference:
#   1. a path already present in config under the key 'configfile_path'
#      (it is only reported here, not loaded by this block),
#   2. config.yaml in the working directory,
#   3. the default config.yaml located next to this Snakefile.
try:
    configfile_path = config['configfile_path']
    print("Configuration file: {}".format(configfile_path))
except KeyError:
    # Only a missing key means "not specified"; anything else should surface.
    print("Configuration file config.yaml not specified at execution!")
    try:
        print("... Trying working directory ...")
        configfile_path = "config.yaml"
        configfile: configfile_path
        print("... Configuration file: {}".format(configfile_path))
    except Exception:
        # Deliberate best-effort fallback: any failure to load the local file
        # leads to the bundled default. Unlike a bare "except:", this does not
        # intercept KeyboardInterrupt or SystemExit.
        print("... config.yaml not found in working directory ...")
        print("... Loading default config.yaml provided with MetagenomicSnake...")
        configfile_path = workflow_path + "/config.yaml"
        configfile: configfile_path
        print("... Configuration file: {}".format(configfile_path))
##----------------------------------------------------------------------------##
## Define paths
##----------------------------------------------------------------------------##
# Input location of the raw fastq files (a relative path).
RAW_FASTQ_DIR = 'raw/fastq'
# Output locations, all rooted at config['PREFIX_DIR'].
RESULTS_DIR = '{}/results'.format(config['PREFIX_DIR'])
LOGS_DIR = '{}/logs'.format(config['PREFIX_DIR'])
REPORTS_DIR = '{}/reports'.format(config['PREFIX_DIR'])
# Sub-folders for the rawQC results and reports.
RAW_QC_DIR = '{}/rawQC'.format(RESULTS_DIR)
RAW_QC_REPORT = '{}/rawQC'.format(REPORTS_DIR)
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
if config['SAMPLE_UNITS']['auto']:
    # Discover inputs from the names of the R1 files found under
    # RAW_FASTQ_DIR. The first pattern splits each file name into
    # sample/run/lane; the second keeps the whole name as one wildcard.
    # Each call returns parallel lists, one entry per matching file.
    (DATASETS, SAMPLES, RUNS, LANES) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{sample}-{run}_{lane}-R1.fastq.gz')
    (DATASETSX, FASTQS) = glob_wildcards(RAW_FASTQ_DIR+'/{dataset}/{fastq_file}-R1.fastq.gz')
else:
    # TODO: support explicitly listed sample units.
    # Fail here with a clear message: otherwise DATASETS/DATASETSX/FASTQS stay
    # undefined and the workflow stops later with an unrelated NameError.
    raise NotImplementedError(
        "SAMPLE_UNITS: auto: false is not supported yet; "
        "set SAMPLE_UNITS: auto: true in the configuration file.")
##----------------------------------------------------------------------------##
## Run entire workflow
##----------------------------------------------------------------------------##
rule all:
input:
# The first rule should define the default target files
# Subsequent target rules can be specified below. They should start with all_*.
# Subsequent target rules can be specified below. They should start with
# all_*.
##----------------------------------------------------------------------------##
## Modules
##----------------------------------------------------------------------------##
rule rawQC:
input:
expand(RAW_QC_REPORT + '/{dataset}_multiqc.html', dataset=set(DATASETS))
include: "rules/
other
.smk"
include: "rules/
rawQC
.smk"
This diff is collapsed.
Click to expand it.
config.yaml
+
13
−
0
View file @
2ca82660
# This file should contain everything to configure the workflow on a global scale.
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
# PATH to folder where MetagenomicSnake will store results
PREFIX_DIR
:
'
MetagenomicSnake_results'
#
SAMPLE_UNITS
:
auto
:
true
#-------------------------------------------------------------------------------
# Modules
#-------------------------------------------------------------------------------
rawQC
:
samplerate
:
0.1
preprocess
:
This diff is collapsed.
Click to expand it.
report/MetagenomicSnake.rst
0 → 100644
+
12
−
0
View file @
2ca82660
Workflow version 0.1
MetagenomicSnake
================
MetagenomicSnake is a Snakemake workflow for the analysis of metagenomic
datasets from human microbiomes.
----
Modules
-------
rawQC
This diff is collapsed.
Click to expand it.
rules/rawQC.smk
0 → 100644
+
76
−
0
View file @
2ca82660
'''
Author: Monica R. Ticlla
Affiliation(s): SIB, SwissTPH, UNIBAS
Description: rules for QC and pre-processing of paired-end shotgun DNA
metagenomic sequencing.
'''
localrules:
multiqc_raw_listing_files,
multiqc_raw
##---------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
##----------------------------------------------------------------------------##
## Rules with target files
##----------------------------------------------------------------------------##
# This rule only processes a random subset of the reads in each fastq file
# (fraction = config['rawQC']['samplerate'], 10% with the shipped config.yaml),
# and only for R1. 'R1' was removed from names in html and zip outputs.
rule fastqc_raw:
    input:
        # Match on BOTH wildcards: filtering on the file name alone would pick
        # up same-named files from every dataset, not just the requested one.
        lambda wildcards: ['{}/{}/{}-R1.fastq.gz'.format(RAW_FASTQ_DIR, dataset, fastq_file)
                           for dataset, fastq_file in zip(DATASETSX, FASTQS)
                           if dataset == wildcards.dataset
                           and fastq_file == wildcards.fastq_file]
    params:
        fastqc_dir = RAW_QC_DIR,
        samplerate = config['rawQC']['samplerate']
    log:
        LOGS_DIR+'/raw_qc/{dataset}_fastqc/{fastq_file}.log'
    output:
        fastqc_html = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.html',
        fastqc_zip = RAW_QC_DIR+'/{dataset}_fastqc/{fastq_file}_fastqc.zip'
    threads: cpus_avail
    singularity: singularity_img
    shell:
        '''
        # Make sure the directory given to fastqc -o exists. It is the
        # dataset folder with the _fastqc suffix, and -p also creates parents.
        mkdir -p {params.fastqc_dir}/{wildcards.dataset}_fastqc
        # Take a random sample of reads (fraction set by samplerate) and
        # process them with fastQC
        (reformat.sh in={input} out=stdout.fq samplerate={params.samplerate} | \
        fastqc -o {params.fastqc_dir}/{wildcards.dataset}_fastqc -f fastq \
        -t {threads} stdin:{wildcards.fastq_file}) 2> {log}
        '''
# List files for MultiQC
# For one dataset, writes a text file with the path of every FastQC zip
# archive belonging to that dataset, one path per line.
rule multiqc_raw_listing_files:
    input:
        #all fastqc zip files in a dataset
        fastqcs=lambda wildcards: ['{}/{}_fastqc/{}_fastqc.zip'.format(RAW_QC_DIR, dataset, fastq_file)
                                   for dataset, fastq_file in zip(DATASETSX, FASTQS)
                                   if dataset == wildcards.dataset]
    output:
        multiqc_input_list = RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt'
    run:
        import os
        # exist_ok tolerates an already existing folder while still raising on
        # real failures (e.g. permissions) instead of silently ignoring them.
        os.makedirs(os.path.dirname(output.multiqc_input_list), exist_ok=True)
        with open(output.multiqc_input_list, mode='w', encoding='utf-8') as out:
            out.writelines("%s\n" % item for item in input.fastqcs)
#
# Runs MultiQC for one dataset. The input is the per-dataset text file whose
# path pattern matches the output of multiqc_raw_listing_files; it is passed
# to multiqc through --file-list. The resulting html is wrapped in report()
# with category 'rawQC'.
rule multiqc_raw:
    input:
        RAW_QC_DIR+'/{dataset}_multiqc_inputs.txt'
    output:
        multiqc_report = report(RAW_QC_REPORT + '/{dataset}_multiqc.html',
                                category='rawQC')
    singularity:singularity_img
    shell:
        '''
        multiqc --file-list {input} --filename {output.multiqc_report}
        '''
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment