diff --git a/README.md b/README.md
index b4535b5a7c57b975ba4c9fa2d7b40c29d57f003d..56e8b0b5345adf4f5f77ebe85e33fba90dabc128 100644
--- a/README.md
+++ b/README.md
@@ -11,24 +11,51 @@ MetaSnk is a reproducible and scalable modularized Snakemake workflow for the an
 MetaSnk wraps system and software dependencies within Singularity containers.
 
 ### Modules:
- - [rawQC](README_rawQC.md)
+ - **[rawQC](README_rawQC.md)**:
+   It runs FastQC on a random sample of R1 reads from the paired fastq-format files.
 
   <div style="text-align:center">
     <img src="./images/rawQC_dag.png" width="130" height="250" />
   </div>
 
- - [preQC](README_preQC.md)
+ - **[preQC](README_preQC.md)**:
+   FastQC only performs a quality check; no QC processing is done. The **preQC**
+   rule runs a multi-step pre-processing of the paired fastq files, which includes:
+
+   - **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check,
+     and both paired fastq files are processed as follows:
+
+     + remove adapters: here we provide the Nextera XT adapters,
+     + base correction in overlapped regions
+     + trimming of the last base in read 1
+     + discard reads shorter than a minimum length, after trimming
+     + a report with quality check, before and after processing
+   - **filter_human**: removal of reads derived from human DNA with BBTools' [bbsplit](http://seqanswers.com/forums/showthread.php?t=41288)
+   - **dedupe**: removal of duplicated reads with BBTools' [clumpify](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/clumpify-guide/)
+   - **trim_3end**: 3'-end quality trimming with "fastp"
+   - **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
+   - **summarize_preQC**: creates summarizing tables and plots
 
   <div style="text-align:center">
     <img src="./images/preQC_dag.png" width="350" height="500" />
   </div>
 
- - [PhlAnProf](README_phlanprof.md)
+ - **[PhlAnProf](README_phlanprof.md)**:
+   It performs taxonomic and strain-level profiling using MetaPhlAn2 and StrainPhlAn.
+   If pre-processing (preQC) was not performed, PhlAnProf will trigger its execution.
 
   <div style="text-align:center">
     <img src="./images/PhlAnProf_dag.png" width="400" height="800" />
   </div>
 
+ - **HUMAnN2Prof**:
+   It performs gene- and pathway-level functional profiling using HUMAnN2. If pre-processing (preQC)
+   and taxonomic profiling with MetaPhlAn2 were not performed, it will trigger their execution.
+
+   <div style="text-align:center">
+     <img src="./images/HUMAnN2Prof_dag.png" width="300" height="800" />
+   </div>
+
 ### Authors
 
 * Monica R. Ticlla (@mticllacc)
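For orientation, the **trim_adapters** step described in the preQC entry above maps roughly onto a fastp call of the following shape. This is a sketch only: the file names, the adapter FASTA (`NexteraPE.fa`) and the 50 bp length cutoff are placeholders, and the authoritative command lives in the preQC rules (`rules/preprocess.smk`), which this diff does not touch.

```bash
# Sketch of the trim_adapters step: adapter removal from a provided adapter
# FASTA, base correction in overlapped regions, trimming of the last base of
# read 1, a minimum-length filter, and JSON/HTML QC reports.
# File names, NexteraPE.fa and the 50 bp cutoff are illustrative placeholders.
fastp \
  --in1 sample-R1.fastq.gz \
  --in2 sample-R2.fastq.gz \
  --out1 sample.trimmed-R1.fastq.gz \
  --out2 sample.trimmed-R2.fastq.gz \
  --adapter_fasta NexteraPE.fa \
  --correction \
  --trim_tail1 1 \
  --length_required 50 \
  --json sample.fastp.json \
  --html sample.fastp.html
```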
@@ -110,7 +137,7 @@ The singularity image files (.sif) will be stored in $METASNK_DBS/singularity.
 MetaSnK uses reference databases that need to be downloaded to the $METASNK_DBS directory:
 
   <div style="text-align:center">
-    <img src="./images/buildDBS_dag.png" width="500" height="200" />
+    <img src="./images/buildDBS_dag.png" width="600" height="200" />
   </div>
 
     snakemake --profile ./profiles/local buildDBS
diff --git a/Snakefile b/Snakefile
index d56cc9beecd7835fca8e1e32e874175a98875241..27daeb26bd205c67d32e3ef35716dc43333ce846 100644
--- a/Snakefile
+++ b/Snakefile
@@ -100,13 +100,15 @@
 localrules: buildDBS,
     rawQC,
     preQC,
-    PhlAnProf,
     MetaPhlAn2,
     StrainPhlAn,
+    PhlAnProf,
+    HUMAnN2Prof,
     rawQC_make_report,
     preQC_make_report,
     MetaPhlAn2_make_report,
-    PhlAnProf_make_report
+    PhlAnProf_make_report,
+    HUMAnN2Prof_make_report
 ##----------------------------------------------------------------------------##
 ## Run entire workflow
 ##----------------------------------------------------------------------------##
@@ -162,6 +164,9 @@ rule PhlAnProf:
     input:
         expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS)),
         expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/{dataset}_clades_profiled.tsv', dataset=set(DATASETS))
+rule HUMAnN2Prof:
+    input:
+        expand(OUT_DIR + '/{dataset}/Humann2/all.done', dataset=set(DATASETS))
 ##----------------------------------------------------------------------------##
 ## Rules to make reports
@@ -233,7 +238,25 @@ rule PhlAnProf_make_report:
         --report {params.report_path} \
         -s {params.workflow_dir}/Snakefile PhlAnProf) &>{log}
         '''
+rule HUMAnN2Prof_make_report:
+    output:
+        temp(touch(OUT_DIR+'/logs/humann2prof_make_report.done'))
+    log:
+        OUT_DIR+'/logs/humann2prof_make_report.log'
+    params:
+        report_path = OUT_DIR+'/HUMAnN2Prof_report.html',
+        workdir = workdir_path,
+        workflow_dir = workflow_path
+    shell:
+        '''
+        (snakemake \
+        --directory={params.workdir} \
+        --report {params.report_path} \
+        -s {params.workflow_dir}/Snakefile HUMAnN2Prof) &>{log}
+        '''
+
 include: "rules/setup_rules.smk"
 include: "rules/rawQC.smk"
 include: "rules/preprocess.smk"
 include: "rules/phlanprof.smk"
+include: "rules/humann2.smk"
diff --git a/images/HUMAnN2Prof_dag.png b/images/HUMAnN2Prof_dag.png
new file mode 100644
index 0000000000000000000000000000000000000000..05c02300e88bc8a26b8b1fbe9395be42a2e4aeb6
Binary files /dev/null and b/images/HUMAnN2Prof_dag.png differ
diff --git a/report/MetagenomicSnake.rst b/report/MetagenomicSnake.rst
index 3418a59e6f3b9ccfc68d91eef4777824aa8d71d0..3085000d384f4c7e90cec4ba2815cdf5be8a586c 100644
--- a/report/MetagenomicSnake.rst
+++ b/report/MetagenomicSnake.rst
@@ -1,4 +1,4 @@
-Workflow version 0.2
+Workflow version 0.3
 
 MetaSnk
 ================
@@ -29,10 +29,14 @@ Modules
     - **trim_3end**: 3\'-end quality trimming with "fastp"
     - **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
     - **summarize_preQC**: creates summarizing tables and plots
-
+
 **PhlAnProf**
-    performs taxonmic and strain-level profiling using MetaPhlAn2 and StrainPhlAn.
-
+    performs taxonomic and strain-level profiling using MetaPhlAn2 and StrainPhlAn.
+    If pre-processing was not performed, PhlAnProf will trigger execution of preQC.
+
+**HUMAnN2Prof**
+    performs gene- and pathway-level functional profiling using HUMAnN2. If pre-processing (preQC)
+    and taxonomic profiling with MetaPhlAn2 were not performed, it will trigger their execution.
+
 .. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288
 .. _clumpify: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/clumpify-guide/
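For context on how the new module is driven: the `HUMAnN2Prof` and `HUMAnN2Prof_make_report` targets added to the Snakefile above are invoked like the existing module targets. Assuming the local profile shown in the README, a run could look roughly like this (a sketch; cluster users would substitute their own profile):

```bash
# Functional profiling for all configured datasets; missing preQC or
# MetaPhlAn2 outputs are produced on the fly, as described above.
snakemake --profile ./profiles/local HUMAnN2Prof

# Render the module's HTML report via the new HUMAnN2Prof_make_report rule.
snakemake --profile ./profiles/local HUMAnN2Prof_make_report
```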
diff --git a/rules/humann2.smk b/rules/humann2.smk
new file mode 100644
index 0000000000000000000000000000000000000000..23c48120e0788bdfe35d464179cca9fe6b6c336d
--- /dev/null
+++ b/rules/humann2.smk
@@ -0,0 +1,230 @@
+'''
+Author: Monica R. Ticlla
+Affiliation(s): SIB, SwissTPH, UNIBAS
+Description: rules for functional profiling of paired-end shotgun DNA metagenomic
+sequencing data with HUMAnN2.
+'''
+
+##----------------------------------------------------------------------------##
+## Import modules
+##----------------------------------------------------------------------------##
+
+
+##----------------------------------------------------------------------------##
+## Local rules
+##----------------------------------------------------------------------------##
+localrules:
+    humann2_check
+
+##----------------------------------------------------------------------------##
+## Local variables
+##----------------------------------------------------------------------------##
+singularity_humann = METAPROF_SIF
+
+##----------------------------------------------------------------------------##
+## Helper functions
+##----------------------------------------------------------------------------##
+# def get_first_forward_fastq_in_dataset(wildcards):
+#     FIRST_SAMPLE = DATASETS_SAMPLES_DICT[wildcards.dataset][0]
+#     FASTQ_FWD = '{}/{}/preQC/emerged/{}-R1.fastq.gz'.format(OUT_DIR,
+#                                                             wildcards.dataset,
+#                                                             FIRST_SAMPLE)
+#     return FASTQ_FWD
+#
+# def get_first_rev_fastq_in_dataset(wildcards):
+#     FIRST_SAMPLE = DATASETS_SAMPLES_DICT[wildcards.dataset][0]
+#     FASTQ_REV = '{}/{}/preQC/emerged/{}-R2.fastq.gz'.format(OUT_DIR,
+#                                                             wildcards.dataset,
+#                                                             FIRST_SAMPLE)
+#     return FASTQ_REV
+
+def get_all_samples_genefamilies(wildcards):
+    SAMPLES_W_PROF = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles/{sample}/{sample}_genefamilies.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF)
+def get_all_samples_genefamilies_relab(wildcards):
+    SAMPLES_W_PROF_RELAB = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles_norm/{sample}/{sample}_genefamilies_relab.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF_RELAB)
+def get_all_samples_genefamilies_cpm(wildcards):
+    SAMPLES_W_PROF_CPM = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles_norm/{sample}/{sample}_genefamilies_cpm.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF_CPM)
+
+def get_all_samples_pathabundances(wildcards):
+    SAMPLES_W_PATHABUND = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles/{sample}/{sample}_pathabundance.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PATHABUND)
+def get_all_samples_pathabundances_relab(wildcards):
+    SAMPLES_W_PATHABUND_RELAB = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles_norm/{sample}/{sample}_pathabundance_relab.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PATHABUND_RELAB)
+def get_all_samples_pathabundances_cpm(wildcards):
+    SAMPLES_W_PATHABUND_CPM = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles_norm/{sample}/{sample}_pathabundance_cpm.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PATHABUND_CPM)
+
+def get_all_samples_pathcoverage(wildcards):
+    SAMPLES_W_PATHCOV = DATASETS_SAMPLES_DICT[wildcards.dataset]
+    return expand('{out_dir}/{dataset}/Humann2/profiles/{sample}/{sample}_pathcoverage.tsv',
+                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PATHCOV)
+
+
+rule humann2_profiling:
+    input:
+        sample_fwd = OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R1.fastq.gz',
+        sample_rev = OUT_DIR + '/{dataset}/preQC/emerged/{sample}-R2.fastq.gz',
+        metaphlan2_abund = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_abund/{sample}.txt'
+        #choco_ixs = OUT_DIR + '/{dataset}/Humann2/choco_custom_ixs/choco_humann2_temp'
+    output:
+        genefamilies = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}_genefamilies.tsv',
+        pathabundances = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}_pathabundance.tsv',
+        pathcoverage = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}_pathcoverage.tsv',
+        sample_log = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}.log'
+    log:
+        OUT_DIR + '/{dataset}/Humann2/logs/profiles/{sample}.log'
+    params:
+        fastq_tmp = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}.fastq.gz',
+        prot_db = METASNK_DB_DIR+'/humann2_databases/uniref',
+        nt_db = METASNK_DB_DIR+'/humann2_databases/chocophlan'
+    threads: cpus_avail
+    singularity: singularity_humann
+    message: 'Running HUMAnN2 on sample {wildcards.sample}'
+    group: 'HUMAnN2_prof_norm'
+    shell:
+        '''
+        (bash -c "source activate humann2 && \
+        cat {input.sample_fwd} {input.sample_rev} >{params.fastq_tmp}; \
+        humann2 \
+        --input {params.fastq_tmp} \
+        --output $(dirname {output.genefamilies}) \
+        --o-log {output.sample_log} \
+        --taxonomic-profile {input.metaphlan2_abund} \
+        --protein-database {params.prot_db} \
+        --nucleotide-database {params.nt_db} \
+        --threads {threads} \
+        --remove-temp-output \
+        --memory-use maximum; \
+        rm {params.fastq_tmp}") &>{log}
+        '''
+rule humann2_normalize:
+    input:
+        genefamilies = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}_genefamilies.tsv',
+        pathabundances = OUT_DIR + '/{dataset}/Humann2/profiles/{sample}/{sample}_pathabundance.tsv'
+    output:
+        genefamilies_relab = OUT_DIR + '/{dataset}/Humann2/profiles_norm/{sample}/{sample}_genefamilies_relab.tsv',
+        genefamilies_cpm = OUT_DIR + '/{dataset}/Humann2/profiles_norm/{sample}/{sample}_genefamilies_cpm.tsv',
+        pathabundances_relab = OUT_DIR + '/{dataset}/Humann2/profiles_norm/{sample}/{sample}_pathabundance_relab.tsv',
+        pathabundances_cpm = OUT_DIR + '/{dataset}/Humann2/profiles_norm/{sample}/{sample}_pathabundance_cpm.tsv',
+    log:
+        OUT_DIR + '/{dataset}/Humann2/logs/profiles_norm/{sample}.log'
+    singularity: singularity_humann
+    message: 'Normalizing HUMAnN2 outputs for sample {wildcards.sample}'
+    group: 'HUMAnN2_prof_norm'
+    shell:
+        '''
+        (bash -c "source activate humann2 && \
+        humann2_renorm_table \
+        --input {input.genefamilies} \
+        --output {output.genefamilies_relab} \
+        --units relab \
+        --update-snames; \
+        humann2_renorm_table \
+        --input {input.genefamilies} \
+        --output {output.genefamilies_cpm} \
+        --units cpm \
+        --update-snames; \
+        humann2_renorm_table \
+        --input {input.pathabundances} \
+        --output {output.pathabundances_relab} \
+        --units relab \
+        --update-snames; \
+        humann2_renorm_table \
+        --input {input.pathabundances} \
+        --output {output.pathabundances_cpm} \
+        --units cpm \
+        --update-snames") &>{log}
+        '''
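Taken together, `humann2_profiling` and `humann2_normalize` amount to the following per-sample commands. This is a condensed, hand-run sketch using the same flags as the rules above; the file paths, the thread count, and the assumption that the `humann2` conda environment from the container is active are illustrative, not part of the workflow.

```bash
# One sample, by hand: concatenate the QC-ed pair, profile it against the
# databases under $METASNK_DBS, then renormalize gene families to CPM.
# Assumes the 'humann2' conda environment from the container is active.
cat sample-R1.fastq.gz sample-R2.fastq.gz > sample.fastq.gz
humann2 \
  --input sample.fastq.gz \
  --output humann2_out \
  --taxonomic-profile sample_metaphlan2_profile.txt \
  --protein-database "$METASNK_DBS"/humann2_databases/uniref \
  --nucleotide-database "$METASNK_DBS"/humann2_databases/chocophlan \
  --threads 8 \
  --remove-temp-output
humann2_renorm_table \
  --input humann2_out/sample_genefamilies.tsv \
  --output humann2_out/sample_genefamilies_cpm.tsv \
  --units cpm \
  --update-snames
rm sample.fastq.gz
```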
+rule humann2_merging:
+    input:
+        genefamilies = get_all_samples_genefamilies,
+        genefamilies_relab = get_all_samples_genefamilies_relab,
+        genefamilies_cpm = get_all_samples_genefamilies_cpm,
+        pathabundances = get_all_samples_pathabundances,
+        pathabundances_relab = get_all_samples_pathabundances_relab,
+        pathabundances_cpm = get_all_samples_pathabundances_cpm,
+        pathcoverage = get_all_samples_pathcoverage
+    output:
+        genefamilies_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies.tsv',
+                                     category='HUMAnN2Prof'),
+        genefamilies_relab_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies_relab.tsv',
+                                           category='HUMAnN2Prof'),
+        genefamilies_cpm_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies_cpm.tsv',
+                                         category='HUMAnN2Prof'),
+        pathabund_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance.tsv',
+                                  category='HUMAnN2Prof'),
+        pathabund_relab_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance_relab.tsv',
+                                        category='HUMAnN2Prof'),
+        pathabund_cpm_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance_cpm.tsv',
+                                      category='HUMAnN2Prof'),
+        pathcov_merged = report(OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathcoverage.tsv',
+                                category='HUMAnN2Prof')
+    log:
+        OUT_DIR + '/{dataset}/Humann2/logs/humann2_merging.log'
+    singularity: singularity_humann
+    message: 'Joining HUMAnN2 outputs for dataset {wildcards.dataset}'
+    shell:
+        '''
+        (bash -c "source activate humann2 && \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.genefamilies[0]})) \
+        --output {output.genefamilies_merged} \
+        --file_name genefamilies \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.genefamilies_relab[0]})) \
+        --output {output.genefamilies_relab_merged} \
+        --file_name genefamilies_relab \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.genefamilies_cpm[0]})) \
+        --output {output.genefamilies_cpm_merged} \
+        --file_name genefamilies_cpm \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.pathabundances[0]})) \
+        --output {output.pathabund_merged} \
+        --file_name pathabundance \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.pathabundances_relab[0]})) \
+        --output {output.pathabund_relab_merged} \
+        --file_name pathabundance_relab \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.pathabundances_cpm[0]})) \
+        --output {output.pathabund_cpm_merged} \
+        --file_name pathabundance_cpm \
+        --search-subdirectories; \
+        humann2_join_tables \
+        --input $(dirname $(dirname {input.pathcoverage[0]})) \
+        --output {output.pathcov_merged} \
+        --file_name pathcoverage \
+        --search-subdirectories") &>{log}
+        '''
+rule humann2_check:
+    input:
+        genefamilies_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies.tsv',
+        genefamilies_relab_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies_relab.tsv',
+        genefamilies_cpm_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_genefamilies_cpm.tsv',
+        pathabund_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance.tsv',
+        pathabund_relab_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance_relab.tsv',
+        pathabund_cpm_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathabundance_cpm.tsv',
+        pathcov_merged = OUT_DIR + '/{dataset}/Humann2/profiles_merged/{dataset}_pathcoverage.tsv'
+    output:
+        check = OUT_DIR + '/{dataset}/Humann2/all.done'
+    shell:
+        '''
+        touch {output.check}
+        '''
+#rule humann2_renaming:
diff --git a/slurm_cluster.json b/slurm_cluster.json
index 310796afd67f5ce4bad89c2061a41baa91baca39..dfd3be2c9cb7f56a19b08eaa5d648eae4d710931 100644
--- a/slurm_cluster.json
+++ b/slurm_cluster.json
@@ -1,7 +1,7 @@
 {
     "__default__":{
         "nodes":1,
-        "ntasks":4,
+        "ntasks":1,
        "time":"30",
        "mem" : 8000,
        "qos":"30min"
@@ -41,7 +41,7 @@
     },
     "metaphlan2_profile":{
         "nodes":1,
-        "time":"120",
+        "time":"240",
         "ntasks":20,
         "qos":"6hours",
         "mem":32000
@@ -58,14 +58,24 @@
     },
     "strphlan_clades_detection":{
         "time":"120",
-        "ntasks":20,
-        "mem":32000,
+        "cpus_per_task":20,
+        "mem":64000,
         "qos":"6hours"
     },
     "strphlan_clade_analysis":{
-        "time":"120",
-        "ntasks":20,
+        "time":"359",
+        "cpus_per_task":20,
+        "mem":64000,
+        "qos":"6hours"
+    },
+    "HUMAnN2_prof_norm":{
+        "time":"359",
+        "cpus_per_task":20,
         "mem":32000,
         "qos":"6hours"
+    },
+    "humann2_merging":{
+        "nodes":1,
+        "cpus_per_task":2
     }
 }
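The slurm_cluster.json changes above retune per-rule SLURM resources (longer walltime for metaphlan2_profile, cpus_per_task and more memory for the StrainPhlAn steps, and new entries for the HUMAnN2 group jobs). How these keys reach sbatch depends on the cluster profile, which is not part of this diff; the following is only a sketch of the common `--cluster-config` pattern, plus a quick validity check for the hand-edited JSON.

```bash
# Quick sanity check that the edited cluster config is still valid JSON.
python -m json.tool slurm_cluster.json > /dev/null && echo "slurm_cluster.json OK"

# Hypothetical submission string (the real one lives in the cluster profile,
# not in this diff). Only keys present in "__default__" are referenced, so
# every rule resolves; a per-rule key such as "cpus_per_task" could be passed
# analogously (--cpus-per-task {cluster.cpus_per_task}) once it also has a
# default value.
snakemake \
  --cluster-config slurm_cluster.json \
  --cluster "sbatch --nodes {cluster.nodes} --ntasks {cluster.ntasks} --time {cluster.time} --mem {cluster.mem} --qos {cluster.qos}" \
  --jobs 50 \
  HUMAnN2Prof
```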