diff --git a/Snakefile b/Snakefile index 9f6bf598aa6e333ac89a1e4960208df682222d95..06bc9449c17df993f07aa112a75ac8f978c67349 100644 --- a/Snakefile +++ b/Snakefile @@ -16,7 +16,7 @@ print('CPUs available: {}'.format(cpus_avail), file=sys.stderr) ## Check existence of expected variables in the environment ##----------------------------------------------------------------------------## try: - print('MetaSnk will store reference data/databases at {}'.format(os.environ["METASNK_DBS"]), + print('MetaSnk will store reference data/databases at {}'.format(os.environ["METASNK_DBS"]), file=sys.stderr) except KeyError: print("You did not set variable METASNK_DBS", file=sys.stderr) @@ -117,16 +117,16 @@ rule all: ##----------------------------------------------------------------------------## rule pullSIFS: input: - PREQC_SIF, + PREQC_SIF, METAPROF_SIF - + rule buildDBS: input: METASNK_DB_DIR+'/strainphlan_markers/all_markers.fasta', METASNK_DB_DIR+'/humann2_databases/chocophlan', METASNK_DB_DIR+'/humann2_databases/uniref', METASNK_DB_DIR+'/humann2_databases/utility_mapping' - + rule rawQC: input: expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt', dataset=set(DATASETS)), @@ -148,20 +148,20 @@ rule preQC: expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_concatenation_all.done', dataset=set(DATASETS)) rule MetaPhlAn2: input: - expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png', dataset=set(DATASETS)) + expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS)) rule StrainPhlAn: input: expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS)) rule PhlAnProf: input: - expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png', dataset=set(DATASETS)), + expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS)), expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS)) ##----------------------------------------------------------------------------## ## Rules to make reports ##----------------------------------------------------------------------------## rule rawQC_make_report: - output: + output: temp(touch(OUT_DIR+'/logs/raw_QC_make_report.done')) log: OUT_DIR+'/logs/rawQC_make_report.log' @@ -171,11 +171,14 @@ rule rawQC_make_report: workflow_dir = workflow_path shell: ''' - (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile rawQC) &>{log} + (snakemake \ + --directory={params.workdir} \ + --report {params.report_path} \ + -s {params.workflow_dir}/Snakefile rawQC) &>{log} ''' rule preQC_make_report: - output: + output: temp(touch(OUT_DIR+'/logs/preQC_make_report.done')) log: OUT_DIR+'/logs/preQC_make_report.log' @@ -185,9 +188,28 @@ rule preQC_make_report: workflow_dir = workflow_path shell: ''' - (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile preQC) &>{log} + (snakemake \ + --directory={params.workdir} \ + --report {params.report_path} \ + -s {params.workflow_dir}/Snakefile preQC) &>{log} + ''' +rule PhlAnProf_make_report: + output: + temp(touch(OUT_DIR+'/logs/preQC_make_report.done')) + log: + OUT_DIR+'/logs/PhlAnProf_make_report.log' + params: + report_path = OUT_DIR+'/PhlAnProf_report.html', + workdir = workdir_path, + workflow_dir = workflow_path + shell: + ''' + (snakemake \ + --directory={params.workdir} \ + --report {params.report_path} \ + -s {params.workflow_dir}/Snakefile PhlAnProf) &>{log} ''' include: "rules/setup_rules.smk" include: "rules/rawQC.smk" include: "rules/preprocess.smk" -include: "rules/phlanprof.smk" \ No newline at end of file +include: "rules/phlanprof.smk" diff --git a/report/MetagenomicSnake.rst b/report/MetagenomicSnake.rst index d4f6e9cb081a9e5105b15f06bc329758d9d5396e..3418a59e6f3b9ccfc68d91eef4777824aa8d71d0 100644 --- a/report/MetagenomicSnake.rst +++ b/report/MetagenomicSnake.rst @@ -14,7 +14,7 @@ Modules **preQC** FastQC only performs a quality check but no QC processing is done. The **preQC** - rule runs a multi-step pre-processing of the paired fastq files, includes: + rule runs a multi-step pre-processing of the paired fastq files, it includes: - **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check and both paired fastq files are processed as follows\: @@ -29,6 +29,10 @@ Modules - **trim_3end**: 3\'-end quality trimming with "fastp" - **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files - **summarize_preQC**: creates summarizing tables and plots - + +**PhlAnProf** + performs taxonmic and strain-level profiling using MetaPhlAn2 and StrainPhlAn. + If pre-processing was not performed PhlAnProf will trigger execution of preQC. + .. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288 .. _clumpify: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/clumpify-guide/ diff --git a/report/PhlAnProf/metaphlan2_abund.rst b/report/PhlAnProf/metaphlan2_abund.rst new file mode 100644 index 0000000000000000000000000000000000000000..5aa8a7b9ddc4135be78a20bf7cc390a364c2ad6f --- /dev/null +++ b/report/PhlAnProf/metaphlan2_abund.rst @@ -0,0 +1,2 @@ +Table with MetaPhlAn2 taxonomic profiles of all samples in dataset +"{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages. diff --git a/report/PhlAnProf/metaphlan2_abund_sp.rst b/report/PhlAnProf/metaphlan2_abund_sp.rst new file mode 100644 index 0000000000000000000000000000000000000000..81ef457512624ca34e332348ca4484091cc45d3a --- /dev/null +++ b/report/PhlAnProf/metaphlan2_abund_sp.rst @@ -0,0 +1 @@ +Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages. diff --git a/report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst b/report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst new file mode 100644 index 0000000000000000000000000000000000000000..536cbe44c2de9b596d3ab80e8b3392303693dbc9 --- /dev/null +++ b/report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst @@ -0,0 +1,3 @@ +Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages. Columns corresponding to samples for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads) were removed; this might be due to not enough sequencing depth. Removing these columns avoids undefined values when computing certain distance/dissimilarity metrics (e.g Bray-Curtis). + +Please check the preQC reports for all the samples listed in file {{ snakemake.wildcards.dataset }}_samples_removed.txt_. \ No newline at end of file diff --git a/report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst b/report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst new file mode 100644 index 0000000000000000000000000000000000000000..824d35e2bc8906db55131dd0ad403bdcde045458 --- /dev/null +++ b/report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst @@ -0,0 +1 @@ +Heatmap showing the relative abundance (in logarithmic scale) of each identified species (i.e features) across all samples in dataset "{{ snakemake.wildcards.dataset }}". Dendrograms of samples and features resulted from hierarchical clustering (linkage method: 'average') after computing Bray-Curtis dissimilarities. Notice that this heatmap uses {{ snakemake.wildcards.dataset }}_cleaned_abundances_sp_table.txt_. \ No newline at end of file diff --git a/report/PhlAnProf/metaphlan2_counts.rst b/report/PhlAnProf/metaphlan2_counts.rst new file mode 100644 index 0000000000000000000000000000000000000000..6fb592a72a338df659d85c3b368f9eaec99dbd0a --- /dev/null +++ b/report/PhlAnProf/metaphlan2_counts.rst @@ -0,0 +1,2 @@ +Table with MetaPhlAn2 taxonomic profiles of all samples in dataset +"{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts. diff --git a/report/PhlAnProf/metaphlan2_counts_sp.rst b/report/PhlAnProf/metaphlan2_counts_sp.rst new file mode 100644 index 0000000000000000000000000000000000000000..bfab72bdd7705aac2f9f4ca435349d6c99663b3b --- /dev/null +++ b/report/PhlAnProf/metaphlan2_counts_sp.rst @@ -0,0 +1 @@ +Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts. diff --git a/report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst b/report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst new file mode 100644 index 0000000000000000000000000000000000000000..e0bcea8e6eb44c07f0ecc837ef6a42d64aafa5da --- /dev/null +++ b/report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst @@ -0,0 +1 @@ +Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts. Columns corresponding to samples for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads) were removed; this might be due to not enough sequencing depth. Please check the preQC reports for all the samples listed in file "{{ snakemake.wildcards.dataset }}"_samples_removed.txt_. diff --git a/report/PhlAnProf/metaphlan2_samples_removed.rst b/report/PhlAnProf/metaphlan2_samples_removed.rst new file mode 100644 index 0000000000000000000000000000000000000000..c3fa3daed93ac1e9311992c92f5312c2cde45f22 --- /dev/null +++ b/report/PhlAnProf/metaphlan2_samples_removed.rst @@ -0,0 +1 @@ +List of samples in dataset "{{ snakemake.wildcards.dataset }}" for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads). If the file is empty, samples were not removed. diff --git a/report/PhlAnProf/strphlan_clades_selected.rst b/report/PhlAnProf/strphlan_clades_selected.rst new file mode 100644 index 0000000000000000000000000000000000000000..b1888fb7463f5ea364401fe1924060b33d8971ef --- /dev/null +++ b/report/PhlAnProf/strphlan_clades_selected.rst @@ -0,0 +1 @@ +List of clades selected for strain profiling with StrainPhlAn. Notice that StrainPhlAn performs additional checks/filtering steps and if less than two samples remain, strain profiling is not performed for a particular clade. \ No newline at end of file diff --git a/rules/phlanprof.smk b/rules/phlanprof.smk index ebad71b027e189649cc948811adbba120151b836..150b1365797d5b4a047c2b2395dede62cce9281a 100644 --- a/rules/phlanprof.smk +++ b/rules/phlanprof.smk @@ -27,17 +27,17 @@ singularity_metaprof = METAPROF_SIF ##----------------------------------------------------------------------------## def get_all_samples_profiles(wildcards): SAMPLES_W_PROF = DATASETS_SAMPLES_DICT[wildcards.dataset] - return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_abund/{sample}.txt', + return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_abund/{sample}.txt', out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF) def get_all_samples_prof_counts(wildcards): SAMPLES_W_PROF_COUNTS = DATASETS_SAMPLES_DICT[wildcards.dataset] - return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_counts/{sample}.txt', + return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_counts/{sample}.txt', out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF_COUNTS) def get_all_samples_markers(wildcards): SAMPLES_W_MARKERS = DATASETS_SAMPLES_DICT[wildcards.dataset] - return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/markers/{sample}.markers', + return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/markers/{sample}.markers', out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_MARKERS) ##----------------------------------------------------------------------------## @@ -80,26 +80,74 @@ rule metaphlan2_merge: profiles_w_stats = get_all_samples_profiles, profiles_counts = get_all_samples_prof_counts output: - merged_abund = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_table.txt', - merged_counts = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_counts_table.txt', - merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt', - merged_counts_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_counts_sp_table.txt' + merged_abund = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_abundances_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_abund.rst'), + merged_counts = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_counts_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_counts.rst'), + merged_abund_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_abundances_sp_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_abund_sp.rst'), + merged_counts_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_counts_sp_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_counts_sp.rst'), + merged_abund_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst'), + merged_counts_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_cleaned_counts_sp_table.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst'), + samples_removed = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_samples_removed.txt', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_samples_removed.rst') + log: OUT_DIR + '/{dataset}/PhlAnProf/logs/metaphlan/merged.log' singularity:singularity_metaprof group:'metaphlan2_merging' shell: ''' - (merge_metaphlan_tables.py {input.profiles_w_stats} > {output.merged_abund}; + (# Merge samples' abundance profiles + # ---------------------------------- + merge_metaphlan_tables.py {input.profiles_w_stats} > {output.merged_abund}; merge_metaphlan_tables.py {input.profiles_counts} > {output.merged_counts}; grep -E "(s__)|(^ID)" {output.merged_abund} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_abund_sp}; - grep -E "(s__)|(^ID)" {output.merged_counts} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_counts_sp}) &>{log} + grep -E "(s__)|(^ID)" {output.merged_counts} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_counts_sp}; + # Get column ixs of samples with only zero values in merged species-abundances table + # ---------------------------------------------------------------------------------- + samples_to_keep_ix=$(cut -f 2- {output.merged_abund_sp} | tail -n +2 | awk '{{for (i=1;i<=NF;i++) sum[i]+=$i;}}; \ + END{{for (i in sum) if (sum[i]>0) print i+1;}}'); + # Remove samples with only zero values from species-abundances table + # ------------------------------------------------------------------ + cut -f 1,$(echo ${{samples_to_keep_ix}} | tr " " ",") {output.merged_abund_sp} > {output.merged_abund_sp_clean}; + # Get list of samples removed + # --------------------------- + comm -13 <(head -n1 {output.merged_abund_sp_clean} | tr "\t" "\n"| sort) \ + <(head -n1 {output.merged_abund_sp} | tr "\t" "\n" | sort) > {output.samples_removed}; + # Get column ixs of samples with only zero values in merged species-counts table + # ------------------------------------------------------------------------------ + samples_ixs=$(cut -f 2- {output.merged_counts_sp} | tail -n +2 | awk '{{for (i=1;i<=NF;i++) sum[i]+=$i;}}; \ + END{{for (i in sum) if (sum[i]>0) print i+1;}}'); + # Remove samples with only zero values from species-counts table + # --------------------------------------------------------------- + cut -f 1,$(echo ${{samples_ixs}} | tr " " ",") {output.merged_counts_sp} > {output.merged_counts_sp_clean}; + ) &>{log} ''' rule metaphlan2_visualize: input: - merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt' + merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt' output: - merged_abund_sp_png = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png' + merged_abund_sp_png = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', + category='PhlAnProf:Metaphlan2', + caption='../report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst') log: OUT_DIR + '/{dataset}/PhlAnProf/logs/metaphlan/merged_abundances_sp_heatmap.log' singularity:singularity_metaprof @@ -111,16 +159,16 @@ rule metaphlan2_visualize: hclust2.py \ -i {input.merged_abund_sp} \ -o {output.merged_abund_sp_png} \ - --ftop 25 \ --f_dist_f braycurtis \ --s_dist_f braycurtis \ - --cell_aspect_ratio 0.5 -l \ + --cell_aspect_ratio 1 -l \ --flabel_size 6 \ --slabel_size 6 \ --max_flabel_len 100 \ --max_slabel_len 100 \ --minv 0.1 \ --dpi 300 \ + --image_size 50 \ -c 'bbcyr' else echo 'hclust2.py expects at least two samples! It will generate and empty file!'; \ @@ -152,9 +200,12 @@ rule strphlan_get_sample_markers: checkpoint strphlan_clades_detection: input: sample_markers = get_all_samples_markers, - merged_abundances_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt' + merged_abundances_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' + + '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt' output: - clades_list = OUT_DIR + '/{dataset}/PhlAnProf/strphlan/clades.txt', + clades_list = report(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/clades.txt', + category='PhlAnProf:StrainPhlAn', + caption='../report/PhlAnProf/strphlan_clades_selected.rst'), clades_detected_dir = directory("{}/{{dataset}}/PhlAnProf/strphlan/clades_detected".format(OUT_DIR)) log: OUT_DIR + '/{dataset}/PhlAnProf/logs/strphlan/clade_detection.log' @@ -248,7 +299,7 @@ rule strphlan_clade_profiling: --leaf_marker_size 60 \ --legend_marker_size 60; \ deactivate; \ - source activate py27breadcrumbs; \ + source activate py27breadcrumbs; \ strainphlan_ggtree.R \ {output.clade_result_dir}/RAxML_bestTree.{wildcards.clade}.tree \ {input.samples_metadata} \ @@ -263,7 +314,7 @@ def aggregate_clade_profiles(wildcards): generated at the strphlan2_clades_detection step ''' checkpoint_output = checkpoints.strphlan_clades_detection.get(**wildcards).output[1] - return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/clades_profiled/{clade}', + return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/clades_profiled/{clade}', out_dir=OUT_DIR, dataset=wildcards.dataset, clade=glob_wildcards(os.path.join(checkpoint_output, '{clade}.clade')).clade) @@ -277,3 +328,7 @@ rule strphlan_check: ''' touch {output.combined} ''' + +# TODO: +# -rule to agreggate reports from strphlan_clade_profiling, +# it needs to parse the {clade}.info files