include removal of columns with zero values in rule metaphlan2_merge, flag...

include removal of columns with zero values in rule metaphlan2_merge, flag output files as reports and add rst files with reports descriptions

include removal of columns with zero values in rule metaphlan2_merge, flag...
64d0609c · Ticlla Ccenhua Monica Roxana · 53f467cf · 64d0609c · 64d0609c · 64d0609c
Commit 64d0609c authored 5 years ago by Ticlla Ccenhua Monica Roxana
--- a/Snakefile
+++ b/Snakefile
@@ -16,7 +16,7 @@ print('CPUs available: {}'.format(cpus_avail), file=sys.stderr)
 ## Check existence of expected variables in the environment
 ##----------------------------------------------------------------------------##
 try:
-    print('MetaSnk will store reference data/databases at {}'.format(os.environ["METASNK_DBS"]), 
+    print('MetaSnk will store reference data/databases at {}'.format(os.environ["METASNK_DBS"]),
          file=sys.stderr)
 except KeyError:
    print("You did not set variable METASNK_DBS", file=sys.stderr)
@@ -117,16 +117,16 @@ rule all:
 ##----------------------------------------------------------------------------##
 rule pullSIFS:
    input:
-        PREQC_SIF, 
+        PREQC_SIF,
        METAPROF_SIF
-    
+
 rule buildDBS:
    input:
        METASNK_DB_DIR+'/strainphlan_markers/all_markers.fasta',
        METASNK_DB_DIR+'/humann2_databases/chocophlan',
        METASNK_DB_DIR+'/humann2_databases/uniref',
        METASNK_DB_DIR+'/humann2_databases/utility_mapping'
-        
+
 rule rawQC:
    input:
        expand(OUT_DIR + '/{dataset}/rawQC/summary_stats/{dataset}_samples_stats.txt', dataset=set(DATASETS)),
@@ -148,20 +148,20 @@ rule preQC:
        expand(OUT_DIR + '/{dataset}/preQC/summary_stats/{dataset}_concatenation_all.done', dataset=set(DATASETS))
 rule MetaPhlAn2:
    input:
-        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png', dataset=set(DATASETS))
+        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS))
 rule StrainPhlAn:
    input:
        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS))
 rule PhlAnProf:
    input:
-        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png', dataset=set(DATASETS)),
+        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS)),
        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS))

 ##----------------------------------------------------------------------------##
 ## Rules to make reports
 ##----------------------------------------------------------------------------##
 rule rawQC_make_report:
-    output: 
+    output:
        temp(touch(OUT_DIR+'/logs/raw_QC_make_report.done'))
    log:
        OUT_DIR+'/logs/rawQC_make_report.log'
@@ -171,11 +171,14 @@ rule rawQC_make_report:
        workflow_dir = workflow_path
    shell:
        '''
-        (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile rawQC) &>{log}
+        (snakemake \
+        --directory={params.workdir} \
+        --report {params.report_path} \
+        -s {params.workflow_dir}/Snakefile rawQC) &>{log}
        '''

 rule preQC_make_report:
-    output: 
+    output:
        temp(touch(OUT_DIR+'/logs/preQC_make_report.done'))
    log:
        OUT_DIR+'/logs/preQC_make_report.log'
@@ -185,9 +188,28 @@ rule preQC_make_report:
        workflow_dir = workflow_path
    shell:
        '''
-        (snakemake --directory={params.workdir} --report {params.report_path} -s {params.workflow_dir}/Snakefile preQC) &>{log}
+        (snakemake \
+        --directory={params.workdir} \
+        --report {params.report_path} \
+        -s {params.workflow_dir}/Snakefile preQC) &>{log}
+        '''
+rule PhlAnProf_make_report:
+    output:
+        temp(touch(OUT_DIR+'/logs/preQC_make_report.done'))
+    log:
+        OUT_DIR+'/logs/PhlAnProf_make_report.log'
+    params:
+        report_path = OUT_DIR+'/PhlAnProf_report.html',
+        workdir = workdir_path,
+        workflow_dir = workflow_path
+    shell:
+        '''
+        (snakemake \
+        --directory={params.workdir} \
+        --report {params.report_path} \
+        -s {params.workflow_dir}/Snakefile PhlAnProf) &>{log}
        '''
 include: "rules/setup_rules.smk"
 include: "rules/rawQC.smk"
 include: "rules/preprocess.smk"
-include: "rules/phlanprof.smk"
\ No newline at end of file
+include: "rules/phlanprof.smk"
--- a/report/MetagenomicSnake.rst
+++ b/report/MetagenomicSnake.rst
@@ -14,7 +14,7 @@ Modules

 **preQC**
  FastQC only performs a quality check but no QC processing is done. The **preQC**
-  rule runs a multi-step pre-processing of the paired fastq files, includes:
+  rule runs a multi-step pre-processing of the paired fastq files, it includes:

  - **trim_adapters**: adapter-trimming with "fastp". Fastp performs a quality check
    and both paired fastq files are processed as follows\:
@@ -29,6 +29,10 @@ Modules
  - **trim_3end**: 3\'-end quality trimming with "fastp"
  - **concatenate_fastqs**: merges fastq files corresponding to the same sample into a single pair of fastq files
  - **summarize_preQC**: creates summarizing tables and plots
-
+  
+**PhlAnProf**
+    performs taxonmic and strain-level profiling using MetaPhlAn2 and StrainPhlAn. 
+    If pre-processing was not performed PhlAnProf will trigger execution of preQC.
+    
 .. _bbsplit: http://seqanswers.com/forums/showthread.php?t=41288
 .. _clumpify: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/clumpify-guide/
--- a/report/PhlAnProf/metaphlan2_abund.rst
+++ b/report/PhlAnProf/metaphlan2_abund.rst
+Table with MetaPhlAn2 taxonomic profiles of all samples in dataset
+"{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages.
--- a/report/PhlAnProf/metaphlan2_abund_sp.rst
+++ b/report/PhlAnProf/metaphlan2_abund_sp.rst
+Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages.
--- a/report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst
+++ b/report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst
+Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are relative abundances in percentages. Columns corresponding to samples for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads) were removed; this might be due to not enough sequencing depth. Removing these columns avoids undefined values when computing certain distance/dissimilarity metrics (e.g Bray-Curtis). 
+
+Please check the preQC reports for all the samples listed in file {{ snakemake.wildcards.dataset }}_samples_removed.txt_.
\ No newline at end of file
--- a/report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst
+++ b/report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst
+Heatmap showing the relative abundance (in logarithmic scale) of each identified species (i.e features) across all samples in dataset "{{ snakemake.wildcards.dataset }}". Dendrograms of samples and features resulted from hierarchical clustering (linkage method: 'average') after computing Bray-Curtis dissimilarities. Notice that this heatmap uses {{ snakemake.wildcards.dataset }}_cleaned_abundances_sp_table.txt_.
\ No newline at end of file
--- a/report/PhlAnProf/metaphlan2_counts.rst
+++ b/report/PhlAnProf/metaphlan2_counts.rst
+Table with MetaPhlAn2 taxonomic profiles of all samples in dataset
+"{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts.
--- a/report/PhlAnProf/metaphlan2_counts_sp.rst
+++ b/report/PhlAnProf/metaphlan2_counts_sp.rst
+Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts.
--- a/report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst
+++ b/report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst
+Table with MetaPhlAn2 taxonomic profiles, at species level, of all samples in dataset "{{ snakemake.wildcards.dataset }}". Values are MetaPhlAn2's estimated counts. Columns corresponding to samples for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads) were removed; this might be due to not enough sequencing depth. Please check the preQC reports for all the samples listed in file "{{ snakemake.wildcards.dataset }}"_samples_removed.txt_.
--- a/report/PhlAnProf/metaphlan2_samples_removed.rst
+++ b/report/PhlAnProf/metaphlan2_samples_removed.rst
+List of samples in dataset "{{ snakemake.wildcards.dataset }}" for which MetaPhlAn2 was unable to classify any of its reads (100\% of unclassified reads). If the file is empty, samples were not removed.
--- a/report/PhlAnProf/strphlan_clades_selected.rst
+++ b/report/PhlAnProf/strphlan_clades_selected.rst
+List of clades selected for strain profiling with StrainPhlAn. Notice that StrainPhlAn performs additional checks/filtering steps and if less than two samples remain, strain profiling is not performed for a particular clade.
\ No newline at end of file
--- a/rules/phlanprof.smk
+++ b/rules/phlanprof.smk
@@ -27,17 +27,17 @@ singularity_metaprof = METAPROF_SIF
 ##----------------------------------------------------------------------------##
 def get_all_samples_profiles(wildcards):
    SAMPLES_W_PROF = DATASETS_SAMPLES_DICT[wildcards.dataset]
-    return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_abund/{sample}.txt', 
+    return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_abund/{sample}.txt',
                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF)

 def get_all_samples_prof_counts(wildcards):
    SAMPLES_W_PROF_COUNTS = DATASETS_SAMPLES_DICT[wildcards.dataset]
-    return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_counts/{sample}.txt', 
+    return expand('{out_dir}/{dataset}/PhlAnProf/metaphlan/profiles_counts/{sample}.txt',
                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_PROF_COUNTS)

 def get_all_samples_markers(wildcards):
    SAMPLES_W_MARKERS = DATASETS_SAMPLES_DICT[wildcards.dataset]
-    return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/markers/{sample}.markers', 
+    return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/markers/{sample}.markers',
                  out_dir=OUT_DIR, dataset=wildcards.dataset, sample=SAMPLES_W_MARKERS)

 ##----------------------------------------------------------------------------##
@@ -80,26 +80,74 @@ rule metaphlan2_merge:
        profiles_w_stats = get_all_samples_profiles,
        profiles_counts = get_all_samples_prof_counts
    output:
-        merged_abund = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_table.txt',
-        merged_counts = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_counts_table.txt',
-        merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt',
-        merged_counts_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_counts_sp_table.txt'
+        merged_abund = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                              '/profiles_merged/{dataset}_abundances_table.txt',
+                              category='PhlAnProf:Metaphlan2', 
+                              caption='../report/PhlAnProf/metaphlan2_abund.rst'),
+        merged_counts = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                               '/profiles_merged/{dataset}_counts_table.txt',
+                               category='PhlAnProf:Metaphlan2', 
+                               caption='../report/PhlAnProf/metaphlan2_counts.rst'),
+        merged_abund_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                                 '/profiles_merged/{dataset}_abundances_sp_table.txt',
+                                 category='PhlAnProf:Metaphlan2', 
+                                 caption='../report/PhlAnProf/metaphlan2_abund_sp.rst'),
+        merged_counts_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                                  '/profiles_merged/{dataset}_counts_sp_table.txt',
+                                  category='PhlAnProf:Metaphlan2', 
+                                  caption='../report/PhlAnProf/metaphlan2_counts_sp.rst'),
+        merged_abund_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                                       '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt',
+                                       category='PhlAnProf:Metaphlan2', 
+                                       caption='../report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst'),
+        merged_counts_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                                        '/profiles_merged/{dataset}_cleaned_counts_sp_table.txt',
+                                        category='PhlAnProf:Metaphlan2', 
+                                        caption='../report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst'),
+        samples_removed = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                                 '/profiles_merged/{dataset}_samples_removed.txt',
+                                 category='PhlAnProf:Metaphlan2', 
+                                 caption='../report/PhlAnProf/metaphlan2_samples_removed.rst')
+
    log:
        OUT_DIR + '/{dataset}/PhlAnProf/logs/metaphlan/merged.log'
    singularity:singularity_metaprof
    group:'metaphlan2_merging'
    shell:
        '''
-        (merge_metaphlan_tables.py {input.profiles_w_stats} > {output.merged_abund};
+        (# Merge samples' abundance profiles
+        # ----------------------------------
+        merge_metaphlan_tables.py {input.profiles_w_stats} > {output.merged_abund};
        merge_metaphlan_tables.py {input.profiles_counts} > {output.merged_counts};
        grep -E "(s__)|(^ID)" {output.merged_abund} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_abund_sp};
-        grep -E "(s__)|(^ID)" {output.merged_counts} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_counts_sp}) &>{log}
+        grep -E "(s__)|(^ID)" {output.merged_counts} | grep -v "t__" | sed 's/^.*s__//g' > {output.merged_counts_sp};
+        # Get column ixs of samples with only zero values in merged species-abundances table
+        # ----------------------------------------------------------------------------------
+        samples_to_keep_ix=$(cut -f 2- {output.merged_abund_sp} | tail -n +2 | awk '{{for (i=1;i<=NF;i++) sum[i]+=$i;}}; \
+        END{{for (i in sum) if (sum[i]>0) print i+1;}}');
+        # Remove samples with only zero values from species-abundances table
+        # ------------------------------------------------------------------
+        cut -f 1,$(echo ${{samples_to_keep_ix}} | tr " " ",") {output.merged_abund_sp} > {output.merged_abund_sp_clean};
+        # Get list of samples removed
+        # ---------------------------
+        comm -13 <(head -n1 {output.merged_abund_sp_clean} | tr "\t" "\n"| sort) \
+        <(head -n1 {output.merged_abund_sp} | tr "\t" "\n" | sort) > {output.samples_removed};
+        # Get column ixs of samples with only zero values in merged species-counts table
+        # ------------------------------------------------------------------------------
+        samples_ixs=$(cut -f 2- {output.merged_counts_sp} | tail -n +2 | awk '{{for (i=1;i<=NF;i++) sum[i]+=$i;}}; \
+        END{{for (i in sum) if (sum[i]>0) print i+1;}}');
+        # Remove samples with only zero values from species-counts table
+        # ---------------------------------------------------------------
+        cut -f 1,$(echo ${{samples_ixs}} | tr " " ",") {output.merged_counts_sp} > {output.merged_counts_sp_clean};
+        ) &>{log}
        '''
 rule metaphlan2_visualize:
    input:
-        merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt'
+        merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt'
    output:
-        merged_abund_sp_png = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_heatmap.png'
+        merged_abund_sp_png = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png',
+                                     category='PhlAnProf:Metaphlan2', 
+                                     caption='../report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst')
    log:
        OUT_DIR + '/{dataset}/PhlAnProf/logs/metaphlan/merged_abundances_sp_heatmap.log'
    singularity:singularity_metaprof
@@ -111,16 +159,16 @@ rule metaphlan2_visualize:
                hclust2.py \
                -i {input.merged_abund_sp} \
                -o {output.merged_abund_sp_png} \
-                --ftop 25 \
                --f_dist_f braycurtis \
                --s_dist_f braycurtis \
-                --cell_aspect_ratio 0.5 -l \
+                --cell_aspect_ratio 1 -l \
                --flabel_size 6 \
                --slabel_size 6 \
                --max_flabel_len 100 \
                --max_slabel_len 100 \
                --minv 0.1 \
                --dpi 300 \
+                --image_size 50 \
                -c 'bbcyr'
            else
                echo 'hclust2.py expects at least two samples! It will generate and empty file!'; \
@@ -152,9 +200,12 @@ rule strphlan_get_sample_markers:
 checkpoint strphlan_clades_detection:
    input:
        sample_markers = get_all_samples_markers,
-        merged_abundances_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/merged_abundances_sp_table.txt'
+        merged_abundances_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
+                               '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt'
    output:
-        clades_list = OUT_DIR + '/{dataset}/PhlAnProf/strphlan/clades.txt',
+        clades_list = report(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/clades.txt',
+                             category='PhlAnProf:StrainPhlAn', 
+                             caption='../report/PhlAnProf/strphlan_clades_selected.rst'),
        clades_detected_dir = directory("{}/{{dataset}}/PhlAnProf/strphlan/clades_detected".format(OUT_DIR))
    log:
        OUT_DIR + '/{dataset}/PhlAnProf/logs/strphlan/clade_detection.log'
@@ -248,7 +299,7 @@ rule strphlan_clade_profiling:
        --leaf_marker_size 60 \
        --legend_marker_size 60; \
        deactivate; \
-        source activate py27breadcrumbs; \ 
+        source activate py27breadcrumbs; \
        strainphlan_ggtree.R \
        {output.clade_result_dir}/RAxML_bestTree.{wildcards.clade}.tree \
        {input.samples_metadata} \
@@ -263,7 +314,7 @@ def aggregate_clade_profiles(wildcards):
    generated at the strphlan2_clades_detection step
    '''
    checkpoint_output = checkpoints.strphlan_clades_detection.get(**wildcards).output[1]
-    return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/clades_profiled/{clade}', 
+    return expand('{out_dir}/{dataset}/PhlAnProf/strphlan/clades_profiled/{clade}',
                  out_dir=OUT_DIR,
                  dataset=wildcards.dataset,
                  clade=glob_wildcards(os.path.join(checkpoint_output, '{clade}.clade')).clade)
@@ -277,3 +328,7 @@ rule strphlan_check:
        '''
        touch {output.combined}
        '''
+
+# TODO:
+# -rule to agreggate reports from strphlan_clade_profiling,
+#  it needs to parse the {clade}.info files