replace rule strphlan_check with strphlan_summary to agregate info files from...

replace rule strphlan_check with strphlan_summary to agregate info files from StrainPhlAn into a report table

replace rule strphlan_check with strphlan_summary to agregate info files from...
cf78ba0c · Ticlla Ccenhua Monica Roxana · 1ec25cbb · cf78ba0c · 1ec25cbb · cf78ba0c
Commit cf78ba0c authored 5 years ago by Ticlla Ccenhua Monica Roxana
--- a/Snakefile
+++ b/Snakefile
@@ -153,11 +153,11 @@ rule MetaPhlAn2:
        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS))
 rule StrainPhlAn:
    input:
-        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS))
+        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/{dataset}_clades_profiled.tsv', dataset=set(DATASETS))
 rule PhlAnProf:
    input:
        expand(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png', dataset=set(DATASETS)),
-        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done', dataset=set(DATASETS))
+        expand(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/{dataset}_clades_profiled.tsv', dataset=set(DATASETS))

 ##----------------------------------------------------------------------------##
 ## Rules to make reports
@@ -195,7 +195,7 @@ rule preQC_make_report:
        --report {params.report_path} \
        -s {params.workflow_dir}/Snakefile preQC) &>{log}
        '''
-        
+
 rule MetaPhlAn2_make_report:
    output:
        temp(touch(OUT_DIR+'/logs/metaphlan2_make_report.done'))
@@ -212,7 +212,7 @@ rule MetaPhlAn2_make_report:
        --report {params.report_path} \
        -s {params.workflow_dir}/Snakefile MetaPhlAn2) &>{log}
        '''
-        
+
 rule PhlAnProf_make_report:
    output:
        temp(touch(OUT_DIR+'/logs/phlanprof_make_report.done'))

--- a/images/PhlAnProf_dag.png
+++ b/images/PhlAnProf_dag.png
--- a/rules/phlanprof.smk
+++ b/rules/phlanprof.smk
@@ -16,7 +16,7 @@ sequencing.
 localrules:
    metaphlan2_merge,
    metaphlan2_visualize,
-    strphlan_check
+    strphlan_summary
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
@@ -82,31 +82,31 @@ rule metaphlan2_merge:
    output:
        merged_abund = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                              '/profiles_merged/{dataset}_abundances_table.txt',
-                              category='PhlAnProf:Metaphlan2', 
+                              category='PhlAnProf:Metaphlan2',
                              caption='../report/PhlAnProf/metaphlan2_abund.rst'),
        merged_counts = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                               '/profiles_merged/{dataset}_counts_table.txt',
-                               category='PhlAnProf:Metaphlan2', 
+                               category='PhlAnProf:Metaphlan2',
                               caption='../report/PhlAnProf/metaphlan2_counts.rst'),
        merged_abund_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                                 '/profiles_merged/{dataset}_abundances_sp_table.txt',
-                                 category='PhlAnProf:Metaphlan2', 
+                                 category='PhlAnProf:Metaphlan2',
                                 caption='../report/PhlAnProf/metaphlan2_abund_sp.rst'),
        merged_counts_sp = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                                  '/profiles_merged/{dataset}_counts_sp_table.txt',
-                                  category='PhlAnProf:Metaphlan2', 
+                                  category='PhlAnProf:Metaphlan2',
                                  caption='../report/PhlAnProf/metaphlan2_counts_sp.rst'),
        merged_abund_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                                       '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt',
-                                       category='PhlAnProf:Metaphlan2', 
+                                       category='PhlAnProf:Metaphlan2',
                                       caption='../report/PhlAnProf/metaphlan2_abund_sp_cleaned.rst'),
        merged_counts_sp_clean = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                                        '/profiles_merged/{dataset}_cleaned_counts_sp_table.txt',
-                                        category='PhlAnProf:Metaphlan2', 
+                                        category='PhlAnProf:Metaphlan2',
                                        caption='../report/PhlAnProf/metaphlan2_counts_sp_cleaned.rst'),
        samples_removed = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                                 '/profiles_merged/{dataset}_samples_removed.txt',
-                                 category='PhlAnProf:Metaphlan2', 
+                                 category='PhlAnProf:Metaphlan2',
                                 caption='../report/PhlAnProf/metaphlan2_samples_removed.rst')

    log:
@@ -146,7 +146,7 @@ rule metaphlan2_visualize:
        merged_abund_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt'
    output:
        merged_abund_sp_png = report(OUT_DIR + '/{dataset}/PhlAnProf/metaphlan/profiles_merged/{dataset}_abundances_sp_heatmap.png',
-                                     category='PhlAnProf:Metaphlan2', 
+                                     category='PhlAnProf:Metaphlan2',
                                     caption='../report/PhlAnProf/metaphlan2_abund_sp_heatmap.rst')
    log:
        OUT_DIR + '/{dataset}/PhlAnProf/logs/metaphlan/merged_abundances_sp_heatmap.log'
@@ -203,8 +203,8 @@ checkpoint strphlan_clades_detection:
        merged_abundances_sp = OUT_DIR + '/{dataset}/PhlAnProf/metaphlan' +
                               '/profiles_merged/{dataset}_cleaned_abundances_sp_table.txt'
    output:
-        clades_list = report(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/clades.txt',
-                             category='PhlAnProf:StrainPhlAn', 
+        clades_list = report(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/{dataset}_clades.txt',
+                             category='PhlAnProf:StrainPhlAn',
                             caption='../report/PhlAnProf/strphlan_clades_selected.rst'),
        clades_detected_dir = directory("{}/{{dataset}}/PhlAnProf/strphlan/clades_detected".format(OUT_DIR))
    log:
@@ -320,16 +320,55 @@ def aggregate_clade_profiles(wildcards):
                  dataset=wildcards.dataset,
                  clade=glob_wildcards(os.path.join(checkpoint_output, '{clade}.clade')).clade)

-rule strphlan_check:
+rule strphlan_summary:
    input:
        aggregate_clade_profiles
    output:
-        combined = OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done'
-    shell:
-        '''
-        touch {output.combined}
-        '''
+        #combined = OUT_DIR + '/{dataset}/PhlAnProf/strphlan/all.done',
+        clades_profiled = report(OUT_DIR + '/{dataset}/PhlAnProf/strphlan/{dataset}_clades_profiled.tsv',
+                                 category='PhlAnProf:StrainPhlAn',
+                                 caption='../report/PhlAnProf/strphlan_clades_profiled.rst')
+    log:
+        OUT_DIR + '/{dataset}/PhlAnProf/logs/strphlan/strphlan_summary.log'
+    run:
+        import os
+        from pathlib import Path
+        import pandas as pd
+        import collections
+        import sys
+
+        sys.stdout = open(log[0], 'w')

-# TODO:
-# -rule to agreggate reports from strphlan_clade_profiling,
-#  it needs to parse the {clade}.info files
+        clades_info_list = list()
+        try:
+            for clade_dir in input:
+                clade_name = str(clade_dir).split('/')[-1]
+                clade_info_file = str(clade_dir)+'/{}.info'.format(clade_name)
+                if os.path.exists(clade_info_file):
+                    with open(clade_info_file, mode='r') as f:
+                        clade_info_dict = collections.OrderedDict((line.strip().split(':')[0],
+                                                                   line.strip().split(':')[1].strip())
+                                                                  for line in f)
+                    clade_raxml_tree_png = '{}/RAxML_bestTree.{}.tree.metadata.png'.format(str(clade_dir),
+                                                                                           clade_name)
+                    if os.path.exists(clade_raxml_tree_png):
+                        clade_info_dict['raxml_tree_png'] = clade_raxml_tree_png
+                        clade_raxml_ggtree_1 = '{}/RAxML_bestTree.{}.tree_1.png'.format(str(clade_dir),
+                                                                                        clade_name)
+                        if os.path.exists(clade_raxml_ggtree_1):
+                            clade_info_dict['raxml_ggtree_1_png'] = clade_raxml_ggtree_1
+                            clade_raxml_ggtree_2 = '{}/RAxML_bestTree.{}.tree_2.png'.format(str(clade_dir),
+                                                                                            clade_name)
+                            if os.path.exists(clade_raxml_ggtree_2):
+                                clade_info_dict['raxml_ggtree_2_png'] = clade_raxml_ggtree_2
+                    clades_info_list.append(clade_info_dict)
+        except (TypeError, ValueError) as e:
+            print(e)
+        else:
+            clades_info_table = pd.DataFrame(clades_info_list)
+            clades_info_table.to_csv(output.clades_profiled, sep='\t', header=True, index=False)
+        finally:
+            sys.stdout.close()
+        #'''
+        #touch {output.combined}
+        #'''