diff --git a/Snakefile b/Snakefile
index efb213f6974ec15b6e6787e15476e91eb1180316..d56cc9beecd7835fca8e1e32e874175a98875241 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,6 +77,8 @@ SHUB_PREQC_SIF = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
 SHUB_METAPROF_SIF = 'shub://mticlla/OmicSingularities:metaprof'
 PREQC_SIF = METASNK_DB_DIR+'/singularity/MetagenomicSnake_preqc_v0_1.sif'
 METAPROF_SIF = METASNK_DB_DIR+'/singularity/OmicSingularities_metaprof.sif'
+HUMAN_REF = eval(config['preprocess']['filter_human']['bbmap_reference'])
+
 ##----------------------------------------------------------------------------##
 ## Fastq files to be processed
 ##----------------------------------------------------------------------------##
@@ -127,7 +129,9 @@ rule buildDBS:
         METASNK_DB_DIR+'/strainphlan_markers/all_markers.fasta',
         METASNK_DB_DIR+'/humann2_databases/chocophlan',
         METASNK_DB_DIR+'/humann2_databases/uniref',
-        METASNK_DB_DIR+'/humann2_databases/utility_mapping'
+        METASNK_DB_DIR+'/humann2_databases/utility_mapping',
+        HUMAN_REF
+        #METASNK_DB_DIR+'/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
 
 rule rawQC:
     input:
diff --git a/config.yaml b/config.yaml
index a165debec251c77e18fa8d54fd233333745de79f..e2a29fd54bdc9a2c4f2d21e657ca3021e10b1339 100644
--- a/config.yaml
+++ b/config.yaml
@@ -3,12 +3,12 @@
 # one row per sample. It can be parsed easily via pandas.
 
 # Set PATHS
-# WARNING: 
+# WARNING:
 # MetagenomicSnake uses singularity containers to run the workflow.
 # Bind access to those paths if needed
-# Singularity's --bind/-B option can be specified multiple times, 
+# Singularity's --bind/-B option can be specified multiple times,
 # or a comma-delimited string of bind path specifications can be used.
-# 
+#
 # Absolute PATH to folder where to find fastq files
 RAW_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/raw'
 
@@ -39,16 +39,21 @@ preprocess:
     bbmap_usemodulo: t
     bbmap_mem: 10
 
-    # PATH to reference(e.g human) genome
-    # -----------------------------------
-    # Masked hg19 kindly provided by Brian Brushnell and
-    # manually dowloaded from
+    # PATH to the reference (e.g. human) genome to be removed
+    # -------------------------------------------------------
+    # By default, MetaSnk uses the masked hg19 genome kindly provided by Brian Bushnell
+    # and downloaded from
     # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
     # How the human genome was masked is explained in this SEQanswers entry
     # http://seqanswers.com/forums/archive/index.php/t-42552.html
-    bbmap_reference: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
-    bbmap_ref_dir: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/ref_bbsplit'
-
+    # If you want to use a different human reference genome, provide the absolute path below
+    bbmap_reference: "METASNK_DB_DIR+'/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'"
+
+    # PATH to the folder where to place the indexed human reference genomes
+    # ---------------------------------------------------------------------
+    # The reference genome needs to be indexed; by default the indexed files are
+    # placed in your $METASNK_DBS dir (set during the installation steps of MetaSnk)
+    bbmap_ref_dir: "METASNK_DB_DIR+'/ref_genomes/ref_bbsplit'"
 phlanprof:
   strphlan_clade_profiling:
     metadatas:
diff --git a/images/buildDBS_dag.png b/images/buildDBS_dag.png
index 98b56ebd4a6191fb72d2be93e7be2a71bc2eaede..a00062d63045f65f7a72bd23e4ca24426ac60cdb 100644
Binary files a/images/buildDBS_dag.png and b/images/buildDBS_dag.png differ
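Note on the mechanism introduced above: bbmap_reference and bbmap_ref_dir now hold Python expression strings instead of literal paths, and the Snakefile resolves them with eval() against METASNK_DB_DIR. Below is a minimal standalone sketch of how this indirection resolves; the METASNK_DB_DIR value is hypothetical (MetaSnk sets it from $METASNK_DBS during installation), and PyYAML stands in for Snakemake's own config loading.

import yaml

METASNK_DB_DIR = '/home/user/metasnk_dbs'  # hypothetical; set elsewhere in the Snakefile

config = yaml.safe_load("""
preprocess:
  filter_human:
    bbmap_reference: "METASNK_DB_DIR+'/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'"
""")

# The YAML value is a Python expression *string*; eval() evaluates it in the
# current scope, where METASNK_DB_DIR is already defined.
HUMAN_REF = eval(config['preprocess']['filter_human']['bbmap_reference'])
print(HUMAN_REF)
# /home/user/metasnk_dbs/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz

Because the value passes through eval(), a user-supplied absolute path must stay a valid Python expression, i.e. a quoted string literal inside the YAML string, such as "'/data/refs/custom_human.fa.gz'".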
diff --git a/rules/preprocess.smk b/rules/preprocess.smk
index 2938d73d589f3a9fa5ee804ce26edd754121157f..0066ba55ab264375e9a5a84bea6a452c2966cdbd 100644
--- a/rules/preprocess.smk
+++ b/rules/preprocess.smk
@@ -25,50 +25,48 @@ localrules:
 ##----------------------------------------------------------------------------##
 ## Local variables
 ##----------------------------------------------------------------------------##
-#singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
 singularity_preqc = PREQC_SIF
-BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
-BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
-
+BBMAP_REF = HUMAN_REF
+BBMAP_REF_DIR = eval(config['preprocess']['filter_human']['bbmap_ref_dir'])
 ##----------------------------------------------------------------------------##
 ## Helper functions
 ##----------------------------------------------------------------------------##
 # Helper functions to create list of files for MultiQC
 def mqc_trim_adapters_inputs(wildcards):
-    list_files = ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR, 
-                                                              value, 
-                                                              FASTQS[ix]) 
-                  for ix,value in enumerate(DATASETSX) 
+    list_files = ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR,
+                                                              value,
+                                                              FASTQS[ix])
+                  for ix,value in enumerate(DATASETSX)
                   if value==wildcards.dataset]
     return(list_files)
 def mqc_filter_human_inputs(wildcards):
-    list_files = ['{}/{}/preQC/bfiltered/{}.clean.stats'.format(OUT_DIR, 
-                                                                value, 
-                                                                FASTQS[ix]) 
-                  for ix,value in enumerate(DATASETSX) 
+    list_files = ['{}/{}/preQC/bfiltered/{}.clean.stats'.format(OUT_DIR,
+                                                                value,
+                                                                FASTQS[ix])
+                  for ix,value in enumerate(DATASETSX)
                   if value==wildcards.dataset]
     return(list_files)
 def mqc_trim_3end_inputs(wildcards):
-    list_files = ['{}/{}/preQC/dfinaltrim/{}.clean.nodup.fastp.json'.format(OUT_DIR, 
-                                                                            value, 
+    list_files = ['{}/{}/preQC/dfinaltrim/{}.clean.nodup.fastp.json'.format(OUT_DIR,
+                                                                            value,
                                                                             FASTQS[ix])
-                  for ix,value in enumerate(DATASETSX) 
+                  for ix,value in enumerate(DATASETSX)
                   if value==wildcards.dataset]
     return(list_files)
 def list_concatenated_r1_fastqs(wildcards):
     list_r1 = ['{}/{}/preQC/emerged/{}-R1.fastq.gz'.format(OUT_DIR,
                                                            wildcards.dataset,
-                                                           value) 
+                                                           value)
                for ix,value in enumerate(SAMPLES)
                if DATASETS[ix]==wildcards.dataset]
     return(list_r1)
 def list_concatenated_r2_fastqs(wildcards):
     list_r2 = ['{}/{}/preQC/emerged/{}-R2.fastq.gz'.format(OUT_DIR,
                                                            wildcards.dataset,
-                                                           value) 
+                                                           value)
                for ix,value in enumerate(SAMPLES)
                if DATASETS[ix]==wildcards.dataset]
     return(list_r2)
 ##----------------------------------------------------------------------------##
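The helper functions in the hunk above all follow one pattern: walk two parallel lists (FASTQS and DATASETSX per unit, or SAMPLES and DATASETS per sample) and keep only the paths whose dataset matches the dataset wildcard. A toy illustration with hypothetical values; the Wildcards class is a minimal stand-in for the object Snakemake passes to input functions.

OUT_DIR = 'results'
DATASETSX = ['dsA', 'dsA', 'dsB']           # dataset of each fastq unit
FASTQS = ['s1_L001', 's1_L002', 's2_L001']  # unit names, same order as DATASETSX

def mqc_trim_adapters_inputs(wildcards):
    return ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR, value, FASTQS[ix])
            for ix, value in enumerate(DATASETSX)
            if value == wildcards.dataset]

class Wildcards:        # stand-in for snakemake.io.Wildcards
    dataset = 'dsA'

print(mqc_trim_adapters_inputs(Wildcards()))
# ['results/dsA/preQC/atrimmed/s1_L001.fastp.json',
#  'results/dsA/preQC/atrimmed/s1_L002.fastp.json']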
@@ -83,14 +81,14 @@ def list_concatenated_r2_fastqs(wildcards):
 # - limiting the N gase number (-n)
 # - percentage of unqualified bases (-u)
 # - average quality score (-e, default 0 means no requirement)
-# - remove adapters: enabled by default, fastp detects adapters by 
-# per-read overlap analysis (seeks for the overlap of each read pair). 
-# If fastp fails to find an overlap, It usess the provided the adapter 
-# sequences (e.g Nextera XT adapters). 
+# - remove adapters: enabled by default, fastp detects adapters by
+# per-read overlap analysis (seeking the overlap of each read pair).
+# If fastp fails to find an overlap, it uses the provided adapter
+# sequences (e.g. Nextera XT adapters).
 # Check this page from Illumina to know which adapters to remove:
 # https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
 # - base correction in overlapped regions
-# - trimming of the last base(s) for every read pair(this step preceeds 
+# - trimming of the last base(s) for every read pair (this step precedes
 # adapter removal)
 # - length filtering: discard reads shorter than a minimum length, after trimming
 #
@@ -187,8 +185,8 @@ rule filter_human:
 #
 rule index_human_ref:
     input:
-        # Masked hg19 kindly provided by Brian Brushnell and 
-        # manually dowloaded from 
+        # Masked hg19 kindly provided by Brian Bushnell and
+        # downloaded from
         # https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
         reference = BBMAP_REF
     output:
@@ -281,7 +279,7 @@ rule trim_3end:
         --thread {threads}) &>{log}
         '''
 ###**THESE TARGET FILES ARE THE FINAL CLEAN**###
-# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and 
+# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and
 # quality-trimmed fastqs from the same samples
 rule concatenate_fastqs:
     input:
@@ -310,15 +308,15 @@ rule check_concatenation:
         inputs_r1 = list_concatenated_r1_fastqs,
         inputs_r2 = list_concatenated_r2_fastqs
     output:
-        concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' + 
+        concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
                                    '{dataset}_concatenation_all.done'
     #conda:'../envs/rawQC.yaml'
     run:
         import os
         from pathlib import Path
-        
+
        try:
-            sample_fastqs_exist = [os.path.exists(r1_file) and os.path.exists(r2_file) 
+            sample_fastqs_exist = [os.path.exists(r1_file) and os.path.exists(r2_file)
                                    for r1_file, r2_file in zip(input.inputs_r1,input.inputs_r2)]
            if all(sample_fastqs_exist):
                Path(output.concatenated_fastqs_list).touch()
@@ -386,12 +384,12 @@ rule mqc_trim_adapters:
     input: OUT_DIR +'/{dataset}/preQC/multiqc/atrimmed/multiqc_inputs.txt'
     output:
-        multiqc_data_dir = OUT_DIR + 
+        multiqc_data_dir = OUT_DIR +
                            '/{dataset}/preQC/multiqc/atrimmed'+
                            '/{dataset}_atrimmed_mqc_data/multiqc_data.json',
-        multiqc_report = report(OUT_DIR + 
-                                '/{dataset}/preQC/multiqc/atrimmed' + 
-                                '/{dataset}_atrimmed_mqc.html', 
+        multiqc_report = report(OUT_DIR +
+                                '/{dataset}/preQC/multiqc/atrimmed' +
+                                '/{dataset}_atrimmed_mqc.html',
                                 category='preQC_step1:trim_adapters')
     singularity: singularity_preqc
     shell:
@@ -403,25 +401,25 @@ rule mqc_filter_human:
     input: OUT_DIR +'/{dataset}/preQC/multiqc/bfiltered/multiqc_inputs.txt'
     output:
-        mqc_report = report(OUT_DIR + 
-                            '/{dataset}/preQC/multiqc/bfiltered' + 
-                            '/{dataset}_bfiltered_stats.tsv', 
+        mqc_report = report(OUT_DIR +
+                            '/{dataset}/preQC/multiqc/bfiltered' +
+                            '/{dataset}_bfiltered_stats.tsv',
                             category='preQC_step2:filter_human')
     #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_filter_human_step
         mqc_stats_data = summarize_filter_human_step('{}'.format(input))
-        mqc_stats_data.to_csv(output.mqc_report, sep='\t', header=True, 
+        mqc_stats_data.to_csv(output.mqc_report, sep='\t', header=True,
                               index=True, index_label='Unit')
 #
 rule multiqc_trim_3end:
     input: OUT_DIR+'/{dataset}/preQC/multiqc/dfinaltrim/multiqc_inputs.txt'
     output:
-        multiqc_fastqc = OUT_DIR + 
+        multiqc_fastqc = OUT_DIR +
                          '/{dataset}/preQC/multiqc/dfinaltrim'+
                          '/{dataset}_dfinaltrim_mqc_data/multiqc_data.json',
-        multiqc_report = report(OUT_DIR + 
+        multiqc_report = report(OUT_DIR +
                                 '/{dataset}/preQC/multiqc/dfinaltrim'+
                                 '/{dataset}_dfinaltrim_mqc.html',
                                 category='preQC_step4:trim_3end')
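For reference, the check_concatenation run block shown above reduces to the following standalone pattern: confirm that every concatenated R1/R2 pair exists, then touch an empty .done marker. Paths here are hypothetical; in the rule they come from the list_concatenated_r*_fastqs input functions.

import os
from pathlib import Path

inputs_r1 = ['results/dsA/preQC/emerged/s1-R1.fastq.gz']
inputs_r2 = ['results/dsA/preQC/emerged/s1-R2.fastq.gz']
done_flag = Path('results/dsA/preQC/summary_stats/dsA_concatenation_all.done')

# One boolean per sample: both mates of the pair must exist.
pairs_exist = [os.path.exists(r1) and os.path.exists(r2)
               for r1, r2 in zip(inputs_r1, inputs_r2)]
if all(pairs_exist):
    done_flag.parent.mkdir(parents=True, exist_ok=True)  # Snakemake normally creates this
    done_flag.touch()                                    # empty flag consumed downstream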
@@ -436,64 +434,64 @@ rule summarize_preQC:
     input:
         trim_adapters_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/atrimmed'+
                                  '/{dataset}_atrimmed_mqc_data/multiqc_data.json',
-        filter_human_mqc_stats = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered' + 
+        filter_human_mqc_stats = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered' +
                                  '/{dataset}_bfiltered_stats.tsv',
         trim3end_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/dfinaltrim'+
                             '/{dataset}_dfinaltrim_mqc_data/multiqc_data.json'
     log: OUT_DIR + '/{dataset}/preQC/logs/summarize_preqc.log'
     output:
-        units_summary = report(OUT_DIR + 
-                               '/{dataset}/preQC/summary_stats' + 
-                               '/{dataset}_preqc_units_summary.tsv', 
+        units_summary = report(OUT_DIR +
+                               '/{dataset}/preQC/summary_stats' +
+                               '/{dataset}_preqc_units_summary.tsv',
                                category='preQC:summaries'),
-        samples_summary = report(OUT_DIR + 
+        samples_summary = report(OUT_DIR +
                                  '/{dataset}/preQC/summary_stats' +
-                                 '/{dataset}_preqc_samples_summary.tsv', 
+                                 '/{dataset}_preqc_samples_summary.tsv',
                                  category='preQC:summaries'),
-        units_summary_plot = report(OUT_DIR + 
-                                    '/{dataset}/preQC/summary_stats' + 
-                                    '/{dataset}_preqc_units_barchart.svg', 
+        units_summary_plot = report(OUT_DIR +
+                                    '/{dataset}/preQC/summary_stats' +
+                                    '/{dataset}_preqc_units_barchart.svg',
                                     category='preQC:summaries'),
-        samples_summary_plot = report(OUT_DIR + 
-                                      '/{dataset}/preQC/summary_stats' + 
-                                      '/{dataset}_preqc_samples_barchart.svg', 
-                                      category='preQC:summaries'),
-        units_summary_pct_plot = report(OUT_DIR + 
-                                        '/{dataset}/preQC/summary_stats' + 
-                                        '/{dataset}_preqc_units_pct_barchart.svg', 
+        samples_summary_plot = report(OUT_DIR +
+                                      '/{dataset}/preQC/summary_stats' +
+                                      '/{dataset}_preqc_samples_barchart.svg',
+                                      category='preQC:summaries'),
+        units_summary_pct_plot = report(OUT_DIR +
+                                        '/{dataset}/preQC/summary_stats' +
+                                        '/{dataset}_preqc_units_pct_barchart.svg',
                                         category='preQC:summaries'),
-        samples_summary_pct_plot = report(OUT_DIR + 
-                                          '/{dataset}/preQC/summary_stats'+ 
-                                          '/{dataset}_preqc_samples_pct_barchart.svg', 
+        samples_summary_pct_plot = report(OUT_DIR +
+                                          '/{dataset}/preQC/summary_stats'+
+                                          '/{dataset}_preqc_samples_pct_barchart.svg',
                                           category='preQC:summaries')
     #conda:'../envs/rawQC.yaml'
     run:
         from utils import summarize_preqc
         from utils import plot_preqc_summary
         import sys
-        
+
         sys.stdout = open(log[0], 'w')
-        
-        
+
+
         try:
             trim_adapters_mqc_json = '{}'.format(input.trim_adapters_mqc_json)
             filter_human_mqc_stats = '{}'.format(input.filter_human_mqc_stats)
             trim3end_mqc_json = '{}'.format(input.trim3end_mqc_json)
             # units summary
-            units_summary_df = summarize_preqc(trim_adapters_mqc_json, filter_human_mqc_stats, 
+            units_summary_df = summarize_preqc(trim_adapters_mqc_json, filter_human_mqc_stats,
                                                trim3end_mqc_json)
         except (TypeError, ValueError) as e:
-            print(e) 
+            print(e)
         else:
-            units_summary_df.to_csv(output.units_summary, sep='\t', header=True, index=True, 
+            units_summary_df.to_csv(output.units_summary, sep='\t', header=True, index=True,
                                     index_label='Unit')
             # samples summary
             samples_summary_df = units_summary_df.groupby(['Sample']).agg('sum')
-            pct_columns = ['trim_adapters_pct_disc', 'filter_human_pct_disc', 'dedupe_pct_disc', 
+            pct_columns = ['trim_adapters_pct_disc', 'filter_human_pct_disc', 'dedupe_pct_disc',
                            'trim_3end_pct_disc', 'total_pct_disc', 'total_pct_kept']
             samples_summary_df[pct_columns] = units_summary_df[['Sample']+pct_columns].groupby(['Sample']).agg('mean')
-            samples_summary_df.to_csv(output.samples_summary, sep='\t', header=True, index=True, 
+            samples_summary_df.to_csv(output.samples_summary, sep='\t', header=True, index=True,
                                       index_label='Sample')
             # radial barcharts
             #
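The aggregation at the end of summarize_preQC sums absolute read counts across units of the same sample but then overwrites the percentage columns with per-sample means, since percentages must not be added. A toy pandas sketch of that step; the count column is invented, and total_pct_kept is one of the real pct_columns.

import pandas as pd

units_summary_df = pd.DataFrame({
    'Sample': ['s1', 's1', 's2'],
    'reads_kept': [100, 120, 90],          # invented count column
    'total_pct_kept': [80.0, 90.0, 85.0],  # one of the pct_columns above
})

# Counts: summed per sample.
samples_summary_df = units_summary_df.groupby(['Sample']).agg('sum')

# Percentages: replaced by the per-sample mean of the unit values.
pct_columns = ['total_pct_kept']
samples_summary_df[pct_columns] = (
    units_summary_df[['Sample'] + pct_columns].groupby(['Sample']).agg('mean'))
print(samples_summary_df)
#         reads_kept  total_pct_kept
# Sample
# s1             220            85.0
# s2              90            85.0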
@@ -510,4 +508,3 @@ rule summarize_preQC:
                 fig4.savefig(output.samples_summary_pct_plot, bbox_inches='tight', format='svg')
             finally:
                 sys.stdout.close()
-
\ No newline at end of file
diff --git a/rules/setup_rules.smk b/rules/setup_rules.smk
index f84fd4ab134099491aa12cae13c1e8e5d337ffd2..29a61f7593ce74181f2a7f1d88119b4e9109a51b 100644
--- a/rules/setup_rules.smk
+++ b/rules/setup_rules.smk
@@ -24,8 +24,9 @@ localrules:
     get_humann2_chocophlan,
     get_humann2_uniref,
     get_humann2_utility,
-    get_humann2_dbs
-    
+    get_humann2_dbs,
+    get_human_ref
+
 ##----------------------------------------------------------------------------##
 ## Rules to download/build container/databases used by MetaSnk
 ##----------------------------------------------------------------------------##
@@ -59,6 +60,17 @@ rule pull_metaprof_sif:
         $(basename {output}) \
         {params.shub_sif}
         '''
+rule get_human_ref:
+    output:
+        # Human reference genome for removal of human DNA
+        human_ref = HUMAN_REF
+    message: "Downloading the human reference genome FASTA file ..."
+    shell:
+        '''
+        mkdir -p $(dirname {output.human_ref})
+        cd $(dirname {output.human_ref})
+        wget https://github.com/mticlla/MetaSnk_data/raw/master/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz
+        '''
 rule get_metaphlan_db:
     output: directory(METASNK_DB_DIR+'/metaphlan_databases')
@@ -67,7 +79,7 @@
     shell:
         '''
         metaphlan2.py --install --bowtie2db {output}
-        ''' 
+        '''
 rule get_strphlan_db:
     input:
         metaphlan_db_dir = METASNK_DB_DIR+'/metaphlan_databases'
@@ -81,7 +93,7 @@
 rule get_humann2_chocophlan:
     output:
         humann2_choco_dir = directory(METASNK_DB_DIR+'/humann2_databases/chocophlan')
-    singularity:METAPROF_SIF 
+    singularity:METAPROF_SIF
     shell:
         '''
         bash -c 'source activate humann2 && mkdir -p $(dirname {output.humann2_choco_dir}); \
@@ -95,7 +107,7 @@
 rule get_humann2_uniref:
     output:
         humann2_uniref_dir = directory(METASNK_DB_DIR+'/humann2_databases/uniref')
-    singularity:METAPROF_SIF 
+    singularity:METAPROF_SIF
     shell:
         '''
         bash -c 'source activate humann2 && mkdir -p $(dirname {output.humann2_uniref_dir}); \
@@ -122,4 +134,4 @@ rule get_humann2_dbs:
 
 rule get_panphlan_db:
     output:
-        directory(METASNK_DB_DIR+'/panphlan_databases')
\ No newline at end of file
+        directory(METASNK_DB_DIR+'/panphlan_databases')
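Since HUMAN_REF is now listed among the buildDBS inputs, running that target triggers get_human_ref before anything needs the reference. (The original shell block used `[ ! -d ... ] && mkdir ...`, which fails under strict bash mode when the directory already exists; `mkdir -p`, as used by the other rules in this file, is the safe equivalent.) A standalone Python sketch of what the rule's shell block does; the URL is taken from the rule, while dest_dir is a hypothetical stand-in for $METASNK_DBS/ref_genomes.

import os
import urllib.request

url = ('https://github.com/mticlla/MetaSnk_data/raw/master/ref/'
       'hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz')
dest_dir = '/home/user/metasnk_dbs/ref_genomes'  # hypothetical

os.makedirs(dest_dir, exist_ok=True)  # same effect as the rule's mkdir -p guard
urllib.request.urlretrieve(url, os.path.join(dest_dir, os.path.basename(url)))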