Skip to content
Snippets Groups Projects
Commit a36902f0 authored by Ticlla Ccenhua Monica Roxana's avatar Ticlla Ccenhua Monica Roxana
Browse files

add rule get_human_ref to automatically download human reference and place it...

add rule get_human_ref to automatically download human reference and place it in the METASNK_DBS directory
parent cf78ba0c
Branches
No related tags found
No related merge requests found
......@@ -77,6 +77,8 @@ SHUB_PREQC_SIF = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
SHUB_METAPROF_SIF = 'shub://mticlla/OmicSingularities:metaprof'
PREQC_SIF = METASNK_DB_DIR+'/singularity/MetagenomicSnake_preqc_v0_1.sif'
METAPROF_SIF = METASNK_DB_DIR+'/singularity/OmicSingularities_metaprof.sif'
HUMAN_REF = eval(config['preprocess']['filter_human']['bbmap_reference'])
##----------------------------------------------------------------------------##
## Fastq files to be processed
##----------------------------------------------------------------------------##
......@@ -127,7 +129,9 @@ rule buildDBS:
METASNK_DB_DIR+'/strainphlan_markers/all_markers.fasta',
METASNK_DB_DIR+'/humann2_databases/chocophlan',
METASNK_DB_DIR+'/humann2_databases/uniref',
METASNK_DB_DIR+'/humann2_databases/utility_mapping'
METASNK_DB_DIR+'/humann2_databases/utility_mapping',
HUMAN_REF
#METASNK_DB_DIR+'/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
rule rawQC:
input:
......
......@@ -3,12 +3,12 @@
# one row per sample. It can be parsed easily via pandas.
# Set PATHS
# WARNING:
# WARNING:
# MetagenomicSnake uses singularity containers to run the workflow.
# Bind access to those paths if needed
# Singularity's --bind/-B option can be specified multiple times,
# Singularity's --bind/-B option can be specified multiple times,
# or a comma-delimited string of bind path specifications can be used.
#
#
# Absolute PATH to folder where to find fastq files
RAW_DIR: '/home/mticlla/Documents/Git_repositories/metagenomicsnake_testdata/data/raw'
......@@ -39,16 +39,21 @@ preprocess:
bbmap_usemodulo: t
bbmap_mem: 10
# PATH to reference(e.g human) genome
# -----------------------------------
# Masked hg19 kindly provided by Brian Brushnell and
# manually dowloaded from
# PATH to reference(e.g human) genome to be removed
# -------------------------------------------------
# MetaSnk uses by default the masked hg19 genome, kindly provided by Brian Brushnell
# and dowloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
# How the human genome was masked is explained in this SEQanswers entry
# http://seqanswers.com/forums/archive/index.php/t-42552.html
bbmap_reference: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'
bbmap_ref_dir: '/home/mticlla/Documents/Git_repositories/metagenomicsnake/ref/ref_bbsplit'
# If you want to use a different human reference genome, provide the absolute path below
bbmap_reference: "METASNK_DB_DIR+'/ref_genomes/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz'"
# PATH to the folder where to place the indexed human reference genomes
# ---------------------------------------------------------------------
# The reference genome needs to be indexed, by default the indexed files are
# placed in your $METASNK_DBS dir (set during the installation steps of MetaSnk)
bbmap_ref_dir: "METASNK_DB_DIR+'/ref_genomes/ref_bbsplit'"
phlanprof:
strphlan_clade_profiling:
metadatas:
......
images/buildDBS_dag.png

18.8 KiB | W: | H:

images/buildDBS_dag.png

22.4 KiB | W: | H:

images/buildDBS_dag.png
images/buildDBS_dag.png
images/buildDBS_dag.png
images/buildDBS_dag.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -25,50 +25,48 @@ localrules:
##----------------------------------------------------------------------------##
## Local variables
##----------------------------------------------------------------------------##
#singularity_img = 'shub://mticlla/MetagenomicSnake:preqc_v0_1'
singularity_preqc = PREQC_SIF
BBMAP_REF = config['preprocess']['filter_human']['bbmap_reference']
BBMAP_REF_DIR = config['preprocess']['filter_human']['bbmap_ref_dir']
BBMAP_REF = HUMAN_REF
BBMAP_REF_DIR = eval(config['preprocess']['filter_human']['bbmap_ref_dir'])
##----------------------------------------------------------------------------##
## Helper functions
##----------------------------------------------------------------------------##
# Helper functions to create list of files for MultiQC
def mqc_trim_adapters_inputs(wildcards):
list_files = ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR,
value,
FASTQS[ix])
for ix,value in enumerate(DATASETSX)
list_files = ['{}/{}/preQC/atrimmed/{}.fastp.json'.format(OUT_DIR,
value,
FASTQS[ix])
for ix,value in enumerate(DATASETSX)
if value==wildcards.dataset]
return(list_files)
def mqc_filter_human_inputs(wildcards):
list_files = ['{}/{}/preQC/bfiltered/{}.clean.stats'.format(OUT_DIR,
value,
FASTQS[ix])
for ix,value in enumerate(DATASETSX)
list_files = ['{}/{}/preQC/bfiltered/{}.clean.stats'.format(OUT_DIR,
value,
FASTQS[ix])
for ix,value in enumerate(DATASETSX)
if value==wildcards.dataset]
return(list_files)
def mqc_trim_3end_inputs(wildcards):
list_files = ['{}/{}/preQC/dfinaltrim/{}.clean.nodup.fastp.json'.format(OUT_DIR,
value,
list_files = ['{}/{}/preQC/dfinaltrim/{}.clean.nodup.fastp.json'.format(OUT_DIR,
value,
FASTQS[ix])
for ix,value in enumerate(DATASETSX)
for ix,value in enumerate(DATASETSX)
if value==wildcards.dataset]
return(list_files)
def list_concatenated_r1_fastqs(wildcards):
list_r1 = ['{}/{}/preQC/emerged/{}-R1.fastq.gz'.format(OUT_DIR,
wildcards.dataset,
value)
value)
for ix,value in enumerate(SAMPLES) if DATASETS[ix]==wildcards.dataset]
return(list_r1)
def list_concatenated_r2_fastqs(wildcards):
list_r2 = ['{}/{}/preQC/emerged/{}-R2.fastq.gz'.format(OUT_DIR,
wildcards.dataset,
value)
value)
for ix,value in enumerate(SAMPLES) if DATASETS[ix]==wildcards.dataset]
return(list_r2)
##----------------------------------------------------------------------------##
......@@ -83,14 +81,14 @@ def list_concatenated_r2_fastqs(wildcards):
# - limiting the N gase number (-n)
# - percentage of unqualified bases (-u)
# - average quality score (-e, default 0 means no requirement)
# - remove adapters: enabled by default, fastp detects adapters by
# per-read overlap analysis (seeks for the overlap of each read pair).
# If fastp fails to find an overlap, It usess the provided the adapter
# sequences (e.g Nextera XT adapters).
# - remove adapters: enabled by default, fastp detects adapters by
# per-read overlap analysis (seeks for the overlap of each read pair).
# If fastp fails to find an overlap, It usess the provided the adapter
# sequences (e.g Nextera XT adapters).
# Check this page from Illumina to know which adapters to remove:
# https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html
# - base correction in overlapped regions
# - trimming of the last base(s) for every read pair(this step preceeds
# - trimming of the last base(s) for every read pair(this step preceeds
# adapter removal)
# - length filtering: discard reads shorter than a minimum length, after trimming
#
......@@ -187,8 +185,8 @@ rule filter_human:
#
rule index_human_ref:
input:
# Masked hg19 kindly provided by Brian Brushnell and
# manually dowloaded from
# Masked hg19 kindly provided by Brian Brushnell and
# dowloaded from
# https://drive.google.com/open?id=0B3llHR93L14wd0pSSnFULUlhcUk
reference = BBMAP_REF
output:
......@@ -281,7 +279,7 @@ rule trim_3end:
--thread {threads}) &>{log}
'''
###**THESE TARGET FILES ARE THE FINAL CLEAN**###
# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and
# concatenate quality/length filtered, adapter-trimmed, cleaned, deduplicated and
# quality-trimmed fastqs from the same samples
rule concatenate_fastqs:
input:
......@@ -310,15 +308,15 @@ rule check_concatenation:
inputs_r1 = list_concatenated_r1_fastqs,
inputs_r2 = list_concatenated_r2_fastqs
output:
concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
concatenated_fastqs_list = OUT_DIR + '/{dataset}/preQC/summary_stats/' +
'{dataset}_concatenation_all.done'
#conda:'../envs/rawQC.yaml'
run:
import os
from pathlib import Path
try:
sample_fastqs_exist = [os.path.exists(r1_file) and os.path.exists(r2_file)
sample_fastqs_exist = [os.path.exists(r1_file) and os.path.exists(r2_file)
for r1_file, r2_file in zip(input.inputs_r1,input.inputs_r2)]
if all(sample_fastqs_exist):
Path(output.concatenated_fastqs_list).touch()
......@@ -386,12 +384,12 @@ rule mqc_trim_adapters:
input:
OUT_DIR +'/{dataset}/preQC/multiqc/atrimmed/multiqc_inputs.txt'
output:
multiqc_data_dir = OUT_DIR +
multiqc_data_dir = OUT_DIR +
'/{dataset}/preQC/multiqc/atrimmed'+
'/{dataset}_atrimmed_mqc_data/multiqc_data.json',
multiqc_report = report(OUT_DIR +
'/{dataset}/preQC/multiqc/atrimmed' +
'/{dataset}_atrimmed_mqc.html',
multiqc_report = report(OUT_DIR +
'/{dataset}/preQC/multiqc/atrimmed' +
'/{dataset}_atrimmed_mqc.html',
category='preQC_step1:trim_adapters')
singularity: singularity_preqc
shell:
......@@ -403,25 +401,25 @@ rule mqc_filter_human:
input:
OUT_DIR +'/{dataset}/preQC/multiqc/bfiltered/multiqc_inputs.txt'
output:
mqc_report = report(OUT_DIR +
'/{dataset}/preQC/multiqc/bfiltered' +
'/{dataset}_bfiltered_stats.tsv',
mqc_report = report(OUT_DIR +
'/{dataset}/preQC/multiqc/bfiltered' +
'/{dataset}_bfiltered_stats.tsv',
category='preQC_step2:filter_human')
#conda:'../envs/rawQC.yaml'
run:
from utils import summarize_filter_human_step
mqc_stats_data = summarize_filter_human_step('{}'.format(input))
mqc_stats_data.to_csv(output.mqc_report, sep='\t', header=True,
mqc_stats_data.to_csv(output.mqc_report, sep='\t', header=True,
index=True, index_label='Unit')
#
rule multiqc_trim_3end:
input:
OUT_DIR+'/{dataset}/preQC/multiqc/dfinaltrim/multiqc_inputs.txt'
output:
multiqc_fastqc = OUT_DIR +
multiqc_fastqc = OUT_DIR +
'/{dataset}/preQC/multiqc/dfinaltrim'+
'/{dataset}_dfinaltrim_mqc_data/multiqc_data.json',
multiqc_report = report(OUT_DIR +
multiqc_report = report(OUT_DIR +
'/{dataset}/preQC/multiqc/dfinaltrim'+
'/{dataset}_dfinaltrim_mqc.html',
category='preQC_step4:trim_3end')
......@@ -436,64 +434,64 @@ rule summarize_preQC:
input:
trim_adapters_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/atrimmed'+
'/{dataset}_atrimmed_mqc_data/multiqc_data.json',
filter_human_mqc_stats = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered' +
filter_human_mqc_stats = OUT_DIR + '/{dataset}/preQC/multiqc/bfiltered' +
'/{dataset}_bfiltered_stats.tsv',
trim3end_mqc_json = OUT_DIR + '/{dataset}/preQC/multiqc/dfinaltrim'+
'/{dataset}_dfinaltrim_mqc_data/multiqc_data.json'
log:
OUT_DIR + '/{dataset}/preQC/logs/summarize_preqc.log'
output:
units_summary = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_summary.tsv',
units_summary = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_summary.tsv',
category='preQC:summaries'),
samples_summary = report(OUT_DIR +
samples_summary = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_samples_summary.tsv',
'/{dataset}_preqc_samples_summary.tsv',
category='preQC:summaries'),
units_summary_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_barchart.svg',
units_summary_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_barchart.svg',
category='preQC:summaries'),
samples_summary_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_samples_barchart.svg',
category='preQC:summaries'),
units_summary_pct_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_pct_barchart.svg',
samples_summary_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_samples_barchart.svg',
category='preQC:summaries'),
units_summary_pct_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats' +
'/{dataset}_preqc_units_pct_barchart.svg',
category='preQC:summaries'),
samples_summary_pct_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats'+
'/{dataset}_preqc_samples_pct_barchart.svg',
samples_summary_pct_plot = report(OUT_DIR +
'/{dataset}/preQC/summary_stats'+
'/{dataset}_preqc_samples_pct_barchart.svg',
category='preQC:summaries')
#conda:'../envs/rawQC.yaml'
run:
from utils import summarize_preqc
from utils import plot_preqc_summary
import sys
sys.stdout = open(log[0], 'w')
try:
trim_adapters_mqc_json = '{}'.format(input.trim_adapters_mqc_json)
filter_human_mqc_stats = '{}'.format(input.filter_human_mqc_stats)
trim3end_mqc_json = '{}'.format(input.trim3end_mqc_json)
# units summary
units_summary_df = summarize_preqc(trim_adapters_mqc_json, filter_human_mqc_stats,
units_summary_df = summarize_preqc(trim_adapters_mqc_json, filter_human_mqc_stats,
trim3end_mqc_json)
except (TypeError, ValueError) as e:
print(e)
print(e)
else:
units_summary_df.to_csv(output.units_summary, sep='\t', header=True, index=True,
units_summary_df.to_csv(output.units_summary, sep='\t', header=True, index=True,
index_label='Unit')
# samples summary
samples_summary_df = units_summary_df.groupby(['Sample']).agg('sum')
pct_columns = ['trim_adapters_pct_disc', 'filter_human_pct_disc', 'dedupe_pct_disc',
pct_columns = ['trim_adapters_pct_disc', 'filter_human_pct_disc', 'dedupe_pct_disc',
'trim_3end_pct_disc', 'total_pct_disc', 'total_pct_kept']
samples_summary_df[pct_columns] = units_summary_df[['Sample']+pct_columns].groupby(['Sample']).agg('mean')
samples_summary_df.to_csv(output.samples_summary, sep='\t', header=True, index=True,
samples_summary_df.to_csv(output.samples_summary, sep='\t', header=True, index=True,
index_label='Sample')
# radial barcharts
#
......@@ -510,4 +508,3 @@ rule summarize_preQC:
fig4.savefig(output.samples_summary_pct_plot, bbox_inches='tight', format='svg')
finally:
sys.stdout.close()
\ No newline at end of file
......@@ -24,8 +24,9 @@ localrules:
get_humann2_chocophlan,
get_humann2_uniref,
get_humann2_utility,
get_humann2_dbs
get_humann2_dbs,
get_human_ref
##----------------------------------------------------------------------------##
## Rules to download/build container/databases used by MetaSnk
##----------------------------------------------------------------------------##
......@@ -59,6 +60,17 @@ rule pull_metaprof_sif:
$(basename {output}) \
{params.shub_sif}
'''
rule get_human_ref:
output:
# Human reference genome for removal of human DNA
human_ref = HUMAN_REF
message: "Downloading fasta file of human reference genome ..."
shell:
'''
[ ! -d $(dirname {output.human_ref}) ] && mkdir $(dirname {output.human_ref});
cd $(dirname {output.human_ref})
wget https://github.com/mticlla/MetaSnk_data/raw/master/ref/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz
'''
rule get_metaphlan_db:
output:
directory(METASNK_DB_DIR+'/metaphlan_databases')
......@@ -67,7 +79,7 @@ rule get_metaphlan_db:
shell:
'''
metaphlan2.py --install --bowtie2db {output}
'''
'''
rule get_strphlan_db:
input:
metaphlan_db_dir = METASNK_DB_DIR+'/metaphlan_databases'
......@@ -81,7 +93,7 @@ rule get_strphlan_db:
rule get_humann2_chocophlan:
output:
humann2_choco_dir = directory(METASNK_DB_DIR+'/humann2_databases/chocophlan')
singularity:METAPROF_SIF
singularity:METAPROF_SIF
shell:
'''
bash -c 'source activate humann2 && mkdir -p $(dirname {output.humann2_choco_dir}); \
......@@ -95,7 +107,7 @@ rule get_humann2_chocophlan:
rule get_humann2_uniref:
output:
humann2_uniref_dir = directory(METASNK_DB_DIR+'/humann2_databases/uniref')
singularity:METAPROF_SIF
singularity:METAPROF_SIF
shell:
'''
bash -c 'source activate humann2 && mkdir -p $(dirname {output.humann2_uniref_dir}); \
......@@ -122,4 +134,4 @@ rule get_humann2_dbs:
rule get_panphlan_db:
output:
directory(METASNK_DB_DIR+'/panphlan_databases')
\ No newline at end of file
directory(METASNK_DB_DIR+'/panphlan_databases')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment