From c95864b2a30c5c5eac12dab5d54bfc52721dc274 Mon Sep 17 00:00:00 2001 From: burri0000 <dominik.burri@unibas.ch> Date: Mon, 19 Oct 2020 10:53:10 +0200 Subject: [PATCH] use temp() and shadow rules for removing unnecessary files --- Snakefile | 76 ++++++++++++++----------- workflow/rules/paired_end.snakefile.smk | 26 +++++---- workflow/rules/single_end.snakefile.smk | 18 ++++-- 3 files changed, 72 insertions(+), 48 deletions(-) diff --git a/Snakefile b/Snakefile index 2753972..1648e97 100644 --- a/Snakefile +++ b/Snakefile @@ -170,16 +170,16 @@ rule create_index_star: """ input: genome = lambda wildcards: - get_sample( + os.path.abspath(get_sample( 'genome', search_id='organism', - search_value=wildcards.organism), + search_value=wildcards.organism)), gtf = lambda wildcards: - get_sample( + os.path.abspath(get_sample( 'gtf', search_id='organism', - search_value=wildcards.organism) + search_value=wildcards.organism)) output: chromosome_info = os.path.join( @@ -195,6 +195,8 @@ rule create_index_star: "STAR_index", "chrName.txt") + shadow: "full" + params: output_dir = os.path.join( config['star_indexes'], @@ -251,11 +253,11 @@ rule extract_transcriptome: search_id='organism', search_value=wildcards.organism) output: - transcriptome = os.path.join( + transcriptome = temp(os.path.join( config['output_dir'], "transcriptome", "{organism}", - "transcriptome.fa") + "transcriptome.fa")) log: stderr = os.path.join( @@ -293,11 +295,11 @@ rule concatenate_transcriptome_and_genome: search_value=wildcards.organism) output: - genome_transcriptome = os.path.join( + genome_transcriptome = temp(os.path.join( config['output_dir'], "transcriptome", "{organism}", - "genome_transcriptome.fa") + "genome_transcriptome.fa")) singularity: "docker://bash:5.0.16" @@ -339,6 +341,8 @@ rule create_index_salmon: "{kmer}", "salmon.idx")) + shadow: "full" + params: kmerLen = "{kmer}" @@ -382,6 +386,8 @@ rule create_index_kallisto: "{organism}", "kallisto.idx") + shadow: "full" + params: output_dir = os.path.join( config['kallisto_indexes'], @@ -414,9 +420,9 @@ rule extract_transcripts_as_bed12: get_sample('gtf') output: - bed12 = os.path.join( + bed12 = temp(os.path.join( config['output_dir'], - "full_transcripts_protein_coding.bed") + "full_transcripts_protein_coding.bed")) singularity: "docker://zavolab/zgtf:0.1" @@ -516,12 +522,14 @@ rule calculate_TIN_scores: "full_transcripts_protein_coding.bed") output: - TIN_score = os.path.join( + TIN_score = temp(os.path.join( config['output_dir'], "samples", "{sample}", "TIN", - "TIN_score.tsv") + "TIN_score.tsv")) + + shadow: "full" params: sample = "{sample}" @@ -944,30 +952,32 @@ rule star_rpm: search_value=wildcards.sample)) output: - str1 = os.path.join( + str1 = temp(os.path.join( config["output_dir"], "samples", "{sample}", "STAR_coverage", - "{sample}_Signal.Unique.str1.out.bg"), - str2 = os.path.join( + "{sample}_Signal.Unique.str1.out.bg")), + str2 = temp(os.path.join( config["output_dir"], "samples", "{sample}", "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str1.out.bg"), - str3 = os.path.join( + "{sample}_Signal.UniqueMultiple.str1.out.bg")), + str3 = temp(os.path.join( config["output_dir"], "samples", "{sample}", "STAR_coverage", - "{sample}_Signal.Unique.str2.out.bg"), - str4 = os.path.join( + "{sample}_Signal.Unique.str2.out.bg")), + str4 = temp(os.path.join( config["output_dir"], "samples", "{sample}", "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str2.out.bg") + "{sample}_Signal.UniqueMultiple.str2.out.bg")) + + shadow: "full" params: out_dir = lambda wildcards, output: @@ -1041,20 +1051,20 @@ rule rename_star_rpm_for_alfa: search_value=wildcards.sample)) output: - plus = os.path.join( + plus = temp(os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{unique}", - "{sample}.{unique}.plus.bg"), - minus = os.path.join( + "{sample}.{unique}.plus.bg")), + minus = temp(os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{unique}", - "{sample}.{unique}.minus.bg") + "{sample}.{unique}.minus.bg")) params: orientation = lambda wildcards: @@ -1088,10 +1098,10 @@ rule generate_alfa_index: ''' Generate ALFA index files from sorted GTF file ''' input: gtf = lambda wildcards: - get_sample( + os.path.abspath(get_sample( 'gtf', search_id='organism', - search_value=wildcards.organism), + search_value=wildcards.organism)), chr_len = os.path.join( config["star_indexes"], @@ -1114,6 +1124,8 @@ rule generate_alfa_index: "ALFA", "sorted_genes.unstranded.ALFA_index") + shadow: "full" + params: genome_index = "sorted_genes", out_dir = lambda wildcards, output: @@ -1171,20 +1183,20 @@ rule alfa_qc: "sorted_genes.stranded.ALFA_index") output: - biotypes = os.path.join( + biotypes = temp(os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{unique}", - "ALFA_plots.Biotypes.pdf"), - categories = os.path.join( + "ALFA_plots.Biotypes.pdf")), + categories = temp(os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{unique}", - "ALFA_plots.Categories.pdf"), + "ALFA_plots.Categories.pdf")), table = os.path.join( config["output_dir"], "samples", @@ -1446,13 +1458,13 @@ rule sort_bed_4_big: "{sample}.{unique}.{strand}.bg") output: - sorted_bg = os.path.join( + sorted_bg = temp(os.path.join( config["output_dir"], "samples", "{sample}", "bigWig", "{unique}", - "{sample}_{unique}_{strand}.sorted.bg") + "{sample}_{unique}_{strand}.sorted.bg")) singularity: "docker://cjh4zavolab/bedtools:2.27" diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 25046d0..f06d4dc 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -18,16 +18,16 @@ rule pe_remove_adapters_cutadapt: "{sample}.fq2.fastq.gz"), output: - reads1 = os.path.join( + reads1 = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.pe.remove_adapters_mate1.fastq.gz"), - reads2 = os.path.join( + "{sample}.pe.remove_adapters_mate1.fastq.gz")), + reads2 = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.pe.remove_adapters_mate2.fastq.gz") + "{sample}.pe.remove_adapters_mate2.fastq.gz")) params: adapter_3_mate1 = lambda wildcards: @@ -91,16 +91,16 @@ rule pe_remove_polya_cutadapt: "{sample}.pe.remove_adapters_mate2.fastq.gz") output: - reads1 = os.path.join( + reads1 = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.pe.remove_polya_mate1.fastq.gz"), - reads2 = os.path.join( + "{sample}.pe.remove_polya_mate1.fastq.gz")), + reads2 = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.pe.remove_polya_mate2.fastq.gz") + "{sample}.pe.remove_polya_mate2.fastq.gz")) params: polya_3_mate1 = lambda wildcards: @@ -203,6 +203,8 @@ rule pe_map_genome_star: "map_genome", "{sample}.pe.Log.final.out") + shadow: "full" + params: sample_id = "{sample}", index = lambda wildcards: @@ -292,10 +294,10 @@ rule pe_quantification_salmon: "{sample}", "{sample}.pe.remove_polya_mate2.fastq.gz"), gtf = lambda wildcards: - get_sample( + os.path.abspath(get_sample( 'gtf', search_id='index', - search_value=wildcards.sample), + search_value=wildcards.sample)), index = lambda wildcards: os.path.join( config["salmon_indexes"], @@ -323,6 +325,8 @@ rule pe_quantification_salmon: "{sample}.salmon.pe", "quant.sf") + shadow: "full" + params: output_dir = os.path.join( config["output_dir"], @@ -399,6 +403,8 @@ rule pe_genome_quantification_kallisto: "quant_kallisto", "{sample}.pe.kallisto.pseudo.sam") + shadow: "full" + params: output_dir = os.path.join( config["output_dir"], diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index 071b9e5..565bc01 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -11,11 +11,11 @@ rule remove_adapters_cutadapt: "{sample}.fq1.fastq.gz") output: - reads = os.path.join( + reads = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.se.remove_adapters_mate1.fastq.gz") + "{sample}.se.remove_adapters_mate1.fastq.gz")) params: adapters_3 = lambda wildcards: @@ -70,11 +70,11 @@ rule remove_polya_cutadapt: "{sample}.se.remove_adapters_mate1.fastq.gz") output: - reads = os.path.join( + reads = temp(os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.se.remove_polya_mate1.fastq.gz") + "{sample}.se.remove_polya_mate1.fastq.gz")) params: polya_3 = lambda wildcards: @@ -151,6 +151,8 @@ rule map_genome_star: "map_genome", "{sample}.se.Log.final.out") + shadow: "full" + params: sample_id = "{sample}", index = lambda wildcards: @@ -241,10 +243,10 @@ rule quantification_salmon: search_value=wildcards.sample), "salmon.idx"), gtf = lambda wildcards: - get_sample( + os.path.abspath(get_sample( 'gtf', search_id='index', - search_value=wildcards.sample) + search_value=wildcards.sample)) output: gn_estimates = os.path.join( @@ -260,6 +262,8 @@ rule quantification_salmon: "{sample}.salmon.se", "quant.sf") + shadow: "full" + params: output_dir = os.path.join( config["output_dir"], @@ -341,6 +345,8 @@ rule genome_quantification_kallisto: "quant_kallisto", "{sample}.se.kallisto.pseudo.sam") + shadow: "full" + params: output_dir = os.path.join( config["output_dir"], -- GitLab