From 4f7447ac1248678f4e05b95b5cee733598054af1 Mon Sep 17 00:00:00 2001 From: burri0000 <dominik.burri@unibas.ch> Date: Fri, 21 Feb 2020 12:28:06 +0100 Subject: [PATCH] alfa for all samples --- envs/env_alfa.yaml | 1 + snakemake/Snakefile | 87 +++++++++++++++++-- snakemake/paired_end.snakefile.smk | 25 +++--- snakemake/single_end.snakefile.smk | 25 +++--- tests/test_integration_workflow/test.local.sh | 3 +- 5 files changed, 108 insertions(+), 33 deletions(-) diff --git a/envs/env_alfa.yaml b/envs/env_alfa.yaml index 0faa646..9dd3b45 100644 --- a/envs/env_alfa.yaml +++ b/envs/env_alfa.yaml @@ -6,3 +6,4 @@ channels: - defaults dependencies: - alfa=1.1.1 + - rename=1.600 diff --git a/snakemake/Snakefile b/snakemake/Snakefile index 1e81a7e..1758062 100644 --- a/snakemake/Snakefile +++ b/snakemake/Snakefile @@ -12,11 +12,13 @@ import pandas as pd samples_table = pd.read_csv(config["samples"], header=0, index_col=0, comment='#', engine='python', sep="\t") directionality = {"--fr": "fr-firststrand", "--rv": "fr-secondstrand"} -rename_files = {"fr-firststrand": ("STAR_Signal.UniqueMultiple.out.plus.bg", - "STAR_Signal.UniqueMultiple.out.minus.bg"), - "fr-secondstrand": ("STAR_Signal.UniqueMultiple.out.minus.bg", - "STAR_Signal.UniqueMultiple.out.plus.bg")} - +rename_files = {"fr-firststrand": ("Signal.UniqueMultiple.out.plus.bg", + "Signal.UniqueMultiple.out.minus.bg"), + "fr-secondstrand": ("Signal.UniqueMultiple.out.minus.bg", + "Signal.UniqueMultiple.out.plus.bg")} +rename_expr = {"fr-firststrand": ("out.plus", "out.minus"), + "fr-secondstrand": ("out.minus", "out.plus")} + localrules: finish ################################################################################## @@ -60,7 +62,11 @@ rule finish: "ALFA_plots.Biotypes.pdf"), zip, sample= [i for i in list(samples_table.index.values)], - seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]) + seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]), + alfa = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Biotypes.pdf") rule create_index_star: @@ -193,3 +199,72 @@ rule generate_alfa_index: -p {threads} \ -o {params.out_dir} &> {log}" + +# TODO: add rule that executes ALFA on all samples; should be on the same genome & same orientation +rule alfa_bg_all_samples: + ''' Run ALFA from stranded bedgraph files ''' + input: + str1 = expand(os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "ALFA", + "{sample}_Signal.UniqueMultiple.str1.out.bg"), + zip, + sample= [i for i in list(samples_table.index.values)], + seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]), + str2 = expand(os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "ALFA", + "{sample}_Signal.UniqueMultiple.str2.out.bg"), + zip, + sample= [i for i in list(samples_table.index.values)], + seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]), + gtf = [os.path.join(config["alfa_indexes"], samples_table.loc[sample1, "organism"], + str(samples_table.loc[sample1, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index") for sample1 in list(samples_table.index.values)] + output: + biotypes = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Biotypes.pdf"), + categories = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Categories.pdf") + params: + str1 = lambda wildcards, input: [os.path.abspath(i) for i in input.str1], + str2 = lambda wildcards, input: [os.path.abspath(i) for i in input.str2], + out_dir = lambda wildcards, output: directory(os.path.dirname(output.biotypes)), + in_file_str1 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str1], # copy files into out_dir and create with expand construct of bg + name + rename_str1 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] for sample1 in list(samples_table.index.values)][0], + in_file_str2 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str2], + rename_str2 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] for sample1 in list(samples_table.index.values)][0], + genome_index = [os.path.abspath(os.path.join( + config["alfa_indexes"], + samples_table.loc[sample1, "organism"], + str(samples_table.loc[sample1, "index_size"]), + "ALFA", "sorted_genes")) for sample1 in list(samples_table.index.values)][0], + orientation = [directionality[samples_table.loc[sample1, "kallisto_directionality"]] for sample1 in list(samples_table.index.values)][0], + bg_and_sample_name = [sample1 + "_Signal.UniqueMultiple." + + rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] + ".bg " + + sample1 + "_Signal.UniqueMultiple." + rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] + ".bg " + sample1 + for sample1 in list(samples_table.index.values)] + # singularity: + # "docker://zavolab/alfa:1.1.1" + conda: os.path.abspath("../../envs/env_alfa.yaml") + log: os.path.abspath(os.path.join(config["local_log"], "ALFA", "alfa_bg_all_samples.log")) + shell: + """ + mkdir -p {params.out_dir}; \ + cd {params.out_dir}; \ + cp {params.str1} . ; \ + cp {params.str2} . ; \ + rename 's/str1.out/{params.rename_str1}/' *str1.out.bg; \ + rename 's/str2.out/{params.rename_str2}/' *str2.out.bg; \ + (alfa -g {params.genome_index} \ + --bedgraph {params.bg_and_sample_name} \ + -s {params.orientation}) &> {log}; \ + rm *{params.rename_str1}.bg *{params.rename_str2}.bg + """ \ No newline at end of file diff --git a/snakemake/paired_end.snakefile.smk b/snakemake/paired_end.snakefile.smk index 61d8309..b92a203 100644 --- a/snakemake/paired_end.snakefile.smk +++ b/snakemake/paired_end.snakefile.smk @@ -354,25 +354,25 @@ rule star_rpm_paired_end: "paired_end", "{sample}", "ALFA", - "STAR_Signal.Unique.str1.out.bg"), + "{sample}_Signal.Unique.str1.out.bg"), os.path.join( config["output_dir"], "paired_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str1.out.bg")), + "{sample}_Signal.UniqueMultiple.str1.out.bg")), str2 = (os.path.join( config["output_dir"], "paired_end", "{sample}", "ALFA", - "STAR_Signal.Unique.str2.out.bg"), + "{sample}_Signal.Unique.str2.out.bg"), os.path.join( config["output_dir"], "paired_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str2.out.bg")) + "{sample}_Signal.UniqueMultiple.str2.out.bg")) params: out_dir = directory(os.path.join(config["output_dir"], "paired_end", @@ -381,7 +381,7 @@ rule star_rpm_paired_end: prefix = os.path.join(config["output_dir"], "paired_end", "{sample}", - "ALFA", "STAR_"), + "ALFA", "{sample}_"), stranded = "Stranded" singularity: "docker://zavolab/star:2.6.0a" @@ -409,13 +409,13 @@ rule alfa_bg_paired_end: "paired_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str1.out.bg"), + "{sample}_Signal.UniqueMultiple.str1.out.bg"), str2 = os.path.join( config["output_dir"], "paired_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str2.out.bg"), + "{sample}_Signal.UniqueMultiple.str2.out.bg"), gtf = lambda wildcards: os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index") output: @@ -438,12 +438,11 @@ rule alfa_bg_paired_end: "{sample}", "ALFA")), orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]], - in_file_str1 = "STAR_Signal.UniqueMultiple.str1.out.bg", - rename_str1 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0], - in_file_str2 = "STAR_Signal.UniqueMultiple.str2.out.bg", - rename_str2 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1], - genome_index = lambda wildcards: os.path.abspath(os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], - str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes")), + in_file_str1 = lambda wildcards, input: os.path.basename(input.str1), + rename_str1 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0], + in_file_str2 = lambda wildcards, input: os.path.basename(input.str2), + rename_str2 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1], + genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")), name = "{sample}" singularity: "docker://zavolab/alfa:1.1.1" diff --git a/snakemake/single_end.snakefile.smk b/snakemake/single_end.snakefile.smk index 280d4ea..4d5d231 100644 --- a/snakemake/single_end.snakefile.smk +++ b/snakemake/single_end.snakefile.smk @@ -280,25 +280,25 @@ rule star_rpm_single_end: "single_end", "{sample}", "ALFA", - "STAR_Signal.Unique.str1.out.bg"), + "{sample}_Signal.Unique.str1.out.bg"), os.path.join( config["output_dir"], "single_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str1.out.bg")), + "{sample}_Signal.UniqueMultiple.str1.out.bg")), str2 = (os.path.join( config["output_dir"], "single_end", "{sample}", "ALFA", - "STAR_Signal.Unique.str2.out.bg"), + "{sample}_Signal.Unique.str2.out.bg"), os.path.join( config["output_dir"], "single_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str2.out.bg")) + "{sample}_Signal.UniqueMultiple.str2.out.bg")) params: out_dir = directory(os.path.join(config["output_dir"], "single_end", @@ -307,7 +307,7 @@ rule star_rpm_single_end: prefix = os.path.join(config["output_dir"], "single_end", "{sample}", - "ALFA", "STAR_"), + "ALFA", "{sample}_"), stranded = "Stranded" singularity: "docker://zavolab/star:2.6.0a" @@ -336,13 +336,13 @@ rule alfa_bg_single_end: "single_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str1.out.bg"), + "{sample}_Signal.UniqueMultiple.str1.out.bg"), str2 = os.path.join( config["output_dir"], "single_end", "{sample}", "ALFA", - "STAR_Signal.UniqueMultiple.str2.out.bg"), + "{sample}_Signal.UniqueMultiple.str2.out.bg"), gtf = lambda wildcards: os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index") output: @@ -365,12 +365,11 @@ rule alfa_bg_single_end: "{sample}", "ALFA")), orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]], - in_file_str1 = "STAR_Signal.UniqueMultiple.str1.out.bg", - rename_str1 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0], - in_file_str2 = "STAR_Signal.UniqueMultiple.str2.out.bg", - rename_str2 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1], - genome_index = lambda wildcards: os.path.abspath(os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], - str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes")), + in_file_str1 = lambda wildcards, input: os.path.basename(input.str1), + rename_str1 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0], + in_file_str2 = lambda wildcards, input: os.path.basename(input.str2), + rename_str2 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1], + genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")), name = "{sample}" singularity: "docker://zavolab/alfa:1.1.1" diff --git a/tests/test_integration_workflow/test.local.sh b/tests/test_integration_workflow/test.local.sh index 6bcf8d6..f26856c 100755 --- a/tests/test_integration_workflow/test.local.sh +++ b/tests/test_integration_workflow/test.local.sh @@ -20,7 +20,8 @@ snakemake \ --printshellcmds \ --rerun-incomplete \ --use-singularity \ - --singularity-args="--bind ${PWD}" + --singularity-args="--bind ${PWD}" \ + --use-conda find results/ -type f -name \*\.gz -exec gunzip '{}' \; find results/ -type f -name \*\.zip -exec sh -c 'unzip -o {} -d $(dirname {})' \; md5sum --check "expected_output.md5" -- GitLab