From 4f7447ac1248678f4e05b95b5cee733598054af1 Mon Sep 17 00:00:00 2001
From: burri0000 <dominik.burri@unibas.ch>
Date: Fri, 21 Feb 2020 12:28:06 +0100
Subject: [PATCH] alfa for all samples

---
 envs/env_alfa.yaml                            |  1 +
 snakemake/Snakefile                           | 87 +++++++++++++++++--
 snakemake/paired_end.snakefile.smk            | 25 +++---
 snakemake/single_end.snakefile.smk            | 25 +++---
 tests/test_integration_workflow/test.local.sh |  3 +-
 5 files changed, 108 insertions(+), 33 deletions(-)

diff --git a/envs/env_alfa.yaml b/envs/env_alfa.yaml
index 0faa646..9dd3b45 100644
--- a/envs/env_alfa.yaml
+++ b/envs/env_alfa.yaml
@@ -6,3 +6,4 @@ channels:
   - defaults
 dependencies:
   - alfa=1.1.1
+  - rename=1.600
diff --git a/snakemake/Snakefile b/snakemake/Snakefile
index 1e81a7e..1758062 100644
--- a/snakemake/Snakefile
+++ b/snakemake/Snakefile
@@ -12,11 +12,13 @@ import pandas as pd
 
 samples_table = pd.read_csv(config["samples"], header=0, index_col=0, comment='#', engine='python', sep="\t")
 directionality = {"--fr": "fr-firststrand", "--rv": "fr-secondstrand"}
-rename_files = {"fr-firststrand": ("STAR_Signal.UniqueMultiple.out.plus.bg", 
-	"STAR_Signal.UniqueMultiple.out.minus.bg"),
-	"fr-secondstrand": ("STAR_Signal.UniqueMultiple.out.minus.bg", 
-	"STAR_Signal.UniqueMultiple.out.plus.bg")}
-	
+rename_files = {"fr-firststrand": ("Signal.UniqueMultiple.out.plus.bg", 
+	"Signal.UniqueMultiple.out.minus.bg"),
+	"fr-secondstrand": ("Signal.UniqueMultiple.out.minus.bg", 
+	"Signal.UniqueMultiple.out.plus.bg")}
+rename_expr = {"fr-firststrand": ("out.plus", "out.minus"),
+	"fr-secondstrand": ("out.minus", "out.plus")}
+
 localrules: finish
 
 ##################################################################################
@@ -60,7 +62,11 @@ rule finish:
 			"ALFA_plots.Biotypes.pdf"),
 			zip,
 			sample= [i for i in list(samples_table.index.values)],
-			seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)])
+			seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
+		alfa = os.path.join(
+			config["output_dir"],
+			"ALFA",
+			"ALFA_plots.Biotypes.pdf")
 
 
 rule create_index_star:
@@ -193,3 +199,72 @@ rule generate_alfa_index:
 			-p {threads} \
 			-o {params.out_dir} &> {log}"
 
+
+# TODO: add rule that executes ALFA on all samples; should be on the same genome & same orientation
+rule alfa_bg_all_samples:
+	''' Run ALFA from stranded bedgraph files '''
+	input:
+		str1 = expand(os.path.join(
+			config["output_dir"],
+			"{seqmode}",
+			"{sample}",
+			"ALFA",
+			"{sample}_Signal.UniqueMultiple.str1.out.bg"),
+			zip,
+			sample= [i for i in list(samples_table.index.values)], 
+			seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
+		str2 = expand(os.path.join(
+			config["output_dir"],
+			"{seqmode}",
+			"{sample}",
+			"ALFA",
+			"{sample}_Signal.UniqueMultiple.str2.out.bg"),
+			zip,
+			sample= [i for i in list(samples_table.index.values)], 
+			seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
+		gtf = [os.path.join(config["alfa_indexes"], samples_table.loc[sample1, "organism"], 
+			str(samples_table.loc[sample1, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index") for sample1 in list(samples_table.index.values)]
+	output:
+		biotypes = os.path.join(
+			config["output_dir"],
+			"ALFA",
+			"ALFA_plots.Biotypes.pdf"),
+		categories = os.path.join(
+			config["output_dir"],
+			"ALFA",
+			"ALFA_plots.Categories.pdf")
+	params:
+		str1 = lambda wildcards, input: [os.path.abspath(i) for i in input.str1],
+		str2 = lambda wildcards, input: [os.path.abspath(i) for i in input.str2],
+		out_dir = lambda wildcards, output: directory(os.path.dirname(output.biotypes)),
+		in_file_str1 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str1], # copy files into out_dir and create with expand construct of bg + name
+		rename_str1 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] for sample1 in list(samples_table.index.values)][0],
+		in_file_str2 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str2],
+		rename_str2 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] for sample1 in list(samples_table.index.values)][0],
+		genome_index = [os.path.abspath(os.path.join(
+			config["alfa_indexes"], 
+			samples_table.loc[sample1, "organism"], 
+			str(samples_table.loc[sample1, "index_size"]), 
+			"ALFA", "sorted_genes")) for sample1 in list(samples_table.index.values)][0],
+		orientation = [directionality[samples_table.loc[sample1, "kallisto_directionality"]] for sample1 in list(samples_table.index.values)][0],
+		bg_and_sample_name = [sample1 + "_Signal.UniqueMultiple." + 
+			rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] + ".bg " + 
+			sample1 + "_Signal.UniqueMultiple." + rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] + ".bg " + sample1
+			for sample1 in list(samples_table.index.values)]
+	# singularity:
+	# 	"docker://zavolab/alfa:1.1.1"
+	conda: os.path.abspath("../../envs/env_alfa.yaml")
+	log: os.path.abspath(os.path.join(config["local_log"], "ALFA", "alfa_bg_all_samples.log"))
+	shell:
+		""" 
+		mkdir -p {params.out_dir}; \
+		cd {params.out_dir}; \
+		cp {params.str1} . ; \
+		cp {params.str2} . ; \
+		rename 's/str1.out/{params.rename_str1}/' *str1.out.bg; \
+		rename 's/str2.out/{params.rename_str2}/' *str2.out.bg; \
+		(alfa -g {params.genome_index} \
+			--bedgraph {params.bg_and_sample_name} \
+			-s {params.orientation}) &> {log}; \
+		rm *{params.rename_str1}.bg *{params.rename_str2}.bg
+		"""
\ No newline at end of file
diff --git a/snakemake/paired_end.snakefile.smk b/snakemake/paired_end.snakefile.smk
index 61d8309..b92a203 100644
--- a/snakemake/paired_end.snakefile.smk
+++ b/snakemake/paired_end.snakefile.smk
@@ -354,25 +354,25 @@ rule star_rpm_paired_end:
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.Unique.str1.out.bg"),
+			"{sample}_Signal.Unique.str1.out.bg"),
 			os.path.join(
 			config["output_dir"],
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str1.out.bg")),
+			"{sample}_Signal.UniqueMultiple.str1.out.bg")),
 		str2 = (os.path.join(
 			config["output_dir"],
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.Unique.str2.out.bg"),
+			"{sample}_Signal.Unique.str2.out.bg"), 
 			os.path.join(
 			config["output_dir"],
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str2.out.bg"))
+			"{sample}_Signal.UniqueMultiple.str2.out.bg"))
 	params:
 		out_dir = directory(os.path.join(config["output_dir"],
 			"paired_end",
@@ -381,7 +381,7 @@ rule star_rpm_paired_end:
 		prefix = os.path.join(config["output_dir"],
 			"paired_end",
 			"{sample}", 
-			"ALFA", "STAR_"),
+			"ALFA", "{sample}_"),
 		stranded = "Stranded"
 	singularity:
 		"docker://zavolab/star:2.6.0a"
@@ -409,13 +409,13 @@ rule alfa_bg_paired_end:
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str1.out.bg"),
+			"{sample}_Signal.UniqueMultiple.str1.out.bg"),
 		str2 = os.path.join(
 			config["output_dir"],
 			"paired_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str2.out.bg"),
+			"{sample}_Signal.UniqueMultiple.str2.out.bg"),
 		gtf = lambda wildcards: os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], 
 			str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index")
 	output:
@@ -438,12 +438,11 @@ rule alfa_bg_paired_end:
 			"{sample}",
 			"ALFA")),
 		orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]],
-		in_file_str1 = "STAR_Signal.UniqueMultiple.str1.out.bg",
-		rename_str1 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0],
-		in_file_str2 = "STAR_Signal.UniqueMultiple.str2.out.bg",
-		rename_str2 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1],
-		genome_index = lambda wildcards: os.path.abspath(os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], 
-			str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes")),
+		in_file_str1 = lambda wildcards, input: os.path.basename(input.str1),
+		rename_str1 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0],
+		in_file_str2 = lambda wildcards, input: os.path.basename(input.str2),
+		rename_str2 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1],
+		genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")),
 		name = "{sample}"
 	singularity:
 		"docker://zavolab/alfa:1.1.1"
diff --git a/snakemake/single_end.snakefile.smk b/snakemake/single_end.snakefile.smk
index 280d4ea..4d5d231 100644
--- a/snakemake/single_end.snakefile.smk
+++ b/snakemake/single_end.snakefile.smk
@@ -280,25 +280,25 @@ rule star_rpm_single_end:
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.Unique.str1.out.bg"),
+			"{sample}_Signal.Unique.str1.out.bg"),
 			os.path.join(
 			config["output_dir"],
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str1.out.bg")),
+			"{sample}_Signal.UniqueMultiple.str1.out.bg")),
 		str2 = (os.path.join(
 			config["output_dir"],
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.Unique.str2.out.bg"),
+			"{sample}_Signal.Unique.str2.out.bg"),
 			os.path.join(
 			config["output_dir"],
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str2.out.bg"))
+			"{sample}_Signal.UniqueMultiple.str2.out.bg"))
 	params:
 		out_dir = directory(os.path.join(config["output_dir"],
 			"single_end",
@@ -307,7 +307,7 @@ rule star_rpm_single_end:
 		prefix = os.path.join(config["output_dir"],
 			"single_end",
 			"{sample}", 
-			"ALFA", "STAR_"),
+			"ALFA", "{sample}_"),
 		stranded = "Stranded"
 	singularity:
 		"docker://zavolab/star:2.6.0a"
@@ -336,13 +336,13 @@ rule alfa_bg_single_end:
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str1.out.bg"),
+			"{sample}_Signal.UniqueMultiple.str1.out.bg"),
 		str2 = os.path.join(
 			config["output_dir"],
 			"single_end",
 			"{sample}",
 			"ALFA",
-			"STAR_Signal.UniqueMultiple.str2.out.bg"),
+			"{sample}_Signal.UniqueMultiple.str2.out.bg"),
 		gtf = lambda wildcards: os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], 
 			str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index")
 	output:
@@ -365,12 +365,11 @@ rule alfa_bg_single_end:
 			"{sample}",
 			"ALFA")),
 		orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]],
-		in_file_str1 = "STAR_Signal.UniqueMultiple.str1.out.bg",
-		rename_str1 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0],
-		in_file_str2 = "STAR_Signal.UniqueMultiple.str2.out.bg",
-		rename_str2 = lambda wildcards: rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1],
-		genome_index = lambda wildcards: os.path.abspath(os.path.join(config["alfa_indexes"], samples_table.loc[wildcards.sample, "organism"], 
-			str(samples_table.loc[wildcards.sample, "index_size"]), "ALFA", "sorted_genes")),
+		in_file_str1 = lambda wildcards, input: os.path.basename(input.str1),
+		rename_str1 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][0],
+		in_file_str2 = lambda wildcards, input: os.path.basename(input.str2),
+		rename_str2 = lambda wildcards: wildcards.sample + "_" + rename_files[directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]]][1],
+		genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")),
 		name = "{sample}"
 	singularity:
 		"docker://zavolab/alfa:1.1.1"
diff --git a/tests/test_integration_workflow/test.local.sh b/tests/test_integration_workflow/test.local.sh
index 6bcf8d6..f26856c 100755
--- a/tests/test_integration_workflow/test.local.sh
+++ b/tests/test_integration_workflow/test.local.sh
@@ -20,7 +20,8 @@ snakemake \
     --printshellcmds \
     --rerun-incomplete \
     --use-singularity \
-    --singularity-args="--bind ${PWD}"
+    --singularity-args="--bind ${PWD}" \
+    --use-conda
 find results/ -type f -name \*\.gz -exec gunzip '{}' \;
 find results/ -type f -name \*\.zip -exec sh -c 'unzip -o {} -d $(dirname {})' \;
 md5sum --check "expected_output.md5"
-- 
GitLab