diff --git a/Snakefile b/Snakefile index ba176d984b01cdec9eb7768d18cf711bc02bf872..283403578f98200ab045856e651c0d5b9d3be042 100644 --- a/Snakefile +++ b/Snakefile @@ -103,17 +103,21 @@ rule finish: ] ), alfa_reports = expand(os.path.join( - config["output_dir"], - "{seqmode}", - "{sample}", - "ALFA", - "ALFA_plots.Biotypes.pdf"), - zip, - sample= [i for i in list(samples_table.index.values)], - seqmode= [ + config["output_dir"], + "{seqmode}", + "{sample}", + "ALFA", + "ALFA_plots.Biotypes.pdf"), + zip, + sample= [i for i in list(samples_table.index.values)], + seqmode= [ samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)] ), + alfa_all_samples = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Categories.pdf"), rule create_index_star: @@ -325,108 +329,72 @@ rule calculate_TIN_scores: directionality = {"--fr": "fr-firststrand", "--rf": "fr-secondstrand"} rename_files = {"fr-firststrand": ("Signal.UniqueMultiple.out.plus.bg", - "Signal.UniqueMultiple.out.minus.bg"), - "fr-secondstrand": ("Signal.UniqueMultiple.out.minus.bg", - "Signal.UniqueMultiple.out.plus.bg")} -rename_expr = {"fr-firststrand": ("out.plus", "out.minus"), - "fr-secondstrand": ("out.minus", "out.plus")} - + "Signal.UniqueMultiple.out.minus.bg"), + "fr-secondstrand": ("Signal.UniqueMultiple.out.minus.bg", + "Signal.UniqueMultiple.out.plus.bg")} + + rule generate_alfa_index: - ''' Generate ALFA index files from sorted GTF file ''' - input: - gtf = lambda wildcards: samples_table["gtf"][samples_table["organism"]==wildcards.organism][0], - chr_len = os.path.join( - config["star_indexes"], - "{organism}", - "{index_size}", - "STAR_index", - "chrNameLength.txt"), - output: - index_stranded = os.path.join(config["alfa_indexes"], "{organism}", - "{index_size}", "ALFA", "sorted_genes.stranded.ALFA_index"), - index_unstranded = os.path.join(config["alfa_indexes"], "{organism}", - "{index_size}", "ALFA", "sorted_genes.unstranded.ALFA_index") - params: - genome_index = "sorted_genes", - out_dir = lambda wildcards, output: os.path.dirname(output.index_stranded) - threads: 4 - singularity: - "docker://zavolab/alfa:1.1.1" - log: - os.path.join(config["log_dir"], "{organism}_{index_size}_generate_alfa_index.log") - shell: - "alfa -a {input.gtf} \ - -g {params.genome_index} \ - --chr_len {input.chr_len} \ - -p {threads} \ - -o {params.out_dir} &> {log}" + ''' Generate ALFA index files from sorted GTF file ''' + input: + gtf = lambda wildcards: samples_table["gtf"][samples_table["organism"]==wildcards.organism][0], + chr_len = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrNameLength.txt"), + output: + index_stranded = os.path.join(config["alfa_indexes"], "{organism}", + "{index_size}", "ALFA", "sorted_genes.stranded.ALFA_index"), + index_unstranded = os.path.join(config["alfa_indexes"], "{organism}", + "{index_size}", "ALFA", "sorted_genes.unstranded.ALFA_index") + params: + genome_index = "sorted_genes", + out_dir = lambda wildcards, output: os.path.dirname(output.index_stranded) + threads: 4 + singularity: + "docker://zavolab/alfa:1.1.1" + log: + os.path.join(config["log_dir"], "{organism}_{index_size}_generate_alfa_index.log") + shell: + """ + alfa -a {input.gtf} \ + -g {params.genome_index} \ + --chr_len {input.chr_len} \ + -p {threads} \ + -o {params.out_dir} &> {log} + """ -# TODO: add rule that executes ALFA on all samples; should be on the same genome & same orientation rule alfa_bg_all_samples: - ''' Run ALFA from stranded bedgraph files ''' - input: - str1 = expand(os.path.join( - config["output_dir"], - "{seqmode}", - "{sample}", - "ALFA", - "{sample}_Signal.UniqueMultiple.str1.out.bg"), - zip, - sample= [i for i in list(samples_table.index.values)], - seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]), - str2 = expand(os.path.join( - config["output_dir"], - "{seqmode}", - "{sample}", - "ALFA", - "{sample}_Signal.UniqueMultiple.str2.out.bg"), - zip, - sample= [i for i in list(samples_table.index.values)], - seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]), - gtf = [os.path.join(config["alfa_indexes"], samples_table.loc[sample1, "organism"], - str(samples_table.loc[sample1, "index_size"]), "ALFA", "sorted_genes.stranded.ALFA_index") for sample1 in list(samples_table.index.values)] - output: - biotypes = os.path.join( - config["output_dir"], - "ALFA", - "ALFA_plots.Biotypes.pdf"), - categories = os.path.join( - config["output_dir"], - "ALFA", - "ALFA_plots.Categories.pdf") - params: - str1 = lambda wildcards, input: [os.path.abspath(i) for i in input.str1], - str2 = lambda wildcards, input: [os.path.abspath(i) for i in input.str2], - out_dir = lambda wildcards, output: directory(os.path.dirname(output.biotypes)), - in_file_str1 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str1], - rename_str1 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] for sample1 in list(samples_table.index.values)][0], - in_file_str2 = lambda wildcards, input: [os.path.basename(inp) for inp in input.str2], - rename_str2 = [rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] for sample1 in list(samples_table.index.values)][0], - genome_index = [os.path.abspath(os.path.join( - config["alfa_indexes"], - samples_table.loc[sample1, "organism"], - str(samples_table.loc[sample1, "index_size"]), - "ALFA", "sorted_genes")) for sample1 in list(samples_table.index.values)][0], - orientation = [directionality[samples_table.loc[sample1, "kallisto_directionality"]] for sample1 in list(samples_table.index.values)][0], - bg_and_sample_name = [sample1 + "_Signal.UniqueMultiple." + - rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][0] + ".bg " + - sample1 + "_Signal.UniqueMultiple." + rename_expr[directionality[samples_table.loc[sample1, "kallisto_directionality"]]][1] + ".bg " + sample1 - for sample1 in list(samples_table.index.values)] - # singularity: - # "docker://zavolab/alfa:1.1.1" - conda: os.path.abspath("../../envs/env_alfa.yaml") - log: os.path.abspath(os.path.join(config["log_dir"], "ALFA", "alfa_bg_all_samples.log")) - shell: - """ - mkdir -p {params.out_dir}; \ - cd {params.out_dir}; \ - cp {params.str1} . ; \ - cp {params.str2} . ; \ - rename 's/str1.out/{params.rename_str1}/' *str1.out.bg; \ - rename 's/str2.out/{params.rename_str2}/' *str2.out.bg; \ - (alfa -g {params.genome_index} \ - --bedgraph {params.bg_and_sample_name} \ - -s {params.orientation}) &> {log}; \ - rm *{params.rename_str1}.bg *{params.rename_str2}.bg - """ + ''' Run ALFA from stranded bedgraph files on all samples ''' + input: + tables = [os.path.join( + config["output_dir"], + samples_table.loc[sample1, "seqmode"], + str(sample1), + "ALFA", + sample1 + ".ALFA_feature_counts.tsv") + for sample1 in list(samples_table.index.values)] + output: + biotypes = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Biotypes.pdf"), + categories = os.path.join( + config["output_dir"], + "ALFA", + "ALFA_plots.Categories.pdf") + params: + out_dir = lambda wildcards, output: os.path.dirname(output.biotypes) + log: + os.path.abspath( + os.path.join(config["log_dir"], + "alfa_bg_all_samples.log")) + singularity: + "docker://zavolab/alfa:1.1.1" + shell: + """ + (alfa -c {input.tables} -o {params.out_dir}) &> {log} + """ diff --git a/tests/test_alfa/test.sh b/tests/test_alfa/test.sh index de05e7f27fe18625dfad3f396e7f661aafddd58b..85ed9caeee63afa4cf7001d68aff94bd42978bec 100755 --- a/tests/test_alfa/test.sh +++ b/tests/test_alfa/test.sh @@ -8,6 +8,7 @@ cleanup () { rm -rf logs/ rm -rfv results/alfa_indexes/ rm -rfv results/*/*/ALFA/ + rm -rfv results/ALFA/ cd $user_dir echo "Exit status: $rc" } @@ -32,8 +33,9 @@ snakemake \ --use-conda \ --singularity-args="--bind ${PWD}/../input_files" \ --verbose \ - results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/ALFA/ALFA_plots.Biotypes.pdf \ - results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/ALFA_plots.Biotypes.pdf + results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/ALFA/ALFA_plots.Categories.pdf \ + results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/ALFA/ALFA_plots.Categories.pdf \ + results/ALFA/ALFA_plots.Categories.pdf # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index f61ee56df9acfb9ce7fd783c3c8967ed3e562933..146ec035580dc5f5c1058be964ce8ba4858b059a 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -430,7 +430,13 @@ rule alfa_bg_paired_end: "paired_end", "{sample}", "ALFA", - "ALFA_plots.Categories.pdf") + "ALFA_plots.Categories.pdf"), + table = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "ALFA", + "{sample}.ALFA_feature_counts.tsv") params: out_dir = directory(os.path.join( config["output_dir"], diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index 60e539ea5f4fa9dfdc955200675848d30fd082d0..7a9f072611bc2cb491726cf880a5583587594df7 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -357,7 +357,13 @@ rule alfa_bg_single_end: "single_end", "{sample}", "ALFA", - "ALFA_plots.Categories.pdf") + "ALFA_plots.Categories.pdf"), + table = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "ALFA", + "{sample}.ALFA_feature_counts.tsv") params: out_dir = directory(os.path.join( config["output_dir"],