Skip to content
Snippets Groups Projects
Commit 37fb0fd0 authored by Dominik Burri's avatar Dominik Burri
Browse files

added rule for

- renaming bedgraph
- creating ALFA qc plots

removed conda dependence, moved import statement.

included ALFA in finish rule, corrected annotation.gtf and config.yaml, created new .svg
parent 9f2e0a67
No related branches found
No related tags found
1 merge request!41Alfa integration
......@@ -4,6 +4,7 @@ import os
import sys
import pandas as pd
import shutil
# Get sample table
samples_table = pd.read_csv(
......@@ -16,7 +17,7 @@ samples_table = pd.read_csv(
)
# Global config
localrules: finish
localrules: finish, rename_star_rpm_for_alfa
# Create log directories
os.makedirs(
......@@ -99,7 +100,23 @@ rule finish:
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)])
for i in list(samples_table.index.values)]),
alfa_reports = expand(os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Biotypes.pdf"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [
samples_table.loc[i,"seqmode"]
for i in list(samples_table.index.values)]),
alfa_all_samples = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Categories.pdf")
......@@ -441,6 +458,47 @@ rule star_rpm:
"""
rule rename_star_rpm_for_alfa:
input:
str1 = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg"),
str2 = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str2.out.bg")
output:
plus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}_Signal.UniqueMultiple.out.plus.bg"),
minus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}_Signal.UniqueMultiple.out.minus.bg")
params:
orientation = lambda wildcards: samples_table.loc[wildcards.sample, "kallisto_directionality"]
run:
if params['orientation'] == "--fr":
shutil.copy2(input['str1'], output['plus'])
shutil.copy2(input['str2'], output['minus'])
elif params['orientation'] == "--rf":
shutil.copy2(input['str1'], output['minus'])
shutil.copy2(input['str2'], output['plus'])
rule calculate_TIN_scores:
"""
Caluclate transcript integrity (TIN) score
......@@ -612,4 +670,166 @@ rule salmon_quantmerge_transcripts:
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
--output {output.salmon_out}) \
1> {log.stdout} 2> {log.stderr}"
\ No newline at end of file
1> {log.stdout} 2> {log.stderr}"
#################################################################################
### ALFA: Annotation Landscape For Aligned reads
#################################################################################
directionality = {"--fr": "fr-firststrand", "--rf": "fr-secondstrand"}
rename_files = {"fr-firststrand": ("Signal.UniqueMultiple.out.plus.bg",
"Signal.UniqueMultiple.out.minus.bg"),
"fr-secondstrand": ("Signal.UniqueMultiple.out.minus.bg",
"Signal.UniqueMultiple.out.plus.bg")}
rule generate_alfa_index:
''' Generate ALFA index files from sorted GTF file '''
input:
gtf = lambda wildcards: samples_table["gtf"][samples_table["organism"]==wildcards.organism][0],
chr_len = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index",
"chrNameLength.txt"),
output:
index_stranded = os.path.join(config["alfa_indexes"],
"{organism}",
"{index_size}",
"ALFA",
"sorted_genes.stranded.ALFA_index"),
index_unstranded = os.path.join(config["alfa_indexes"],
"{organism}",
"{index_size}",
"ALFA",
"sorted_genes.unstranded.ALFA_index")
params:
genome_index = "sorted_genes",
out_dir = lambda wildcards, output: os.path.dirname(output.index_stranded)
threads: 4
singularity:
"docker://zavolab/alfa:1.1.1"
log:
os.path.join(config["log_dir"], "{organism}_{index_size}_generate_alfa_index.log")
shell:
"""
alfa -a {input.gtf} \
-g {params.genome_index} \
--chr_len {input.chr_len} \
-p {threads} \
-o {params.out_dir} &> {log}
"""
rule alfa_qc:
''' Run ALFA from stranded bedgraph files '''
input:
plus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}_Signal.UniqueMultiple.out.plus.bg"),
minus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}_Signal.UniqueMultiple.out.minus.bg"),
gtf = lambda wildcards: os.path.join(config["alfa_indexes"],
samples_table.loc[wildcards.sample, "organism"],
str(samples_table.loc[wildcards.sample, "index_size"]),
"ALFA",
"sorted_genes.stranded.ALFA_index")
output:
biotypes = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Biotypes.pdf"),
categories = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Categories.pdf"),
table = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}.ALFA_feature_counts.tsv")
params:
out_dir = lambda wildcards, output: os.path.dirname(output.biotypes),
alfa_orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]],
in_file_plus = lambda wildcards, input: os.path.basename(input.plus),
in_file_minus = lambda wildcards, input: os.path.basename(input.minus),
genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")),
name = "{sample}"
singularity:
"docker://zavolab/alfa:1.1.1"
log:
os.path.abspath(os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
"alfa_qc.log"))
shell:
"""
cd {params.out_dir}; \
(alfa -g {params.genome_index} \
--bedgraph {params.in_file_plus} {params.in_file_minus} {params.name} \
-s {params.alfa_orientation}) &> {log}
"""
rule alfa_qc_all_samples:
''' Run ALFA from stranded bedgraph files on all samples '''
input:
tables = [os.path.join(
config["output_dir"],
samples_table.loc[sample1, "seqmode"],
str(sample1),
"ALFA",
sample1 + ".ALFA_feature_counts.tsv")
for sample1 in list(samples_table.index.values)]
output:
biotypes = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Biotypes.pdf"),
categories = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Categories.pdf")
params:
out_dir = lambda wildcards, output: os.path.dirname(output.biotypes)
log:
os.path.abspath(
os.path.join(config["log_dir"],
"alfa_qc_all_samples.log"))
singularity:
"docker://zavolab/alfa:1.1.1"
shell:
"""
(alfa -c {input.tables} -o {params.out_dir}) &> {log}
"""
This diff is collapsed.
This diff is collapsed.
......@@ -5,4 +5,5 @@
kallisto_indexes: "results/kallisto_indexes/"
salmon_indexes: "results/salmon_indexes/"
star_indexes: "results/star_indexes/"
alfa_indexes: "results/alfa_indexes/"
...
......@@ -15,8 +15,8 @@
1-10000-20000 havana exon 2976 3053 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "4"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001799933"; exon_version "2"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana exon 3222 3375 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001746346"; exon_version "2"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana exon 3454 3671 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "6"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001863096"; exon_version "1"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana gene 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene";
1-10000-20000 havana transcript 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana gene 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene";
1-10000-20000 havana transcript 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana exon 8269 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "3"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003477500"; exon_version "1"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana exon 7916 8062 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "4"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003565697"; exon_version "1"; tag "basic"; transcript_support_level "NA";
1-10000-20000 havana exon 7607 7743 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003475637"; exon_version "1"; tag "basic"; transcript_support_level "NA";
......
#!/bin/bash
# Tear down test environment
cleanup () {
rc=$?
rm -rf .java/
rm -rf .snakemake/
rm -rf logs/
rm -rfv results/alfa_indexes/
rm -rfv results/*/*/ALFA/
rm -rfv results/*/*/STAR_coverage/
rm -rfv results/ALFA/
cd $user_dir
echo "Exit status: $rc"
}
trap cleanup EXIT
# Set up test environment
set -eo pipefail # ensures that script exits at first command that exits with non-zero status
set -u # ensures that script exits when unset variables are used
set -x # facilitates debugging by printing out executed commands
user_dir=$PWD
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd $script_dir
# Run tests
snakemake \
--snakefile="../../Snakefile" \
--configfile="input_files/config.yaml" \
--cores=4 \
--printshellcmds \
--rerun-incomplete \
--use-singularity \
--singularity-args="--bind ${PWD}" \
--verbose \
results/ALFA/ALFA_plots.Categories.pdf
# Check md5 sum of some output files
find results/ -type f -name \*\.gz -exec gunzip '{}' \;
find results/ -type f -name \*\.zip -exec sh -c 'unzip -o {} -d $(dirname {})' \;
md5sum --check "expected_output.md5"
......@@ -413,4 +413,5 @@ rule pe_genome_quantification_kallisto:
--pseudobam \
{params.directionality} \
{input.reads1} {input.reads2} > {output.pseudoalignment}) \
2> {log.stderr}"
\ No newline at end of file
2> {log.stderr}"
......@@ -369,4 +369,5 @@ rule genome_quantification_kallisto:
--pseudobam \
{params.directionality} \
{input.reads} > {output.pseudoalignment};) \
2> {log.stderr}"
\ No newline at end of file
2> {log.stderr}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment