Skip to content
Snippets Groups Projects
Snakefile 35.3 KiB
Newer Older
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
import pandas as pd
Dominik Burri's avatar
Dominik Burri committed
import shutil
# Get sample table
samples_table = pd.read_csv(
    config['samples'],
    header=0,
    index_col=0,
    comment='#',
    engine='python',
    sep="\t",
)
# Global config
localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config
if cluster_config:
    os.makedirs(
        os.path.join(
            os.getcwd(),
            os.path.dirname(cluster_config['__default__']['out']),
        ),
# Include subworkflows
include: os.path.join("workflow", "rules", "paired_end.snakefile.smk")
include: os.path.join("workflow", "rules", "single_end.snakefile.smk")
        multiqc_report = os.path.join(
            config['output_dir'],
            "multiqc_summary"),
        bigWig = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "bigWig",
                "{unique_type}",
                "{sample}_{unique_type}_{strand}.bw"),
            sample=samples_table.index.values,
            strand=["plus", "minus"],
            unique_type=["Unique", "UniqueMultiple"]),

        salmon_merge_genes = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "genes_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),

        salmon_merge_transcripts = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "transcripts_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),


rule start:
    '''
       Get samples
    '''
    input:
        reads = lambda wildcards:
            samples_table.loc[wildcards.sample, wildcards.mate],

    output:
        reads = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "start",
            "{sample}.{mate}.fastq.gz")

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "start_{sample}.{mate}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "start_{sample}.{mate}.stdout.log")

    singularity:
        "docker://bash:5.0.16"

    shell:
        "(cp {input.reads} {output.reads}) \
        1> {log.stdout} 2> {log.stderr} "


rule fastqc:
    '''
        A quality control tool for high throughput sequence data
    '''
    input:
        reads = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "start",
            "{sample}.{mate}.fastq.gz")

    output:
        outdir = directory(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "fastqc",
                "{mate}"))

    threads: 2

    singularity:
        "docker://zavolab/fastqc:0.11.9-slim"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "fastqc_{mate}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "fastqc_{mate}.stdout.log")

    shell:
        "(mkdir -p {output.outdir}; \
        fastqc --outdir {output.outdir} {input.reads}) \
        1> {log.stdout} 2> {log.stderr}"
    input:
        genome = lambda wildcards:
            samples_table['genome']
            [samples_table['organism'] == wildcards.organism]
            [0],
        gtf = lambda wildcards:
            samples_table['gtf']
            [samples_table['organism'] == wildcards.organism]
            [0]

    output:
        chromosome_info = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index",
        chromosomes_names = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index",
    params:
        output_dir = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
        outFileNamePrefix = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
    singularity:
Alex Kanitz's avatar
Alex Kanitz committed
        "docker://zavolab/star:2.7.3a-slim"
    threads: 12
            config['log_dir'],
            "{organism}_{index_size}_create_index_star.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            "{organism}_{index_size}_create_index_star.stdout.log")

    shell:
        "(mkdir -p {params.output_dir}; \
        chmod -R 777 {params.output_dir}; \
        STAR \
        --runMode genomeGenerate \
        --sjdbOverhang {params.sjdbOverhang} \
        --genomeDir {params.output_dir} \
        --genomeFastaFiles {input.genome} \
        --runThreadN {threads} \
        --outFileNamePrefix {params.outFileNamePrefix} \
        --sjdbGTFfile {input.gtf}) \
        1> {log.stdout} 2> {log.stderr}"
    """
        Create transcriptome from genome and gene annotations
    """
                samples_table['organism'] == wildcards.organism][0],
                samples_table['organism'] == wildcards.organism][0]

            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa")

    log:
        stderr = os.path.join(
            config['log_dir'],
            "{organism}_extract_transcriptome.log"),
        stdout = os.path.join(
            config['log_dir'],
            "{organism}_extract_transcriptome.log")
        "docker://zavolab/gffread:0.11.7-slim"
    shell:
        "(gffread \
        -w {output.transcriptome} \
        -g {input.genome} {input.gtf}) \
        1> {log.stdout} 2> {log.stderr}"
rule concatenate_transcriptome_and_genome:
    """
        Concatenate genome and transcriptome
    """
    input:
        transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa"),

        genome = lambda wildcards:
            samples_table['genome']
            [samples_table['organism'] == wildcards.organism]
            [0]

    output:
        genome_transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "genome_transcriptome.fa")

    singularity:
        "docker://bash:5.0.16"

    log:
        stderr = os.path.join(
            config['log_dir'],
            "{organism}_concatenate_transcriptome_and_genome.stderr.log")

    shell:
        "(cat {input.transcriptome} {input.genome} \
        1> {output.genome_transcriptome}) \
        2> {log.stderr}"


rule create_index_salmon:
    """
        Create index for Salmon quantification
    """
    input:
        genome_transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "genome_transcriptome.fa"),
        chr_names = lambda wildcards:
            os.path.join(
                config['star_indexes'],
                samples_table["organism"][0],
                str(samples_table["index_size"][0]),
                "STAR_index",
                "chrName.txt")
    output:
        index = directory(
            os.path.join(
                config['salmon_indexes'],
                "{organism}",
                "{kmer}",
    singularity:
        "docker://zavolab/salmon:1.1.0-slim"
            config['log_dir'],
            "{organism}_{kmer}_create_index_salmon.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            "{organism}_{kmer}_create_index_salmon.stdout.log")

    threads: 8
    shell:
        "(salmon index \
        --transcripts {input.genome_transcriptome} \
        --decoys {input.chr_names} \
        --index {output.index} \
        --kmerLen {params.kmerLen} \
        --threads {threads}) \
        1> {log.stdout} 2> {log.stderr}"
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa")

    output:
        index = os.path.join(
            config['kallisto_indexes'],
            "{organism}",
    params:
        output_dir = os.path.join(
            config['kallisto_indexes'],
    singularity:
Alex Kanitz's avatar
Alex Kanitz committed
        "docker://zavolab/kallisto:0.46.1-slim"
            config['log_dir'],
            "{organism}_create_index_kallisto.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            "{organism}_create_index_kallisto.stdout.log")

    shell:
        "(mkdir -p {params.output_dir}; \
        chmod -R 777 {params.output_dir}; \
        kallisto index -i {output.index} {input.transcriptome}) \
        1> {log.stdout}  2> {log.stderr}"

rule extract_transcripts_as_bed12:
    input:
        gtf = lambda wildcards:
            samples_table['gtf'][0]
    output:
        bed12 = os.path.join(
            config['output_dir'],
    singularity:
        "docker://zavolab/zgtf:0.1"
    threads: 1
        stdout = os.path.join(
            config['log_dir'],
            "extract_transcripts_as_bed12.stdout.log"),
            config['log_dir'],
        "(gtf2bed12 \
        --gtf {input.gtf} \
        --transcript_type protein_coding \
        --bed12 {output.bed12}); \
        1> {log.stdout} 2> {log.stderr}"
rule index_genomic_alignment_samtools:
    '''
        Index genome bamfile using samtools
    '''
    input:
        bam = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
    output:
        bai = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai")

    singularity:
        "docker://zavolab/samtools:1.10-slim"

    threads: 1

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "index_genomic_alignment_samtools.{seqmode}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "index_genomic_alignment_samtools.{seqmode}.stdout.log")

    shell:
        "(samtools index {input.bam} {output.bai};) \
        1> {log.stdout} 2> {log.stderr}"


rule calculate_TIN_scores:
        Calculate transcript integrity (TIN) score
        bam = lambda wildcards:
            expand(
                os.path.join(
                    config['output_dir'],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
                sample=wildcards.sample,
                seqmode=samples_table.loc[wildcards.sample, 'seqmode']),
        bai = lambda wildcards:
            expand(
                os.path.join(
                    config['output_dir'],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
                sample=wildcards.sample,
                seqmode=samples_table.loc[wildcards.sample, 'seqmode']),
        transcripts_bed12 = os.path.join(
            config['output_dir'],
    output:
        TIN_score = os.path.join(
            config['output_dir'],
            "samples",
            "{sample}",
            "TIN",
            config['log_dir'],
            "samples",
            "{sample}",
            "calculate_TIN_scores.log")
    threads: 8
    singularity:
        "docker://zavolab/tin_score_calculation:0.2.0-slim"
        -r {input.transcripts_bed12} \
        -c 0 \
        --names {params.sample} \
        -n 100 > {output.TIN_score};) 2> {log.stderr}"
rule merge_TIN_scores:
    """
        Merge TIN scores tables
    """
    input:
        TIN_score = expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "TIN",
                "TIN_score.tsv"),
            sample=samples_table.index.values),

    output:
        TIN_scores_merged = os.path.join(
            config['output_dir'],
            "TIN_scores_merged.tsv")

    log:
        stderr = os.path.join(
            config['log_dir'],
            "merge_TIN_scores.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "merge_TIN_scores.stdout.log")

    params:
        TIN_score_merged_paths = ",".join(expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "TIN",
                "TIN_score.tsv"),
            zip,
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]))

    threads: 1

    singularity:
        "docker://zavolab/tin_score_calculation:0.2.0-slim"

    shell:
        "(tin_score_merge.py \
        --input-files {params.TIN_score_merged_paths} \
        --output-file {output.TIN_scores_merged}) \
        1> {log.stdout} 2> {log.stderr}"


rule plot_TIN_scores:
    """
        Generate TIN scores boxplots
    """
    input:
        TIN_scores_merged = os.path.join(
            config['output_dir'],
            "TIN_scores_merged.tsv"),

    output:
        TIN_boxplot_PNG = os.path.join(
            config['output_dir'],
            "TIN_scores_boxplot_mqc.png"),
        TIN_boxplot_PDF = os.path.join(
            config['output_dir'],
            "TIN_scores_boxplot_mqc.pdf")

    params:
        TIN_boxplot_prefix = os.path.join(
            config['output_dir'],
            "TIN_scores_boxplot_mqc")

    log:
        stderr = os.path.join(
            config['log_dir'],
            "plot_TIN_scores.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "plot_TIN_scores.stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/tin_score_calculation:0.2.0-slim"

    shell:
        "(tin_score_plot.py \
        --input-file {input.TIN_scores_merged} \
        --output-file-prefix {params.TIN_boxplot_prefix}) \
        1> {log.stdout} 2> {log.stderr}"


rule salmon_quantmerge_genes:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}.salmon.{seqmode}",
                "quant.sf"),
            sample=samples_table.index.values,
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)])
            config["output_dir"],
            "summary_salmon",
            "quantmerge",
            "genes_{salmon_merge_on}.tsv")

    params:
        salmon_in = expand(
                "samples",
                "{sample}.salmon.{seqmode}"),
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]),
        sample_name_list = expand(
            "{sample}",
            sample=list(samples_table.index.values)),
        salmon_merge_on = "{salmon_merge_on}"
        stderr = os.path.join(
            config["log_dir"],
            "salmon_quantmerge_genes_{salmon_merge_on}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "salmon_quantmerge_genes_{salmon_merge_on}.stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/salmon:1.1.0-slim"
        --quants {params.salmon_in} \
        --genes \
        --names {params.sample_name_list} \
        --column {params.salmon_merge_on} \
        --output {output.salmon_out};) \
        1> {log.stdout} 2> {log.stderr}"

rule salmon_quantmerge_transcripts:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}.salmon.{seqmode}",
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)])
            config["output_dir"],
            "summary_salmon",
            "quantmerge",
            "transcripts_{salmon_merge_on}.tsv")

    params:
        salmon_in = expand(
                "samples",
                "{sample}.salmon.{seqmode}"),
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]),

        sample_name_list = expand(
            "{sample}",
            sample=list(samples_table.index.values)),
        salmon_merge_on = "{salmon_merge_on}"
        stderr = os.path.join(
            config["log_dir"],
            "salmon_quantmerge_transcripts_{salmon_merge_on}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "salmon_quantmerge_transcripts_{salmon_merge_on}.stdout.log")

    threads: 1

        "docker://zavolab/salmon:1.1.0-slim"
        --quants {params.salmon_in} \
        --names {params.sample_name_list} \
        --column {params.salmon_merge_on} \
Dominik Burri's avatar
Dominik Burri committed
        1> {log.stdout} 2> {log.stderr}"


rule star_rpm:
    '''
        Create stranded bedgraph coverage with STARs RPM normalisation
    '''
    input:
        bam = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
                sample=wildcards.sample,
                seqmode=samples_table.loc[wildcards.sample, 'seqmode']),
        bai = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
                sample=wildcards.sample,
                seqmode=samples_table.loc[wildcards.sample, 'seqmode']),

    output:
        str1 = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.Unique.str1.out.bg"),
        str2 = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.UniqueMultiple.str1.out.bg"),
        str3 = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.Unique.str2.out.bg"),
        str4 = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.UniqueMultiple.str2.out.bg")

    params:
        out_dir = lambda wildcards, output:
            os.path.dirname(output.str1),
        prefix = lambda wildcards, output:
            os.path.join(
                os.path.dirname(output.str1),
                str(wildcards.sample) + "_"),
        stranded = "Stranded"

    singularity:
        "docker://zavolab/star:2.7.3a-slim"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "star_rpm.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "star_rpm.stdout.log")

    threads: 4

    shell:
        "(mkdir -p {params.out_dir}; \
        chmod -R 777 {params.out_dir}; \
        STAR \
        --runMode inputAlignmentsFromBAM \
        --runThreadN {threads} \
        --inputBAMfile {input.bam} \
        --outWigType bedGraph \
        --outWigStrand {params.stranded} \
        --outWigNorm RPM \
        --outFileNamePrefix {params.prefix}) \
        1> {log.stdout} 2> {log.stderr}"


rule rename_star_rpm_for_alfa:
    input:
        plus = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "STAR_coverage",
                    "{sample}_Signal.{unique}.{plus}.out.bg"),
                sample=wildcards.sample,
                unique=wildcards.unique,
                plus=samples_table.loc[wildcards.sample, 'alfa_plus']),

        minus = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "STAR_coverage",
                    "{sample}_Signal.{unique}.{minus}.out.bg"),
                sample=wildcards.sample,
                unique=wildcards.unique,
                minus=samples_table.loc[wildcards.sample, 'alfa_minus'])

    output:
        plus = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.plus.bg"),
        minus = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.minus.bg")

    params:
        orientation = lambda wildcards:
            samples_table.loc[wildcards.sample, "kallisto_directionality"]

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "rename_star_rpm_for_alfa__{unique}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            "rename_star_rpm_for_alfa__{unique}.stdout.log")

    singularity:
        "docker://bash:5.0.16"

    shell:
        "(cp {input.plus} {output.plus}; \
         cp {input.minus} {output.minus};) \
         1>{log.stdout} 2>{log.stderr}"
Dominik Burri's avatar
Dominik Burri committed


rule generate_alfa_index:
    ''' Generate ALFA index files from sorted GTF file '''
    input:
        gtf = lambda wildcards:
            samples_table["gtf"]
            [samples_table["organism"] == wildcards.organism][0],
Dominik Burri's avatar
Dominik Burri committed
        chr_len = os.path.join(
            config["star_indexes"],
            "{organism}",
            "{index_size}",
            "STAR_index",
            "chrNameLength.txt"),

    output:
        index_stranded = os.path.join(
            config["alfa_indexes"],
            "{organism}",
            "{index_size}",
            "ALFA",
Dominik Burri's avatar
Dominik Burri committed
            "sorted_genes.stranded.ALFA_index"),
        index_unstranded = os.path.join(
            config["alfa_indexes"],
            "{organism}",
            "{index_size}",
            "ALFA",
Dominik Burri's avatar
Dominik Burri committed
            "sorted_genes.unstranded.ALFA_index")

    params:
        genome_index = "sorted_genes",
        out_dir = lambda wildcards, output:
            os.path.dirname(output.index_stranded)
Dominik Burri's avatar
Dominik Burri committed

    threads: 4

    singularity:
        "docker://zavolab/alfa:1.1.1-slim"
Dominik Burri's avatar
Dominik Burri committed

    log:
        os.path.join(
            config["log_dir"],
            "{organism}_{index_size}_generate_alfa_index.log")
Dominik Burri's avatar
Dominik Burri committed

    shell:
        "(alfa -a {input.gtf} \
        -g {params.genome_index} \
        --chr_len {input.chr_len} \
        -p {threads} \
        -o {params.out_dir}) &> {log}"
Dominik Burri's avatar
Dominik Burri committed


rule alfa_qc:
    '''
        Run ALFA from stranded bedgraph files
    '''
Dominik Burri's avatar
Dominik Burri committed
    input:
        plus = os.path.join(
            config["output_dir"],
            "samples",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.plus.bg"),
Dominik Burri's avatar
Dominik Burri committed
        minus = os.path.join(
            config["output_dir"],
            "samples",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.minus.bg"),
        gtf = lambda wildcards:
            os.path.join(
                config["alfa_indexes"],
                samples_table.loc[wildcards.sample, "organism"],
                str(samples_table.loc[wildcards.sample, "index_size"]),
                "ALFA",
                "sorted_genes.stranded.ALFA_index")
Dominik Burri's avatar
Dominik Burri committed

    output:
        biotypes = os.path.join(
            config["output_dir"],
            "samples",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}",
            "ALFA",
            "{unique}",
Dominik Burri's avatar
Dominik Burri committed
            "ALFA_plots.Biotypes.pdf"),
        categories = os.path.join(
            config["output_dir"],
            "samples",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}",
            "ALFA",
            "{unique}",
Dominik Burri's avatar
Dominik Burri committed
            "ALFA_plots.Categories.pdf"),
        table = os.path.join(
            config["output_dir"],
            "samples",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}",
            "ALFA",
            "{unique}",
Dominik Burri's avatar
Dominik Burri committed
            "{sample}.ALFA_feature_counts.tsv")

    params:
        out_dir = lambda wildcards, output:
            os.path.dirname(output.biotypes),