Skip to content
Snippets Groups Projects
Snakefile 48.90 KiB
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
import os
import pandas as pd
import shutil
import yaml
from shlex import quote
from typing import Tuple

## Preparations
# Get sample table
samples_table = pd.read_csv(
    config['samples'],
    header=0,
    index_col=0,
    comment='#',
    engine='python',
    sep="\t",
)

# Parse YAML rule config file
if 'rule_config' in config and config['rule_config']:
    try:
        with open(config['rule_config']) as _file:
            rule_config = yaml.safe_load(_file)
        logger.info(f"Loaded rule_config from {config['rule_config']}.")
    except FileNotFoundError:
        logger.error(f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! ")
        raise
else:
    rule_config = {}
    logger.warning(f"No rule config specified: using default values for all tools.")

# Create dir for cluster logs, if applicable
if cluster_config:
    os.makedirs(
        os.path.join(
            os.getcwd(),
            os.path.dirname(cluster_config['__default__']['out']),
        ),
        exist_ok=True)


## Function definitions

def get_sample(column_id, search_id=None, search_value=None):
    """ Get relevant per sample information from samples table"""
    if search_id:
        if search_id == 'index':
            return str(samples_table[column_id][samples_table.index == search_value][0])
        else:
            return str(samples_table[column_id][samples_table[search_id] == search_value][0])
    else:
        return str(samples_table[column_id][0])


def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()):
    """Get rule specific parameters from rule_config file"""
    
    # If rule config file not present, emtpy string will be returned
    if not rule_config:
        logger.info(f"No rule config specified: using default values for all tools.")
        return ''
    # Same if current rule not specified in rule config
    if current_rule not in rule_config or not rule_config[current_rule]:
        logger.info(f"No additional parameters for rule {current_rule} specified: using default settings.")
        return ''

    # Subset only section for current rule
    rule_config = rule_config[current_rule]
    
    # Build list of parameters and values
    params_vals = []
    for param, val in rule_config.items():
        # Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table
        if param in immutable:
            raise ValueError(
                f"The following parameter in rule {current_rule} is critical for the pipeline to "
                f"function as expected and cannot be modified: {param}"
            )
        # Accept only strings; this prevents unintended results potentially
        # arising from users entering reserved YAML keywords or nested
        # structures (lists, dictionaries)
        if isinstance(val, str):
            params_vals.append(str(param))
            # Do not include a value for flags (signified by empty strings)
            if val:
                params_vals.append(val)
        else:
            raise ValueError(
                "Only string values allowed for tool parameters: Found type "
                f"'{type(val).__name__}' for value of parameter '{param}'"
            )
    # Return quoted string
    add_params = ' '.join(quote(item) for item in params_vals)
    logger.info(f"User specified additional parameters for rule {current_rule}:\n {add_params}")
    return add_params


# Global config
localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config

# Include subworkflows
include: os.path.join("workflow", "rules", "paired_end.snakefile.smk")
include: os.path.join("workflow", "rules", "single_end.snakefile.smk")


rule finish:
    """
        Rule for collecting outputs
    """
    input:
        multiqc_report = os.path.join(
            config['output_dir'],
            "multiqc_summary"),
        bigWig = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "bigWig",
                "{unique_type}",
                "{sample}_{unique_type}_{strand}.bw"),
            sample=pd.unique(samples_table.index.values),
            strand=["plus", "minus"],
            unique_type=["Unique", "UniqueMultiple"]),
        salmon_merge_genes = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "genes_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),
        salmon_merge_transcripts = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "transcripts_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),
        kallisto_merge_transcripts = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "transcripts_tpm.tsv"),
        kallisto_merge_genes = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "genes_tpm.tsv")


current_rule = 'start'
rule start:
    '''
       Get samples
    '''
    input:
        reads = lambda wildcards:
            expand(
                pd.Series(
                    samples_table.loc[wildcards.sample, wildcards.mate]
                ).values)

    output:
        reads = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "start",
            "{sample}.{mate}.fastq.gz")

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{sample}.{mate}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{sample}.{mate}.stdout.log")

    singularity:
        "docker://ubuntu:focal-20210416"

    shell:
        "(cat {input.reads} > {output.reads}) \
        1> {log.stdout} 2> {log.stderr} "


current_rule = 'fastqc'
rule fastqc:
    '''
        A quality control tool for high throughput sequence data
    '''
    input:
        reads = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "start",
            "{sample}.{mate}.fastq.gz")

    output:
        outdir = directory(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "fastqc",
                "{mate}"))
    
    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--outdir',
                )
            )

    threads: 2

    singularity:
        "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{mate}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{mate}.stdout.log")

    shell:
        "(mkdir -p {output.outdir}; \
        fastqc --outdir {output.outdir} \
        --threads {threads} \
        {params.additional_params} \
        {input.reads}) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'create_index_star'
rule create_index_star:
    """
        Create index for STAR alignments
    """
    input:
        genome = lambda wildcards:
            os.path.abspath(get_sample(
                'genome',
                search_id='organism',
                search_value=wildcards.organism)),

        gtf = lambda wildcards:
            os.path.abspath(get_sample(
                'gtf',
                search_id='organism',
                search_value=wildcards.organism))

    output:
        chromosome_info = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index",
            "chrNameLength.txt"),
        chromosomes_names = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index",
            "chrName.txt")

    params:
        output_dir = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index"),
        outFileNamePrefix = os.path.join(
            config['star_indexes'],
            "{organism}",
            "{index_size}",
            "STAR_index/STAR_"),
        sjdbOverhang = "{index_size}",
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--runMode',
                '--sjdbOverhang',
                '--genomeDir',
                '--genomeFastaFiles',
                '--outFileNamePrefix',
                '--sjdbGTFfile',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"

    threads: 12

    log:
        stderr = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}_{index_size}.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}_{index_size}.stdout.log")

    shell:
        "(mkdir -p {params.output_dir}; \
        chmod -R 777 {params.output_dir}; \
        STAR \
        --runMode genomeGenerate \
        --sjdbOverhang {params.sjdbOverhang} \
        --genomeDir {params.output_dir} \
        --genomeFastaFiles {input.genome} \
        --runThreadN {threads} \
        --outFileNamePrefix {params.outFileNamePrefix} \
        --sjdbGTFfile {input.gtf}) \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'extract_transcriptome'
rule extract_transcriptome:
    """
        Create transcriptome from genome and gene annotations
    """
    input:
        genome = lambda wildcards:
            get_sample(
                'genome',
                search_id='organism',
                search_value=wildcards.organism),
        gtf = lambda wildcards:
            get_sample(
                'gtf',
                search_id='organism',
                search_value=wildcards.organism)
    output:
        transcriptome = temp(os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa"))

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-w',
                '-g',
                )
            )

    log:
        stderr = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}.log"),
        stdout = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}.log")

    singularity:
        "docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1"

    shell:
        "(gffread \
        -w {output.transcriptome} \
        -g {input.genome} \
        {params.additional_params} \
        {input.gtf}) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'concatenate_transcriptome_and_genome' 
rule concatenate_transcriptome_and_genome:
    """
        Concatenate genome and transcriptome
    """
    input:
        transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa"),

        genome = lambda wildcards:
            get_sample(
                'genome',
                search_id='organism',
                search_value=wildcards.organism)

    output:
        genome_transcriptome = temp(os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "genome_transcriptome.fa"))

    singularity:
        "docker://ubuntu:focal-20210416"

    log:
        stderr = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}.stderr.log")

    shell:
        "(cat {input.transcriptome} {input.genome} \
        1> {output.genome_transcriptome}) \
        2> {log.stderr}"


current_rule = 'create_index_salmon'
rule create_index_salmon:
    """
        Create index for Salmon quantification
    """
    input:
        genome_transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "genome_transcriptome.fa"),
        chr_names = lambda wildcards:
            os.path.join(
                config['star_indexes'],
                get_sample('organism'),
                get_sample("index_size"),
                "STAR_index",
                "chrName.txt")

    output:
        index = directory(
            os.path.join(
                config['salmon_indexes'],
                "{organism}",
                "{kmer}",
                "salmon.idx"))

    params:
        kmerLen = "{kmer}",
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--transcripts',
                '--decoys',
                '--index',
                '--kmerLen',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"

    log:
        stderr = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}_{kmer}.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}_{kmer}.stdout.log")

    threads: 8

    shell:
        "(salmon index \
        --transcripts {input.genome_transcriptome} \
        --decoys {input.chr_names} \
        --index {output.index} \
        --kmerLen {params.kmerLen} \
        --threads {threads}) \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'create_index_kallisto'
rule create_index_kallisto:
    """
        Create index for Kallisto quantification
    """
    input:
        transcriptome = os.path.join(
            config['output_dir'],
            "transcriptome",
            "{organism}",
            "transcriptome.fa")

    output:
        index = os.path.join(
            config['kallisto_indexes'],
            "{organism}",
            "kallisto.idx")

    params:
        output_dir = os.path.join(
            config['kallisto_indexes'],
            "{organism}"),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-i',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2"

    log:
        stderr = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}.stderr.log"),
        stdout = os.path.join(
            config['log_dir'],
            current_rule + "_{organism}.stdout.log")

    shell:
        "(mkdir -p {params.output_dir}; \
        chmod -R 777 {params.output_dir}; \
        kallisto index \
        {params.additional_params} \
        -i {output.index} \
        {input.transcriptome}) \
        1> {log.stdout}  2> {log.stderr}"


current_rule = 'extract_transcripts_as_bed12'
rule extract_transcripts_as_bed12:
    """
        Convert transcripts to BED12 format
    """
    input:
        gtf = lambda wildcards:
            get_sample('gtf')

    output:
        bed12 = temp(os.path.join(
            config['output_dir'],
            "full_transcripts_protein_coding.bed"))

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--gtf',
                '--bed12',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/zgtf:0.1.1--pyh5e36f6f_0"

    threads: 1

    log:
        stdout = os.path.join(
            config['log_dir'],
            current_rule + ".stdout.log"),
        stderr = os.path.join(
            config['log_dir'],
            current_rule + ".stderr.log")

    shell:
        "(gtf2bed12 \
        --gtf {input.gtf} \
        --bed12 {output.bed12}); \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'index_genomic_alignment_samtools'
rule index_genomic_alignment_samtools:
    '''
        Index genome bamfile using samtools
    '''
    input:
        bam = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "map_genome",
            "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
    output:
        bai = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "map_genome",
            "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai")

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=()
            )

    singularity:
        "docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8"

    threads: 1

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + ".{seqmode}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + ".{seqmode}.stdout.log")

    shell:
        "(samtools index \
        {params.additional_params} \
        {input.bam} {output.bai};) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'calculate_TIN_scores'
rule calculate_TIN_scores:
    """
        Calculate transcript integrity (TIN) score
    """
    input:
        bam = lambda wildcards:
            expand(
                os.path.join(
                    config['output_dir'],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
                sample=wildcards.sample,
                seqmode=get_sample(
                    'seqmode',
                    search_id='index',
                    search_value=wildcards.sample)),
        bai = lambda wildcards:
            expand(
                os.path.join(
                    config['output_dir'],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
                sample=wildcards.sample,
                seqmode=get_sample(
                    'seqmode',
                    search_id='index',
                    search_value=wildcards.sample)),
        transcripts_bed12 = os.path.join(
            config['output_dir'],
            "full_transcripts_protein_coding.bed")

    output:
        TIN_score = temp(os.path.join(
            config['output_dir'],
            "samples",
            "{sample}",
            "TIN",
            "TIN_score.tsv"))

    params:
        sample = "{sample}",
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-i',
                '-r',
                '--names',
                )
            )

    log:
        stderr = os.path.join(
            config['log_dir'],
            "samples",
            "{sample}",
            current_rule + ".log")

    threads: 8

    singularity:
        "docker://zavolab/tin_score_calculation:0.2.0-slim"

    shell:
        "(tin_score_calculation.py \
        -i {input.bam} \
        -r {input.transcripts_bed12} \
        --names {params.sample} \
        {params.additional_params} \
        > {output.TIN_score};) 2> {log.stderr}"


current_rule = 'salmon_quantmerge_genes'
rule salmon_quantmerge_genes:
    '''
        Merge gene quantifications into a single file
    '''
    input:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "{sample}.salmon.{seqmode}",
                "quant.sf"),
            zip,
            sample=pd.unique(samples_table.index.values),
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)])

    output:
        salmon_out = os.path.join(
            config["output_dir"],
            "summary_salmon",
            "quantmerge",
            "genes_{salmon_merge_on}.tsv")

    params:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "{sample}.salmon.{seqmode}"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)]),
        sample_name_list = expand(
            "{sample}",
            sample=pd.unique(samples_table.index.values)),
        salmon_merge_on = "{salmon_merge_on}",
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--quants',
                '--genes',
                '--transcripts',
                '--names',
                '--column',
                '--output',
                )
            )
    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + "_{salmon_merge_on}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + "_{salmon_merge_on}.stdout.log")

    threads: 1

    singularity:
        "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"

    shell:
        "(salmon quantmerge \
        --quants {params.salmon_in} \
        --genes \
        --names {params.sample_name_list} \
        --column {params.salmon_merge_on} \
        --output {output.salmon_out};) \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'salmon_quantmerge_transcripts'
rule salmon_quantmerge_transcripts:
    '''
        Merge transcript quantifications into a single file
    '''
    input:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "{sample}.salmon.{seqmode}",
                "quant.sf"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)])

    output:
        salmon_out = os.path.join(
            config["output_dir"],
            "summary_salmon",
            "quantmerge",
            "transcripts_{salmon_merge_on}.tsv")

    params:
        salmon_in = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "{sample}.salmon.{seqmode}"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)]),
        sample_name_list = expand(
            "{sample}",
            sample=pd.unique(samples_table.index.values)),
        salmon_merge_on = "{salmon_merge_on}",
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--quants',
                '--genes',
                '--transcripts',
                '--names',
                '--column',
                '--output',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + "_{salmon_merge_on}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + "_{salmon_merge_on}.stdout.log")

    threads: 1

    singularity:
        "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"

    shell:
        "(salmon quantmerge \
        --quants {params.salmon_in} \
        --names {params.sample_name_list} \
        --column {params.salmon_merge_on} \
        --output {output.salmon_out}) \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule= 'kallisto_merge_genes'
rule kallisto_merge_genes:
    '''
        Merge gene quantifications into single file
    '''
    input:
        pseudoalignment = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "quant_kallisto",
                "{sample}.{seqmode}.kallisto.pseudo.sam"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)]),
        gtf = get_sample('gtf')

    output:
        gn_tpm = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "genes_tpm.tsv"),
        gn_counts = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "genes_counts.tsv")

    params:
        dir_out = os.path.join(
            config["output_dir"],
            "summary_kallisto"),
        tables = ','.join(expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "quant_kallisto",
                "abundance.h5"),
            sample=[i for i in pd.unique(samples_table.index.values)])),
        sample_name_list = ','.join(expand(
            "{sample}",
            sample=pd.unique(samples_table.index.values))),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--input',
                '--names',
                '--txOut',
                '--anno',
                '--output',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + ".stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + ".stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/merge_kallisto:0.6"

    shell:
        "(merge_kallisto.R \
        --input {params.tables} \
        --names {params.sample_name_list} \
        --txOut FALSE \
        --anno {input.gtf} \
        --output {params.dir_out} \
        {params.additional_params} ) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'kallisto_merge_transcripts'
rule kallisto_merge_transcripts:
    '''
        Merge transcript quantifications into a single files
    '''
    input:
        pseudoalignment = expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "quant_kallisto",
                "{sample}.{seqmode}.kallisto.pseudo.sam"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample(
                'seqmode',
                search_id='index',
                search_value=i)
                for i in pd.unique(samples_table.index.values)]),
    output:
        tx_tpm = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "transcripts_tpm.tsv"),
        tx_counts = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "transcripts_counts.tsv")

    params:
        dir_out = os.path.join(
            config["output_dir"],
            "summary_kallisto"),
        tables = ','.join(expand(
            os.path.join(
                config["output_dir"],
                "samples",
                "{sample}",
                "quant_kallisto",
                "abundance.h5"),
            sample=[i for i in pd.unique(samples_table.index.values)])),
        sample_name_list = ','.join(expand(
            "{sample}",
            sample=pd.unique(samples_table.index.values))),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--input',
                '--names',
                '--txOut',
                '--output',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + ".stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + ".stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/merge_kallisto:0.6"

    shell:
        "(merge_kallisto.R \
        --input {params.tables} \
        --names {params.sample_name_list} \
        --output {params.dir_out} \
        {params.additional_params}) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'pca_salmon'
rule pca_salmon:
    input:
        tpm = os.path.join(
            config["output_dir"],
            "summary_salmon",
            "quantmerge",
            "{molecule}_tpm.tsv"),

    output:
        out = directory(os.path.join(
            config["output_dir"],
            "zpca",
            "pca_salmon_{molecule}"))

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--tpm',
                '--out',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + "_{molecule}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + "_{molecule}.stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/zpca:0.8.3-1"

    shell:
        "(zpca-tpm  \
        --tpm {input.tpm} \
        --out {output.out} \
        {params.additional_params}) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'pca_kallisto'
rule pca_kallisto:
    input:
        tpm = os.path.join(
            config["output_dir"],
            "summary_kallisto",
            "{molecule}_tpm.tsv")


    output:
        out = directory(os.path.join(
            config["output_dir"],
            "zpca",
            "pca_kallisto_{molecule}"))

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--tpm',
                '--out',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + "_{molecule}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + "_{molecule}.stdout.log")

    threads: 1

    singularity:
        "docker://zavolab/zpca:0.8.3-1"

    shell:
        "(zpca-tpm  \
        --tpm {input.tpm} \
        --out {output.out} \
        {params.additional_params}) \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'star_rpm'
rule star_rpm:
    '''
        Create stranded bedgraph coverage with STARs RPM normalisation
    '''
    input:
        bam = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
                sample=wildcards.sample,
                seqmode=get_sample(
                    'seqmode',
                    search_id='index',
                    search_value=wildcards.sample)),
        bai = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "map_genome",
                    "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
                sample=wildcards.sample,
                seqmode=get_sample(
                    'seqmode',
                    search_id='index',
                    search_value=wildcards.sample))

    output:
        str1 = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.Unique.str1.out.bg")),
        str2 = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.UniqueMultiple.str1.out.bg")),
        str3 = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.Unique.str2.out.bg")),
        str4 = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "STAR_coverage",
            "{sample}_Signal.UniqueMultiple.str2.out.bg"))

    shadow: "full"

    params:
        out_dir = lambda wildcards, output:
            os.path.dirname(output.str1),
        prefix = lambda wildcards, output:
            os.path.join(
                os.path.dirname(output.str1),
                str(wildcards.sample) + "_"),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--runMode',
                '--inputBAMfile',
                '--outWigType',
                '--outFileNamePrefix',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + ".stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + ".stdout.log")

    threads: 4

    shell:
        "(mkdir -p {params.out_dir}; \
        chmod -R 777 {params.out_dir}; \
        STAR \
        --runMode inputAlignmentsFromBAM \
        --runThreadN {threads} \
        --inputBAMfile {input.bam} \
        --outWigType bedGraph \
        --outFileNamePrefix {params.prefix}) \
        {params.additional_params} \
        1> {log.stdout} 2> {log.stderr}"


current_rule = 'rename_star_rpm_for_alfa'
rule rename_star_rpm_for_alfa:
    input:
        plus = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "STAR_coverage",
                    "{sample}_Signal.{unique}.{plus}.out.bg"),
                sample=wildcards.sample,
                unique=wildcards.unique,
                plus=get_sample(
                    'alfa_plus',
                    search_id='index',
                    search_value=wildcards.sample)),
        minus = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "STAR_coverage",
                    "{sample}_Signal.{unique}.{minus}.out.bg"),
                sample=wildcards.sample,
                unique=wildcards.unique,
                minus=get_sample(
                    'alfa_minus',
                    search_id='index',
                    search_value=wildcards.sample))

    output:
        plus = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.plus.bg")),
        minus = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.minus.bg"))

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{unique}.stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{unique}.stdout.log")

    singularity:
        "docker://ubuntu:focal-20210416"

    shell:
        "(cp {input.plus} {output.plus}; \
         cp {input.minus} {output.minus};) \
         1>{log.stdout} 2>{log.stderr}"


current_rule = 'generate_alfa_index'
rule generate_alfa_index:
    ''' Generate ALFA index files from sorted GTF file '''
    input:
        gtf = lambda wildcards:
            os.path.abspath(get_sample(
                'gtf',
                search_id='organism',
                search_value=wildcards.organism)),
        chr_len = os.path.join(
            config["star_indexes"],
            "{organism}",
            "{index_size}",
            "STAR_index",
            "chrNameLength.txt"),

    output:
        index_stranded = os.path.join(
            config["alfa_indexes"],
            "{organism}",
            "{index_size}",
            "ALFA",
            "sorted_genes.stranded.ALFA_index"),
        index_unstranded = os.path.join(
            config["alfa_indexes"],
            "{organism}",
            "{index_size}",
            "ALFA",
            "sorted_genes.unstranded.ALFA_index")

    params:
        genome_index = "sorted_genes",
        out_dir = lambda wildcards, output:
            os.path.dirname(output.index_stranded),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-a',
                '-g',
                '--chr_len',
                '-o',
                )
            )

    threads: 4

    singularity:
        "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0"

    log:
        os.path.join(
            config["log_dir"],
            current_rule + "_{organism}_{index_size}.log")

    shell:
        "(alfa -a {input.gtf} \
        -g {params.genome_index} \
        --chr_len {input.chr_len} \
        -p {threads} \
        -o {params.out_dir} \
        {params.additional_params}) \
        &> {log}"


current_rule = 'alfa_qc'
rule alfa_qc:
    '''
        Run ALFA from stranded bedgraph files
    '''
    input:
        plus = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.plus.bg"),
        minus = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.minus.bg"),
        gtf = lambda wildcards:
            os.path.join(
                config["alfa_indexes"],
                get_sample(
                    'organism',
                    search_id='index',
                    search_value=wildcards.sample),
                get_sample(
                    'index_size',
                    search_id='index',
                    search_value=wildcards.sample),
                "ALFA",
                "sorted_genes.stranded.ALFA_index")

    output:
        biotypes = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "ALFA_plots.Biotypes.pdf")),
        categories = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "ALFA_plots.Categories.pdf")),
        table = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.ALFA_feature_counts.tsv")

    params:
        out_dir = lambda wildcards, output:
            os.path.dirname(output.biotypes),
        genome_index = lambda wildcards, input:
            os.path.abspath(
                os.path.join(
                    os.path.dirname(input.gtf),
                    "sorted_genes")),
        plus = lambda wildcards, input:
            os.path.basename(input.plus),
        minus = lambda wildcards, input:
            os.path.basename(input.minus),
        name = "{sample}",
        alfa_orientation = lambda wildcards:
            get_sample(
                'alfa_directionality',
                search_id='index',
                search_value=wildcards.sample),
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-g',
                '--bedgraph',
                '-s',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0"

    log:
        os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + ".{unique}.log")

    shell:
        "(cd {params.out_dir}; \
        alfa \
        -g {params.genome_index} \
        --bedgraph {params.plus} {params.minus} {params.name} \
        -s {params.alfa_orientation} \
        {params.additional_params}) \
        &> {log}"


current_rule = 'prepare_multiqc_config'
rule prepare_multiqc_config:
    '''
        Prepare config for the MultiQC
    '''
    input:
        script = os.path.join(
            workflow.basedir,
            "workflow",
            "scripts",
            "zarp_multiqc_config.py")

    output:
        multiqc_config = os.path.join(
            config["output_dir"],
            "multiqc_config.yaml")

    params:
        logo_path = config['report_logo'],
        multiqc_intro_text = config['report_description'],
        url = config['report_url'],
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--config',
                '--intro-text',
                '--custom-logo',
                '--url',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + ".stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + ".stdout.log")

    shell:
        "(python {input.script} \
        --config {output.multiqc_config} \
        --intro-text '{params.multiqc_intro_text}' \
        --custom-logo {params.logo_path} \
        --url '{params.url}' \
        {params.additional_params}) \
        1> {log.stdout} 2> {log.stderr}"

current_rule = 'multiqc_report'
rule multiqc_report:
    '''
        Create report with MultiQC
    '''
    input:
        fastqc_se = expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "fastqc",
                "{mate}"),
            sample=pd.unique(samples_table.index.values),
            mate="fq1"),

        fastqc_pe = expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "fastqc",
                "{mate}"),
            sample=[i for i in pd.unique(
                samples_table[samples_table['seqmode'] == 'pe'].index.values)],
            mate="fq2"),

        pseudoalignment = expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "quant_kallisto",
                "{sample}.{seqmode}.kallisto.pseudo.sam"),
            zip,
            sample=[i for i in pd.unique(samples_table.index.values)],
            seqmode=[get_sample('seqmode', search_id='index', search_value=i) 
                for i in pd.unique(samples_table.index.values)]),

        TIN_score = expand(
            os.path.join(
                config['output_dir'],
                "samples",
                "{sample}",
                "TIN",
                "TIN_score.tsv"),
            sample=pd.unique(samples_table.index.values)),

        tables = lambda wildcards:
            expand(
                os.path.join(
                    config["output_dir"],
                    "samples",
                    "{sample}",
                    "ALFA",
                    "{unique}",
                    "{sample}.ALFA_feature_counts.tsv"),
                sample=pd.unique(samples_table.index.values),
                unique=["Unique", "UniqueMultiple"]),

        zpca_salmon = expand(os.path.join(
            config["output_dir"],
            "zpca",
            "pca_salmon_{molecule}"),
            molecule=["genes", "transcripts"]),

        zpca_kallisto = expand(os.path.join(
            config["output_dir"],
            "zpca",
            "pca_kallisto_{molecule}"),
            molecule=["genes", "transcripts"]
        ),

        multiqc_config = os.path.join(
            config["output_dir"],
            "multiqc_config.yaml")

    output:
        multiqc_report = directory(
            os.path.join(
                config["output_dir"],
                "multiqc_summary"))
    params:
        results_dir = os.path.join(
            config["output_dir"]),
        log_dir = config["log_dir"],
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '--outdir',
                '--config',
                )
            )

    log:
        stderr = os.path.join(
            config["log_dir"],
            current_rule + ".stderr.log"),
        stdout = os.path.join(
            config["log_dir"],
            current_rule + ".stdout.log")

    singularity:
        "docker://zavolab/multiqc-plugins:1.2.1"

    shell:
        "(multiqc \
        --outdir {output.multiqc_report} \
        --config {input.multiqc_config} \
        {params.additional_params} \
        {params.results_dir} \
        {params.log_dir};) \
        1> {log.stdout} 2> {log.stderr}"

current_rule = 'sort_bed_4_big'
rule sort_bed_4_big:
    '''
        sort bedGraphs in order to work with bedGraphtobigWig
    '''
    input:
        bg = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "ALFA",
            "{unique}",
            "{sample}.{unique}.{strand}.bg")

    output:
        sorted_bg = temp(os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "bigWig",
            "{unique}",
            "{sample}_{unique}_{strand}.sorted.bg"))

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=(
                '-i',
                )
            )

    singularity:
        "docker://quay.io/biocontainers/bedtools:2.27.1--h9a82719_5"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{unique}_{strand}.stderr.log")

    shell:
        "(sortBed \
        -i {input.bg} \
        {params.additional_params} \
        > {output.sorted_bg};) 2> {log.stderr}"


current_rule = 'prepare_bigWig'
rule prepare_bigWig:
    '''
        bedGraphtobigWig, for viewing in genome browsers
    '''
    input:
        sorted_bg = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "bigWig",
            "{unique}",
            "{sample}_{unique}_{strand}.sorted.bg"),
        chr_sizes = lambda wildcards:
            os.path.join(
                config['star_indexes'],
                get_sample(
                    'organism',
                    search_id='index',
                    search_value=wildcards.sample),
                get_sample(
                    'index_size',
                    search_id='index',
                    search_value=wildcards.sample),
                "STAR_index",
                "chrNameLength.txt")

    output:
        bigWig = os.path.join(
            config["output_dir"],
            "samples",
            "{sample}",
            "bigWig",
            "{unique}",
            "{sample}_{unique}_{strand}.bw")

    params:
        additional_params = parse_rule_config(
            rule_config,
            current_rule=current_rule,
            immutable=()
            )

    singularity:
        "docker://quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h0b8a92a_2"

    log:
        stderr = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{unique}_{strand}.stderr.log"),

        stdout = os.path.join(
            config["log_dir"],
            "samples",
            "{sample}",
            current_rule + "_{unique}_{strand}.stdout.log")

    shell:
        "(bedGraphToBigWig \
        {params.additional_params} \
        {input.sorted_bg} \
        {input.chr_sizes} \
        {output.bigWig};) \
        1> {log.stdout} 2> {log.stderr}"