Newer
Older
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
BIOPZ-Gypas Foivos
committed
import os
import yaml
from shlex import quote
from typing import Tuple
BIOPZ-Gypas Foivos
committed
# Get sample table
samples_table = pd.read_csv(
config['samples'],
header=0,
index_col=0,
comment='#',
engine='python',
sep="\t",
)
# Parse YAML rule config file
if 'rule_config' in config and config['rule_config']:
try:
with open(config['rule_config']) as _file:
rule_config = yaml.safe_load(_file)
logger.info(f"Loaded rule_config from {config['rule_config']}.")
except FileNotFoundError:
logger.error(f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! ")
raise
else:
rule_config = {}
logger.warning(f"No rule config specified: using default values for all tools.")
# Create dir for cluster logs, if applicable
if cluster_config:
os.makedirs(
os.path.join(
os.getcwd(),
os.path.dirname(cluster_config['__default__']['out']),
),
exist_ok=True)
## Function definitions
BIOPZ-Katsantoni Maria
committed
def get_sample(column_id, search_id=None, search_value=None):
""" Get relevant per sample information from samples table"""
BIOPZ-Katsantoni Maria
committed
if search_id:
if search_id == 'index':
return str(samples_table[column_id][samples_table.index == search_value][0])
else:
return str(samples_table[column_id][samples_table[search_id] == search_value][0])
else:
return str(samples_table[column_id][0])
BIOPZ-Katsantoni Maria
committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()):
"""Get rule specific parameters from rule_config file"""
# If rule config file not present, emtpy string will be returned
if not rule_config:
logger.info(f"No rule config specified: using default values for all tools.")
return ''
# Same if current rule not specified in rule config
if current_rule not in rule_config or not rule_config[current_rule]:
logger.info(f"No additional parameters for rule {current_rule} specified: using default settings.")
return ''
# Subset only section for current rule
rule_config = rule_config[current_rule]
# Build list of parameters and values
params_vals = []
for param, val in rule_config.items():
# Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table
if param in immutable:
raise ValueError(
f"The following parameter in rule {current_rule} is critical for the pipeline to "
f"function as expected and cannot be modified: {param}"
)
# Accept only strings; this prevents unintended results potentially
# arising from users entering reserved YAML keywords or nested
# structures (lists, dictionaries)
if isinstance(val, str):
params_vals.append(str(param))
# Do not include a value for flags (signified by empty strings)
if val:
params_vals.append(val)
else:
raise ValueError(
"Only string values allowed for tool parameters: Found type "
f"'{type(val).__name__}' for value of parameter '{param}'"
)
# Return quoted string
add_params = ' '.join(quote(item) for item in params_vals)
logger.info(f"User specified additional parameters for rule {current_rule}:\n {add_params}")
return add_params
# Global config
localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config
BIOPZ-Gypas Foivos
committed
# Include subworkflows
include: os.path.join("workflow", "rules", "paired_end.snakefile.smk")
include: os.path.join("workflow", "rules", "single_end.snakefile.smk")
BIOPZ-Katsantoni Maria
committed
"""
Rule for collecting outputs
"""
multiqc_report = os.path.join(
config['output_dir'],
"multiqc_summary"),
bigWig = expand(
config["output_dir"],
"samples",
"{sample}",
"bigWig",
"{unique_type}",
"{sample}_{unique_type}_{strand}.bw"),
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values),
strand=["plus", "minus"],
unique_type=["Unique", "UniqueMultiple"]),
salmon_merge_genes = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
salmon_merge_transcripts = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
BIOPZ-Iborra de Toledo Paula
committed
kallisto_merge_transcripts = os.path.join(
config["output_dir"],
"summary_kallisto",
"transcripts_tpm.tsv"),
kallisto_merge_genes = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_tpm.tsv")
current_rule = 'start'
rule start:
'''
Get samples
'''
input:
reads = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
expand(
pd.Series(
samples_table.loc[wildcards.sample, wildcards.mate]
).values)
output:
reads = os.path.join(
config["output_dir"],
"samples",
"{sample}",
"start",
"{sample}.{mate}.fastq.gz")
log:
stderr = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{sample}.{mate}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{sample}.{mate}.stdout.log")
BIOPZ-Gypas Foivos
committed
"docker://ubuntu:focal-20210416"
BIOPZ-Katsantoni Maria
committed
"(cat {input.reads} > {output.reads}) \
rule fastqc:
'''
A quality control tool for high throughput sequence data
'''
input:
reads = os.path.join(
config["output_dir"],
"samples",
"{sample}",
"start",
"{sample}.{mate}.fastq.gz")
output:
outdir = directory(
config["output_dir"],
"samples",
"{sample}",
"fastqc",
"{mate}"))
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--outdir',
)
)
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1"
log:
stderr = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{mate}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{mate}.stdout.log")
fastqc --outdir {output.outdir} \
--threads {threads} \
{params.additional_params} \
{input.reads}) \
BIOPZ-Iborra de Toledo Paula
committed
current_rule = 'create_index_star'
BIOPZ-Gypas Foivos
committed
rule create_index_star:
BIOPZ-Katsantoni Maria
committed
"""
Create index for STAR alignments
"""
input:
genome = lambda wildcards:
os.path.abspath(get_sample(
BIOPZ-Katsantoni Maria
committed
'genome',
search_id='organism',
search_value=wildcards.organism)),
BIOPZ-Katsantoni Maria
committed
os.path.abspath(get_sample(
BIOPZ-Katsantoni Maria
committed
'gtf',
search_id='organism',
search_value=wildcards.organism))
BIOPZ-Katsantoni Maria
committed
output:
chromosome_info = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrNameLength.txt"),
chromosomes_names = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrName.txt")
params:
output_dir = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index"),
outFileNamePrefix = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index/STAR_"),
sjdbOverhang = "{index_size}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--runMode',
'--sjdbOverhang',
'--genomeDir',
'--genomeFastaFiles',
'--outFileNamePrefix',
'--sjdbGTFfile',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}_{index_size}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}_{index_size}.stdout.log")
BIOPZ-Katsantoni Maria
committed
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
STAR \
--runMode genomeGenerate \
--sjdbOverhang {params.sjdbOverhang} \
--genomeDir {params.output_dir} \
--genomeFastaFiles {input.genome} \
--runThreadN {threads} \
--outFileNamePrefix {params.outFileNamePrefix} \
BIOPZ-Katsantoni Maria
committed
--sjdbGTFfile {input.gtf}) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'extract_transcriptome'
BIOPZ-Iborra de Toledo Paula
committed
rule extract_transcriptome:
"""
Create transcriptome from genome and gene annotations
"""
BIOPZ-Iborra de Toledo Paula
committed
input:
genome = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'genome',
search_id='organism',
search_value=wildcards.organism),
BIOPZ-Iborra de Toledo Paula
committed
gtf = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'gtf',
search_id='organism',
search_value=wildcards.organism)
BIOPZ-Iborra de Toledo Paula
committed
output:
transcriptome = temp(os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-w',
'-g',
)
)
BIOPZ-Iborra de Toledo Paula
committed
log:
stderr = os.path.join(
config['log_dir'],
current_rule + "_{organism}.log"),
BIOPZ-Iborra de Toledo Paula
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}.log")
BIOPZ-Iborra de Toledo Paula
committed
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1"
BIOPZ-Iborra de Toledo Paula
committed
shell:
"(gffread \
-w {output.transcriptome} \
-g {input.genome} \
{params.additional_params} \
{input.gtf}) \
BIOPZ-Iborra de Toledo Paula
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'concatenate_transcriptome_and_genome'
rule concatenate_transcriptome_and_genome:
"""
Concatenate genome and transcriptome
"""
input:
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa"),
genome = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'genome',
search_id='organism',
search_value=wildcards.organism)
genome_transcriptome = temp(os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa"))
BIOPZ-Gypas Foivos
committed
"docker://ubuntu:focal-20210416"
log:
stderr = os.path.join(
config['log_dir'],
current_rule + "_{organism}.stderr.log")
shell:
"(cat {input.transcriptome} {input.genome} \
1> {output.genome_transcriptome}) \
2> {log.stderr}"
current_rule = 'create_index_salmon'
rule create_index_salmon:
"""
Create index for Salmon quantification
"""
input:
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa"),
chr_names = lambda wildcards:
os.path.join(
config['star_indexes'],
BIOPZ-Katsantoni Maria
committed
get_sample('organism'),
get_sample("index_size"),
output:
index = directory(
os.path.join(
config['salmon_indexes'],
"{organism}",
"{kmer}",
BIOPZ-Katsantoni Maria
committed
"salmon.idx"))
kmerLen = "{kmer}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--transcripts',
'--decoys',
'--index',
'--kmerLen',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}_{kmer}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}_{kmer}.stdout.log")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
--transcripts {input.genome_transcriptome} \
--decoys {input.chr_names} \
--index {output.index} \
--kmerLen {params.kmerLen} \
BIOPZ-Katsantoni Maria
committed
--threads {threads}) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'create_index_kallisto'
BIOPZ-Gypas Foivos
committed
rule create_index_kallisto:
BIOPZ-Katsantoni Maria
committed
"""
Create index for Kallisto quantification
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa")
output:
index = os.path.join(
config['kallisto_indexes'],
"{organism}",
BIOPZ-Katsantoni Maria
committed
"kallisto.idx")
params:
output_dir = os.path.join(
config['kallisto_indexes'],
"{organism}"),
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-i',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}.stdout.log")
BIOPZ-Katsantoni Maria
committed
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
kallisto index \
{params.additional_params} \
-i {output.index} \
{input.transcriptome}) \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'extract_transcripts_as_bed12'
BIOPZ-Katsantoni Maria
committed
"""
Convert transcripts to BED12 format
"""
BIOPZ-Katsantoni Maria
committed
get_sample('gtf')
BIOPZ-Katsantoni Maria
committed
bed12 = temp(os.path.join(
"full_transcripts_protein_coding.bed"))
BIOPZ-Katsantoni Maria
committed
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--gtf',
'--bed12',
)
)
"docker://zavolab/zgtf:0.1"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + ".stdout.log"),
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + ".stderr.log")
BIOPZ-Katsantoni Maria
committed
"(gtf2bed12 \
--gtf {input.gtf} \
--bed12 {output.bed12}); \
{params.additional_params} \
1> {log.stdout} 2> {log.stderr}"
current_rule = 'index_genomic_alignment_samtools'
rule index_genomic_alignment_samtools:
'''
Index genome bamfile using samtools
'''
input:
bam = os.path.join(
config["output_dir"],
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
output:
bai = os.path.join(
config["output_dir"],
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai")
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=()
)
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8"
threads: 1
log:
stderr = os.path.join(
config["log_dir"],
"{sample}",
current_rule + ".{seqmode}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"{sample}",
current_rule + ".{seqmode}.stdout.log")
shell:
"(samtools index \
{params.additional_params} \
{input.bam} {output.bai};) \
1> {log.stdout} 2> {log.stderr}"
current_rule = 'calculate_TIN_scores'
BIOPZ-Katsantoni Maria
committed
"""
BIOPZ-Katsantoni Maria
committed
"""
bam = lambda wildcards:
expand(
os.path.join(
config['output_dir'],
"samples",
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
sample=wildcards.sample,
BIOPZ-Katsantoni Maria
committed
seqmode=get_sample(
'seqmode',
search_id='index',
search_value=wildcards.sample)),
bai = lambda wildcards:
expand(
os.path.join(
config['output_dir'],
"samples",
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
sample=wildcards.sample,
BIOPZ-Katsantoni Maria
committed
seqmode=get_sample(
'seqmode',
search_id='index',
search_value=wildcards.sample)),
transcripts_bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
TIN_score = temp(os.path.join(
BIOPZ-Katsantoni Maria
committed
sample = "{sample}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-i',
'-r',
'--names',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
"docker://zavolab/tin_score_calculation:0.2.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
"(tin_score_calculation.py \
-i {input.bam} \
-r {input.transcripts_bed12} \
--names {params.sample} \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
> {output.TIN_score};) 2> {log.stderr}"
current_rule = 'salmon_quantmerge_genes'
rule salmon_quantmerge_genes:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
"{sample}.salmon.{seqmode}",
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values),
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)])
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv")
params:
BIOPZ-Katsantoni Maria
committed
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
BIOPZ-Katsantoni Maria
committed
sample_name_list = expand(
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--quants',
'--genes',
'--transcripts',
'--names',
'--column',
'--output',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stdout.log")
BIOPZ-Katsantoni Maria
committed
threads: 1
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--genes \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out};) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
current_rule = 'salmon_quantmerge_transcripts'
rule salmon_quantmerge_transcripts:
BIOPZ-Katsantoni Maria
committed
'''
BIOPZ-Iborra de Toledo Paula
committed
Merge transcript quantifications into a single file
BIOPZ-Katsantoni Maria
committed
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)])
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv")
params:
BIOPZ-Katsantoni Maria
committed
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
BIOPZ-Katsantoni Maria
committed
sample_name_list = expand(
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--quants',
'--genes',
'--transcripts',
'--names',
'--column',
'--output',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stdout.log")
BIOPZ-Katsantoni Maria
committed
threads: 1
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out}) \
{params.additional_params} \
current_rule= 'kallisto_merge_genes'
BIOPZ-Iborra de Toledo Paula
committed
rule kallisto_merge_genes:
'''
Merge gene quantifications into single file
'''
input:
pseudoalignment = expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",
"{sample}.{seqmode}.kallisto.pseudo.sam"),
zip,
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
gtf = get_sample('gtf')
output:
gn_tpm = os.path.join(
BIOPZ-Iborra de Toledo Paula
committed
config["output_dir"],
"summary_kallisto",
"genes_tpm.tsv"),
gn_counts = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_counts.tsv")
BIOPZ-Iborra de Toledo Paula
committed
params:
dir_out = os.path.join(
config["output_dir"],
"summary_kallisto"),
tables = ','.join(expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",
"abundance.h5"),
sample=[i for i in pd.unique(samples_table.index.values)])),
sample_name_list = ','.join(expand(
"{sample}",
sample=pd.unique(samples_table.index.values))),
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--input',
'--names',
'--txOut',
'--anno',
'--output',
)
)
BIOPZ-Iborra de Toledo Paula
committed
log:
stderr = os.path.join(
config["log_dir"],
current_rule + ".stderr.log"),
BIOPZ-Iborra de Toledo Paula
committed
stdout = os.path.join(
config["log_dir"],
current_rule + ".stdout.log")
BIOPZ-Iborra de Toledo Paula
committed
threads: 1
singularity:
"docker://zavolab/merge_kallisto:0.6"
shell:
"(merge_kallisto.R \
--input {params.tables} \
--names {params.sample_name_list} \
--txOut FALSE \
--anno {input.gtf} \
--output {params.dir_out} \
{params.additional_params} ) \
BIOPZ-Iborra de Toledo Paula
committed
1> {log.stdout} 2> {log.stderr}"
current_rule = 'kallisto_merge_transcripts'
BIOPZ-Iborra de Toledo Paula
committed
rule kallisto_merge_transcripts:
'''
Merge transcript quantifications into a single files
'''
input:
pseudoalignment = expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",
"{sample}.{seqmode}.kallisto.pseudo.sam"),
zip,
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
output:
tx_tpm = os.path.join(
BIOPZ-Iborra de Toledo Paula
committed
config["output_dir"],
"summary_kallisto",
"transcripts_tpm.tsv"),
tx_counts = os.path.join(
config["output_dir"],
"summary_kallisto",
"transcripts_counts.tsv")
BIOPZ-Iborra de Toledo Paula
committed
params:
dir_out = os.path.join(
config["output_dir"],
"summary_kallisto"),
tables = ','.join(expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",