Newer
Older
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
BIOPZ-Gypas Foivos
committed
import os
import yaml
from shlex import quote
from typing import Tuple
BIOPZ-Gypas Foivos
committed
# Get sample table
samples_table = pd.read_csv(
config['samples'],
header=0,
index_col=0,
comment='#',
engine='python',
sep="\t",
)
# Validate config with template config
template_config_path = os.path.join(
workflow.current_basedir,
'..',
'tests',
'input_files',
'config.yaml')
try:
with open(template_config_path) as _file:
template_config = yaml.safe_load(_file)
logger.info(f"Loaded template config from {template_config_path}.")
except FileNotFoundError:
logger.error(f"No config file found at {template_config_path}! ")
raise
# Check if config contains required fields in example config
missing = [key for key in template_config if key not in config and key != 'optional']
if missing:
err_msg = f"Required keys missing: {missing}"
logger.error(err_msg)
raise ValueError(err_msg)
# Check if optional field available
if 'optional' not in config:
logger.info(f'No "optional" section found, adding dictionary "optional" to configuration.')
config['optional'] = {}
if type(config['optional']) not in [OrderedDict, dict]:
logger.error(f'No valid section "optional" supplied. Got {config["optional"]}, type: {type(config["optional"])} but require dictionary.')
raise TypeError
# Check optional fields and include in config if not present
for optkey, value in template_config['optional'].items():
if optkey not in config['optional']:
config['optional'][optkey] = value
logger.info(f'No value for optional parameter "{optkey}" found, set to default: "{value}"')
# Parse YAML rule config file
try:
with open(config['optional']['rule_config']) as _file:
rule_config = yaml.safe_load(_file)
logger.info(f"Loaded rule_config from {config['optional']['rule_config']}.")
except TypeError:
logger.error(f'No string supplied at field "rule_config", but: {type(config["optional"]["rule_config"])} with content: {config["optional"]["rule_config"]}')
raise
except FileNotFoundError:
logger.error(f"No rule config file found at {config['optional']['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! ")
raise
except KeyError:
rule_config = {}
logger.warning(f"No rule config specified: using default values for all tools.")
# Create dir for cluster logs, if applicable
if cluster_config:
os.makedirs(
os.path.join(
os.getcwd(),
os.path.dirname(cluster_config['__default__']['out']),
),
exist_ok=True)
## Function definitions
BIOPZ-Katsantoni Maria
committed
def get_sample(column_id, search_id=None, search_value=None):
""" Get relevant per sample information from samples table"""
BIOPZ-Katsantoni Maria
committed
if search_id:
if search_id == 'index':
return str(samples_table[column_id][samples_table.index == search_value][0])
else:
return str(samples_table[column_id][samples_table[search_id] == search_value][0])
else:
return str(samples_table[column_id][0])
BIOPZ-Katsantoni Maria
committed
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()):
"""Get rule specific parameters from rule_config file"""
# If rule config file not present, emtpy string will be returned
if not rule_config:
logger.info(f"No rule config specified: using default values for all tools.")
return ''
# Same if current rule not specified in rule config
if current_rule not in rule_config or not rule_config[current_rule]:
logger.info(f"No additional parameters for rule {current_rule} specified: using default settings.")
return ''
# Subset only section for current rule
rule_config = rule_config[current_rule]
# Build list of parameters and values
params_vals = []
for param, val in rule_config.items():
# Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table
if param in immutable:
raise ValueError(
f"The following parameter in rule {current_rule} is critical for the pipeline to "
f"function as expected and cannot be modified: {param}"
)
# Accept only strings; this prevents unintended results potentially
# arising from users entering reserved YAML keywords or nested
# structures (lists, dictionaries)
if isinstance(val, str):
params_vals.append(str(param))
# Do not include a value for flags (signified by empty strings)
if val:
params_vals.append(val)
else:
raise ValueError(
"Only string values allowed for tool parameters: Found type "
f"'{type(val).__name__}' for value of parameter '{param}'"
)
# Return quoted string
add_params = ' '.join(quote(item) for item in params_vals)
logger.info(f"User specified additional parameters for rule {current_rule}:\n {add_params}")
return add_params
# Global config
localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config
BIOPZ-Gypas Foivos
committed
include: os.path.join("rules", "paired_end.snakefile.smk")
include: os.path.join("rules", "single_end.snakefile.smk")
BIOPZ-Katsantoni Maria
committed
"""
Rule for collecting outputs
"""
multiqc_report = os.path.join(
config['output_dir'],
"multiqc_summary"),
bigWig = expand(
config["output_dir"],
"samples",
"{sample}",
"bigWig",
"{unique_type}",
"{sample}_{unique_type}_{strand}.bw"),
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values),
strand=["plus", "minus"],
unique_type=["Unique", "UniqueMultiple"]),
salmon_merge_genes = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
salmon_merge_transcripts = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
BIOPZ-Iborra de Toledo Paula
committed
kallisto_merge_transcripts = os.path.join(
config["output_dir"],
"summary_kallisto",
"transcripts_tpm.tsv"),
kallisto_merge_genes = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_tpm.tsv")
current_rule = 'start'
rule start:
'''
Get samples
'''
input:
reads = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
expand(
pd.Series(
samples_table.loc[wildcards.sample, wildcards.mate]
).values)
output:
reads = os.path.join(
config["output_dir"],
"samples",
"{sample}",
"start",
"{sample}.{mate}.fastq.gz")
log:
stderr = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{sample}.{mate}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{sample}.{mate}.stdout.log")
BIOPZ-Gypas Foivos
committed
"docker://ubuntu:focal-20210416"
BIOPZ-Katsantoni Maria
committed
"(cat {input.reads} > {output.reads}) \
rule fastqc:
'''
A quality control tool for high throughput sequence data
'''
input:
reads = os.path.join(
config["output_dir"],
"samples",
"{sample}",
"start",
"{sample}.{mate}.fastq.gz")
output:
outdir = directory(
config["output_dir"],
"samples",
"{sample}",
"fastqc",
"{mate}"))
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--outdir',
)
)
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1"
conda:
os.path.join(workflow.basedir, "envs", "fastqc.yaml")
log:
stderr = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{mate}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"samples",
"{sample}",
current_rule + "_{mate}.stdout.log")
fastqc --outdir {output.outdir} \
--threads {threads} \
{params.additional_params} \
{input.reads}) \
BIOPZ-Iborra de Toledo Paula
committed
current_rule = 'create_index_star'
BIOPZ-Gypas Foivos
committed
rule create_index_star:
BIOPZ-Katsantoni Maria
committed
"""
Create index for STAR alignments
"""
input:
genome = lambda wildcards:
os.path.abspath(get_sample(
BIOPZ-Katsantoni Maria
committed
'genome',
search_id='organism',
search_value=wildcards.organism)),
BIOPZ-Katsantoni Maria
committed
os.path.abspath(get_sample(
BIOPZ-Katsantoni Maria
committed
'gtf',
search_id='organism',
search_value=wildcards.organism))
BIOPZ-Katsantoni Maria
committed
output:
chromosome_info = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrNameLength.txt"),
chromosomes_names = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrName.txt")
params:
output_dir = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index"),
outFileNamePrefix = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index/STAR_"),
sjdbOverhang = "{index_size}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--runMode',
'--sjdbOverhang',
'--genomeDir',
'--genomeFastaFiles',
'--outFileNamePrefix',
'--sjdbGTFfile',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "STAR.yaml")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}_{index_size}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}_{index_size}.stdout.log")
BIOPZ-Katsantoni Maria
committed
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
STAR \
--runMode genomeGenerate \
--sjdbOverhang {params.sjdbOverhang} \
--genomeDir {params.output_dir} \
--genomeFastaFiles {input.genome} \
--runThreadN {threads} \
--outFileNamePrefix {params.outFileNamePrefix} \
BIOPZ-Katsantoni Maria
committed
--sjdbGTFfile {input.gtf}) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'extract_transcriptome'
BIOPZ-Iborra de Toledo Paula
committed
rule extract_transcriptome:
"""
Create transcriptome from genome and gene annotations
"""
BIOPZ-Iborra de Toledo Paula
committed
input:
genome = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'genome',
search_id='organism',
search_value=wildcards.organism),
BIOPZ-Iborra de Toledo Paula
committed
gtf = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'gtf',
search_id='organism',
search_value=wildcards.organism)
BIOPZ-Iborra de Toledo Paula
committed
output:
transcriptome = temp(os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-w',
'-g',
)
)
BIOPZ-Iborra de Toledo Paula
committed
log:
stderr = os.path.join(
config['log_dir'],
current_rule + "_{organism}.log"),
BIOPZ-Iborra de Toledo Paula
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}.log")
BIOPZ-Iborra de Toledo Paula
committed
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1"
conda:
os.path.join(workflow.basedir, "envs", "gffread.yaml")
BIOPZ-Iborra de Toledo Paula
committed
shell:
"(gffread \
-w {output.transcriptome} \
-g {input.genome} \
{params.additional_params} \
{input.gtf}) \
BIOPZ-Iborra de Toledo Paula
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'concatenate_transcriptome_and_genome'
rule concatenate_transcriptome_and_genome:
"""
Concatenate genome and transcriptome
"""
input:
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa"),
genome = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
get_sample(
'genome',
search_id='organism',
search_value=wildcards.organism)
genome_transcriptome = temp(os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa"))
BIOPZ-Gypas Foivos
committed
"docker://ubuntu:focal-20210416"
log:
stderr = os.path.join(
config['log_dir'],
current_rule + "_{organism}.stderr.log")
shell:
"(cat {input.transcriptome} {input.genome} \
1> {output.genome_transcriptome}) \
2> {log.stderr}"
current_rule = 'create_index_salmon'
rule create_index_salmon:
"""
Create index for Salmon quantification
"""
input:
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa"),
chr_names = lambda wildcards:
os.path.join(
config['star_indexes'],
BIOPZ-Katsantoni Maria
committed
get_sample('organism'),
get_sample("index_size"),
output:
index = directory(
os.path.join(
config['salmon_indexes'],
"{organism}",
"{kmer}",
BIOPZ-Katsantoni Maria
committed
"salmon.idx"))
kmerLen = "{kmer}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--transcripts',
'--decoys',
'--index',
'--kmerLen',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
conda:
os.path.join(workflow.basedir, "envs", "salmon.yaml")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}_{kmer}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}_{kmer}.stdout.log")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
--transcripts {input.genome_transcriptome} \
--decoys {input.chr_names} \
--index {output.index} \
--kmerLen {params.kmerLen} \
BIOPZ-Katsantoni Maria
committed
--threads {threads}) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'create_index_kallisto'
BIOPZ-Gypas Foivos
committed
rule create_index_kallisto:
BIOPZ-Katsantoni Maria
committed
"""
Create index for Kallisto quantification
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa")
output:
index = os.path.join(
config['kallisto_indexes'],
"{organism}",
BIOPZ-Katsantoni Maria
committed
"kallisto.idx")
params:
output_dir = os.path.join(
config['kallisto_indexes'],
"{organism}"),
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-i',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "kallisto.yaml")
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + "_{organism}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + "_{organism}.stdout.log")
BIOPZ-Katsantoni Maria
committed
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
kallisto index \
{params.additional_params} \
-i {output.index} \
{input.transcriptome}) \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
current_rule = 'extract_transcripts_as_bed12'
BIOPZ-Katsantoni Maria
committed
"""
Convert transcripts to BED12 format
"""
BIOPZ-Katsantoni Maria
committed
get_sample('gtf')
BIOPZ-Katsantoni Maria
committed
bed12 = temp(os.path.join(
"full_transcripts_protein_coding.bed"))
BIOPZ-Katsantoni Maria
committed
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--gtf',
'--bed12',
)
)
"docker://quay.io/biocontainers/zgtf:0.1.1--pyh5e36f6f_0"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "zgtf.yaml")
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config['log_dir'],
current_rule + ".stdout.log"),
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
current_rule + ".stderr.log")
BIOPZ-Katsantoni Maria
committed
"(gtf2bed12 \
--gtf {input.gtf} \
--bed12 {output.bed12}); \
{params.additional_params} \
1> {log.stdout} 2> {log.stderr}"
current_rule = 'index_genomic_alignment_samtools'
rule index_genomic_alignment_samtools:
'''
Index genome bamfile using samtools
'''
input:
bam = os.path.join(
config["output_dir"],
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
output:
bai = os.path.join(
config["output_dir"],
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai")
params:
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=()
)
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8"
conda:
os.path.join(workflow.basedir, "envs", "samtools.yaml")
threads: 1
log:
stderr = os.path.join(
config["log_dir"],
"{sample}",
current_rule + ".{seqmode}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"{sample}",
current_rule + ".{seqmode}.stdout.log")
shell:
"(samtools index \
{params.additional_params} \
{input.bam} {output.bai};) \
1> {log.stdout} 2> {log.stderr}"
current_rule = 'calculate_TIN_scores'
BIOPZ-Katsantoni Maria
committed
"""
BIOPZ-Katsantoni Maria
committed
"""
bam = lambda wildcards:
expand(
os.path.join(
config['output_dir'],
"samples",
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"),
sample=wildcards.sample,
BIOPZ-Katsantoni Maria
committed
seqmode=get_sample(
'seqmode',
search_id='index',
search_value=wildcards.sample)),
bai = lambda wildcards:
expand(
os.path.join(
config['output_dir'],
"samples",
"{sample}",
"map_genome",
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"),
sample=wildcards.sample,
BIOPZ-Katsantoni Maria
committed
seqmode=get_sample(
'seqmode',
search_id='index',
search_value=wildcards.sample)),
transcripts_bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
TIN_score = temp(os.path.join(
BIOPZ-Katsantoni Maria
committed
sample = "{sample}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'-i',
'-r',
'--names',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
"docker://quay.io/biocontainers/tin-score-calculation:0.4--pyh5e36f6f_0"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "tin-score-calculation.yaml")
"(calculate-tin.py \
-i {input.bam} \
-r {input.transcripts_bed12} \
--names {params.sample} \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
> {output.TIN_score};) 2> {log.stderr}"
current_rule = 'salmon_quantmerge_genes'
rule salmon_quantmerge_genes:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
"{sample}.salmon.{seqmode}",
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values),
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)])
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv")
params:
BIOPZ-Katsantoni Maria
committed
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
BIOPZ-Katsantoni Maria
committed
sample_name_list = expand(
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--quants',
'--genes',
'--transcripts',
'--names',
'--column',
'--output',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stdout.log")
BIOPZ-Katsantoni Maria
committed
threads: 1
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "salmon.yaml")
shell:
"(salmon quantmerge \
--genes \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out};) \
{params.additional_params} \
BIOPZ-Katsantoni Maria
committed
1> {log.stdout} 2> {log.stderr}"
current_rule = 'salmon_quantmerge_transcripts'
rule salmon_quantmerge_transcripts:
BIOPZ-Katsantoni Maria
committed
'''
BIOPZ-Iborra de Toledo Paula
committed
Merge transcript quantifications into a single file
BIOPZ-Katsantoni Maria
committed
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)])
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv")
params:
BIOPZ-Katsantoni Maria
committed
os.path.join(
config["output_dir"],
BIOPZ-Katsantoni Maria
committed
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
BIOPZ-Katsantoni Maria
committed
sample_name_list = expand(
"{sample}",
BIOPZ-Katsantoni Maria
committed
sample=pd.unique(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}",
additional_params = parse_rule_config(
rule_config,
current_rule=current_rule,
immutable=(
'--quants',
'--genes',
'--transcripts',
'--names',
'--column',
'--output',
)
)
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stderr.log"),
BIOPZ-Katsantoni Maria
committed
stdout = os.path.join(
config["log_dir"],
current_rule + "_{salmon_merge_on}.stdout.log")
BIOPZ-Katsantoni Maria
committed
threads: 1
singularity:
BIOPZ-Gypas Foivos
committed
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
BIOPZ-Katsantoni Maria
committed
conda:
os.path.join(workflow.basedir, "envs", "salmon.yaml")
shell:
"(salmon quantmerge \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out}) \
{params.additional_params} \
current_rule= 'kallisto_merge_genes'
BIOPZ-Iborra de Toledo Paula
committed
rule kallisto_merge_genes:
'''
Merge gene quantifications into single file
'''
input:
pseudoalignment = expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",
"{sample}.{seqmode}.kallisto.pseudo.sam"),
zip,
sample=[i for i in pd.unique(samples_table.index.values)],
seqmode=[get_sample(
'seqmode',
search_id='index',
search_value=i)
for i in pd.unique(samples_table.index.values)]),
gtf = get_sample('gtf')
output:
gn_tpm = os.path.join(
BIOPZ-Iborra de Toledo Paula
committed
config["output_dir"],
"summary_kallisto",
"genes_tpm.tsv"),
gn_counts = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_counts.tsv")
BIOPZ-Iborra de Toledo Paula
committed
params:
dir_out = os.path.join(
config["output_dir"],
"summary_kallisto"),
tables = ','.join(expand(
os.path.join(
config["output_dir"],
"samples",
"{sample}",
"quant_kallisto",
"abundance.h5"),
sample=[i for i in pd.unique(samples_table.index.values)])),
sample_name_list = ','.join(expand(
"{sample}",
sample=pd.unique(samples_table.index.values))),