Newer
Older
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
BIOPZ-Gypas Foivos
committed
import os
import sys
BIOPZ-Gypas Foivos
committed
# Get sample table
samples_table = pd.read_csv(
config['samples'],
header=0,
index_col=0,
comment='#',
engine='python',
sep="\t",
)
# Create log directories
os.makedirs(
os.path.join(
os.getcwd(),
config['log_dir'],
),
BIOPZ-Katsantoni Maria
committed
exist_ok=True)
if cluster_config:
os.makedirs(
os.path.join(
os.getcwd(),
os.path.dirname(cluster_config['__default__']['out']),
),
BIOPZ-Katsantoni Maria
committed
exist_ok=True)
BIOPZ-Gypas Foivos
committed
# Include subworkflows
include: os.path.join("workflow", "rules", "paired_end.snakefile.smk")
include: os.path.join("workflow", "rules", "single_end.snakefile.smk")
BIOPZ-Katsantoni Maria
committed
"""
Rule for collecting outputs
"""
input:
outdir1 = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
BIOPZ-Katsantoni Maria
committed
"mate1_fastqc"),
zip,
sample=[i for i in list(samples_table.index.values)],
BIOPZ-Katsantoni Maria
committed
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
salmon_gn_estimates = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"salmon_quant",
BIOPZ-Katsantoni Maria
committed
"quant.genes.sf"),
zip,
sample=[i for i in list(samples_table.index.values)],
BIOPZ-Katsantoni Maria
committed
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
pseudoalignment = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"quant_kallisto",
BIOPZ-Katsantoni Maria
committed
"{sample}.kallisto.pseudo.sam"),
zip,
sample=[i for i in list(samples_table.index.values)],
BIOPZ-Katsantoni Maria
committed
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
TIN_score = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"TIN",
BIOPZ-Katsantoni Maria
committed
"TIN_score.tsv"),
zip,
sample=[i for i in list(samples_table.index.values)],
BIOPZ-Katsantoni Maria
committed
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
salmon_merge_genes = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
salmon_merge_transcripts = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
star_rpm = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)])
BIOPZ-Katsantoni Maria
committed
BIOPZ-Iborra de Toledo Paula
committed
BIOPZ-Gypas Foivos
committed
rule create_index_star:
BIOPZ-Katsantoni Maria
committed
"""
Create index for STAR alignments
"""
input:
genome = lambda wildcards:
BIOPZ-Katsantoni Maria
committed
samples_table['genome']
[samples_table['organism'] == wildcards.organism]
[0],
BIOPZ-Katsantoni Maria
committed
samples_table['gtf']
[samples_table['organism'] == wildcards.organism]
[0]
output:
chromosome_info = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrNameLength.txt"),
chromosomes_names = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrName.txt")
params:
output_dir = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index"),
outFileNamePrefix = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index/STAR_"),
sjdbOverhang = "{index_size}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_{index_size}_create_index_star.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_{index_size}_create_index_star.stdout.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
STAR \
--runMode genomeGenerate \
--sjdbOverhang {params.sjdbOverhang} \
--genomeDir {params.output_dir} \
--genomeFastaFiles {input.genome} \
--runThreadN {threads} \
--outFileNamePrefix {params.outFileNamePrefix} \
BIOPZ-Katsantoni Maria
committed
--sjdbGTFfile {input.gtf}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
BIOPZ-Iborra de Toledo Paula
committed
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
rule extract_transcriptome:
""" Create transcriptome from genome and gene annotations """
input:
genome = lambda wildcards:
samples_table['genome'][
samples_table['organism'] == wildcards.organism
][0],
gtf = lambda wildcards:
samples_table['gtf'][
samples_table['organism'] == wildcards.organism
][0]
output:
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
)
log:
stderr = os.path.join(
config['log_dir'],
"{organism}_extract_transcriptome.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_extract_transcriptome.log")
singularity:
"docker://zavolab/gffread:0.11.7"
BIOPZ-Iborra de Toledo Paula
committed
shell:
"(gffread \
-w {output.transcriptome} \
-g {input.genome} {input.gtf}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
rule create_index_salmon:
BIOPZ-Katsantoni Maria
committed
"""
Create index for Salmon quantification
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
)
output:
index = directory(
os.path.join(
config['salmon_indexes'],
"{organism}",
"{kmer}",
BIOPZ-Katsantoni Maria
committed
"salmon.idx"))
BIOPZ-Katsantoni Maria
committed
kmerLen = "{kmer}"
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_{kmer}_create_index_salmon.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_{kmer}_create_index_salmon.stdout.log")
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon index \
--transcripts {input.transcriptome} \
--index {output.index} \
--kmerLen {params.kmerLen} \
BIOPZ-Katsantoni Maria
committed
--threads {threads}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
rule create_index_kallisto:
BIOPZ-Katsantoni Maria
committed
"""
Create index for Kallisto quantification
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
)
output:
index = os.path.join(
config['kallisto_indexes'],
"{organism}",
BIOPZ-Katsantoni Maria
committed
"kallisto.idx")
params:
output_dir = os.path.join(
config['kallisto_indexes'],
BIOPZ-Katsantoni Maria
committed
"{organism}")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_create_index_kallisto.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_create_index_kallisto.stdout.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
BIOPZ-Katsantoni Maria
committed
kallisto index -i {output.index} {input.transcriptome}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
BIOPZ-Katsantoni Maria
committed
"""
Convert transcripts to BED12 format
"""
BIOPZ-Katsantoni Maria
committed
samples_table['gtf']
[0]
output:
bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
"docker://zavolab/gtf_transcript_type_to_bed12:0.1.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"extract_transcripts_as_bed12.stderr.log")
BIOPZ-Katsantoni Maria
committed
"(gtf_transcript_type_to_bed12.pl \
BIOPZ-Katsantoni Maria
committed
--type=protein_coding > {output.bed12}); \
2> {log.stderr}"
BIOPZ-Katsantoni Maria
committed
"""
Caluclate transcript integrity (TIN) score
"""
input:
bai = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"map_genome",
BIOPZ-Katsantoni Maria
committed
"{sample}_Aligned.sortedByCoord.out.bam.bai"),
transcripts_bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
output:
TIN_score = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"TIN",
BIOPZ-Katsantoni Maria
committed
"TIN_score.tsv")
params:
bam = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"map_genome",
BIOPZ-Katsantoni Maria
committed
"{sample}_Aligned.sortedByCoord.out.bam"),
sample = "{sample}"
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config['log_dir'],
"{seqmode}",
"{sample}",
BIOPZ-Katsantoni Maria
committed
"calculate_TIN_scores.log")
BIOPZ-Katsantoni Maria
committed
"docker://zavolab/tin_score_calculation:0.1.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
"(tin_score_calculation.py \
-i {params.bam} \
-r {input.transcripts_bed12} \
-c 0 \
--names {params.sample} \
BIOPZ-Katsantoni Maria
committed
-n 100 > {output.TIN_score};) 2> {log.stderr}"
rule salmon_quantmerge_genes:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant",
"quant.genes.sf"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"]))
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv")
params:
salmon_dir = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
sample_name_list = expand(
"{sample}",
sample=list(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"salmon_quantmerge_genes_{salmon_merge_on}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"salmon_quantmerge_genes_{salmon_merge_on}.stdout.log")
threads: 1
singularity:
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--quants {params.salmon_dir} \
--genes \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out};) \
1> {log.stdout} 2> {log.stderr}"
rule salmon_quantmerge_transcripts:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant",
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv")
params:
salmon_dir = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
sample_name_list = expand(
"{sample}",
sample=list(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"salmon_quantmerge_transcripts_{salmon_merge_on}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"salmon_quantmerge_transcripts_{salmon_merge_on}.stdout.log")
threads: 1
singularity:
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--quants {params.salmon_dir} \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out}) \
1> {log.stdout} 2> {log.stderr}"