-
- add rule for input preparation (GTF to BED12) - add rule for TIN score calculation - update rule graph and DAG image - update Slurm cluster config
- add rule for input preparation (GTF to BED12) - add rule for TIN score calculation - update rule graph and DAG image - update Slurm cluster config
Snakefile 6.10 KiB
################################################################################
### python modules
################################################################################
import os
import sys
import pandas as pd
############################
samples_table = pd.read_csv(config["samples"], header=0, index_col=0, comment='#', engine='python', sep="\t")
localrules: finish
##################################################################################
# Execution dependend on sequencing mode
##################################################################################
include: os.path.join('workflow', 'rules', 'paired_end.snakefile.smk')
include: os.path.join('workflow', 'rules', 'single_end.snakefile.smk')
#################################################################################
### Final rule
#################################################################################
rule finish:
input:
outdir1 = expand(os.path.join(config["output_dir"], "{seqmode}", "{sample}", "mate1_fastqc"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
salmon_gn_estimates = expand(os.path.join(config["output_dir"],"{seqmode}","{sample}","salmon_quant","quant.genes.sf"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
pseudoalignment = expand(os.path.join(config["output_dir"],"{seqmode}","{sample}","quant_kallisto", "{sample}.kallisto.pseudo.sam"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
TIN_score = expand(os.path.join(config["output_dir"], "{seqmode}", "{sample}", "TIN", "TIN_score.tsv"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [samples_table.loc[i,"seqmode"] for i in list(samples_table.index.values)]),
rule create_index_star:
''' Create index using STAR'''
input:
genome =lambda wildcards: samples_table["genome"][samples_table["organism"]==wildcards.organism][0],
gtf =lambda wildcards: samples_table["gtf"][samples_table["organism"]==wildcards.organism][0]
output:
chromosome_info = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index",
"chrNameLength.txt"),
chromosomes_names = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index",
"chrName.txt")
params:
output_dir = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index"),
outFileNamePrefix = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index/STAR_"),
sjdbOverhang = "{index_size}"
singularity:
"docker://zavolab/star:2.6.0a"
threads: 12
log:
os.path.join( config["local_log"], "{organism}_{index_size}_create_index_star.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
STAR \
--runMode genomeGenerate \
--sjdbOverhang {params.sjdbOverhang} \
--genomeDir {params.output_dir} \
--genomeFastaFiles {input.genome} \
--runThreadN {threads} \
--outFileNamePrefix {params.outFileNamePrefix} \
--sjdbGTFfile {input.gtf}) &> {log}"
rule create_index_salmon:
'''Create index for salmon quantification'''
input:
transcriptome = lambda wildcards: samples_table['tr_fasta_filtered'][samples_table["organism"]==wildcards.organism][0]
output:
index = directory(os.path.join(
config["salmon_indexes"],
"{organism}",
"{kmer}",
"salmon.idx"))
params:
kmerLen = "{kmer}"
singularity:
"docker://zavolab/salmon:0.11.0"
log:
os.path.join(config["local_log"], "{organism}_{kmer}_create_index_salmon.log")
threads: 8
shell:
"(salmon index \
--transcripts {input.transcriptome} \
--index {output.index} \
--kmerLen {params.kmerLen} \
--threads {threads}) &> {log}"
rule create_index_kallisto:
'''Create index for running Kallisto'''
input:
transcriptome = lambda wildcards: samples_table['tr_fasta_filtered'][samples_table["organism"]==wildcards.organism][0]
output:
index = os.path.join(
config["kallisto_indexes"],
"{organism}",
"kallisto.idx")
params:
output_dir = os.path.join(
config["kallisto_indexes"],
"{organism}")
singularity:
"docker://zavolab/kallisto:0.46.1"
log:
os.path.join(config["local_log"], "{organism}_create_index_kallisto.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
kallisto index -i {output.index} {input.transcriptome}) &> {log}"
rule extract_transcripts_as_bed12:
''' Extract transcripts: from GTF into BED12 format'''
input:
gtf =lambda wildcards: samples_table["gtf"][0]
output:
bed12 = os.path.join(
config["output_dir"],
"full_transcripts_protein_coding.bed")
singularity:
"docker://zavolab/gtf_transcript_type_to_bed12:0.1.0"
threads: 1
log:
os.path.join( config["local_log"], "extract_transcripts_as_bed12.log")
shell:
"gtf_transcript_type_to_bed12.pl \
--anno={input.gtf} \
--type=protein_coding \
1> {output.bed12} \
2> {log}"
rule calculate_TIN_scores:
'''Calculate TIN score'''
input:
bai = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam.bai"),
transcripts_bed12 = os.path.join(
config["output_dir"],
"full_transcripts_protein_coding.bed")
output:
TIN_score = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"TIN",
"TIN_score.tsv")
params:
bam = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam"),
sample = "{sample}"
log:
os.path.join(config["local_log"], "{seqmode}", "{sample}", "calculate_TIN_scores.log")
threads: 8
singularity:
"docker://zavolab/tin_score_calculation:0.1.0"
shell:
"tin_score_calculation.py \
-i {params.bam} \
-r {input.transcripts_bed12} \
-c 0 \
--names {params.sample} \
-n 100 \
1> {output.TIN_score} \
2> {log}"