Newer
Older
"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
BIOPZ-Gypas Foivos
committed
import os
import sys
BIOPZ-Gypas Foivos
committed
# Get sample table
samples_table = pd.read_csv(
config['samples'],
header=0,
index_col=0,
comment='#',
engine='python',
sep="\t",
)
localrules: finish, rename_star_rpm_for_alfa, prepare_files_for_report, \
prepare_MultiQC_config
# Create log directories
os.makedirs(
os.path.join(
os.getcwd(),
config['log_dir'],
),
BIOPZ-Katsantoni Maria
committed
exist_ok=True)
if cluster_config:
os.makedirs(
os.path.join(
os.getcwd(),
os.path.dirname(cluster_config['__default__']['out']),
),
BIOPZ-Katsantoni Maria
committed
exist_ok=True)
BIOPZ-Gypas Foivos
committed
# Include subworkflows
include: os.path.join("workflow", "rules", "paired_end.snakefile.smk")
include: os.path.join("workflow", "rules", "single_end.snakefile.smk")
BIOPZ-Katsantoni Maria
committed
"""
Rule for collecting outputs
"""
os.path.join(
config['output_dir'],
BIOPZ-Iborra de Toledo Paula
committed
BIOPZ-Gypas Foivos
committed
rule create_index_star:
BIOPZ-Katsantoni Maria
committed
"""
Create index for STAR alignments
"""
BIOPZ-Katsantoni Maria
committed
samples_table['genome']
[samples_table['organism'] == wildcards.organism]
[0],
BIOPZ-Katsantoni Maria
committed
samples_table['gtf']
[samples_table['organism'] == wildcards.organism]
[0]
output:
chromosome_info = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrNameLength.txt"),
chromosomes_names = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
"STAR_index",
BIOPZ-Katsantoni Maria
committed
"chrName.txt")
params:
output_dir = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index"),
outFileNamePrefix = os.path.join(
config['star_indexes'],
"{organism}",
"{index_size}",
BIOPZ-Katsantoni Maria
committed
"STAR_index/STAR_"),
sjdbOverhang = "{index_size}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_{index_size}_create_index_star.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_{index_size}_create_index_star.stdout.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
STAR \
--runMode genomeGenerate \
--sjdbOverhang {params.sjdbOverhang} \
--genomeDir {params.output_dir} \
--genomeFastaFiles {input.genome} \
--runThreadN {threads} \
--outFileNamePrefix {params.outFileNamePrefix} \
BIOPZ-Katsantoni Maria
committed
--sjdbGTFfile {input.gtf}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
BIOPZ-Iborra de Toledo Paula
committed
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
rule extract_transcriptome:
""" Create transcriptome from genome and gene annotations """
input:
genome = lambda wildcards:
samples_table['genome'][
samples_table['organism'] == wildcards.organism
][0],
gtf = lambda wildcards:
samples_table['gtf'][
samples_table['organism'] == wildcards.organism
][0]
output:
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
)
log:
stderr = os.path.join(
config['log_dir'],
"{organism}_extract_transcriptome.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_extract_transcriptome.log")
singularity:
BIOPZ-Iborra de Toledo Paula
committed
shell:
"(gffread \
-w {output.transcriptome} \
-g {input.genome} {input.gtf}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
rule extract_decoys_salmon:
BIOPZ-Katsantoni Maria
committed
"""
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
Extract names of the genome targets
"""
input:
genome = lambda wildcards:
samples_table['genome']
[samples_table['organism'] == wildcards.organism]
[0],
output:
decoys = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"decoys.txt")
singularity:
"docker://bash:5.0.16"
log:
stderr = os.path.join(
config['log_dir'],
"{organism}_extract_decoys_salmon.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_extract_decoys_salmon.stdout.log")
threads: 1
shell:
"""
(grep "^>" <{input.genome} \
| cut -d " " -f 1 > {output.decoys} && \
sed -i.bak -e 's/>//g' {output.decoys}) \
1> {log.stdout} 2> {log.stderr}
"""
rule concatenate_transcriptome_and_genome:
"""
Concatenate genome and transcriptome
BIOPZ-Katsantoni Maria
committed
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
),
genome = lambda wildcards:
samples_table['genome']
[samples_table['organism'] == wildcards.organism]
[0],
output:
genome_transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa",
BIOPZ-Iborra de Toledo Paula
committed
)
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
singularity:
"docker://bash:5.0.16"
log:
stderr = os.path.join(
config['log_dir'],
"{organism}_concatenate_transcriptome_and_genome.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_concatenate_transcriptome_and_genome.stdout.log")
shell:
"(cat {input.transcriptome} {input.genome} \
1> {output.genome_transcriptome}) \
1> {log.stdout} 2> {log.stderr}"
rule create_index_salmon:
"""
Create index for Salmon quantification
"""
input:
genome_transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"genome_transcriptome.fa",
),
decoys = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"decoys.txt")
output:
index = directory(
os.path.join(
config['salmon_indexes'],
"{organism}",
"{kmer}",
BIOPZ-Katsantoni Maria
committed
"salmon.idx"))
BIOPZ-Katsantoni Maria
committed
kmerLen = "{kmer}"
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_{kmer}_create_index_salmon.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_{kmer}_create_index_salmon.stdout.log")
BIOPZ-Katsantoni Maria
committed
--transcripts {input.genome_transcriptome} \
--decoys {input.decoys} \
--index {output.index} \
--kmerLen {params.kmerLen} \
BIOPZ-Katsantoni Maria
committed
--threads {threads}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
rule create_index_kallisto:
BIOPZ-Katsantoni Maria
committed
"""
Create index for Kallisto quantification
"""
BIOPZ-Iborra de Toledo Paula
committed
transcriptome = os.path.join(
config['output_dir'],
"transcriptome",
"{organism}",
"transcriptome.fa",
)
output:
index = os.path.join(
config['kallisto_indexes'],
"{organism}",
BIOPZ-Katsantoni Maria
committed
"kallisto.idx")
params:
output_dir = os.path.join(
config['kallisto_indexes'],
BIOPZ-Katsantoni Maria
committed
"{organism}")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"{organism}_create_index_kallisto.stderr.log"),
stdout = os.path.join(
config['log_dir'],
"{organism}_create_index_kallisto.stdout.log")
shell:
"(mkdir -p {params.output_dir}; \
chmod -R 777 {params.output_dir}; \
BIOPZ-Katsantoni Maria
committed
kallisto index -i {output.index} {input.transcriptome}) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
BIOPZ-Katsantoni Maria
committed
"""
Convert transcripts to BED12 format
"""
BIOPZ-Katsantoni Maria
committed
samples_table['gtf']
[0]
output:
bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
"docker://zavolab/gtf_transcript_type_to_bed12:0.1.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
BIOPZ-Katsantoni Maria
committed
"extract_transcripts_as_bed12.stderr.log")
BIOPZ-Katsantoni Maria
committed
"(gtf_transcript_type_to_bed12.pl \
BIOPZ-Katsantoni Maria
committed
--type=protein_coding > {output.bed12}); \
2> {log.stderr}"
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
rule index_genomic_alignment_samtools:
'''
Index genome bamfile using samtools
'''
input:
bam = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam")
output:
bai = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam.bai")
singularity:
"docker://zavolab/samtools:1.10-slim"
threads: 1
log:
stderr = os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
"index_genomic_alignment_samtools.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
"index_genomic_alignment_samtools.stdout.log")
shell:
"(samtools index {input.bam} {output.bai};) \
1> {log.stdout} 2> {log.stderr}"
rule star_rpm:
''' Create stranded bedgraph coverage with STARs RPM normalisation '''
input:
bam = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam"),
bai = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam.bai")
output:
str1 = (os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.Unique.str1.out.bg"),
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg")),
str2 = (os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.Unique.str2.out.bg"),
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str2.out.bg"))
params:
out_dir = lambda wildcards, output: os.path.dirname(output.str1[0]),
prefix = lambda wildcards, output: os.path.join(os.path.dirname(output.str1[0]),
str(wildcards.sample) + "_"),
stranded = "Stranded"
singularity:
"docker://zavolab/star:2.7.3a-slim"
log:
stderr = os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
"star_rpm_single_end.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
"star_rpm_single_end.stdout.log")
threads: 4
shell:
"""
(mkdir -p {params.out_dir}; \
chmod -R 777 {params.out_dir}; \
STAR \
--runMode inputAlignmentsFromBAM \
--runThreadN {threads} \
--inputBAMfile {input.bam} \
--outWigType "bedGraph" \
--outWigStrand {params.stranded} \
--outWigNorm "RPM" \
--outFileNamePrefix {params.prefix}) \
1> {log.stdout} 2> {log.stderr}
"""
rule rename_star_rpm_for_alfa:
input:
str1 = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
str2 = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
output:
plus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{unique}",
"{sample}_Signal.{unique}.out.plus.bg"),
minus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{unique}",
"{sample}_Signal.{unique}.out.minus.bg")
params:
orientation = lambda wildcards: samples_table.loc[wildcards.sample, "kallisto_directionality"]
run:
if params['orientation'] == "--fr":
shutil.copy2(input['str1'], output['plus'])
shutil.copy2(input['str2'], output['minus'])
elif params['orientation'] == "--rf":
shutil.copy2(input['str1'], output['minus'])
shutil.copy2(input['str2'], output['plus'])
BIOPZ-Katsantoni Maria
committed
"""
Caluclate transcript integrity (TIN) score
"""
bam = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam"),
bai = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"map_genome",
BIOPZ-Katsantoni Maria
committed
"{sample}_Aligned.sortedByCoord.out.bam.bai"),
transcripts_bed12 = os.path.join(
config['output_dir'],
BIOPZ-Katsantoni Maria
committed
"full_transcripts_protein_coding.bed")
output:
TIN_score = os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"TIN",
BIOPZ-Katsantoni Maria
committed
"TIN_score.tsv")
BIOPZ-Katsantoni Maria
committed
sample = "{sample}"
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config['log_dir'],
"{seqmode}",
"{sample}",
BIOPZ-Katsantoni Maria
committed
"calculate_TIN_scores.log")
BIOPZ-Katsantoni Maria
committed
"docker://zavolab/tin_score_calculation:0.2.0-slim"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
"(tin_score_calculation.py \
-i {input.bam} \
-r {input.transcripts_bed12} \
-c 0 \
--names {params.sample} \
BIOPZ-Katsantoni Maria
committed
-n 100 > {output.TIN_score};) 2> {log.stderr}"
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
rule merge_TIN_scores:
"""
Merge TIN scores tables
"""
input:
TIN_score = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"TIN",
"TIN_score.tsv"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)])
output:
TIN_scores_merged = os.path.join(
config['output_dir'],
"TIN_scores_merged.tsv")
params:
TIN_score_merged_paths = ",".join(expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"TIN",
"TIN_score.tsv"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]))
log:
stderr = os.path.join(
config['log_dir'],
"merge_TIN_scores.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"merge_TIN_scores.stdout.log")
threads: 1
singularity:
"docker://zavolab/tin_score_calculation:0.2.0-slim"
shell:
"(tin_score_merge.py \
--input-files {params.TIN_score_merged_paths} \
--output-file {output.TIN_scores_merged}) \
1> {log.stdout} 2> {log.stderr}"
rule plot_TIN_scores:
"""
Generate TIN scores boxplots
"""
input:
TIN_scores_merged = os.path.join(
config['output_dir'],
"TIN_scores_merged.tsv"),
output:
TIN_boxplot_PNG = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.png"),
TIN_boxplot_PDF = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.pdf")
params:
TIN_boxplot_prefix = os.path.join(
config['output_dir'],
"TIN_scores_boxplot")
log:
stderr = os.path.join(
config['log_dir'],
"plot_TIN_scores.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"plot_TIN_scores.stdout.log")
threads: 1
singularity:
"docker://zavolab/tin_score_calculation:0.2.0-slim"
shell:
"(tin_score_plot.py \
--input-file {input.TIN_scores_merged} \
--output-file-prefix {params.TIN_boxplot_prefix}) \
1> {log.stdout} 2> {log.stderr}"
rule salmon_quantmerge_genes:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant",
"quant.genes.sf"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"]))
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv")
params:
salmon_dir = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
sample_name_list = expand(
"{sample}",
sample=list(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"salmon_quantmerge_genes_{salmon_merge_on}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"salmon_quantmerge_genes_{salmon_merge_on}.stdout.log")
threads: 1
singularity:
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--quants {params.salmon_dir} \
--genes \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out};) \
1> {log.stdout} 2> {log.stderr}"
rule salmon_quantmerge_transcripts:
BIOPZ-Katsantoni Maria
committed
'''
Merge gene quantifications into a single file
'''
BIOPZ-Katsantoni Maria
committed
salmon_in = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant",
"quant.sf"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
BIOPZ-Katsantoni Maria
committed
salmon_out = os.path.join(
BIOPZ-Katsantoni Maria
committed
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv")
params:
salmon_dir = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"salmon_quant"),
BIOPZ-Katsantoni Maria
committed
sample=list(samples_table.index.values),
seqmode=list(samples_table["seqmode"])),
sample_name_list = expand(
"{sample}",
sample=list(samples_table.index.values)),
salmon_merge_on = "{salmon_merge_on}"
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"salmon_quantmerge_transcripts_{salmon_merge_on}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"salmon_quantmerge_transcripts_{salmon_merge_on}.stdout.log")
threads: 1
singularity:
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quantmerge \
--quants {params.salmon_dir} \
--names {params.sample_name_list} \
--column {params.salmon_merge_on} \
BIOPZ-Katsantoni Maria
committed
--output {output.salmon_out}) \
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
1> {log.stdout} 2> {log.stderr}"
#################################################################################
### ALFA: Annotation Landscape For Aligned reads
#################################################################################
directionality = {"--fr": "fr-firststrand", "--rf": "fr-secondstrand"}
rule generate_alfa_index:
''' Generate ALFA index files from sorted GTF file '''
input:
gtf = lambda wildcards: samples_table["gtf"][samples_table["organism"]==wildcards.organism][0],
chr_len = os.path.join(
config["star_indexes"],
"{organism}",
"{index_size}",
"STAR_index",
"chrNameLength.txt"),
output:
index_stranded = os.path.join(config["alfa_indexes"],
"{organism}",
"{index_size}",
"ALFA",
"sorted_genes.stranded.ALFA_index"),
index_unstranded = os.path.join(config["alfa_indexes"],
"{organism}",
"{index_size}",
"ALFA",
"sorted_genes.unstranded.ALFA_index")
params:
genome_index = "sorted_genes",
out_dir = lambda wildcards, output: os.path.dirname(output.index_stranded)
threads: 4
singularity:
log:
os.path.join(config["log_dir"], "{organism}_{index_size}_generate_alfa_index.log")
shell:
"""
alfa -a {input.gtf} \
-g {params.genome_index} \
--chr_len {input.chr_len} \
-p {threads} \
-o {params.out_dir} &> {log}
"""
rule alfa_qc:
''' Run ALFA from stranded bedgraph files '''
input:
plus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{unique}",
"{sample}_Signal.{unique}.out.plus.bg"),
minus = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{unique}",
"{sample}_Signal.{unique}.out.minus.bg"),
gtf = lambda wildcards: os.path.join(config["alfa_indexes"],
samples_table.loc[wildcards.sample, "organism"],
str(samples_table.loc[wildcards.sample, "index_size"]),
"ALFA",
"sorted_genes.stranded.ALFA_index")
output:
biotypes = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Biotypes.pdf"),
categories = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Categories.pdf"),
table = os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"{sample}.ALFA_feature_counts.tsv")
params:
out_dir = lambda wildcards, output: os.path.dirname(output.biotypes),
alfa_orientation = lambda wildcards: directionality[samples_table.loc[wildcards.sample, "kallisto_directionality"]],
in_file_plus = lambda wildcards, input: os.path.basename(input.plus),
in_file_minus = lambda wildcards, input: os.path.basename(input.minus),
genome_index = lambda wildcards, input: os.path.abspath(os.path.join(os.path.dirname(input.gtf), "sorted_genes")),
name = "{sample}"
singularity:
log:
os.path.abspath(os.path.join(
config["log_dir"],
"{seqmode}",
"{sample}",
shell:
"""
cd {params.out_dir}; \
(alfa -g {params.genome_index} \
--bedgraph {params.in_file_plus} {params.in_file_minus} {params.name} \
-s {params.alfa_orientation}) &> {log}
"""
rule alfa_qc_all_samples:
''' Run ALFA from stranded bedgraph files on all samples '''
input:
tables = [os.path.join(
config["output_dir"],
samples_table.loc[sample1, "seqmode"],
str(sample1),
"ALFA",
sample1 + ".ALFA_feature_counts.tsv")
for sample1 in list(samples_table.index.values)]
output:
biotypes = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Biotypes.pdf"),
categories = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Categories.pdf")
params:
out_dir = lambda wildcards, output: os.path.dirname(output.biotypes)
log:
os.path.abspath(
os.path.join(config["log_dir"],
shell:
"""
(alfa -c {input.tables} -o {params.out_dir}) &> {log}
"""
rule alfa_concat_results:
input:
expand(os.path.join(
config["output_dir"],
"ALFA",
"{unique_type}",
"ALFA_plots.{annotation}.pdf"),
unique_type = ["Unique", "UniqueMultiple"],
annotation = ["Categories", "Biotypes"])
output:
expand(os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.concat.png"))
params:
density = 300
log:
os.path.abspath(
os.path.join(config["log_dir"],
"alfa_qc_all_samples.concat.log"))
singularity:
"docker://zavolab/imagemagick:7.0.8"
shell:
"""
convert -append -density {params.density} \
{input} {output} &> {log}
"""
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
rule prepare_files_for_report:
'''
Re-structure the results and add comments for MultiQC parsing
'''
input:
outdir1 = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"mate1_fastqc"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
pseudoalignment = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"quant_kallisto",
"{sample}.kallisto.pseudo.sam"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
TIN_boxplot_PNG = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.png"),
TIN_boxplot_PDF = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.pdf"),
salmon_merge_genes = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
salmon_merge_transcripts = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
star_rpm = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
samples_dir_result = directory(os.path.join(
config["output_dir"],
"samples")),
samples_dir_log = directory(os.path.join(
config["log_dir"],
"samples"))
params:
results_dir = config["output_dir"],
log_dir = config["log_dir"],
log:
stderr = os.path.join(
config["log_dir"],
"prepare_files_for_report.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"prepare_files_for_report.stdout.log")
run:
# remove "single/paired end" from the results directories
os.mkdir(output.samples_dir_result)
# copy paired end results
paired_end_dir = glob.glob(
os.path.join(
params.results_dir,
"paired_end",
"*"))
for s in paired_end_dir:
sample_name = s.split("/")[-1]
shutil.copytree(
s, \
os.path.join(
params.results_dir,
"samples",
sample_name))
single_end_dir = glob.glob(
os.path.join(
params.results_dir,
"single_end",
"*"))
for s in single_end_dir:
sample_name = s.split("/")[-1]
shutil.copytree(
s, \
os.path.join(
params.results_dir,
"samples",
sample_name))
# remove "single/paired end" from the logs directories
os.mkdir(output.samples_dir_log)
# copy paired end results
paired_end_dir = glob.glob(
os.path.join(
params.log_dir,
"paired_end",
"*"))
for s in paired_end_dir:
sample_name = s.split("/")[-1]
shutil.copytree(
s, \
os.path.join(
params.log_dir,
"samples",
sample_name))
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
single_end_dir = glob.glob(
os.path.join(
params.log_dir,
"single_end",
"*"))
for s in single_end_dir:
sample_name = s.split("/")[-1]
shutil.copytree(
s, \
os.path.join(
params.log_dir,
"samples",
sample_name))
# encapsulate salmon quantification results
all_samples_dirs = glob.glob(
os.path.join(
params.results_dir,
"samples",
"*"))
for s in all_samples_dirs:
sample_name = s.split("/")[-1]
shutil.move(
os.path.join(
s,
"salmon_quant"),
os.path.join(
s,
sample_name)
)
os.mkdir(os.path.join(
s,
"salmon_quant"))
shutil.move(
os.path.join(
s,
sample_name),
os.path.join(
s,
"salmon_quant",
sample_name)
)
# adjust FastQC results 'Filename' field:
fastq_zip_list = glob.glob(
os.path.join(
params.results_dir,
"samples",
"*",
"*_fastqc",
"*_fastqc.zip"))
for zipfile in fastq_zip_list:
sample_name = zipfile.split("/")[-3]
absolute_path_zipfile = os.path.abspath(zipfile)
zipfile_path_chunks = absolute_path_zipfile.split(os.path.sep)
dir_path_to_zipfile = os.path.sep + os.path.join(
(*zipfile_path_chunks[:-1]))
zip_f.extractall(dir_path_to_zipfile)
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
fastqc_data_f = os.path.join(
zipfile[:-4],
"fastqc_data.txt")
with open(fastqc_data_f) as f:
log_lines = f.read().splitlines()
log_lines[3] = "Filename\t" + sample_name+"|"+log_lines[3].split("\t")[1]
with open(fastqc_data_f, "w") as f:
for i in log_lines: f.write(i+"\n")
os.remove(zipfile)
# adjust Kallisto quantification logs
kallisto_logs = glob.glob(
os.path.join(
params.log_dir,
"samples",
"*",
"genome_quantification_kallisto.stderr.log"))
for kallisto_log in kallisto_logs:
with open(kallisto_log) as f:
log_lines = f.read().splitlines()
temp = log_lines[8].split(".")
log_lines[8] = temp[0] + "." + temp[2] + "." + temp[3]
with open(kallisto_log+".MODIFIED", "w") as f:
for i in log_lines: f.write(i+"\n")
# add #-comment to all cutadapt logs:
cutadapt_logs = glob.glob(
os.path.join(
params.log_dir,
"samples",
"*",
"remove_*_cutadapt.stdout.log"))
for cutadapt_log in cutadapt_logs:
sample_name = cutadapt_log.split("/")[-2]
with open(cutadapt_log) as f:
log_lines = f.read().splitlines()
log_lines[1] = log_lines[1] + " # " + sample_name
with open(cutadapt_log, "w") as f:
for i in log_lines: f.write(i+"\n")
# adjust TIN boxplots filenames for MutliQC recognition
os.rename(
input.TIN_boxplot_PNG,
os.path.join(
params.results_dir,
"TIN scores_mqc.png"))
os.rename(
input.TIN_boxplot_PDF,
os.path.join(
params.results_dir,
"TIN scores_mqc.pdf"))
# adjust alfa plot filename for MutliQC recognition
os.rename(
input.alfa_concat_out,
os.path.join(
params.results_dir,
"ALFA",
"ALFA_plots.concat_mqc.png"))
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
# remove old result directories
shutil.rmtree(
os.path.join(
params.results_dir,
"paired_end"),
ignore_errors=False,
onerror=None)
shutil.rmtree(
os.path.join(
params.results_dir,
"single_end"),
ignore_errors=False,
onerror=None)
shutil.rmtree(
os.path.join(
params.log_dir,
"paired_end"),
ignore_errors=False,
onerror=None)
shutil.rmtree(
os.path.join(
params.log_dir,
"single_end"),
ignore_errors=False,
onerror=None)
rule prepare_MultiQC_config:
'''
Prepare config for the MultiQC
'''
input:
samples_dir_result = os.path.join(
config["output_dir"],
"samples"),
samples_dir_log = os.path.join(
config["log_dir"],
config["output_dir"],
"MultiQC_config.yaml")
params:
logo_path = os.path.join(
"..",
"..",
"images",
"logo.128px.png"),
results_dir = config["output_dir"]
log:
stderr = os.path.join(
config["log_dir"],
"prepare_MultiQC_config.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"prepare_MultiQC_config.stdout.log")
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
run:
with open(output.multiqc_config, "w") as YAML:
YAML.write("---\n\n")
YAML.write("title: \"Rhea\"\n")
YAML.write("subtitle: \"RNA-Seq processing pipeline developed by the members of Zavolan Lab\"\n")
YAML.write("intro_text: \"Short analysis title from config[analysis_title]\"\n")
YAML.write("custom_logo: \""+params.logo_path+"\"\n")
YAML.write("custom_logo_url: \"https://www.biozentrum.unibas.ch/research/researchgroups/overview/unit/zavolan/research-group-mihaela-zavolan/\"\n")
YAML.write("custom_logo_title: \"ZavoLab\"\n\n")
YAML.write("report_header_info:\n")
YAML.write(" - Project Type: \"Snakemake workflow\"\n")
YAML.write(" - Analysis Type: \"RNA-seq\"\n")
YAML.write(" - Analysis Author: \"config[author_name]\"\n")
YAML.write(" - Contact E-mail: \"config[author_email]\"\n\n")
YAML.write("top_modules:\n\n")
YAML.write(" - fastqc:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/mate1_fastqc/*\"\n")
YAML.write(" - \"*/mate2_fastqc/*\"\n")
YAML.write("\n")
YAML.write(" - cutadapt:\n")
YAML.write(" name: \"Cutadapt: adapter removal\"\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/remove_adapters_cutadapt.stdout.log\"\n")
YAML.write("\n")
YAML.write(" - cutadapt:\n")
YAML.write(" name: \"Cutadapt: polyA tails removal\"\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/remove_polya_cutadapt.stdout.log\"\n")
YAML.write("\n")
YAML.write(" - star:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/map_genome/*\"\n")
YAML.write("\n")
YAML.write(" - alfa:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/ALFA_plots.concat_mqc.png\"\n")
YAML.write("\n")
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
YAML.write(" - TIN_scores:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/TIN scores_mqc.png\"\n")
YAML.write("\n")
YAML.write(" - salmon:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/salmon_quant/*\"\n")
YAML.write("\n")
YAML.write(" - kallisto:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/genome_quantification_kallisto.stderr.log.MODIFIED\"\n")
YAML.write("\n")
YAML.write("...")
rule MULTIQC_report:
'''
Create report with MultiQC
'''
input:
multiqc_config = os.path.join(
config["output_dir"],
"MultiQC_config.yaml")
output:
MultiQC_report = directory(os.path.join(
config["output_dir"],
"multiqc_summary"))
params:
results_dir = config["output_dir"],
log_dir = config["log_dir"]
log:
stderr = os.path.join(
config["log_dir"],
"MULTIQC_report.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"MULTIQC_report.stdout.log")
singularity:
"docker://ewels/multiqc:1.7"
shell:
"""
multiqc \
--outdir {output.MultiQC_report} \
--config {input.multiqc_config} \
{params.results_dir} \
{params.log_dir} \
1> {log.stdout} 2> {log.stderr}