Newer
Older
BIOPZ-Gypas Foivos
committed
import os
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
'''
A quality control tool for high throughput sequence data.
'''
BIOPZ-Katsantoni Maria
committed
reads = lambda wildcards:
samples_table.loc[wildcards.sample, "fq1"]
BIOPZ-Katsantoni Maria
committed
outdir = directory(os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"mate1_fastqc"))
BIOPZ-Katsantoni Maria
committed
seqmode = lambda wildcards:
samples_table.loc[wildcards.sample, "seqmode"]
singularity:
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"fastqc.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"fastqc.stdout.log")
shell:
"(mkdir -p {output.outdir}; \
fastqc \
--outdir {output.outdir} \
BIOPZ-Katsantoni Maria
committed
{input.reads};) \
1> {log.stdout} 2> {log.stderr}"
rule remove_adapters_cutadapt:
BIOPZ-Katsantoni Maria
committed
'''
Remove adapters
'''
BIOPZ-Katsantoni Maria
committed
reads = lambda wildcards:
samples_table.loc[wildcards.sample, "fq1"]
BIOPZ-Katsantoni Maria
committed
reads = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_adapters_mate1.fastq.gz")
BIOPZ-Katsantoni Maria
committed
adapters_3 = lambda wildcards:
samples_table.loc[wildcards.sample, 'fq1_3p'],
BIOPZ-Katsantoni Maria
committed
adapters_5 = lambda wildcards:
samples_table.loc[wildcards.sample, 'fq1_5p']
singularity:
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"remove_adapters_cutadapt.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"remove_adapters_cutadapt.stdout.log")
shell:
"(cutadapt \
-e 0.1 \
-O 1 \
-j {threads} \
-m 10 \
-n 3 \
-a {params.adapters_3} \
-g {params.adapters_5} \
-o {output.reads} \
BIOPZ-Katsantoni Maria
committed
{input.reads};) \
1> {log.stdout} 2> {log.stderr}"
rule remove_polya_cutadapt:
BIOPZ-Katsantoni Maria
committed
'''
Remove ployA tails
'''
BIOPZ-Katsantoni Maria
committed
reads = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_adapters_mate1.fastq.gz")
BIOPZ-Katsantoni Maria
committed
reads = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_polya_mate1.fastq.gz")
BIOPZ-Katsantoni Maria
committed
polya_3 = lambda wildcards:
samples_table.loc[wildcards.sample, "fq1_polya"]
BIOPZ-Katsantoni Maria
committed
singularity:
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"remove_polya_cutadapt.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"remove_polya_cutadapt.stdout.log")
shell:
"(cutadapt \
--match-read-wildcards \
-j {threads} \
-n 2 \
-e 0.1 \
-O 1 \
-q 6 \
-m 10 \
-a {params.polya_3} \
-o {output.reads} \
BIOPZ-Katsantoni Maria
committed
{input.reads}); \
1> {log.stdout} 2> {log.stderr}"
rule map_genome_star:
BIOPZ-Katsantoni Maria
committed
'''
Map to genome using STAR
'''
input:
index = lambda wildcards:
os.path.join(
config["star_indexes"],
str(samples_table.loc[wildcards.sample, "organism"]),
BIOPZ-Katsantoni Maria
committed
str(samples_table.loc[wildcards.sample, "index_size"]),
"STAR_index",
"chrNameLength.txt"),
reads = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_polya_mate1.fastq.gz")
BIOPZ-Katsantoni Maria
committed
bam = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam"),
BIOPZ-Katsantoni Maria
committed
logfile = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Log.final.out")
BIOPZ-Katsantoni Maria
committed
params:
sample_id = "{sample}",
index = lambda wildcards:
os.path.join(
config["star_indexes"],
str(samples_table.loc[wildcards.sample, "organism"]),
BIOPZ-Katsantoni Maria
committed
str(samples_table.loc[wildcards.sample, "index_size"]),
"STAR_index"),
outFileNamePrefix = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_"),
multimappers = lambda wildcards:
samples_table.loc[wildcards.sample, "multimappers"],
soft_clip = lambda wildcards:
samples_table.loc[wildcards.sample, "soft_clip"],
pass_mode = lambda wildcards:
samples_table.loc[wildcards.sample, "pass_mode"],
BIOPZ-Katsantoni Maria
committed
singularity:
BIOPZ-Katsantoni Maria
committed
threads: 12
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"map_genome_star.stderr.log")
shell:
"(STAR \
--runMode alignReads \
-- twopassMode {params.pass_mode} \
--runThreadN {threads} \
--genomeDir {params.index} \
--readFilesIn {input.reads} \
--readFilesCommand zcat \
--outSAMunmapped None \
--outFilterMultimapNmax {params.multimappers} \
--outFilterMultimapScoreRange 1 \
--outFileNamePrefix {params.outFileNamePrefix} \
--outSAMattributes All \
--outStd BAM_SortedByCoordinate \
--outSAMtype BAM SortedByCoordinate \
--outFilterMismatchNoverLmax 0.04 \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 \
--outFilterType BySJout \
--outReadsUnmapped None \
--outSAMattrRGline ID:rcrunch SM:{params.sample_id} \
BIOPZ-Katsantoni Maria
committed
--alignEndsType {params.soft_clip} > {output.bam};) \
2> {log.stderr}"
rule index_genomic_alignment_samtools:
BIOPZ-Katsantoni Maria
committed
'''
Index genome bamfile using samtools
'''
BIOPZ-Katsantoni Maria
committed
bam = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam")
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
bai = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam.bai")
BIOPZ-Katsantoni Maria
committed
singularity:
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"index_genomic_alignment_samtools.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"index_genomic_alignment_samtools.stdout.log")
BIOPZ-Katsantoni Maria
committed
"(samtools index {input.bam} {output.bai};) \
1> {log.stdout} 2> {log.stderr}"
rule quantification_salmon:
BIOPZ-Katsantoni Maria
committed
'''
Quantification at transcript and gene level using Salmon
'''
input:
reads = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_polya_mate1.fastq.gz"),
index = lambda wildcards:
os.path.join(
config["salmon_indexes"],
str(samples_table.loc[wildcards.sample, "organism"]),
str(samples_table.loc[wildcards.sample, "kmer"]),
"salmon.idx"),
BIOPZ-Katsantoni Maria
committed
gtf = lambda wildcards:
samples_table.loc[wildcards.sample, "gtf_filtered"]
output:
gn_estimates = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"salmon_quant",
"quant.genes.sf"),
tr_estimates = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"salmon_quant",
"quant.sf")
BIOPZ-Katsantoni Maria
committed
params:
output_dir = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"salmon_quant"),
libType = lambda wildcards:
samples_table.loc[wildcards.sample, "libtype"]
BIOPZ-Katsantoni Maria
committed
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"quantification_salmon.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"quantification_salmon.stdout.log")
threads: 12
singularity:
"docker://zavolab/salmon:1.1.0-slim"
BIOPZ-Katsantoni Maria
committed
shell:
"(salmon quant \
--libType {params.libType} \
--seqBias \
--validateMappings \
--threads {threads} \
--writeUnmappedNames \
--index {input.index} \
--geneMap {input.gtf} \
--unmatedReads {input.reads} \
BIOPZ-Katsantoni Maria
committed
-o {params.output_dir};) \
1> {log.stdout} 2> {log.stderr}"
BIOPZ-Gypas Foivos
committed
rule genome_quantification_kallisto:
BIOPZ-Katsantoni Maria
committed
'''
Quantification at transcript and gene level using Kallisto
'''
input:
reads = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"{sample}.remove_polya_mate1.fastq.gz"),
index = lambda wildcards:
os.path.join(
config["kallisto_indexes"],
samples_table.loc[wildcards.sample, "organism"],
"kallisto.idx")
BIOPZ-Katsantoni Maria
committed
output:
pseudoalignment = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"quant_kallisto",
"{sample}.kallisto.pseudo.sam")
BIOPZ-Katsantoni Maria
committed
params:
output_dir = os.path.join(
BIOPZ-Katsantoni Maria
committed
config["output_dir"],
"single_end",
"{sample}",
"quant_kallisto"),
fraglen = lambda wildcards:
samples_table.loc[wildcards.sample, 'mean'],
fragsd = lambda wildcards:
samples_table.loc[wildcards.sample, 'sd'],
directionality = lambda wildcards:
samples_table.loc[wildcards.sample, 'kallisto_directionality']
threads: 8
BIOPZ-Katsantoni Maria
committed
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"genome_quantification_kallisto.stderr.log")
singularity:
BIOPZ-Katsantoni Maria
committed
shell:
"(kallisto quant \
-i {input.index} \
-o {params.output_dir} \
--single \
-l {params.fraglen} \
-s {params.fragsd} \
--pseudobam \
{params.directionality} \
BIOPZ-Katsantoni Maria
committed
{input.reads} > {output.pseudoalignment};) \
2> {log.stderr}"
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
rule star_rpm_single_end:
''' Create stranded bedgraph coverage with STARs RPM normalisation '''
input:
bam = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam"),
bai = os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"map_genome",
"{sample}_Aligned.sortedByCoord.out.bam.bai")
output:
str1 = (os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"STAR_coverage",
"{sample}_Signal.Unique.str1.out.bg"),
os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg")),
str2 = (os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"STAR_coverage",
"{sample}_Signal.Unique.str2.out.bg"),
os.path.join(
config["output_dir"],
"single_end",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str2.out.bg"))
params:
out_dir = lambda wildcards, output: os.path.dirname(output.str1[0]),
prefix = lambda wildcards, output: os.path.join(os.path.dirname(output.str1[0]),
str(wildcards.sample) + "_"),
stranded = "Stranded"
singularity:
"docker://zavolab/star:2.7.3a-slim"
log:
stderr = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"star_rpm_single_end.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"single_end",
"{sample}",
"star_rpm_single_end.stdout.log")
threads: 4
shell:
"""
(mkdir -p {params.out_dir}; \
chmod -R 777 {params.out_dir}; \
STAR \
--runMode inputAlignmentsFromBAM \
--runThreadN {threads} \
--inputBAMfile {input.bam} \
--outWigType "bedGraph" \
--outWigStrand {params.stranded} \
--outWigNorm "RPM" \
--outFileNamePrefix {params.prefix}) \
1> {log.stdout} 2> {log.stderr}
"""