Skip to content
Snippets Groups Projects
paired_end.snakefile.smk 9.02 KiB
Newer Older
	'''A quality control tool for high throughput sequence data'''

	input:
		reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"],
		reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"]
		outdir1 = directory(os.path.join(config["output_dir"],"paired_end", "{sample}", "mate1_fastqc")),
		outdir2 = directory(os.path.join(config["output_dir"],"paired_end", "{sample}", "mate2_fastqc"))
	singularity:
		"docker://zavolab/fastqc:0.11.8"
	log:
		os.path.join(config["local_log"],"paired_end", "{sample}", "fastqc.log")
	shell:
		"(mkdir -p {output.outdir1}; \
		mkdir -p {output.outdir2}; \
		fastqc --outdir {output.outdir1} {input.reads1} & \
		fastqc --outdir {output.outdir2} {input.reads2}) &> {log}"


	'''Remove adapters'''
	input:
		reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"],
		reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"]
	output:
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_adapters_mate1.fastq.gz"),

		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_adapters_mate2.fastq.gz")
	params:
		adapter_3_mate1 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq1_3p'],
		adapter_5_mate1 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq1_5p'],
		adapter_3_mate2 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq2_3p'],
		adapter_5_mate2 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq2_5p']
	singularity:
		"docker://zavolab/cutadapt:1.16"
	threads: 8
	log:
		os.path.join( config["local_log"], "paired_end", "{sample}", "remove_adapters_cutadapt.log")
	shell:
		"(cutadapt \
		-e 0.1 \
		-j {threads} \
		--pair-filter=both \
		-m 10 \
		-n 3 \
		-a {params.adapter_3_mate1} \
		-g {params.adapter_5_mate1} \
		-A {params.adapter_3_mate2} \
		-G {params.adapter_5_mate2} \
		-o {output.reads1} \
		-p {output.reads2} \
		{input.reads1} \
		{input.reads2}) &> {log}"


	'''Remove polyA tails'''
	input:
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_adapters_mate1.fastq.gz"),
		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_adapters_mate2.fastq.gz")
	output:
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate1.fastq.gz"),
		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate2.fastq.gz")
	params:
		polya_3_mate1 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq1_polya'],
		polya_3_mate2 = lambda wildcards:
			samples_table.loc[wildcards.sample, 'fq2_polya'],
	singularity:
		"docker://zavolab/cutadapt:1.16"
	threads: 8
	log:
		os.path.join( config["local_log"], "paired_end", "{sample}", "remove_polya_cutadapt.log")
	shell:
		'(cutadapt \
		--match-read-wildcards \
		-j {threads} \
		--pair-filter=both \
		-m 10 \
		-n 2 \
		-e 0.1 \
		-q 6 \
		-m 10  \
		-a {params.polya_3_mate1} \
		-A {params.polya_3_mate2} \
		-o {output.reads1} \
		-p {output.reads2} \
		{input.reads1} \
		{input.reads2}) &> {log}'


rule pe_map_genome_star:
	'''Map to genome using STAR'''
			os.path.join(
				config["star_indexes"],
				str(samples_table.loc[wildcards.sample, "organism"]),
				str(samples_table.loc[wildcards.sample, "index_size"]),
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate1.fastq.gz"),
		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate2.fastq.gz")
	output:
		bam = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"map_genome",
			"{sample}_Aligned.sortedByCoord.out.bam"),
		logfile = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"map_genome",
			"{sample}_Log.final.out")
	params:
		sample_id = "{sample}",
		index = lambda wildcards:
			os.path.join(
				config["star_indexes"],
				str(samples_table.loc[wildcards.sample, "organism"]),
				str(samples_table.loc[wildcards.sample, "index_size"]),
				"STAR_index"),
		outFileNamePrefix = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"map_genome",
			"{sample}_"),
		multimappers = lambda wildcards:
			str(samples_table.loc[wildcards.sample, "multimappers"]),
		soft_clip = lambda wildcards:
			samples_table.loc[wildcards.sample, "soft_clip"],
		pass_mode = lambda wildcards:
			samples_table.loc[wildcards.sample, "pass_mode"]

	singularity:
		"docker://zavolab/star:2.6.0a"

	threads: 12

	log:
		os.path.join( config["local_log"], "paired_end", "{sample}", "map_genome_star.log")

	shell:
		"(STAR \
		--runMode alignReads \
		--twopassMode {params.pass_mode} \
		--runThreadN {threads} \
		--genomeDir {params.index} \
		--readFilesIn {input.reads1} {input.reads2} \
		--readFilesCommand zcat \
		--outSAMunmapped None  \
		--outFilterMultimapNmax {params.multimappers} \
		--outFilterMultimapScoreRange 1 \
		--outFileNamePrefix {params.outFileNamePrefix} \
		--outSAMattributes All \
		--outStd BAM_SortedByCoordinate \
		--outSAMtype BAM SortedByCoordinate \
		--outFilterMismatchNoverLmax 0.04 \
		--outFilterScoreMinOverLread 0.3 \
		--outFilterMatchNminOverLread 0.3 \
		--outFilterType BySJout \
		--outReadsUnmapped None \
		--outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
		--alignEndsType {params.soft_clip} > {output.bam};) &> {log}"
    '''Index the genomic alignment'''
    input:
        bam = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"map_genome",
			"{sample}_Aligned.sortedByCoord.out.bam"),
    output:
        bai = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"map_genome",
			"{sample}_Aligned.sortedByCoord.out.bam.bai"),
    singularity:
        "docker://zavolab/samtools:1.8"
    log:
    	os.path.join( config["local_log"], "paired_end", "{sample}", "index_genomic_alignment_samtools.log")

    shell:
        "(samtools index {input.bam} {output.bai};) &> {log}"


	'''Quantification at transcript and gene level using Salmon'''
	input:
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate1.fastq.gz"),
		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate2.fastq.gz"),
		gtf = lambda wildcards:
			samples_table.loc[wildcards.sample, 'gtf_filtered'],
		index = lambda wildcards:
			os.path.join(
				config["salmon_indexes"],
				str(samples_table.loc[wildcards.sample, "organism"]),
				str(samples_table.loc[wildcards.sample, "kmer"]),
	output:		
		gn_estimates = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"salmon_quant",
			"quant.genes.sf"),
		tr_estimates = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"salmon_quant",
			"quant.sf")
	params:
		output_dir = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"salmon_quant"),
		libType = lambda wildcards:
			samples_table.loc[wildcards.sample, 'libtype']
	log:
		os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_salmon.log")
	threads:	6
	singularity:
		"docker://zavolab/salmon:0.11.0"
	shell:
		"(salmon quant \
        --libType {params.libType} \
        --seqBias \
        --validateMappings \
        --threads {threads} \
        --writeUnmappedNames \
        --index {input.index} \
        --geneMap {input.gtf} \
        -1 {input.reads1} \
        -2 {input.reads2} \
        -o {params.output_dir}) &> {log}"


	'''Quantification at transcript and gene level using Kallisto'''
	input:
		reads1 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate1.fastq.gz"),
		reads2 = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"{sample}.remove_polya_mate2.fastq.gz"),
		index = lambda wildcards:
			os.path.join(
				config["kallisto_indexes"],
				samples_table.loc[wildcards.sample, 'organism'],
				"kallisto.idx")
	output:
		pseudoalignment = os.path.join(
			config["output_dir"],
			"paired_end",
			"{sample}",
			"quant_kallisto",
			"{sample}.kallisto.pseudo.sam")
	params:
		output_dir = os.path.join(
				config["output_dir"],
				"paired_end",
				"quant_kallisto"),
		directionality = lambda wildcards:
			samples_table.loc[wildcards.sample, "kallisto_directionality"]
		"docker://zavolab/kallisto:0.46.1"
	threads:	8
	log:
		os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_kallisto.log")
	shell:
		"(kallisto quant \
		-i {input.index} \
		-o {params.output_dir} \
		--pseudobam \
		{params.directionality} \
		{input.reads1} {input.reads2} > {output.pseudoalignment}) &> {log}"