Skip to content
Snippets Groups Projects
single_end.snakefile 10.6 KiB
Newer Older

rule fastqc:
	''' A quality control tool for high throughput sequence data. '''
	input:
		reads = os.path.join(get_fq_1("{sample}"))
	output:
		outdir = directory(os.path.join(config["output_dir"], "single_end", "{sample}", "fastqc"))
	singularity:
		"docker://zavolab/fastqc:0.11.8"
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "fastqc.log")
	shell:
		"(mkdir -p {output.outdir}; \
		fastqc \
		--outdir {output.outdir} \
		{input.reads}) &> {log}"


rule htseq_qa:
	''' Assess the technical quality of a run. '''
	input:
		reads = os.path.join(get_fq_1("{sample}"))
	output:
		qual_pdf = os.path.join(config["output_dir"], "single_end", "{sample}", "htseq_quality.pdf")
	singularity:
		"docker://zavolab/python_htseq:3.6.5_0.10.0"
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "htseq_qa.log")
	shell:
		"(htseq-qa \
		-t fastq \
		-o {output.qual_pdf} \
		{input.reads} ) &> {log}"


rule remove_adapters_cutadapt:
	''' Remove adapters '''
	input:
		reads = os.path.join(get_fq_1("{sample}"))
	output:
		reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters.fastq.gz")
	params:
		adapters_3 = lambda wildcards: 
			sample_table.loc[wildcards.sample, 'fq1_3p']
		adapters_5 = lambda wildcards: 
			sample_table.loc[wildcards.sample, 'fq1_5p']

	singularity:
		"docker://zavolab/cutadapt:1.16"
	threads: 8
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "remove_adapters_cutadapt.log")
	shell:
		"cutadapt \
		-e 0.1 \
		-O 1 \
		-j {threads} \
		-m 10 \
		-n 3 \
		-a {params.adapters_3} \
		-g {params.adapters_5} \
		-o {output.reads} \
		{input.reads}) &> {log}"


rule remove_polya_cutadapt:
	''' Remove ployA  tails'''
	input:
		reads = os.path.join(get_fq_1("{sample}"))
	output:
		reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz")
	params:
		polya_3 = 
		adapters_3 = lambda wildcards: 
			sample_table.loc[wildcards.sample, "fq1_polya"]
	singularity:
		"docker://zavolab/cutadapt:1.16"
	threads: 8
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "remove_polya_cutadapt.log")
	shell:
		"(cutadapt \
		--match-read-wildcards \
		-j {threads} \
		-n 2 \
		-e 0.1 \
		-O 1 \
		-q 6 \
		-m 10  \
		-a {params.polya_3} \
		-o {output.reads} \
		{input.reads}) &> {log}"


rule create_index_star:
	''' Create index using STAR. '''
	input:
		genome = sample_table.loc["{sample}", "genome"],
		gtf = sample_table.loc["{sample}", "gtf"]
	output:
		chromosome_info = os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"],
					sample_table.loc["{sample}", "index_size"], 
					"STAR_index","chrNameLength.txt"),
		chromosome_names = os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"], 
					sample_table.loc["{sample}", "index_size"], 
					"STAR_index", "chrName.txt")
	params:
		output_dir = lambda wildcards:
				os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"],
					sample_table.loc[wildcards.sample, "index_size"], 
					"STAR_index"),
		outFileNamePrefix = lambda wildcards:
				os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"],
					sample_table.loc[wildcards.sample, "index_size"], 
					"STAR_index/STAR_"),
		sjdbOverhang = lambda wildcards:
			sample_table.loc[wildcards.sample, "index_size"]
	singularity:
		"docker://zavolab/star:2.6.0a"
	threads: 12
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "create_index_star.log")
	shell:
		"(mkdir -p {params.output_dir}; \
		chmod -R 777 {params.output_dir}; \
		STAR \
		--runMode genomeGenerate \
		--sjdbOverhang {params.sjdbOverhang} \
		--genomeDir {params.output_dir} \
		--genomeFastaFiles {input.genome} \
		--runThreadN {threads} \
		--outFileNamePrefix {params.outFileNamePrefix} \
		--sjdbGTFfile {input.gtf}) &> {log}"


rule map_genome_star:
	''' Map to genome using STAR. '''
	input:
		index = os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"],
					sample_table.loc["{sample}", "index_size"], 
					"STAR_index","chrNameLength.txt"),
		reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz")
	output:
		bam = os.path.join(config["output_dir"], "single_end", 
			"{sample}", 
			"map_genome", 
			"{sample}_Aligned.sortedByCoord.out.bam"),
		logfile = os.path.join(config["output_dir"], "single_end", 
			"{sample}", 
			"map_genome", 
			"{sample}_Log.final.out")
	params:
		sample_id = "{sample}"
		index = lambda wildcards:
				os.path.join(
					config["star_indexes"],
					sample_table.loc["{sample}", "organism"],
					sample_table.loc[wildcards.sample, "index_size"], 
					"STAR_index"),
		outFileNamePrefix = lambda wildcards:
				os.path.join(
					config["output_dir"], 
					"single_end", 
					"{sample}", "map_genome", "{sample}_"),
		multimappers = lambda wildcards:
				sample_table.loc[wildcards.sample, "multimappers"],
		soft_clip = lambda wildcards:
				sample_table.loc[wildcards.sample, "soft_clip"],
		pass_mode = lambda wildcards:
				sample_table.loc[wildcards.sample, "pass_mode"],		
	singularity:
		"docker://zavolab/star:2.6.0a"
	threads: 12
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "map_genome_star.log")
	shell:
		"(STAR \
		--runMode alignReads \
		-- twopassMode {params.pass_mode} \
		--runThreadN {threads} \
		--genomeDir {params.index} \
		--readFilesIn {input.reads} \
		--readFilesCommand zcat \
		--outSAMunmapped None  \
		--outFilterMultimapNmax {params.multimappers} \
		--outFilterMultimapScoreRange 1 \
		--outFileNamePrefix {params.outFileNamePrefix} \
		--outSAMattributes All \
		--outStd BAM_SortedByCoordinate \
		--outSAMtype BAM SortedByCoordinate \
		--outFilterMismatchNoverLmax 0.04 \
		--outFilterScoreMinOverLread 0.3 \
		--outFilterMatchNminOverLread 0.3 \
		--outFilterType BySJout \
		--outReadsUnmapped None \
		--outSAMattrRGline ID:rcrunch SM:{params.sample_id} \
		--alignEndsType {params.soft_clip}} > {output.bam};) &> {log}"


rule index_genomic_alignment_samtools:
    '''Index genome bamfile using samtools.'''
    input:
        bam = os.path.join(config["output_dir"], 
        	"single_end", 
        	"{sample}", 
			"map_genome", 
			"{sample}_Aligned.sortedByCoord.out.bam")
    output:
        bai = os.path.join(config["output_dir"], 
        	"single_end", 
			"{sample}", 
			"map_genome", 
			"{sample}_Aligned.sortedByCoord.out.bam.bai")
    singularity:
        "docker://zavolab/samtools:1.8"
    threads: 1
    log:
		os.path.join(config["local_log"], "single_end", "{sample}", "index_genomic_alignment_samtools.log")
    shell:
        "(samtools index {input.bam} {output.bai};) &> {log}"


rule create_index_salmon:
	''' Create index for Salmon quantification.  '''
	input:
		transcriptome = sample_table.loc[wildcards.sample, 'tr_fasta_filtered']
	output:
		index = os.path.join(
			config["salmon_indexes"],
			sample_table["{sample}", 'organism'],
			"salmon.idx")
	params:
		kmerLen = lambda wildcards:
			sample_table.loc[wildcards.sample, 'kmer']
	singularity:
		"docker://zavolab/salmon:0.11.0"
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "create_index_salmon.log")
	threads:	8
	shell:
		"(salmon index \
		--t {input.transcriptome} \
		--i {output.index} \
		--k {params.kmerLen} \
		--threads {threads}) &> {log}"


rule quantification_salmon:
	''' Quantification at transcript and gene level using Salmon. '''
    input:
    	reads = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"{sample}.remove_polya.fastq.gz"),
        index = os.path.join(
			config["salmon_indexes"],
			sample_table["{sample}", 'organism'],
			"salmon.idx"),
       	gtf = sample_table.loc["{sample}", "gtf_filtered"]
    output:
    	gn_estimates = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"salmon_quant",
			"quant.genes.sf"),
    	tr_estimates = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"salmon_quant",
			"quant.sf")
    params:
        output_dir = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"salmon_quant"),
        libType = lambda wildcards:
     			sample_table.loc[wildcards.sample, "libtype"]
    log:
		os.path.join(config["local_log"], "single_end", "{sample}", "quantification_salmon.log")
    threads:    12
    conda:
        "envs/salmon.yaml"
    shell:
        "(salmon quant \
        --libType {params.libType} \
        --seqBias \
        --validateMappings \
        --threads {threads} \
        --writeUnmappedNames \
        --index {input.index} \
        --geneMap {input.gtf} \
        --unmatedReads {input.reads} \
        -o {params.output_dir}) &> {log}"



rule create_index_kallisto:
	''' Create index for running Kallisto. '''
	input:
		transcriptome = sample_table.loc["{sample}", 'tr_fasta_filtered']
	output:
		index = os.path.join(
			config["kallisto_indexes"],
			sample_table.loc["{sample}", "organism"],
		 	"kallisto.idx")
	params:
		output_dir = lambda wildcards:
			os.path.join(
				config["kallisto_indexes"], 
				sample_table.loc["{sample}", "organism"]),
	log:
		os.path.join(config["local_log"], "single_end", "{sample}", "create_index_kallisto.log")
	singularity:
		"docker://zavolab/kallisto:0.9"
	shell:
		"(mkdir -p {params.output_dir}; \
		chmod -R 777 {params.output_dir}; \
		kallisto index -i {output.index} {input.transcriptome}) &> {log}"

rule genome_quantification_kallisto:
	''' Quantification at transcript and gene level using Kallisto. '''
	input:
		reads = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"{sample}.remove_polya.fastq.gz"),
		index = os.path.join(
			config["kallisto_indexes"],
			sample_table.loc["{sample}", "organism"],
		 	"kallisto.idx")
	output:
		pseudoalignment = os.path.join(
			config["output_dir"], 
			"single_end", 
			"{sample}", 
			"{sample}.kallisto.pseudo.sam")
	params:
		output_dir = lambda wildcards:
			os.path.join(
				config["output_dir"], 
				"single_end", 
				"{sample}", 
				"quant_kallisto"),
		fraglen = lambda wildcards: sample_table.loc[wildcards.sample, 'mean'],
		fragsd = lambda wildcards: sample_table.loc[wildcards.sample, 'sd'],
		directionality = lambda wildcards: sample_table.loc[wildcards.sample, 'kallisto_directionality']
	threads:	    8
	log:
		os.path.join(config["local_log"],"kallisto_align_{sample}.log")
	singularity:
		"docker://zavolab/kallisto:0.9"
	shell:
		"(kallisto quant \
		-i {input.index} \
		-o {params.output_dir} \
		--single \
		-l {params.fraglen} \
		-s {params.fragsd} \
		--pseudobam \
		--{params.directionality}-stranded \
		{input.reads} > {output.pseudoalignment}) &> {log}"