From dda2fb8d082ce9947e74c20cff85546de5c084fa Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch> Date: Fri, 20 Dec 2019 15:34:14 +0100 Subject: [PATCH] Paired end sequencing analysis subpipeline --- process_data/paired_end.snakemake | 456 ++++++++++++++++++++++++++++++ 1 file changed, 456 insertions(+) create mode 100644 process_data/paired_end.snakemake diff --git a/process_data/paired_end.snakemake b/process_data/paired_end.snakemake new file mode 100644 index 0000000..820a59a --- /dev/null +++ b/process_data/paired_end.snakemake @@ -0,0 +1,456 @@ + +rule fastqc: + '''A quality control tool for high throughput sequence data''' + + input: + reads1 = os.path.join(get_fq_1("{sample}")), + reads2 = os.path.join(get_fq_2("{sample}")) + output: + outdir1 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate1_fastqc")), + outdir2 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate2_fastqc")) + singularity: + "docker://zavolab/fastqc:0.11.8" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "fastqc.log") + shell: + "(mkdir -p {output.outdir1}; \ + mkdir -p {output.outdir2}; \ + fastqc --outdir {output.outdir1} {input.reads1}; \ + fastqc --outdir {output.outdir2} {input.reads2}) &> {log}" + + +rule htseq_qa: + '''Assess the technical quality of a run''' + input: + reads1 = os.path.join(get_fq_1("{sample}")), + reads2 = os.path.join(get_fq_2("{sample}")) + output: + qual_pdf_mate1 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate1.pdf"), + qual_pdf_mate2 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate2.pdf") + singularity: + "docker://zavolab/python_htseq:3.6.5_0.10.0" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "htseq_qa_.log") + shell: + "(htseq-qa -t fastq -o {output.qual_pdf_mate1} {input.reads1}; \ + htseq-qa -t fastq -o {output.qual_pdf_mate2} {input.reads2}; ) &> {log}" + + +rule remove_adapters_cutadapt: + '''Remove adapters''' + input: + reads1 = os.path.join(get_fq_1("{sample}")), + reads2 = os.path.join(get_fq_2("{sample}")) + output: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz"), + + reads2 = os.path.join( + config["output_dir"], + "{sample}", + "{sample}.remove_adapters_mate2.fastq.gz") + params: + adapter_3_mate1 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq1_3p'], + adapter_5_mate1 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq1_5p'], + adapter_3_mate2 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq2_3p'], + adapter_5_mate2 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq2_5p'] + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "remove_adapters_cutadapt.log") + shell: + "(cutadapt \ + -e 0.1 \ + -j {threads} \ + --pair-filter=both \ + -m 10 \ + -n 3 \ + -a {params.adapter_3_mate1} \ + -g {params.adapter_5_mate1} \ + -A {params.adapter_3_mate2} \ + -G {params.adapter_5_mate2} \ + -o {output.reads1} \ + -p {output.reads2} \ + {input.reads1} \ + {input.reads2}) &> {log}" + + +rule remove_polya_cutadapt: + '''Remove polyA tails''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate2.fastq.gz") + output: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz") + params: + polya_3_mate1 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq1_polya'], + polya_3_mate2 = lambda wildcards: + sample_table.loc[wildcards.sample, 'fq2_polya'], + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "remove_polya_cutadapt.log") + shell: + '(cutadapt \ + --match-read-wildcards \ + -j {threads} \ + --pair-filter=both \ + -m 10 \ + -n 2 \ + -e 0.1 \ + -q 6 \ + -m 10 \ + -a {params.polya_3_mate1} \ + -A {params.polya_3_mate2} \ + -o {output.reads1} \ + -p {output.reads2} \ + {input.reads1} \ + {input.reads2}) &> {log}' + + +rule create_index_star: + ''' Create index using STAR''' + input: + genome = sample_table.loc["{sample}", 'genome'], + gtf = sample_table.loc["{sample}", 'gtf'] + output: + chromosome_info = os.path.join( + config["star_indexes"], + sample_table.loc[wildcards.sample, "organism"], + sample_table.loc["{sample}", "index_size"], + "STAR_index", + "chrNameLength.txt"), + chromosomes_names = os.path.join( + config["star_indexes"], + sample_table.loc["{sample}", "organism"], + sample_table.loc["{sample}", "index_size"], + "STAR_index", + "chrName.txt") + params: + output_dir = lambda wildcards: + os.path.join( + config["star_indexes"], + sample_table.loc[wildcards.sample, "organism"], + sample_table.loc[wildcards.sample, "index_size"], + "STAR_index"), + outFileNamePrefix = os.path.join( + config["star_indexes"], + sample_table.loc[wildcards.sample, "organism"], + sample_table.loc[wildcards.sample, "index_size"], + "STAR_index/STAR_"), + sjdbOverhang = lambda wildcards: + sample_table.loc[wildcards.sample, "index_size"] + singularity: + "docker://zavolab/star:2.6.0a" + threads: 12 + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "create_index_star.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) &> {log}" + + +rule map_genome_star: + '''Map to genome using STAR''' + input: + index = os.path.join( + config["star_indexes"], + sample_table.loc["{sample}", "organism"], + sample_table.loc["{sample}", "index_size"], + "STAR_index", + "chrNameLength.txt"), + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz") + output: + bam = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam"), + logfile = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Log.final.out") + params: + sample_id = "{sample}", + index = lambda wildcards: + os.path.join( + config["star_indexes"], + sample_table.loc[wildcards.sample, "index_size"], + "STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_"), + multimappers = lambda wildcards: + sample_table.loc[wildcards.sample, "mulitmappers"], + soft_clip = lambda wildcards: + sample_table.loc[wildcards.sample, "soft_clip"], + pass_mode = lambda wildcards: + sample_table.loc[wildcards.sample, "pass_mode"] + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "map_genome_star.log") + + shell: + "(STAR \ + --runMode alignReads \ + --twopassMode {params.pass_mode} \ + --runThreadN {threads} \ + --genomeDir {params.index} \ + --readFilesIn {input.reads1} {input.reads2} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_SortedByCoordinate \ + --outSAMtype BAM SortedByCoordinate \ + --outFilterMismatchNoverLmax 0.04 \ + --outFilterScoreMinOverLread 0.3 \ + --outFilterMatchNminOverLread 0.3 \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample} \ + --alignEndsType {params.soft_clip}} > {output.bam};) &> {log}" + + +rule index_genomic_alignment_samtools: + '''Index the genomic alignment''' + input: + bam = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam"), + output: + bai = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam.bai"), + singularity: + "docker://zavolab/samtools:1.8" + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "index_genomic_alignment_samtools.log") + + shell: + "(samtools index {input.bam} {output.bai};) &> {log}" + + +rule create_index_salmon: + '''Create index for salmon quantification''' + input: + transcriptome = sample_table.loc["{sample}", 'tr_fasta_filtered'] + output: + index = os.path.join( + config["salmon_indexes"], + sample_table["{sample}", 'organism'], + "salmon.idx") + params: + kmerLen = lambda wildcards: + sample_table.loc[wildcards.sample, 'kmer'] + singularity: + "docker://zavolab/salmon:0.11.0" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "create_index_salmon.log") + threads: 8 + shell: + "(salmon index \ + --t {input.transcriptome} \ + --i {output.index} \ + --k {params.kmerLen} \ + --threads {threads}) &> {log}" + + +rule quantification_salmon: + '''Quantification at transcript and gene level using Salmon''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz"), + gtf = sample_table["{sample}", 'gtf_filtered'], + index = os.path.join( + config["salmon_indexes"], + sample_table["{sample}", 'organism'], + "salmon.idx") + output: + gn_estimates = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant", + "quant.genes.sf"), + tr_estimates = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant", + "quant.sf") + params: + output_dir = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant"), + libType = lambda wildcards: + sample_table.loc[wildcards.sample, 'libtype'] + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_salmon.log") + threads: 6 + singularity: + "docker://zavolab/salmon:0.11.0" + shell: + "(salmon quant \ + --libType {params.libType} \ + --seqBias \ + --validateMappings \ + --threads {threads} \ + --writeUnmappedNames \ + --index {input.index} \ + --geneMap {input.gtf} \ + -1 {input.reads1} \ + -2 {input.reads2} \ + -o {params.output_dir}) &> {log}" + + +rule create_index_kallisto: + '''Create index for running Kallisto''' + input: + transcriptome = sample_table[,"{sample}", 'tr_fasta_filtered'] + output: + index = os.path.join( + config["kallisto_indexes"], + sample_table["{sample}", 'organism'], + "kallisto.idx") + params: + output_dir = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + sample_table[wildcards.sample, 'organism']) + singularity: + "docker://zavolab/kallisto:0.9" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "create_index_kallisto.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + kallisto index -i {output.index} {input.transcriptome}) &> {log}" + + +rule genome_quantification_kallisto: + '''Quantification at transcript and gene level using Kallisto''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz"), + index = os.path.join( + config["kallisto_indexes"], + sample_table["{sample}", 'organism'], + "kallisto.idx") + output: + pseudoalignment = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "quant_kallisto", + "{sample}.kallisto.pseudo.sam") + params: + output_dir = lambda wildcards: + os.path.join( + config["output_dir"], + "paired_end", + wildcards.sample, + "quant_kallisto"), + directionality = lambda wildcards: + sample_table.loc[wildcards.sample, "kallisto_directionality"] + singularity: + "docker://zavolab/kallisto:0.9" + threads: 8 + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_kallisto.log") + shell: + "(kallisto quant \ + -i {input.index} \ + -o {params.output_dir} \ + --pseudobam \ + --{params.directionality}-stranded \ + {input.reads1} {input.reads2} > {output.pseudoalignment}) &> {log}" + + + + + + + + -- GitLab