From 44d3f3e31b0ba564fda14aaf334abf99ecf3655c Mon Sep 17 00:00:00 2001 From: fgypas <fgypas@gmail.com> Date: Fri, 20 Dec 2019 18:25:22 +0100 Subject: [PATCH] Rename snakemake/paired_end.snakemake to snakemake/paired_end.snakefile. Fix wiring of rules. Add fake tests. --- snakemake/Snakefile | 126 +++++++++- snakemake/config.yaml | 8 +- ...red_end.snakemake => paired_end.snakefile} | 188 ++++---------- snakemake/single_end.snakefile | 238 ++++++------------ tests/samples.tsv | 3 + 5 files changed, 245 insertions(+), 318 deletions(-) rename snakemake/{paired_end.snakemake => paired_end.snakefile} (64%) create mode 100644 tests/samples.tsv diff --git a/snakemake/Snakefile b/snakemake/Snakefile index e0b8f6f..b541c1e 100644 --- a/snakemake/Snakefile +++ b/snakemake/Snakefile @@ -1,20 +1,132 @@ +configfile: "config.yaml" + +################################################################################ +### python modules +################################################################################ + +import os +import sys import pandas as pd -configfile: "config.yaml" +############################ + +samples_table = pd.read_csv(config["samples"], header=0, index_col=0, comment='#', engine='python', sep="\t") localrules: finish +################################################################################## +# Execution dependend on sequencing mode +################################################################################## + +include: 'paired_end.snakefile' +include: 'single_end.snakefile' + ################################################################################# ### Final rule ################################################################################# rule finish: input: - final_sample = expand() + outdir1 = expand(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate1_fastqc"), sample=samples_table.index.values), + outdir2 = expand(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate2_fastqc"), sample=samples_table.index.values) -################################################################################## -# Execution dependend on sequencing mode -################################################################################## -include: 'paired_end.snakefile' -include: 'single_end.snakefile' +rule create_index_star: + ''' Create index using STAR''' + input: + genome = lambda wildcards: samples_table.loc[wildcards.sample, 'genome'], + gtf = lambda wildcards: samples_table.loc[wildcards.sample, 'gtf'] + output: + chromosome_info = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrNameLength.txt"), + chromosomes_names = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrName.txt") + params: + output_dir = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index"), + outFileNamePrefix = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index/STAR_"), + sjdbOverhang = lambda wildcards: + samples_table[wildcards.sample, "index_size"], + singularity: + "docker://zavolab/star:2.6.0a" + threads: 12 + log: + os.path.join( config["local_log"], "{organism}_{index_size}_create_index_star.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) &> {log}" + + +rule create_index_salmon: + '''Create index for salmon quantification''' + input: + transcriptome = lambda wildcards: + samples_table.loc[wildcards.sample, 'tr_fasta_filtered'] + output: + index = os.path.join( + config["salmon_indexes"], + "{organism}", + "salmon.idx") + params: + kmerLen = lambda wildcards: + samples_table.loc[wildcards.sample, 'kmer'] + singularity: + "docker://zavolab/salmon:0.11.0" + log: + os.path.join(config["local_log"], "{organism}_create_index_salmon.log") + threads: 8 + shell: + "(salmon index \ + --t {input.transcriptome} \ + --i {output.index} \ + --k {params.kmerLen} \ + --threads {threads}) &> {log}" + + +rule create_index_kallisto: + '''Create index for running Kallisto''' + input: + transcriptome = lambda wildcards: + samples_table.loc[wildcards.sample, 'tr_fasta_filtered'] + output: + index = os.path.join( + config["kallisto_indexes"], + "{organism}", + "kallisto.idx") + params: + output_dir = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + samples_table[wildcards.sample, 'organism']) + singularity: + "docker://zavolab/kallisto:0.9" + log: + os.path.join(config["local_log"], "{organism}_create_index_kallisto.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + kallisto index -i {output.index} {input.transcriptome}) &> {log}" + diff --git a/snakemake/config.yaml b/snakemake/config.yaml index 65bdfa9..80641a9 100644 --- a/snakemake/config.yaml +++ b/snakemake/config.yaml @@ -16,12 +16,14 @@ database_path: "/scicore/home/zavolan/GROUP/Rna_Seq_pipeline/Blabla" STAR_idx_folder: "STAR_indices" output_dir: "results" + star_indexes: "results" + salmon_indexes: "results" + kallisto_indexes: "results" local_log: "logs/local_log" cluster_log: "logs/cluster_log" + ############################################################################## ### Sample info ############################################################################## - input_dir: "samples" - sample: ["test"] - test: {libType: A, fldMean: 300, fldSD: 100} + samples: "../tests/samples.tsv" ... diff --git a/snakemake/paired_end.snakemake b/snakemake/paired_end.snakefile similarity index 64% rename from snakemake/paired_end.snakemake rename to snakemake/paired_end.snakefile index b9443eb..ebc283a 100644 --- a/snakemake/paired_end.snakemake +++ b/snakemake/paired_end.snakefile @@ -1,13 +1,15 @@ -rule fastqc: +rule pe_fastqc: '''A quality control tool for high throughput sequence data''' input: - reads1 = os.path.join(get_fq_1("{sample}")), - reads2 = os.path.join(get_fq_2("{sample}")) + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] output: outdir1 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate1_fastqc")), outdir2 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate2_fastqc")) + threads: + 2 singularity: "docker://zavolab/fastqc:0.11.8" log: @@ -15,15 +17,14 @@ rule fastqc: shell: "(mkdir -p {output.outdir1}; \ mkdir -p {output.outdir2}; \ - fastqc --outdir {output.outdir1} {input.reads1}; \ + fastqc --outdir {output.outdir1} {input.reads1}; & \ fastqc --outdir {output.outdir2} {input.reads2}) &> {log}" - -rule htseq_qa: +rule pe_htseq_qa: '''Assess the technical quality of a run''' input: - reads1 = os.path.join(get_fq_1("{sample}")), - reads2 = os.path.join(get_fq_2("{sample}")) + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] output: qual_pdf_mate1 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate1.pdf"), qual_pdf_mate2 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate2.pdf") @@ -38,11 +39,11 @@ rule htseq_qa: htseq-qa -t fastq -o {output.qual_pdf_mate2} {input.reads2}; ) &> {log}" -rule remove_adapters_cutadapt: +rule pe_remove_adapters_cutadapt: '''Remove adapters''' input: - reads1 = os.path.join(get_fq_1("{sample}")), - reads2 = os.path.join(get_fq_2("{sample}")) + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] output: reads1 = os.path.join( config["output_dir"], @@ -56,13 +57,13 @@ rule remove_adapters_cutadapt: "{sample}.remove_adapters_mate2.fastq.gz") params: adapter_3_mate1 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq1_3p'], + samples_table.loc[wildcards.sample, 'fq1_3p'], adapter_5_mate1 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq1_5p'], + samples_table.loc[wildcards.sample, 'fq1_5p'], adapter_3_mate2 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq2_3p'], + samples_table.loc[wildcards.sample, 'fq2_3p'], adapter_5_mate2 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq2_5p'] + samples_table.loc[wildcards.sample, 'fq2_5p'] singularity: "docker://zavolab/cutadapt:1.16" threads: 8 @@ -85,7 +86,7 @@ rule remove_adapters_cutadapt: {input.reads2}) &> {log}" -rule remove_polya_cutadapt: +rule pe_remove_polya_cutadapt: '''Remove polyA tails''' input: reads1 = os.path.join( @@ -111,9 +112,9 @@ rule remove_polya_cutadapt: "{sample}.remove_polya_mate2.fastq.gz") params: polya_3_mate1 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq1_polya'], + samples_table.loc[wildcards.sample, 'fq1_polya'], polya_3_mate2 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq2_polya'], + samples_table.loc[wildcards.sample, 'fq2_polya'], singularity: "docker://zavolab/cutadapt:1.16" threads: 8 @@ -137,65 +138,16 @@ rule remove_polya_cutadapt: {input.reads2}) &> {log}' -rule create_index_star: - ''' Create index using STAR''' +rule pe_map_genome_star: + '''Map to genome using STAR''' input: - genome = sample_table.loc["{sample}", 'genome'], - gtf = sample_table.loc["{sample}", 'gtf'] - output: - chromosome_info = os.path.join( - config["star_indexes"], - sample_table.loc[wildcards.sample, "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index", - "chrNameLength.txt"), - chromosomes_names = os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index", - "chrName.txt") - params: - output_dir = lambda wildcards: + index = lambda wildcards: os.path.join( config["star_indexes"], - sample_table.loc[wildcards.sample, "organism"], - sample_table.loc[wildcards.sample, "index_size"], - "STAR_index"), - outFileNamePrefix = os.path.join( - config["star_indexes"], - sample_table.loc[wildcards.sample, "organism"], - sample_table.loc[wildcards.sample, "index_size"], - "STAR_index/STAR_"), - sjdbOverhang = lambda wildcards: - sample_table.loc[wildcards.sample, "index_size"] - singularity: - "docker://zavolab/star:2.6.0a" - threads: 12 - log: - os.path.join( config["local_log"], "paired_end", "{sample}", "create_index_star.log") - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - STAR \ - --runMode genomeGenerate \ - --sjdbOverhang {params.sjdbOverhang} \ - --genomeDir {params.output_dir} \ - --genomeFastaFiles {input.genome} \ - --runThreadN {threads} \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --sjdbGTFfile {input.gtf}) &> {log}" - - -rule map_genome_star: - '''Map to genome using STAR''' - input: - index = os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index", - "chrNameLength.txt"), + samples_table.loc[wildcards.sample, "organism"], + samples_table.loc[wildcards.sample, "index_size"], + "STAR_index", + "chrNameLength.txt"), reads1 = os.path.join( config["output_dir"], "paired_end", @@ -224,7 +176,7 @@ rule map_genome_star: index = lambda wildcards: os.path.join( config["star_indexes"], - sample_table.loc[wildcards.sample, "index_size"], + samples_table.loc[wildcards.sample, "index_size"], "STAR_index"), outFileNamePrefix = os.path.join( config["output_dir"], @@ -233,11 +185,11 @@ rule map_genome_star: "map_genome", "{sample}_"), multimappers = lambda wildcards: - sample_table.loc[wildcards.sample, "mulitmappers"], + samples_table.loc[wildcards.sample, "mulitmappers"], soft_clip = lambda wildcards: - sample_table.loc[wildcards.sample, "soft_clip"], + samples_table.loc[wildcards.sample, "soft_clip"], pass_mode = lambda wildcards: - sample_table.loc[wildcards.sample, "pass_mode"] + samples_table.loc[wildcards.sample, "pass_mode"] singularity: "docker://zavolab/star:2.6.0a" @@ -271,7 +223,7 @@ rule map_genome_star: --alignEndsType {params.soft_clip}} > {output.bam};) &> {log}" -rule index_genomic_alignment_samtools: +rule pe_index_genomic_alignment_samtools: '''Index the genomic alignment''' input: bam = os.path.join( @@ -296,32 +248,7 @@ rule index_genomic_alignment_samtools: "(samtools index {input.bam} {output.bai};) &> {log}" -rule create_index_salmon: - '''Create index for salmon quantification''' - input: - transcriptome = sample_table.loc["{sample}", 'tr_fasta_filtered'] - output: - index = os.path.join( - config["salmon_indexes"], - sample_table["{sample}", 'organism'], - "salmon.idx") - params: - kmerLen = lambda wildcards: - sample_table.loc[wildcards.sample, 'kmer'] - singularity: - "docker://zavolab/salmon:0.11.0" - log: - os.path.join(config["local_log"], "paired_end", "{sample}", "create_index_salmon.log") - threads: 8 - shell: - "(salmon index \ - --t {input.transcriptome} \ - --i {output.index} \ - --k {params.kmerLen} \ - --threads {threads}) &> {log}" - - -rule quantification_salmon: +rule pe_quantification_salmon: '''Quantification at transcript and gene level using Salmon''' input: reads1 = os.path.join( @@ -334,11 +261,13 @@ rule quantification_salmon: "paired_end", "{sample}", "{sample}.remove_polya_mate2.fastq.gz"), - gtf = sample_table["{sample}", 'gtf_filtered'], - index = os.path.join( - config["salmon_indexes"], - sample_table["{sample}", 'organism'], - "salmon.idx") + gtf = lambda wildcards: + samples_table.loc[wildcards.sample, 'gtf_filtered'], + index = lambda wildcards: + os.path.join( + config["salmon_indexes"], + samples_table.loc[wildcards.sample, 'organism'], + "salmon.idx") output: gn_estimates = os.path.join( config["output_dir"], @@ -359,7 +288,7 @@ rule quantification_salmon: "{sample}", "salmon_quant"), libType = lambda wildcards: - sample_table.loc[wildcards.sample, 'libtype'] + samples_table.loc[wildcards.sample, 'libtype'] log: os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_salmon.log") threads: 6 @@ -379,31 +308,7 @@ rule quantification_salmon: -o {params.output_dir}) &> {log}" -rule create_index_kallisto: - '''Create index for running Kallisto''' - input: - transcriptome = sample_table[,"{sample}", 'tr_fasta_filtered'] - output: - index = os.path.join( - config["kallisto_indexes"], - sample_table["{sample}", 'organism'], - "kallisto.idx") - params: - output_dir = lambda wildcards: - os.path.join( - config["kallisto_indexes"], - sample_table[wildcards.sample, 'organism']) - singularity: - "docker://zavolab/kallisto:0.9" - log: - os.path.join(config["local_log"], "paired_end", "{sample}", "create_index_kallisto.log") - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - kallisto index -i {output.index} {input.transcriptome}) &> {log}" - - -rule genome_quantification_kallisto: +rule pe_genome_quantification_kallisto: '''Quantification at transcript and gene level using Kallisto''' input: reads1 = os.path.join( @@ -416,10 +321,11 @@ rule genome_quantification_kallisto: "paired_end", "{sample}", "{sample}.remove_polya_mate2.fastq.gz"), - index = os.path.join( - config["kallisto_indexes"], - sample_table["{sample}", 'organism'], - "kallisto.idx") + index = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + samples_table.loc[wildcards.sample, 'organism'], + "kallisto.idx") output: pseudoalignment = os.path.join( config["output_dir"], @@ -435,7 +341,7 @@ rule genome_quantification_kallisto: wildcards.sample, "quant_kallisto"), directionality = lambda wildcards: - sample_table.loc[wildcards.sample, "kallisto_directionality"] + samples_table.loc[wildcards.sample, "kallisto_directionality"] singularity: "docker://zavolab/kallisto:0.9" threads: 8 diff --git a/snakemake/single_end.snakefile b/snakemake/single_end.snakefile index 72abaec..970dec4 100644 --- a/snakemake/single_end.snakefile +++ b/snakemake/single_end.snakefile @@ -1,8 +1,8 @@ - +import os rule fastqc: ''' A quality control tool for high throughput sequence data. ''' input: - reads = os.path.join(get_fq_1("{sample}")) + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], output: outdir = directory(os.path.join(config["output_dir"], "single_end", "{sample}", "fastqc")) singularity: @@ -19,7 +19,7 @@ rule fastqc: rule htseq_qa: ''' Assess the technical quality of a run. ''' input: - reads = os.path.join(get_fq_1("{sample}")) + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"] output: qual_pdf = os.path.join(config["output_dir"], "single_end", "{sample}", "htseq_quality.pdf") singularity: @@ -36,14 +36,14 @@ rule htseq_qa: rule remove_adapters_cutadapt: ''' Remove adapters ''' input: - reads = os.path.join(get_fq_1("{sample}")) + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"] output: reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters.fastq.gz") params: adapters_3 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq1_3p'] + samples_table.loc[wildcards.sample, 'fq1_3p'], adapters_5 = lambda wildcards: - sample_table.loc[wildcards.sample, 'fq1_5p'] + samples_table.loc[wildcards.sample, 'fq1_5p'] singularity: "docker://zavolab/cutadapt:1.16" @@ -66,13 +66,12 @@ rule remove_adapters_cutadapt: rule remove_polya_cutadapt: ''' Remove ployA tails''' input: - reads = os.path.join(get_fq_1("{sample}")) + reads = lambda wildcards: samples_table[wildcards.sample, "fq1"] output: reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz") params: - polya_3 = - adapters_3 = lambda wildcards: - sample_table.loc[wildcards.sample, "fq1_polya"] + polya_3 = lambda wildcards: + samples_table.loc[wildcards.sample, "fq1_polya"] singularity: "docker://zavolab/cutadapt:1.16" threads: 8 @@ -92,63 +91,15 @@ rule remove_polya_cutadapt: {input.reads}) &> {log}" -rule create_index_star: - ''' Create index using STAR. ''' - input: - genome = sample_table.loc["{sample}", "genome"], - gtf = sample_table.loc["{sample}", "gtf"] - output: - chromosome_info = os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index","chrNameLength.txt"), - chromosome_names = os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index", "chrName.txt") - params: - output_dir = lambda wildcards: - os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc[wildcards.sample, "index_size"], - "STAR_index"), - outFileNamePrefix = lambda wildcards: - os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc[wildcards.sample, "index_size"], - "STAR_index/STAR_"), - sjdbOverhang = lambda wildcards: - sample_table.loc[wildcards.sample, "index_size"] - singularity: - "docker://zavolab/star:2.6.0a" - threads: 12 - log: - os.path.join(config["local_log"], "single_end", "{sample}", "create_index_star.log") - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - STAR \ - --runMode genomeGenerate \ - --sjdbOverhang {params.sjdbOverhang} \ - --genomeDir {params.output_dir} \ - --genomeFastaFiles {input.genome} \ - --runThreadN {threads} \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --sjdbGTFfile {input.gtf}) &> {log}" - - rule map_genome_star: ''' Map to genome using STAR. ''' input: - index = os.path.join( - config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc["{sample}", "index_size"], - "STAR_index","chrNameLength.txt"), + index = lambda wildcards: + os.path.join( + config["star_indexes"], + samples_table.loc[wildcards.sample, "organism"], + samples_table.loc[wildcards.sample, "index_size"], + "STAR_index","chrNameLength.txt"), reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz") output: bam = os.path.join(config["output_dir"], "single_end", @@ -160,12 +111,12 @@ rule map_genome_star: "map_genome", "{sample}_Log.final.out") params: - sample_id = "{sample}" + sample_id = "{sample}", index = lambda wildcards: os.path.join( config["star_indexes"], - sample_table.loc["{sample}", "organism"], - sample_table.loc[wildcards.sample, "index_size"], + samples_table.loc["{sample}", "organism"], + samples_table.loc[wildcards.sample, "index_size"], "STAR_index"), outFileNamePrefix = lambda wildcards: os.path.join( @@ -173,11 +124,11 @@ rule map_genome_star: "single_end", "{sample}", "map_genome", "{sample}_"), multimappers = lambda wildcards: - sample_table.loc[wildcards.sample, "multimappers"], + samples_table.loc[wildcards.sample, "multimappers"], soft_clip = lambda wildcards: - sample_table.loc[wildcards.sample, "soft_clip"], + samples_table.loc[wildcards.sample, "soft_clip"], pass_mode = lambda wildcards: - sample_table.loc[wildcards.sample, "pass_mode"], + samples_table.loc[wildcards.sample, "pass_mode"], singularity: "docker://zavolab/star:2.6.0a" threads: 12 @@ -208,128 +159,80 @@ rule map_genome_star: rule index_genomic_alignment_samtools: - '''Index genome bamfile using samtools.''' - input: - bam = os.path.join(config["output_dir"], - "single_end", - "{sample}", + '''Index genome bamfile using samtools.''' + input: + bam = os.path.join(config["output_dir"], + "single_end", + "{sample}", "map_genome", "{sample}_Aligned.sortedByCoord.out.bam") - output: - bai = os.path.join(config["output_dir"], - "single_end", + output: + bai = os.path.join(config["output_dir"], + "single_end", "{sample}", "map_genome", "{sample}_Aligned.sortedByCoord.out.bam.bai") - singularity: - "docker://zavolab/samtools:1.8" - threads: 1 - log: - os.path.join(config["local_log"], "single_end", "{sample}", "index_genomic_alignment_samtools.log") - shell: - "(samtools index {input.bam} {output.bai};) &> {log}" - - -rule create_index_salmon: - ''' Create index for Salmon quantification. ''' - input: - transcriptome = sample_table.loc[wildcards.sample, 'tr_fasta_filtered'] - output: - index = os.path.join( - config["salmon_indexes"], - sample_table["{sample}", 'organism'], - "salmon.idx") - params: - kmerLen = lambda wildcards: - sample_table.loc[wildcards.sample, 'kmer'] singularity: - "docker://zavolab/salmon:0.11.0" + "docker://zavolab/samtools:1.8" + threads: 1 log: - os.path.join(config["local_log"], "single_end", "{sample}", "create_index_salmon.log") - threads: 8 + os.path.join(config["local_log"], "single_end", "{sample}", "index_genomic_alignment_samtools.log") shell: - "(salmon index \ - --t {input.transcriptome} \ - --i {output.index} \ - --k {params.kmerLen} \ - --threads {threads}) &> {log}" + "(samtools index {input.bam} {output.bai};) &> {log}" rule quantification_salmon: ''' Quantification at transcript and gene level using Salmon. ''' - input: - reads = os.path.join( + input: + reads = os.path.join( config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz"), - index = os.path.join( - config["salmon_indexes"], - sample_table["{sample}", 'organism'], - "salmon.idx"), - gtf = sample_table.loc["{sample}", "gtf_filtered"] - output: - gn_estimates = os.path.join( + index = lambda wildcards: + os.path.join( + config["salmon_indexes"], + samples_table[wildcards.sample, 'organism'], + "salmon.idx"), + gtf = lambda wildcards: samples_table.loc[wildcards.sample, "gtf_filtered"] + output: + gn_estimates = os.path.join( config["output_dir"], "single_end", "{sample}", "salmon_quant", "quant.genes.sf"), - tr_estimates = os.path.join( + tr_estimates = os.path.join( config["output_dir"], "single_end", "{sample}", "salmon_quant", "quant.sf") - params: - output_dir = os.path.join( + params: + output_dir = os.path.join( config["output_dir"], "single_end", "{sample}", "salmon_quant"), - libType = lambda wildcards: - sample_table.loc[wildcards.sample, "libtype"] - log: - os.path.join(config["local_log"], "single_end", "{sample}", "quantification_salmon.log") - threads: 12 - conda: - "envs/salmon.yaml" - shell: - "(salmon quant \ - --libType {params.libType} \ - --seqBias \ - --validateMappings \ - --threads {threads} \ - --writeUnmappedNames \ - --index {input.index} \ - --geneMap {input.gtf} \ - --unmatedReads {input.reads} \ - -o {params.output_dir}) &> {log}" - - - -rule create_index_kallisto: - ''' Create index for running Kallisto. ''' - input: - transcriptome = sample_table.loc["{sample}", 'tr_fasta_filtered'] - output: - index = os.path.join( - config["kallisto_indexes"], - sample_table.loc["{sample}", "organism"], - "kallisto.idx") - params: - output_dir = lambda wildcards: - os.path.join( - config["kallisto_indexes"], - sample_table.loc["{sample}", "organism"]), + libType = lambda wildcards: + samples_table.loc[wildcards.sample, "libtype"] log: - os.path.join(config["local_log"], "single_end", "{sample}", "create_index_kallisto.log") - singularity: - "docker://zavolab/kallisto:0.9" + os.path.join(config["local_log"], "single_end", "{sample}", "quantification_salmon.log") + threads: 12 + conda: + "envs/salmon.yaml" shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - kallisto index -i {output.index} {input.transcriptome}) &> {log}" + "(salmon quant \ + --libType {params.libType} \ + --seqBias \ + --validateMappings \ + --threads {threads} \ + --writeUnmappedNames \ + --index {input.index} \ + --geneMap {input.gtf} \ + --unmatedReads {input.reads} \ + -o {params.output_dir}) &> {log}" + rule genome_quantification_kallisto: ''' Quantification at transcript and gene level using Kallisto. ''' @@ -339,10 +242,11 @@ rule genome_quantification_kallisto: "single_end", "{sample}", "{sample}.remove_polya.fastq.gz"), - index = os.path.join( - config["kallisto_indexes"], - sample_table.loc["{sample}", "organism"], - "kallisto.idx") + index = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + samples_table.loc[wildcards.sample, "organism"], + "kallisto.idx") output: pseudoalignment = os.path.join( config["output_dir"], @@ -356,9 +260,9 @@ rule genome_quantification_kallisto: "single_end", "{sample}", "quant_kallisto"), - fraglen = lambda wildcards: sample_table.loc[wildcards.sample, 'mean'], - fragsd = lambda wildcards: sample_table.loc[wildcards.sample, 'sd'], - directionality = lambda wildcards: sample_table.loc[wildcards.sample, 'kallisto_directionality'] + fraglen = lambda wildcards: samples_table.loc[wildcards.sample, 'mean'], + fragsd = lambda wildcards: samples_table.loc[wildcards.sample, 'sd'], + directionality = lambda wildcards: samples_table.loc[wildcards.sample, 'kallisto_directionality'] threads: 8 log: os.path.join(config["local_log"],"kallisto_align_{sample}.log") diff --git a/tests/samples.tsv b/tests/samples.tsv new file mode 100644 index 0000000..fd5216c --- /dev/null +++ b/tests/samples.tsv @@ -0,0 +1,3 @@ +sample fq1 fq2 +LN18C_rep1 /Users/foivosgypas/Desktop/samples/BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz /Users/foivosgypas/Desktop/samples/BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz +LN18C_rep2 /Users/foivosgypas/Desktop/samples/BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz /Users/foivosgypas/Desktop/samples/BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz -- GitLab