diff --git a/Snakefile b/Snakefile index 695d72f563aa363e773650bdb20107fa0cbfbdf8..68ac17457edc0de709f044087c31545f6136d517 100644 --- a/Snakefile +++ b/Snakefile @@ -24,135 +24,138 @@ os.makedirs( os.getcwd(), config['log_dir'], ), - exist_ok=True, -) + exist_ok=True) + if cluster_config: os.makedirs( os.path.join( os.getcwd(), os.path.dirname(cluster_config['__default__']['out']), ), - exist_ok=True, - ) + exist_ok=True) # Include subworkflows include: os.path.join("workflow", "rules", "paired_end.snakefile.smk") include: os.path.join("workflow", "rules", "single_end.snakefile.smk") -# Final rule rule finish: - """Rule for collecting outputs""" + """ + Rule for collecting outputs + """ input: outdir1 = expand( os.path.join( config['output_dir'], "{seqmode}", "{sample}", - "mate1_fastqc", - ), + "mate1_fastqc"), zip, sample=[i for i in list(samples_table.index.values)], - seqmode=[ - samples_table.loc[i, 'seqmode'] - for i in list(samples_table.index.values) - ] - ), + seqmode=[samples_table.loc[i, 'seqmode'] + for i in list(samples_table.index.values)]), salmon_gn_estimates = expand( os.path.join( config['output_dir'], "{seqmode}", "{sample}", "salmon_quant", - "quant.genes.sf", - ), + "quant.genes.sf"), zip, sample=[i for i in list(samples_table.index.values)], - seqmode=[ - samples_table.loc[i, 'seqmode'] - for i in list(samples_table.index.values) - ] - ), + seqmode=[samples_table.loc[i, 'seqmode'] + for i in list(samples_table.index.values)]), pseudoalignment = expand( os.path.join( config['output_dir'], "{seqmode}", "{sample}", "quant_kallisto", - "{sample}.kallisto.pseudo.sam", - ), + "{sample}.kallisto.pseudo.sam"), zip, sample=[i for i in list(samples_table.index.values)], - seqmode=[ - samples_table.loc[i, 'seqmode'] - for i in list(samples_table.index.values) - ] - ), + seqmode=[samples_table.loc[i, 'seqmode'] + for i in list(samples_table.index.values)]), TIN_score = expand( os.path.join( config['output_dir'], "{seqmode}", "{sample}", "TIN", - "TIN_score.tsv", - ), + "TIN_score.tsv"), zip, sample=[i for i in list(samples_table.index.values)], - seqmode=[ - samples_table.loc[i, 'seqmode'] - for i in list(samples_table.index.values) - ] - ), - salmon_merge_genes = expand(os.path.join(config["output_dir"], "summary_salmon", "quantmerge", "genes_{salmon_merge_on}.tsv"), salmon_merge_on=["tpm", "numreads"]), - salmon_merge_transcripts = expand(os.path.join(config["output_dir"], "summary_salmon", "quantmerge", "transcripts_{salmon_merge_on}.tsv"), salmon_merge_on=["tpm", "numreads"]) + seqmode=[samples_table.loc[i, 'seqmode'] + for i in list(samples_table.index.values)]), + salmon_merge_genes = expand( + os.path.join( + config["output_dir"], + "summary_salmon", + "quantmerge", + "genes_{salmon_merge_on}.tsv"), + salmon_merge_on=["tpm", "numreads"]), + salmon_merge_transcripts = expand( + os.path.join( + config["output_dir"], + "summary_salmon", + "quantmerge", + "transcripts_{salmon_merge_on}.tsv"), + salmon_merge_on=["tpm", "numreads"]) + rule create_index_star: - """Create index for STAR alignments""" + """ + Create index for STAR alignments + """ input: genome = lambda wildcards: - samples_table['genome'][ - samples_table['organism'] == wildcards.organism - ][0], + samples_table['genome'] + [samples_table['organism'] == wildcards.organism] + [0], gtf = lambda wildcards: - samples_table['gtf'][ - samples_table['organism'] == wildcards.organism - ][0], + samples_table['gtf'] + [samples_table['organism'] == wildcards.organism] + [0] + output: chromosome_info = os.path.join( 
config['star_indexes'], "{organism}", "{index_size}", "STAR_index", - "chrNameLength.txt", - ), + "chrNameLength.txt"), chromosomes_names = os.path.join( config['star_indexes'], "{organism}", "{index_size}", "STAR_index", - "chrName.txt", - ), + "chrName.txt") + params: output_dir = os.path.join( config['star_indexes'], "{organism}", "{index_size}", - "STAR_index", - ), + "STAR_index"), outFileNamePrefix = os.path.join( config['star_indexes'], "{organism}", "{index_size}", - "STAR_index/STAR_", - ), - sjdbOverhang = "{index_size}", + "STAR_index/STAR_"), + sjdbOverhang = "{index_size}" + singularity: "docker://zavolab/star:2.7.3a-slim" + threads: 12 + log: - os.path.join( + stderr = os.path.join( config['log_dir'], - "{organism}_{index_size}_create_index_star.log", - ) + "{organism}_{index_size}_create_index_star.stderr.log"), + stdout = os.path.join( + config['log_dir'], + "{organism}_{index_size}_create_index_star.stdout.log") + shell: "(mkdir -p {params.output_dir}; \ chmod -R 777 {params.output_dir}; \ @@ -163,223 +166,292 @@ rule create_index_star: --genomeFastaFiles {input.genome} \ --runThreadN {threads} \ --outFileNamePrefix {params.outFileNamePrefix} \ - --sjdbGTFfile {input.gtf}) &> {log}" + --sjdbGTFfile {input.gtf}) \ + 1> {log.stdout} 2> {log.stderr}" rule create_index_salmon: - """Create index for Salmon quantification""" + """ + Create index for Salmon quantification + """ input: transcriptome = lambda wildcards: - samples_table['tr_fasta_filtered'][ - samples_table['organism'] == wildcards.organism - ][0] + samples_table['tr_fasta_filtered'] + [samples_table['organism'] == wildcards.organism] + [0] + output: index = directory( os.path.join( config['salmon_indexes'], "{organism}", "{kmer}", - "salmon.idx", - ) - ), + "salmon.idx")) + params: - kmerLen = "{kmer}", + kmerLen = "{kmer}" + singularity: "docker://zavolab/salmon:1.1.0-slim" + log: - os.path.join( + stderr = os.path.join( config['log_dir'], - "{organism}_{kmer}_create_index_salmon.log" - ) + "{organism}_{kmer}_create_index_salmon.stderr.log"), + stdout = os.path.join( + config['log_dir'], + "{organism}_{kmer}_create_index_salmon.stdout.log") + threads: 8 + shell: "(salmon index \ --transcripts {input.transcriptome} \ --index {output.index} \ --kmerLen {params.kmerLen} \ - --threads {threads}) &> {log}" + --threads {threads}) \ + 1> {log.stdout} 2> {log.stderr}" rule create_index_kallisto: - """Create index for Kallisto quantification""" + """ + Create index for Kallisto quantification + """ input: transcriptome = lambda wildcards: - samples_table['tr_fasta_filtered'][ - samples_table['organism'] == wildcards.organism - ][0], + samples_table['tr_fasta_filtered'] + [samples_table['organism'] == wildcards.organism] + [0] + output: index = os.path.join( config['kallisto_indexes'], "{organism}", - "kallisto.idx", - ), + "kallisto.idx") + params: output_dir = os.path.join( config['kallisto_indexes'], - "{organism}", - ), + "{organism}") + singularity: "docker://zavolab/kallisto:0.46.1-slim" + log: - os.path.join( + stderr = os.path.join( config['log_dir'], - "{organism}_create_index_kallisto.log" - ) + "{organism}_create_index_kallisto.stderr.log"), + stdout = os.path.join( + config['log_dir'], + "{organism}_create_index_kallisto.stdout.log") + shell: "(mkdir -p {params.output_dir}; \ chmod -R 777 {params.output_dir}; \ - kallisto index -i {output.index} {input.transcriptome}) &> {log}" + kallisto index -i {output.index} {input.transcriptome}) \ + 1> {log.stdout} 2> {log.stderr}" rule extract_transcripts_as_bed12: - 
"""Convert transcripts to BED12 format""" + """ + Convert transcripts to BED12 format + """ input: gtf = lambda wildcards: - samples_table['gtf'][0], + samples_table['gtf'] + [0] + output: bed12 = os.path.join( config['output_dir'], - "full_transcripts_protein_coding.bed", - ), + "full_transcripts_protein_coding.bed") + singularity: "docker://zavolab/gtf_transcript_type_to_bed12:0.1.0-slim" + threads: 1 + log: - os.path.join( + stderr = os.path.join( config['log_dir'], - "extract_transcripts_as_bed12.log", - ) + "extract_transcripts_as_bed12.stderr.log") + shell: - "gtf_transcript_type_to_bed12.pl \ + "(gtf_transcript_type_to_bed12.pl \ --anno={input.gtf} \ - --type=protein_coding \ - 1> {output.bed12} \ - 2> {log}" + --type=protein_coding > {output.bed12}); \ + 2> {log.stderr}" rule calculate_TIN_scores: - """Caluclate transcript integrity (TIN) score""" + """ + Caluclate transcript integrity (TIN) score + """ input: bai = os.path.join( config['output_dir'], "{seqmode}", "{sample}", "map_genome", - "{sample}_Aligned.sortedByCoord.out.bam.bai" - ), + "{sample}_Aligned.sortedByCoord.out.bam.bai"), transcripts_bed12 = os.path.join( config['output_dir'], - "full_transcripts_protein_coding.bed" - ), + "full_transcripts_protein_coding.bed") + output: TIN_score = os.path.join( config['output_dir'], "{seqmode}", "{sample}", "TIN", - "TIN_score.tsv", - ), + "TIN_score.tsv") + params: bam = os.path.join( config['output_dir'], "{seqmode}", "{sample}", "map_genome", - "{sample}_Aligned.sortedByCoord.out.bam" - ), - sample = "{sample}", + "{sample}_Aligned.sortedByCoord.out.bam"), + sample = "{sample}" + log: - os.path.join( + stderr = os.path.join( config['log_dir'], "{seqmode}", "{sample}", - "calculate_TIN_scores.log", - ) + "calculate_TIN_scores.log") + threads: 8 + singularity: "docker://zavolab/tin_score_calculation:0.1.0-slim" + shell: - "tin_score_calculation.py \ + "(tin_score_calculation.py \ -i {params.bam} \ -r {input.transcripts_bed12} \ -c 0 \ --names {params.sample} \ - -n 100 \ - 1> {output.TIN_score} \ - 2> {log}" + -n 100 > {output.TIN_score};) 2> {log.stderr}" rule salmon_quantmerge_genes: - ''' Merge gene quantifications into a single file. 
''' + ''' + Merge gene quantifications into a single file + ''' input: - salmon_in = expand(os.path.join( - config["output_dir"], - "{seqmode}", - "{sample}", - "salmon_quant", - "quant.genes.sf"), + salmon_in = expand( + os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "salmon_quant", + "quant.genes.sf"), zip, - sample= list(samples_table.index.values), - seqmode= list(samples_table["seqmode"])), + sample=list(samples_table.index.values), + seqmode=list(samples_table["seqmode"])) + output: - salmon_out = os.path.join(config["output_dir"], "summary_salmon", "quantmerge", "genes_{salmon_merge_on}.tsv") - params: - salmon_dir = expand(os.path.join( + salmon_out = os.path.join( config["output_dir"], - "{seqmode}", - "{sample}", - "salmon_quant"), + "summary_salmon", + "quantmerge", + "genes_{salmon_merge_on}.tsv") + + params: + salmon_dir = expand( + os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "salmon_quant"), zip, - sample= list(samples_table.index.values), - seqmode= list(samples_table["seqmode"])), - sample_name_list = expand("{sample}", sample= list(samples_table.index.values)), + sample=list(samples_table.index.values), + seqmode=list(samples_table["seqmode"])), + sample_name_list = expand( + "{sample}", + sample=list(samples_table.index.values)), salmon_merge_on = "{salmon_merge_on}" + log: - os.path.join(config["log_dir"], "salmon_quantmerge_genes_{salmon_merge_on}.log") - threads: 1 + stderr = os.path.join( + config["log_dir"], + "salmon_quantmerge_genes_{salmon_merge_on}.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "salmon_quantmerge_genes_{salmon_merge_on}.stdout.log") + + threads: 1 + singularity: "docker://zavolab/salmon:1.1.0-slim" + shell: "(salmon quantmerge \ --quants {params.salmon_dir} \ --genes \ --names {params.sample_name_list} \ --column {params.salmon_merge_on} \ - --output {output.salmon_out}) &> {log}" + --output {output.salmon_out};) \ + 1> {log.stdout} 2> {log.stderr}" + rule salmon_quantmerge_transcripts: - ''' Merge gene quantifications into a single file. 
''' + ''' + Merge transcript quantifications into a single file + ''' input: - salmon_in = expand(os.path.join( - config["output_dir"], - "{seqmode}", - "{sample}", - "salmon_quant", - "quant.sf"), + salmon_in = expand( + os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "salmon_quant", + "quant.sf"), zip, - sample= list(samples_table.index.values), - seqmode= list(samples_table["seqmode"])), + sample=list(samples_table.index.values), + seqmode=list(samples_table["seqmode"])), + output: - salmon_out = os.path.join(config["output_dir"], "summary_salmon", "quantmerge", "transcripts_{salmon_merge_on}.tsv") - params: - salmon_dir = expand(os.path.join( + salmon_out = os.path.join( config["output_dir"], - "{seqmode}", - "{sample}", - "salmon_quant"), + "summary_salmon", + "quantmerge", + "transcripts_{salmon_merge_on}.tsv") + + params: + salmon_dir = expand( + os.path.join( + config["output_dir"], + "{seqmode}", + "{sample}", + "salmon_quant"), zip, - sample= list(samples_table.index.values), - seqmode= list(samples_table["seqmode"])), - sample_name_list = expand("{sample}", sample= list(samples_table.index.values)), + sample=list(samples_table.index.values), + seqmode=list(samples_table["seqmode"])), + sample_name_list = expand( + "{sample}", + sample=list(samples_table.index.values)), salmon_merge_on = "{salmon_merge_on}" + log: - os.path.join(config["log_dir"], "salmon_quantmerge_transcripts_{salmon_merge_on}.log") - threads: 1 + stderr = os.path.join( + config["log_dir"], + "salmon_quantmerge_transcripts_{salmon_merge_on}.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "salmon_quantmerge_transcripts_{salmon_merge_on}.stdout.log") + + threads: 1 + singularity: "docker://zavolab/salmon:1.1.0-slim" + shell: "(salmon quantmerge \ --quants {params.salmon_dir} \ --names {params.sample_name_list} \ --column {params.salmon_merge_on} \ - --output {output.salmon_out}) &> {log}" + --output {output.salmon_out}) \ + 1> {log.stdout} 2> {log.stderr}" diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index a0d87d17e597f1fe3bc9fcf692807e4cf8b2273e..77869565b586e138c5b555dd0052c3ccfae1912a 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -1,43 +1,73 @@ rule pe_fastqc: - '''A quality control tool for high throughput sequence data''' - + ''' + A quality control tool for high throughput sequence data + ''' input: - reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], - reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] + reads1 = lambda wildcards: + samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: + samples_table.loc[wildcards.sample, "fq2"] + output: - outdir1 = directory(os.path.join(config["output_dir"],"paired_end", "{sample}", "mate1_fastqc")), - outdir2 = directory(os.path.join(config["output_dir"],"paired_end", "{sample}", "mate2_fastqc")) - threads: - 2 + outdir1 = directory(os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "mate1_fastqc")), + outdir2 = directory(os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "mate2_fastqc")) + + threads: 2 + singularity: "docker://zavolab/fastqc:0.11.9-slim" + log: - os.path.join(config["log_dir"],"paired_end", "{sample}", "fastqc.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "fastqc.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "fastqc.stdout.log") + shell: 
"(mkdir -p {output.outdir1}; \ mkdir -p {output.outdir2}; \ - fastqc --outdir {output.outdir1} {input.reads1} & \ - fastqc --outdir {output.outdir2} {input.reads2}) &> {log}" + fastqc --outdir {output.outdir1} {input.reads1}; \ + fastqc --outdir {output.outdir2} {input.reads2}); \ + 1> {log.stdout} 2> {log.stderr}" rule pe_remove_adapters_cutadapt: - '''Remove adapters''' + ''' + Remove adapters + ''' input: - reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], - reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] + reads1 = lambda wildcards: + samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: + samples_table.loc[wildcards.sample, "fq2"] + output: reads1 = os.path.join( config["output_dir"], "paired_end", "{sample}", "{sample}.remove_adapters_mate1.fastq.gz"), - reads2 = os.path.join( config["output_dir"], "paired_end", "{sample}", "{sample}.remove_adapters_mate2.fastq.gz") + params: adapter_3_mate1 = lambda wildcards: samples_table.loc[wildcards.sample, 'fq1_3p'], @@ -47,11 +77,24 @@ rule pe_remove_adapters_cutadapt: samples_table.loc[wildcards.sample, 'fq2_3p'], adapter_5_mate2 = lambda wildcards: samples_table.loc[wildcards.sample, 'fq2_5p'] + singularity: "docker://zavolab/cutadapt:1.16-slim" + threads: 8 + log: - os.path.join( config["log_dir"], "paired_end", "{sample}", "remove_adapters_cutadapt.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "remove_adapters_cutadapt.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "remove_adapters_cutadapt.stdout.log") + shell: "(cutadapt \ -e 0.1 \ @@ -66,11 +109,14 @@ rule pe_remove_adapters_cutadapt: -o {output.reads1} \ -p {output.reads2} \ {input.reads1} \ - {input.reads2}) &> {log}" + {input.reads2}); \ + 1> {log.stdout} 2>{log.stderr}" rule pe_remove_polya_cutadapt: - '''Remove polyA tails''' + ''' + Remove polyA tails + ''' input: reads1 = os.path.join( config["output_dir"], @@ -82,6 +128,7 @@ rule pe_remove_polya_cutadapt: "paired_end", "{sample}", "{sample}.remove_adapters_mate2.fastq.gz") + output: reads1 = os.path.join( config["output_dir"], @@ -93,18 +140,32 @@ rule pe_remove_polya_cutadapt: "paired_end", "{sample}", "{sample}.remove_polya_mate2.fastq.gz") + params: polya_3_mate1 = lambda wildcards: samples_table.loc[wildcards.sample, 'fq1_polya'], polya_3_mate2 = lambda wildcards: - samples_table.loc[wildcards.sample, 'fq2_polya'], + samples_table.loc[wildcards.sample, 'fq2_polya'] + singularity: "docker://zavolab/cutadapt:1.16-slim" + threads: 8 + log: - os.path.join( config["log_dir"], "paired_end", "{sample}", "remove_polya_cutadapt.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "remove_polya_cutadapt.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "remove_adapters_cutadapt.stdout.log") + shell: - '(cutadapt \ + "(cutadapt \ --match-read-wildcards \ -j {threads} \ --pair-filter=both \ @@ -118,11 +179,14 @@ rule pe_remove_polya_cutadapt: -o {output.reads1} \ -p {output.reads2} \ {input.reads1} \ - {input.reads2}) &> {log}' + {input.reads2};) \ + 1> {log.stdout} 2>{log.stderr}" rule pe_map_genome_star: - '''Map to genome using STAR''' + ''' + Map to genome using STAR + ''' input: index = lambda wildcards: os.path.join( @@ -141,6 +205,7 @@ rule pe_map_genome_star: "paired_end", "{sample}", "{sample}.remove_polya_mate2.fastq.gz") + output: bam = os.path.join( config["output_dir"], @@ -154,6 +219,7 @@ rule 
pe_map_genome_star: "{sample}", "map_genome", "{sample}_Log.final.out") + params: sample_id = "{sample}", index = lambda wildcards: @@ -181,7 +247,11 @@ rule pe_map_genome_star: threads: 12 log: - os.path.join( config["log_dir"], "paired_end", "{sample}", "map_genome_star.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "map_genome_star.stderr.log") shell: "(STAR \ @@ -204,11 +274,14 @@ rule pe_map_genome_star: --outFilterType BySJout \ --outReadsUnmapped None \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} > {output.bam};) &> {log}" + --alignEndsType {params.soft_clip} > {output.bam};) \ + 2> {log.stderr}" rule pe_index_genomic_alignment_samtools: - '''Index the genomic alignment''' + ''' + Index the genomic alignment + ''' input: bam = os.path.join( config["output_dir"], @@ -223,16 +296,31 @@ rule pe_index_genomic_alignment_samtools: "{sample}", "map_genome", "{sample}_Aligned.sortedByCoord.out.bam.bai"), + singularity: "docker://zavolab/samtools:1.10-slim" + log: - os.path.join( config["log_dir"], "paired_end", "{sample}", "index_genomic_alignment_samtools.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "index_genomic_alignment_samtools.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "index_genomic_alignment_samtools.stdout.log") + shell: - "(samtools index {input.bam} {output.bai};) &> {log}" + "(samtools index {input.bam} {output.bai};) \ + 1> {log.stdout} 2> {log.stderr}" rule pe_quantification_salmon: - '''Quantification at transcript and gene level using Salmon''' + ''' + Quantification at transcript and gene level using Salmon + ''' input: reads1 = os.path.join( config["output_dir"], @@ -252,7 +340,8 @@ rule pe_quantification_salmon: str(samples_table.loc[wildcards.sample, "organism"]), str(samples_table.loc[wildcards.sample, "kmer"]), "salmon.idx") - output: + + output: gn_estimates = os.path.join( config["output_dir"], "paired_end", @@ -265,6 +354,7 @@ rule pe_quantification_salmon: "{sample}", "salmon_quant", "quant.sf") + params: output_dir = os.path.join( config["output_dir"], @@ -273,11 +363,24 @@ rule pe_quantification_salmon: "salmon_quant"), libType = lambda wildcards: samples_table.loc[wildcards.sample, 'libtype'] + log: - os.path.join(config["log_dir"], "paired_end", "{sample}", "genome_quantification_salmon.log") - threads: 6 + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "genome_quantification_salmon.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "genome_quantification_salmon.stdout.log"), + + threads: 6 + singularity: "docker://zavolab/salmon:1.1.0-slim" + shell: "(salmon quant \ --libType {params.libType} \ @@ -289,11 +392,13 @@ rule pe_quantification_salmon: --geneMap {input.gtf} \ -1 {input.reads1} \ -2 {input.reads2} \ - -o {params.output_dir}) &> {log}" + -o {params.output_dir}) 1> {log.stdout} 2> {log.stderr}" rule pe_genome_quantification_kallisto: - '''Quantification at transcript and gene level using Kallisto''' + ''' + Quantification at transcript and gene level using Kallisto + ''' input: reads1 = os.path.join( config["output_dir"], @@ -310,6 +415,7 @@ rule pe_genome_quantification_kallisto: config["kallisto_indexes"], samples_table.loc[wildcards.sample, 'organism'], "kallisto.idx") + output: pseudoalignment = os.path.join( config["output_dir"], @@ -317,24 +423,33 @@ rule pe_genome_quantification_kallisto: 
"{sample}", "quant_kallisto", "{sample}.kallisto.pseudo.sam") + params: output_dir = os.path.join( - config["output_dir"], - "paired_end", - "{sample}", - "quant_kallisto"), + config["output_dir"], + "paired_end", + "{sample}", + "quant_kallisto"), directionality = lambda wildcards: samples_table.loc[wildcards.sample, "kallisto_directionality"] + singularity: "docker://zavolab/kallisto:0.46.1-slim" - threads: 8 + + threads: 8 + log: - os.path.join(config["log_dir"], "paired_end", "{sample}", "genome_quantification_kallisto.log") + stderr = os.path.join( + config["log_dir"], + "paired_end", + "{sample}", + "genome_quantification_kallisto.stderr.log") + shell: "(kallisto quant \ -i {input.index} \ -o {params.output_dir} \ --pseudobam \ {params.directionality} \ - {input.reads1} {input.reads2} > {output.pseudoalignment}) &> {log}" - + {input.reads1} {input.reads2} > {output.pseudoalignment}) \ + 2> {log.stderr}" diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index aa0ed104eb0abf83407ac7faa85103407fd8fb5b..eaebe797ebc3a17f707a1e49e5e62695581a05dc 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -1,40 +1,84 @@ import os + rule fastqc: - ''' A quality control tool for high throughput sequence data. ''' + ''' + A quality control tool for high throughput sequence data. + ''' input: - reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads = lambda wildcards: + samples_table.loc[wildcards.sample, "fq1"] + output: - outdir = directory(os.path.join(config["output_dir"], "single_end", "{sample}", "mate1_fastqc")) + outdir = directory(os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "mate1_fastqc")) + params: - seqmode= lambda wildcards: samples_table.loc[wildcards.sample, "seqmode"] + seqmode = lambda wildcards: + samples_table.loc[wildcards.sample, "seqmode"] + singularity: "docker://zavolab/fastqc:0.11.9-slim" + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "fastqc.log") + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "fastqc.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "fastqc.stdout.log") + shell: "(mkdir -p {output.outdir}; \ fastqc \ --outdir {output.outdir} \ - {input.reads}) &> {log}" + {input.reads};) \ + 1> {log.stdout} 2> {log.stderr}" rule remove_adapters_cutadapt: - ''' Remove adapters ''' + ''' + Remove adapters + ''' input: - reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"] + reads = lambda wildcards: + samples_table.loc[wildcards.sample, "fq1"] + output: - reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters_mate1.fastq.gz") + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz") + params: - adapters_3 = lambda wildcards: + adapters_3 = lambda wildcards: samples_table.loc[wildcards.sample, 'fq1_3p'], - adapters_5 = lambda wildcards: + adapters_5 = lambda wildcards: samples_table.loc[wildcards.sample, 'fq1_5p'] singularity: "docker://zavolab/cutadapt:1.16-slim" + threads: 8 + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "remove_adapters_cutadapt.log") + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "remove_adapters_cutadapt.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "remove_adapters_cutadapt.stdout.log") shell: "(cutadapt \ -e 
0.1 \ @@ -45,23 +89,49 @@ rule remove_adapters_cutadapt: -a {params.adapters_3} \ -g {params.adapters_5} \ -o {output.reads} \ - {input.reads}) &> {log}" + {input.reads};) \ + 1> {log.stdout} 2> {log.stderr}" rule remove_polya_cutadapt: - ''' Remove ployA tails''' + ''' + Remove polyA tails + ''' input: - reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters_mate1.fastq.gz") + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz") + output: - reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya_mate1.fastq.gz") + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz") + params: - polya_3 = lambda wildcards: + polya_3 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1_polya"] + singularity: "docker://zavolab/cutadapt:1.16-slim" + threads: 8 + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "remove_polya_cutadapt.log") + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "remove_polya_cutadapt.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "remove_polya_cutadapt.stdout.log") + shell: "(cutadapt \ --match-read-wildcards \ @@ -73,51 +143,75 @@ rule remove_polya_cutadapt: -m 10 \ -a {params.polya_3} \ -o {output.reads} \ - {input.reads}) &> {log}" + {input.reads}) \ + 1> {log.stdout} 2> {log.stderr}" rule map_genome_star: - ''' Map to genome using STAR. ''' + ''' + Map to genome using STAR + ''' input: index = lambda wildcards: os.path.join( config["star_indexes"], str(samples_table.loc[wildcards.sample, "organism"]), - str(samples_table.loc[wildcards.sample, "index_size"]), - "STAR_index","chrNameLength.txt"), - reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya_mate1.fastq.gz") + str(samples_table.loc[wildcards.sample, "index_size"]), + "STAR_index", + "chrNameLength.txt"), + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz") + output: - bam = os.path.join(config["output_dir"], "single_end", - "{sample}", - "map_genome", + bam = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "map_genome", "{sample}_Aligned.sortedByCoord.out.bam"), - logfile = os.path.join(config["output_dir"], "single_end", - "{sample}", - "map_genome", + logfile = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "map_genome", "{sample}_Log.final.out") + params: sample_id = "{sample}", index = lambda wildcards: os.path.join( config["star_indexes"], str(samples_table.loc[wildcards.sample, "organism"]), - str(samples_table.loc[wildcards.sample, "index_size"]), + str(samples_table.loc[wildcards.sample, "index_size"]), "STAR_index"), outFileNamePrefix = os.path.join( - config["output_dir"], - "single_end", - "{sample}", "map_genome", "{sample}_"), + config["output_dir"], + "single_end", + "{sample}", + "map_genome", + "{sample}_"), multimappers = lambda wildcards: samples_table.loc[wildcards.sample, "multimappers"], soft_clip = lambda wildcards: samples_table.loc[wildcards.sample, "soft_clip"], pass_mode = lambda wildcards: samples_table.loc[wildcards.sample, "pass_mode"], + singularity: "docker://zavolab/star:2.7.3a-slim" + threads: 12 + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "map_genome_star.log") + stderr = os.path.join( + config["log_dir"], + 
"single_end", + "{sample}", + "map_genome_star.stderr.log") + shell: "(STAR \ --runMode alignReads \ @@ -139,39 +233,61 @@ rule map_genome_star: --outFilterType BySJout \ --outReadsUnmapped None \ --outSAMattrRGline ID:rcrunch SM:{params.sample_id} \ - --alignEndsType {params.soft_clip} > {output.bam};) &> {log}" + --alignEndsType {params.soft_clip} > {output.bam};) \ + 2> {log.stderr}" rule index_genomic_alignment_samtools: - '''Index genome bamfile using samtools.''' + ''' + Index genome bamfile using samtools + ''' input: - bam = os.path.join(config["output_dir"], - "single_end", - "{sample}", - "map_genome", + bam = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "map_genome", "{sample}_Aligned.sortedByCoord.out.bam") + output: - bai = os.path.join(config["output_dir"], - "single_end", - "{sample}", - "map_genome", + bai = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "map_genome", "{sample}_Aligned.sortedByCoord.out.bam.bai") + singularity: "docker://zavolab/samtools:1.10-slim" + threads: 1 + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "index_genomic_alignment_samtools.log") + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "index_genomic_alignment_samtools.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "index_genomic_alignment_samtools.stdout.log") + shell: - "(samtools index {input.bam} {output.bai};) &> {log}" + "(samtools index {input.bam} {output.bai};) \ + 1> {log.stdout} 2> {log.stderr}" rule quantification_salmon: - ''' Quantification at transcript and gene level using Salmon. ''' + ''' + Quantification at transcript and gene level using Salmon + ''' input: reads = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "{sample}.remove_polya_mate1.fastq.gz"), index = lambda wildcards: os.path.join( @@ -179,33 +295,49 @@ rule quantification_salmon: str(samples_table.loc[wildcards.sample, "organism"]), str(samples_table.loc[wildcards.sample, "kmer"]), "salmon.idx"), - gtf = lambda wildcards: samples_table.loc[wildcards.sample, "gtf_filtered"] + gtf = lambda wildcards: + samples_table.loc[wildcards.sample, "gtf_filtered"] + output: gn_estimates = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "salmon_quant", "quant.genes.sf"), tr_estimates = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "salmon_quant", "quant.sf") + params: output_dir = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "salmon_quant"), libType = lambda wildcards: samples_table.loc[wildcards.sample, "libtype"] + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "quantification_salmon.log") - threads: 12 + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "quantification_salmon.stderr.log"), + stdout = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "quantification_salmon.stdout.log") + + threads: 12 + singularity: "docker://zavolab/salmon:1.1.0-slim" + shell: "(salmon quant \ --libType {params.libType} \ @@ -216,43 +348,59 @@ rule quantification_salmon: --index {input.index} \ --geneMap {input.gtf} \ --unmatedReads {input.reads} \ - -o {params.output_dir}) &> {log}" + -o {params.output_dir};) \ + 1> {log.stdout} 2> {log.stderr}" rule 
genome_quantification_kallisto: - ''' Quantification at transcript and gene level using Kallisto. ''' + ''' + Quantification at transcript and gene level using Kallisto + ''' input: reads = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "{sample}.remove_polya_mate1.fastq.gz"), index = lambda wildcards: os.path.join( config["kallisto_indexes"], samples_table.loc[wildcards.sample, "organism"], "kallisto.idx") + output: pseudoalignment = os.path.join( - config["output_dir"], - "single_end", - "{sample}", + config["output_dir"], + "single_end", + "{sample}", "quant_kallisto", "{sample}.kallisto.pseudo.sam") + params: output_dir = os.path.join( - config["output_dir"], - "single_end", - "{sample}", - "quant_kallisto"), - fraglen = lambda wildcards: samples_table.loc[wildcards.sample, 'mean'], - fragsd = lambda wildcards: samples_table.loc[wildcards.sample, 'sd'], - directionality = lambda wildcards: samples_table.loc[wildcards.sample, 'kallisto_directionality'] - threads: 8 + config["output_dir"], + "single_end", + "{sample}", + "quant_kallisto"), + fraglen = lambda wildcards: + samples_table.loc[wildcards.sample, 'mean'], + fragsd = lambda wildcards: + samples_table.loc[wildcards.sample, 'sd'], + directionality = lambda wildcards: + samples_table.loc[wildcards.sample, 'kallisto_directionality'] + + threads: 8 + log: - os.path.join(config["log_dir"], "single_end", "{sample}", "genome_quantification_kallisto.log.log") + stderr = os.path.join( + config["log_dir"], + "single_end", + "{sample}", + "genome_quantification_kallisto.stderr.log") + singularity: "docker://zavolab/kallisto:0.46.1-slim" + shell: "(kallisto quant \ -i {input.index} \ @@ -262,5 +410,5 @@ rule genome_quantification_kallisto: -s {params.fragsd} \ --pseudobam \ {params.directionality} \ - {input.reads} > {output.pseudoalignment}) &> {log}" - + {input.reads} > {output.pseudoalignment};) \ + 2> {log.stderr}"
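
Note on the sample annotation assumed above: every wildcard lambda in this diff indexes a samples_table keyed by sample name and reads per-sample columns. The sketch below only illustrates that assumption; the table-loading code is not part of this diff, the pandas call and file name are hypothetical, and the column list is taken directly from the lookups in the rules.

    import pandas as pd

    # Hypothetical loading step; the real Snakefile defines samples_table elsewhere.
    samples_table = pd.read_csv(
        "samples.tsv",       # hypothetical path
        sep="\t",
        index_col="sample",  # rules look rows up by sample name
    )

    # Columns referenced by the rules in this diff:
    # seqmode, organism, genome, gtf, gtf_filtered, tr_fasta_filtered,
    # fq1, fq2, fq1_3p, fq1_5p, fq2_3p, fq2_5p, fq1_polya, fq2_polya,
    # index_size, kmer, multimappers, soft_clip, pass_mode, libtype,
    # kallisto_directionality, mean, sd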
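
The diff also switches every rule from a single "&> {log}" redirect to separate stdout and stderr log files. Below is a minimal sketch of that convention, using a made-up rule and paths that are not from this workflow. The redirections must follow the subshell's closing parenthesis directly; a stray ";" between ")" and "1>" detaches them and leaves the log files empty, which is why the shell fixes above drop it.

    rule example_split_logs:
        # Hypothetical rule illustrating the stdout/stderr logging pattern.
        output:
            "results/example.txt"
        log:
            stderr = "logs/example.stderr.log",
            stdout = "logs/example.stdout.log"
        shell:
            "(echo done > {output}) \
            1> {log.stdout} 2> {log.stderr}"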