diff --git a/Snakefile b/Snakefile index 46f802b19cb943a188eed87b0c2310cdbea251d8..f0087cb8127454f6fb2af5572c9010ea01d0708d 100644 --- a/Snakefile +++ b/Snakefile @@ -433,7 +433,6 @@ rule extract_transcripts_as_bed12: shell: "(gtf2bed12 \ --gtf {input.gtf} \ - --transcript_type protein_coding \ --bed12 {output.bed12}); \ 1> {log.stdout} 2> {log.stderr}" @@ -543,7 +542,7 @@ rule calculate_TIN_scores: -r {input.transcripts_bed12} \ -c 0 \ --names {params.sample} \ - -n 100 > {output.TIN_score};) 2> {log.stderr}" + > {output.TIN_score};) 2> {log.stderr}" rule salmon_quantmerge_genes: @@ -831,10 +830,6 @@ rule pca_salmon: "quantmerge", "{molecule}_tpm.tsv"), - params: - tpm_filter = "0", - tpm_pseudocount = "1" - output: out = directory(os.path.join( config["output_dir"], @@ -857,8 +852,6 @@ rule pca_salmon: shell: "(zpca-tpm \ --tpm {input.tpm} \ - --tpm-filter {params.tpm_filter} \ - --tpm-pseudocount {params.tpm_pseudocount} \ --out {output.out} \ --verbose) \ 1> {log.stdout} 2> {log.stderr}" @@ -871,9 +864,6 @@ rule pca_kallisto: "summary_kallisto", "{molecule}_tpm.tsv") - params: - tpm_filter = "0", - tpm_pseudocount = "1" output: out = directory(os.path.join( @@ -897,8 +887,6 @@ rule pca_kallisto: shell: "(zpca-tpm \ --tpm {input.tpm} \ - --tpm-filter {params.tpm_filter} \ - --tpm-pseudocount {params.tpm_pseudocount} \ --out {output.out} \ --verbose) \ 1> {log.stdout} 2> {log.stderr}" @@ -970,8 +958,7 @@ rule star_rpm: prefix = lambda wildcards, output: os.path.join( os.path.dirname(output.str1), - str(wildcards.sample) + "_"), - stranded = "Stranded" + str(wildcards.sample) + "_") singularity: "docker://zavolab/star:2.7.3a-slim" @@ -998,8 +985,6 @@ rule star_rpm: --runThreadN {threads} \ --inputBAMfile {input.bam} \ --outWigType bedGraph \ - --outWigStrand {params.stranded} \ - --outWigNorm RPM \ --outFileNamePrefix {params.prefix}) \ 1> {log.stdout} 2> {log.stderr}" @@ -1051,13 +1036,6 @@ rule rename_star_rpm_for_alfa: "{unique}", "{sample}.{unique}.minus.bg")) - params: - orientation = lambda wildcards: - get_sample( - 'kallisto_directionality', - search_id='index', - search_value=wildcards.sample), - log: stderr = os.path.join( config["log_dir"], diff --git a/pipeline_documentation.md b/pipeline_documentation.md index c5e45a9158aa09403a97d5a825c969bab97c3c9e..04a62fca206f543e185517f3da5d306194a39533 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -236,7 +236,7 @@ Create index for [**kallisto**](#third-party-software-used) quantification. #### `extract_transcripts_as_bed12` Convert transcripts from `.gtf` to extended 12-column `.bed` format with -[custom-script][custom-script-gtf-to-bed12]. +[custom-script][custom-script-gtf-to-bed12]. Note that the default transcript type setting is used, which is "protein_coding". - **Input** - Gene annotation file (`.gtf`) @@ -290,9 +290,6 @@ Create stranded bedGraph coverage (`.bg`) with - **Output** - Coverage file (`.bg`); used in [**multiqc_report**](#multiqc_report) and [**rename_star_rpm_for_alfa**](#rename_star_rpm_for_alfa) -- **Non-configurable & non-default** - - `--outWigStrans="Stranded"` - - `--outWigNorm="RPM"` #### `rename_star_rpm_for_alfa` @@ -360,6 +357,8 @@ Calculates the Transcript Integrity Number (TIN) for each transcript with - **Output** - TIN score table (custom `tsv`); used in [**merge_TIN_scores**](#merge_tin_scores) +- **Non-configurable & non-default** + - `-c 0`: minimum number of read mapped to a transcript #### `salmon_quantmerge_genes` @@ -408,6 +407,8 @@ Merge gene-level expression estimates for all samples with - Gene TPM table (custom `.tsv`) - Gene read count table (custom `.tsv`) - Mapping gene/transcript IDs table (custom `.tsv`) +- **Non-configurable & non-default** + - `-txOut FALSE`: gene-level summarization (default would be transcript level) #### `kallisto_merge_transcripts` @@ -445,6 +446,7 @@ Run PCA analysis on salmon genes and transcripts with [custom script][custom-scr - **Output** - Directory with PCA plots, scree plot and top loading scores. + #### `generate_alfa_index` Create index for [**ALFA**](#third-party-software-used). @@ -548,13 +550,10 @@ Remove adapter sequences from reads with - Reads file (`.fastq.gz`); used in [**remove_polya_cutadapt**](#remove_polya_cutadapt) - **Non-configurable & non-default** - - `-e 0.1`: maximum error-rate of 10% - `-j 8`: use 8 threads - - `-m 10`: Discard processed reads that are shorter than 10 + - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs) - `-n 2`: search for all the given adapter sequences repeatedly, either until - no adapter match was found or until 2 rounds have been performed. - - `--pair-filter=any`: **(paired-end only)** filtering criteria must apply to - any of the two reads in order for a read pair to be discarded + no adapter match was found or until 2 rounds have been performed. (default 1) #### `remove_polya_cutadapt` @@ -573,14 +572,9 @@ Remove poly(A) tails from reads with [**map_genome_star**](#map_genome_star) and [**quantification_salmon**](#quantification_salmon) - **Non-configurable & non-default** - - `-e 0.1`: maximum error-rate of 10% - `-j 8`: use 8 threads - `-m 10`: Discard processed reads that are shorter than 10 - - `-n 1`: search for all the given adapter sequences repeatedly, either until - no adapter match was found or until 1 round has been performed. - - `--pair-filter=any`: **(paired-end only)** filtering criteria must apply to - any of the two reads in order for a read pair to be discarded - - `-O 1`: **(single-end only)** minimal overlap of 1 + - `-O 1`: minimal overlap of 1 (default: 3) #### `map_genome_star` @@ -608,19 +602,13 @@ Align short reads to reference genome and/or transcriptome with and [**star_rpm**](#star_rpm) - STAR log file - **Non-configurable & non-default** - - `--outSAMunmapped=None`: do not output unmapped reads in SAM file - `--outFilterMultimapScoreRange=0`: the score range below the maximum score - for multimapping alignments + for multimapping alignments (default 1) - `--outSAMattributes=All`: NH HI AS nM NM MD jM jI MC ch - - `--outStd=BAM_SortedByCoordinate`: which output will be directed to `STDOUT` - - `--outSAMtype=BAM_SortedByCoordinate`: type of SAM/BAM output - - `--outFilterMismatchNoverLmax=0.04`: alignment will be output only if its - ratio of mismatches to *mapped* length is less than or equal to this value - - `--outFilterScoreMinOverLread=0.3`: same as outFilterScoreMin, but - normalized to read length (sum of mates’ lengths for paired-end reads) - - `--outFilterMatchNminOverLread=0.3`: minimal fraction of aligned bases + - `--outStd=BAM_SortedByCoordinate`: which output will be directed to `STDOUT` (default 'Log') + - `--outSAMtype=BAM SortedByCoordinate`: type of SAM/BAM output (default SAM) - `--outFilterType=BySJout`: reduces the number of ”spurious” junctions - - `--outReadsUnmapped=None`: do not output unmapped reads + - `--outSAMattrRGline`: ID:rnaseq_pipeline SM: *sampleID* #### `quantification_salmon` @@ -640,10 +628,12 @@ Estimate transcript- and gene-level expression with - `--fldSD`: standard deviation of distribution of fragment lengths; specify in sample table column `sd` **(single-end only)** - **Output** - - Gene expression table (custom `.tsv`); used in + - Gene expression table (`quant.sf`); used in [**salmon_quantmerge_genes**](#salmon_quantmerge_genes) - - Transcript expression table (custom `.tsv`); used in + - Transcript expression table ( `quant.sf`); used in [**salmon_quantmerge_transcripts**](#salmon_quantmerge_transcripts) + - `meta_info.json` + - `flenDist.txt` - **Non-configurable & non-default** - `--seqBias`: [correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias) @@ -652,8 +642,6 @@ Estimate transcript- and gene-level expression with sensitivity and specificity of mapping and, as a result, can [improve quantification accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings). - - `--writeUnmappedNames`: write out the names of reads (or mates in paired-end - reads) that do not map to the transcriptome. #### `genome_quantification_kallisto` @@ -671,8 +659,11 @@ Generate pseudoalignments of reads to transcripts with - `-s`: standard deviation of distribution of fragment lengths; specify in sample table column `sd` **(single-end only)** - **Output** - - Pseudoalignments file (`.sam`); used in - [**multiqc_report**](#multiqc_report) + - Pseudoalignments file (`.sam`) and + - abundance (`.h5`) + used in [**kallisto_merge_genes**](#kallisto_merge_genes) +- **Non-configurable & non-default** + - `--pseudobam`: Save pseudoalignments to transcriptome to BAM file [code-alfa]: <https://github.com/biocompibens/ALFA> [code-bedgraphtobigwig]: <https://github.com/ucscGenomeBrowser/kent> @@ -687,8 +678,8 @@ Generate pseudoalignments of reads to transcripts with [code-salmon]: <https://github.com/COMBINE-lab/salmon> [code-samtools]: <https://github.com/samtools/samtools> [code-star]: <https://github.com/alexdobin/STAR> -[custom-script-gtf-to-bed12]: <https://git.scicore.unibas.ch/zavolan_group/tools/gtf_transcript_type_to_bed12> -[custom-script-tin]: <https://git.scicore.unibas.ch/zavolan_group/tools/tin_score_calculation> +[custom-script-gtf-to-bed12]: <https://github.com/zavolanlab/zgtf> +[custom-script-tin]: <https://github.com/zavolanlab/tin-score-calculation> [custom-script-merge-kallisto]: <https://github.com/zavolanlab/merge_kallisto> [custom-script-zpca]: <https://github.com/zavolanlab/zpca> [docs-alfa]: <https://github.com/biocompibens/ALFA#manual> diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index 1777e2c5ba535a947ba37b99725a1a5f0a81ef89..41c87ebf82032876c2930e5de6d1672465ad41ab 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -58,9 +58,7 @@ rule pe_remove_adapters_cutadapt: shell: "(cutadapt \ - -e 0.1 \ -j {threads} \ - --pair-filter=any \ -m 10 \ -n 2 \ -a {params.adapter_3_mate1} \ @@ -144,10 +142,7 @@ rule pe_remove_polya_cutadapt: shell: "(cutadapt \ -j {threads} \ - --pair-filter=any \ -m 10 \ - -n 1 \ - -e 0.1 \ -O 1 \ -a {params.polya_3_mate1} \ -g {params.polya_5_mate1} \ @@ -255,24 +250,18 @@ rule pe_map_genome_star: shell: "(STAR \ - --runMode alignReads \ --twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads1} {input.reads2} \ --readFilesCommand zcat \ - --outSAMunmapped None \ --outFilterMultimapNmax {params.multimappers} \ --outFilterMultimapScoreRange 0 \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ - --outFilterMismatchNoverLmax 0.04 \ - --outFilterScoreMinOverLread 0.3 \ - --outFilterMatchNminOverLread 0.3 \ --outFilterType BySJout \ - --outReadsUnmapped None \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ --alignEndsType {params.soft_clip} > {output.bam};) \ 2> {log.stderr}" diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index af71c9784e0abc87716d558d0fda690fa5013c4f..9563e063794e8305132f855c4dc194e25295d30c 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -47,7 +47,6 @@ rule remove_adapters_cutadapt: "remove_adapters_cutadapt.se.stdout.log") shell: "(cutadapt \ - -e 0.1 \ -j {threads} \ -m 10 \ -n 2 \ @@ -108,8 +107,6 @@ rule remove_polya_cutadapt: shell: "(cutadapt \ -j {threads} \ - -n 1 \ - -e 0.1 \ -O 1 \ -m 10 \ -a {params.polya_3} \ @@ -197,24 +194,18 @@ rule map_genome_star: shell: "(STAR \ - --runMode alignReads \ -- twopassMode {params.pass_mode} \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {input.reads} \ --readFilesCommand zcat \ - --outSAMunmapped None \ --outFilterMultimapNmax {params.multimappers} \ --outFilterMultimapScoreRange 0 \ --outFileNamePrefix {params.outFileNamePrefix} \ --outSAMattributes All \ --outStd BAM_SortedByCoordinate \ --outSAMtype BAM SortedByCoordinate \ - --outFilterMismatchNoverLmax 0.04 \ - --outFilterScoreMinOverLread 0.3 \ - --outFilterMatchNminOverLread 0.3 \ --outFilterType BySJout \ - --outReadsUnmapped None \ --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ --alignEndsType {params.soft_clip} > {output.bam};) \ 2> {log.stderr}" @@ -324,7 +315,6 @@ rule quantification_salmon: --threads {threads} \ --fldMean {params.fraglen} \ --fldSD {params.fragsd} \ - --writeUnmappedNames \ --index {input.index} \ --geneMap {input.gtf} \ --unmatedReads {input.reads} \