Remove extra parameters, that had or should have the default values

and are therefore not required in the rules. - Snakefile star_rpm: --outWigNorm (default RPM was used) star_rpm: --outWigStrand (default Stranded was used) rename_star_rpm_for_alfa: orientation in params is redundant (Fixes #152) - single_end.snakefile.smk map_genome_star: outFilterMismatchNoverLmax map_genome_star: outFilterScoreMinOverLread map_genome_star: outFilterMatchNminOverLread quantification_salmon: --writeUnmappedNames - paired_end.snakefile.smk pe_map_genome_star: outFilterMismatchNoverLmax pe_map_genome_star: outFilterScoreMinOverLread pe_map_genome_star: outFilterMatchNminOverLread quantification_salmon: --writeUnmappedNames

Remove extra parameters, that had or should have the default values
5396adf5 · BIOPZ-Katsantoni Maria · BIOPZ-Gypas Foivos · 6b6b2830 · 5396adf5 · 5396adf5
Commit 5396adf5 authored 4 years ago by BIOPZ-Katsantoni Maria Committed by BIOPZ-Gypas Foivos 4 years ago
--- a/Snakefile
+++ b/Snakefile
@@ -433,7 +433,6 @@ rule extract_transcripts_as_bed12:
    shell:
        "(gtf2bed12 \
        --gtf {input.gtf} \
-        --transcript_type protein_coding \
        --bed12 {output.bed12}); \
        1> {log.stdout} 2> {log.stderr}"

@@ -543,7 +542,7 @@ rule calculate_TIN_scores:
        -r {input.transcripts_bed12} \
        -c 0 \
        --names {params.sample} \
-        -n 100 > {output.TIN_score};) 2> {log.stderr}"
+        > {output.TIN_score};) 2> {log.stderr}"


 rule salmon_quantmerge_genes:
@@ -831,10 +830,6 @@ rule pca_salmon:
            "quantmerge",
            "{molecule}_tpm.tsv"),

-    params:
-        tpm_filter = "0",
-        tpm_pseudocount = "1"
-
    output:
        out = directory(os.path.join(
            config["output_dir"],
@@ -857,8 +852,6 @@ rule pca_salmon:
    shell:
        "(zpca-tpm  \
        --tpm {input.tpm} \
-        --tpm-filter {params.tpm_filter} \
-        --tpm-pseudocount {params.tpm_pseudocount} \
        --out {output.out} \
        --verbose) \
        1> {log.stdout} 2> {log.stderr}"
@@ -871,9 +864,6 @@ rule pca_kallisto:
            "summary_kallisto",
            "{molecule}_tpm.tsv")

-    params:
-        tpm_filter = "0",
-        tpm_pseudocount = "1"

    output:
        out = directory(os.path.join(
@@ -897,8 +887,6 @@ rule pca_kallisto:
    shell:
        "(zpca-tpm  \
        --tpm {input.tpm} \
-        --tpm-filter {params.tpm_filter} \
-        --tpm-pseudocount {params.tpm_pseudocount} \
        --out {output.out} \
        --verbose) \
        1> {log.stdout} 2> {log.stderr}"
@@ -970,8 +958,7 @@ rule star_rpm:
        prefix = lambda wildcards, output:
            os.path.join(
                os.path.dirname(output.str1),
-                str(wildcards.sample) + "_"),
-        stranded = "Stranded"
+                str(wildcards.sample) + "_")

    singularity:
        "docker://zavolab/star:2.7.3a-slim"
@@ -998,8 +985,6 @@ rule star_rpm:
        --runThreadN {threads} \
        --inputBAMfile {input.bam} \
        --outWigType bedGraph \
-        --outWigStrand {params.stranded} \
-        --outWigNorm RPM \
        --outFileNamePrefix {params.prefix}) \
        1> {log.stdout} 2> {log.stderr}"

@@ -1051,13 +1036,6 @@ rule rename_star_rpm_for_alfa:
            "{unique}",
            "{sample}.{unique}.minus.bg"))

-    params:
-        orientation = lambda wildcards:
-            get_sample(
-                'kallisto_directionality',
-                search_id='index',
-                search_value=wildcards.sample),
-
    log:
        stderr = os.path.join(
            config["log_dir"],

--- a/pipeline_documentation.md
+++ b/pipeline_documentation.md
@@ -236,7 +236,7 @@ Create index for [**kallisto**](#third-party-software-used) quantification.
 #### `extract_transcripts_as_bed12`

 Convert transcripts from `.gtf` to extended 12-column `.bed` format with
-[custom-script][custom-script-gtf-to-bed12].
+[custom-script][custom-script-gtf-to-bed12]. Note that the default transcript type setting is used, which is "protein_coding".

 - **Input**
  - Gene annotation file (`.gtf`)
@@ -290,9 +290,6 @@ Create stranded bedGraph coverage (`.bg`) with
 - **Output**
  - Coverage file (`.bg`); used in [**multiqc_report**](#multiqc_report) and
    [**rename_star_rpm_for_alfa**](#rename_star_rpm_for_alfa)
- **Non-configurable & non-default**
-  - `--outWigStrans="Stranded"`
-  - `--outWigNorm="RPM"`

 #### `rename_star_rpm_for_alfa`

@@ -360,6 +357,8 @@ Calculates the Transcript Integrity Number (TIN) for each transcript with
 - **Output**
  - TIN score table (custom `tsv`); used in
    [**merge_TIN_scores**](#merge_tin_scores)
+- **Non-configurable & non-default**
+  - `-c 0`: minimum number of read mapped to a transcript

 #### `salmon_quantmerge_genes`

@@ -408,6 +407,8 @@ Merge gene-level expression estimates for all samples with
  - Gene TPM table (custom `.tsv`)
  - Gene read count table (custom `.tsv`)
  - Mapping gene/transcript IDs table (custom `.tsv`)
+- **Non-configurable & non-default**
+  - `-txOut FALSE`: gene-level summarization (default would be transcript level) 

 #### `kallisto_merge_transcripts`

@@ -445,6 +446,7 @@ Run PCA analysis on salmon genes and transcripts with [custom script][custom-scr
 - **Output**
  - Directory with PCA plots, scree plot and top loading scores.

+
 #### `generate_alfa_index`

 Create index for [**ALFA**](#third-party-software-used).
@@ -548,13 +550,10 @@ Remove adapter sequences from reads with
  - Reads file (`.fastq.gz`); used in
    [**remove_polya_cutadapt**](#remove_polya_cutadapt)
 - **Non-configurable & non-default**
-  - `-e 0.1`: maximum error-rate of 10%
  - `-j 8`: use 8 threads
-  - `-m 10`: Discard processed reads that are shorter than 10
+  - `-m 10`: Discard processed reads that are shorter than 10 (default 0, that might cause problems in downstream programs)
  - `-n 2`: search for all the given adapter sequences repeatedly, either until
-    no adapter match was found or until 2 rounds have been performed.
-  - `--pair-filter=any`: **(paired-end only)** filtering criteria must apply to
-    any of the two reads in order for a read pair to be discarded
+    no adapter match was found or until 2 rounds have been performed. (default 1)

 #### `remove_polya_cutadapt`

@@ -573,14 +572,9 @@ Remove poly(A) tails from reads with
    [**map_genome_star**](#map_genome_star) and
    [**quantification_salmon**](#quantification_salmon)
 - **Non-configurable & non-default**
-  - `-e 0.1`: maximum error-rate of 10%
  - `-j 8`: use 8 threads
  - `-m 10`: Discard processed reads that are shorter than 10
-  - `-n 1`: search for all the given adapter sequences repeatedly, either until
-    no adapter match was found or until 1 round has been performed.
-  - `--pair-filter=any`: **(paired-end only)** filtering criteria must apply to
-    any of the two reads in order for a read pair to be discarded
-  - `-O 1`: **(single-end only)** minimal overlap of 1
+  - `-O 1`: minimal overlap of 1 (default: 3)

 #### `map_genome_star`

@@ -608,19 +602,13 @@ Align short reads to reference genome and/or transcriptome with
    and [**star_rpm**](#star_rpm)
  - STAR log file
 - **Non-configurable & non-default**
-  - `--outSAMunmapped=None`: do not output unmapped reads in SAM file
  - `--outFilterMultimapScoreRange=0`: the score range below the maximum score
-    for multimapping alignments
+    for multimapping alignments (default 1)
  - `--outSAMattributes=All`: NH HI AS nM NM MD jM jI MC ch
-  - `--outStd=BAM_SortedByCoordinate`: which output will be directed to `STDOUT`
-  - `--outSAMtype=BAM_SortedByCoordinate`: type of SAM/BAM output
-  - `--outFilterMismatchNoverLmax=0.04`: alignment will be output only if its
-    ratio of mismatches to *mapped* length is less than or equal to this value
-  - `--outFilterScoreMinOverLread=0.3`: same as outFilterScoreMin, but
-    normalized to read length (sum of mates’ lengths for paired-end reads)
-  - `--outFilterMatchNminOverLread=0.3`: minimal fraction of aligned bases
+  - `--outStd=BAM_SortedByCoordinate`: which output will be directed to `STDOUT` (default 'Log')
+  - `--outSAMtype=BAM SortedByCoordinate`: type of SAM/BAM output (default SAM)
  - `--outFilterType=BySJout`: reduces the number of ”spurious” junctions
-  - `--outReadsUnmapped=None`: do not output unmapped reads
+  - `--outSAMattrRGline`: ID:rnaseq_pipeline SM: *sampleID*

 #### `quantification_salmon`

@@ -640,10 +628,12 @@ Estimate transcript- and gene-level expression with
  - `--fldSD`: standard deviation of distribution of fragment lengths; specify
    in sample table column `sd` **(single-end only)**
 - **Output**
-  - Gene expression table (custom `.tsv`); used in
+  - Gene expression table (`quant.sf`); used in
    [**salmon_quantmerge_genes**](#salmon_quantmerge_genes)
-  - Transcript expression table (custom `.tsv`); used in
+  - Transcript expression table ( `quant.sf`); used in
    [**salmon_quantmerge_transcripts**](#salmon_quantmerge_transcripts)
+  - `meta_info.json`
+  - `flenDist.txt`
 - **Non-configurable & non-default**
  - `--seqBias`: [correct for sequence specific
    biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias)
@@ -652,8 +642,6 @@ Estimate transcript- and gene-level expression with
    sensitivity and specificity of mapping and, as a result, can [improve
    quantification
    accuracy](https://salmon.readthedocs.io/en/latest/salmon.html#validatemappings).
-  - `--writeUnmappedNames`: write out the names of reads (or mates in paired-end
-    reads) that do not map to the transcriptome.

 #### `genome_quantification_kallisto`

@@ -671,8 +659,11 @@ Generate pseudoalignments of reads to transcripts with
  - `-s`: standard deviation of distribution of fragment lengths; specify in
    sample table column `sd` **(single-end only)**
 - **Output**
-  - Pseudoalignments file (`.sam`); used in
-    [**multiqc_report**](#multiqc_report)
+  - Pseudoalignments file (`.sam`) and
+  - abundance (`.h5`) 
+  used in [**kallisto_merge_genes**](#kallisto_merge_genes)
+- **Non-configurable & non-default**
+  - `--pseudobam`: Save pseudoalignments to transcriptome to BAM file

 [code-alfa]: <https://github.com/biocompibens/ALFA>
 [code-bedgraphtobigwig]: <https://github.com/ucscGenomeBrowser/kent>
@@ -687,8 +678,8 @@ Generate pseudoalignments of reads to transcripts with
 [code-salmon]: <https://github.com/COMBINE-lab/salmon>
 [code-samtools]: <https://github.com/samtools/samtools>
 [code-star]: <https://github.com/alexdobin/STAR>
-[custom-script-gtf-to-bed12]: <https://git.scicore.unibas.ch/zavolan_group/tools/gtf_transcript_type_to_bed12>
-[custom-script-tin]: <https://git.scicore.unibas.ch/zavolan_group/tools/tin_score_calculation>
+[custom-script-gtf-to-bed12]: <https://github.com/zavolanlab/zgtf>
+[custom-script-tin]: <https://github.com/zavolanlab/tin-score-calculation>
 [custom-script-merge-kallisto]: <https://github.com/zavolanlab/merge_kallisto>
 [custom-script-zpca]: <https://github.com/zavolanlab/zpca>
 [docs-alfa]: <https://github.com/biocompibens/ALFA#manual>

--- a/workflow/rules/paired_end.snakefile.smk
+++ b/workflow/rules/paired_end.snakefile.smk
@@ -58,9 +58,7 @@ rule pe_remove_adapters_cutadapt:

    shell:
        "(cutadapt \
-        -e 0.1 \
        -j {threads} \
-        --pair-filter=any \
        -m 10 \
        -n 2 \
        -a {params.adapter_3_mate1} \
@@ -144,10 +142,7 @@ rule pe_remove_polya_cutadapt:
    shell:
        "(cutadapt \
        -j {threads} \
-        --pair-filter=any \
        -m 10 \
-        -n 1 \
-        -e 0.1 \
        -O 1 \
        -a {params.polya_3_mate1} \
        -g {params.polya_5_mate1} \
@@ -255,24 +250,18 @@ rule pe_map_genome_star:

    shell:
        "(STAR \
-        --runMode alignReads \
        --twopassMode {params.pass_mode} \
        --runThreadN {threads} \
        --genomeDir {params.index} \
        --readFilesIn {input.reads1} {input.reads2} \
        --readFilesCommand zcat \
-        --outSAMunmapped None  \
        --outFilterMultimapNmax {params.multimappers} \
        --outFilterMultimapScoreRange 0 \
        --outFileNamePrefix {params.outFileNamePrefix} \
        --outSAMattributes All \
        --outStd BAM_SortedByCoordinate \
        --outSAMtype BAM SortedByCoordinate \
-        --outFilterMismatchNoverLmax 0.04 \
-        --outFilterScoreMinOverLread 0.3 \
-        --outFilterMatchNminOverLread 0.3 \
        --outFilterType BySJout \
-        --outReadsUnmapped None \
        --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
        --alignEndsType {params.soft_clip} > {output.bam};) \
        2> {log.stderr}"

--- a/workflow/rules/single_end.snakefile.smk
+++ b/workflow/rules/single_end.snakefile.smk
@@ -47,7 +47,6 @@ rule remove_adapters_cutadapt:
            "remove_adapters_cutadapt.se.stdout.log")
    shell:
        "(cutadapt \
-        -e 0.1 \
        -j {threads} \
        -m 10 \
        -n 2 \
@@ -108,8 +107,6 @@ rule remove_polya_cutadapt:
    shell:
        "(cutadapt \
        -j {threads} \
-        -n 1 \
-        -e 0.1 \
        -O 1 \
        -m 10  \
        -a {params.polya_3} \
@@ -197,24 +194,18 @@ rule map_genome_star:

    shell:
        "(STAR \
-        --runMode alignReads \
        -- twopassMode {params.pass_mode} \
        --runThreadN {threads} \
        --genomeDir {params.index} \
        --readFilesIn {input.reads} \
        --readFilesCommand zcat \
-        --outSAMunmapped None  \
        --outFilterMultimapNmax {params.multimappers} \
        --outFilterMultimapScoreRange 0 \
        --outFileNamePrefix {params.outFileNamePrefix} \
        --outSAMattributes All \
        --outStd BAM_SortedByCoordinate \
        --outSAMtype BAM SortedByCoordinate \
-        --outFilterMismatchNoverLmax 0.04 \
-        --outFilterScoreMinOverLread 0.3 \
-        --outFilterMatchNminOverLread 0.3 \
        --outFilterType BySJout \
-        --outReadsUnmapped None \
        --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
        --alignEndsType {params.soft_clip} > {output.bam};) \
        2> {log.stderr}"
@@ -324,7 +315,6 @@ rule quantification_salmon:
        --threads {threads} \
        --fldMean {params.fraglen} \
        --fldSD {params.fragsd} \
-        --writeUnmappedNames \
        --index {input.index} \
        --geneMap {input.gtf} \
        --unmatedReads {input.reads} \