From 2a9347408c8b8bbf83017fddca06fb641bfc710d Mon Sep 17 00:00:00 2001
From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch>
Date: Wed, 14 Jul 2021 16:29:55 +0200
Subject: [PATCH] Remove sample specific options and use them as rule specific.
 Partially solves issue #158. The features softclip, passmode, multimappers
 which were related to STAR are relocated to the rule config and given a
 default value. The documentation is updated and these options are removed
 from the immutable lists in the corresponding rules.

---
 pipeline_documentation.md               | 10 +++-------
 tests/input_files/rule_config.yaml      | 12 ++++++++++++
 tests/input_files/samples.tsv           |  6 +++---
 workflow/rules/paired_end.snakefile.smk | 21 ---------------------
 workflow/rules/single_end.snakefile.smk | 21 ---------------------
 5 files changed, 18 insertions(+), 52 deletions(-)

diff --git a/pipeline_documentation.md b/pipeline_documentation.md
index 5f96d48..88e0689 100644
--- a/pipeline_documentation.md
+++ b/pipeline_documentation.md
@@ -116,9 +116,6 @@ gtf_filtered | Required for [Salmon](#third-party-software-used). Path to filter
 genome | Required for [STAR](#third-party-software-used). Path to genome `.fa` file. File needs to be in subdirectory corresponding to `organism` field. Example: `/path/to/GRCh38/genome.fa` | `str`
 sd | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated standard deviation of fragment length distribution. Can be assessed from, e.g., BioAnalyzer profiles. Value ignored for paired-end libraries. | `int`
 mean | Required for [kallisto](#third-party-software-used) and [Salmon](#third-party-software-used), but only for single-end libraries. Estimated mean of fragment length distribution. Can be assessed, e.g., from BioAnalyzer profiles. Value ignored for paired-end libraries. | `int`
-multimappers | Required for [STAR](#third-party-software-used). Maximum number of multiple alignments allowed for a read; if exceeded, the read is considered unmapped. | `int`
-soft_clip | Required for [STAR](#third-party-software-used). One of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip). | `str`
-pass_mode | Required for [STAR](#third-party-software-used). One of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly). | `str`
 libtype | Required for [Salmon](#third-party-software-used). See [Salmon manual][docs-salmon] for allowed values. If in doubt, enter `A` to automatically infer the library type. | `str`
 kallisto_directionality | Required for [kallisto](#third-party-software-used) and [ALFA](#third-party-software-used). One of `--fr-stranded` (strand-specific reads, first read forward) and `--rf-stranded` (strand-specific reads, first read reverse) | `str`
 fq1_polya3p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 3' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str`
@@ -599,13 +596,12 @@ Align short reads to reference genome and/or transcriptome with
     [**remove_polya_cutadapt**](#remove_polya_cutadapt)
   - Index; from [**create_index_star**](#create_index_star)
 - **Parameters**
-  - **samples.tsv**
-    - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers`
-    - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip`
-    - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode`
   - **rule_config.yaml**
     - `--outFilterMultimapScoreRange=0`: the score range below the maximum score for multimapping alignments (default 1)
     - `--outFilterType=BySJout`: reduces the number of ”spurious” junctions
+    - `--outFilterMultimapNmax`: maximum number of multiple alignments allowed; if exceeded, read is considered unmapped; specify in sample table column `multimappers`
+    - `--alignEndsType`: one of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip); specify in sample table column `soft_clip`
+    - `--twopassMode`: one of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly); specify in sample table column `pass_mode`
 - **Output**
   - Aligned reads file (`.bam`); used in
     [**calculate_TIN_scores**](#calculate_TIN_scores),
diff --git a/tests/input_files/rule_config.yaml b/tests/input_files/rule_config.yaml
index 2cdec07..5999ddb 100644
--- a/tests/input_files/rule_config.yaml
+++ b/tests/input_files/rule_config.yaml
@@ -122,12 +122,24 @@ map_genome_star:
     --outFilterMultimapScoreRange: '0'
     # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions )
     --outFilterType: 'BySJout'
+    # type of read ends alignment: force end-to-end read alignment, do not soft-clip
+    --alignEndsType: 'EndToEnd'
+    #  extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass
+    --twopassMode: Basic
+    # alignments (all of them) will be output only if the read maps to no more loci than 10
+    --outFilterMultimapNmax: '10'
 
 pe_map_genome_star:
     # the score range below the maximum score for multimapping alignments (default 1, ZARP recommends 0)
     --outFilterMultimapScoreRange: '0'
     # keep only those reads that contain junctions that passed filtering into SJ.out.tab. (default 'Normal', ZARP recommends 'BySJout', as this reduces the number of ”spurious” junctions )
     --outFilterType: 'BySJout'
+    # type of read ends alignment: force end-to-end read alignment, do not soft-clip
+    --alignEndsType: 'EndToEnd'
+    #  extract junctions, insert them into the genome index and re-map reads in a 2nd mapping pass
+    --twopassMode: Basic
+    # alignments (all of them) will be output only if the read maps to no more loci than 10
+    --outFilterMultimapNmax: '10'
 
 quantification_salmon:
     # correct for sequence specific biases](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias
diff --git a/tests/input_files/samples.tsv b/tests/input_files/samples.tsv
index 4b26302..d74233f 100644
--- a/tests/input_files/samples.tsv
+++ b/tests/input_files/samples.tsv
@@ -1,3 +1,3 @@
-sample	seqmode	fq1	index_size	kmer	fq1_3p	fq1_5p	organism	gtf	genome	sd	mean	multimappers	soft_clip	pass_mode	libtype	fq1_polya_3p	fq1_polya_5p	kallisto_directionality	alfa_directionality	alfa_plus	alfa_minus	fq2	fq2_3p	fq2_5p	fq2_polya_3p	fq2_polya_5p
-synthetic_10_reads_paired_synthetic_10_reads_paired	pe	../input_files/project1/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	../input_files/homo_sapiens/annotation.gtf	../input_files/homo_sapiens/genome.fa	100	250	10	EndToEnd	None	A	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	--fr	fr-firststrand	str1	str2	../input_files/project1/synthetic.mate_2.fastq.gz	AGATCGGAAGAGCGT	XXXXXXXXXXXXX	XXXXXXXXXXXXXXXXX	TTTTTTTTTTTTTTTTT
-synthetic_10_reads_mate_1_synthetic_10_reads_mate_1	se	../input_files/project2/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	../input_files/homo_sapiens/annotation.gtf	../input_files/homo_sapiens/genome.fa	100	250	10	EndToEnd	None	A	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	--fr	fr-firststrand	str1	str2	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX
+sample	seqmode	fq1	index_size	kmer	fq1_3p	fq1_5p	organism	gtf	genome	sd	mean	libtype	fq1_polya_3p	fq1_polya_5p	kallisto_directionality	alfa_directionality	alfa_plus	alfa_minus	fq2	fq2_3p	fq2_5p	fq2_polya_3p	fq2_polya_5p
+synthetic_10_reads_paired_synthetic_10_reads_paired	pe	../input_files/project1/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	../input_files/homo_sapiens/annotation.gtf	../input_files/homo_sapiens/genome.fa	100	250	A	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	--fr	fr-firststrand	str1	str2	../input_files/project1/synthetic.mate_2.fastq.gz	AGATCGGAAGAGCGT	XXXXXXXXXXXXX	XXXXXXXXXXXXXXXXX	TTTTTTTTTTTTTTTTT
+synthetic_10_reads_mate_1_synthetic_10_reads_mate_1	se	../input_files/project2/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	../input_files/homo_sapiens/annotation.gtf	../input_files/homo_sapiens/genome.fa	100	250	A	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	--fr	fr-firststrand	str1	str2	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX
diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk
index 37c5546..1d03822 100644
--- a/workflow/rules/paired_end.snakefile.smk
+++ b/workflow/rules/paired_end.snakefile.smk
@@ -251,36 +251,18 @@ rule pe_map_genome_star:
             "{sample}",
             "map_genome",
             "{sample}.pe."),
-        multimappers = lambda wildcards:
-            get_sample(
-                'multimappers',
-                search_id='index',
-                search_value=wildcards.sample),
-        soft_clip = lambda wildcards:
-            get_sample(
-                'soft_clip',
-                search_id='index',
-                search_value=wildcards.sample),
-        pass_mode = lambda wildcards:
-            get_sample(
-                'pass_mode',
-                search_id='index',
-                search_value=wildcards.sample),
         additional_params = parse_rule_config(
             rule_config,
             current_rule=current_rule,
             immutable=(
-                '--twopassMode',
                 '--genomeDir',
                 '--readFilesIn',
                 '--readFilesCommand',
-                '--outFilterMultimapNmax',
                 '--outFileNamePrefix',
                 '--outSAMattributes',
                 '--outStd',
                 '--outSAMtype',
                 '--outSAMattrRGline',
-                '--alignEndsType',
                 )
             )
 
@@ -301,18 +283,15 @@ rule pe_map_genome_star:
 
     shell:
         "(STAR \
-        --twopassMode {params.pass_mode} \
         --runThreadN {threads} \
         --genomeDir {params.index} \
         --readFilesIn {input.reads1} {input.reads2} \
         --readFilesCommand zcat \
-        --outFilterMultimapNmax {params.multimappers} \
         --outFileNamePrefix {params.outFileNamePrefix} \
         --outSAMattributes All \
         --outStd BAM_SortedByCoordinate \
         --outSAMtype BAM SortedByCoordinate \
         --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
-        --alignEndsType {params.soft_clip} \
         {params.additional_params} \
         > {output.bam};) \
         2> {log.stderr}"
diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk
index e270aba..6898c13 100644
--- a/workflow/rules/single_end.snakefile.smk
+++ b/workflow/rules/single_end.snakefile.smk
@@ -195,36 +195,18 @@ rule map_genome_star:
             "{sample}",
             "map_genome",
             "{sample}.se."),
-        multimappers = lambda wildcards:
-            get_sample(
-                'multimappers',
-                search_id='index',
-                search_value=wildcards.sample),
-        soft_clip = lambda wildcards:
-            get_sample(
-                'soft_clip',
-                search_id='index',
-                search_value=wildcards.sample),
-        pass_mode = lambda wildcards:
-            get_sample(
-                'pass_mode',
-                search_id='index',
-                search_value=wildcards.sample),
         additional_params = parse_rule_config(
             rule_config,
             current_rule=current_rule,
             immutable=(
-                '--twopassMode',
                 '--genomeDir',
                 '--readFilesIn',
                 '--readFilesCommand',
-                '--outFilterMultimapNmax',
                 '--outFileNamePrefix',
                 '--outSAMattributes',
                 '--outStd',
                 '--outSAMtype',
                 '--outSAMattrRGline',
-                '--alignEndsType',
                 )
             )
 
@@ -245,18 +227,15 @@ rule map_genome_star:
 
     shell:
         "(STAR \
-        --twopassMode {params.pass_mode} \
         --runThreadN {threads} \
         --genomeDir {params.index} \
         --readFilesIn {input.reads} \
         --readFilesCommand zcat \
-        --outFilterMultimapNmax {params.multimappers} \
         --outFileNamePrefix {params.outFileNamePrefix} \
         --outSAMattributes All \
         --outStd BAM_SortedByCoordinate \
         --outSAMtype BAM SortedByCoordinate \
         --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
-        --alignEndsType {params.soft_clip} \
         {params.additional_params} \
         > {output.bam};) \
         2> {log.stderr}"
-- 
GitLab