From 90b58bfa1befd5986b2e1531f826c850d20f0274 Mon Sep 17 00:00:00 2001 From: BIOPZ-Herrmann Christina <christina.herrmann@unibas.ch> Date: Fri, 20 Aug 2021 09:44:08 +0000 Subject: [PATCH] Collapse directionality params in sample table --- README.md | 3 + pipeline_documentation.md | 13 ++- scripts/prepare_inputs.py | 95 ++++++------------- tests/input_files/samples.multiple_lanes.tsv | 10 +- tests/input_files/samples.tsv | 6 +- tests/input_files/samples_alfa.tsv | 10 +- .../expected_output.md5 | 2 +- workflow/Snakefile | 51 ++++++++-- workflow/rules/paired_end.snakefile.smk | 10 +- workflow/rules/single_end.snakefile.smk | 10 +- 10 files changed, 105 insertions(+), 105 deletions(-) diff --git a/README.md b/README.md index deeeda1..6bd20fa 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,9 @@ files should look like, specifically: - [samples.tsv](tests/input_files/samples.tsv) - [config.yaml](tests/input_files/config.yaml) + - For more details and explanations, refer to the [pipeline-documentation] + + 4. Create a runner script. Pick one of the following choices for either local or cluster execution. Before execution of the respective command, you need to remember to update the argument of the `--singularity-args` option of a diff --git a/pipeline_documentation.md b/pipeline_documentation.md index bbef50f..070b366 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -119,8 +119,7 @@ mean | Required for [kallisto](#third-party-software-used) and [Salmon](#third-p multimappers | Required for [STAR](#third-party-software-used). Maximum number of multiple alignments allowed for a read; if exceeded, the read is considered unmapped. | `int` soft_clip | Required for [STAR](#third-party-software-used). One of `Local` (standard local alignment with soft-clipping allowed) or `EndToEnd` (force end-to-end read alignment, do not soft-clip). | `str` pass_mode | Required for [STAR](#third-party-software-used). One of `None` (1-pass mapping) or `Basic` (basic 2-pass mapping, with all 1st-pass junctions inserted into the genome indices on the fly). | `str` -libtype | Required for [Salmon](#third-party-software-used). See [Salmon manual][docs-salmon] for allowed values. If in doubt, enter `A` to automatically infer the library type. | `str` -kallisto_directionality | Required for [kallisto](#third-party-software-used) and [ALFA](#third-party-software-used). One of `--fr-stranded` (strand-specific reads, first read forward) and `--rf-stranded` (strand-specific reads, first read reverse) | `str` +libtype | Required for [Salmon](#third-party-software-used), and, after internal conversion, for [kallisto](#third-party-software-used) and [ALFA](#third-party-software-used) . See [Salmon manual][docs-salmon] for allowed values. **WARNING**: do *NOT* use `A` to automatically infer the salmon library type, this will cause kallisto and ALFA to fail. | `str` fq1_polya3p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 3' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str` fq1_polya5p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 5' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. | `str` fq2_polya3p | Required for [Cutadapt](#third-party-software-used). Stretch of `A`s or `T`s, depending on read orientation. Trimmed from the 3' end of the read. Use value such as `XXXXXXXXXXXXXXX` if no poly(A) stretch present or if no trimming is desired. Value ignored for single-end libraries. | `str` @@ -299,7 +298,7 @@ Rename and copy stranded bedGraph coverage tracks such that they comply with > Local rule > > Renaming to `plus.bg` and `minus.bg` depends on library orientation, which is -> provided by user in sample table column `kallisto_directionality`. +> provided by user in sample table column `libtype`. - **Input** - Coverage file (`.bg`); from [**star_rpm**](#star_rpm) @@ -465,15 +464,15 @@ Create index for [**ALFA**](#third-party-software-used). Annotate alignments with [**ALFA**](#third-party-software-used). -> For details on output plots, see [ALFA documentation][docs-alfa]. +> For details on output plots, see [ALFA documentation][docs-alfa]. +> Note: the read orientation of a sample will be inferred from salmon `libtype` specified in `samples.tsv` - **Input** - Coverage files, renamed (`.bg`); from [**rename_star_rpm_for_alfa**](#rename_star_rpm_for_alfa) - ALFA index, stranded; from [**generate_alfa_index**](#generate_alfa_index) - **Parameters** - - **samples.tsv** - - `-s`: library orientation; specified by user in sample table column `kallisto_directionality` + - **Output** - Figures for biotypes and feature categories (`.pdf`) - Feature counts table (custom `.tsv`); used in @@ -665,6 +664,7 @@ Estimate transcript- and gene-level expression with Generate pseudoalignments of reads to transcripts with [**kallisto**](#third-party-software-used). +> Note: the kallisto strandedness parameter will be inferred from salmon `libtype` specified in `samples.tsv` - **Input** - Reads file (`.fastq.gz`); from @@ -672,7 +672,6 @@ Generate pseudoalignments of reads to transcripts with - Index; from [**create_index_kallisto**](#create_index_kallisto) - **Parameters** - **samples.tsv** - - `directionality`; specify in sample table column `kallisto_directionality` - `-l`: mean of distribution of fragment lengths; specify in sample table column `mean` **(single-end only)** - `-s`: standard deviation of distribution of fragment lengths; specify in sample table column `sd` **(single-end only)** - **Output** diff --git a/scripts/prepare_inputs.py b/scripts/prepare_inputs.py index 426902a..bc164aa 100755 --- a/scripts/prepare_inputs.py +++ b/scripts/prepare_inputs.py @@ -137,9 +137,11 @@ def parse_cli_args() -> argparse.Namespace: behavior.add_argument( "--libtype", type=str, - default="A", - help="Salmon: library type (default: %(default)s)", + default="", + help="Salmon library type (default: %(default)s). Leave empty to infer one of 'SF', 'SR', 'ISF', 'ISR'." + "Warning: If value is provided by user, it will be applied to ALL samples. If the table contains samples from different sequencing modes this might cause errors in zarp.", metavar="STR", + choices=["", "SF", "SR", "ISF", "ISR", "OSF", "OSR", "MSF", "MSR"] ) report = parser.add_argument_group("report") @@ -311,72 +313,37 @@ def kmer_from_read_length( return k -def get_strand_param_kallisto(directionality: str) -> str: +def get_libtype(directionality: str, seqmode: str) -> str: """ - Returns appropriate strand info parameter for kallisto - (https://pachterlab.github.io/kallisto/), given a string indicating the - "directionality" of a sequencing library. + Returns libtype (https://salmon.readthedocs.io/en/latest/library_type.html) given strings indicating the + "directionality", and sequencing mode of a sequencing library, respectively. :param directionality: direction in which library was sequenced (one of "SENSE" and "ANTISENSE") - :returns: appropriate kallisto option for specified directionality; an - empty string is returned if the directionality value is empty or not - recognized - """ - if directionality == "SENSE": - option = "--fr" - elif directionality == "ANTISENSE": - option = "--rf" - else: - option = "" - return option - + :param seqmode: sequencing mode(one of + "pe" and "se") -def get_strand_param_alfa(directionality: str) -> str: + :returns: salmon code (one of 'SF', 'SR', 'ISF', 'ISR') for specified directionality; """ - Returns appropriate strand info parameter for ALFA - (https://github.com/biocompibens/ALFA), given a string indicating the - "directionality" of a sequencing library. - :param directionality: direction in which library was sequenced (one of - "SENSE" and "ANTISENSE") - - :returns: appropriate ALFA option for specified directionality; an empty - string is returned if the directionality value is empty or not - recognized - """ - if directionality == 'SENSE': - option = 'fr-firststrand' - elif directionality == 'ANTISENSE': - option = 'fr-secondstrand' + if seqmode == "pe": + option = "I" else: - option = '' - return option + option = "" - -def get_strand_names_alfa(directionality: str) -> Tuple[str, str]: - """ - Returns appropriate strand name suffixes for ALFA - (https://github.com/biocompibens/ALFA), given a string indicating the - "directionality" of a sequencing library. - - :param directionality: direction in which library was sequenced (one of - "SENSE" and "ANTISENSE") - - :returns: tuple of ALFA strand name suffixes for two coverage tracks of a - paired-end sequencing library - """ if directionality == "SENSE": - plus = "str1" - minus = "str2" + option += "SF" elif directionality == "ANTISENSE": - minus = "str1" - plus = "str2" + option += "SR" else: - plus = "" - minus = "" - return (plus, minus) + logger.error( + f"[ERROR] Can't infer library type." + f"Make sure directionality and sequencing mode are specified correctly." + ) + sys.exit("Execution aborted.") + + return option def get_polya_adapter_seqs(directionality: str) -> Tuple[str, str]: @@ -583,9 +550,6 @@ def main(args): mean = lk_mean fq1_polya_3p, fq1_polya_5p = get_polya_adapter_seqs(lk_mate1_direction) fq2_polya_3p, fq2_polya_5p = get_polya_adapter_seqs(lk_mate2_direction) - kallisto_directionality = get_strand_param_kallisto(lk_mate1_direction) - alfa_directionality = get_strand_param_alfa(lk_mate1_direction) - alfa_plus, alfa_minus = get_strand_names_alfa(lk_mate1_direction) # construct row in Snakemake input table snakemake_table.loc[index, 'sample'] = sample @@ -603,17 +567,20 @@ def main(args): snakemake_table.loc[index, 'genome'] = genome snakemake_table.loc[index, 'sd'] = sd snakemake_table.loc[index, 'mean'] = mean - snakemake_table.loc[index, 'kallisto_directionality'] = \ - kallisto_directionality - snakemake_table.loc[index, 'alfa_directionality'] = alfa_directionality - snakemake_table.loc[index, 'alfa_plus'] = alfa_plus - snakemake_table.loc[index, 'alfa_minus'] = alfa_minus # add CLI argument-dependent parameters snakemake_table.loc[index, 'multimappers'] = args.multimappers snakemake_table.loc[index, 'soft_clip'] = args.soft_clip snakemake_table.loc[index, 'pass_mode'] = args.pass_mode - snakemake_table.loc[index, 'libtype'] = args.libtype + + if not args.libtype: + snakemake_table.loc[index, 'libtype'] = get_libtype(lk_mate1_direction, seqmode) + elif args.libtype in ['SF', 'SR', 'ISF', 'ISR', 'OSF', 'OSR', 'MSF', 'MSR']: + snakemake_table.loc[index, 'libtype'] = args.libtype + logger.warning( + f"Library type {args.libtype} set for sample {sample}." + ) + if args.trim_polya is True: snakemake_table.loc[index, 'fq1_polya_3p'] = fq1_polya_3p snakemake_table.loc[index, 'fq1_polya_5p'] = fq1_polya_5p diff --git a/tests/input_files/samples.multiple_lanes.tsv b/tests/input_files/samples.multiple_lanes.tsv index 7968fb4..6b32ab8 100644 --- a/tests/input_files/samples.multiple_lanes.tsv +++ b/tests/input_files/samples.multiple_lanes.tsv @@ -1,5 +1,5 @@ -sample seqmode fq1 fq2 index_size kmer fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean kallisto_directionality alfa_directionality alfa_plus alfa_minus multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p fq2_polya_3p fq2_polya_5p -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane1/synthetic_split_lane1.mate_1.fastq.gz ../input_files/pe_lane1/synthetic_split_lane1.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 --fr fr-firststrand str1 str2 10 EndToEnd None A AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane2/synthetic_split_lane2.mate_1.fastq.gz ../input_files/pe_lane2/synthetic_split_lane2.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 --fr fr-firststrand str1 str2 10 EndToEnd None A AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane1/synthetic_split_lane1.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 --fr fr-firststrand str1 str2 10 EndToEnd None A AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane2/synthetic_split_lane2.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 --fr fr-firststrand str1 str2 10 EndToEnd None A AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX +sample seqmode fq1 fq2 index_size kmer fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane1/synthetic_split_lane1.mate_1.fastq.gz ../input_files/pe_lane1/synthetic_split_lane1.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/pe_lane2/synthetic_split_lane2.mate_1.fastq.gz ../input_files/pe_lane2/synthetic_split_lane2.mate_2.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX TTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane1/synthetic_split_lane1.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/se_lane2/synthetic_split_lane2.mate_1.fastq.gz XXXXXXXXXXXXXXX 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAA XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXX diff --git a/tests/input_files/samples.tsv b/tests/input_files/samples.tsv index 4b26302..58ac18f 100644 --- a/tests/input_files/samples.tsv +++ b/tests/input_files/samples.tsv @@ -1,3 +1,3 @@ -sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p kallisto_directionality alfa_directionality alfa_plus alfa_minus fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p -synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT -synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None A AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX --fr fr-firststrand str1 str2 XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX +sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya_3p fq1_polya_5p fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe ../input_files/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX ../input_files/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se ../input_files/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/annotation.gtf ../input_files/homo_sapiens/genome.fa 100 250 10 EndToEnd None SF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX diff --git a/tests/input_files/samples_alfa.tsv b/tests/input_files/samples_alfa.tsv index af1835c..9ef1c3c 100644 --- a/tests/input_files/samples_alfa.tsv +++ b/tests/input_files/samples_alfa.tsv @@ -1,5 +1,5 @@ -sample seqmode fq1 index_size kmer fq2 fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype kallisto_directionality fq1_polya fq2_polya alfa_directionality alfa_plus alfa_minus -paired_end_R1_on_plus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX GATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None A --fr AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT fr-firststrand str1 str2 -paired_end_R1_on_plus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None A --rf AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT fr-secondstrand str2 str1 -paired_end_R1_on_minus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None A --fr AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT fr-firststrand str1 str2 -paired_end_R1_on_minus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None A --rf AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT fr-secondstrand str2 str1 +sample seqmode fq1 index_size kmer fq2 fq1_3p fq1_5p fq2_3p fq2_5p organism gtf genome sd mean multimappers soft_clip pass_mode libtype fq1_polya fq2_polya +paired_end_R1_on_plus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX GATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_plus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_minus_sense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISF AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT +paired_end_R1_on_minus_antisense pe XXXXXXXXXXXXX 75 31 XXXXXXXXXXXXX AGATCGGAAGAGCACA XXXXXXXXXXXXX AGATCGGAAGAGCGT XXXXXXXXXXXXX homo_sapiens ../input_files/homo_sapiens/quick_start.gtf XXXXXXXXXXXXX 100 250 10 EndToEnd None ISR AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTT diff --git a/tests/test_scripts_prepare_inputs_table/expected_output.md5 b/tests/test_scripts_prepare_inputs_table/expected_output.md5 index 53afed5..aca34c0 100644 --- a/tests/test_scripts_prepare_inputs_table/expected_output.md5 +++ b/tests/test_scripts_prepare_inputs_table/expected_output.md5 @@ -1,2 +1,2 @@ 40bd0f0fcecdd0d9bc932f63c2811478 config.yaml -c8dcc5a203e9046806c4090525960151 samples.tsv +d8fb1773e3b83b6fab0a0d44c9fa71e6 samples.tsv \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 52317cb..198ae95 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -18,6 +18,19 @@ samples_table = pd.read_csv( sep="\t", ) +# dict for translation of "directionality parameters" +directionality_dict = { + "SF": + {"kallisto":"--fr-stranded", + "alfa": "fr-secondstrand", + "alfa_plus": "str1", + "alfa_minus": "str2"}, + "SR": + {"kallisto":"--rf-stranded", + "alfa": "fr-firststrand", + "alfa_plus": "str2", + "alfa_minus": "str1"}, +} # Validat config validate(config, os.path.join("..", "resources", "config_schema.json")) logger.info(f'Config file after validation: {config}') @@ -50,6 +63,24 @@ def get_sample(column_id, search_id=None, search_value=None): else: return str(samples_table[column_id][0]) +def get_directionality(libtype, tool): + """ Get directionality value for different tools""" + directionality ="" + + for key in directionality_dict.keys(): + # Use the first of 'SF' or 'SR' that is found in libtype to look up directionality params for the current tool + if key in libtype: + directionality = directionality_dict[key][tool] + break + + # If libtype contains neither 'SF', nor 'SR' we don't know what to do + if not directionality: + raise ValueError( + f"Unknown libtype {libtype}.\n" + "Provide one of 'SF', 'SR', 'ISF', 'ISR', 'OSF', 'OSR', 'MSF', 'MSR' in samples.tsv!" + ) + + return directionality def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()): """Get rule specific parameters from rule_config file""" @@ -1314,10 +1345,10 @@ rule rename_star_rpm_for_alfa: "{sample}_Signal.{unique}.{plus}.out.bg"), sample=wildcards.sample, unique=wildcards.unique, - plus=get_sample( - 'alfa_plus', + plus=get_directionality(get_sample( + 'libtype', search_id='index', - search_value=wildcards.sample)), + search_value=wildcards.sample),"alfa_plus")), minus = lambda wildcards: expand( os.path.join( @@ -1328,10 +1359,10 @@ rule rename_star_rpm_for_alfa: "{sample}_Signal.{unique}.{minus}.out.bg"), sample=wildcards.sample, unique=wildcards.unique, - minus=get_sample( - 'alfa_minus', + minus=get_directionality(get_sample( + 'libtype', search_id='index', - search_value=wildcards.sample)) + search_value=wildcards.sample), "alfa_minus")) output: plus = temp(os.path.join( @@ -1514,10 +1545,10 @@ rule alfa_qc: os.path.basename(input.minus), name = "{sample}", alfa_orientation = lambda wildcards: - get_sample( - 'alfa_directionality', - search_id='index', - search_value=wildcards.sample), + get_directionality(get_sample( + 'libtype', + search_id='index', + search_value=wildcards.sample),"alfa"), additional_params = parse_rule_config( rule_config, current_rule=current_rule, diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index d0a9bc2..081fa53 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -496,10 +496,10 @@ rule pe_genome_quantification_kallisto: "{sample}", "quant_kallisto"), directionality = lambda wildcards: - get_sample( - 'kallisto_directionality', - search_id='index', - search_value=wildcards.sample), + get_directionality(get_sample( + 'libtype', + search_id='index', + search_value=wildcards.sample),"kallisto"), additional_params = parse_rule_config( rule_config, current_rule=current_rule, @@ -536,7 +536,7 @@ rule pe_genome_quantification_kallisto: -i {input.index} \ -o {params.output_dir} \ -t {threads} \ - {params.directionality}-stranded \ + {params.directionality} \ {params.additional_params} \ --pseudobam \ {input.reads1} {input.reads2} > {output.pseudoalignment}) \ diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index 7dfb2c2..c64d71d 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -449,10 +449,10 @@ rule genome_quantification_kallisto: search_id='index', search_value=wildcards.sample), directionality = lambda wildcards: - get_sample( - 'kallisto_directionality', - search_id='index', - search_value=wildcards.sample), + get_directionality(get_sample( + 'libtype', + search_id='index', + search_value=wildcards.sample),"kallisto"), additional_params = parse_rule_config( rule_config, current_rule=current_rule, @@ -491,7 +491,7 @@ rule genome_quantification_kallisto: -l {params.fraglen} \ -s {params.fragsd} \ -t {threads} \ - {params.directionality}-stranded \ + {params.directionality} \ {params.additional_params} \ --pseudobam \ {input.reads} > {output.pseudoalignment};) \ -- GitLab