Update salmon transcriptome index generation

4e3cac05 · BIOPZ-Bak Maciej · Alex Kanitz · 4b1b5941 · 4e3cac05 · 4e3cac05
Commit 4e3cac05 authored 4 years ago by BIOPZ-Bak Maciej Committed by Alex Kanitz 4 years ago
--- a/Snakefile
+++ b/Snakefile
@@ -121,6 +121,7 @@ rule create_index_star:
        --sjdbGTFfile {input.gtf}) \
        1> {log.stdout} 2> {log.stderr}"

+
 rule extract_transcriptome:
    """ Create transcriptome from genome and gene annotations """
    input:
@@ -154,9 +155,49 @@ rule extract_transcriptome:
        -g {input.genome} {input.gtf}) \
        1> {log.stdout} 2> {log.stderr}"

-rule create_index_salmon:
+
+rule extract_decoys_salmon:
    """
-        Create index for Salmon quantification
+        Extract names of the genome targets
+    """
+    input:
+        genome = lambda wildcards:
+            samples_table['genome']
+            [samples_table['organism'] == wildcards.organism]
+            [0],
+
+    output:
+        decoys = os.path.join(
+                config['output_dir'],
+                "transcriptome",
+                "{organism}",
+                "decoys.txt")
+
+    singularity:
+        "docker://bash:5.0.16"
+
+    log:
+        stderr = os.path.join(
+            config['log_dir'],
+            "{organism}_extract_decoys_salmon.stderr.log"),
+        stdout = os.path.join(
+            config['log_dir'],
+            "{organism}_extract_decoys_salmon.stdout.log")
+
+    threads: 1
+
+    shell:
+        """
+        (grep "^>" <{input.genome} \
+        | cut -d " " -f 1 > {output.decoys} && \
+        sed -i.bak -e 's/>//g' {output.decoys}) \
+        1> {log.stdout} 2> {log.stderr}
+        """
+
+
+rule concatenate_transcriptome_and_genome:
+    """
+        Concatenate genome and transcriptome
    """
    input:
        transcriptome = os.path.join(
@@ -164,7 +205,53 @@ rule create_index_salmon:
                "transcriptome",
                "{organism}",
                "transcriptome.fa",
+            ),
+        genome = lambda wildcards:
+            samples_table['genome']
+            [samples_table['organism'] == wildcards.organism]
+            [0],
+
+    output:
+        genome_transcriptome = os.path.join(
+                config['output_dir'],
+                "transcriptome",
+                "{organism}",
+                "genome_transcriptome.fa",
            )
+
+    singularity:
+        "docker://bash:5.0.16"
+
+    log:
+        stderr = os.path.join(
+            config['log_dir'],
+            "{organism}_concatenate_transcriptome_and_genome.stderr.log"),
+        stdout = os.path.join(
+            config['log_dir'],
+            "{organism}_concatenate_transcriptome_and_genome.stdout.log")
+
+    shell:
+        "(cat {input.transcriptome} {input.genome} \
+        1> {output.genome_transcriptome}) \
+        1> {log.stdout} 2> {log.stderr}"
+
+
+rule create_index_salmon:
+    """
+        Create index for Salmon quantification
+    """
+    input:
+        genome_transcriptome = os.path.join(
+                config['output_dir'],
+                "transcriptome",
+                "{organism}",
+                "genome_transcriptome.fa",
+            ),
+        decoys = os.path.join(
+                config['output_dir'],
+                "transcriptome",
+                "{organism}",
+                "decoys.txt")
    output:
        index = directory(
            os.path.join(
@@ -191,7 +278,8 @@ rule create_index_salmon:

    shell:
        "(salmon index \
-        --transcripts {input.transcriptome} \
+        --transcripts {input.genome_transcriptome} \
+        --decoys {input.decoys} \
        --index {output.index} \
        --kmerLen {params.kmerLen} \
        --threads {threads}) \

--- a/images/dag_test_workflow.svg
+++ b/images/dag_test_workflow.svg
--- a/images/rule_graph.svg
+++ b/images/rule_graph.svg
--- a/pipeline_documentation.md
+++ b/pipeline_documentation.md
@@ -7,6 +7,8 @@ This document describes the individual rules of the pipeline for information pur
 * create log directories
 * **create_index_star**
 * **extract_transcriptome**
+* **extract_decoys_salmon**
+* **concatenate_transcriptome_and_genome**
 * **create_index_salmon**
 * **create_index_kallisto**
 * **extract_transcripts_as_bed12**
@@ -98,7 +100,23 @@ Create transcriptome from genome and gene annotations using [gffread](https://gi
 **Input:** `genome` and `gtf` of the input samples table    
 **Output:** transcriptome fasta file.    

- 
+
+#### extract_decoys_salmon
+Salmon's decoy-aware indexing requires the names of the genome targets, i.e. the genome sequences used as decoys (https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/). This rule extracts those target names from the FASTA header lines of the genome.
+
+
+**Input:** `genome` of the input samples table    
+**Output:** text file with the genome target names   
+
+
+#### concatenate_transcriptome_and_genome
+Salmon indexing requires a single reference file in which the transcriptome and the genome are concatenated (https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/). This rule writes the transcriptome first, followed by the genome.
+
+
+**Input:** `genome` of the input samples table and extracted transcriptome    
+**Output:** fasta file with concatenated genome and transcriptome   
+
+
 #### create_index_salmon
 Create index for [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) quantification. Salmon index of transcriptome, required for mapping-based mode of Salmon. The index is created via an auxiliary k-mer hash over k-mers of length 31. While mapping algorithms will make use of arbitrarily long matches between the query and reference, the k-mer size selected here will act as the minimum acceptable length for a valid match.  A k-mer size of 31 seems to work well for reads of 75bp or longer, although smaller size might improve sensitivity. A smaller k-mer size is suggested when working with shorter reads.


--- a/tests/input_files/cluster.json
+++ b/tests/input_files/cluster.json
@@ -26,6 +26,18 @@
    "threads":"1",
    "mem":"1G"
  },
+  "extract_decoys_salmon":
+  {
+    "time": "00:30:00",
+    "threads":"1",
+    "mem":"10G"
+  },
+  "concatenate_transcriptome_and_genome":
+  {
+    "time": "00:30:00",
+    "threads":"1",
+    "mem":"10G"
+  },
  "create_index_salmon":
  {
    "time": "03:00:00",

--- a/tests/test_integration_workflow/expected_output.md5
+++ b/tests/test_integration_workflow/expected_output.md5
 cbaebdb67aee4784b64aff7fec9fda42  results/kallisto_indexes/homo_sapiens/kallisto.idx
 0ac1afd9a4f380afd70be75b21814c64  results/salmon_indexes/homo_sapiens/31/salmon.idx/versionInfo.json
 51b5292e3a874119c0e1aa566e95d70c  results/salmon_indexes/homo_sapiens/31/salmon.idx/duplicate_clusters.tsv
-4c1ab7841bbd1a1e8e3b15e7750ecc38  results/salmon_indexes/homo_sapiens/31/salmon.idx/info.json
+7f8679a6e6622e1b611642b5735f357c  results/salmon_indexes/homo_sapiens/31/salmon.idx/info.json
 dee7cdc194d5d0617552b7a3b5ad8dfb  results/star_indexes/homo_sapiens/75/STAR_index/chrLength.txt
 8e2e96e2d6b7f29940ad5de40662b7cb  results/star_indexes/homo_sapiens/75/STAR_index/chrNameLength.txt
 d0826904b8afa45352906ad9591f2bfb  results/star_indexes/homo_sapiens/75/STAR_index/chrName.txt