From 8a1129a8692cd893aebe56d0062d662608c6e91d Mon Sep 17 00:00:00 2001 From: Alex Kanitz <alexander.kanitz@unibas.ch> Date: Thu, 7 Jul 2022 10:44:48 +0000 Subject: [PATCH] feat: merge mapping workflow --- .gitignore | 2 + .gitlab-ci.yml | 4 +- LICENSE | 2 +- README.md | 294 ++++-- RUNS/JOB/map/cluster.json | 67 ++ RUNS/JOB/map/config.yaml | 47 + RUNS/JOB/map/intermediate_files.txt | 25 + RUNS/JOB/map/run_workflow_local.sh | 35 + RUNS/JOB/map/run_workflow_slurm.sh | 49 + .../cluster.json | 0 .../config.yaml | 15 +- .../run_workflow_local.sh | 4 +- .../run_workflow_slurm.sh | 4 +- environment.root.yml | 2 +- environment.yml | 2 +- images/rule_graph_map.svg | 385 ++++++++ ..._annotation.svg => rule_graph_prepare.svg} | 56 +- images/workflow_dag_map.svg | 387 ++++++++ ...nnotation.svg => workflow_dag_prepare.svg} | 76 +- scripts/blocksort.sh | 29 + scripts/nh_filter.py | 39 + scripts/oligomapOutputToSam_nhfiltered.py | 237 +++++ ...es_inferior_alignments_multimappers.1_5.pl | 311 +++++++ scripts/sam_trx_to_sam_gen.pl | 838 ++++++++++++++++++ scripts/sam_uncollapse.pl | 164 ++++ test/cluster_map.json | 67 ++ ...e_annotation.jsob => cluster_prepare.json} | 0 test/config_map.yaml | 45 + ...re_annotation.yaml => config_prepare.yaml} | 16 +- test/expected_output.files | 52 +- test/expected_output.md5 | 52 +- test/test_cleanup.sh | 23 + test/test_dag.sh | 20 +- test/test_files/test_lib.fa.gz | Bin 0 -> 4492 bytes test/test_rule_graph.sh | 20 +- test/test_workflow_local.sh | 43 +- test/test_workflow_slurm.sh | 62 +- workflow/map/Snakefile | 647 ++++++++++++++ workflow/prepare/Snakefile | 661 ++++++++++++++ workflow/prepare_annotation/Snakefile | 470 ---------- 40 files changed, 4540 insertions(+), 712 deletions(-) create mode 100644 RUNS/JOB/map/cluster.json create mode 100644 RUNS/JOB/map/config.yaml create mode 100644 RUNS/JOB/map/intermediate_files.txt create mode 100755 RUNS/JOB/map/run_workflow_local.sh create mode 100755 RUNS/JOB/map/run_workflow_slurm.sh rename 
RUNS/JOB/{prepare_annotation => prepare}/cluster.json (100%) rename RUNS/JOB/{prepare_annotation => prepare}/config.yaml (89%) rename RUNS/JOB/{prepare_annotation => prepare}/run_workflow_local.sh (85%) rename RUNS/JOB/{prepare_annotation => prepare}/run_workflow_slurm.sh (91%) create mode 100644 images/rule_graph_map.svg rename images/{rule_graph_prepare_annotation.svg => rule_graph_prepare.svg} (93%) create mode 100644 images/workflow_dag_map.svg rename images/{workflow_dag_prepare_annotation.svg => workflow_dag_prepare.svg} (97%) create mode 100755 scripts/blocksort.sh create mode 100755 scripts/nh_filter.py create mode 100755 scripts/oligomapOutputToSam_nhfiltered.py create mode 100755 scripts/sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl create mode 100755 scripts/sam_trx_to_sam_gen.pl create mode 100755 scripts/sam_uncollapse.pl create mode 100644 test/cluster_map.json rename test/{cluster_prepare_annotation.jsob => cluster_prepare.json} (100%) create mode 100644 test/config_map.yaml rename test/{config_prepare_annotation.yaml => config_prepare.yaml} (79%) create mode 100755 test/test_cleanup.sh create mode 100644 test/test_files/test_lib.fa.gz create mode 100644 workflow/map/Snakefile create mode 100644 workflow/prepare/Snakefile delete mode 100644 workflow/prepare_annotation/Snakefile diff --git a/.gitignore b/.gitignore index b7175b0..10ca242 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,5 @@ results logs nohup.out .wget-hsts +wget-log* +snakemake_report_*.html diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0471a0d..2e38272 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,9 @@ before_script: test: script: + - bash test/test_workflow_local.sh - bash test/test_dag.sh - bash test/test_rule_graph.sh - - bash test/test_workflow_local.sh +after_script: + - bash test/test_cleanup.sh diff --git a/LICENSE b/LICENSE index 087df66..a2f050f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 
AnnotationPipelines +Copyright (c) 2019 Zavolab, University of Basel Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index c4a101b..89cce75 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,45 @@ -# mir-prepare-annotation +# _MIRFLOWZ_ -[Snakemake][snakemake] workflow to download and prepare the necessary files for -smallRNA-seq related pipelines [mir-map][mir-map] and [mir-quant][mir-quant]. - -The scheme below is a visual representation of an example run of the workflow: - -> ![rule-graph-prep-anno][rule-graph-prep-anno] +Suite of [Snakemake][snakemake] workflows for the mapping and quantification +of smallRNA-seq libraries, including miRNA and isomiR quantification. ## Installation +All workflows live inside this repository and will be available for you to run +after following the installation instructions laid out in this section. + ### Cloning the repository Traverse to the desired path on your file system, then clone the repository and -move into it with: +change into it with: ```bash -git clone ssh://git@git.scicore.unibas.ch:2222/zavolan_group/pipelines/mir-prepare-annotation.git -cd mir-prepare-annotation +git clone ssh://git@git.scicore.unibas.ch:2222/zavolan_group/pipelines/mirflowz.git +cd mirflowz ``` -### Setting up a virtual environment +### Dependencies + +For improved reproducibility and reusability of the workflows, as well as an +easy means to run them on a high performance computing (HPC) cluster managed, +e.g., by [Slurm][slurm], all steps of the workflows run inside their own +containers. As a consequence, running this workflow has very few individual +dependencies. It does, however, require the package manager [Conda][conda] and +the container engine [Singularity][singularity] to be installed before you +proceed. 
-Workflow dependencies can be conveniently installed with the [Conda][conda] -package manager. We recommend that you install [Miniconda][miniconda] for your -system. +> **NOTE:** If you have root permissions for your system and you do not already +> have `singularity` installed globally on your system, you can conveniently +> use Conda to install it. In that case, replace `environment.yml` with +> `environment.root.yml` in the first command below. -For improved reproducibility and reusability of the workflow, as well as an -easy means to run it on a high performance computing (HPC) cluster managed, -e.g., by [Slurm][slurm], all steps of the workflow run in their own container. -As a consequence, running this workflow has very few individual dependencies. -It does, however, require that the container engine [Singularity][singularity] -is installed. +### Setting up a virtual environment + +If you do not already have [Conda][conda] installed globally on your system, +we recommend that you install [Miniconda][miniconda]. For faster creation of +the environment (and Conda environments in general), you can also install +[Mamba][mamba] on top of Conda. In that case, replace `conda` with `mamba` in +the commands below (particularly in `conda env create`). Create and activate the environment with necessary dependencies with Conda: @@ -39,60 +48,113 @@ conda env create -f environment.yml conda activate mir-pipelines ``` -> **NOTE:** For faster creation of the environment (and Conda environments in -> general), you can also install [Mamba][mamba] on top of Conda. -> -> **NOTE:** If you have root permissions for your system and you do not have -> `singularity` installed globally on your system, you can use Conda to install -> it. In that case, replace `environment.yml` with `environment.root.yml` in -> the first command above. - ### Testing your installation -Several tests are prepared to check the integrity of the workflow. 
+Several tests are provided to check the integrity of the installation. Follow +the instructions in this section to make sure the workflows are ready to use. + +#### Run test workflows on local machine -Change into the test directory: +Execute the following command to run the test workflows on your local machine: ```bash -cd test/ +bash test/test_workflow_local.sh ``` -#### DAG and rule graph +#### Run test workflows via Slurm -Execute the following commands to generate DAG and rule graph images. Outputs -will be found in the `images/` directory. +Execute the following command to run the test workflows on a Slurm-managed +high-performance computing (HPC) cluster: ```bash -./test_dag.sh -./test_rule_graph.sh +bash test/test_workflow_slurm.sh ``` -#### Run workflow on local machine +> **NOTE:** The Slurm tests were configured to run on the developer's cluster. +> Several files may need to be modified if you would like to run tests (and +> the actual workflows) on other systems. These may possibly include the +> following (relative to the repository root directory), but potentially others +> as well: +> +> * `jobscript.sh` +> * `RUNS/JOB/{prepare,map,quantify}/cluster.json` +> * `test/test_workflow_slurm.sh` +> +> Consult the manual of +> your batch scheduling system, as well as the section of the Snakemake manual +> dealing with [cluster execution]. -Execute the following command to run the test workflow on your local machine: +#### DAG and rule graph + +Execute the following commands to generate DAG and rule graph images for each +workflow. Outputs will be found in the `images/` directory in the repository +root. + +> **NOTE:** It is essential that you run the DAG and rule graph tests only +> _after_ running the test workflows. This is because they require files to be +> available that will only be created when running those workflows. 
```bash -./test_workflow_local.sh +bash test/test_dag.sh +bash test/test_rule_graph.sh ``` -#### Run workflow via Slurm +#### Clean up test results -Execute the following command to run the test workflow on a Slurm-managed -high-performance computing (HPC) cluster: +After successfully running the tests above, you can run the following command +to remove all artifacts generated by the test runs: ```bash -./test_workflow_slurm.sh +bash test/test_cleanup.sh ``` -> **NOTE:** This was set up to run on the developer's Slurm cluster. Several -> files may need to be modified on other systems, including `jobscript.sh`, -> `workflow/prepare_annotation/cluster.json` and `test/test_workflow_slurm.sh` -> itself (all relative to the repository's root directory). Consult the manual -> of your batch scheduling system, as well as the section of the Snakemake -> manual dealing with [cluster execution]. - ## Usage +Now that your virtual environment is set up and the workflows are deployed and +tested, you can go ahead and run the workflows on your samples. + +But first, here is a brief description of what each of the three workflows +does: + +### Workflow description + +The repository contains the following workflows, all implemented in Snakemake +and fully containerized: + +#### _PREPARE_ + +The first workflow, **_PREPARE_** downloads and processes "genome resources" +from the publicly available repositories [Ensembl][ensembl] and +[miRBase][mirbase] according to your instructions. Resources are then processed +to prepare indexes and other contingent resources that will be used in later +steps. + +The scheme below is a visual representation of an example run of the +**_PREPARE_** workflow: + +> ![rule-graph-prepare][rule-graph-prepare] + +#### _MAP_ + +The second workflow, **_MAP_** aligns the user-provided short read smallRNA-seq +libraries against the references generated with the **_PREPARE_** workflow. 
For +increased fidelity it uses two separate aligning tools, [Segemehl][segemehl] +and our in-house tool [Oligomap][oligomap]. In both cases, reads are aligned +separately to the genome and the transcriptome. Afterwards, alignments are +merged in a way that only the best alignment (or alignments) of each read are +kept. + +The scheme below is a visual representation of an example run of the **_MAP_** +workflow: + +> ![rule-graph-map][rule-graph-map] + +#### _QUANTIFY_ + +Coming soon... + +### Running the workflows + Assuming that you are currently inside the repository's root directory, change to the run root directory: @@ -100,39 +162,68 @@ to the run root directory: cd RUNS ``` -Now make a clean copy of the `JOB` directory and name it what you want, e.g., -`MY_ANALYSIS`: +Now make a clean copy of the `JOB` directory and name it whatever you want, +e.g., `MY_ANALYSIS`: ```bash cp -r JOB MY_ANALYSIS ``` -Now traverse to the directory from where you will actually execute the pipeline -with: +Now traverse to the new directory. You will see that there are three +subdirectories, one for each workflow, change into the one you would like to +run (probably `prepare`). ```bash -cd MY_ANALYSIS/prepare_annotation +cd MY_ANALYSIS +cd {prepare,map,quantify} ``` -Before running the pipeline adjust the parameters in file -`config_prepare_annotation.yaml`: +Before running the workflow adjust the parameters in file `config.yaml`. The +file explains what each of the parameters means and how you can meaningfully +fill them in. + +To start workflow execution, run: + +```bash +./run_workflow_slurm.sh +``` + +> **NOTE:** Check back in the installation section to find more information on +> how to run the workflows on your HPC system. 
Although we do provide a +> workflow runner to execute the workflows locally (`run_workflow_local.sh`) on +> your laptop or desktop machine, we recommend against that for real-world +> data, as the resource requirements for running the workflows are very high +> (can be >50 Gigs of memory!). + +After successful execution of the workflow, results and logs will be found in +`results/` and `logs/` directories, respectively. + +### Appendix: Configuration files + +_MIRFLOWZ_ comes with template configuration files for each individual +workflow. These contain notes on how to fill in each parameter. + +#### _PREPARE_ + +**File location:** `RUNS/JOB/prepare/config.yaml` ```yaml --- ############################## GLOBAL PARAMETERS ############################## -## Isomirs annotation file -## Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts -bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts - -## Directories -output_dir: "results" +# Directories +# Usually there is no need to change these scripts_dir: "../../../scripts" +output_dir: "results" local_log: "logs/local" cluster_log: "logs/cluster" +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + # List of "organism/prefix" identifiers organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"] @@ -161,33 +252,74 @@ org/pre: # One section for each list item in "organism"; names have to match pr > annotation pipelines. The miRNA annotation file is expected to originate from > miRBase, or follow their exact layout. 
-To start pipeline execution locally: +#### _MAP_ -```bash -./run_workflow_local.sh -``` +**File location:** `RUNS/JOB/map/config.yaml` -To start pipeline execution via Slurm: +```yaml +--- -```bash -./run_workflow_slurm.sh -``` +############################## GLOBAL PARAMETERS ############################## + +# Directories +# Usually there is no need to change these +scripts_dir: "../../../scripts" +output_dir: "results" +local_log: "logs/local" +cluster_log: "logs/cluster" -> *This is strongly recommended due to excessive resource needs of some tools!* +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# Tool parameters: adapter removal +error_rate: 0.1 # fraction of allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# Tool parameters: mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + +# Sample information +input_dir: "path/to/input_directory" +sample: ["sample_1"] # put all samples, separated by comma & without file extension + # (e.g., "sample_1" instead of "sample_1.fa.gz") + +######################## PARAMETERS SPECIFIC TO SAMPLE ######################## + +sample_1: # One section for each 
list item in "sample"; names have to match precisely + adapter: "XXXXXXXXXXXXXXXXXXXX" # 3' adapter sequence to trim + format: "fa" # file format; currently supported: "fa" -After successful execution of the workflow, results and logs will be found in -`results/` and `logs/` directories, respectively. +... +``` + +#### _QUANTIFY_ -> **Note:** See the note in the installation section for configuring workflow -> runs on your individual batch scheduling system, Slurm or otherwise. +Coming soon... [conda]: <https://docs.conda.io/projects/conda/en/latest/index.html> [cluster execution]: <https://snakemake.readthedocs.io/en/stable/executing/cluster-cloud.html#cluster-execution> +[ensembl]: <https://ensembl.org/> [mamba]: <https://github.com/mamba-org/mamba> [miniconda-installation]: <https://docs.conda.io/en/latest/miniconda.html> -[mir-map]: <https://git.scicore.unibas.ch/zavolan_group/pipelines/mir-map> -[mir-quant]: <https://git.scicore.unibas.ch/zavolan_group/pipelines/mir-quant> -[rule-graph-prep-anno]: images/rule_graph_prepare_annotation.svg -[snakemake]: <https://snakemake.readthedocs.io/en/stable/> +[mirbase]: <https://mirbase.org/> +[oligomap]: <https://bio.tools/oligomap> +[rule-graph-prepare]: images/rule_graph_prepare.svg +[rule-graph-map]: images/rule_graph_map.svg +[segemehl]: <https://www.bioinf.uni-leipzig.de/Software/segemehl/> [singularity]: <https://sylabs.io/singularity/> [slurm]: <https://slurm.schedmd.com/documentation.html> +[snakemake]: <https://snakemake.readthedocs.io/en/stable/> diff --git a/RUNS/JOB/map/cluster.json b/RUNS/JOB/map/cluster.json new file mode 100644 index 0000000..f771d03 --- /dev/null +++ b/RUNS/JOB/map/cluster.json @@ -0,0 +1,67 @@ +{ + "__default__" : + { + "queue": "6hours", + "time": "05:00:00", + "threads": "1", + "mem": "4G" + }, + + "cutadapt": + { + "threads":"{resources.threads}" + }, + + "mapping_genome_segemehl": + { + "queue": "1day", + "time": "{resources.time}:00:00", + "threads":"{resources.threads}", + 
"mem":"{resources.mem}G" + }, + + "mapping_transcriptome_segemehl": + { + "queue": "1day", + "time": "{resources.time}:00:00", + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "mapping_genome_oligomap": + { + "time": "{resources.time}:00:00", + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "mapping_transcriptome_oligomap": + { + "time": "{resources.time}:00:00", + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "sort_transcriptome_oligomap": + { + "threads":"{resources.threads}" + }, + + "sort_genome_oligomap": + { + "time": "{resources.time}:00:00", + "threads":"{resources.threads}" + }, + + "oligomap_genome_toSAM": + { + "time": "{resources.time}-00:00:00", + "queue": "{resources.queue}day" + }, + + "remove_inferiors": + { + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + } +} diff --git a/RUNS/JOB/map/config.yaml b/RUNS/JOB/map/config.yaml new file mode 100644 index 0000000..bcd4f7b --- /dev/null +++ b/RUNS/JOB/map/config.yaml @@ -0,0 +1,47 @@ +--- + +############################## GLOBAL PARAMETERS ############################## + +# Directories +# Usually there is no need to change these +scripts_dir: "../../../scripts" +output_dir: "results" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# Tool parameters: adapter removal +error_rate: 0.1 # fraction of 
allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# Tool parameters: mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + +# Sample information +input_dir: "path/to/input_directory" +sample: ["sample_1"] # put all samples, separated by comma & without file extension + # (e.g., "sample_1" instead of "sample_1.fa.gz") + +######################## PARAMETERS SPECIFIC TO SAMPLE ######################## + +sample_1: # One section for each list item in "sample"; names have to match precisely + adapter: "XXXXXXXXXXXXXXXXXXXX" # 3' adapter sequence to trim + format: "fa" # file format; currently supported: "fa" + +... diff --git a/RUNS/JOB/map/intermediate_files.txt b/RUNS/JOB/map/intermediate_files.txt new file mode 100644 index 0000000..fca722a --- /dev/null +++ b/RUNS/JOB/map/intermediate_files.txt @@ -0,0 +1,25 @@ +results/small_input/formatted.fasta +results/small_input/header_sorted_catMappings.sam +results/small_input/fa/reads.fa +results/small_input/GenomeMappings.sam +results/small_input/segemehlTranscriptome_map.sam +results/small_input/noheader_GenomeMappings.sam +results/small_input/oligoGenome_map.fa +results/small_input/cutted.fasta +results/small_input/segemehlGenome_map.sam +results/small_input/oligoTranscriptome_converted.sam +results/small_input/nhfiltered_GenomeMappings.sam +results/small_input/oligoTranscriptome_map.fa +results/small_input/oligoTranscriptome_report.txt +results/small_input/TransToGen.sam +results/small_input/concatenated_header_catMappings.sam +results/small_input/oligoGenome_converted.sam +results/small_input/TranscriptomeMappings.sam +results/small_input/noheader_TranscriptomeMappings.sam 
+results/small_input/oligoTranscriptome_sorted.fa +results/small_input/collapsed.fasta +results/small_input/catMappings.sam +results/small_input/filtered_for_oligomap.fasta +results/small_input/oligoGenome_sorted.fa +results/small_input/nhfiltered_TranscriptomeMappings.sam +results/small_input/oligoGenome_report.txt \ No newline at end of file diff --git a/RUNS/JOB/map/run_workflow_local.sh b/RUNS/JOB/map/run_workflow_local.sh new file mode 100755 index 0000000..c31cd6b --- /dev/null +++ b/RUNS/JOB/map/run_workflow_local.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Tear down environment +cleanup () { + rc=$? + rm $(cat intermediate_files.txt) + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run workflow +snakemake \ + --printshellcmds \ + --snakefile="../snakemake/Snakefile" \ + --use-singularity \ + --singularity-args "--bind ${PWD}/../" \ + --cores=4 \ + --rerun-incomplete \ + --configfile="config.yaml" \ + --verbose + +# Snakemake report +snakemake \ + --snakefile="../snakemake/Snakefile" \ + --configfile="config.yaml" \ + --report="snakemake_report.html" diff --git a/RUNS/JOB/map/run_workflow_slurm.sh b/RUNS/JOB/map/run_workflow_slurm.sh new file mode 100755 index 0000000..fcc8b52 --- /dev/null +++ b/RUNS/JOB/map/run_workflow_slurm.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Tear down environment +cleanup () { + rc=$? 
+ rm $(cat intermediate_files.txt) + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +mkdir -p logs/cluster +mkdir -p logs/local +mkdir -p results/ +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run workflow +snakemake \ + --snakefile="../snakemake/Snakefile" \ + --configfile="config.yaml" \ + --cluster-config="../cluster.json" \ + --cores=256 \ + --jobscript="../jobscript.sh" \ + --printshellcmds \ + --rerun-incomplete \ + --use-singularity \ + --singularity-args="--no-home --bind ${PWD}/../" \ + --cluster "sbatch \ + --cpus-per-task={cluster.threads} \ + --mem={cluster.mem} \ + --qos={cluster.queue} \ + --time={cluster.time} \ + --export=JOB_NAME={rule} \ + -o {params.cluster_log} \ + -p scicore \ + --open-mode=append" \ + --verbose + +# Snakemake report +snakemake \ + --snakefile="../snakemake/Snakefile" \ + --configfile="config.yaml" \ + --report="snakemake_report.html" diff --git a/RUNS/JOB/prepare_annotation/cluster.json b/RUNS/JOB/prepare/cluster.json similarity index 100% rename from RUNS/JOB/prepare_annotation/cluster.json rename to RUNS/JOB/prepare/cluster.json diff --git a/RUNS/JOB/prepare_annotation/config.yaml b/RUNS/JOB/prepare/config.yaml similarity index 89% rename from RUNS/JOB/prepare_annotation/config.yaml rename to RUNS/JOB/prepare/config.yaml index b6f688c..81a4fbb 100644 --- a/RUNS/JOB/prepare_annotation/config.yaml +++ b/RUNS/JOB/prepare/config.yaml @@ -2,17 +2,18 @@ ############################## GLOBAL PARAMETERS ############################## -## Isomirs annotation file -## Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. 
-bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts -bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts - -## Directories -output_dir: "results" +# Directories +# Usually there is no need to change these scripts_dir: "../../../scripts" +output_dir: "results" local_log: "logs/local" cluster_log: "logs/cluster" +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + # List of "organism/prefix" identifiers organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"] diff --git a/RUNS/JOB/prepare_annotation/run_workflow_local.sh b/RUNS/JOB/prepare/run_workflow_local.sh similarity index 85% rename from RUNS/JOB/prepare_annotation/run_workflow_local.sh rename to RUNS/JOB/prepare/run_workflow_local.sh index 085f3b7..63caff9 100755 --- a/RUNS/JOB/prepare_annotation/run_workflow_local.sh +++ b/RUNS/JOB/prepare/run_workflow_local.sh @@ -18,7 +18,7 @@ cd $script_dir # Run workflow snakemake \ - --snakefile="../../../workflow/prepare_annotation/Snakefile" \ + --snakefile="../../../workflow/prepare/Snakefile" \ --configfile="config.yaml" \ --use-singularity \ --singularity-args "--bind ${PWD}/../../../" \ @@ -29,6 +29,6 @@ snakemake \ # Snakemake report snakemake \ - --snakefile="../../../workflow/prepare_annotation/Snakefile" \ + --snakefile="../../../workflow/prepare/Snakefile" \ --configfile="config.yaml" \ --report="snakemake_report.html" diff --git a/RUNS/JOB/prepare_annotation/run_workflow_slurm.sh b/RUNS/JOB/prepare/run_workflow_slurm.sh similarity index 91% rename from RUNS/JOB/prepare_annotation/run_workflow_slurm.sh rename to RUNS/JOB/prepare/run_workflow_slurm.sh index be575ff..d5d9390 100755 --- 
a/RUNS/JOB/prepare_annotation/run_workflow_slurm.sh +++ b/RUNS/JOB/prepare/run_workflow_slurm.sh @@ -25,7 +25,7 @@ cd $script_dir # Run workflow snakemake \ - --snakefile="../../../workflow/prepare_annotation/Snakefile" \ + --snakefile="../../../workflow/prepare/Snakefile" \ --configfile="config.yaml" \ --cluster-config="cluster.json" \ --cluster "sbatch \ @@ -47,6 +47,6 @@ snakemake \ # Snakemake report snakemake \ - --snakefile="../../../workflow/prepare_annotation/Snakefile" \ + --snakefile="../../../workflow/prepare/Snakefile" \ --configfile="config.yaml" \ --report="snakemake_report.html" diff --git a/environment.root.yml b/environment.root.yml index b961a0d..3156841 100644 --- a/environment.root.yml +++ b/environment.root.yml @@ -1,4 +1,4 @@ -name: mir-pipelines +name: mirflowz channels: - bioconda - defaults diff --git a/environment.yml b/environment.yml index 533927f..f80ec00 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: mir-pipelines +name: mirflowz channels: - bioconda - defaults diff --git a/images/rule_graph_map.svg b/images/rule_graph_map.svg new file mode 100644 index 0000000..ba79835 --- /dev/null +++ b/images/rule_graph_map.svg @@ -0,0 +1,385 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<!-- Generated by graphviz version 2.40.1 (20161225.0304) + --> +<!-- Title: snakemake_dag Pages: 1 --> +<svg width="646pt" height="1484pt" + viewBox="0.00 0.00 646.00 1484.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> +<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1480)"> +<title>snakemake_dag</title> +<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-1480 642,-1480 642,4 -4,4"/> +<!-- 0 --> +<g id="node1" class="node"> +<title>0</title> +<path fill="none" stroke="#56d892" stroke-width="2" d="M356,-36C356,-36 326,-36 326,-36 320,-36 314,-30 314,-24 
314,-24 314,-12 314,-12 314,-6 320,0 326,0 326,0 356,0 356,0 362,0 368,-6 368,-12 368,-12 368,-24 368,-24 368,-30 362,-36 356,-36"/> +<text text-anchor="middle" x="341" y="-15.5" font-family="sans" font-size="10.00" fill="#000000">finish</text> +</g> +<!-- 1 --> +<g id="node2" class="node"> +<title>1</title> +<path fill="none" stroke="#80d856" stroke-width="2" d="M364.5,-108C364.5,-108 317.5,-108 317.5,-108 311.5,-108 305.5,-102 305.5,-96 305.5,-96 305.5,-84 305.5,-84 305.5,-78 311.5,-72 317.5,-72 317.5,-72 364.5,-72 364.5,-72 370.5,-72 376.5,-78 376.5,-84 376.5,-84 376.5,-96 376.5,-96 376.5,-102 370.5,-108 364.5,-108"/> +<text text-anchor="middle" x="341" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">index_bam</text> +</g> +<!-- 1->0 --> +<g id="edge1" class="edge"> +<title>1->0</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-71.8314C341,-64.131 341,-54.9743 341,-46.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-46.4132 341,-36.4133 337.5001,-46.4133 344.5001,-46.4132"/> +</g> +<!-- 2 --> +<g id="node3" class="node"> +<title>2</title> +<path fill="none" stroke="#70d856" stroke-width="2" d="M377.5,-180C377.5,-180 304.5,-180 304.5,-180 298.5,-180 292.5,-174 292.5,-168 292.5,-168 292.5,-156 292.5,-156 292.5,-150 298.5,-144 304.5,-144 304.5,-144 377.5,-144 377.5,-144 383.5,-144 389.5,-150 389.5,-156 389.5,-156 389.5,-168 389.5,-168 389.5,-174 383.5,-180 377.5,-180"/> +<text text-anchor="middle" x="341" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">sort_by_position</text> +</g> +<!-- 2->1 --> +<g id="edge2" class="edge"> +<title>2->1</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-143.8314C341,-136.131 341,-126.9743 341,-118.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-118.4132 341,-108.4133 337.5001,-118.4133 344.5001,-118.4132"/> +</g> +<!-- 3 --> +<g id="node4" class="node"> +<title>3</title> +<path fill="none" 
stroke="#56d85b" stroke-width="2" d="M376.5,-252C376.5,-252 305.5,-252 305.5,-252 299.5,-252 293.5,-246 293.5,-240 293.5,-240 293.5,-228 293.5,-228 293.5,-222 299.5,-216 305.5,-216 305.5,-216 376.5,-216 376.5,-216 382.5,-216 388.5,-222 388.5,-228 388.5,-228 388.5,-240 388.5,-240 388.5,-246 382.5,-252 376.5,-252"/> +<text text-anchor="middle" x="341" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">convert_to_bam</text> +</g> +<!-- 3->2 --> +<g id="edge3" class="edge"> +<title>3->2</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-215.8314C341,-208.131 341,-198.9743 341,-190.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-190.4132 341,-180.4133 337.5001,-190.4133 344.5001,-190.4132"/> +</g> +<!-- 4 --> +<g id="node5" class="node"> +<title>4</title> +<path fill="none" stroke="#d85656" stroke-width="2" d="M379.5,-324C379.5,-324 302.5,-324 302.5,-324 296.5,-324 290.5,-318 290.5,-312 290.5,-312 290.5,-300 290.5,-300 290.5,-294 296.5,-288 302.5,-288 302.5,-288 379.5,-288 379.5,-288 385.5,-288 391.5,-294 391.5,-300 391.5,-300 391.5,-312 391.5,-312 391.5,-318 385.5,-324 379.5,-324"/> +<text text-anchor="middle" x="341" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">uncollapse_reads</text> +</g> +<!-- 4->3 --> +<g id="edge4" class="edge"> +<title>4->3</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-287.8314C341,-280.131 341,-270.9743 341,-262.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-262.4132 341,-252.4133 337.5001,-262.4133 344.5001,-262.4132"/> +</g> +<!-- 5 --> +<g id="node6" class="node"> +<title>5</title> +<path fill="none" stroke="#c6d856" stroke-width="2" d="M379,-396C379,-396 303,-396 303,-396 297,-396 291,-390 291,-384 291,-384 291,-372 291,-372 291,-366 297,-360 303,-360 303,-360 379,-360 379,-360 385,-360 391,-366 391,-372 391,-372 391,-384 391,-384 391,-390 385,-396 379,-396"/> +<text text-anchor="middle" x="341" 
y="-375.5" font-family="sans" font-size="10.00" fill="#000000">remove_inferiors</text> +</g> +<!-- 5->4 --> +<g id="edge5" class="edge"> +<title>5->4</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-359.8314C341,-352.131 341,-342.9743 341,-334.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-334.4132 341,-324.4133 337.5001,-334.4133 344.5001,-334.4132"/> +</g> +<!-- 6 --> +<g id="node7" class="node"> +<title>6</title> +<path fill="none" stroke="#61d856" stroke-width="2" d="M356,-468C356,-468 326,-468 326,-468 320,-468 314,-462 314,-456 314,-456 314,-444 314,-444 314,-438 320,-432 326,-432 326,-432 356,-432 356,-432 362,-432 368,-438 368,-444 368,-444 368,-456 368,-456 368,-462 362,-468 356,-468"/> +<text text-anchor="middle" x="341" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">sort_id</text> +</g> +<!-- 6->5 --> +<g id="edge6" class="edge"> +<title>6->5</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-431.8314C341,-424.131 341,-414.9743 341,-406.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-406.4132 341,-396.4133 337.5001,-406.4133 344.5001,-406.4132"/> +</g> +<!-- 7 --> +<g id="node8" class="node"> +<title>7</title> +<path fill="none" stroke="#d88556" stroke-width="2" d="M366,-540C366,-540 316,-540 316,-540 310,-540 304,-534 304,-528 304,-528 304,-516 304,-516 304,-510 310,-504 316,-504 316,-504 366,-504 366,-504 372,-504 378,-510 378,-516 378,-516 378,-528 378,-528 378,-534 372,-540 366,-540"/> +<text text-anchor="middle" x="341" y="-519.5" font-family="sans" font-size="10.00" fill="#000000">add_header</text> +</g> +<!-- 7->6 --> +<g id="edge7" class="edge"> +<title>7->6</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-503.8314C341,-496.131 341,-486.9743 341,-478.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-478.4132 341,-468.4133 337.5001,-478.4133 344.5001,-478.4132"/> 
+</g> +<!-- 8 --> +<g id="node9" class="node"> +<title>8</title> +<path fill="none" stroke="#8fd856" stroke-width="2" d="M369,-612C369,-612 313,-612 313,-612 307,-612 301,-606 301,-600 301,-600 301,-588 301,-588 301,-582 307,-576 313,-576 313,-576 369,-576 369,-576 375,-576 381,-582 381,-588 381,-588 381,-600 381,-600 381,-606 375,-612 369,-612"/> +<text text-anchor="middle" x="341" y="-591.5" font-family="sans" font-size="10.00" fill="#000000">cat_mapping</text> +</g> +<!-- 8->7 --> +<g id="edge8" class="edge"> +<title>8->7</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-575.8314C341,-568.131 341,-558.9743 341,-550.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-550.4132 341,-540.4133 337.5001,-550.4133 344.5001,-550.4132"/> +</g> +<!-- 9 --> +<g id="node10" class="node"> +<title>9</title> +<path fill="none" stroke="#d6d856" stroke-width="2" d="M304,-684C304,-684 248,-684 248,-684 242,-684 236,-678 236,-672 236,-672 236,-660 236,-660 236,-654 242,-648 248,-648 248,-648 304,-648 304,-648 310,-648 316,-654 316,-660 316,-660 316,-672 316,-672 316,-678 310,-684 304,-684"/> +<text text-anchor="middle" x="276" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">trans_to_gen</text> +</g> +<!-- 9->8 --> +<g id="edge9" class="edge"> +<title>9->8</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M292.4022,-647.8314C300.0415,-639.3694 309.2683,-629.1489 317.6207,-619.8971"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="320.2738,-622.1813 324.3769,-612.4133 315.0779,-617.4906 320.2738,-622.1813"/> +</g> +<!-- 10 --> +<g id="node11" class="node"> +<title>10</title> +<path fill="none" stroke="#56b9d8" stroke-width="2" d="M349,-756C349,-756 201,-756 201,-756 195,-756 189,-750 189,-744 189,-744 189,-732 189,-732 189,-726 195,-720 201,-720 201,-720 349,-720 349,-720 355,-720 361,-726 361,-732 361,-732 361,-744 361,-744 361,-750 355,-756 349,-756"/> +<text text-anchor="middle" 
x="275" y="-735.5" font-family="sans" font-size="10.00" fill="#000000">remove_headers_transcriptome</text> +</g> +<!-- 10->9 --> +<g id="edge11" class="edge"> +<title>10->9</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M275.2523,-719.8314C275.3593,-712.131 275.4865,-702.9743 275.6053,-694.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="279.1049,-694.4609 275.7443,-684.4133 272.1056,-694.3637 279.1049,-694.4609"/> +</g> +<!-- 11 --> +<g id="node12" class="node"> +<title>11</title> +<path fill="none" stroke="#56c9d8" stroke-width="2" d="M326,-828C326,-828 220,-828 220,-828 214,-828 208,-822 208,-816 208,-816 208,-804 208,-804 208,-798 214,-792 220,-792 220,-792 326,-792 326,-792 332,-792 338,-798 338,-804 338,-804 338,-816 338,-816 338,-822 332,-828 326,-828"/> +<text text-anchor="middle" x="273" y="-807.5" font-family="sans" font-size="10.00" fill="#000000">filter_nh_transcriptome</text> +</g> +<!-- 11->10 --> +<g id="edge12" class="edge"> +<title>11->10</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M273.5047,-791.8314C273.7186,-784.131 273.9729,-774.9743 274.2106,-766.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="277.7094,-766.5066 274.4885,-756.4133 270.7121,-766.3122 277.7094,-766.5066"/> +</g> +<!-- 12 --> +<g id="node13" class="node"> +<title>12</title> +<path fill="none" stroke="#56d8a2" stroke-width="2" d="M317,-900C317,-900 187,-900 187,-900 181,-900 175,-894 175,-888 175,-888 175,-876 175,-876 175,-870 181,-864 187,-864 187,-864 317,-864 317,-864 323,-864 329,-870 329,-876 329,-876 329,-888 329,-888 329,-894 323,-900 317,-900"/> +<text text-anchor="middle" x="252" y="-879.5" font-family="sans" font-size="10.00" fill="#000000">merge_transcriptome_maps</text> +</g> +<!-- 12->11 --> +<g id="edge13" class="edge"> +<title>12->11</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M257.2992,-863.8314C259.5698,-856.0463 262.2746,-846.7729 264.794,-838.1347"/> 
+<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="268.1894,-838.9933 267.6295,-828.4133 261.4694,-837.0332 268.1894,-838.9933"/> +</g> +<!-- 13 --> +<g id="node14" class="node"> +<title>13</title> +<path fill="none" stroke="#56d873" stroke-width="2" d="M174,-1116C174,-1116 12,-1116 12,-1116 6,-1116 0,-1110 0,-1104 0,-1104 0,-1092 0,-1092 0,-1086 6,-1080 12,-1080 12,-1080 174,-1080 174,-1080 180,-1080 186,-1086 186,-1092 186,-1092 186,-1104 186,-1104 186,-1110 180,-1116 174,-1116"/> +<text text-anchor="middle" x="93" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_transcriptome_segemehl</text> +</g> +<!-- 13->12 --> +<g id="edge14" class="edge"> +<title>13->12</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M96.0562,-1079.8994C102.321,-1047.6053 119.327,-979.3819 157,-936 168.0222,-923.3075 182.7958,-912.932 197.4032,-904.7901"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="199.2392,-907.7785 206.442,-900.0086 195.9659,-901.5909 199.2392,-907.7785"/> +</g> +<!-- 14 --> +<g id="node15" class="node"> +<title>14</title> +<path fill="none" stroke="#afd856" stroke-width="2" d="M361.5,-1260C361.5,-1260 294.5,-1260 294.5,-1260 288.5,-1260 282.5,-1254 282.5,-1248 282.5,-1248 282.5,-1236 282.5,-1236 282.5,-1230 288.5,-1224 294.5,-1224 294.5,-1224 361.5,-1224 361.5,-1224 367.5,-1224 373.5,-1230 373.5,-1236 373.5,-1236 373.5,-1248 373.5,-1248 373.5,-1254 367.5,-1260 361.5,-1260"/> +<text text-anchor="middle" x="328" y="-1239.5" font-family="sans" font-size="10.00" fill="#000000">fastx_collapser</text> +</g> +<!-- 14->13 --> +<g id="edge16" class="edge"> +<title>14->13</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M298.2371,-1223.7623C255.7707,-1197.7404 177.7809,-1149.9508 131.1322,-1121.3661"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="132.8765,-1118.3301 122.5213,-1116.0896 129.2191,-1124.2987 132.8765,-1118.3301"/> +</g> +<!-- 20 --> +<g id="node21" 
class="node"> +<title>20</title> +<path fill="none" stroke="#569ad8" stroke-width="2" d="M385.5,-1188C385.5,-1188 270.5,-1188 270.5,-1188 264.5,-1188 258.5,-1182 258.5,-1176 258.5,-1176 258.5,-1164 258.5,-1164 258.5,-1158 264.5,-1152 270.5,-1152 270.5,-1152 385.5,-1152 385.5,-1152 391.5,-1152 397.5,-1158 397.5,-1164 397.5,-1164 397.5,-1176 397.5,-1176 397.5,-1182 391.5,-1188 385.5,-1188"/> +<text text-anchor="middle" x="328" y="-1167.5" font-family="sans" font-size="10.00" fill="#000000">filter_fasta_for_oligomap</text> +</g> +<!-- 14->20 --> +<g id="edge23" class="edge"> +<title>14->20</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1223.8314C328,-1216.131 328,-1206.9743 328,-1198.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1198.4132 328,-1188.4133 324.5001,-1198.4133 331.5001,-1198.4132"/> +</g> +<!-- 25 --> +<g id="node26" class="node"> +<title>25</title> +<path fill="none" stroke="#567bd8" stroke-width="2" d="M626,-1188C626,-1188 492,-1188 492,-1188 486,-1188 480,-1182 480,-1176 480,-1176 480,-1164 480,-1164 480,-1158 486,-1152 492,-1152 492,-1152 626,-1152 626,-1152 632,-1152 638,-1158 638,-1164 638,-1164 638,-1176 638,-1176 638,-1182 632,-1188 626,-1188"/> +<text text-anchor="middle" x="559" y="-1167.5" font-family="sans" font-size="10.00" fill="#000000">mapping_genome_segemehl</text> +</g> +<!-- 14->25 --> +<g id="edge29" class="edge"> +<title>14->25</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M373.6382,-1227.7751C407.1926,-1217.3166 453.3477,-1202.9306 491.2064,-1191.1305"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="492.6366,-1194.3509 501.1421,-1188.0336 490.5535,-1187.6679 492.6366,-1194.3509"/> +</g> +<!-- 15 --> +<g id="node16" class="node"> +<title>15</title> +<path fill="none" stroke="#56d8c1" stroke-width="2" d="M345.5,-1332C345.5,-1332 310.5,-1332 310.5,-1332 304.5,-1332 298.5,-1326 298.5,-1320 298.5,-1320 298.5,-1308 298.5,-1308 298.5,-1302 
304.5,-1296 310.5,-1296 310.5,-1296 345.5,-1296 345.5,-1296 351.5,-1296 357.5,-1302 357.5,-1308 357.5,-1308 357.5,-1320 357.5,-1320 357.5,-1326 351.5,-1332 345.5,-1332"/> +<text text-anchor="middle" x="328" y="-1311.5" font-family="sans" font-size="10.00" fill="#000000">cutadapt</text> +</g> +<!-- 15->14 --> +<g id="edge17" class="edge"> +<title>15->14</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1295.8314C328,-1288.131 328,-1278.9743 328,-1270.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1270.4132 328,-1260.4133 324.5001,-1270.4133 331.5001,-1270.4132"/> +</g> +<!-- 16 --> +<g id="node17" class="node"> +<title>16</title> +<path fill="none" stroke="#56a9d8" stroke-width="2" d="M362.5,-1404C362.5,-1404 293.5,-1404 293.5,-1404 287.5,-1404 281.5,-1398 281.5,-1392 281.5,-1392 281.5,-1380 281.5,-1380 281.5,-1374 287.5,-1368 293.5,-1368 293.5,-1368 362.5,-1368 362.5,-1368 368.5,-1368 374.5,-1374 374.5,-1380 374.5,-1380 374.5,-1392 374.5,-1392 374.5,-1398 368.5,-1404 362.5,-1404"/> +<text text-anchor="middle" x="328" y="-1383.5" font-family="sans" font-size="10.00" fill="#000000">fasta_formatter</text> +</g> +<!-- 16->15 --> +<g id="edge18" class="edge"> +<title>16->15</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1367.8314C328,-1360.131 328,-1350.9743 328,-1342.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1342.4132 328,-1332.4133 324.5001,-1342.4133 331.5001,-1342.4132"/> +</g> +<!-- 17 --> +<g id="node18" class="node"> +<title>17</title> +<path fill="none" stroke="#d86656" stroke-width="2" d="M385.5,-1476C385.5,-1476 270.5,-1476 270.5,-1476 264.5,-1476 258.5,-1470 258.5,-1464 258.5,-1464 258.5,-1452 258.5,-1452 258.5,-1446 264.5,-1440 270.5,-1440 270.5,-1440 385.5,-1440 385.5,-1440 391.5,-1440 397.5,-1446 397.5,-1452 397.5,-1452 397.5,-1464 397.5,-1464 397.5,-1470 391.5,-1476 385.5,-1476"/> +<text text-anchor="middle" x="328" y="-1455.5" 
font-family="sans" font-size="10.00" fill="#000000">uncompress_zipped_files</text> +</g> +<!-- 17->16 --> +<g id="edge19" class="edge"> +<title>17->16</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1439.8314C328,-1432.131 328,-1422.9743 328,-1414.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1414.4132 328,-1404.4133 324.5001,-1414.4133 331.5001,-1414.4132"/> +</g> +<!-- 18 --> +<g id="node19" class="node"> +<title>18</title> +<path fill="none" stroke="#d8ac56" stroke-width="2" d="M326,-972C326,-972 178,-972 178,-972 172,-972 166,-966 166,-960 166,-960 166,-948 166,-948 166,-942 172,-936 178,-936 178,-936 326,-936 326,-936 332,-936 338,-942 338,-948 338,-948 338,-960 338,-960 338,-966 332,-972 326,-972"/> +<text text-anchor="middle" x="252" y="-951.5" font-family="sans" font-size="10.00" fill="#000000">oligomap_transcriptome_toSAM</text> +</g> +<!-- 18->12 --> +<g id="edge15" class="edge"> +<title>18->12</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M252,-935.8314C252,-928.131 252,-918.9743 252,-910.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="255.5001,-910.4132 252,-900.4133 248.5001,-910.4133 255.5001,-910.4132"/> +</g> +<!-- 19 --> +<g id="node20" class="node"> +<title>19</title> +<path fill="none" stroke="#56d8d0" stroke-width="2" d="M376,-1116C376,-1116 216,-1116 216,-1116 210,-1116 204,-1110 204,-1104 204,-1104 204,-1092 204,-1092 204,-1086 210,-1080 216,-1080 216,-1080 376,-1080 376,-1080 382,-1080 388,-1086 388,-1092 388,-1092 388,-1104 388,-1104 388,-1110 382,-1116 376,-1116"/> +<text text-anchor="middle" x="296" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_transcriptome_oligomap</text> +</g> +<!-- 19->18 --> +<g id="edge21" class="edge"> +<title>19->18</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M246.4713,-1079.9463C230.6746,-1071.5649 214.9778,-1059.8565 206,-1044 193.896,-1022.6221 208.7881,-997.7098 
224.7693,-979.5223"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="227.3609,-981.8751 231.6095,-972.1694 222.2357,-977.1072 227.3609,-981.8751"/> +</g> +<!-- 21 --> +<g id="node22" class="node"> +<title>21</title> +<path fill="none" stroke="#9fd856" stroke-width="2" d="M363,-1044C363,-1044 227,-1044 227,-1044 221,-1044 215,-1038 215,-1032 215,-1032 215,-1020 215,-1020 215,-1014 221,-1008 227,-1008 227,-1008 363,-1008 363,-1008 369,-1008 375,-1014 375,-1020 375,-1020 375,-1032 375,-1032 375,-1038 369,-1044 363,-1044"/> +<text text-anchor="middle" x="295" y="-1023.5" font-family="sans" font-size="10.00" fill="#000000">sort_transcriptome_oligomap</text> +</g> +<!-- 19->21 --> +<g id="edge24" class="edge"> +<title>19->21</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M295.7477,-1079.8314C295.6407,-1072.131 295.5135,-1062.9743 295.3947,-1054.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="298.8944,-1054.3637 295.2557,-1044.4133 291.8951,-1054.4609 298.8944,-1054.3637"/> +</g> +<!-- 20->19 --> +<g id="edge22" class="edge"> +<title>20->19</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M319.925,-1151.8314C316.3898,-1143.8771 312.164,-1134.369 308.2544,-1125.5723"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="311.4435,-1124.1299 304.1837,-1116.4133 305.0468,-1126.9729 311.4435,-1124.1299"/> +</g> +<!-- 27 --> +<g id="node28" class="node"> +<title>27</title> +<path fill="none" stroke="#568ad8" stroke-width="2" d="M549.5,-1116C549.5,-1116 418.5,-1116 418.5,-1116 412.5,-1116 406.5,-1110 406.5,-1104 406.5,-1104 406.5,-1092 406.5,-1092 406.5,-1086 412.5,-1080 418.5,-1080 418.5,-1080 549.5,-1080 549.5,-1080 555.5,-1080 561.5,-1086 561.5,-1092 561.5,-1092 561.5,-1104 561.5,-1104 561.5,-1110 555.5,-1116 549.5,-1116"/> +<text text-anchor="middle" x="484" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_genome_oligomap</text> +</g> +<!-- 20->27 --> +<g 
id="edge32" class="edge"> +<title>20->27</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M367.3654,-1151.8314C388.1367,-1142.2446 413.7983,-1130.4008 435.7581,-1120.2655"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="437.3048,-1123.4065 444.9177,-1116.038 434.3714,-1117.0508 437.3048,-1123.4065"/> +</g> +<!-- 21->18 --> +<g id="edge20" class="edge"> +<title>21->18</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M284.1493,-1007.8314C279.2977,-999.7079 273.4783,-989.9637 268.132,-981.0118"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="271.1292,-979.2041 262.9968,-972.4133 265.1194,-982.7933 271.1292,-979.2041"/> +</g> +<!-- 22 --> +<g id="node23" class="node"> +<title>22</title> +<path fill="none" stroke="#56d8b1" stroke-width="2" d="M465.5,-684C465.5,-684 346.5,-684 346.5,-684 340.5,-684 334.5,-678 334.5,-672 334.5,-672 334.5,-660 334.5,-660 334.5,-654 340.5,-648 346.5,-648 346.5,-648 465.5,-648 465.5,-648 471.5,-648 477.5,-654 477.5,-660 477.5,-660 477.5,-672 477.5,-672 477.5,-678 471.5,-684 465.5,-684"/> +<text text-anchor="middle" x="406" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">remove_headers_genome</text> +</g> +<!-- 22->8 --> +<g id="edge10" class="edge"> +<title>22->8</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M389.5978,-647.8314C381.9585,-639.3694 372.7317,-629.1489 364.3793,-619.8971"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="366.9221,-617.4906 357.6231,-612.4133 361.7262,-622.1813 366.9221,-617.4906"/> +</g> +<!-- 23 --> +<g id="node24" class="node"> +<title>23</title> +<path fill="none" stroke="#5663d8" stroke-width="2" d="M455.5,-828C455.5,-828 378.5,-828 378.5,-828 372.5,-828 366.5,-822 366.5,-816 366.5,-816 366.5,-804 366.5,-804 366.5,-798 372.5,-792 378.5,-792 378.5,-792 455.5,-792 455.5,-792 461.5,-792 467.5,-798 467.5,-804 467.5,-804 467.5,-816 467.5,-816 467.5,-822 461.5,-828 455.5,-828"/> +<text 
text-anchor="middle" x="417" y="-807.5" font-family="sans" font-size="10.00" fill="#000000">nh_filter_genome</text> +</g> +<!-- 23->22 --> +<g id="edge25" class="edge"> +<title>23->22</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M415.6068,-791.7623C413.7306,-767.201 410.3731,-723.2474 408.1659,-694.3541"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="411.6334,-693.794 407.3818,-684.0896 404.6538,-694.3272 411.6334,-693.794"/> +</g> +<!-- 24 --> +<g id="node25" class="node"> +<title>24</title> +<path fill="none" stroke="#d8cb56" stroke-width="2" d="M496.5,-900C496.5,-900 395.5,-900 395.5,-900 389.5,-900 383.5,-894 383.5,-888 383.5,-888 383.5,-876 383.5,-876 383.5,-870 389.5,-864 395.5,-864 395.5,-864 496.5,-864 496.5,-864 502.5,-864 508.5,-870 508.5,-876 508.5,-876 508.5,-888 508.5,-888 508.5,-894 502.5,-900 496.5,-900"/> +<text text-anchor="middle" x="446" y="-879.5" font-family="sans" font-size="10.00" fill="#000000">merge_genome_maps</text> +</g> +<!-- 24->23 --> +<g id="edge26" class="edge"> +<title>24->23</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M438.6821,-863.8314C435.5124,-855.9617 431.73,-846.5712 428.2187,-837.8533"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="431.3992,-836.3815 424.4165,-828.4133 424.9061,-838.9968 431.3992,-836.3815"/> +</g> +<!-- 25->24 --> +<g id="edge28" class="edge"> +<title>25->24</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M564.4447,-1151.5968C572.5737,-1121.0938 585.4612,-1058.4231 571,-1008 560.5969,-971.7264 552.2103,-963.1481 526,-936 515.0304,-924.6379 501.2945,-914.3226 488.316,-905.8234"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="489.8165,-902.6309 479.4952,-900.239 486.0721,-908.5453 489.8165,-902.6309"/> +</g> +<!-- 26 --> +<g id="node27" class="node"> +<title>26</title> +<path fill="none" stroke="#d89c56" stroke-width="2" d="M505.5,-972C505.5,-972 386.5,-972 386.5,-972 380.5,-972 374.5,-966 
374.5,-960 374.5,-960 374.5,-948 374.5,-948 374.5,-942 380.5,-936 386.5,-936 386.5,-936 505.5,-936 505.5,-936 511.5,-936 517.5,-942 517.5,-948 517.5,-948 517.5,-960 517.5,-960 517.5,-966 511.5,-972 505.5,-972"/> +<text text-anchor="middle" x="446" y="-951.5" font-family="sans" font-size="10.00" fill="#000000">oligomap_genome_toSAM</text> +</g> +<!-- 26->24 --> +<g id="edge27" class="edge"> +<title>26->24</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M446,-935.8314C446,-928.131 446,-918.9743 446,-910.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="449.5001,-910.4132 446,-900.4133 442.5001,-910.4133 449.5001,-910.4132"/> +</g> +<!-- 27->26 --> +<g id="edge31" class="edge"> +<title>27->26</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M453.5042,-1079.7757C441.3178,-1070.7056 428.6228,-1058.5508 422,-1044 412.8156,-1023.8211 420.3523,-999.4424 429.3342,-981.15"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="432.4908,-982.6682 434.082,-972.1935 426.306,-979.3896 432.4908,-982.6682"/> +</g> +<!-- 28 --> +<g id="node29" class="node"> +<title>28</title> +<path fill="none" stroke="#d8bc56" stroke-width="2" d="M550.5,-1044C550.5,-1044 443.5,-1044 443.5,-1044 437.5,-1044 431.5,-1038 431.5,-1032 431.5,-1032 431.5,-1020 431.5,-1020 431.5,-1014 437.5,-1008 443.5,-1008 443.5,-1008 550.5,-1008 550.5,-1008 556.5,-1008 562.5,-1014 562.5,-1020 562.5,-1020 562.5,-1032 562.5,-1032 562.5,-1038 556.5,-1044 550.5,-1044"/> +<text text-anchor="middle" x="497" y="-1023.5" font-family="sans" font-size="10.00" fill="#000000">sort_genome_oligomap</text> +</g> +<!-- 27->28 --> +<g id="edge33" class="edge"> +<title>27->28</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M487.2804,-1079.8314C488.6708,-1072.131 490.3241,-1062.9743 491.8692,-1054.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="495.3428,-1054.8761 493.6754,-1044.4133 488.4542,-1053.6322 495.3428,-1054.8761"/> 
+</g> +<!-- 28->26 --> +<g id="edge30" class="edge"> +<title>28->26</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M484.1305,-1007.8314C478.3165,-999.6232 471.3304,-989.7606 464.9354,-980.7323"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="467.679,-978.5505 459.0427,-972.4133 461.9669,-982.5966 467.679,-978.5505"/> +</g> +</g> +</svg> diff --git a/images/rule_graph_prepare_annotation.svg b/images/rule_graph_prepare.svg similarity index 93% rename from images/rule_graph_prepare_annotation.svg rename to images/rule_graph_prepare.svg index 1b6f8cd..16daf3a 100644 --- a/images/rule_graph_prepare_annotation.svg +++ b/images/rule_graph_prepare.svg @@ -12,17 +12,17 @@ <!-- 0 --> <g id="node1" class="node"> <title>0</title> -<path fill="none" stroke="#d88556" stroke-width="2" d="M357.5,-36C357.5,-36 327.5,-36 327.5,-36 321.5,-36 315.5,-30 315.5,-24 315.5,-24 315.5,-12 315.5,-12 315.5,-6 321.5,0 327.5,0 327.5,0 357.5,0 357.5,0 363.5,0 369.5,-6 369.5,-12 369.5,-12 369.5,-24 369.5,-24 369.5,-30 363.5,-36 357.5,-36"/> +<path fill="none" stroke="#ced856" stroke-width="2" d="M357.5,-36C357.5,-36 327.5,-36 327.5,-36 321.5,-36 315.5,-30 315.5,-24 315.5,-24 315.5,-12 315.5,-12 315.5,-6 321.5,0 327.5,0 327.5,0 357.5,0 357.5,0 363.5,0 369.5,-6 369.5,-12 369.5,-12 369.5,-24 369.5,-24 369.5,-30 363.5,-36 357.5,-36"/> <text text-anchor="middle" x="342.5" y="-15.5" font-family="sans" font-size="10.00" fill="#000000">finish</text> </g> <!-- 1 --> <g id="node2" class="node"> <title>1</title> -<path fill="none" stroke="#56d8c9" stroke-width="2" d="M362,-108C362,-108 167,-108 167,-108 161,-108 155,-102 155,-96 155,-96 155,-84 155,-84 155,-78 161,-72 167,-72 167,-72 362,-72 362,-72 368,-72 374,-78 374,-84 374,-84 374,-96 374,-96 374,-102 368,-108 362,-108"/> +<path fill="none" stroke="#56b1d8" stroke-width="2" d="M362,-108C362,-108 167,-108 167,-108 161,-108 155,-102 155,-96 155,-96 155,-84 155,-84 155,-78 161,-72 167,-72 167,-72 362,-72 362,-72 368,-72 
374,-78 374,-84 374,-84 374,-96 374,-96 374,-102 368,-108 362,-108"/> <text text-anchor="middle" x="264.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">generate_segemehl_index_transcriptome</text> </g> <!-- 1->0 --> -<g id="edge1" class="edge"> +<g id="edge3" class="edge"> <title>1->0</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M284.1827,-71.8314C293.6051,-63.1337 305.0401,-52.5783 315.2796,-43.1265"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="317.8495,-45.5175 322.8236,-36.1628 313.1016,-40.3738 317.8495,-45.5175"/> @@ -30,7 +30,7 @@ <!-- 2 --> <g id="node3" class="node"> <title>2</title> -<path fill="none" stroke="#56d882" stroke-width="2" d="M289,-180C289,-180 246,-180 246,-180 240,-180 234,-174 234,-168 234,-168 234,-156 234,-156 234,-150 240,-144 246,-144 246,-144 289,-144 289,-144 295,-144 301,-150 301,-156 301,-156 301,-168 301,-168 301,-174 295,-180 289,-180"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" d="M289,-180C289,-180 246,-180 246,-180 240,-180 234,-174 234,-168 234,-168 234,-156 234,-156 234,-150 240,-144 246,-144 246,-144 289,-144 289,-144 295,-144 301,-150 301,-156 301,-156 301,-168 301,-168 301,-174 295,-180 289,-180"/> <text text-anchor="middle" x="267.5" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">trim_fasta</text> </g> <!-- 2->1 --> @@ -42,7 +42,7 @@ <!-- 3 --> <g id="node4" class="node"> <title>3</title> -<path fill="none" stroke="#d85656" stroke-width="2" d="M334.5,-252C334.5,-252 206.5,-252 206.5,-252 200.5,-252 194.5,-246 194.5,-240 194.5,-240 194.5,-228 194.5,-228 194.5,-222 200.5,-216 206.5,-216 206.5,-216 334.5,-216 334.5,-216 340.5,-216 346.5,-222 346.5,-228 346.5,-228 346.5,-240 346.5,-240 346.5,-246 340.5,-252 334.5,-252"/> +<path fill="none" stroke="#569ad8" stroke-width="2" d="M334.5,-252C334.5,-252 206.5,-252 206.5,-252 200.5,-252 194.5,-246 194.5,-240 194.5,-240 194.5,-228 194.5,-228 194.5,-222 200.5,-216 206.5,-216 206.5,-216 334.5,-216 
334.5,-216 340.5,-216 346.5,-222 346.5,-228 346.5,-228 346.5,-240 346.5,-240 346.5,-246 340.5,-252 334.5,-252"/> <text text-anchor="middle" x="270.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">extract_transcriptome_seqs</text> </g> <!-- 3->2 --> @@ -66,7 +66,7 @@ <!-- 6 --> <g id="node7" class="node"> <title>6</title> -<path fill="none" stroke="#d86e56" stroke-width="2" d="M179,-540C179,-540 12,-540 12,-540 6,-540 0,-534 0,-528 0,-528 0,-516 0,-516 0,-510 6,-504 12,-504 12,-504 179,-504 179,-504 185,-504 191,-510 191,-516 191,-516 191,-528 191,-528 191,-534 185,-540 179,-540"/> +<path fill="none" stroke="#5682d8" stroke-width="2" d="M179,-540C179,-540 12,-540 12,-540 6,-540 0,-534 0,-528 0,-528 0,-516 0,-516 0,-510 6,-504 12,-504 12,-504 179,-504 179,-504 185,-504 191,-510 191,-516 191,-516 191,-528 191,-528 191,-534 185,-540 179,-540"/> <text text-anchor="middle" x="95.5" y="-519.5" font-family="sans" font-size="10.00" fill="#000000">generate_segemehl_index_genome</text> </g> <!-- 4->6 --> @@ -78,7 +78,7 @@ <!-- 9 --> <g id="node10" class="node"> <title>9</title> -<path fill="none" stroke="#70d856" stroke-width="2" d="M236,-468C236,-468 129,-468 129,-468 123,-468 117,-462 117,-456 117,-456 117,-444 117,-444 117,-438 123,-432 129,-432 129,-432 236,-432 236,-432 242,-432 248,-438 248,-444 248,-444 248,-456 248,-456 248,-462 242,-468 236,-468"/> +<path fill="none" stroke="#56d8c9" stroke-width="2" d="M236,-468C236,-468 129,-468 129,-468 123,-468 117,-462 117,-456 117,-456 117,-444 117,-444 117,-438 123,-432 129,-432 129,-432 236,-432 236,-432 242,-432 248,-438 248,-444 248,-444 248,-456 248,-456 248,-462 242,-468 236,-468"/> <text text-anchor="middle" x="182.5" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">create_header_genome</text> </g> <!-- 4->9 --> @@ -90,7 +90,7 @@ <!-- 13 --> <g id="node14" class="node"> <title>13</title> -<path fill="none" stroke="#56b1d8" stroke-width="2" d="M366.5,-684C366.5,-684 316.5,-684 316.5,-684 
310.5,-684 304.5,-678 304.5,-672 304.5,-672 304.5,-660 304.5,-660 304.5,-654 310.5,-648 316.5,-648 316.5,-648 366.5,-648 366.5,-648 372.5,-648 378.5,-654 378.5,-660 378.5,-660 378.5,-672 378.5,-672 378.5,-678 372.5,-684 366.5,-684"/> +<path fill="none" stroke="#9fd856" stroke-width="2" d="M366.5,-684C366.5,-684 316.5,-684 316.5,-684 310.5,-684 304.5,-678 304.5,-672 304.5,-672 304.5,-660 304.5,-660 304.5,-654 310.5,-648 316.5,-648 316.5,-648 366.5,-648 366.5,-648 372.5,-648 378.5,-654 378.5,-660 378.5,-660 378.5,-672 378.5,-672 378.5,-678 372.5,-684 366.5,-684"/> <text text-anchor="middle" x="341.5" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">mirna_anno</text> </g> <!-- 4->13 --> @@ -102,7 +102,7 @@ <!-- 14 --> <g id="node15" class="node"> <title>14</title> -<path fill="none" stroke="#56c9d8" stroke-width="2" d="M440,-684C440,-684 409,-684 409,-684 403,-684 397,-678 397,-672 397,-672 397,-660 397,-660 397,-654 403,-648 409,-648 409,-648 440,-648 440,-648 446,-648 452,-654 452,-660 452,-660 452,-672 452,-672 452,-678 446,-684 440,-684"/> +<path fill="none" stroke="#d86e56" stroke-width="2" d="M440,-684C440,-684 409,-684 409,-684 403,-684 397,-678 397,-672 397,-672 397,-660 397,-660 397,-654 403,-648 409,-648 409,-648 440,-648 440,-648 446,-648 452,-654 452,-660 452,-660 452,-672 452,-672 452,-678 446,-684 440,-684"/> <text text-anchor="middle" x="424.5" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">dict_chr</text> </g> <!-- 4->14 --> @@ -114,7 +114,7 @@ <!-- 21 --> <g id="node22" class="node"> <title>21</title> -<path fill="none" stroke="#5682d8" stroke-width="2" d="M592.5,-612C592.5,-612 506.5,-612 506.5,-612 500.5,-612 494.5,-606 494.5,-600 494.5,-600 494.5,-588 494.5,-588 494.5,-582 500.5,-576 506.5,-576 506.5,-576 592.5,-576 592.5,-576 598.5,-576 604.5,-582 604.5,-588 604.5,-588 604.5,-600 604.5,-600 604.5,-606 598.5,-612 592.5,-612"/> +<path fill="none" stroke="#d85656" stroke-width="2" d="M592.5,-612C592.5,-612 506.5,-612 
506.5,-612 500.5,-612 494.5,-606 494.5,-600 494.5,-600 494.5,-588 494.5,-588 494.5,-582 500.5,-576 506.5,-576 506.5,-576 592.5,-576 592.5,-576 598.5,-576 604.5,-582 604.5,-588 604.5,-588 604.5,-600 604.5,-600 604.5,-606 598.5,-612 592.5,-612"/> <text text-anchor="middle" x="549.5" y="-591.5" font-family="sans" font-size="10.00" fill="#000000">create_index_fasta</text> </g> <!-- 4->21 --> @@ -126,7 +126,7 @@ <!-- 5 --> <g id="node6" class="node"> <title>5</title> -<path fill="none" stroke="#59d856" stroke-width="2" d="M409,-324C409,-324 346,-324 346,-324 340,-324 334,-318 334,-312 334,-312 334,-300 334,-300 334,-294 340,-288 346,-288 346,-288 409,-288 409,-288 415,-288 421,-294 421,-300 421,-300 421,-312 421,-312 421,-318 415,-324 409,-324"/> +<path fill="none" stroke="#d89c56" stroke-width="2" d="M409,-324C409,-324 346,-324 346,-324 340,-324 334,-318 334,-312 334,-312 334,-300 334,-300 334,-294 340,-288 346,-288 346,-288 409,-288 409,-288 415,-288 421,-294 421,-300 421,-300 421,-312 421,-312 421,-318 415,-324 409,-324"/> <text text-anchor="middle" x="377.5" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">filter_anno_gtf</text> </g> <!-- 5->3 --> @@ -138,7 +138,7 @@ <!-- 8 --> <g id="node9" class="node"> <title>8</title> -<path fill="none" stroke="#ced856" stroke-width="2" d="M438.5,-252C438.5,-252 376.5,-252 376.5,-252 370.5,-252 364.5,-246 364.5,-240 364.5,-240 364.5,-228 364.5,-228 364.5,-222 370.5,-216 376.5,-216 376.5,-216 438.5,-216 438.5,-216 444.5,-216 450.5,-222 450.5,-228 450.5,-228 450.5,-240 450.5,-240 450.5,-246 444.5,-252 438.5,-252"/> +<path fill="none" stroke="#59d856" stroke-width="2" d="M438.5,-252C438.5,-252 376.5,-252 376.5,-252 370.5,-252 364.5,-246 364.5,-240 364.5,-240 364.5,-228 364.5,-228 364.5,-222 370.5,-216 376.5,-216 376.5,-216 438.5,-216 438.5,-216 444.5,-216 450.5,-222 450.5,-228 450.5,-228 450.5,-240 450.5,-240 450.5,-246 444.5,-252 438.5,-252"/> <text text-anchor="middle" x="407.5" y="-231.5" font-family="sans" 
font-size="10.00" fill="#000000">get_exons_gtf</text> </g> <!-- 5->8 --> @@ -148,7 +148,7 @@ <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="399.2124,-262.9902 399.8278,-252.4133 392.7508,-260.2979 399.2124,-262.9902"/> </g> <!-- 6->0 --> -<g id="edge3" class="edge"> +<g id="edge5" class="edge"> <title>6->0</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M94.0496,-503.8284C91.9993,-476.5296 88.5,-423.3034 88.5,-378 88.5,-378 88.5,-378 88.5,-162 88.5,-119.2408 90.4989,-100.3597 122.5,-72 149.3909,-48.169 249.9753,-30.8466 305.3538,-22.8826"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="306.0511,-26.319 315.466,-21.4601 305.0759,-19.3873 306.0511,-26.319"/> @@ -156,11 +156,11 @@ <!-- 7 --> <g id="node8" class="node"> <title>7</title> -<path fill="none" stroke="#566bd8" stroke-width="2" d="M438.5,-108C438.5,-108 404.5,-108 404.5,-108 398.5,-108 392.5,-102 392.5,-96 392.5,-96 392.5,-84 392.5,-84 392.5,-78 398.5,-72 404.5,-72 404.5,-72 438.5,-72 438.5,-72 444.5,-72 450.5,-78 450.5,-84 450.5,-84 450.5,-96 450.5,-96 450.5,-102 444.5,-108 438.5,-108"/> +<path fill="none" stroke="#70d856" stroke-width="2" d="M438.5,-108C438.5,-108 404.5,-108 404.5,-108 398.5,-108 392.5,-102 392.5,-96 392.5,-96 392.5,-84 392.5,-84 392.5,-78 398.5,-72 404.5,-72 404.5,-72 438.5,-72 438.5,-72 444.5,-72 450.5,-78 450.5,-84 450.5,-84 450.5,-96 450.5,-96 450.5,-102 444.5,-108 438.5,-108"/> <text text-anchor="middle" x="421.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">gtftobed</text> </g> <!-- 7->0 --> -<g id="edge6" class="edge"> +<g id="edge4" class="edge"> <title>7->0</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M401.565,-71.8314C392.0217,-63.1337 380.4401,-52.5783 370.0694,-43.1265"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="372.1772,-40.3121 362.4287,-36.1628 367.462,-45.4857 372.1772,-40.3121"/> @@ -180,11 +180,11 @@ <!-- 10 --> <g id="node11" class="node"> <title>10</title> 
-<path fill="none" stroke="#56d89a" stroke-width="2" d="M495,-468C495,-468 462,-468 462,-468 456,-468 450,-462 450,-456 450,-456 450,-444 450,-444 450,-438 456,-432 462,-432 462,-432 495,-432 495,-432 501,-432 507,-438 507,-444 507,-444 507,-456 507,-456 507,-462 501,-468 495,-468"/> +<path fill="none" stroke="#56d882" stroke-width="2" d="M495,-468C495,-468 462,-468 462,-468 456,-468 450,-462 450,-456 450,-456 450,-444 450,-444 450,-438 456,-432 462,-432 462,-432 495,-432 495,-432 501,-432 507,-438 507,-444 507,-444 507,-456 507,-456 507,-462 501,-468 495,-468"/> <text text-anchor="middle" x="478.5" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">gfftobed</text> </g> <!-- 10->0 --> -<g id="edge5" class="edge"> +<g id="edge1" class="edge"> <title>10->0</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M478.5,-431.8146C478.5,-404.4983 478.5,-351.25 478.5,-306 478.5,-306 478.5,-306 478.5,-162 478.5,-121.1184 484.9901,-103.9619 459.5,-72 439.6741,-47.1403 405.7974,-33.2394 379.5835,-25.7916"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="380.1623,-22.3246 369.6002,-23.1563 378.3757,-29.0927 380.1623,-22.3246"/> @@ -192,7 +192,7 @@ <!-- 19 --> <g id="node20" class="node"> <title>19</title> -<path fill="none" stroke="#9fd856" stroke-width="2" d="M602,-396C602,-396 519,-396 519,-396 513,-396 507,-390 507,-384 507,-384 507,-372 507,-372 507,-366 513,-360 519,-360 519,-360 602,-360 602,-360 608,-360 614,-366 614,-372 614,-372 614,-384 614,-384 614,-390 608,-396 602,-396"/> +<path fill="none" stroke="#d8b456" stroke-width="2" d="M602,-396C602,-396 519,-396 519,-396 513,-396 507,-390 507,-384 507,-384 507,-372 507,-372 507,-366 513,-360 519,-360 519,-360 602,-360 602,-360 608,-360 614,-366 614,-372 614,-372 614,-384 614,-384 614,-390 608,-396 602,-396"/> <text text-anchor="middle" x="560.5" y="-375.5" font-family="sans" font-size="10.00" fill="#000000">filter_mature_mirs</text> </g> <!-- 10->19 --> @@ -204,7 +204,7 @@ <!-- 11 
--> <g id="node12" class="node"> <title>11</title> -<path fill="none" stroke="#56d86b" stroke-width="2" d="M489,-540C489,-540 412,-540 412,-540 406,-540 400,-534 400,-528 400,-528 400,-516 400,-516 400,-510 406,-504 412,-504 412,-504 489,-504 489,-504 495,-504 501,-510 501,-516 501,-516 501,-528 501,-528 501,-534 495,-540 489,-540"/> +<path fill="none" stroke="#56c9d8" stroke-width="2" d="M489,-540C489,-540 412,-540 412,-540 406,-540 400,-534 400,-528 400,-528 400,-516 400,-516 400,-510 406,-504 412,-504 412,-504 489,-504 489,-504 495,-504 501,-510 501,-516 501,-516 501,-528 501,-528 501,-534 495,-540 489,-540"/> <text text-anchor="middle" x="450.5" y="-519.5" font-family="sans" font-size="10.00" fill="#000000">filter_mir_1_anno</text> </g> <!-- 11->10 --> @@ -216,7 +216,7 @@ <!-- 12 --> <g id="node13" class="node"> <title>12</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" d="M461,-612C461,-612 388,-612 388,-612 382,-612 376,-606 376,-600 376,-600 376,-588 376,-588 376,-582 382,-576 388,-576 388,-576 461,-576 461,-576 467,-576 473,-582 473,-588 473,-588 473,-600 473,-600 473,-606 467,-612 461,-612"/> +<path fill="none" stroke="#566bd8" stroke-width="2" d="M461,-612C461,-612 388,-612 388,-612 382,-612 376,-606 376,-600 376,-600 376,-588 376,-588 376,-582 382,-576 388,-576 388,-576 461,-576 461,-576 467,-576 473,-582 473,-588 473,-588 473,-600 473,-600 473,-606 467,-612 461,-612"/> <text text-anchor="middle" x="424.5" y="-591.5" font-family="sans" font-size="10.00" fill="#000000">map_chr_names</text> </g> <!-- 12->11 --> @@ -240,11 +240,11 @@ <!-- 15 --> <g id="node16" class="node"> <title>15</title> -<path fill="none" stroke="#569ad8" stroke-width="2" d="M584.5,-108C584.5,-108 522.5,-108 522.5,-108 516.5,-108 510.5,-102 510.5,-96 510.5,-96 510.5,-84 510.5,-84 510.5,-78 516.5,-72 522.5,-72 522.5,-72 584.5,-72 584.5,-72 590.5,-72 596.5,-78 596.5,-84 596.5,-84 596.5,-96 596.5,-96 596.5,-102 590.5,-108 584.5,-108"/> +<path fill="none" stroke="#56d86b" 
stroke-width="2" d="M584.5,-108C584.5,-108 522.5,-108 522.5,-108 516.5,-108 510.5,-102 510.5,-96 510.5,-96 510.5,-84 510.5,-84 510.5,-78 516.5,-72 522.5,-72 522.5,-72 584.5,-72 584.5,-72 590.5,-72 596.5,-78 596.5,-84 596.5,-84 596.5,-96 596.5,-96 596.5,-102 590.5,-108 584.5,-108"/> <text text-anchor="middle" x="553.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_final</text> </g> <!-- 15->0 --> -<g id="edge4" class="edge"> +<g id="edge6" class="edge"> <title>15->0</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M510.2918,-75.256C471.7195,-62.0939 415.8391,-43.0257 379.3404,-30.5711"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="380.1849,-27.1612 369.5904,-27.2441 377.9242,-33.7861 380.1849,-27.1612"/> @@ -252,7 +252,7 @@ <!-- 16 --> <g id="node17" class="node"> <title>16</title> -<path fill="none" stroke="#b6d856" stroke-width="2" d="M594.5,-180C594.5,-180 520.5,-180 520.5,-180 514.5,-180 508.5,-174 508.5,-168 508.5,-168 508.5,-156 508.5,-156 508.5,-150 514.5,-144 520.5,-144 520.5,-144 594.5,-144 594.5,-144 600.5,-144 606.5,-150 606.5,-156 606.5,-156 606.5,-168 606.5,-168 606.5,-174 600.5,-180 594.5,-180"/> +<path fill="none" stroke="#d88556" stroke-width="2" d="M594.5,-180C594.5,-180 520.5,-180 520.5,-180 514.5,-180 508.5,-174 508.5,-168 508.5,-168 508.5,-156 508.5,-156 508.5,-150 514.5,-144 520.5,-144 520.5,-144 594.5,-144 594.5,-144 600.5,-144 606.5,-150 606.5,-156 606.5,-156 606.5,-168 606.5,-168 606.5,-174 600.5,-180 594.5,-180"/> <text text-anchor="middle" x="557.5" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_concat</text> </g> <!-- 16->15 --> @@ -264,7 +264,7 @@ <!-- 17 --> <g id="node18" class="node"> <title>17</title> -<path fill="none" stroke="#d89c56" stroke-width="2" d="M599,-252C599,-252 520,-252 520,-252 514,-252 508,-246 508,-240 508,-240 508,-228 508,-228 508,-222 514,-216 520,-216 520,-216 599,-216 599,-216 605,-216 611,-222 611,-228 611,-228 611,-240 
611,-240 611,-246 605,-252 599,-252"/> +<path fill="none" stroke="#b6d856" stroke-width="2" d="M599,-252C599,-252 520,-252 520,-252 514,-252 508,-246 508,-240 508,-240 508,-228 508,-228 508,-222 514,-216 520,-216 520,-216 599,-216 599,-216 605,-216 611,-222 611,-228 611,-228 611,-240 611,-240 611,-246 605,-252 599,-252"/> <text text-anchor="middle" x="559.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 17->16 --> @@ -276,7 +276,7 @@ <!-- 18 --> <g id="node19" class="node"> <title>18</title> -<path fill="none" stroke="#d8cb56" stroke-width="2" d="M578.5,-324C578.5,-324 542.5,-324 542.5,-324 536.5,-324 530.5,-318 530.5,-312 530.5,-312 530.5,-300 530.5,-300 530.5,-294 536.5,-288 542.5,-288 542.5,-288 578.5,-288 578.5,-288 584.5,-288 590.5,-294 590.5,-300 590.5,-300 590.5,-312 590.5,-312 590.5,-318 584.5,-324 578.5,-324"/> +<path fill="none" stroke="#56d89a" stroke-width="2" d="M578.5,-324C578.5,-324 542.5,-324 542.5,-324 536.5,-324 530.5,-318 530.5,-312 530.5,-312 530.5,-300 530.5,-300 530.5,-294 536.5,-288 542.5,-288 542.5,-288 578.5,-288 578.5,-288 584.5,-288 590.5,-294 590.5,-300 590.5,-300 590.5,-312 590.5,-312 590.5,-318 584.5,-324 578.5,-324"/> <text text-anchor="middle" x="560.5" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> </g> <!-- 18->17 --> @@ -286,7 +286,7 @@ <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="563.3944,-262.3637 559.7557,-252.4133 556.3951,-262.4609 563.3944,-262.3637"/> </g> <!-- 19->18 --> -<g id="edge24" class="edge"> +<g id="edge25" class="edge"> <title>19->18</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M560.5,-359.8314C560.5,-352.131 560.5,-342.9743 560.5,-334.4166"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="564.0001,-334.4132 560.5,-324.4133 557.0001,-334.4133 564.0001,-334.4132"/> @@ -294,11 +294,11 @@ <!-- 20 --> <g id="node21" class="node"> <title>20</title> -<path fill="none" 
stroke="#d8b456" stroke-width="2" d="M712.5,-396C712.5,-396 644.5,-396 644.5,-396 638.5,-396 632.5,-390 632.5,-384 632.5,-384 632.5,-372 632.5,-372 632.5,-366 638.5,-360 644.5,-360 644.5,-360 712.5,-360 712.5,-360 718.5,-360 724.5,-366 724.5,-372 724.5,-372 724.5,-384 724.5,-384 724.5,-390 718.5,-396 712.5,-396"/> +<path fill="none" stroke="#56d8b1" stroke-width="2" d="M712.5,-396C712.5,-396 644.5,-396 644.5,-396 638.5,-396 632.5,-390 632.5,-384 632.5,-384 632.5,-372 632.5,-372 632.5,-366 638.5,-360 644.5,-360 644.5,-360 712.5,-360 712.5,-360 718.5,-360 724.5,-366 724.5,-372 724.5,-372 724.5,-384 724.5,-384 724.5,-390 718.5,-396 712.5,-396"/> <text text-anchor="middle" x="678.5" y="-375.5" font-family="sans" font-size="10.00" fill="#000000">extract_chr_len</text> </g> <!-- 20->18 --> -<g id="edge25" class="edge"> +<g id="edge24" class="edge"> <title>20->18</title> <path fill="none" stroke="#c0c0c0" stroke-width="2" d="M648.7236,-359.8314C633.6307,-350.6221 615.1243,-339.3301 598.9657,-329.4706"/> <polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="600.6263,-326.3838 590.2669,-324.1628 596.9802,-332.3593 600.6263,-326.3838"/> diff --git a/images/workflow_dag_map.svg b/images/workflow_dag_map.svg new file mode 100644 index 0000000..bfbb331 --- /dev/null +++ b/images/workflow_dag_map.svg @@ -0,0 +1,387 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<!-- Generated by graphviz version 2.40.1 (20161225.0304) + --> +<!-- Title: snakemake_dag Pages: 1 --> +<svg width="646pt" height="1489pt" + viewBox="0.00 0.00 646.00 1489.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> +<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1485)"> +<title>snakemake_dag</title> +<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-1485 642,-1485 642,4 -4,4"/> +<!-- 0 --> +<g id="node1" class="node"> 
+<title>0</title> +<path fill="none" stroke="#569ad8" stroke-width="2" d="M356,-36C356,-36 326,-36 326,-36 320,-36 314,-30 314,-24 314,-24 314,-12 314,-12 314,-6 320,0 326,0 326,0 356,0 356,0 362,0 368,-6 368,-12 368,-12 368,-24 368,-24 368,-30 362,-36 356,-36"/> +<text text-anchor="middle" x="341" y="-15.5" font-family="sans" font-size="10.00" fill="#000000">finish</text> +</g> +<!-- 1 --> +<g id="node2" class="node"> +<title>1</title> +<path fill="none" stroke="#56b9d8" stroke-width="2" d="M364.5,-108C364.5,-108 317.5,-108 317.5,-108 311.5,-108 305.5,-102 305.5,-96 305.5,-96 305.5,-84 305.5,-84 305.5,-78 311.5,-72 317.5,-72 317.5,-72 364.5,-72 364.5,-72 370.5,-72 376.5,-78 376.5,-84 376.5,-84 376.5,-96 376.5,-96 376.5,-102 370.5,-108 364.5,-108"/> +<text text-anchor="middle" x="341" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">index_bam</text> +</g> +<!-- 1->0 --> +<g id="edge1" class="edge"> +<title>1->0</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-71.8314C341,-64.131 341,-54.9743 341,-46.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-46.4132 341,-36.4133 337.5001,-46.4133 344.5001,-46.4132"/> +</g> +<!-- 2 --> +<g id="node3" class="node"> +<title>2</title> +<path fill="none" stroke="#9fd856" stroke-width="2" d="M377.5,-180C377.5,-180 304.5,-180 304.5,-180 298.5,-180 292.5,-174 292.5,-168 292.5,-168 292.5,-156 292.5,-156 292.5,-150 298.5,-144 304.5,-144 304.5,-144 377.5,-144 377.5,-144 383.5,-144 389.5,-150 389.5,-156 389.5,-156 389.5,-168 389.5,-168 389.5,-174 383.5,-180 377.5,-180"/> +<text text-anchor="middle" x="341" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">sort_by_position</text> +</g> +<!-- 2->1 --> +<g id="edge2" class="edge"> +<title>2->1</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-143.8314C341,-136.131 341,-126.9743 341,-118.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-118.4132 
341,-108.4133 337.5001,-118.4133 344.5001,-118.4132"/> +</g> +<!-- 3 --> +<g id="node4" class="node"> +<title>3</title> +<path fill="none" stroke="#d8cb56" stroke-width="2" d="M376.5,-252C376.5,-252 305.5,-252 305.5,-252 299.5,-252 293.5,-246 293.5,-240 293.5,-240 293.5,-228 293.5,-228 293.5,-222 299.5,-216 305.5,-216 305.5,-216 376.5,-216 376.5,-216 382.5,-216 388.5,-222 388.5,-228 388.5,-228 388.5,-240 388.5,-240 388.5,-246 382.5,-252 376.5,-252"/> +<text text-anchor="middle" x="341" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">convert_to_bam</text> +</g> +<!-- 3->2 --> +<g id="edge3" class="edge"> +<title>3->2</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-215.8314C341,-208.131 341,-198.9743 341,-190.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-190.4132 341,-180.4133 337.5001,-190.4133 344.5001,-190.4132"/> +</g> +<!-- 4 --> +<g id="node5" class="node"> +<title>4</title> +<path fill="none" stroke="#70d856" stroke-width="2" d="M379.5,-324C379.5,-324 302.5,-324 302.5,-324 296.5,-324 290.5,-318 290.5,-312 290.5,-312 290.5,-300 290.5,-300 290.5,-294 296.5,-288 302.5,-288 302.5,-288 379.5,-288 379.5,-288 385.5,-288 391.5,-294 391.5,-300 391.5,-300 391.5,-312 391.5,-312 391.5,-318 385.5,-324 379.5,-324"/> +<text text-anchor="middle" x="341" y="-303.5" font-family="sans" font-size="10.00" fill="#000000">uncollapse_reads</text> +</g> +<!-- 4->3 --> +<g id="edge4" class="edge"> +<title>4->3</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-287.8314C341,-280.131 341,-270.9743 341,-262.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-262.4132 341,-252.4133 337.5001,-262.4133 344.5001,-262.4132"/> +</g> +<!-- 5 --> +<g id="node6" class="node"> +<title>5</title> +<path fill="none" stroke="#d87556" stroke-width="2" d="M379,-396C379,-396 303,-396 303,-396 297,-396 291,-390 291,-384 291,-384 291,-372 291,-372 291,-366 297,-360 303,-360 
303,-360 379,-360 379,-360 385,-360 391,-366 391,-372 391,-372 391,-384 391,-384 391,-390 385,-396 379,-396"/> +<text text-anchor="middle" x="341" y="-375.5" font-family="sans" font-size="10.00" fill="#000000">remove_inferiors</text> +</g> +<!-- 5->4 --> +<g id="edge5" class="edge"> +<title>5->4</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-359.8314C341,-352.131 341,-342.9743 341,-334.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-334.4132 341,-324.4133 337.5001,-334.4133 344.5001,-334.4132"/> +</g> +<!-- 6 --> +<g id="node7" class="node"> +<title>6</title> +<path fill="none" stroke="#56d8a2" stroke-width="2" d="M356,-468C356,-468 326,-468 326,-468 320,-468 314,-462 314,-456 314,-456 314,-444 314,-444 314,-438 320,-432 326,-432 326,-432 356,-432 356,-432 362,-432 368,-438 368,-444 368,-444 368,-456 368,-456 368,-462 362,-468 356,-468"/> +<text text-anchor="middle" x="341" y="-447.5" font-family="sans" font-size="10.00" fill="#000000">sort_id</text> +</g> +<!-- 6->5 --> +<g id="edge6" class="edge"> +<title>6->5</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-431.8314C341,-424.131 341,-414.9743 341,-406.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-406.4132 341,-396.4133 337.5001,-406.4133 344.5001,-406.4132"/> +</g> +<!-- 7 --> +<g id="node8" class="node"> +<title>7</title> +<path fill="none" stroke="#56a9d8" stroke-width="2" d="M366,-540C366,-540 316,-540 316,-540 310,-540 304,-534 304,-528 304,-528 304,-516 304,-516 304,-510 310,-504 316,-504 316,-504 366,-504 366,-504 372,-504 378,-510 378,-516 378,-516 378,-528 378,-528 378,-534 372,-540 366,-540"/> +<text text-anchor="middle" x="341" y="-519.5" font-family="sans" font-size="10.00" fill="#000000">add_header</text> +</g> +<!-- 7->6 --> +<g id="edge7" class="edge"> +<title>7->6</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-503.8314C341,-496.131 341,-486.9743 
341,-478.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-478.4132 341,-468.4133 337.5001,-478.4133 344.5001,-478.4132"/> +</g> +<!-- 8 --> +<g id="node9" class="node"> +<title>8</title> +<path fill="none" stroke="#d8ac56" stroke-width="2" d="M369,-612C369,-612 313,-612 313,-612 307,-612 301,-606 301,-600 301,-600 301,-588 301,-588 301,-582 307,-576 313,-576 313,-576 369,-576 369,-576 375,-576 381,-582 381,-588 381,-588 381,-600 381,-600 381,-606 375,-612 369,-612"/> +<text text-anchor="middle" x="341" y="-591.5" font-family="sans" font-size="10.00" fill="#000000">cat_mapping</text> +</g> +<!-- 8->7 --> +<g id="edge8" class="edge"> +<title>8->7</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M341,-575.8314C341,-568.131 341,-558.9743 341,-550.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="344.5001,-550.4132 341,-540.4133 337.5001,-550.4133 344.5001,-550.4132"/> +</g> +<!-- 9 --> +<g id="node10" class="node"> +<title>9</title> +<path fill="none" stroke="#d85656" stroke-width="2" d="M304,-684C304,-684 248,-684 248,-684 242,-684 236,-678 236,-672 236,-672 236,-660 236,-660 236,-654 242,-648 248,-648 248,-648 304,-648 304,-648 310,-648 316,-654 316,-660 316,-660 316,-672 316,-672 316,-678 310,-684 304,-684"/> +<text text-anchor="middle" x="276" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">trans_to_gen</text> +</g> +<!-- 9->8 --> +<g id="edge9" class="edge"> +<title>9->8</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M292.4022,-647.8314C300.0415,-639.3694 309.2683,-629.1489 317.6207,-619.8971"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="320.2738,-622.1813 324.3769,-612.4133 315.0779,-617.4906 320.2738,-622.1813"/> +</g> +<!-- 10 --> +<g id="node11" class="node"> +<title>10</title> +<path fill="none" stroke="#56d8b1" stroke-width="2" d="M349,-756C349,-756 201,-756 201,-756 195,-756 189,-750 189,-744 189,-744 189,-732 189,-732 189,-726 
195,-720 201,-720 201,-720 349,-720 349,-720 355,-720 361,-726 361,-732 361,-732 361,-744 361,-744 361,-750 355,-756 349,-756"/> +<text text-anchor="middle" x="275" y="-735.5" font-family="sans" font-size="10.00" fill="#000000">remove_headers_transcriptome</text> +</g> +<!-- 10->9 --> +<g id="edge11" class="edge"> +<title>10->9</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M275.2523,-719.8314C275.3593,-712.131 275.4865,-702.9743 275.6053,-694.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="279.1049,-694.4609 275.7443,-684.4133 272.1056,-694.3637 279.1049,-694.4609"/> +</g> +<!-- 11 --> +<g id="node12" class="node"> +<title>11</title> +<path fill="none" stroke="#d88556" stroke-width="2" d="M326,-828C326,-828 220,-828 220,-828 214,-828 208,-822 208,-816 208,-816 208,-804 208,-804 208,-798 214,-792 220,-792 220,-792 326,-792 326,-792 332,-792 338,-798 338,-804 338,-804 338,-816 338,-816 338,-822 332,-828 326,-828"/> +<text text-anchor="middle" x="273" y="-807.5" font-family="sans" font-size="10.00" fill="#000000">filter_nh_transcriptome</text> +</g> +<!-- 11->10 --> +<g id="edge12" class="edge"> +<title>11->10</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M273.5047,-791.8314C273.7186,-784.131 273.9729,-774.9743 274.2106,-766.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="277.7094,-766.5066 274.4885,-756.4133 270.7121,-766.3122 277.7094,-766.5066"/> +</g> +<!-- 12 --> +<g id="node13" class="node"> +<title>12</title> +<path fill="none" stroke="#56d892" stroke-width="2" d="M317,-900C317,-900 187,-900 187,-900 181,-900 175,-894 175,-888 175,-888 175,-876 175,-876 175,-870 181,-864 187,-864 187,-864 317,-864 317,-864 323,-864 329,-870 329,-876 329,-876 329,-888 329,-888 329,-894 323,-900 317,-900"/> +<text text-anchor="middle" x="252" y="-879.5" font-family="sans" font-size="10.00" fill="#000000">merge_transcriptome_maps</text> +</g> +<!-- 12->11 --> +<g id="edge13" class="edge"> 
+<title>12->11</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M257.2992,-863.8314C259.5698,-856.0463 262.2746,-846.7729 264.794,-838.1347"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="268.1894,-838.9933 267.6295,-828.4133 261.4694,-837.0332 268.1894,-838.9933"/> +</g> +<!-- 13 --> +<g id="node14" class="node"> +<title>13</title> +<path fill="none" stroke="#61d856" stroke-width="2" d="M174,-1116C174,-1116 12,-1116 12,-1116 6,-1116 0,-1110 0,-1104 0,-1104 0,-1092 0,-1092 0,-1086 6,-1080 12,-1080 12,-1080 174,-1080 174,-1080 180,-1080 186,-1086 186,-1092 186,-1092 186,-1104 186,-1104 186,-1110 180,-1116 174,-1116"/> +<text text-anchor="middle" x="93" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_transcriptome_segemehl</text> +</g> +<!-- 13->12 --> +<g id="edge14" class="edge"> +<title>13->12</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M96.0562,-1079.8994C102.321,-1047.6053 119.327,-979.3819 157,-936 168.0222,-923.3075 182.7958,-912.932 197.4032,-904.7901"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="199.2392,-907.7785 206.442,-900.0086 195.9659,-901.5909 199.2392,-907.7785"/> +</g> +<!-- 14 --> +<g id="node15" class="node"> +<title>14</title> +<path fill="none" stroke="#c6d856" stroke-width="2" d="M361.5,-1260C361.5,-1260 294.5,-1260 294.5,-1260 288.5,-1260 282.5,-1254 282.5,-1248 282.5,-1248 282.5,-1236 282.5,-1236 282.5,-1230 288.5,-1224 294.5,-1224 294.5,-1224 361.5,-1224 361.5,-1224 367.5,-1224 373.5,-1230 373.5,-1236 373.5,-1236 373.5,-1248 373.5,-1248 373.5,-1254 367.5,-1260 361.5,-1260"/> +<text text-anchor="middle" x="328" y="-1239.5" font-family="sans" font-size="10.00" fill="#000000">fastx_collapser</text> +</g> +<!-- 14->13 --> +<g id="edge16" class="edge"> +<title>14->13</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M298.2371,-1223.7623C255.7707,-1197.7404 177.7809,-1149.9508 131.1322,-1121.3661"/> +<polygon fill="#c0c0c0" 
stroke="#c0c0c0" stroke-width="2" points="132.8765,-1118.3301 122.5213,-1116.0896 129.2191,-1124.2987 132.8765,-1118.3301"/> +</g> +<!-- 20 --> +<g id="node21" class="node"> +<title>20</title> +<path fill="none" stroke="#56d8c1" stroke-width="2" d="M385.5,-1188C385.5,-1188 270.5,-1188 270.5,-1188 264.5,-1188 258.5,-1182 258.5,-1176 258.5,-1176 258.5,-1164 258.5,-1164 258.5,-1158 264.5,-1152 270.5,-1152 270.5,-1152 385.5,-1152 385.5,-1152 391.5,-1152 397.5,-1158 397.5,-1164 397.5,-1164 397.5,-1176 397.5,-1176 397.5,-1182 391.5,-1188 385.5,-1188"/> +<text text-anchor="middle" x="328" y="-1167.5" font-family="sans" font-size="10.00" fill="#000000">filter_fasta_for_oligomap</text> +</g> +<!-- 14->20 --> +<g id="edge23" class="edge"> +<title>14->20</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1223.8314C328,-1216.131 328,-1206.9743 328,-1198.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1198.4132 328,-1188.4133 324.5001,-1198.4133 331.5001,-1198.4132"/> +</g> +<!-- 25 --> +<g id="node26" class="node"> +<title>25</title> +<path fill="none" stroke="#5663d8" stroke-width="2" d="M626,-1188C626,-1188 492,-1188 492,-1188 486,-1188 480,-1182 480,-1176 480,-1176 480,-1164 480,-1164 480,-1158 486,-1152 492,-1152 492,-1152 626,-1152 626,-1152 632,-1152 638,-1158 638,-1164 638,-1164 638,-1176 638,-1176 638,-1182 632,-1188 626,-1188"/> +<text text-anchor="middle" x="559" y="-1167.5" font-family="sans" font-size="10.00" fill="#000000">mapping_genome_segemehl</text> +</g> +<!-- 14->25 --> +<g id="edge29" class="edge"> +<title>14->25</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M373.6382,-1227.7751C407.1926,-1217.3166 453.3477,-1202.9306 491.2064,-1191.1305"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="492.6366,-1194.3509 501.1421,-1188.0336 490.5535,-1187.6679 492.6366,-1194.3509"/> +</g> +<!-- 15 --> +<g id="node16" class="node"> +<title>15</title> +<path fill="none" 
stroke="#568ad8" stroke-width="2" d="M345.5,-1332C345.5,-1332 310.5,-1332 310.5,-1332 304.5,-1332 298.5,-1326 298.5,-1320 298.5,-1320 298.5,-1308 298.5,-1308 298.5,-1302 304.5,-1296 310.5,-1296 310.5,-1296 345.5,-1296 345.5,-1296 351.5,-1296 357.5,-1302 357.5,-1308 357.5,-1308 357.5,-1320 357.5,-1320 357.5,-1326 351.5,-1332 345.5,-1332"/> +<text text-anchor="middle" x="328" y="-1311.5" font-family="sans" font-size="10.00" fill="#000000">cutadapt</text> +</g> +<!-- 15->14 --> +<g id="edge17" class="edge"> +<title>15->14</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1295.8314C328,-1288.131 328,-1278.9743 328,-1270.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1270.4132 328,-1260.4133 324.5001,-1270.4133 331.5001,-1270.4132"/> +</g> +<!-- 16 --> +<g id="node17" class="node"> +<title>16</title> +<path fill="none" stroke="#d86656" stroke-width="2" d="M362.5,-1404C362.5,-1404 293.5,-1404 293.5,-1404 287.5,-1404 281.5,-1398 281.5,-1392 281.5,-1392 281.5,-1380 281.5,-1380 281.5,-1374 287.5,-1368 293.5,-1368 293.5,-1368 362.5,-1368 362.5,-1368 368.5,-1368 374.5,-1374 374.5,-1380 374.5,-1380 374.5,-1392 374.5,-1392 374.5,-1398 368.5,-1404 362.5,-1404"/> +<text text-anchor="middle" x="328" y="-1383.5" font-family="sans" font-size="10.00" fill="#000000">fasta_formatter</text> +</g> +<!-- 16->15 --> +<g id="edge18" class="edge"> +<title>16->15</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1367.8314C328,-1360.131 328,-1350.9743 328,-1342.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1342.4132 328,-1332.4133 324.5001,-1342.4133 331.5001,-1342.4132"/> +</g> +<!-- 17 --> +<g id="node18" class="node"> +<title>17</title> +<path fill="none" stroke="#d6d856" stroke-width="2" d="M385.5,-1481C385.5,-1481 270.5,-1481 270.5,-1481 264.5,-1481 258.5,-1475 258.5,-1469 258.5,-1469 258.5,-1452 258.5,-1452 258.5,-1446 264.5,-1440 270.5,-1440 270.5,-1440 385.5,-1440 
385.5,-1440 391.5,-1440 397.5,-1446 397.5,-1452 397.5,-1452 397.5,-1469 397.5,-1469 397.5,-1475 391.5,-1481 385.5,-1481"/> +<text text-anchor="middle" x="328" y="-1469" font-family="sans" font-size="10.00" fill="#000000">uncompress_zipped_files</text> +<text text-anchor="middle" x="328" y="-1458" font-family="sans" font-size="10.00" fill="#000000">format: fa</text> +<text text-anchor="middle" x="328" y="-1447" font-family="sans" font-size="10.00" fill="#000000">sample: test_lib</text> +</g> +<!-- 17->16 --> +<g id="edge19" class="edge"> +<title>17->16</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M328,-1439.7476C328,-1431.8767 328,-1422.7743 328,-1414.3232"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="331.5001,-1414.1086 328,-1404.1087 324.5001,-1414.1087 331.5001,-1414.1086"/> +</g> +<!-- 18 --> +<g id="node19" class="node"> +<title>18</title> +<path fill="none" stroke="#56d873" stroke-width="2" d="M326,-972C326,-972 178,-972 178,-972 172,-972 166,-966 166,-960 166,-960 166,-948 166,-948 166,-942 172,-936 178,-936 178,-936 326,-936 326,-936 332,-936 338,-942 338,-948 338,-948 338,-960 338,-960 338,-966 332,-972 326,-972"/> +<text text-anchor="middle" x="252" y="-951.5" font-family="sans" font-size="10.00" fill="#000000">oligomap_transcriptome_toSAM</text> +</g> +<!-- 18->12 --> +<g id="edge15" class="edge"> +<title>18->12</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M252,-935.8314C252,-928.131 252,-918.9743 252,-910.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="255.5001,-910.4132 252,-900.4133 248.5001,-910.4133 255.5001,-910.4132"/> +</g> +<!-- 19 --> +<g id="node20" class="node"> +<title>19</title> +<path fill="none" stroke="#d89c56" stroke-width="2" d="M376,-1116C376,-1116 216,-1116 216,-1116 210,-1116 204,-1110 204,-1104 204,-1104 204,-1092 204,-1092 204,-1086 210,-1080 216,-1080 216,-1080 376,-1080 376,-1080 382,-1080 388,-1086 388,-1092 388,-1092 388,-1104 388,-1104 
388,-1110 382,-1116 376,-1116"/> +<text text-anchor="middle" x="296" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_transcriptome_oligomap</text> +</g> +<!-- 19->18 --> +<g id="edge20" class="edge"> +<title>19->18</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M246.4713,-1079.9463C230.6746,-1071.5649 214.9778,-1059.8565 206,-1044 193.896,-1022.6221 208.7881,-997.7098 224.7693,-979.5223"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="227.3609,-981.8751 231.6095,-972.1694 222.2357,-977.1072 227.3609,-981.8751"/> +</g> +<!-- 21 --> +<g id="node22" class="node"> +<title>21</title> +<path fill="none" stroke="#567bd8" stroke-width="2" d="M363,-1044C363,-1044 227,-1044 227,-1044 221,-1044 215,-1038 215,-1032 215,-1032 215,-1020 215,-1020 215,-1014 221,-1008 227,-1008 227,-1008 363,-1008 363,-1008 369,-1008 375,-1014 375,-1020 375,-1020 375,-1032 375,-1032 375,-1038 369,-1044 363,-1044"/> +<text text-anchor="middle" x="295" y="-1023.5" font-family="sans" font-size="10.00" fill="#000000">sort_transcriptome_oligomap</text> +</g> +<!-- 19->21 --> +<g id="edge24" class="edge"> +<title>19->21</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M295.7477,-1079.8314C295.6407,-1072.131 295.5135,-1062.9743 295.3947,-1054.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="298.8944,-1054.3637 295.2557,-1044.4133 291.8951,-1054.4609 298.8944,-1054.3637"/> +</g> +<!-- 20->19 --> +<g id="edge22" class="edge"> +<title>20->19</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M319.925,-1151.8314C316.3898,-1143.8771 312.164,-1134.369 308.2544,-1125.5723"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="311.4435,-1124.1299 304.1837,-1116.4133 305.0468,-1126.9729 311.4435,-1124.1299"/> +</g> +<!-- 27 --> +<g id="node28" class="node"> +<title>27</title> +<path fill="none" stroke="#8fd856" stroke-width="2" d="M549.5,-1116C549.5,-1116 418.5,-1116 418.5,-1116 
412.5,-1116 406.5,-1110 406.5,-1104 406.5,-1104 406.5,-1092 406.5,-1092 406.5,-1086 412.5,-1080 418.5,-1080 418.5,-1080 549.5,-1080 549.5,-1080 555.5,-1080 561.5,-1086 561.5,-1092 561.5,-1092 561.5,-1104 561.5,-1104 561.5,-1110 555.5,-1116 549.5,-1116"/> +<text text-anchor="middle" x="484" y="-1095.5" font-family="sans" font-size="10.00" fill="#000000">mapping_genome_oligomap</text> +</g> +<!-- 20->27 --> +<g id="edge32" class="edge"> +<title>20->27</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M367.3654,-1151.8314C388.1367,-1142.2446 413.7983,-1130.4008 435.7581,-1120.2655"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="437.3048,-1123.4065 444.9177,-1116.038 434.3714,-1117.0508 437.3048,-1123.4065"/> +</g> +<!-- 21->18 --> +<g id="edge21" class="edge"> +<title>21->18</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M284.1493,-1007.8314C279.2977,-999.7079 273.4783,-989.9637 268.132,-981.0118"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="271.1292,-979.2041 262.9968,-972.4133 265.1194,-982.7933 271.1292,-979.2041"/> +</g> +<!-- 22 --> +<g id="node23" class="node"> +<title>22</title> +<path fill="none" stroke="#56c9d8" stroke-width="2" d="M465.5,-684C465.5,-684 346.5,-684 346.5,-684 340.5,-684 334.5,-678 334.5,-672 334.5,-672 334.5,-660 334.5,-660 334.5,-654 340.5,-648 346.5,-648 346.5,-648 465.5,-648 465.5,-648 471.5,-648 477.5,-654 477.5,-660 477.5,-660 477.5,-672 477.5,-672 477.5,-678 471.5,-684 465.5,-684"/> +<text text-anchor="middle" x="406" y="-663.5" font-family="sans" font-size="10.00" fill="#000000">remove_headers_genome</text> +</g> +<!-- 22->8 --> +<g id="edge10" class="edge"> +<title>22->8</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M389.5978,-647.8314C381.9585,-639.3694 372.7317,-629.1489 364.3793,-619.8971"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="366.9221,-617.4906 357.6231,-612.4133 361.7262,-622.1813 366.9221,-617.4906"/> 
+</g> +<!-- 23 --> +<g id="node24" class="node"> +<title>23</title> +<path fill="none" stroke="#d8bc56" stroke-width="2" d="M455.5,-828C455.5,-828 378.5,-828 378.5,-828 372.5,-828 366.5,-822 366.5,-816 366.5,-816 366.5,-804 366.5,-804 366.5,-798 372.5,-792 378.5,-792 378.5,-792 455.5,-792 455.5,-792 461.5,-792 467.5,-798 467.5,-804 467.5,-804 467.5,-816 467.5,-816 467.5,-822 461.5,-828 455.5,-828"/> +<text text-anchor="middle" x="417" y="-807.5" font-family="sans" font-size="10.00" fill="#000000">nh_filter_genome</text> +</g> +<!-- 23->22 --> +<g id="edge25" class="edge"> +<title>23->22</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M415.6068,-791.7623C413.7306,-767.201 410.3731,-723.2474 408.1659,-694.3541"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="411.6334,-693.794 407.3818,-684.0896 404.6538,-694.3272 411.6334,-693.794"/> +</g> +<!-- 24 --> +<g id="node25" class="node"> +<title>24</title> +<path fill="none" stroke="#afd856" stroke-width="2" d="M496.5,-900C496.5,-900 395.5,-900 395.5,-900 389.5,-900 383.5,-894 383.5,-888 383.5,-888 383.5,-876 383.5,-876 383.5,-870 389.5,-864 395.5,-864 395.5,-864 496.5,-864 496.5,-864 502.5,-864 508.5,-870 508.5,-876 508.5,-876 508.5,-888 508.5,-888 508.5,-894 502.5,-900 496.5,-900"/> +<text text-anchor="middle" x="446" y="-879.5" font-family="sans" font-size="10.00" fill="#000000">merge_genome_maps</text> +</g> +<!-- 24->23 --> +<g id="edge26" class="edge"> +<title>24->23</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M438.6821,-863.8314C435.5124,-855.9617 431.73,-846.5712 428.2187,-837.8533"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="431.3992,-836.3815 424.4165,-828.4133 424.9061,-838.9968 431.3992,-836.3815"/> +</g> +<!-- 25->24 --> +<g id="edge27" class="edge"> +<title>25->24</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M564.4447,-1151.5968C572.5737,-1121.0938 585.4612,-1058.4231 571,-1008 560.5969,-971.7264 
552.2103,-963.1481 526,-936 515.0304,-924.6379 501.2945,-914.3226 488.316,-905.8234"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="489.8165,-902.6309 479.4952,-900.239 486.0721,-908.5453 489.8165,-902.6309"/> +</g> +<!-- 26 --> +<g id="node27" class="node"> +<title>26</title> +<path fill="none" stroke="#56d85b" stroke-width="2" d="M505.5,-972C505.5,-972 386.5,-972 386.5,-972 380.5,-972 374.5,-966 374.5,-960 374.5,-960 374.5,-948 374.5,-948 374.5,-942 380.5,-936 386.5,-936 386.5,-936 505.5,-936 505.5,-936 511.5,-936 517.5,-942 517.5,-948 517.5,-948 517.5,-960 517.5,-960 517.5,-966 511.5,-972 505.5,-972"/> +<text text-anchor="middle" x="446" y="-951.5" font-family="sans" font-size="10.00" fill="#000000">oligomap_genome_toSAM</text> +</g> +<!-- 26->24 --> +<g id="edge28" class="edge"> +<title>26->24</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M446,-935.8314C446,-928.131 446,-918.9743 446,-910.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="449.5001,-910.4132 446,-900.4133 442.5001,-910.4133 449.5001,-910.4132"/> +</g> +<!-- 27->26 --> +<g id="edge30" class="edge"> +<title>27->26</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M453.5042,-1079.7757C441.3178,-1070.7056 428.6228,-1058.5508 422,-1044 412.8156,-1023.8211 420.3523,-999.4424 429.3342,-981.15"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="432.4908,-982.6682 434.082,-972.1935 426.306,-979.3896 432.4908,-982.6682"/> +</g> +<!-- 28 --> +<g id="node29" class="node"> +<title>28</title> +<path fill="none" stroke="#56d8d0" stroke-width="2" d="M550.5,-1044C550.5,-1044 443.5,-1044 443.5,-1044 437.5,-1044 431.5,-1038 431.5,-1032 431.5,-1032 431.5,-1020 431.5,-1020 431.5,-1014 437.5,-1008 443.5,-1008 443.5,-1008 550.5,-1008 550.5,-1008 556.5,-1008 562.5,-1014 562.5,-1020 562.5,-1020 562.5,-1032 562.5,-1032 562.5,-1038 556.5,-1044 550.5,-1044"/> +<text text-anchor="middle" x="497" y="-1023.5" font-family="sans" 
font-size="10.00" fill="#000000">sort_genome_oligomap</text> +</g> +<!-- 27->28 --> +<g id="edge33" class="edge"> +<title>27->28</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M487.2804,-1079.8314C488.6708,-1072.131 490.3241,-1062.9743 491.8692,-1054.4166"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="495.3428,-1054.8761 493.6754,-1044.4133 488.4542,-1053.6322 495.3428,-1054.8761"/> +</g> +<!-- 28->26 --> +<g id="edge31" class="edge"> +<title>28->26</title> +<path fill="none" stroke="#c0c0c0" stroke-width="2" d="M484.1305,-1007.8314C478.3165,-999.6232 471.3304,-989.7606 464.9354,-980.7323"/> +<polygon fill="#c0c0c0" stroke="#c0c0c0" stroke-width="2" points="467.679,-978.5505 459.0427,-972.4133 461.9669,-982.5966 467.679,-978.5505"/> +</g> +</g> +</svg> diff --git a/images/workflow_dag_prepare_annotation.svg b/images/workflow_dag_prepare.svg similarity index 97% rename from images/workflow_dag_prepare_annotation.svg rename to images/workflow_dag_prepare.svg index 4e105ff..76a83c7 100644 --- a/images/workflow_dag_prepare_annotation.svg +++ b/images/workflow_dag_prepare.svg @@ -12,13 +12,13 @@ <!-- 0 --> <g id="node1" class="node"> <title>0</title> -<path fill="none" stroke="#d86e56" stroke-width="2" stroke-dasharray="5,2" d="M357.5,-36C357.5,-36 327.5,-36 327.5,-36 321.5,-36 315.5,-30 315.5,-24 315.5,-24 315.5,-12 315.5,-12 315.5,-6 321.5,0 327.5,0 327.5,0 357.5,0 357.5,0 363.5,0 369.5,-6 369.5,-12 369.5,-12 369.5,-24 369.5,-24 369.5,-30 363.5,-36 357.5,-36"/> +<path fill="none" stroke="#569ad8" stroke-width="2" stroke-dasharray="5,2" d="M357.5,-36C357.5,-36 327.5,-36 327.5,-36 321.5,-36 315.5,-30 315.5,-24 315.5,-24 315.5,-12 315.5,-12 315.5,-6 321.5,0 327.5,0 327.5,0 357.5,0 357.5,0 363.5,0 369.5,-6 369.5,-12 369.5,-12 369.5,-24 369.5,-24 369.5,-30 363.5,-36 357.5,-36"/> <text text-anchor="middle" x="342.5" y="-15.5" font-family="sans" font-size="10.00" fill="#000000">finish</text> </g> <!-- 1 --> <g id="node2" class="node"> 
<title>1</title> -<path fill="none" stroke="#b6d856" stroke-width="2" stroke-dasharray="5,2" d="M207,-108C207,-108 12,-108 12,-108 6,-108 0,-102 0,-96 0,-96 0,-84 0,-84 0,-78 6,-72 12,-72 12,-72 207,-72 207,-72 213,-72 219,-78 219,-84 219,-84 219,-96 219,-96 219,-102 213,-108 207,-108"/> +<path fill="none" stroke="#566bd8" stroke-width="2" stroke-dasharray="5,2" d="M207,-108C207,-108 12,-108 12,-108 6,-108 0,-102 0,-96 0,-96 0,-84 0,-84 0,-78 6,-72 12,-72 12,-72 207,-72 207,-72 213,-72 219,-78 219,-84 219,-84 219,-96 219,-96 219,-102 213,-108 207,-108"/> <text text-anchor="middle" x="109.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">generate_segemehl_index_transcriptome</text> </g> <!-- 1->0 --> @@ -30,7 +30,7 @@ <!-- 2 --> <g id="node3" class="node"> <title>2</title> -<path fill="none" stroke="#566bd8" stroke-width="2" stroke-dasharray="5,2" d="M131,-180C131,-180 88,-180 88,-180 82,-180 76,-174 76,-168 76,-168 76,-156 76,-156 76,-150 82,-144 88,-144 88,-144 131,-144 131,-144 137,-144 143,-150 143,-156 143,-156 143,-168 143,-168 143,-174 137,-180 131,-180"/> +<path fill="none" stroke="#d8b456" stroke-width="2" stroke-dasharray="5,2" d="M131,-180C131,-180 88,-180 88,-180 82,-180 76,-174 76,-168 76,-168 76,-156 76,-156 76,-150 82,-144 88,-144 88,-144 131,-144 131,-144 137,-144 143,-150 143,-156 143,-156 143,-168 143,-168 143,-174 137,-180 131,-180"/> <text text-anchor="middle" x="109.5" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">trim_fasta</text> </g> <!-- 2->1 --> @@ -42,7 +42,7 @@ <!-- 3 --> <g id="node4" class="node"> <title>3</title> -<path fill="none" stroke="#9fd856" stroke-width="2" stroke-dasharray="5,2" d="M173.5,-252C173.5,-252 45.5,-252 45.5,-252 39.5,-252 33.5,-246 33.5,-240 33.5,-240 33.5,-228 33.5,-228 33.5,-222 39.5,-216 45.5,-216 45.5,-216 173.5,-216 173.5,-216 179.5,-216 185.5,-222 185.5,-228 185.5,-228 185.5,-240 185.5,-240 185.5,-246 179.5,-252 173.5,-252"/> +<path fill="none" stroke="#88d856" stroke-width="2" 
stroke-dasharray="5,2" d="M173.5,-252C173.5,-252 45.5,-252 45.5,-252 39.5,-252 33.5,-246 33.5,-240 33.5,-240 33.5,-228 33.5,-228 33.5,-222 39.5,-216 45.5,-216 45.5,-216 173.5,-216 173.5,-216 179.5,-216 185.5,-222 185.5,-228 185.5,-228 185.5,-240 185.5,-240 185.5,-246 179.5,-252 173.5,-252"/> <text text-anchor="middle" x="109.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">extract_transcriptome_seqs</text> </g> <!-- 3->2 --> @@ -54,7 +54,7 @@ <!-- 4 --> <g id="node5" class="node"> <title>4</title> -<path fill="none" stroke="#d88556" stroke-width="2" stroke-dasharray="5,2" d="M541,-761C541,-761 402,-761 402,-761 396,-761 390,-755 390,-749 390,-749 390,-737 390,-737 390,-731 396,-725 402,-725 402,-725 541,-725 541,-725 547,-725 553,-731 553,-737 553,-737 553,-749 553,-749 553,-755 547,-761 541,-761"/> +<path fill="none" stroke="#9fd856" stroke-width="2" stroke-dasharray="5,2" d="M541,-761C541,-761 402,-761 402,-761 396,-761 390,-755 390,-749 390,-749 390,-737 390,-737 390,-731 396,-725 402,-725 402,-725 541,-725 541,-725 547,-725 553,-731 553,-737 553,-737 553,-749 553,-749 553,-755 547,-761 541,-761"/> <text text-anchor="middle" x="471.5" y="-746" font-family="sans" font-size="10.00" fill="#000000">genome_process</text> <text text-anchor="middle" x="471.5" y="-735" font-family="sans" font-size="10.00" fill="#000000">organism: homo_sapiens/chrY</text> </g> @@ -67,7 +67,7 @@ <!-- 6 --> <g id="node7" class="node"> <title>6</title> -<path fill="none" stroke="#56c9d8" stroke-width="2" stroke-dasharray="5,2" d="M399,-617C399,-617 232,-617 232,-617 226,-617 220,-611 220,-605 220,-605 220,-593 220,-593 220,-587 226,-581 232,-581 232,-581 399,-581 399,-581 405,-581 411,-587 411,-593 411,-593 411,-605 411,-605 411,-611 405,-617 399,-617"/> +<path fill="none" stroke="#56d89a" stroke-width="2" stroke-dasharray="5,2" d="M399,-617C399,-617 232,-617 232,-617 226,-617 220,-611 220,-605 220,-605 220,-593 220,-593 220,-587 226,-581 232,-581 232,-581 399,-581 399,-581 
405,-581 411,-587 411,-593 411,-593 411,-605 411,-605 411,-611 405,-617 399,-617"/> <text text-anchor="middle" x="315.5" y="-596.5" font-family="sans" font-size="10.00" fill="#000000">generate_segemehl_index_genome</text> </g> <!-- 4->6 --> @@ -79,7 +79,7 @@ <!-- 9 --> <g id="node10" class="node"> <title>9</title> -<path fill="none" stroke="#59d856" stroke-width="2" stroke-dasharray="5,2" d="M486,-545C486,-545 379,-545 379,-545 373,-545 367,-539 367,-533 367,-533 367,-521 367,-521 367,-515 373,-509 379,-509 379,-509 486,-509 486,-509 492,-509 498,-515 498,-521 498,-521 498,-533 498,-533 498,-539 492,-545 486,-545"/> +<path fill="none" stroke="#ced856" stroke-width="2" stroke-dasharray="5,2" d="M486,-545C486,-545 379,-545 379,-545 373,-545 367,-539 367,-533 367,-533 367,-521 367,-521 367,-515 373,-509 379,-509 379,-509 486,-509 486,-509 492,-509 498,-515 498,-521 498,-521 498,-533 498,-533 498,-539 492,-545 486,-545"/> <text text-anchor="middle" x="432.5" y="-524.5" font-family="sans" font-size="10.00" fill="#000000">create_header_genome</text> </g> <!-- 4->9 --> @@ -91,7 +91,7 @@ <!-- 13 --> <g id="node14" class="node"> <title>13</title> -<path fill="none" stroke="#56d86b" stroke-width="2" stroke-dasharray="5,2" d="M529.5,-689C529.5,-689 479.5,-689 479.5,-689 473.5,-689 467.5,-683 467.5,-677 467.5,-677 467.5,-665 467.5,-665 467.5,-659 473.5,-653 479.5,-653 479.5,-653 529.5,-653 529.5,-653 535.5,-653 541.5,-659 541.5,-665 541.5,-665 541.5,-677 541.5,-677 541.5,-683 535.5,-689 529.5,-689"/> +<path fill="none" stroke="#d85656" stroke-width="2" stroke-dasharray="5,2" d="M529.5,-689C529.5,-689 479.5,-689 479.5,-689 473.5,-689 467.5,-683 467.5,-677 467.5,-677 467.5,-665 467.5,-665 467.5,-659 473.5,-653 479.5,-653 479.5,-653 529.5,-653 529.5,-653 535.5,-653 541.5,-659 541.5,-665 541.5,-665 541.5,-677 541.5,-677 541.5,-683 535.5,-689 529.5,-689"/> <text text-anchor="middle" x="504.5" y="-668.5" font-family="sans" font-size="10.00" fill="#000000">mirna_anno</text> </g> <!-- 
4->13 --> @@ -103,7 +103,7 @@ <!-- 14 --> <g id="node15" class="node"> <title>14</title> -<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M603,-689C603,-689 572,-689 572,-689 566,-689 560,-683 560,-677 560,-677 560,-665 560,-665 560,-659 566,-653 572,-653 572,-653 603,-653 603,-653 609,-653 615,-659 615,-665 615,-665 615,-677 615,-677 615,-683 609,-689 603,-689"/> +<path fill="none" stroke="#70d856" stroke-width="2" stroke-dasharray="5,2" d="M603,-689C603,-689 572,-689 572,-689 566,-689 560,-683 560,-677 560,-677 560,-665 560,-665 560,-659 566,-653 572,-653 572,-653 603,-653 603,-653 609,-653 615,-659 615,-665 615,-665 615,-677 615,-677 615,-683 609,-689 603,-689"/> <text text-anchor="middle" x="587.5" y="-668.5" font-family="sans" font-size="10.00" fill="#000000">dict_chr</text> </g> <!-- 4->14 --> @@ -115,7 +115,7 @@ <!-- 21 --> <g id="node22" class="node"> <title>21</title> -<path fill="none" stroke="#d8b456" stroke-width="2" stroke-dasharray="5,2" d="M858.5,-689C858.5,-689 772.5,-689 772.5,-689 766.5,-689 760.5,-683 760.5,-677 760.5,-677 760.5,-665 760.5,-665 760.5,-659 766.5,-653 772.5,-653 772.5,-653 858.5,-653 858.5,-653 864.5,-653 870.5,-659 870.5,-665 870.5,-665 870.5,-677 870.5,-677 870.5,-683 864.5,-689 858.5,-689"/> +<path fill="none" stroke="#d88556" stroke-width="2" stroke-dasharray="5,2" d="M858.5,-689C858.5,-689 772.5,-689 772.5,-689 766.5,-689 760.5,-683 760.5,-677 760.5,-677 760.5,-665 760.5,-665 760.5,-659 766.5,-653 772.5,-653 772.5,-653 858.5,-653 858.5,-653 864.5,-653 870.5,-659 870.5,-665 870.5,-665 870.5,-677 870.5,-677 870.5,-683 864.5,-689 858.5,-689"/> <text text-anchor="middle" x="815.5" y="-668.5" font-family="sans" font-size="10.00" fill="#000000">create_index_fasta</text> </g> <!-- 4->21 --> @@ -127,7 +127,7 @@ <!-- 5 --> <g id="node6" class="node"> <title>5</title> -<path fill="none" stroke="#70d856" stroke-width="2" stroke-dasharray="5,2" d="M283,-326.5C283,-326.5 144,-326.5 144,-326.5 138,-326.5 
132,-320.5 132,-314.5 132,-314.5 132,-302.5 132,-302.5 132,-296.5 138,-290.5 144,-290.5 144,-290.5 283,-290.5 283,-290.5 289,-290.5 295,-296.5 295,-302.5 295,-302.5 295,-314.5 295,-314.5 295,-320.5 289,-326.5 283,-326.5"/> +<path fill="none" stroke="#56b1d8" stroke-width="2" stroke-dasharray="5,2" d="M283,-326.5C283,-326.5 144,-326.5 144,-326.5 138,-326.5 132,-320.5 132,-314.5 132,-314.5 132,-302.5 132,-302.5 132,-296.5 138,-290.5 144,-290.5 144,-290.5 283,-290.5 283,-290.5 289,-290.5 295,-296.5 295,-302.5 295,-302.5 295,-314.5 295,-314.5 295,-320.5 289,-326.5 283,-326.5"/> <text text-anchor="middle" x="213.5" y="-311.5" font-family="sans" font-size="10.00" fill="#000000">filter_anno_gtf</text> <text text-anchor="middle" x="213.5" y="-300.5" font-family="sans" font-size="10.00" fill="#000000">organism: homo_sapiens/chrY</text> </g> @@ -140,7 +140,7 @@ <!-- 8 --> <g id="node9" class="node"> <title>8</title> -<path fill="none" stroke="#56d89a" stroke-width="2" stroke-dasharray="5,2" d="M280.5,-252C280.5,-252 218.5,-252 218.5,-252 212.5,-252 206.5,-246 206.5,-240 206.5,-240 206.5,-228 206.5,-228 206.5,-222 212.5,-216 218.5,-216 218.5,-216 280.5,-216 280.5,-216 286.5,-216 292.5,-222 292.5,-228 292.5,-228 292.5,-240 292.5,-240 292.5,-246 286.5,-252 280.5,-252"/> +<path fill="none" stroke="#d86e56" stroke-width="2" stroke-dasharray="5,2" d="M280.5,-252C280.5,-252 218.5,-252 218.5,-252 212.5,-252 206.5,-246 206.5,-240 206.5,-240 206.5,-228 206.5,-228 206.5,-222 212.5,-216 218.5,-216 218.5,-216 280.5,-216 280.5,-216 286.5,-216 292.5,-222 292.5,-228 292.5,-228 292.5,-240 292.5,-240 292.5,-246 286.5,-252 280.5,-252"/> <text text-anchor="middle" x="249.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">get_exons_gtf</text> </g> <!-- 5->8 --> @@ -158,7 +158,7 @@ <!-- 7 --> <g id="node8" class="node"> <title>7</title> -<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M283.5,-108C283.5,-108 249.5,-108 249.5,-108 243.5,-108 237.5,-102 
237.5,-96 237.5,-96 237.5,-84 237.5,-84 237.5,-78 243.5,-72 249.5,-72 249.5,-72 283.5,-72 283.5,-72 289.5,-72 295.5,-78 295.5,-84 295.5,-84 295.5,-96 295.5,-96 295.5,-102 289.5,-108 283.5,-108"/> +<path fill="none" stroke="#56d8c9" stroke-width="2" stroke-dasharray="5,2" d="M283.5,-108C283.5,-108 249.5,-108 249.5,-108 243.5,-108 237.5,-102 237.5,-96 237.5,-96 237.5,-84 237.5,-84 237.5,-78 243.5,-72 249.5,-72 249.5,-72 283.5,-72 283.5,-72 289.5,-72 295.5,-78 295.5,-84 295.5,-84 295.5,-96 295.5,-96 295.5,-102 289.5,-108 283.5,-108"/> <text text-anchor="middle" x="266.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">gtftobed</text> </g> <!-- 7->0 --> @@ -182,7 +182,7 @@ <!-- 10 --> <g id="node11" class="node"> <title>10</title> -<path fill="none" stroke="#d85656" stroke-width="2" stroke-dasharray="5,2" d="M590,-473C590,-473 557,-473 557,-473 551,-473 545,-467 545,-461 545,-461 545,-449 545,-449 545,-443 551,-437 557,-437 557,-437 590,-437 590,-437 596,-437 602,-443 602,-449 602,-449 602,-461 602,-461 602,-467 596,-473 590,-473"/> +<path fill="none" stroke="#59d856" stroke-width="2" stroke-dasharray="5,2" d="M590,-473C590,-473 557,-473 557,-473 551,-473 545,-467 545,-461 545,-461 545,-449 545,-449 545,-443 551,-437 557,-437 557,-437 590,-437 590,-437 596,-437 602,-443 602,-449 602,-449 602,-461 602,-461 602,-467 596,-473 590,-473"/> <text text-anchor="middle" x="573.5" y="-452.5" font-family="sans" font-size="10.00" fill="#000000">gfftobed</text> </g> <!-- 10->0 --> @@ -194,7 +194,7 @@ <!-- 19 --> <g id="node20" class="node"> <title>19</title> -<path fill="none" stroke="#56b1d8" stroke-width="2" stroke-dasharray="5,2" d="M864,-401C864,-401 781,-401 781,-401 775,-401 769,-395 769,-389 769,-389 769,-377 769,-377 769,-371 775,-365 781,-365 781,-365 864,-365 864,-365 870,-365 876,-371 876,-377 876,-377 876,-389 876,-389 876,-395 870,-401 864,-401"/> +<path fill="none" stroke="#b6d856" stroke-width="2" stroke-dasharray="5,2" d="M864,-401C864,-401 781,-401 
781,-401 775,-401 769,-395 769,-389 769,-389 769,-377 769,-377 769,-371 775,-365 781,-365 781,-365 864,-365 864,-365 870,-365 876,-371 876,-377 876,-377 876,-389 876,-389 876,-395 870,-401 864,-401"/> <text text-anchor="middle" x="822.5" y="-380.5" font-family="sans" font-size="10.00" fill="#000000">filter_mature_mirs</text> </g> <!-- 10->19 --> @@ -206,7 +206,7 @@ <!-- 11 --> <g id="node12" class="node"> <title>11</title> -<path fill="none" stroke="#569ad8" stroke-width="2" stroke-dasharray="5,2" d="M612,-545C612,-545 535,-545 535,-545 529,-545 523,-539 523,-533 523,-533 523,-521 523,-521 523,-515 529,-509 535,-509 535,-509 612,-509 612,-509 618,-509 624,-515 624,-521 624,-521 624,-533 624,-533 624,-539 618,-545 612,-545"/> +<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M612,-545C612,-545 535,-545 535,-545 529,-545 523,-539 523,-533 523,-533 523,-521 523,-521 523,-515 529,-509 535,-509 535,-509 612,-509 612,-509 618,-509 624,-515 624,-521 624,-521 624,-533 624,-533 624,-539 618,-545 612,-545"/> <text text-anchor="middle" x="573.5" y="-524.5" font-family="sans" font-size="10.00" fill="#000000">filter_mir_1_anno</text> </g> <!-- 11->10 --> @@ -218,7 +218,7 @@ <!-- 12 --> <g id="node13" class="node"> <title>12</title> -<path fill="none" stroke="#ced856" stroke-width="2" stroke-dasharray="5,2" d="M610,-617C610,-617 537,-617 537,-617 531,-617 525,-611 525,-605 525,-605 525,-593 525,-593 525,-587 531,-581 537,-581 537,-581 610,-581 610,-581 616,-581 622,-587 622,-593 622,-593 622,-605 622,-605 622,-611 616,-617 610,-617"/> +<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M610,-617C610,-617 537,-617 537,-617 531,-617 525,-611 525,-605 525,-605 525,-593 525,-593 525,-587 531,-581 537,-581 537,-581 610,-581 610,-581 616,-581 622,-587 622,-593 622,-593 622,-605 622,-605 622,-611 616,-617 610,-617"/> <text text-anchor="middle" x="573.5" y="-596.5" font-family="sans" font-size="10.00" 
fill="#000000">map_chr_names</text> </g> <!-- 12->11 --> @@ -242,7 +242,7 @@ <!-- 15 --> <g id="node16" class="node"> <title>15</title> -<path fill="none" stroke="#5682d8" stroke-width="2" stroke-dasharray="5,2" d="M747.5,-108C747.5,-108 685.5,-108 685.5,-108 679.5,-108 673.5,-102 673.5,-96 673.5,-96 673.5,-84 673.5,-84 673.5,-78 679.5,-72 685.5,-72 685.5,-72 747.5,-72 747.5,-72 753.5,-72 759.5,-78 759.5,-84 759.5,-84 759.5,-96 759.5,-96 759.5,-102 753.5,-108 747.5,-108"/> +<path fill="none" stroke="#56c9d8" stroke-width="2" stroke-dasharray="5,2" d="M747.5,-108C747.5,-108 685.5,-108 685.5,-108 679.5,-108 673.5,-102 673.5,-96 673.5,-96 673.5,-84 673.5,-84 673.5,-78 679.5,-72 685.5,-72 685.5,-72 747.5,-72 747.5,-72 753.5,-72 759.5,-78 759.5,-84 759.5,-84 759.5,-96 759.5,-96 759.5,-102 753.5,-108 747.5,-108"/> <text text-anchor="middle" x="716.5" y="-87.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_final</text> </g> <!-- 15->0 --> @@ -254,7 +254,7 @@ <!-- 16 --> <g id="node17" class="node"> <title>16</title> -<path fill="none" stroke="#88d856" stroke-width="2" stroke-dasharray="5,2" d="M939.5,-180C939.5,-180 865.5,-180 865.5,-180 859.5,-180 853.5,-174 853.5,-168 853.5,-168 853.5,-156 853.5,-156 853.5,-150 859.5,-144 865.5,-144 865.5,-144 939.5,-144 939.5,-144 945.5,-144 951.5,-150 951.5,-156 951.5,-156 951.5,-168 951.5,-168 951.5,-174 945.5,-180 939.5,-180"/> +<path fill="none" stroke="#5682d8" stroke-width="2" stroke-dasharray="5,2" d="M939.5,-180C939.5,-180 865.5,-180 865.5,-180 859.5,-180 853.5,-174 853.5,-168 853.5,-168 853.5,-156 853.5,-156 853.5,-150 859.5,-144 865.5,-144 865.5,-144 939.5,-144 939.5,-144 945.5,-144 951.5,-150 951.5,-156 951.5,-156 951.5,-168 951.5,-168 951.5,-174 945.5,-180 939.5,-180"/> <text text-anchor="middle" x="902.5" y="-159.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_concat</text> </g> <!-- 16->15 --> @@ -266,7 +266,7 @@ <!-- 17 --> <g id="node18" class="node"> <title>17</title> -<path fill="none" 
stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M1245,-252C1245,-252 1166,-252 1166,-252 1160,-252 1154,-246 1154,-240 1154,-240 1154,-228 1154,-228 1154,-222 1160,-216 1166,-216 1166,-216 1245,-216 1245,-216 1251,-216 1257,-222 1257,-228 1257,-228 1257,-240 1257,-240 1257,-246 1251,-252 1245,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M1245,-252C1245,-252 1166,-252 1166,-252 1160,-252 1154,-246 1154,-240 1154,-240 1154,-228 1154,-228 1154,-222 1160,-216 1166,-216 1166,-216 1245,-216 1245,-216 1251,-216 1257,-222 1257,-228 1257,-228 1257,-240 1257,-240 1257,-246 1251,-252 1245,-252"/> <text text-anchor="middle" x="1205.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 17->16 --> @@ -278,7 +278,7 @@ <!-- 18 --> <g id="node19" class="node"> <title>18</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M1097.5,-329C1097.5,-329 1059.5,-329 1059.5,-329 1053.5,-329 1047.5,-323 1047.5,-317 1047.5,-317 1047.5,-300 1047.5,-300 1047.5,-294 1053.5,-288 1059.5,-288 1059.5,-288 1097.5,-288 1097.5,-288 1103.5,-288 1109.5,-294 1109.5,-300 1109.5,-300 1109.5,-317 1109.5,-317 1109.5,-323 1103.5,-329 1097.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M1097.5,-329C1097.5,-329 1059.5,-329 1059.5,-329 1053.5,-329 1047.5,-323 1047.5,-317 1047.5,-317 1047.5,-300 1047.5,-300 1047.5,-294 1053.5,-288 1059.5,-288 1059.5,-288 1097.5,-288 1097.5,-288 1103.5,-288 1109.5,-294 1109.5,-300 1109.5,-300 1109.5,-317 1109.5,-317 1109.5,-323 1103.5,-329 1097.5,-329"/> <text text-anchor="middle" x="1078.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="1078.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: -1</text> <text text-anchor="middle" x="1078.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: -1</text> @@ -298,7 
+298,7 @@ <!-- 23 --> <g id="node24" class="node"> <title>23</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M1177.5,-329C1177.5,-329 1139.5,-329 1139.5,-329 1133.5,-329 1127.5,-323 1127.5,-317 1127.5,-317 1127.5,-300 1127.5,-300 1127.5,-294 1133.5,-288 1139.5,-288 1139.5,-288 1177.5,-288 1177.5,-288 1183.5,-288 1189.5,-294 1189.5,-300 1189.5,-300 1189.5,-317 1189.5,-317 1189.5,-323 1183.5,-329 1177.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M1177.5,-329C1177.5,-329 1139.5,-329 1139.5,-329 1133.5,-329 1127.5,-323 1127.5,-317 1127.5,-317 1127.5,-300 1127.5,-300 1127.5,-294 1133.5,-288 1139.5,-288 1139.5,-288 1177.5,-288 1177.5,-288 1183.5,-288 1189.5,-294 1189.5,-300 1189.5,-300 1189.5,-317 1189.5,-317 1189.5,-323 1183.5,-329 1177.5,-329"/> <text text-anchor="middle" x="1158.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="1158.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: -1</text> <text text-anchor="middle" x="1158.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 0</text> @@ -312,7 +312,7 @@ <!-- 25 --> <g id="node26" class="node"> <title>25</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M1257.5,-329C1257.5,-329 1219.5,-329 1219.5,-329 1213.5,-329 1207.5,-323 1207.5,-317 1207.5,-317 1207.5,-300 1207.5,-300 1207.5,-294 1213.5,-288 1219.5,-288 1219.5,-288 1257.5,-288 1257.5,-288 1263.5,-288 1269.5,-294 1269.5,-300 1269.5,-300 1269.5,-317 1269.5,-317 1269.5,-323 1263.5,-329 1257.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M1257.5,-329C1257.5,-329 1219.5,-329 1219.5,-329 1213.5,-329 1207.5,-323 1207.5,-317 1207.5,-317 1207.5,-300 1207.5,-300 1207.5,-294 1213.5,-288 1219.5,-288 1219.5,-288 1257.5,-288 1257.5,-288 1263.5,-288 1269.5,-294 1269.5,-300 1269.5,-300 1269.5,-317 1269.5,-317 1269.5,-323 
1263.5,-329 1257.5,-329"/> <text text-anchor="middle" x="1238.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="1238.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: -1</text> <text text-anchor="middle" x="1238.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 1</text> @@ -326,7 +326,7 @@ <!-- 27 --> <g id="node28" class="node"> <title>27</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M625.5,-329C625.5,-329 587.5,-329 587.5,-329 581.5,-329 575.5,-323 575.5,-317 575.5,-317 575.5,-300 575.5,-300 575.5,-294 581.5,-288 587.5,-288 587.5,-288 625.5,-288 625.5,-288 631.5,-288 637.5,-294 637.5,-300 637.5,-300 637.5,-317 637.5,-317 637.5,-323 631.5,-329 625.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M625.5,-329C625.5,-329 587.5,-329 587.5,-329 581.5,-329 575.5,-323 575.5,-317 575.5,-317 575.5,-300 575.5,-300 575.5,-294 581.5,-288 587.5,-288 587.5,-288 625.5,-288 625.5,-288 631.5,-288 637.5,-294 637.5,-300 637.5,-300 637.5,-317 637.5,-317 637.5,-323 631.5,-329 625.5,-329"/> <text text-anchor="middle" x="606.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="606.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 0</text> <text text-anchor="middle" x="606.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: -1</text> @@ -340,7 +340,7 @@ <!-- 29 --> <g id="node30" class="node"> <title>29</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M703.5,-329C703.5,-329 667.5,-329 667.5,-329 661.5,-329 655.5,-323 655.5,-317 655.5,-317 655.5,-300 655.5,-300 655.5,-294 661.5,-288 667.5,-288 667.5,-288 703.5,-288 703.5,-288 709.5,-288 715.5,-294 715.5,-300 715.5,-300 715.5,-317 715.5,-317 715.5,-323 709.5,-329 703.5,-329"/> +<path fill="none" stroke="#d89c56" 
stroke-width="2" stroke-dasharray="5,2" d="M703.5,-329C703.5,-329 667.5,-329 667.5,-329 661.5,-329 655.5,-323 655.5,-317 655.5,-317 655.5,-300 655.5,-300 655.5,-294 661.5,-288 667.5,-288 667.5,-288 703.5,-288 703.5,-288 709.5,-288 715.5,-294 715.5,-300 715.5,-300 715.5,-317 715.5,-317 715.5,-323 709.5,-329 703.5,-329"/> <text text-anchor="middle" x="685.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="685.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 0</text> <text text-anchor="middle" x="685.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 0</text> @@ -354,7 +354,7 @@ <!-- 31 --> <g id="node32" class="node"> <title>31</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M781.5,-329C781.5,-329 745.5,-329 745.5,-329 739.5,-329 733.5,-323 733.5,-317 733.5,-317 733.5,-300 733.5,-300 733.5,-294 739.5,-288 745.5,-288 745.5,-288 781.5,-288 781.5,-288 787.5,-288 793.5,-294 793.5,-300 793.5,-300 793.5,-317 793.5,-317 793.5,-323 787.5,-329 781.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M781.5,-329C781.5,-329 745.5,-329 745.5,-329 739.5,-329 733.5,-323 733.5,-317 733.5,-317 733.5,-300 733.5,-300 733.5,-294 739.5,-288 745.5,-288 745.5,-288 781.5,-288 781.5,-288 787.5,-288 793.5,-294 793.5,-300 793.5,-300 793.5,-317 793.5,-317 793.5,-323 787.5,-329 781.5,-329"/> <text text-anchor="middle" x="763.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="763.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 0</text> <text text-anchor="middle" x="763.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 1</text> @@ -368,7 +368,7 @@ <!-- 33 --> <g id="node34" class="node"> <title>33</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M861.5,-329C861.5,-329 823.5,-329 
823.5,-329 817.5,-329 811.5,-323 811.5,-317 811.5,-317 811.5,-300 811.5,-300 811.5,-294 817.5,-288 823.5,-288 823.5,-288 861.5,-288 861.5,-288 867.5,-288 873.5,-294 873.5,-300 873.5,-300 873.5,-317 873.5,-317 873.5,-323 867.5,-329 861.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M861.5,-329C861.5,-329 823.5,-329 823.5,-329 817.5,-329 811.5,-323 811.5,-317 811.5,-317 811.5,-300 811.5,-300 811.5,-294 817.5,-288 823.5,-288 823.5,-288 861.5,-288 861.5,-288 867.5,-288 873.5,-294 873.5,-300 873.5,-300 873.5,-317 873.5,-317 873.5,-323 867.5,-329 861.5,-329"/> <text text-anchor="middle" x="842.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="842.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 1</text> <text text-anchor="middle" x="842.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: -1</text> @@ -382,7 +382,7 @@ <!-- 35 --> <g id="node36" class="node"> <title>35</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M939.5,-329C939.5,-329 903.5,-329 903.5,-329 897.5,-329 891.5,-323 891.5,-317 891.5,-317 891.5,-300 891.5,-300 891.5,-294 897.5,-288 903.5,-288 903.5,-288 939.5,-288 939.5,-288 945.5,-288 951.5,-294 951.5,-300 951.5,-300 951.5,-317 951.5,-317 951.5,-323 945.5,-329 939.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M939.5,-329C939.5,-329 903.5,-329 903.5,-329 897.5,-329 891.5,-323 891.5,-317 891.5,-317 891.5,-300 891.5,-300 891.5,-294 897.5,-288 903.5,-288 903.5,-288 939.5,-288 939.5,-288 945.5,-288 951.5,-294 951.5,-300 951.5,-300 951.5,-317 951.5,-317 951.5,-323 945.5,-329 939.5,-329"/> <text text-anchor="middle" x="921.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="921.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 1</text> <text text-anchor="middle" 
x="921.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 0</text> @@ -396,7 +396,7 @@ <!-- 37 --> <g id="node38" class="node"> <title>37</title> -<path fill="none" stroke="#56d8b1" stroke-width="2" stroke-dasharray="5,2" d="M1017.5,-329C1017.5,-329 981.5,-329 981.5,-329 975.5,-329 969.5,-323 969.5,-317 969.5,-317 969.5,-300 969.5,-300 969.5,-294 975.5,-288 981.5,-288 981.5,-288 1017.5,-288 1017.5,-288 1023.5,-288 1029.5,-294 1029.5,-300 1029.5,-300 1029.5,-317 1029.5,-317 1029.5,-323 1023.5,-329 1017.5,-329"/> +<path fill="none" stroke="#d89c56" stroke-width="2" stroke-dasharray="5,2" d="M1017.5,-329C1017.5,-329 981.5,-329 981.5,-329 975.5,-329 969.5,-323 969.5,-317 969.5,-317 969.5,-300 969.5,-300 969.5,-294 975.5,-288 981.5,-288 981.5,-288 1017.5,-288 1017.5,-288 1023.5,-288 1029.5,-294 1029.5,-300 1029.5,-300 1029.5,-317 1029.5,-317 1029.5,-323 1023.5,-329 1017.5,-329"/> <text text-anchor="middle" x="999.5" y="-317" font-family="sans" font-size="10.00" fill="#000000">iso_anno</text> <text text-anchor="middle" x="999.5" y="-306" font-family="sans" font-size="10.00" fill="#000000">bp_3p: 1</text> <text text-anchor="middle" x="999.5" y="-295" font-family="sans" font-size="10.00" fill="#000000">bp_5p: 1</text> @@ -410,7 +410,7 @@ <!-- 20 --> <g id="node21" class="node"> <title>20</title> -<path fill="none" stroke="#56d8c9" stroke-width="2" stroke-dasharray="5,2" d="M974.5,-401C974.5,-401 906.5,-401 906.5,-401 900.5,-401 894.5,-395 894.5,-389 894.5,-389 894.5,-377 894.5,-377 894.5,-371 900.5,-365 906.5,-365 906.5,-365 974.5,-365 974.5,-365 980.5,-365 986.5,-371 986.5,-377 986.5,-377 986.5,-389 986.5,-389 986.5,-395 980.5,-401 974.5,-401"/> +<path fill="none" stroke="#56d86b" stroke-width="2" stroke-dasharray="5,2" d="M974.5,-401C974.5,-401 906.5,-401 906.5,-401 900.5,-401 894.5,-395 894.5,-389 894.5,-389 894.5,-377 894.5,-377 894.5,-371 900.5,-365 906.5,-365 906.5,-365 974.5,-365 974.5,-365 980.5,-365 986.5,-371 986.5,-377 986.5,-377 986.5,-389 
986.5,-389 986.5,-395 980.5,-401 974.5,-401"/> <text text-anchor="middle" x="940.5" y="-380.5" font-family="sans" font-size="10.00" fill="#000000">extract_chr_len</text> </g> <!-- 20->18 --> @@ -476,7 +476,7 @@ <!-- 22 --> <g id="node23" class="node"> <title>22</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M1366,-252C1366,-252 1287,-252 1287,-252 1281,-252 1275,-246 1275,-240 1275,-240 1275,-228 1275,-228 1275,-222 1281,-216 1287,-216 1287,-216 1366,-216 1366,-216 1372,-216 1378,-222 1378,-228 1378,-228 1378,-240 1378,-240 1378,-246 1372,-252 1366,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M1366,-252C1366,-252 1287,-252 1287,-252 1281,-252 1275,-246 1275,-240 1275,-240 1275,-228 1275,-228 1275,-222 1281,-216 1287,-216 1287,-216 1366,-216 1366,-216 1372,-216 1378,-222 1378,-228 1378,-228 1378,-240 1378,-240 1378,-246 1372,-252 1366,-252"/> <text text-anchor="middle" x="1326.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 22->16 --> @@ -494,7 +494,7 @@ <!-- 24 --> <g id="node25" class="node"> <title>24</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M1487,-252C1487,-252 1408,-252 1408,-252 1402,-252 1396,-246 1396,-240 1396,-240 1396,-228 1396,-228 1396,-222 1402,-216 1408,-216 1408,-216 1487,-216 1487,-216 1493,-216 1499,-222 1499,-228 1499,-228 1499,-240 1499,-240 1499,-246 1493,-252 1487,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M1487,-252C1487,-252 1408,-252 1408,-252 1402,-252 1396,-246 1396,-240 1396,-240 1396,-228 1396,-228 1396,-222 1402,-216 1408,-216 1408,-216 1487,-216 1487,-216 1493,-216 1499,-222 1499,-228 1499,-228 1499,-240 1499,-240 1499,-246 1493,-252 1487,-252"/> <text text-anchor="middle" x="1447.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 24->16 --> @@ -512,7 +512,7 @@ <!-- 26 
--> <g id="node27" class="node"> <title>26</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M519,-252C519,-252 440,-252 440,-252 434,-252 428,-246 428,-240 428,-240 428,-228 428,-228 428,-222 434,-216 440,-216 440,-216 519,-216 519,-216 525,-216 531,-222 531,-228 531,-228 531,-240 531,-240 531,-246 525,-252 519,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M519,-252C519,-252 440,-252 440,-252 434,-252 428,-246 428,-240 428,-240 428,-228 428,-228 428,-222 434,-216 440,-216 440,-216 519,-216 519,-216 525,-216 531,-222 531,-228 531,-228 531,-240 531,-240 531,-246 525,-252 519,-252"/> <text text-anchor="middle" x="479.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 26->16 --> @@ -530,7 +530,7 @@ <!-- 28 --> <g id="node29" class="node"> <title>28</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M640,-252C640,-252 561,-252 561,-252 555,-252 549,-246 549,-240 549,-240 549,-228 549,-228 549,-222 555,-216 561,-216 561,-216 640,-216 640,-216 646,-216 652,-222 652,-228 652,-228 652,-240 652,-240 652,-246 646,-252 640,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M640,-252C640,-252 561,-252 561,-252 555,-252 549,-246 549,-240 549,-240 549,-228 549,-228 549,-222 555,-216 561,-216 561,-216 640,-216 640,-216 646,-216 652,-222 652,-228 652,-228 652,-240 652,-240 652,-246 646,-252 640,-252"/> <text text-anchor="middle" x="600.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 28->16 --> @@ -548,7 +548,7 @@ <!-- 30 --> <g id="node31" class="node"> <title>30</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M761,-252C761,-252 682,-252 682,-252 676,-252 670,-246 670,-240 670,-240 670,-228 670,-228 670,-222 676,-216 682,-216 682,-216 761,-216 761,-216 767,-216 773,-222 773,-228 773,-228 773,-240 
773,-240 773,-246 767,-252 761,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M761,-252C761,-252 682,-252 682,-252 676,-252 670,-246 670,-240 670,-240 670,-228 670,-228 670,-222 676,-216 682,-216 682,-216 761,-216 761,-216 767,-216 773,-222 773,-228 773,-228 773,-240 773,-240 773,-246 767,-252 761,-252"/> <text text-anchor="middle" x="721.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 30->16 --> @@ -566,7 +566,7 @@ <!-- 32 --> <g id="node33" class="node"> <title>32</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M882,-252C882,-252 803,-252 803,-252 797,-252 791,-246 791,-240 791,-240 791,-228 791,-228 791,-222 797,-216 803,-216 803,-216 882,-216 882,-216 888,-216 894,-222 894,-228 894,-228 894,-240 894,-240 894,-246 888,-252 882,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M882,-252C882,-252 803,-252 803,-252 797,-252 791,-246 791,-240 791,-240 791,-228 791,-228 791,-222 797,-216 803,-216 803,-216 882,-216 882,-216 888,-216 894,-222 894,-228 894,-228 894,-240 894,-240 894,-246 888,-252 882,-252"/> <text text-anchor="middle" x="842.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 32->16 --> @@ -584,7 +584,7 @@ <!-- 34 --> <g id="node35" class="node"> <title>34</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M1003,-252C1003,-252 924,-252 924,-252 918,-252 912,-246 912,-240 912,-240 912,-228 912,-228 912,-222 918,-216 924,-216 924,-216 1003,-216 1003,-216 1009,-216 1015,-222 1015,-228 1015,-228 1015,-240 1015,-240 1015,-246 1009,-252 1003,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M1003,-252C1003,-252 924,-252 924,-252 918,-252 912,-246 912,-240 912,-240 912,-228 912,-228 912,-222 918,-216 924,-216 924,-216 1003,-216 1003,-216 1009,-216 1015,-222 1015,-228 1015,-228 
1015,-240 1015,-240 1015,-246 1009,-252 1003,-252"/> <text text-anchor="middle" x="963.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 34->16 --> @@ -602,7 +602,7 @@ <!-- 36 --> <g id="node37" class="node"> <title>36</title> -<path fill="none" stroke="#56d882" stroke-width="2" stroke-dasharray="5,2" d="M1124,-252C1124,-252 1045,-252 1045,-252 1039,-252 1033,-246 1033,-240 1033,-240 1033,-228 1033,-228 1033,-222 1039,-216 1045,-216 1045,-216 1124,-216 1124,-216 1130,-216 1136,-222 1136,-228 1136,-228 1136,-240 1136,-240 1136,-246 1130,-252 1124,-252"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" stroke-dasharray="5,2" d="M1124,-252C1124,-252 1045,-252 1045,-252 1039,-252 1033,-246 1033,-240 1033,-240 1033,-228 1033,-228 1033,-222 1039,-216 1045,-216 1045,-216 1124,-216 1124,-216 1130,-216 1136,-222 1136,-228 1136,-228 1136,-240 1136,-240 1136,-246 1130,-252 1124,-252"/> <text text-anchor="middle" x="1084.5" y="-231.5" font-family="sans" font-size="10.00" fill="#000000">iso_anno_rename</text> </g> <!-- 36->16 --> diff --git a/scripts/blocksort.sh b/scripts/blocksort.sh new file mode 100755 index 0000000..38e0e8c --- /dev/null +++ b/scripts/blocksort.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# define function to print usage +display_usage() { + echo "Sort oligomap aligments based on their numerical names." 
+ echo "" + echo "Usage: $0 input_file number_of_threads output_file" + echo "" + echo "Args:" + echo " input_file: oligomap aligments" + echo " number_of_threads: number of threads to run the sorting with" + echo " output_file: path to sorted output file" +} + +# show usage if user has supplied -h or --help (checked first so that a lone help flag exits 0) +if [[ "$1" == "--help" || "$1" == "-h" ]] +then + display_usage + exit 0 +fi + +# show usage and fail unless exactly three arguments were supplied +if [ $# -ne 3 ] +then + display_usage + exit 1 +fi + +cat "$1" | awk -v RS="" '{ gsub("\n", "*"); print }' | sort -n --parallel="$2" | awk -v ORS="\n\n" '{ gsub("\*", "\n"); print }' > "$3" diff --git a/scripts/nh_filter.py b/scripts/nh_filter.py new file mode 100755 index 0000000..dc5876a --- /dev/null +++ b/scripts/nh_filter.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +import sys +import argparse +import getopt +import pysam +import os + +if len(sys.argv) > 1 and sys.argv[1] in ['--help', '-h', '-help']: + sys.exit("\nDescription: Checks for NH tag to remove reads that aligned more than max_NH value.\nUsage: nh_filter.py [SAM file] [max_NH] [OUTPUT file]\n") +elif len(sys.argv) != 4: + sys.exit("\n Arguments ERROR. See [nh_filter.py --help]\n") + +def main(): + + sys.stdout.write("Removing reads aligned more than %s times... 
\n"%(sys.argv[2])) + + infile = pysam.Samfile(sys.argv[1], "r", check_sq=False) + out = pysam.Samfile(sys.argv[3] , "w", template = infile ) + + keep = True + + for DNAread in infile.fetch(): + intags = DNAread.tags + + for entry in intags: + if 'NH' in entry and entry[1] > int(sys.argv[2]): + keep = False + if keep: + out.write(DNAread) + + keep = True + + out.close() + sys.stdout.write("DONE!\n") + + +if __name__ == '__main__': + main() diff --git a/scripts/oligomapOutputToSam_nhfiltered.py b/scripts/oligomapOutputToSam_nhfiltered.py new file mode 100755 index 0000000..38df296 --- /dev/null +++ b/scripts/oligomapOutputToSam_nhfiltered.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python +import sys +sys.path.append("/scicore/home/zavolan/clipz/newClipz6/lib/python") + +################################################# +# Transforms oligomap output to SAM format and +# keeps only best alignments. +# +# input: +# - the output of Oligomap +# - nh filter value +# output: +# - stdout: a SAM file with the best +# alignments for each read *(e.g. all 3 +# alignments with 0 errors, or all alignments +# with 1 error for each read +# +# Paula Iborra. Zavolan Lab. +# Adated version of Alessandro Crippa script. +################################################# + + +import sys +import re +from argparse import ArgumentParser, RawTextHelpFormatter + + +### ARGUMENTS ### PI, modified June 2019. + +parser = ArgumentParser( + description="Oligomap output to SAM. NH filter applicable." + ) +parser.add_argument( + '-v','--version', + action='version', + version='%(prog)s 1.0', + help="Show program's version number and exit" + ) +parser.add_argument( + '-n','--nhfilter', + help="Add NH tag to output, remove reads that contain more aligments than given NH value (with min error).", + type=int, + ) +parser.add_argument( + '-i', '--input', + help="Input File. 
The output of oligomap mapping.", + required=True, + ) + +args = parser.parse_args() + + +readSeqs = {} # dictionary of reads +filt = {} # dict with all filtered reads (heavy multimappers NH > 100) by error. {'seqName':error} (error with which it has been discarted) +seqToMinError = {} +nh = {} # nh dictionary per read evaluated + + +def addReadToList(d,nh,minerr,seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString): + + if len(d) == 0: # if empty dictionary + if seqName not in list(minerr.keys()): # means: seqName NOT found previously + d[seqName] = [] + minerr[seqName] = errors + nh[seqName] = 1 + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + else: # means: seqName found previously and filtered by NH ---- empty dicty due to d.clear() + if errors < minerr[seqName]: # Aligment found with lower error than the ones stored in minerr dict + d[seqName] = [] + minerr[seqName] = errors + nh[seqName] = 1 + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + + else: # if dictionary not empty + if seqName == list(d.keys())[0]: #same seqName as the one stored in dict found + # check the errors fo this new seqName to include it or not + if errors == minerr[seqName]: # if seqName errors is equal to the one stored in minerr -> keep + nh[seqName] += 1 # increase nh +1 + if args.nhfilter: # if NH filter + if nh[seqName] <= (args.nhfilter): + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + else: # if after adding +1 to nh, the total is > than the filter value. seqName discarded. 
+ # Clear d dictonary + d.clear() + sys.stderr.write("Filtered by NH | Read %s | Errors = %s \n"%(seqName,errors)) + + else: # no NH filtering, keep all reads including heavy multimappers + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + + elif errors < minerr[seqName]: # error minor to the one previously stored. + sys.stderr.write("Filtered by ERROR | Read %s | Errors = %s \n"%(seqName,minerr[seqName])) + minerr[seqName] = min(errors, minerr[seqName]) # Update minor error. + d[seqName] = [] # Create empy list for seqName (removed aligmants previously stored with higher error). + nh[seqName] = 1 # NH starts at 1 + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + else: # if seqName errors is > than the error stored, discard + pass + + elif seqName != list(d.keys())[0]: #new seqName found + # PRINT to the output file the previous seqName stored in dict + for i in d.keys(): + sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, minerr[i], nh[i])) + for al in d[i]: + nhtag= ('NH:i:'+str(nh[i])) + al.append(nhtag) + print('\t'.join([str(x) for x in al])) + + # Clean dictonaries before saving the current seqName being evaluated + d.clear() + nh.clear() + minerr.clear() + + # Restart the dictionaries with the new seqName + d[seqName] = [] + minerr[seqName] = errors + nh[seqName] = 1 + d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) + + + +def readInput(fi): + i = 0 + #read oligomap results + while True: + line1 = fi.readline() #e.g. seq_61 (21 nc) 21..1 chr1 1..21 + if not line1: break #[check if file is empty] + line2 = fi.readline() #e.g. chr1 + if not line2: break #[pointless check] + line3 = fi.readline() #e.g. errors: 1 orientation: - + if not line3: break #[pointless check] + line4 = fi.readline() #e.g. 
CAAGCAGAAGACGGCATACGC ->input sequence + if not line4: break #[pointless check] + line5 = fi.readline() #e.g. |||||||||||||||||||| + if not line5: break #[pointless check] + line6 = fi.readline() #e.g. CAAGCAGAAGACGGCATACGA ->target sequence + if not line6: break #[pointless check] + line7 = fi.readline() #e.g. "" + i += 1 + #we got a result + if line1[0].isdigit(): #out readNames start with a number + #parse lines + line1 = re.sub( '\s+', ' ', line1).strip().split() + line3 = re.sub( '\s+', ' ', line3).strip().split() + + seqName = line1[0].strip() + target = line2.strip() + positionInTarget = line1[5].split('.')[0].strip() + #position2InTarget= line1[5].split('.')[2].strip() + errors = line3[1].strip() + strand = line3[3].strip() + sequence = line4.strip() + referenceSeq = line6.strip() + mapq='255' + rnext = '*' + pnext= '0' + tlen='0' + qual='*' + + #flag + if strand == "+": + flag = "0" + else: + flag = "16" + #define cigar string + if errors == '0': + #perfect match, cigar string is just the number of nucleotides + "M" + cigarStr = str(len(sequence)) + "M" + matchingString = str(len(sequence)) + editDistance = "NM:i:0" + else: + #cigar and mismatch strings are built here + if line5[0] == " ": #if the first nucleotide is mutated + cigarStr= "1M" + elif line5[-2] == " ": #if the last nucleotide is mutated + cigarStr= str(len(line5)-2) +"M" + else: + cigarStr= str(line5.strip().index(' ')) + "M" #if any other nucleotide is mutated + editDistance = "NM:i:1" + #depending on where in the read a deletion,insertion or mutation occurs, we create ad hoc mismatch strings. 
+ #case 1: deletion in the seq + if '-' in sequence: + indelerr = "1D" + if line5[0] == " ": #the 1st nt is deleted + cigarStr = indelerr + str(len(sequence)-1) + "M" + matchingString = "^" + referenceSeq[0] + str(len(sequence)-1) + elif line5[-2] == " ": #the last nt is deleted ([-2] because [-1] is "\n") + cigarStr = str(len(sequence)-1) + "M" + indelerr + matchingString = str(len(sequence)-1) + "^" + referenceSeq[len(sequence)-1] + "0" #"0" is required if the last entry is ^[something] + else: #deletion occurs "in" the read + tmp = cigarStr + cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" + matchingString = str(line5.strip().index(' ')) + "^" + referenceSeq[int(tmp[:-1])] + str(len(sequence) - line5.strip().index(' ') -1) + #case 2: insertion in the seq + elif '-' in referenceSeq: + indelerr = "1I" + matchingString = str(len(sequence)) + if line5[0] == " ": #the 1st nt is inserted + cigarStr = indelerr + str(len(sequence)-1) + "M" + elif line5[-2] == " ": #the last nt is inserted + cigarStr = str(len(sequence)-1) + "M" + indelerr + else: #addition occurs "in" the read + tmp = cigarStr + cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" + #case 3: single point mutation + else: + cigarStr = str(len(sequence)) + "M" + if line5[0] == " ": #the 1st nt is inserted + matchingString = referenceSeq[0] + str(len(sequence)-1) + elif line5[-2] == " ": #the last nt is inserted + matchingString = str(len(sequence)-1) + referenceSeq[len(sequence)-1] + else: #mutation occurs "in" the read + matchingString = str(line5.strip().index(' ')) + referenceSeq[line5.strip().index(' ')] + str(len(sequence) - line5.strip().index(' ') -1) + #indelerr = referenceSeq[int(cigarStr)] #this info is not used in the cigar string + sequence = re.sub( '-', '', sequence).strip() + + matchingString = ('MD:Z:'+matchingString) + #addReadToList(readSeqs, readFilt, readNh, 
seqName,flag,target,positionInTarget,errors,cigarStr,sequence,editDistance,matchingString) + sys.stderr.write("Record: %i | Sequence: %s \n"%(i,seqName)) + addReadToList(readSeqs, nh, seqToMinError, seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString) + + +fi = open(args.input, "r") #read file +sys.stderr.write("###########################\nSTART READING...\n###########################\n") +readInput(fi) #process +fi.close() + +#print last aligments stored in dict +if len(readSeqs) != 0: + for i in readSeqs.keys(): + sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, seqToMinError[i], nh[i])) + for al in readSeqs[i]: + nhtag= ('NH:i:'+str(nh[i])) + al.append(nhtag) + print('\t'.join([str(x) for x in al])) + +sys.stderr.write("SUCCESSFULLY FINISHED.") +sys.exit() \ No newline at end of file diff --git a/scripts/sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl b/scripts/sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl new file mode 100755 index 0000000..ed04165 --- /dev/null +++ b/scripts/sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl @@ -0,0 +1,311 @@ +#!/usr/bin/env perl +use lib "/scicore/home/zavolan/clipz/newClipz7/lib/perl"; +#==================# +# HEADER START # +#==================# +### Name: sam_remove_duplicates_inferior_alignments_multimappers.pl +### Created: Aug 29, 2013 +### Author: Alexander Kanitz +### Company: Zavolan Group, Biozentrum, University of Basel +### Requirements: GetOpt::Long +#==================# +# HEADER END # +#==================# + + +#==========================# +# PRE-REQUISITES START # +#==========================# +#---> PRAGMAS / PACKAGES <---# +use strict; +use warnings; +use Getopt::Long; + +#---> USAGE <---# +my $usage_info = &usage; + +#---> OPTIONS / ARGUMENTS <---# +my $usage = ''; +my $quiet = ''; +my $head = ''; +my $in = ''; +my $out = ''; +my $new_header = ''; +my $multi = -1; +my $mm = ''; +my $hm 
= ''; +my $options_result = GetOptions ( + 'usage|help' => \$usage, + 'quiet' => \$quiet, + 'print-header' => \$head, + 'new-header=s' => \$new_header, + 'keep-mm:i' => \$multi, + 'mm=s' => \$mm, + 'heavy-mm=s' => \$hm, + #-----------------------# + 'in=s' => \$in, + 'out=s' => \$out +); + +## Die if command line parsing was not successful or required arguments are missing +die $usage_info if $usage || !$options_result; +die $usage_info if !$in || !$out; + +## Die if indicated files do not exist +die "[ERROR] File '$in' not found.\n$usage_info" unless -e $in; +die "[ERROR] File '$new_header' not found.\n$usage_info" unless $new_header eq "" || -e $new_header; + +# Unset $head switch if $new_header is set +$head = 0 if $new_header; + +#==========================# +# PRE-REQUISITES END # +#==========================# + + +#================# +# MAIN START # +#================# +#---> STATUS MESSAGE <---# +print "Starting '$0'...\n" unless $quiet; + +#---> BODY <---# +&filter_sam($in, $out, $multi, $mm, $head, $new_header); + +#---> STATUS MESSAGE <---# +print "Done.\n" unless $quiet; + +#---> PROGRAM EXIT <---# +exit 0; +#================# +# MAIN END # +#================# + + +#=======================# +# SUBROUTINES START # +#=======================# +sub usage { +### Function: Returns usage information for current script +### Accepts: n/a +### Returns: String with usage information +### Type: Specialized +'Usage: perl ./sam_remove_duplicates_inferior_alignments_multimappers.pl [OPTIONS] --in [SAM] --out [SAM] + +Description: From a sorted SAM file, first removes duplicate records (defined by identical entries for the fields QNAME, FLAG, RNAME, POS & CIGAR), then all QNAME duplicates except for the one(s) with the shortest edit distance. Finally, unless the --keep-mm is set, all alignments of queries with the same edit distance, but different coordinates ("multimappers") are discarded. 
+ +Arguments: +--in [SAM] Input SAM file sorted by QNAME [Required] +--out [SAM] Output SAM file [Required] +--print-header Print header (keep input file header if --new-header not specified) +--new-header [FILE] Uses file indicated in argument as header +--keep-mm [INT] Keep queries with up to INT different alignments. Set INT to "0" to keep all alignments for each query. By default, all alignments of "multimappers" are removed. +--mm [TAB] Print the QNAMEs and mapping counts of all "multimappers" to TAB (format: QNAME /TAB/ number of mappings; one entry per line). +--heavy-mm [TAB] Like --mm, but only prints alignments for reads that map more than --keep-mm times. +--usage|help Show this information and die +--quiet Shut up! + +Notes: +Script requires NM tags (i.e. edit distances) to be present in all records of the input SAM file. +CAUTION: Only marginal validation of the input file type/format performed! + +Version 1.5 (2019-03-27) +Written by Alexander Kanitz on 2013-08-29 +'; +} +#-----------------------# +sub filter_sam { +### Function: From a sorted SAM file, first removes duplicate records (i.e. same entry name and coordinates, specifically the fields: QNAME, FLAG, RNAME, POS & CIGAR), then all QNAME duplicates except for the one(s) with the shortest edit distance, then (optionally) all alignments of "multimappers" (same QNAME, same edit distance, but different coordinates). +### Accepts: 1. Input file [FILE|SAM]; 2. Output file [FILE|SAM]; 3. Multimapper limit: negative = print unique mappers only, 0 = keep all alignments, N > 0 = keep queries with up to N alignments; 4. Output file for multimapper IDs/QNAMEs; 5. Header switch: FALSE = do not print header, TRUE = print header; 6. Header file (prepends SAM records in output). The heavy multimapper output file is read from the file-scoped variable $hm. +### Returns: n/a +### Dependencies: n/a +### Type: Specialized + #---> PASS ARGUMENTS ---# + my ($in, $out, $multi, $mm, $head, $new_header) = @_; + + #---> STATUS MESSAGE <---# + print STDOUT "Filtering SAM file '$in'..." . 
"\n" unless $quiet; + + #---> SUBROUTINE VARIABLES <---# + my $regex_header = '^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$'; + my $regex_comment = '^\@CO\t.*'; # no surrounding slashes: the pattern is interpolated into /.../ below + my $last_line; # holds the last line + my $last_id; # holds the last distinct QNAME/read ID + my @AoH; # holds references to hashes containing the field values of the last lines that share the same QNAME/read ID + my @field_keys = qw/QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL/; # holds the bareword keys for the hashes containing field values + my $final_record = 0; + + #---> BODY <---# + + #---> Open input and output filehandles <---# + open IN, "<", $in or die "[ERROR] Could not open file '$in'!\n"; + open OUT, ">", $out or die "[ERROR] Could not open file '$out'!\n"; + open MM, ">", $mm or die "[ERROR] Could not open file '$mm'!\n" if $mm; + open HM, ">", $hm or die "[ERROR] Could not open file '$hm'!\n" if $hm; + + #---> Process header lines (assumed at the top of the file!) <---# + while (<IN>) { + if ( /$regex_header/ || /$regex_comment/ ) { + print OUT if $head; + } + else { + $last_line = $_; + last; + } + } + + #---> Reset filehandle position (rewind over the first non-header line, if one was read) <---# + { + use bytes; + seek IN, -length($last_line), 1 if defined $last_line; + } + + #---> Print header from separate file (if indicated in command line) <---# + if ($new_header) { + open HEAD, "<", $new_header or die "[ERROR] Could not open file '$new_header'!\n"; + while (<HEAD>) { + print OUT; + } + close HEAD; + } + + #---> Traverse non-header lines <---# + while (my $line = <IN>) { + + #---> Field values to hash <---# + chomp $line; + my @field_values = split "\t", $line; + die "[ERROR] Input file does not look like a valid SAM file!" unless scalar @field_values >= 11; # Assert presence of at least 11 fields + my %fields; + @fields{@field_keys} = @field_values[0 .. 10]; + foreach (@field_values[11 .. $#field_values]) { + my ($tag, $value) = split ":", $_, 2; + $fields{$tag} = $value; + } + die "[ERROR] Edit distance ('NM tag') missing!" 
unless defined $fields{"NM"}; # Assert presence of NM tag + + #---> Manage AoH: Grow if QNAMEs identical, else compare AoH entries, print record(s) and reset AoH <---# + if ( defined $last_id && $fields{"QNAME"} ne $last_id ) { + die "[ERROR] SAM file appears to be corrupt!" unless scalar @AoH > 0; # Assert integrity of SAM records + @AoH = @{&sam_AoH_filter_records_w_ident_QNAME(\@AoH)} if scalar @AoH > 1; + my @out_lines = @{&sam_AoH_join_records(\@AoH)}; + if ( $multi < 0 ) { + print OUT $out_lines[0] if scalar @out_lines == 1; # Print unique mapper entries + } + elsif ( $multi == 0 || $multi >= scalar @out_lines ) { + print OUT foreach @out_lines; # Print all or $multi multimappers if requested + } + else + { + print HM $last_id . "\t" . scalar @out_lines . "\n" if $hm; # Report heavy multimappers only if --heavy-mm file was opened + } + print MM $last_id . "\t" . scalar @out_lines . "\n" if $mm && scalar @out_lines > 1; # Print multimapper QNAMEs if requested + @AoH = (); + } + push @AoH, \%fields; + $last_id = $fields{"QNAME"}; + $last_line = $line; + + } + + #---> Account for final record(s) separately due to EOF <---# + die "[ERROR] SAM file appears to be corrupt or empty!" unless scalar @AoH > 0; # Assert integrity of SAM records + @AoH = @{&sam_AoH_filter_records_w_ident_QNAME(\@AoH)} if scalar @AoH > 1; + my @out_lines = @{&sam_AoH_join_records(\@AoH)}; + if ( $multi < 0 ) { + print OUT $out_lines[0] if scalar @out_lines == 1; # Print unique mapper entries + } + elsif ( $multi == 0 || $multi >= scalar @out_lines ) { + print OUT foreach @out_lines; # Print all or $multi multimappers if requested + } + else + { + print HM $last_id . "\t" . scalar @out_lines . "\n" if $hm; # Report heavy multimappers only if --heavy-mm file was opened + } + print MM $last_id . "\t" . scalar @out_lines . 
"\n" if $mm && scalar @out_lines > 1; # Print multimapper QNAMEs if requested + + #---> Close input and output filehandles <---# + close OUT; + close IN; + close MM if $mm; + close HM if $hm; + + #---> END BODY <---# + + #---> STATUS MESSAGE <---# + print STDOUT "Written filtered SAM file to '$out'.\n" unless $quiet; + + #---> RETURN VALUE <---# + return 0; +} +#-----------------------# +sub sam_AoH_filter_records_w_ident_QNAME { +### Function: Compares records of a SAM file that share the same QNAME/read ID: True duplicates (i.e. QNAME and coordinates are equal) are discarded first. Then all records but the ones with the lowest edit distances are discarded. The remaining entry or entries are returned in an array of hashes (array length of > 1 if read is a "multimapper"). +### Accepts: Array of hashes of SAM records with identical QNAME field (as generated by the subroutine 'filter_sam', written by Alexander Kanitz, 29-AUG-2013) +### Returns: Reference to array of hashes +### Dependencies: n/a +### Type: Generic + #---> PASS ARGUMENTS ---# + my $AoH_ref = shift; + + #---> BODY <---# + + #---> Remove "true duplicates" (same QNAME, FLAG, RNAME, POS & CIGAR) / KEEP ONE!!! <---# + my %true_dup; + for my $hash_ref (@$AoH_ref) { + my $id_coord = join "", $hash_ref->{"QNAME"}, $hash_ref->{"FLAG"}, $hash_ref->{"RNAME"}, $hash_ref->{"POS"}, $hash_ref->{"CIGAR"}; # make string to unambiguously define mapping position + $true_dup{$id_coord} = $hash_ref; # Add to hash + } + $AoH_ref = [values %true_dup]; + + #---> Keep only ones with shortest edit distance <---# + my $min_edit_distance; + my @edit_distances; + foreach my $hash_ref (@$AoH_ref) { + my $edit_distance = ($hash_ref->{"NM"} =~ /^.*:(\d+)/)[0]; + $min_edit_distance = $edit_distance if ! 
defined $min_edit_distance || $edit_distance < $min_edit_distance; + push @edit_distances, $edit_distance; + } + for ( my $index = $#edit_distances; $index >= 0; $index-- ) { + splice @$AoH_ref, $index, 1 if $edit_distances[$index] != $min_edit_distance; + } + + #---> RETURN VALUE <---# + return $AoH_ref; +} +#-----------------------# +sub sam_AoH_join_records { +### Function: Joins the fields of SAM records stored in an array of hashes in the proper order (additional tags in alphanumerical order); re-computes or adds the NG tag field +### Accepts: Array of hashes of SAM records with identical QNAME field (as generated by the subroutine 'filter_sam', written by Alexander Kanitz, 29-AUG-2013) +### Returns: Array of strings, one element for each record, appended with a newline character for easy printing +### Dependencies: n/a +### Type: Generic + #---> PASS ARGUMENTS ---# + my $AoH_ref = shift; + + #---> SUBROUTINE VARIABLES <---# + my @field_keys = qw/QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL/; # holds the bareword keys for the hashes containing field values in the right order + my @records; + + #---> BODY <---# + + foreach my $record (@$AoH_ref) { + my $nh = scalar @$AoH_ref; + $record->{NH} = "i:${nh}"; + my @fields_ordered; + foreach my $key (@field_keys) { + push @fields_ordered, $record->{$key}; + delete $record->{$key}; + } + foreach my $extra_field (sort keys %$record) { + push @fields_ordered, ( $extra_field . ":" . $record->{$extra_field} ); + } + push @records, ( join( "\t", @fields_ordered) . 
"\n" ); + } + + #---> RETURN VALUE <---# + return \@records; +} +#=======================# +# SUBROUTINES END # +#=======================# diff --git a/scripts/sam_trx_to_sam_gen.pl b/scripts/sam_trx_to_sam_gen.pl new file mode 100755 index 0000000..08e74f7 --- /dev/null +++ b/scripts/sam_trx_to_sam_gen.pl @@ -0,0 +1,838 @@ +#!/usr/bin/env perl +use lib "/scicore/home/zavolan/clipz/newClipz6/lib/perl"; +#!/usr/bin/perl + +#==================# +# HEADER START # +#==================# +### Name: sam_trx_to_sam_gen.pl +### Created: Oct 4, 2013 +### Author: Alexander Kanitz +### Company: Zavolan Group, Biozentrum, University of Basel +### Requirements: Getopt::Long +#==================# +# HEADER END # +#==================# + + +#==========================# +# PRE-REQUISITES START # +#==========================# + +#---> PRAGMAS / PACKAGES <---# +use strict; +use warnings; +use Getopt::Long; + +#---> USAGE <---# +my $usage_info = &usage; + +#---> OPTIONS / ARGUMENTS <---# +my $usage = ''; +my $quiet = ''; +my $in_sam = ''; +my $in_bed = ''; +my $out_sam = ''; +my $head = ''; +my $no_strand_info = 0; +my $min_overlap = 0; +my $monoexonic = 0; +my $tag = ''; +my $report = 0; +my $options_result = GetOptions ( + 'usage|help' => \$usage, + 'quiet' => \$quiet, + 'head' => \$head, + 'no-strand-info' => \$no_strand_info, + 'min-overlap=i' => \$min_overlap, + 'include-monoexonic' => \$monoexonic, + 'tag=s' => \$tag, + 'print-report' => \$report, + #-----------------------# + 'in=s' => \$in_sam, + 'exons=s' => \$in_bed, + 'out=s' => \$out_sam +); + +# Die if option parsing was not successful or --usage / --help was requested +die $usage_info if $usage || !$options_result; + +# Die if required options are not provided +die $usage_info if !$in_sam || !$in_bed || !$out_sam; + +# Construct optional tag +$tag = "\tZZ:Z:" . 
$tag if $tag; + +#==========================# +# PRE-REQUISITES END # +#==========================# + + +#================# +# MAIN START # +#================# + +#---> STATUS MESSAGE <---# +print "Starting '$0'...\n" unless $quiet; + +#---> MAIN VARIABLES <---# +my $exon_hoaoa_ref; + +#---> BODY <---# + + #---> Construct hash of arrays of arrays of exons <---# + $exon_hoaoa_ref = &exons_bed_to_hoaoa($in_bed); +# $exon_hoaoa_ref = &exons_hoaoa_remove_single_exons($exon_hoaoa_ref) unless $monoexonic; + $exon_hoaoa_ref = &exons_hoaoa_reorder_exons_on_minus_strand($exon_hoaoa_ref); + $exon_hoaoa_ref = &exons_hoaoa_add_cumulative_length($exon_hoaoa_ref); + $exon_hoaoa_ref = &exons_hoaoa_add_intron_length($exon_hoaoa_ref); + + #---> Map reads <---# + &trx_sam_to_gen_sam($in_sam, $exon_hoaoa_ref, $out_sam); + +#---> STATUS MESSAGE <---# +print "Done.\n" unless $quiet; + +#---> PROGRAM EXIT <---# +exit 0; + +#================# +# MAIN END # +#================# + + +#=======================# +# SUBROUTINES START # +#=======================# +sub usage { +### Function: Returns usage information for current script +### Accepts: n/a +### Returns: String with usage information +### Type: Specialized +'Usage: perl ./sam_trx_to_sam_gen.pl [OPTIONS] --in [FILE|SAM] --exons [FILE|BED] --out [FILE|SAM] + +Description: Re-maps a SAM file resulting from aligning a library of sequencing reads against a transcriptome to genomic coordinates. All reads that do not cross an exon-exon boundary by a specified minimum overlap are discarded by default. 
+ +================================================== +Required arguments: +--in [FILE|SAM] Input SAM file (transcript coordinates) +--exons [FILE|BED] BED file of exons (genomic coordinates) +--out [FILE|SAM] Output SAM file (genomic coordinates) +================================================== +Optional arguments: +--min-overlap [INT] Minimum required overlap between read and any of the exons +--no-strand-info Used library preparation protocol does not preserve strand information (if unset, all reads mapping to the opposite strand of annotated transcripts - i.e. the appropriate SAM flag is set - are discarded) +--include-monoexonic Do not discard alignments against single exons (default: skip) +--head Print SAM header +--tag [STRING] Tag of the form "ZZ:Z:STRING" that is appended to the end of each line in the output file +--print-report Print statistics on the number of processed, printed and discarded alignments to STDOUT +--usage|help Show this information and die +--quiet Shut up! + +Comments: +- The script was written for SAM files produced by a recent (>= 0.1.4) version of segemehl. SAM files of other origin may or may not work. +- Only the 0x10 bit (which informs whether a read sequence was reverse complemented for the alignment) of the FLAG field is considered; all other bits are ignored. +- CAUTION: Only marginal validation of the input file type/format performed! + +Version 1.6 (2014-08-04) +Written by Alexander Kanitz on 2013-10-04 +'; +} +#-----------------------# +sub exons_bed_to_hoaoa { +### Function: Reads a BED file of exons and loads them into a hash (key: transcript ID) of arrays (chromosome, strand, exons) of arrays (exon genomic start, exon genomic stop) +### Accepts: 1. 
sorted (name, start, end) BED6 file of exons (coordinates relative to genome; 0-based, open-ended) +### Returns: Reference to hash of arrays of arrays +### Dependencies: n/a +### Type: Generic + #---> PASS ARGUMENTS ---# + my $bed = shift; + + #---> STATUS MESSAGE <---# + print STDOUT "Loading exons..." . "\n" unless $quiet; + + #---> SUBROUTINE VARIABLES <---# + my %hoaoa; + my @outer_array; + + #---> BODY <---# + + #---> Open input filehandle <---# + open IN, "<", $bed or die "[ERROR] Could not open file '$bed'!\nExecution aborted\n"; + + #---> Traverse input file line by line <---# + while (my $line = <IN>) { + + #---> Field values to array <---# + chomp $line; + my ($chr, $start, $end, $name, $score, $strand) = split "\t", $line; + die "[ERROR] Input file does not look like a valid BED6 file!\nExecution aborted\n" unless $strand; # Assert presence of strand information + warn "[WARNING] Incompatible strand information in line $.:\n$line\nOnly '+' and '-' are allowed. Entry skipped.\n" and next unless ($strand eq "+") || ($strand eq "-"); + + #---> Generate outer array (chromosome, strand) if no record yet exists for this transcript <---# + if (! exists $hoaoa{$name}) { + undef @outer_array; + push @outer_array, $chr, $strand; + } + + #---> Verify that one transcript does not contain exons annotated on different strands <---# + die "[ERROR] Transcript $name contains exons annotated on different strands in/around line $.:\n$line\nExecution aborted\n" if $outer_array[1] ne $strand; + + #---> Generate inner array ()exon length, genomic start / end) and push to outer array <---# + my @inner_array; + push @inner_array, ($start + 1), $end; # Transform to 1-based, close-ended (SAM!) + + #---> Push inner to outer array and add outer array to hash <---# + push @outer_array, \@inner_array; + $hoaoa{$name} = [ @outer_array ]; + } + + #---> Close input filehandle <---# + close IN; + + #---> STATUS MESSAGE <---# + print STDOUT "Exons loaded." . 
"\n" unless $quiet; + + #---> RETURN VALUE <---# + return \%hoaoa; + +} +#-----------------------# +sub exons_hoaoa_reorder_exons_on_minus_strand { +### Function: Reverses the order of exons for transcripts annotated on the Crick/minus strand from exons hoaoa generated by sub "exon_bed_to_hoaoa" +### Accepts: Reference to hash of arrays of arrays +### Returns: Reference to hash of arrays of arrays +### Dependencies: Subroutine "exon_bed_to_hoaoa" (Alexander Kanitz) +### Type: Specialized + #---> PASS ARGUMENTS ---# + my $hoaoa_ref = shift; + + #---> STATUS MESSAGE <---# + print STDOUT "Reversing exon order for transcripts on Crick strand..." . "\n" unless $quiet; + + #---> BODY <---# + + #---> Reverse order of exons for transcripts annotated on Crick strand <---# + foreach my $outer_array_ref ( values %$hoaoa_ref ) { + # If transcript is annotated on Crick strand + if ( $outer_array_ref->[1] eq "-" ) { + # Slice out chr and strand + my @chr_str = splice @$outer_array_ref, 0, 2; + ## Swap start and end coordinates + foreach my $exon_array_ref (@$outer_array_ref) { + my $old_start_new_stop = $$exon_array_ref[0]; + $$exon_array_ref[0] = $$exon_array_ref[1]; + $$exon_array_ref[1] = $old_start_new_stop; + } + # Reverse rest (i.e. references to exons / inner arrays) + my @reversed_inner_array_refs = reverse @$outer_array_ref; + # Empty original array + @$outer_array_ref = (); + # Rebuild array from slices + push @$outer_array_ref, @chr_str, @reversed_inner_array_refs; + } + } + + #---> STATUS MESSAGE <---# + print STDOUT "Exon order reversed." . 
"\n" unless $quiet; + + #---> RETURN VALUE <---# + return $hoaoa_ref; + +} +#-----------------------# +sub exons_hoaoa_add_cumulative_length { +### Function: Add exon length and cumulative exon length to exons hoaoa generated by sub "exon_bed_to_hoaoa" (added as third and fourth elements to inner arrays) +### Accepts: Reference to hash of arrays of arrays +### Returns: Reference to hash of arrays of arrays +### Dependencies: Subroutine "exon_bed_to_hoaoa" (Alexander Kanitz) +### Type: Specialized + #---> PASS ARGUMENTS ---# + my $hoaoa_ref = shift; + + #---> STATUS MESSAGE <---# + print STDOUT "Calculating cumulative exon lengths..." . "\n" unless $quiet; + + #---> BODY <---# + + #---> Add cumulative exon length to inner arrays <---# + ## Traverse through each key $outer_array of hash %$hoaoa_ref + foreach my $outer_array_ref ( values %$hoaoa_ref ) { + # Initialize cumulative length variable + my $previous = 0; + ## For each inner array reference... + for my $exon_no ( 0 .. (scalar @$outer_array_ref - 3) ) { + # Calculate current exon length + $outer_array_ref->[$exon_no + 2]->[2] = abs($outer_array_ref->[$exon_no + 2]->[1] - $outer_array_ref->[$exon_no + 2]->[0]) + 1; + # Calculate cumulative length by adding current exon length to previous cumulative length; assign to new element in inner array + $outer_array_ref->[$exon_no + 2]->[3] = $outer_array_ref->[$exon_no + 2]->[2] + $previous; + # Update cumulative length + $previous = $outer_array_ref->[$exon_no + 2]->[3]; + } + } + + #---> STATUS MESSAGE <---# + print STDOUT "Cumulative exon lengths calculated." . 
"\n" unless $quiet; + + #---> RETURN VALUE <---# + return $hoaoa_ref; + +} +#-----------------------# +sub exons_hoaoa_add_intron_length { +### Function: Add distance between exon and previous exon to exons hoaoa generated by sub "exon_bed_to_hoaoa" (added as fifth element to inner arrays) +### Accepts: Reference to hash of arrays of arrays +### Returns: Reference to hash of arrays of arrays +### Dependencies: Subroutine "exon_bed_to_hoaoa" (Alexander Kanitz) +### Type: Specialized + #---> PASS ARGUMENTS ---# + my $hoaoa_ref = shift; + + #---> STATUS MESSAGE <---# + print STDOUT "Calculating intron lengths..." . "\n" unless $quiet; + + #---> BODY <---# + + #---> Add length of previous intron to inner arrays <---# + ## Traverse through each key $outer_array of hash %$hoaoa_ref + foreach my $outer_array_ref ( values %$hoaoa_ref ) { + # Initialize variable that holds end coordinate of previous exon + my $end_prev; + ## For each inner array reference... + for my $exon_no ( 0 .. (scalar @$outer_array_ref - 3) ) { + # Calculate intron length by subtracting the end coordinate of the previous exon from the start coordinate of the current exon (adjust offset!); set 0 if there is no previous exon; assign value to new element in inner array + $outer_array_ref->[$exon_no + 2]->[4] = (defined $end_prev) ? abs($outer_array_ref->[$exon_no + 2]->[0] - $end_prev) - 1 : 0; + # Update/set end coordinate for next iteration + $end_prev = $outer_array_ref->[$exon_no + 2]->[1]; + } + } + + #---> STATUS MESSAGE <---# + print STDOUT "Intron lengths calculated." . "\n" unless $quiet; + + #---> RETURN VALUE <---# + return $hoaoa_ref; + +} +#-----------------------# +sub trx_sam_to_gen_sam { +### Function: Reads transcript alignments from a SAM file and maps them to genomic coordinates +### Accepts: 1. SAM input file; 2. Hash of arrays of arrays of exon genomic coordinates generated by subroutine "exons_bed_to_hoaoa" or derivatives; 3. 
Filename for SAM output file
+### Returns: n/a
+### Dependencies: Subroutines "exons_bed_to_hoaoa", "rc_bit", "get_coords_and_frags", "reverse_CIGAR", "complement", "reverse_complement_MD", "add_introns_to_CIGAR" (Alexander Kanitz); file-level option variables $quiet, $head, $no_strand_info, $min_overlap, $monoexonic, $tag, $report
+### Type: Specialized
+    #---> PASS ARGUMENTS ---#
+    my ($in_sam, $exons_hoaoa_ref, $out_sam) = @_;
+
+    #---> STATUS MESSAGE <---#
+    print STDOUT "Processing SAM file (may take long)..." . "\n" unless $quiet;
+
+    #---> SUBROUTINE VARIABLES <---#
+    my $pos;
+    my $processed = 0;
+    my $incomplete = 0;
+    my $rc = 0;
+    my $not_found = 0;
+    my $no_eej = 0;
+    my $below_min_overlap = 0;
+    my $printed = 0;
+
+    #---> BODY <---#
+
+        #---> Open input and output filehandles <---#
+        open IN, "<", $in_sam or die "[ERROR] Could not open file '$in_sam'!\nExecution aborted\n";
+        open OUT, ">", $out_sam or die "[ERROR] Could not open file '$out_sam'!\nExecution aborted\n";
+
+        #---> Process header lines (assumed at the top of the file!) <---#
+        $pos = tell IN;
+        while (<IN>) {
+            if ( $_ =~ /^\@[A-Za-z][A-Za-z]\s/ ) {
+                print OUT if $head; # Print header if --head switch is set
+                $pos = tell IN; # Remember the position right after the last header line
+            }
+            else {
+                last;
+            }
+        }
+
+        #---> Reset filehandle position <---#
+        seek IN, $pos, 0;
+        $. = $. - 1; # The header loop already consumed the first non-header line, which is re-read below
+
+        #---> Traverse non-header lines <---#
+        while (<IN>) {
+
+            #---> Process SAM line <---#
+            chomp;
+            next if $_ eq "";
+            $processed++;
+            my ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, $RNEXT, $PNEXT, $TLEN, $SEQ, $QUAL, $REST) = split "\t", $_, 12;
+
+            #---> Check if all (i.e. last) required values are defined <---#
+            unless (defined $QUAL) {
+                warn "[WARNING] Line ${.} does not look like a proper SAM line. Entry skipped.\n";
+                $incomplete++;
+                next;
+            }
+
+
+            #---> Extract MD tag (may be the first or any later optional field; capture groups are only read after a successful match) <---#
+            my $MD;
+            if ( defined $REST && $REST =~ s/(\A|\t)(MD:Z:\S+)// ) {
+                $MD = $2;
+                $REST =~ s/\A\t// if $1 eq ""; # MD was the first optional field: drop the separator it left behind
+            }
+
+            #---> Skip reads when alignments are reverse complemented but strand information is available <---#
+            if ( &rc_bit($FLAG) == 1 && ! $no_strand_info ) {
+                $rc++;
+                next;
+            }
+
+            #---> Adjust start and calculate end position of transcript alignment <---#
+            my $ref_length = 0;
+            $ref_length += $_ for $CIGAR =~ /(\d+)[MDN]/g; # Only M(ism)atches, D(eletions) and Ns contribute to the aligned part of the reference
+            my $END_POS = $POS + $ref_length - 1;
+
+            #---> Obtain genomic coordinates, lengths of intersections with exons and lengths of spanned introns <---#
+            my $coords_frags_array_ref = &get_coords_and_frags($exons_hoaoa_ref, $RNAME, $POS, $END_POS, $min_overlap, $monoexonic);
+            if ($coords_frags_array_ref == 1) { # Skip alignment if previous function call returned error code 1 (i.e. aligns to a transcript or transcript region not in the lookup table)
+                $not_found++;
+                next;
+            }
+            if ($coords_frags_array_ref == 2) { # Skip alignment if previous function call returned error code 2 (i.e. alignment does not intersect multiple exons)
+                $no_eej++;
+                next;
+            }
+            if ($coords_frags_array_ref == 3) { # Skip alignment if previous function call returned error code 3 (i.e. the specified minimum overlap is not met)
+                $below_min_overlap++;
+                next;
+            }
+            my ($chr, $str, $start, @frags) = @$coords_frags_array_ref;
+
+            #---> Evaluate strand information and update FLAG, SEQ, QUAL, CIGAR & MD if necessary <---#
+            if ( $str eq "-" ) {
+                $FLAG = ( $FLAG + 0 ) ^ 0x10; # Flip only the 0x10 (reverse complemented) bit. The Watson ("+") strand is the reference strand for the genome alignments, therefore a forward match to a transcript on the "+" and "-" strands should carry 0x10 = 0 and 0x10 = 1, respectively. The situation is inversed if a reverse complement match occurs (only considered when --no-strand-info is set!). All other FLAG bits are passed through unchanged, exactly as for transcripts on the "+" strand (previously any non-zero forward FLAG, e.g. 256, was reset to 0 here and thus reported on the wrong strand)
+                $CIGAR = &reverse_CIGAR($CIGAR);
+                $SEQ = reverse &complement($SEQ);
+                $QUAL = reverse $QUAL;
+                $MD = &reverse_complement_MD($MD) if defined $MD;
+            };
+
+            #---> Add introns to CIGAR string <---#
+            $CIGAR = &add_introns_to_CIGAR($CIGAR, @frags) unless scalar @frags == 0;
+
+            #---> Print entry (mandatory fields first, then remaining optional fields, custom tag and MD tag) <---#
+            print OUT
+                $QNAME . "\t" .
+                $FLAG . "\t" .
+                $chr . "\t" .
+                $start . "\t" .
+                $MAPQ . "\t" .
+                $CIGAR . "\t" .
+                $RNEXT . "\t" .
+                $PNEXT . "\t" .
+                $TLEN . "\t" .
+                $SEQ . "\t" .
+                $QUAL;
+            print OUT "\t" . $REST if defined $REST && length $REST; # No dangling tab (or "uninitialized" warning) for records without further optional fields
+            print OUT $tag if $tag;
+            print OUT "\t" . $MD if defined $MD;
+            print OUT "\n";
+            $printed++;
+        }
+
+        #---> Print report <---#
+        my $discarded = $incomplete + $rc + $not_found + $no_eej + $below_min_overlap;
+        my $stats = "======\nREPORT\n======\n";
+        if ( $processed ) {
+            $stats .= sprintf "Alignments processed: %d (%.2f%%)\n", $processed, $processed / $processed * 100;
+            $stats .= sprintf " - Converted to genome space: %d (%.2f%%)\n", $printed, $printed / $processed * 100;
+            $stats .= sprintf " - Discarded: %d (%.2f%%)\n", $discarded, $discarded / $processed * 100;
+            $stats .= sprintf " - Sequence reverse complemented: %d (%.2f%%)\n", $rc, $rc / $processed * 100;
+            $stats .= sprintf " - Not covering exon-exon junctions: %d (%.2f%%)\n", $no_eej, $no_eej / $processed * 100;
+            $stats .= sprintf " - Not meeting minimum exon overlap: %d (%.2f%%)\n", $below_min_overlap, $below_min_overlap / $processed * 100;
+            $stats .= sprintf " - Features/regions not present in exon lookup table: %d (%.2f%%)\n", $not_found, $not_found / $processed * 100;
+            $stats .= sprintf " - Incomplete/non-standard records: %d (%.2f%%)\n", $incomplete, $incomplete / $processed * 100;
+        } else {
+            $stats .= sprintf "Alignments processed: %d\n", $processed;
+        }
+
+        print STDOUT $stats if $report;
+
+    #---> Close input and output filehandles <---#
+    close OUT;
+    close IN;
+
+    #---> STATUS MESSAGE <---#
+    print STDOUT "SAM file processed. Output written to '$out_sam'" . "\n" unless $quiet;
+
+}
+#-----------------------#
+sub get_coords_and_frags {
+### Function: Gets the reference sequence (i.e. chromosome), strand, and starting position of the alignment in genomic coordinates and calculates the lengths of overlaps with intersected exons as well as the lengths of spanned introns
+### Accepts: 1. 
Reference to hash of arrays of arrays generated by subroutine "exons_bed_to_hoaoa" or derivatives; 2./3. RNAME (i.e. transcript ID) and POS (i.e. starting position) of the transcript SAM entry, respectively; 4. end position of the transcript alignment; 5. the allowed minimum overlap +### Returns: Reference to array of 1. chromosome, 2. strand, 3. starting position of the alignment in genomic coordinates, 4. overlap with first exon (integer), (for multiple fragments, alternating: A. length of spanned intron, B. overlap with next fragment), N-1. length of spanned intron, N. overlap with last fragment +### Dependencies: Subroutine "exons_bed_to_hoaoa" (Alexander Kanitz) +### Type: Specialized + #---> PASS ARGUMENTS ---# + my ($exons_hoaoa_ref, $trx_id, $start_trx, $end_trx, $min_overlap, $monoexonic) = @_; + + #---> SUBROUTINE VARIABLES <---# + my @return; + my @frags; + my $start_gen; + my $next_index; + my $prev_cum_length = 0; + my $skip = 1; + + #---> BODY <---# + + #---> Get array containing properties of transcript <---# + # Throw warning and return an error code if transcript is missing from lookup table + warn "[WARNING] Transcript '$trx_id' not found in lookup table! Check integrity and compatibility of exon/transcript annotations used here and during mapping. Entry skipped.\n" and return 1 unless exists $exons_hoaoa_ref->{$trx_id}; + # Load entry into dedicated array + my @trx_entry = @{$exons_hoaoa_ref->{$trx_id}}; + # Return an error code if transcript has only one exon and --include-monoexonic is not set + return 2 if scalar @trx_entry == 3 && ! $monoexonic; + + #---> Find starting point of alignment and first fragment overlap <---# + for my $idx ( 2 .. 
(scalar @trx_entry - 1) ) { + + # For the first fragment, the start coordinate of the alignment should be smaller than the cumulative exon length of the current exon + if ( $start_trx <= $trx_entry[$idx]->[3]) { + # Check if the end coordinate of the alignment is also smaller than the cumulative length of the current exon + if ( $end_trx <= $trx_entry[$idx]->[3] ) { + # Return an error code if whole alignment is contained in one exon and --inlcude-monoexonic is not set + return 2 unless $monoexonic; + # Calculate genomic start position for alignments to the Watson ("+") and Crick ("-") strands + $start_gen = $trx_entry[1] eq "+" ? $trx_entry[$idx]->[1] - ($trx_entry[$idx]->[3] - $start_trx) : $trx_entry[$idx]->[1] + ($trx_entry[$idx]->[3] - $end_trx); + } + else { + # Calculate overlap + my $start_overlap = $trx_entry[$idx]->[3] - $start_trx + 1; + # Return an error code if overlap is smaller than allowed + return 3 if $start_overlap < $min_overlap; + # Push overlap to fragments array + push @frags, $start_overlap; + # Calculate genomic start position for alignments to the Watson ("+") strand + $start_gen = $trx_entry[$idx]->[1] - $start_overlap + 1 if $trx_entry[1] eq "+"; + # Set skip switch + $skip = 0; + } + # Set previous cumulative length for next iteration + $prev_cum_length = $trx_entry[$idx]->[3]; + # Set flag to indicate that the start coordinate was already found + $next_index = $idx + 1; + # Break out of loop + last; + } + } + + #---> Find intermediate and last fragments <---# + unless ($skip) { + for my $idx ( $next_index .. 
(scalar @trx_entry - 1) ) { + + # For the last fragment, the end coordinate of the alignment should be smaller than the cumulative exon length of the current exon + if ( $end_trx <= $trx_entry[$idx]->[3] ) { + # Calculate overlap + my $end_overlap = $end_trx - $prev_cum_length; + # Return an error code if overlap is smaller than allowed + return 3 if $end_overlap < $min_overlap; + # Push exon overlap and spanned intron length to fragments array + push @frags, $trx_entry[$idx]->[4], $end_overlap; + # Calculate genomic start position for alignments to the Crick ("-") strand + $start_gen = $trx_entry[$idx]->[0] - $end_overlap + 1 if $trx_entry[1] eq "-"; + # Break out of loop + last; + } + else { + # Save length of spanned intron and exon overlap in fragments array + push @frags, $trx_entry[$idx]->[4], $trx_entry[$idx]->[2]; + # Set previous cumulative length for next iteration + $prev_cum_length = $trx_entry[$idx]->[3]; + } + } + } + + #---> Throw warning and return an error code if coordinates are out of bounds (incomplete/incompatible lookup table) <---# + warn "[WARNING] Alignment (start: $start_trx, end: $end_trx) out of bounds in transcript '$trx_id'! Check integrity and compatibility of exon/transcript annotations used here and during mapping. Entry skipped.\n" and return 1 unless $start_gen; + + #---> Reverse fragment order for features on the Crick ("-") strand <---# + @frags = reverse @frags if $trx_entry[1] eq "-"; + + #---> Build return array <---# + push @return, @trx_entry[0..1], $start_gen, @frags; + + #---> RETURN VALUE <---# + return \@return; + +} +#-----------------------# +sub complement { +### Function: Returns the complement of the input sequence +### Accepts: 1. 
String (all characters but A, C, G, T and their lower case versions are ignored) +### Returns: Complement of input sequence +### Dependencies: n/a +### Type: Generic + + #---> PASS ARGUMENTS ---# + my $seq = shift; + + #---> BODY <---# + + # Build complement by transliteration + $seq =~ tr/aAcCgGtT/tTgGcCaA/; + + #---> RETURN VALUE <---# + return $seq; + +} +#-----------------------# +sub reverse_CIGAR { +### Function: Reverses a CIGAR string +### Accepts: 1. CIGAR string +### Returns: Reversed CIGAR string +### Dependencies: n/a +### Type: Generic + + #---> PASS ARGUMENTS ---# + my $CIGAR = shift; + + #---> SUBROUTINE VARIABLES <---# + my @CIGAR_new; + + #---> BODY <---# + + #---> Transform CIGAR string to array <---# + my @CIGAR_old = split /(\D)/, $CIGAR; + + #---> Pairwise reversal <---# + push @CIGAR_new, (splice @CIGAR_old, -2) while @CIGAR_old; + + #---> Assemble updated CIGAR string <---# + $CIGAR = join "", @CIGAR_new; + + #---> RETURN VALUE <---# + return $CIGAR; + +} +#-----------------------# +sub reverse_complement_MD { +### Function: Reverses MD tag +### Accepts: 1. MD tag +### Returns: Reversed MD tag +### Dependencies: n/a +### Type: Generic + + #---> PASS ARGUMENTS ---# + my $MD = shift; + + #---> BODY <---# + + #---> Remove tag name and type <---# + $MD =~ s/MD:Z://; + + # Build complement by transliteration + $MD =~ tr/aAcCgGtT/tTgGcCaA/; + + #---> Transform MD string to array <---# + my @MD = split /(\D+)/, $MD; + + #---> Reverse string of deleted letters after caret (MD specifications) <---# + for my $group (@MD) { + $group = "^" . reverse $group if $group =~ s/\A\^//; + } + + #---> Reversal <---# + @MD = reverse @MD; + + #---> Assemble updated MD string <---# + $MD = join "", "MD:Z:", @MD; + + #---> RETURN VALUE <---# + return $MD; + +} +#-----------------------# +sub add_introns_to_CIGAR { +### Function: For split/spliced alignments, includes introns (Ns) in a CIGAR string +### Accepts: 1. CIGAR string; 2. 
array of integers, containing the lengths of exon overlaps and, interspersed, the length(s) of the spanned intron(s), e.g.: 25 (length overlap exon 3), 10000 (length intron between exons 3 and 4), 50 (overlap exon 4), 5000 (length intron between exons 4 and 5), 25 (overlap exon 5) +### Returns: Updated CIGAR string +### Dependencies: n/a +### Type: Generic + + #---> PASS ARGUMENTS ---# + my $CIGAR = shift; + my @frags = @_; + + #---> SUBROUTINE VARIABLES <---# + my $intron_position = 0; + + #---> BODY <---# + + #---> Transform CIGAR string to array <---# + my @CIGAR = split /(\D)/, $CIGAR; + + #---> Insert corresponding insert for each pair of fragments <---# + for (my $idx_frags = 0; $idx_frags < scalar (@frags - 1); $idx_frags += 2) { + $intron_position += $frags[$idx_frags]; + + #---> Generate array of cumulative lengths that corresponds to CIGAR array; only (mis)matches and deletions count <---# + my @cum_len = (); + my $curr_cum_len = 0; + for (my $idx = 1; $idx < scalar @CIGAR; $idx += 2) { + if ($CIGAR[$idx] =~ /^[MD]$/) { + $curr_cum_len += $CIGAR[$idx - 1]; + } + push @cum_len, $curr_cum_len; + } + + #---> Find insertion position, split previous entry and insert N's <---# + for (my $idx_cum_len = 0; $idx_cum_len < scalar @cum_len; $idx_cum_len++) { + if ($intron_position <= $cum_len[$idx_cum_len]) { + my $part_2 = $cum_len[$idx_cum_len] - $intron_position; + my $part_1 = $CIGAR[$idx_cum_len * 2] - $part_2; + if ($part_2) { + splice @CIGAR, ($idx_cum_len * 2), 2, $part_1, $CIGAR[$idx_cum_len * 2 + 1], $frags[$idx_frags + 1], "N", $part_2, $CIGAR[$idx_cum_len * 2 + 1]; + } + else { + splice @CIGAR, ($idx_cum_len * 2), 2, $part_1, $CIGAR[$idx_cum_len * 2 + 1], $frags[$idx_frags + 1], "N"; + } + last; + } + } + } + + #---> Assemble updates CIGAR string <---# + $CIGAR = join "", @CIGAR; + + #---> RETURN VALUE <---# + return $CIGAR; + +} +#-----------------------# +sub rc_bit { +### Function: Extract the 0x10 (sequence reverse complemented) bit of the SAM FLAG 
field +### Accepts: 1. SAM FLAG field value +### Returns: Value of 0x10 bit +### Dependencies: n/a +### Type: Generic + + #---> PASS ARGUMENTS ---# + my $number = shift; + + #---> SUBROUTINE VARIABLES <---# + my $bin; + my $bit; + + #---> BODY <---# + + #---> Convert FLAG from decimal to binary <---# + $bin = sprintf "%012b", $number; + + #---> Insert corresponding insert for each pair of fragments <---# + $bit = substr $bin, -5, 1; + + #---> RETURN VALUE <---# + return $bit; + +} + +#===================================# +# DEBUGGING & UNUSED ROUTINES # +#===================================# +sub print_trx { + + my $exon_hoaoa_ref = shift; + my $id = shift; + + my @out_arr = @{$$exon_hoaoa_ref{$id}}; + print "\nid: " . $id . "\n"; + print "chromosome: " . $out_arr[0] . "\n"; + print "strand: " . $out_arr[1] . "\n"; + for my $idx (2 .. scalar @out_arr - 1) { + my @in_arr = @{$out_arr[$idx]}; + print "---\nexon: " . ($idx - 2) . "\n"; + print "genome start position: " . $in_arr[0] . "\n"; + print "genome end position: " . $in_arr[1] . "\n"; + print "exon length: " . $in_arr[2] . "\n"; + print "cumulative exon length: " . $in_arr[3] . "\n"; + print "previous intron length: " . $in_arr[4] . "\n"; + } + print "\n"; + +} +#-----------------------# +sub CIGAR_aligned_unaligned_stretches { + + my $CIGAR = shift; + + my @CIGAR = split /(\D)/, $CIGAR; + + my @results; + + my $last_index = 0; + + my( @indices )= grep { $CIGAR[$_] eq "N" } 0..$#CIGAR; + + foreach my $index (@indices) { + my @temp_array = (); + @temp_array = @CIGAR[$last_index .. ($index - 2)]; + $last_index = $index + 1; + my $cum_len = 0; + for (my $idx = 1; $idx < scalar @temp_array; $idx += 2) { + if ($temp_array[$idx] =~ /^[MD]$/) { + $cum_len += $temp_array[$idx - 1]; + } + } + push @results, $cum_len, $CIGAR[$index - 1]; + } + + my @temp_array = (); + @temp_array = @CIGAR[$last_index .. 
$#CIGAR]; + my $cum_len = 0; + for (my $idx = 1; $idx < scalar @temp_array; $idx += 2) { + if ($temp_array[$idx] =~ /^[MD]$/) { + $cum_len += $temp_array[$idx - 1]; + } + } + push @results, $cum_len; + + return \@results; + +} +#-----------------------# +sub exons_hoaoa_remove_single_exons { +### Function: Removes transcripts with only a single exon entry from exons hoaoa generated by sub "exon_bed_to_hoaoa" +### Accepts: Reference to hash of arrays of arrays +### Returns: Reference to hash of arrays of arrays +### Dependencies: Subroutine "exon_bed_to_hoaoa" (Alexander Kanitz) +### Type: Specialized + #---> PASS ARGUMENTS ---# + my $hoaoa_ref = shift; + + #---> STATUS MESSAGE <---# + print STDOUT "Removing transcripts with only one exon..." . "\n" unless $quiet; + + #---> BODY <---# + + #---> Remove transcripts with only one exon <---# + ## Traverse through each key $outer_array of hash %$hoaoa_ref + foreach my $transcript ( keys %$hoaoa_ref ) { + ## Delete hash entry if outer array has only three elements (i.e. one inner array ref / one exon) + unless (scalar @{$hoaoa_ref->{$transcript}} > 3) { + delete $hoaoa_ref->{$transcript}; + } + } + + #---> STATUS MESSAGE <---# + print STDOUT "Single exon transcripts removed." . 
"\n" unless $quiet; + + #---> RETURN VALUE <---# + return $hoaoa_ref; + +} +#=======================# +# SUBROUTINES END # +#=======================# diff --git a/scripts/sam_uncollapse.pl b/scripts/sam_uncollapse.pl new file mode 100755 index 0000000..e876b0d --- /dev/null +++ b/scripts/sam_uncollapse.pl @@ -0,0 +1,164 @@ +#!/usr/bin/perl + +#==================# +# HEADER START # +#==================# +### Name: sam_uncollapse.pl +### Created: Nov 21, 2013 +### Author: Alexander Kanitz +### Company: Zavolan Group, Biozentrum, University of Basel +### Requirements: GetOpt::Long +#==================# +# HEADER END # +#==================# + + +#==========================# +# PRE-REQUISITES START # +#==========================# +#---> PRAGMAS / PACKAGES <---# +use strict; +use warnings; +use Getopt::Long; + +#---> USAGE <---# +my $usage_info = &usage; + +#---> OPTIONS / ARGUMENTS <---# +my $usage = ''; +my $quiet = ''; +my $suffix = ''; +my $in = ''; +my $out = ''; +my $options_result = GetOptions ( + 'usage|help' => \$usage, + 'quiet' => \$quiet, + 'suffix' => \$suffix, + #-----------------------# + 'i|in=s' => \$in, + 'o|out=s' => \$out +); +die $usage_info if $usage || !$options_result; +die $usage_info if !$in || !$out; + +#==========================# +# PRE-REQUISITES END # +#==========================# + + +#================# +# MAIN START # +#================# +#---> STATUS MESSAGE <---# +print "Starting '$0'...\n" unless $quiet; + +#---> BODY <---# + + #---> Read & re-write file <---# + &sam_uncollapse($in, $out, $suffix); + +#---> STATUS MESSAGE <---# +print "Done.\n" unless $quiet; + +#---> PROGRAM EXIT <---# +exit 0; +#================# +# MAIN END # +#================# + + +#=======================# +# SUBROUTINES START # +#=======================# +sub usage { +### Function: Returns usage information for current script +### Accepts: n/a +### Returns: String with usage information +### Type: Specialized +'Usage: perl ./sam_uncollapse.pl [OPTIONS] --in 
[SAM] --out [SAM]
+
+Description: Reverses the collapsing of reads with identical sequences as done with "fastx_collapser" ("FASTX Toolkit") or similar. Reads and writes files in SAM format. Each line is printed n times, where n is the suffix appended to the read/query name via a dash.
+
+==================================================
+Required arguments:
+--in Input SAM file.
+--out Output SAM file.
+==================================================
+Optional arguments:
+--suffix Add serial number suffix to each QNAME during uncollapsing (separated by a ".") to allow distinction of multimappers by QNAME
+--usage|help Show this information and die
+--quiet Shut up!
+
+Comments:
+CAUTION: Only marginal validation of the input file type/format performed!
+
+Version 1.2 (2014-08-26)
+Written by Alexander Kanitz on 2013-11-21
+';
+}
+#-----------------------#
+sub sam_uncollapse {
+### Function: For each line of a SAM file, parses the identifier QNAME for the presence of a number n appended to its end via a dash ('-') and re-writes the line n times (exactly once if no such number is present). Header lines are reproduced as they are.
+### Accepts: 1. Input filename [SAM]; 2. Output filename [SAM]; 3. Switch: if true, a serial number (1..n) is appended to each written QNAME, separated by a "."
+### Returns: n/a (dies if the input or output file cannot be opened)
+### Dependencies: File-level option variable $quiet
+### Type: Generic
+
+    #---> PASS ARGUMENTS ---#
+    my $in = shift;
+    my $out = shift;
+    my $suffix = shift;
+
+    #---> STATUS MESSAGE <---#
+    print STDERR "Processing file '$in'..." . "\n" unless $quiet;
+
+    #---> SUBROUTINE VARIABLES <---#
+    my $line;
+
+    #---> BODY <---#
+
+        #---> Open files (abort right away instead of silently reading/writing nothing) <---#
+        open IN, "<", $in or die "[ERROR] Could not open file '$in'!\nExecution aborted\n";
+        open OUT, ">", $out or die "[ERROR] Could not open file '$out'!\nExecution aborted\n";
+
+        #---> Re-write file line by line <---#
+        while ($line = <IN>) {
+            # Print header line
+            print OUT $line and next if $line =~ m/\A\@\w\w\t/;
+            # Get QNAME
+            my ($id, $rest) = split /\t/, $line, 2;
+            # Find and remove appended copy number n and keep it as the repeat
+            # count; capture groups are only meaningful after a successful
+            # substitution, so a QNAME without copy number is written once
+            my $repeat = ( $id =~ s/-(\d+)\Z// ) ? $1 : 1;
+            # If --suffix option is set...
+            if ($suffix) {
+                # Iterate over number of identical reads/alignments
+                for my $serial (1..$repeat) {
+                    # Recreate line with serial number suffix
+                    $line = join "\t", "$id.$serial", $rest;
+                    # Print line
+                    print OUT $line;
+                }
+
+            }
+            # Else...
+            else {
+                # Recreate line
+                $line = join "\t", $id, $rest;
+                # Print line n times
+                print OUT $line x $repeat;
+            }
+        }
+
+    #---> Close file <---#
+    close OUT;
+    close IN;
+
+    #---> STATUS MESSAGE <---#
+    print STDERR "File '$out' written." . "\n" unless $quiet;
+
+}
+#=======================#
+# SUBROUTINES END #
+#=======================#
diff --git a/test/cluster_map.json b/test/cluster_map.json
new file mode 100644
index 0000000..f771d03
--- /dev/null
+++ b/test/cluster_map.json
@@ -0,0 +1,67 @@
+{
+  "__default__" :
+  {
+    "queue": "6hours",
+    "time": "05:00:00",
+    "threads": "1",
+    "mem": "4G"
+  },
+
+  "cutadapt":
+  {
+    "threads":"{resources.threads}"
+  },
+
+  "mapping_genome_segemehl":
+  {
+    "queue": "1day",
+    "time": "{resources.time}:00:00",
+    "threads":"{resources.threads}",
+    "mem":"{resources.mem}G"
+  },
+
+  "mapping_transcriptome_segemehl":
+  {
+    "queue": "1day",
+    "time": "{resources.time}:00:00",
+    "threads":"{resources.threads}",
+    "mem":"{resources.mem}G"
+  },
+
+  "mapping_genome_oligomap":
+  {
+    "time": "{resources.time}:00:00",
+    "threads":"{resources.threads}",
+    "mem":"{resources.mem}G"
+  },
+
+  "mapping_transcriptome_oligomap":
+  {
+    "time": "{resources.time}:00:00",
+    "threads":"{resources.threads}",
+    "mem":"{resources.mem}G"
+  },
+
+  "sort_transcriptome_oligomap":
+  {
+    "threads":"{resources.threads}"
+  },
+
+  "sort_genome_oligomap":
+  {
+    "time": "{resources.time}:00:00",
+    "threads":"{resources.threads}"
+  },
+
+  "oligomap_genome_toSAM":
+  {
+    "time": "{resources.time}-00:00:00",
+    "queue": "{resources.queue}day"
+  },
+
+  "remove_inferiors":
+  {
+    "threads":"{resources.threads}",
+    "mem":"{resources.mem}G"
+  }
+}
diff --git a/test/cluster_prepare_annotation.jsob b/test/cluster_prepare.json
similarity index 100% 
rename from test/cluster_prepare_annotation.jsob rename to test/cluster_prepare.json diff --git a/test/config_map.yaml b/test/config_map.yaml new file mode 100644 index 0000000..d082d98 --- /dev/null +++ b/test/config_map.yaml @@ -0,0 +1,45 @@ +--- + +############################## GLOBAL PARAMETERS ############################## + +# Directories +output_dir: "results/" +local_log: "logs/local" +cluster_log: "logs/cluster" +scripts_dir: "../scripts" + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "results/homo_sapiens/chrY/genome.processed.fa" +gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" +transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" +genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" +exons: "results/homo_sapiens/chrY/exons.bed" +header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 +p_value: 50 + +# Tool parameters: adapter removal +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 + +# Tool parameters: mapping +max_length_reads: 30 +nh: 100 + +# Sample information +input_dir: "test_files" +sample: ["test_lib"] + +######################## PARAMETERS SPECIFIC TO SAMPLE ######################## + +test_lib: + adapter: "AACTGTAGGCACCATCAAT" + format: "fa" + +... diff --git a/test/config_prepare_annotation.yaml b/test/config_prepare.yaml similarity index 79% rename from test/config_prepare_annotation.yaml rename to test/config_prepare.yaml index 6a7ada5..ed62e83 100644 --- a/test/config_prepare_annotation.yaml +++ b/test/config_prepare.yaml @@ -2,17 +2,17 @@ ############################## GLOBAL PARAMETERS ############################## -## Isomirs annotation file -## Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. 
-bp_5p: [-1,0,+1] -bp_3p: [-1,0,+1] - -## Directories +# Directories output_dir: "results" scripts_dir: "../scripts" local_log: "logs/local" cluster_log: "logs/cluster" +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [-1,0,+1] +bp_3p: [-1,0,+1] + # List of "organism/prefix" organism: ["homo_sapiens/chrY"] @@ -29,7 +29,7 @@ homo_sapiens/chrY: # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" # Chromosome name mapping parameters: - column: 1 # Column number from input file where to change chromosome name - delimiter: "TAB" # Delimiter of the input file + column: 1 + delimiter: "TAB" ... diff --git a/test/expected_output.files b/test/expected_output.files index ad5ad26..c20e63c 100644 --- a/test/expected_output.files +++ b/test/expected_output.files @@ -1,19 +1,47 @@ -results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa +results/homo_sapiens/chrY/chr_size.txt +results/homo_sapiens/chrY/exons.bed +results/homo_sapiens/chrY/exons.gtf +results/homo_sapiens/chrY/gene_annotations.filtered.gtf +results/homo_sapiens/chrY/genome_index_segemehl.idx results/homo_sapiens/chrY/genome.processed.fa results/homo_sapiens/chrY/genome.processed.fa.fai -results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.98.gtf -results/homo_sapiens/chrY/gene_annotations.filtered.gtf +results/homo_sapiens/chrY/headerOfCollapsedFasta.sam +results/homo_sapiens/chrY/isomirs_annotation.bed +results/homo_sapiens/chrY/mirna_chr_mapped.gff3 +results/homo_sapiens/chrY/mirna_filtered.bed +results/homo_sapiens/chrY/mirna_filtered.gff3 +results/homo_sapiens/chrY/mirna_mature_filtered.bed results/homo_sapiens/chrY/transcriptome.fa results/homo_sapiens/chrY/transcriptome_idtrim.fa results/homo_sapiens/chrY/transcriptome_index_segemehl.idx -results/homo_sapiens/chrY/genome_index_segemehl.idx 
-results/homo_sapiens/chrY/exons.gtf -results/homo_sapiens/chrY/exons.bed +results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.98.gtf +results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa results/homo_sapiens/chrY/raw/mirna.gff3 results/homo_sapiens/chrY/UCSC2ensembl.txt -results/homo_sapiens/chrY/mirna_chr_mapped.gff3 -results/homo_sapiens/chrY/mirna_filtered.gff3 -results/homo_sapiens/chrY/mirna_filtered.bed -results/homo_sapiens/chrY/chr_size.txt -results/homo_sapiens/chrY/mirna_mature_filtered.bed -results/homo_sapiens/chrY/isomirs_annotation.bed +results/test_lib/catMappings.sam +results/test_lib/collapsed.fasta +results/test_lib/concatenated_header_catMappings.sam +results/test_lib/convertedSortedMappings_test_lib.bam +results/test_lib/convertedSortedMappings_test_lib.bam.bai +results/test_lib/cut.fasta +results/test_lib/fa/reads.fa +results/test_lib/filtered_for_oligomap.fasta +results/test_lib/formatted.fasta +results/test_lib/GenomeMappings.sam +results/test_lib/header_sorted_catMappings.sam +results/test_lib/nhfiltered_GenomeMappings.sam +results/test_lib/nhfiltered_TranscriptomeMappings.sam +results/test_lib/noheader_GenomeMappings.sam +results/test_lib/noheader_TranscriptomeMappings.sam +results/test_lib/oligoGenome_converted.sam +results/test_lib/oligoGenome_map.fa +results/test_lib/oligoGenome_report.txt +results/test_lib/oligoGenome_sorted.fa +results/test_lib/oligoTranscriptome_converted.sam +results/test_lib/oligoTranscriptome_map.fa +results/test_lib/oligoTranscriptome_report.txt +results/test_lib/oligoTranscriptome_sorted.fa +results/test_lib/segemehlGenome_map.sam +results/test_lib/segemehlTranscriptome_map.sam +results/test_lib/TranscriptomeMappings.sam +results/test_lib/TransToGen.sam diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 57c9cf0..d176435 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,19 +1,47 @@ -eb44404d89516497e6480d4dd33f2381 
results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa +1e6a0b3d0e678014f87afdd80f4025b9 results/homo_sapiens/chrY/chr_size.txt +51ac61c61825929f8f05c4b4f821f04d results/homo_sapiens/chrY/exons.bed +6fe52e2e126ef2e0c368fb1bf267f453 results/homo_sapiens/chrY/exons.gtf +0b3dfe8cf4d644637671572fca629f69 results/homo_sapiens/chrY/gene_annotations.filtered.gtf +11b0b7c50160aa8837dd92eda516c124 results/homo_sapiens/chrY/genome_index_segemehl.idx 583f395125f769102ff08ff84b60e0d3 results/homo_sapiens/chrY/genome.processed.fa f37a213f94d11bf2260f50f2c9f199d2 results/homo_sapiens/chrY/genome.processed.fa.fai +40054d82cc01b4b44dbe476bdb50141c results/homo_sapiens/chrY/headerOfCollapsedFasta.sam +909a2fc878c5ac0437344e4f5c6e58e3 results/homo_sapiens/chrY/isomirs_annotation.bed +ba7404239073e3b67204af1803729884 results/homo_sapiens/chrY/mirna_chr_mapped.gff3 +a923f50eea2708cd889886ae5179ee18 results/homo_sapiens/chrY/mirna_filtered.bed +91e1facd80f93ef61f242050dd7d03c3 results/homo_sapiens/chrY/mirna_filtered.gff3 +e7e85f57e0476d1805c1cb64131dd75c results/homo_sapiens/chrY/mirna_mature_filtered.bed d5eaafa9aec63e3fab632fc49392b54b results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.98.gtf -0b3dfe8cf4d644637671572fca629f69 results/homo_sapiens/chrY/gene_annotations.filtered.gtf +eb44404d89516497e6480d4dd33f2381 results/homo_sapiens/chrY/raw/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa +6bc49275f74ed1b43d80cf7598d387b9 results/homo_sapiens/chrY/raw/mirna.gff3 5ab1c2f39ab35fabc6673c73beb3097b results/homo_sapiens/chrY/transcriptome.fa bf1e37165b908729327599801ff5147b results/homo_sapiens/chrY/transcriptome_idtrim.fa a5a6fd2cab7d7919b80761fc25f2777a results/homo_sapiens/chrY/transcriptome_index_segemehl.idx -11b0b7c50160aa8837dd92eda516c124 results/homo_sapiens/chrY/genome_index_segemehl.idx -6fe52e2e126ef2e0c368fb1bf267f453 results/homo_sapiens/chrY/exons.gtf -51ac61c61825929f8f05c4b4f821f04d results/homo_sapiens/chrY/exons.bed -6bc49275f74ed1b43d80cf7598d387b9 
results/homo_sapiens/chrY/raw/mirna.gff3 d2095c371c9b8b2c7cacd1024abf2d18 results/homo_sapiens/chrY/UCSC2ensembl.txt -ba7404239073e3b67204af1803729884 results/homo_sapiens/chrY/mirna_chr_mapped.gff3 -91e1facd80f93ef61f242050dd7d03c3 results/homo_sapiens/chrY/mirna_filtered.gff3 -a923f50eea2708cd889886ae5179ee18 results/homo_sapiens/chrY/mirna_filtered.bed -1e6a0b3d0e678014f87afdd80f4025b9 results/homo_sapiens/chrY/chr_size.txt -e7e85f57e0476d1805c1cb64131dd75c results/homo_sapiens/chrY/mirna_mature_filtered.bed -909a2fc878c5ac0437344e4f5c6e58e3 results/homo_sapiens/chrY/isomirs_annotation.bed +db60b643dd35fe014a452a30ce748f84 results/test_lib/catMappings.sam +da07cdd64fddbc1d018c92c7b8b3c9bd results/test_lib/collapsed.fasta +a8579cb9828810a81a9be000a6c0c38d results/test_lib/concatenated_header_catMappings.sam +244208bcf475ce5eac0940fc15e477fd results/test_lib/convertedSortedMappings_test_lib.bam +a1a1afd1e5ed8e4cc81d380c8456777c results/test_lib/convertedSortedMappings_test_lib.bam.bai +c0daa909634f9611954188928adf87cb results/test_lib/cut.fasta +6f36e04dc0cf4ce4a0115445ac133a86 results/test_lib/fa/reads.fa +a8239c8468e0f1a32eedf1a1f3d4b572 results/test_lib/filtered_for_oligomap.fasta +6c03db8848d24a36ad31879cadec7582 results/test_lib/formatted.fasta +c994ca3e27f45cf0d8260dc4faf5d3fa results/test_lib/GenomeMappings.sam +f4d86fc90874aeeed5d4bff4eacd6bb3 results/test_lib/header_sorted_catMappings.sam +c994ca3e27f45cf0d8260dc4faf5d3fa results/test_lib/nhfiltered_GenomeMappings.sam +6b9e79f12cb9e7d38827d396034ac62e results/test_lib/nhfiltered_TranscriptomeMappings.sam +db60b643dd35fe014a452a30ce748f84 results/test_lib/noheader_GenomeMappings.sam +d41d8cd98f00b204e9800998ecf8427e results/test_lib/noheader_TranscriptomeMappings.sam +1af83e4998536f0bb07af3785c22e455 results/test_lib/oligoGenome_converted.sam +d7fb4b61c4e21aa59d709cd63aa8a34b results/test_lib/oligoGenome_map.fa +cf92ecdb9bc4ad395a2d4d1cde8e85c2 results/test_lib/oligoGenome_report.txt 
+81d670b806fbf429e1df7b31721dcb9c  results/test_lib/oligoGenome_sorted.fa
+d41d8cd98f00b204e9800998ecf8427e  results/test_lib/oligoTranscriptome_converted.sam
+d41d8cd98f00b204e9800998ecf8427e  results/test_lib/oligoTranscriptome_map.fa
+2909ec89b63190055195f8052561073f  results/test_lib/oligoTranscriptome_report.txt
+d41d8cd98f00b204e9800998ecf8427e  results/test_lib/oligoTranscriptome_sorted.fa
+78f3db3abce3bd901e01965da7a674a6  results/test_lib/segemehlGenome_map.sam
+6b9e79f12cb9e7d38827d396034ac62e  results/test_lib/segemehlTranscriptome_map.sam
+6b9e79f12cb9e7d38827d396034ac62e  results/test_lib/TranscriptomeMappings.sam
+d41d8cd98f00b204e9800998ecf8427e  results/test_lib/TransToGen.sam
diff --git a/test/test_cleanup.sh b/test/test_cleanup.sh
new file mode 100755
index 0000000..5f2f6d7
--- /dev/null
+++ b/test/test_cleanup.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Tear down test environment: return to the caller's directory and report the exit status
+cleanup () {
+    rc=$?
+    cd "$user_dir"
+    echo "Exit status: $rc"
+}
+trap cleanup EXIT
+
+# Set up test environment
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"  # the test scripts run (and write their artefacts) in this directory, not in the caller's
+
+# Remove all files generated by the test runs
+rm -rf .snakemake/
+rm -rf .tmp/
+rm -rf logs/
+rm -rf results/
+rm -rf snakemake_report_*.html
+rm -rf wget-log*
diff --git a/test/test_dag.sh b/test/test_dag.sh
index 5112be1..2b52bb5 100755
--- a/test/test_dag.sh
+++ b/test/test_dag.sh
@@ -3,8 +3,6 @@
 # Tear down test environment
 cleanup () {
     rc=$?
- rm -rf .snakemake - rm -rf logs/ cd $user_dir echo "Exit status: $rc" } @@ -18,12 +16,22 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run tests +# Run test: prepare workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ --dag \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/workflow_dag_prepare_annotation.svg" + | dot -Tsvg > "../images/workflow_dag_prepare.svg" + +# Run test: map workflow +snakemake \ + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --dag \ + --printshellcmds \ + --dryrun \ + --verbose \ + | dot -Tsvg > "../images/workflow_dag_map.svg" diff --git a/test/test_files/test_lib.fa.gz b/test/test_files/test_lib.fa.gz new file mode 100644 index 0000000000000000000000000000000000000000..73e84e6eda918245126b8c6277446841aa7ae210 GIT binary patch literal 4492 zcmb2|=HO^No)^c!T%4PjlM|nrS5R7_mzKz|HYRe?>Rm#AzQ=Vq82_2W(Q3Da@ol}X z`%KlG^6eWWpH#e;ThQ&Y>Xekp0ee%;<vn#z8uuT!j}3etX&=<La!F8<pXKy9$(}nV z82<}?b@Z3Nj^(jm`=<npHf#KA(Ehsb*YQJ(=g9p(vt><p?UmVHEN)$2?{m!vn)v&* z>H|TU>lG|ts+?JBjxiUe=W<7d%Kd+{<ZYJx4pqLacaP`Kp2Gh*StezT?pM*0ZT%)2 zR`%B?23=d*-&nTlZoFFRo_dFefeeY-QBFKTtLA1*uCjl(rd{(<r{FTrtJktyH*DR^ zynEHN%-FMw?N&~Z7W~f0_5ahkmT!wUa_X$S{#v+V#z~7Uy?Z95U&|H^@l{-L+~7pl z3U%?U^X^WwPVTh+*6{7klQ+L4+%sk!@hx+m;lX^k<<sQ3*N-!(rDyGv3%?NHxO(nq zTUEWuk#`r%t(k22nz<~?aHZ5H%d&^gi%j?q>TYFIde<mYlXT{j`ZfcvD|6L@nU5S; z5Ny3Jlr>aYx$>*jTJ^np87p0q4g|UfR>l9G=~&ECP%@oiQU0I(0cTzJFqWO@cycQw z;H$&r1(#in83cLP-4e|{a&~<|+x|yZat-%H3)O9=-Ce!l1pljwbDr)+ePxrcea^p7 zqCYXyK69zu8v_v~hrosfi#!-?^dB!d${%y?O=0CLg~<UOPpo`){R)zdmemp65>>D8 zTRY>d#MNKZMNe$t;tEzSNDYxGW}PelI3!9j`>1GH*Nl>@-<FBR#PzT0DPlI=+7;av z+Sa<wd|T7C4toRc@C#A-lI+Hgr(GBMmzKPYe|_UwN`gRAM@Ij3E_ntnhiuXH)1n?{ zC0Y5iu3b9Kk!@#;@XV<2mj#w0$Lj)P|1LYWsQRPrf2OU*j`x4gco+WaYVj2VJGG5n 
zSD77EeZnrhvRvViU=qz4-z<AF!liYhAy3Qs5bpI{iW*97?p|7pWE{e}&R@^{KdtTZ z?3r5U)9n><j+7gR&fA=jutevL+`q{uDr1se+@)Dq689c0DTwC~RSj347A;yL7*Mf0 z_BQjoaFvfMxy*&9NVXST@XXumRCz$*{2RTK3tLy5;c#V~rg(+(R;=RHknA~yHzm## zRYmQW(wX~NJAK*La;Gly+N+TXd>+R{Px+Q!`L-l@?}NgJE^^NeO<m<(|8*?jyd}A; z^ud#MlZu(s4AyL0Tk(3!4*L@S?Z-Uo-X+SFE#z9TEI8QkCqw3*d18EQiR(9sHi}I6 z{gm$t&y)0fn`{*q_-NjKnULdq)a3Ff)7?iWpV}N8@vdhX_loDTf$Q2&{xJ8cvS<C@ z!Mu8Pmt;fb{i}bBSj}@j>2W7t)0}a<!$BZTp_d^j*uwbZOg;g<j0C%MWj2XzNwYLp zG+y2}$7jodJqDM4Ejj;6t-kFPd*<XaCf^SSWXv9VUgu8y^qQk~+1mw4dXv{HCOw$2 zFKOL@H?NIj&wM-WbH;K1k$w7Iwf|qczpT0z{pHEw8JbO8X=?SG%g)t{WUAL&=`Pow zTQ+(7;!_j3o->vo>2p`_)k^y+YusW#`*X+{))|KnT?o3HroZNRcS+3Z+gA$D%wSuW zRrGr9&zD>amj`~f%``ro)lgZxCQR_nlS7&I-tx@XUUlsJzT4n#q~TxpGtK+IyN91~ zI1?xmsrFod`-bIHx^@bEt_u~9zZUqJ=fZF1KR@<qnS`?!KCk}Y$jxwdx<mcHOTRfC zHpXmU>%ku>$-E-f`?zRY);hnG^U<1hF}r8mI<UW*%*A^t=p0w`;w!m*Mqf_7{Nu&q zF#Wgg-<uf?VdcB#bKVziP<{2vuUwox!gil)z)}%Mb~DihQVX`r$ZK4!_tUxg_V|)N z_u{!P2yRtAaOb?(GDhjhDXZ2Q*Ka!Uatqr9hYR<2^?Z-o#ju4(<_1FvYsQq7(K(A} z>|gQtL;huHhpQ>m-DSQpFHw6kx0m(JI>s}l0!a}`a|-tVs5NgZn`|4uKbNPgmMxS2 zLUE%~U2bNA_x`_^!aJYyFF37uLEhyz-!7&vbzJ73WpD4PF0Z?JOk?BO4N=T-Nna$J zpF2O+Pd(!)_EdSn)y{j}*DI=<I2Z6O(3l%Azu@kMf;^eT3BrfpW-jG@Xc;_X@);T9 znrUMBl8(xbw$W9lDHn7Y1&+%;vVO_9>kixBr;8cZ>OGcQuN(VaJ#eFX9@~o@djn_8 z2n}N_(GKfQv6mB`cI*B(t(3@Bk2hZqyHn)f{HC5k@rC`iU+0%+ef={*e8Ds07nK<e zx0Dy8%drNO%WmQ44X9_{ayz7ft*MUd<j0bOsjJ;*9v2HdImxBW+s0=u2k&QuiN z=2&yb=K|xZ%HWB?;tuw991CV??Rs>o&^*Wb^403-MbEudm%N!bWy-C<4O*>V7gWX^ zs65ci6}$7RxeRk}deQu7hN^2z=GQT-%?YnBoj3DNP_#{0U0aCe?L~`!D9;sGetVgk z<kk}xRJW}wijADM^5#(=Lz`RL4RK0u`tNHWXABWG>GdvPFl~>nGC09}#V_6L%0AW| zd*mPNVScgTZOf_ZK($=;ty8bK<OVZohc4U^5qB$uMV_JY&9BGoAN_b=h%eOXzrNb* z&PkySJ}YwOufN{x5a1wSpxdrq7jMB5a9FI5GivEe(;Fw(950#4pMK{7!>l~s*Nl&E zZd2U<<w}ypormS0mYkQ3yYcWo<JK)_b$OO2zCG2kRpr$q+0LE)DxXvCs4#1)&N{eo zn)ESVOU5jYU(>UGwsGr!TVuKZXlTPO+qFgI0guc=wuY$b{#9zW%@1M>x}&0X#p6w+ zciJ9)Wv)JTX-^fs?Y>?;6L}7*+O6EEQ)8mfSA8-)^5!M$b6d|{T32Y_pw&>sw`fc6 
z>`nI^Ua1xtq~1)K`T5$@-234*>kN-vtIR66%JBEs4`F592C1fd=M;b1@~rum`TDe= z)xT|DUoH9VnP#_Q!=|&TE-Nb{KZwk{Jt4Q~_H5g=YTlo&ADYH^<-Vc-!y(1RGC!wI z4_vjrrEj)cW#z)1FL%B3;M^HF@j>Z^nL*{>G?`olQ$5a|J-evt8iVb+neXizX0df8 zSFgMJ)8Wj;&CD+*Tn)CZHrw{P_UEHRUGub6YtusxpE6EX@;J}!Qn2COX)(!)M{oB_ z1~He*9kTE`xj6sIi>OSqH$Gpw;=eBEas0et)2s#CR%cjc{T2^er#&@AkN?+#XEsZZ zi5%E@F>0b|>qe99<$*J<E<SSkgz2X%({4pDZ~q=45qv9J%W3+lb$9;?h;O@kVfx2S zZ{GTf*~|5t2c$C|U3lnW<<e*V8=YQj_x@X=)F9O)r(A1y!aw)s-*ZnACeDgk5)<xa zw!-6B!<7BYPKnzGyH9;+lJQgO^w&+-<<q=8w?xG+yEjku<GfEbLSI%tX85AhdAYGA z{mW%fpQZEOx*JM9J;^H;G@nE2JbS?7UHyKMLQc=7bePTS@4IyCfN&bu<_ACi9noA6 zwk;+8<i|fAZnq<E9seZu{MBr}-JUCIt}MLD?)aGVaKz_jMcKQ<PQCTK6dF}@rg~Sy z=h~zbyVy$kS6-N!)h&M}Zi15cG2JNUB2kY$AL4WO>}F8D9j~x-S5S@LWrigtOD-6v zYt)yobGE*nFMFIf&}et@gq>@JpZ{E;oTw%LO+mcui&*gOwJs}D!*_dXxUSr)ubSiI zd+NNebn>I^Ed}?@|A%QVnmFN7yXg-_HpUY@3{|t5yxDGjU+JQ{Jol*gZ#Oj=JI=!V zOL4!wm}-mVMGL2OzW?{Q;Ao=y64#&aT0Ur<nJ}qW{Lb!aPgtW=g#Sc-JkP?ji+e_t z&avJJMpJg>%((Me@Jvy;ob1nc+j_n$JngfylJzf`am0>cmS5^wfnR?gI!wR(dx}u< zy!ns2<S)fV%;$-GmigxR<%az$eB(1O1lc&0uTT$m3ry?$niQ83dtydlbo3HIr}dNE zYOi&_`t)~R>P+tLNv6~7;-Bz7o)P79B&0MyGGf`K;?EUlY|EEO{he{~qvI^eAj|Id zliOC`I5qRmvrCg*j3xyuO|>vkUf5#)Nn-P<x$4_mQ%^a1eP4QMTW`lh*3bKFSMGW1 zB&Nx|a?;bq9%nvp2+p!=(7d^B(UG#9o|PUOFBi#GOs`p=F8KAqgkL?LTAs%FUArIH z{XNZa_hLfc;p-~*f8V^kpr_S)veE3+!ldsFsu#9I-?<iT{QItSSIDU!OU}xaSzNm{ zk#*bo>t-t1SG#7ejIXKAc-M7WZ^50iSrSX<<sQ|!eaEmki&buBo<XjeJ=@->i*|x( z2DxV2!*<LKJMv9b|C>;8^5@5|Tw>Svv1UvN<55~vAh<bY?!ODwD->Ry{}sZp|IN3A zCzFe7rtq>|t^KXSnLRtr<gK!P;e#vRu4<Yj2sBK|KL5u3WrZrEgs*h(&1H#STyMXT z=3G#FdyB!Yi%#p6Op{jLobhw<vfy2dGM&3NFdUPt6jJq<NG|)O!M*oD%M^<T({E|c zpLN>snC#}8A9p4-{F=w~d2Q5DiA}3>xKwm^%ndtt)bglou5m(M<vyGJGA!<!vX0-D z+_&jWki_oCCnsvZowK|l<bKi8EkdqSa*iH6+H<Y*^xf~~+F5>{R<gf%^JXvVauM8Z z(>$a0sIj@Z*n<e!g*WF1%(K6ACUCZmaJA*<H2v%R4MFUm1*g19pL=DS%ZX!Ik5v}E z?fKWkeS7}<<DA#|eKqaVCmx92zU|I~=bq)?TRlE|P4U0Ga@`b>V;g(9dylhQ%P+Fc zuQ=ZE$<}u7+1qTper_pW8*B6N@47AKvQZzO9saG$?KkV-Ylpb`_diatZ{Kv`y8iQP 
zsq-XfabL-wxJ7V<+fONltsTtYYNk6Jd|N(usYtTVjbH)qqnBQ75%{J1ETv=0`w3Cs zHav-CPIQTw9~=4Xdg6TFXrAn^Cl*ED7gsL7_jm6mR)@EGi<7n2iha-8nE%bQCfUu+ zLUaGV-)d_5xvFN{7tFjZvDVc$FE8V;^X7oFHT<0lpXQ{#{;XBsn>$CPu;^~-HI?l3 zE^${y<^KDfe5u|aUVCW%<%4f_&dRI*6y_}Y(*J%);q3&qYf-=T6uK`>T;J%_)iy<D z#%&90?^RL1_xQiHE}zKuV)ML>q0i!OPAN_1*3~=w(0o&a+)9p%nY-juetw?lP;;~3 z-ew#1`<c2Po)QP=7iH$0`(!8aZ`F61Xug^DZ*=Y*`;n2Z{c4gS>-vDRcb2GgHqAOy zANV<2n<MXayWG0{C$e|t)Ms!$d6cJZ?QN63RQc>u%^+!^$gRtdZ3=xCur26OYuL-T z2ljkw3Eg&7S>`?KJu`;qe)+3^?R|c0VMzI(qL*@vsW$SK@3XJ^JvbZPr+sAA-*@>> zbZ)0F`51Y5W{~>ilAkZGY~!<LJhd{kJHJBzg2KMtdu@uOvm$B?qt{u-*~=d;uV?tm zu=Kj8wo|;cu7H{RVe#eFLH=*Wy#hCH{CfCIk(zpz`%9m-xsQ%W?fousQuxfAYaRQf zHi)X`nwYKInmAi~%ELy}y5;ko@@A%PQZu<LUAANAuE^Opi*9rLG}p*q%EytWALt%d zP`)zRwK1pv>LrVlNrfl$w6lG)w|=?V6=(VJWbrFg-srcF%qw+!l@`X&+w}5&(0?7a zLbW4VYGqDqs<}g7Y`U$O`Z%h4PhS5Nrt6PdjepBs5>i<5=CX6q6tj-W#``Vq=jiCK zHT|}8gQjNw%wu2g8lE=SyPdPG^smG5tL9Oiey^Y3ZQ~Y^e)oITsmj}LJXC+|Tb05X z*5%~!c4b$K&nDTUzn<N&$gz5~@yPnQnWe7}@6L|V{qRES<x8o$_`6F?Tv`?;Z8VtA zb%%YCr9<dq$Ind*q$AnZ>EFCy?wj*o+2!PdgZo&IKZtA8{L=iUcyD-w&@WB*pr2tA zTc0kT+gu=BaGI~tvi#qd^Qpn}U07olGNs0^h&?ko_jGrKH+R+CWv7CApX@!l{~d?! z-es5Kwkn=(`EmNvqT8E-n{rMI-_c3EUVGjrw@^69OQf{=*1UwO)AKBDe!Q$Bd{<<n zHQV$iZSjd&tvxH`q@w@MPEs{E{atLua#NeHJHN&HpW2cuKRNdLTQ>9e8ok@M<aYjN Nv`YG6ur-^30RUthxVHcR literal 0 HcmV?d00001 diff --git a/test/test_rule_graph.sh b/test/test_rule_graph.sh index c9e4399..65ca37b 100755 --- a/test/test_rule_graph.sh +++ b/test/test_rule_graph.sh @@ -3,8 +3,6 @@ # Tear down test environment cleanup () { rc=$? 
- rm -rf .snakemake - rm -rf logs/ cd $user_dir echo "Exit status: $rc" } @@ -18,12 +16,22 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run tests +# Run test: prepare workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ --rulegraph \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/rule_graph_prepare_annotation.svg" + | dot -Tsvg > "../images/rule_graph_prepare.svg" + +# Run test: map workflow +snakemake \ + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --rulegraph \ + --printshellcmds \ + --dryrun \ + --verbose \ + | dot -Tsvg > "../images/rule_graph_map.svg" diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh index 7d6be0a..8925277 100755 --- a/test/test_workflow_local.sh +++ b/test/test_workflow_local.sh @@ -3,11 +3,6 @@ # Tear down test environment cleanup () { rc=$? 
- rm -rf .snakemake/ - rm -rf .tmp/ - rm -rf logs/ - rm -rf results/ - rm -rf snakemake_report_*.html cd $user_dir echo "Exit status: $rc" } @@ -21,10 +16,10 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run tests +# Run test: prepare workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ --use-singularity \ --singularity-args "--bind ${PWD}/../" \ --cores=4 \ @@ -32,19 +27,35 @@ snakemake \ --rerun-incomplete \ --verbose -# Snakemake report +# Run test: map workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ - --report="snakemake_report_prepare_annotation.html" + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --use-singularity \ + --singularity-args "--bind ${PWD}/../" \ + --cores=4 \ + --printshellcmds \ + --rerun-incomplete \ + --verbose + +# Snakemake report: prepare workflow +snakemake \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ + --report="snakemake_report_prepare.html" + +# Snakemake report: map workflow +snakemake \ + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --report="snakemake_report_map.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; find results/ -type f -name \*\.zip -exec sh -c 'unzip -o {} -d $(dirname {})' \; md5sum --check "expected_output.md5" -# Checksum file generated with -# find results/ \ -# -type f \ -# > expected_output.files; +# Generate checksum files +# (run only when using new test data and after verifying results!) 
+# find results/ -type f > expected_output.files; # md5sum $(cat expected_output.files) > expected_output.md5 diff --git a/test/test_workflow_slurm.sh b/test/test_workflow_slurm.sh index 905fa44..c72b678 100755 --- a/test/test_workflow_slurm.sh +++ b/test/test_workflow_slurm.sh @@ -3,11 +3,6 @@ # Tear down test environment cleanup () { rc=$? - rm -rf .snakemake/ - rm -rf .tmp/ - rm -rf logs/ - rm -rf results/ - rm -rf snakemake_report_*.html/ cd $user_dir echo "Exit status: $rc" } @@ -20,15 +15,15 @@ set -x # facilitates debugging by printing out executed commands user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -mkdir -p logs/cluster/homo_sapiens/GRCh38.98_chrY -mkdir -p logs/local/homo_sapiens/GRCh38.98_chrY -mkdir -p results/homo_sapiens/GRCh38.98_chrY +mkdir -p logs/cluster/{homo_sapiens/chrY,results/test_lib} +mkdir -p logs/local/{homo_sapiens/chrY,results/test_lib} +mkdir -p results/{homo_sapiens/chrY,results/test_lib} -# Run tests +# Run test: prepare workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ - --cluster-config="cluster_prepare_annotation.json" \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ + --cluster-config="cluster_prepare.json" \ --cluster "sbatch \ --cpus-per-task={cluster.threads} \ --mem={cluster.mem} \ @@ -46,19 +41,46 @@ snakemake \ --rerun-incomplete \ --verbose -# Snakemake report +# Run test: map workflow snakemake \ - --snakefile="../workflow/prepare_annotation/Snakefile" \ - --configfile="config_prepare_annotation.yaml" \ - --report="snakemake_report_prepare_annotation.html" + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --cluster-config="cluster_map.json" \ + --cluster "sbatch \ + --cpus-per-task={cluster.threads} \ + --mem={cluster.mem} \ + --qos={cluster.queue} \ + --time={cluster.time} \ + --export=JOB_NAME={rule} \ + -o 
{params.cluster_log} \ + -p scicore \ + --open-mode=append" \ + --jobscript="../jobscript.sh" \ + --use-singularity \ + --singularity-args="--no-home --bind ${PWD}/../" \ + --cores=256 \ + --printshellcmds \ + --rerun-incomplete \ + --verbose + +# Snakemake report: prepare workflow +snakemake \ + --snakefile="../workflow/prepare/Snakefile" \ + --configfile="config_prepare.yaml" \ + --report="snakemake_report_prepare.html" + +# Snakemake report: map workflow +snakemake \ + --snakefile="../workflow/map/Snakefile" \ + --configfile="config_map.yaml" \ + --report="snakemake_report_map.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; find results/ -type f -name \*\.zip -exec sh -c 'unzip -o {} -d $(dirname {})' \; md5sum --check "expected_output.md5" -# Checksum file generated with -# find results/ \ -# -type f \ -# > expected_output.files; +# Generate checksum files +# (run only when using new test data and after verifying results!) +# find results/ -type f > expected_output.files; # md5sum $(cat expected_output.files) > expected_output.md5 diff --git a/workflow/map/Snakefile b/workflow/map/Snakefile new file mode 100644 index 0000000..dbb10e2 --- /dev/null +++ b/workflow/map/Snakefile @@ -0,0 +1,647 @@ +################################################################################# +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). 
+#################################################################################
+
+import os
+
+localrules: finish  # the target rule is run locally and not submitted as a cluster job
+
+#################################################################################
+### Finish rule: requests the BAM index of every sample listed in config["sample"]
+#################################################################################
+
+rule finish:
+    input:
+        maps = expand(
+            os.path.join(
+                config["output_dir"],
+                "{sample}",
+                "convertedSortedMappings_{sample}.bam.bai"
+            ),
+            sample=config["sample"]
+        )
+
+#################################################################################
+### Uncompress gzipped input files ({sample}.{format}.gz in config["input_dir"])
+#################################################################################
+
+rule uncompress_zipped_files:
+    input:
+        reads = os.path.join(config["input_dir"], "{sample}.{format}.gz")
+    output:
+        reads = os.path.join(config["output_dir"], "{sample}", "{format}", "reads.{format}")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "uncompress_zipped_files_{sample}_{format}.log")
+    log:
+        os.path.join(config["local_log"], "uncompress_zipped_files_{sample}_{format}.log")
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(zcat {input.reads} > {output.reads}) &> {log}"
+
+#################################################################################
+### Quality filter (fastq reads only; thresholds are config["q_value"] and config["p_value"])
+#################################################################################
+
+rule fastq_quality_filter:
+    input:
+        reads = os.path.join(config["output_dir"], "{sample}", "fastq", "reads.fastq")
+    output:
+        reads = os.path.join(config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "fastq_quality_filter_{sample}.log"),
+        p = config["p_value"],
+        q = config["q_value"]
+    log:
+        os.path.join(config["local_log"], "fastq_quality_filter_{sample}.log")
+    singularity:
+        "docker://zavolab/fastx:0.0.14"
+    shell:
+        "(fastq_quality_filter -v -q {params.q} -p {params.p} -i {input.reads} > {output.reads}) &> {log}"
+
+################################################################################# +### Convert fastq to fasta +################################################################################# + +rule fastq_to_fasta: + input: + reads = os.path.join(config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq") + output: + reads = os.path.join(config["output_dir"], "{sample}", "fastq", "reads.fa") + params: + cluster_log = os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log") + log: + os.path.join(config["local_log"], "fastq_to_fasta_{sample}.log") + singularity: + "docker://zavolab/fastx:0.0.14" + shell: + "(fastq_to_fasta -r -n -i {input.reads} > {output.reads}) &> {log}" + +################################################################################# +### Format fasta file +################################################################################# + +rule fasta_formatter: + input: + reads = lambda wildcards: os.path.join(config["output_dir"], wildcards.sample, config[wildcards.sample]['format'], "reads.fa" ) + output: + reads = os.path.join(config["output_dir"], "{sample}", "formatted.fasta") + params: + cluster_log = os.path.join(config["cluster_log"], "fasta_formatter_{sample}.log") + log: + os.path.join(config["local_log"], "fasta_formatter_{sample}.log") + singularity: + "docker://zavolab/fastx:0.0.14" + shell: + "(fasta_formatter -w 0 -i {input.reads} > {output.reads}) &> {log}" + +################################################################################# +### Remove adapters +################################################################################# + +rule cutadapt: + input: + reads = os.path.join(config["output_dir"], "{sample}", "formatted.fasta") + output: + reads = os.path.join(config["output_dir"], "{sample}", "cut.fasta") + params: + cluster_log = os.path.join(config["cluster_log"], "cutadapt_{sample}.log"), + adapter = lambda wildcards: config[ wildcards.sample ]['adapter'], + error_rate = 
config["error_rate"], + minimum_length = config["minimum_length"], + overlap = config["overlap"], + max_n = config["max_n"] + log: + os.path.join(config["local_log"],"cutadapt_{sample}.log") + resources: + threads = 8 + singularity: + "docker://zavolab/cutadapt:1.16" + shell: + "(cutadapt \ + -a {params.adapter} \ + --error-rate {params.error_rate} \ + --minimum-length {params.minimum_length} \ + --overlap {params.overlap} \ + --trim-n \ + --max-n {params.max_n} \ + --cores {resources.threads} \ + -o {output.reads} {input.reads}) &> {log}" + +################################################################################# +### Collapse identical reads +################################################################################# + +rule fastx_collapser: + input: + reads = os.path.join(config["output_dir"], "{sample}", "cut.fasta") + output: + reads = os.path.join(config["output_dir"], "{sample}", "collapsed.fasta") + params: + cluster_log = os.path.join(config["cluster_log"],"fastx_collapser_{sample}.log") + log: + os.path.join(config["local_log"], "fastx_collapser_{sample}.log") + singularity: + "docker://zavolab/fastx:0.0.14" + shell: + "(fastx_collapser -i {input.reads} > {output.reads}) &> {log}" + +################################################################################# +### Segemehl genome mapping +################################################################################# + +rule mapping_genome_segemehl: + input: + reads = os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + genome = config["genome"], + genome_index_segemehl = config["genome_index_segemehl"] + output: + gmap = os.path.join(config["output_dir"], "{sample}", "segemehlGenome_map.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "mapping_genome_segemehl_{sample}.log") + log: + os.path.join(config["local_log"],"mapping_genome_segemehl_{sample}.log") + resources: + mem = 50, + time = 12, + threads = 8 + singularity: + 
"docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x \ + -i {input.genome_index_segemehl} \ + -d {input.genome} \ + -t {resources.threads} \ + -q {input.reads} \ + -outfile {output.gmap}) &> {log}" + +################################################################################# +### Segemehl transcriptome mapping +################################################################################# + +rule mapping_transcriptome_segemehl: + input: + reads = os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + transcriptome = config["transcriptome"], + transcriptome_index_segemehl = config["transcriptome_index_segemehl"] + output: + tmap = os.path.join(config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "mapping_transcriptome_segemehl_{sample}.log") + log: + os.path.join(config["local_log"], "mapping_transcriptome_segemehl_{sample}.log") + resources: + mem = 10, + time = 12, + threads = 8 + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x \ + -i {input.transcriptome_index_segemehl} \ + -d {input.transcriptome} \ + -t {resources.threads} \ + -q {input.reads} \ + -outfile {output.tmap}) &> {log}" + +################################################################################# +### Filter fasta for oligomap mapping +################################################################################# + +rule filter_fasta_for_oligomap: + input: + reads = os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + script = os.path.join(config["scripts_dir"], "validation_fasta.py") + output: + reads = os.path.join(config["output_dir"], "{sample}", "filtered_for_oligomap.fasta") + params: + cluster_log = os.path.join(config["cluster_log"], "filter_fasta_for_oligomap_{sample}.log"), + max_length_reads = config["max_length_reads"], + log: + os.path.join(config["local_log"], "filter_fasta_for_oligomap_{sample}.log") + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python
{input.script} -r {params.max_length_reads} -i {input.reads} -o {output.reads}) &> {log}" + +################################################################################# +### Oligomap genome mapping +################################################################################# + +rule mapping_genome_oligomap: + input: + reads = os.path.join(config["output_dir"], "{sample}", "filtered_for_oligomap.fasta"), + target = config["genome"] + output: + gmap = os.path.join(config["output_dir"], "{sample}", "oligoGenome_map.fa"), + report = os.path.join(config["output_dir"], "{sample}", "oligoGenome_report.txt") + params: + cluster_log = os.path.join(config["cluster_log"], "mapping_genome_oligomap_{sample}.log") + log: + os.path.join(config["local_log"], "mapping_genome_oligomap_{sample}.log") + resources: + mem = 50, + time = 6, + threads = 8 + singularity: + "docker://zavolab/oligomap:1.0" + shell: + "(oligomap {input.target} {input.reads} -r {output.report} > {output.gmap}) &> {log}" + +################################################################################# +### Oligomap genome sorting +################################################################################# + +rule sort_genome_oligomap: + input: + tmap = os.path.join(config["output_dir"], "{sample}", "oligoGenome_map.fa"), + report = os.path.join(config["output_dir"], "{sample}", "oligoGenome_report.txt"), + script = os.path.join(config["scripts_dir"], "blocksort.sh") + output: + sort = os.path.join(config["output_dir"], "{sample}", "oligoGenome_sorted.fa") + params: + cluster_log = os.path.join(config["cluster_log"], "sorting_genome_oligomap_{sample}.log") + log: + os.path.join(config["local_log"], "sorting_genome_oligomap_{sample}.log") + resources: + threads = 8, + time = 6 + shell: + "(bash {input.script} {input.tmap} {resources.threads} {output.sort}) &> {log}" + +################################################################################# +### Oligomap genome mapping output to SAM
+################################################################################# + +rule oligomap_genome_toSAM: + input: + report = os.path.join(config["output_dir"], "{sample}", "oligoGenome_report.txt"), + sort = os.path.join(config["output_dir"], "{sample}", "oligoGenome_sorted.fa"), + script = os.path.join(config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py") + output: + gmap = os.path.join(config["output_dir"], "{sample}", "oligoGenome_converted.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "oligomap_genome_toSAM_{sample}.log"), + nh = config["nh"] + log: + os.path.join(config["local_log"], "oligomap_genome_toSAM_{sample}.log") + resources: + time = 1, + queue = 1 + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python {input.script} -i {input.sort} -n {params.nh} > {output.gmap}) &> {log}" + +################################################################################# +### Oligomap transcriptome mapping +################################################################################# + +rule mapping_transcriptome_oligomap: + input: + reads = os.path.join(config["output_dir"], "{sample}", "filtered_for_oligomap.fasta"), + target = config["transcriptome"] + output: + tmap = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_map.fa"), + report = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_report.txt") + params: + cluster_log = os.path.join(config["cluster_log"], "mapping_transcriptome_oligomap_{sample}.log") + log: + os.path.join(config["local_log"], "mapping_transcriptome_oligomap_{sample}.log") + resources: + mem = 10, + time = 6, + threads = 8 + singularity: + "docker://zavolab/oligomap:1.0" + shell: + "(oligomap {input.target} {input.reads} -s -r {output.report} > {output.tmap}) &> {log}" + +################################################################################# +### Oligomap transcriptome sorting
+################################################################################# + +rule sort_transcriptome_oligomap: + input: + tmap = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_map.fa"), + report = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_report.txt"), + script = os.path.join(config["scripts_dir"], "blocksort.sh") + output: + sort = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa") + params: + cluster_log = os.path.join(config["cluster_log"], "sorting_transcriptome_oligomap_{sample}.log") + log: + os.path.join(config["local_log"], "sorting_transcriptome_oligomap_{sample}.log") + resources: + threads = 8 + shell: + "(bash {input.script} {input.tmap} {resources.threads} {output.sort}) &> {log}" + +################################################################################# +### Oligomap transcriptome mapping output to SAM +################################################################################# + +rule oligomap_transcriptome_toSAM: + input: + report = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_report.txt"), + sort = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa"), + script = os.path.join(config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py") + output: + tmap = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_converted.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "oligomap_transcriptome_toSAM_{sample}.log"), + nh = config["nh"] + log: + os.path.join(config["local_log"], "oligomap_transcriptome_toSAM_{sample}.log") + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python {input.script} -i {input.sort} -n {params.nh} > {output.tmap}) &> {log}" + +################################################################################# +### Merge genome mappings +################################################################################# + +rule merge_genome_maps: +
input: + gmap1 = os.path.join(config["output_dir"], "{sample}", "segemehlGenome_map.sam"), + gmap2 = os.path.join(config["output_dir"], "{sample}", "oligoGenome_converted.sam") + output: + gmaps = os.path.join(config["output_dir"], "{sample}", "GenomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "merge_genome_maps_{sample}.log") + log: + os.path.join(config["local_log"], "merge_genome_maps_{sample}.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {input.gmap1} {input.gmap2} > {output.gmaps}) &> {log}" + +################################################################################# +### Merge transcriptome mappings +################################################################################# + +rule merge_transcriptome_maps: + input: + tmap1 = os.path.join(config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam"), + tmap2 = os.path.join(config["output_dir"], "{sample}", "oligoTranscriptome_converted.sam") + output: + tmaps = os.path.join(config["output_dir"], "{sample}", "TranscriptomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "merge_transcriptome_maps_{sample}.log") + log: + os.path.join(config["local_log"], "merge_transcriptome_maps_{sample}.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {input.tmap1} {input.tmap2} > {output.tmaps}) &> {log}" + +################################################################################# +### Filter NH genome +################################################################################# + +rule nh_filter_genome: + input: + gmaps = os.path.join(config["output_dir"], "{sample}", "GenomeMappings.sam"), + script = os.path.join(config["scripts_dir"], "nh_filter.py") + output: + gmaps = os.path.join(config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "nh_filter_genome_{sample}.log"), + nh = config["nh"] + log: +
os.path.join(config["local_log"], "nh_filter_genome_{sample}.log") + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python {input.script} {input.gmaps} {params.nh} {output.gmaps}) &> {log}" + +################################################################################# +### Filter NH transcriptome +################################################################################# + +rule filter_nh_transcriptome: + input: + tmaps = os.path.join(config["output_dir"], "{sample}", "TranscriptomeMappings.sam"), + script = os.path.join(config["scripts_dir"], "nh_filter.py") + output: + tmaps = os.path.join(config["output_dir"], "{sample}", "nhfiltered_TranscriptomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "filter_nh_transcriptome_{sample}.log"), + nh = config["nh"] + log: + os.path.join(config["local_log"], "filter_nh_transcriptome_{sample}.log") + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python {input.script} {input.tmaps} {params.nh} {output.tmaps}) &> {log}" + +################################################################################# +### Remove header genome mappings +################################################################################# + +rule remove_headers_genome: + input: + gmap = os.path.join(config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam") + output: + gmap = os.path.join(config["output_dir"], "{sample}", "noheader_GenomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "remove_headers_genome_{sample}.log") + log: + os.path.join(config["local_log"], "remove_headers_genome_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools view {input.gmap} > {output.gmap}) &> {log}" + +################################################################################# +### Remove header transcriptome mappings +################################################################################# + +rule
remove_headers_transcriptome: + input: + tmap = os.path.join(config["output_dir"], "{sample}", "nhfiltered_TranscriptomeMappings.sam") + output: + tmap = os.path.join(config["output_dir"], "{sample}", "noheader_TranscriptomeMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "remove_headers_transcriptome_{sample}.log") + log: + os.path.join(config["local_log"], "remove_headers_transcriptome_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools view {input.tmap} > {output.tmap}) &> {log}" + +################################################################################# +### Transcriptome to genome coordinates +################################################################################# + +rule trans_to_gen: + input: + tmap = os.path.join(config["output_dir"], "{sample}", "noheader_TranscriptomeMappings.sam"), + script = os.path.join(config["scripts_dir"], "sam_trx_to_sam_gen.pl"), + exons = config["exons"] + output: + genout = os.path.join(config["output_dir"], "{sample}", "TransToGen.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "trans_to_gen_{sample}.log") + log: + os.path.join(config["local_log"], "trans_to_gen_{sample}.log") + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} --in {input.tmap} --exons {input.exons} --out {output.genout}) &> {log}" + +################################################################################# +### Concatenate genome and transcriptome mappings +################################################################################# + +rule cat_mapping: + input: + gmap1 = os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"), + gmap2 = os.path.join(config["output_dir"], "{sample}", "noheader_GenomeMappings.sam") + output: + catmaps = os.path.join(config["output_dir"], "{sample}", "catMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "cat_mapping_{sample}.log") + log: +
os.path.join(config["local_log"], "cat_mapping_{sample}.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {input.gmap1} {input.gmap2} > {output.catmaps}) &> {log}" + +################################################################################# +### Add header +################################################################################# + +rule add_header: + input: + header = config["header_of_collapsed_fasta"], + catmaps = os.path.join(config["output_dir"], "{sample}", "catMappings.sam") + output: + concatenate = os.path.join(config["output_dir"], "{sample}", "concatenated_header_catMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "add_header_{sample}.log") + log: + os.path.join(config["local_log"], "add_header_{sample}.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {input.header} {input.catmaps} > {output.concatenate}) &> {log}" + +################################################################################# +### Sort mapped file by IDs +################################################################################# + +rule sort_id: + input: + concatenate = os.path.join(config["output_dir"], "{sample}", "concatenated_header_catMappings.sam") + output: + sort = os.path.join(config["output_dir"], "{sample}", "header_sorted_catMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "sort_id_{sample}.log") + log: + os.path.join(config["local_log"], "sort_id_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools sort -n -o {output.sort} {input.concatenate}) &> {log}" + +################################################################################# +### Remove inferior mappings (keeping multimappers) +################################################################################# + +rule remove_inferiors: + input: + sort = os.path.join(config["output_dir"], "{sample}", "header_sorted_catMappings.sam"), + script = 
os.path.join(config["scripts_dir"], "sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl") + output: + remove_inf = os.path.join(config["output_dir"], "{sample}", "removeInferiors.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "remove_inferiors_{sample}.log") + log: + os.path.join(config["local_log"], "remove_inferiors_{sample}.log") + resources: + mem = 15, + threads = 4 + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} --print-header --keep-mm --in {input.sort} --out {output.remove_inf}) &> {log}" + +################################################################################# +### Uncollapse reads +################################################################################# + +rule uncollapse_reads: + input: + maps = os.path.join(config["output_dir"], "{sample}", "removeInferiors.sam"), + script = os.path.join(config["scripts_dir"], "sam_uncollapse.pl") + output: + maps = os.path.join(config["output_dir"], "{sample}", "uncollapsedMappings.sam") + params: + cluster_log = os.path.join(config["cluster_log"], "uncollapse_reads_{sample}.log") + log: + os.path.join(config["local_log"], "uncollapse_reads_{sample}.log") + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} --suffix --in {input.maps} --out {output.maps}) &> {log}" + +################################################################################# +### Convert SAM to BAM +################################################################################# + +rule convert_to_bam: + input: + maps = os.path.join(config["output_dir"], "{sample}", "uncollapsedMappings.sam") + output: + maps = os.path.join(config["output_dir"], "{sample}", "mappingsConverted.bam") + params: + cluster_log = os.path.join(config["cluster_log"], "convert_to_bam_{sample}.log") + log: + os.path.join(config["local_log"], "convert_to_bam_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools view -b {input.maps} > 
{output.maps}) &> {log}" + +################################################################################# +### Sort by coordinate position +################################################################################# + +rule sort_by_position: + input: + maps = os.path.join(config["output_dir"], "{sample}", "mappingsConverted.bam") + output: + maps = os.path.join(config["output_dir"], "{sample}", "convertedSortedMappings_{sample}.bam") + params: + cluster_log = os.path.join(config["cluster_log"], "sort_by_position_{sample}.log") + log: + os.path.join(config["local_log"], "sort_by_position_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools sort {input.maps} > {output.maps}) &> {log}" + +################################################################################# +### Create bam index +################################################################################# + +rule index_bam: + input: + maps = os.path.join(config["output_dir"], "{sample}", "convertedSortedMappings_{sample}.bam") + output: + maps = os.path.join(config["output_dir"], "{sample}", "convertedSortedMappings_{sample}.bam.bai") + params: + cluster_log = os.path.join(config["cluster_log"], "index_bam_{sample}.log") + log: + os.path.join(config["local_log"], "index_bam_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools index -b {input.maps} {output.maps}) &> {log}" diff --git a/workflow/prepare/Snakefile b/workflow/prepare/Snakefile new file mode 100644 index 0000000..9e49339 --- /dev/null +++ b/workflow/prepare/Snakefile @@ -0,0 +1,661 @@ +################################################################################ +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Snakemake workflow to download and prepare the necessary files for +# smallRNA-seq related workflows.
+################################################################################ + +import os + +# Rules that require internet connection for downloading files are included +# in the localrules +localrules: finish, genome_process, filter_anno_gtf, mirna_anno, dict_chr + +################################################################################ +### Finish rule +################################################################################ + +rule finish: + input: + idx_transcriptome = expand( + os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx"), + organism=config["organism"]), + idx_genome = expand( + os.path.join( + config["output_dir"], + "{organism}", + "genome_index_segemehl.idx"), + organism=config["organism"]), + exons = expand( + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed"), + organism=config["organism"]), + header = expand( + os.path.join( + config["output_dir"], + "{organism}", + "headerOfCollapsedFasta.sam"), + organism=config["organism"]), + mirnafilt = expand( + os.path.join( + config["output_dir"], + "{organism}", + "mirna_filtered.bed"), + organism=config["organism"]), + isomirs = expand( + os.path.join( + config["output_dir"], + "{organism}", + "isomirs_annotation.bed"), + organism=config["organism"]) + +################################################################################ +### Download and process genome IDs +################################################################################ + +rule genome_process: + input: + script = os.path.join(config["scripts_dir"],"genome_process.sh") + output: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ) + params: + url = lambda wildcards: config[ wildcards.organism ]["genome_url"], + dir_out = os.path.join(config["output_dir"], "{organism}") + log: + os.path.join(config["local_log"], "{organism}","genome_process.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash 
{input.script} {params.dir_out} {log} {params.url})" + +################################################################################ +### Download and filter gtf by transcript_level +################################################################################ + +rule filter_anno_gtf: + input: + script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"), + output: + gtf = os.path.join( + config["output_dir"], "{organism}","gene_annotations.filtered.gtf" + ) + params: + url = lambda wildcards: config[ wildcards.organism ]['gtf_url'], + dir_out = os.path.join(config["output_dir"], "{organism}") + log: + os.path.join(config["local_log"], "{organism}","filter_anno_gtf.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}" + +################################################################################ +### Extract transcriptome sequences in FASTA from genome. +################################################################################ + +rule extract_transcriptome_seqs: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + gtf = os.path.join( + config["output_dir"], "{organism}","gene_annotations.filtered.gtf" + ) + output: + fasta = os.path.join( + config["output_dir"], "{organism}","transcriptome.fa" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","extract_transcriptome_seqs.log" + ) + log: + os.path.join( + config["local_log"], "{organism}","extract_transcriptome_seqs.log" + ) + singularity: + "docker://zavolab/cufflinks:2.2.1" + shell: + "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}" + +############################################################################### +## Trim transcript IDs from FASTA file +############################################################################### + +rule trim_fasta: + input: + fasta = os.path.join( + config["output_dir"], 
"{organism}","transcriptome.fa" + ), + script = os.path.join(config["scripts_dir"], "validation_fasta.py") + output: + fasta = os.path.join( + config["output_dir"], "{organism}","transcriptome_idtrim.fa" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","trim_fasta.log" + ) + log: + os.path.join(config["local_log"], "{organism}","trim_fasta.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + """(awk \ + -F" " \ + "/^>/ {{print \$1; next}} 1" \ + {input.fasta} \ + > {output.fasta} \ + ) &> {log}""" + +################################################################################ +### Generate segemehl index for transcripts +################################################################################ + +rule generate_segemehl_index_transcriptome: + input: + fasta = os.path.join( + config["output_dir"], "{organism}","transcriptome_idtrim.fa" + ) + output: + idx = os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], + "{organism}", + "generate_segemehl_index_transcriptome.log" + ) + log: + os.path.join( + config["local_log"], + "{organism}", + "generate_segemehl_index_transcriptome.log" + ) + resources: + mem = 10, + threads = 8, + time = 6 + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}" + +################################################################################ +### Generate segemehl index for genome +################################################################################ + +rule generate_segemehl_index_genome: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ) + output: + idx = os.path.join + (config["output_dir"], "{organism}","genome_index_segemehl.idx" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], + "{organism}", + "generate_segemehl_index_genome.log" + 
) + log: + os.path.join( + config["local_log"], + "{organism}", + "generate_segemehl_index_genome.log" + ) + resources: + mem = 50, + threads = 8, + time = 6 + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}" + +################################################################################ +### GTF file of exons (genomic coordinates) +################################################################################ + +rule get_exons_gtf: + input: + gtf = os.path.join( + config["output_dir"], "{organism}","gene_annotations.filtered.gtf" + ), + script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh") + output: + exons = os.path.join(config["output_dir"], "{organism}","exons.gtf") + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","get_exons_gtf.log" + ) + log: + os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash \ + {input.script} \ + -f {input.gtf} \ + -c 3 \ + -p exon \ + -o {output.exons} \ + ) &> {log}" + +################################################################################ +### Convert GTF file of exons to BED file +################################################################################ + +rule gtftobed: + input: + exons = os.path.join(config["output_dir"], "{organism}","exons.gtf"), + script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R") + output: + exons = os.path.join(config["output_dir"], "{organism}","exons.bed") + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","gtftobed.log" + ) + log: + os.path.join(config["local_log"], "{organism}","gtftobed.log") + singularity: + "docker://zavolab/r-zavolab:3.5.1" + shell: + "(Rscript \ + {input.script} \ + --gtf {input.exons} \ + -o {output.exons} \ + ) &> {log}" + +################################################################################ +### Create 
header for SAM file +################################################################################ + +rule create_header_genome: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ) + output: + header = os.path.join( + config["output_dir"], "{organism}","headerOfCollapsedFasta.sam" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","create_header_genome.log" + ) + log: + os.path.join( + config["local_log"], "{organism}","create_header_genome.log" + ) + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools dict -o {output.header} --uri=NA {input.genome}) &> {log}" + +################################################################################ +### Download miRNA annotation +################################################################################ + +rule mirna_anno: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ) + output: + anno = os.path.join( + config["output_dir"], "{organism}","raw", "mirna.gff3" + ) + params: + anno = lambda wildcards: config[ wildcards.organism ]["mirna_url"], + cluster_log = os.path.join( + config["cluster_log"], "{organism}","mirna_anno.log" + ), + log: + os.path.join(config["local_log"], "{organism}","mirna_anno.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(wget {params.anno} -O {output.anno}) &> {log}" + +################################################################################ +### Download dictionary mapping chr +################################################################################ + +rule dict_chr: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ) + output: + map_chr = os.path.join( + config["output_dir"], "{organism}", "UCSC2ensembl.txt" + ) + params: + map_chr = lambda wildcards: config[wildcards.organism]["map_chr_url"], + cluster_log = os.path.join( + config["cluster_log"], 
"{organism}","dict_chr.log" + ), + log: + os.path.join(config["local_log"], "{organism}","dict_chr.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(wget {params.map_chr} -O {output.map_chr}) &> {log}" + +################################################################################ +### Mapping chromosomes names, UCSC <-> ENSEMBL +################################################################################ + +rule map_chr_names: + input: + anno = os.path.join( + config["output_dir"], "{organism}","raw", "mirna.gff3" + ), + script = os.path.join(config["scripts_dir"], "map_chromosomes.pl"), + map_chr = os.path.join( + config["output_dir"], "{organism}", "UCSC2ensembl.txt" + ) + output: + gff = os.path.join( + config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}", "map_chr_names.log" + ), + column = lambda wildcards: config[ wildcards.organism ]["column"], + delimiter = lambda wildcards: config[ wildcards.organism ]["delimiter"] + log: + os.path.join(config["local_log"], "{organism}","map_chr_names.log") + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} \ + {input.anno} \ + {params.column} \ + {params.delimiter} \ + {input.map_chr} \ + {output.gff} \ + ) &> {log}" + +################################################################################ +### Filtering _1 miR IDs +################################################################################ + +rule filter_mir_1_anno: + input: + gff = os.path.join( + config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" + ) + output: + gff = os.path.join( + config["output_dir"], "{organism}", "mirna_filtered.gff3" + ) + params: + script = os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"), + cluster_log = os.path.join( + config["cluster_log"], "{organism}","filter_mir_1_anno.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "filter_mir_1_anno.log") + 
singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}" + +################################################################################ +### GFF to BED (improve intersect memory efficient allowing to use -sorted) +################################################################################ + +rule gfftobed: + input: + gff = os.path.join( + config["output_dir"], "{organism}", "mirna_filtered.gff3" + ) + output: + bed = os.path.join( + config["output_dir"], "{organism}", "mirna_filtered.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}", "gfftobed.log" + ), + out_dir = os.path.join(config["output_dir"]) + log: + os.path.join(config["local_log"], "{organism}", "gfftobed.log") + singularity: + "docker://zavolab/bedops:2.4.35" + shell: + "(convert2bed -i gff < {input.gff} \ + --sort-tmpdir={params.out_dir} \ + > {output.bed} \ + ) &> {log}" + +################################################################################ +### Index genome fasta file +################################################################################ + +rule create_index_fasta: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + output: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa.fai" + ), + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","create_index_fasta.log" + ) + log: + os.path.join(config["local_log"], "{organism}","create_index_fasta.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools faidx {input.genome}) &> {log}" + +################################################################################ +### Extract chromosome length +################################################################################ + +rule extract_chr_len: + input: + genome = os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa.fai" + 
) + output: + chrsize = os.path.join( + config["output_dir"], "{organism}", "chr_size.txt" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}","extract_chr_len.log" + ) + log: + os.path.join(config["local_log"], "{organism}","extract_chr_len.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cut -f1,2 {input.genome} > {output.chrsize}) &> {log}" + +################################################################################ +### Extract mature miRNAs (drop miRNA_primary_transcript precursor entries) +################################################################################ + +rule filter_mature_mirs: + input: + bed = os.path.join( + config["output_dir"], "{organism}", "mirna_filtered.bed" + ) + output: + bed = os.path.join( + config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}", "filter_mature_mirs.log" + ), + precursor = "miRNA_primary_transcript" + log: + os.path.join( + config["local_log"], "{organism}", "filter_mature_mirs.log" + ) + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(grep -v '{params.precursor}' {input.bed} > {output.bed}) &> {log}" + +################################################################################ +### Create isomirs annotation file from mature miRNA +################################################################################ + +rule iso_anno: + input: + bed = os.path.join( + config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + ), + chrsize = os.path.join( + config["output_dir"], "{organism}", "chr_size.txt" + ) + output: + bed = os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.log" + ), + bp_5p = lambda wildcards: wildcards.bp_5p, + bp_3p = lambda wildcards: wildcards.bp_3p + log: + os.path.join( + config["local_log"], + "{organism}", +
"iso_anno_5p{bp_5p}_3p{bp_3p}.log" + ) + singularity: + "docker://zavolab/bedtools:2.28.0" + shell: + "(bedtools slop \ + -i {input.bed} \ + -g {input.chrsize} \ + -l {params.bp_5p} \ + -r {params.bp_3p} \ + > {output.bed} \ + ) &> {log}" + +################################################################################ +### Change miRNA names to isomirs names (append _5p<bp_5p>_3p<bp_3p> suffix) +################################################################################ + +rule iso_anno_rename: + input: + bed = os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.bed" + ) + output: + bed = os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log" + ), + bp_5p = lambda wildcards: wildcards.bp_5p, + bp_3p = lambda wildcards: wildcards.bp_3p + log: + os.path.join( + config["local_log"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log" + ) + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(sed \ + 's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' \ + {input.bed} \ + > {output.bed} \ + ) &> {log}" + +################################################################################ +### Concatenate all isomirs annotation files of one organism +################################################################################ + +rule iso_anno_concat: + input: + bed = lambda wildcards: expand(os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed" + ), + organism = wildcards.organism, + bp_3p = config['bp_3p'], + bp_5p = config['bp_5p']) + output: + bed = os.path.join( + config["output_dir"], "{organism}", "iso_anno_concat.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}", "iso_anno_concat.log" + ), + prefix = os.path.join( + config["output_dir"], "{organism}", "iso_anno_rename" + ) + log: +
os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {params.prefix}* > {output.bed}) &> {log}" + +################################################################################ +### Remove non-changing isomirs (5p0_3p0) +################################################################################ + +rule iso_anno_final: + input: + bed = os.path.join( + config["output_dir"], "{organism}", "iso_anno_concat.bed" + ) + output: + bed = os.path.join( + config["output_dir"], "{organism}", "isomirs_annotation.bed" + ) + params: + cluster_log = os.path.join( + config["cluster_log"], "{organism}", "iso_anno_final.log" + ), + pattern = "5p0_3p0" + log: + os.path.join(config["local_log"], "{organism}", "iso_anno_final.log") + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" diff --git a/workflow/prepare_annotation/Snakefile b/workflow/prepare_annotation/Snakefile deleted file mode 100644 index db9b71a..0000000 --- a/workflow/prepare_annotation/Snakefile +++ /dev/null @@ -1,470 +0,0 @@ -################################################################################# -# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel -# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu -# -# Pipeline to download and prepare the necessary files for smallRNA-seq related pipelines.
-################################################################################# - -import os - -# Global config -# Rules that requires internet connection for downloading files should be included in the localrules -localrules: finish, genome_process, filter_anno_gtf, mirna_anno, dict_chr - -################################################################################# -### Finish rule -################################################################################# - -rule finish: - input: - idx_transcriptome = expand( - os.path.join( - config["output_dir"], - "{organism}", - "transcriptome_index_segemehl.idx"), - organism=config["organism"]), - idx_genome = expand( - os.path.join( - config["output_dir"], - "{organism}", - "genome_index_segemehl.idx"), - organism=config["organism"]), - exons = expand( - os.path.join( - config["output_dir"], - "{organism}", - "exons.bed"), - organism=config["organism"]), - header = expand( - os.path.join( - config["output_dir"], - "{organism}", - "headerOfCollapsedFasta.sam"), - organism=config["organism"]), - mirnafilt = expand( - os.path.join( - config["output_dir"], - "{organism}", - "mirna_filtered.bed"), - organism=config["organism"]), - isomirs = expand( - os.path.join( - config["output_dir"], - "{organism}", - "isomirs_annotation.bed"), - organism=config["organism"]) - -################################################################################# -### Download and process genome IDs -################################################################################# - -rule genome_process: - input: - script = os.path.join(config["scripts_dir"],"genome_process.sh"), - output: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa") - params: - url = lambda wildcards: config[ wildcards.organism ]["genome_url"], - dir_out = os.path.join(config["output_dir"],"{organism}") - log: - os.path.join(config["local_log"],"{organism}","genome_process.log") - singularity: - "docker://zavolab/ubuntu:18.04" 
- shell: - "(bash {input.script} {params.dir_out} {log} {params.url})" - -################################################################################# -### Download and filter gtf by transcript_level -################################################################################# - -rule filter_anno_gtf: - input: - script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"), - output: - gtf = os.path.join(config["output_dir"],"{organism}","gene_annotations.filtered.gtf") - params: - url = lambda wildcards: config[ wildcards.organism ]['gtf_url'], - dir_out = os.path.join(config["output_dir"],"{organism}") - log: - os.path.join(config["local_log"],"{organism}","filter_anno_gtf.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}" - -################################################################################# -### Extract transcriptome sequences in FASTA from genome. -################################################################################# - -rule extract_transcriptome_seqs: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa"), - gtf = os.path.join(config["output_dir"],"{organism}","gene_annotations.filtered.gtf") - output: - fasta = os.path.join(config["output_dir"],"{organism}","transcriptome.fa") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","extract_transcriptome_seqs.log") - log: - os.path.join(config["local_log"],"{organism}","extract_transcriptome_seqs.log") - singularity: - "docker://zavolab/cufflinks:2.2.1" - shell: - "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}" - -################################################################################ -## Trim transcript IDs from FASTA file -################################################################################ - -rule trim_fasta: - input: - fasta = os.path.join(config["output_dir"], 
"{organism}","transcriptome.fa"), - script = os.path.join(config["scripts_dir"], "validation_fasta.py") - output: - fasta = os.path.join(config["output_dir"],"{organism}","transcriptome_idtrim.fa") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","trim_fasta.log") - log: - os.path.join(config["local_log"],"{organism}","trim_fasta.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - """(awk -F" " "/^>/ {{print \$1; next}} 1" {input.fasta} > {output.fasta}) &> {log}""" - -################################################################################# -### Generate segemehl index for transcripts -################################################################################# - -rule generate_segemehl_index_transcriptome: - input: - fasta = os.path.join(config["output_dir"],"{organism}","transcriptome_idtrim.fa") - output: - idx = os.path.join(config["output_dir"],"{organism}","transcriptome_index_segemehl.idx") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","generate_segemehl_index_transcriptome.log"), - log: - os.path.join(config["local_log"],"{organism}","generate_segemehl_index_transcriptome.log") - resources: - mem = 10, - threads = 8, - time = 6 - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}" - -################################################################################# -### Generate segemehl index for genome -################################################################################# - -rule generate_segemehl_index_genome: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa") - output: - idx = os.path.join(config["output_dir"],"{organism}","genome_index_segemehl.idx") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","generate_segemehl_index_genome.log"), - log: - os.path.join(config["local_log"],"{organism}","generate_segemehl_index_genome.log") - 
resources: - mem = 50, - threads = 8, - time = 6 - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}" - -################################################################################# -### GTF file of exons (genomic coordinates) -################################################################################# - -rule get_exons_gtf: - input: - gtf = os.path.join(config["output_dir"],"{organism}","gene_annotations.filtered.gtf"), - script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh") - output: - exons = os.path.join(config["output_dir"],"{organism}","exons.gtf") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","get_exons_gtf.log") - log: - os.path.join(config["local_log"],"{organism}", "get_exons_gtf.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {input.script} -f {input.gtf} -c 3 -p exon -o {output.exons} ) &> {log}" - -################################################################################# -### Convert GTF file of exons to BED file -################################################################################# - -rule gtftobed: - input: - exons = os.path.join(config["output_dir"],"{organism}","exons.gtf"), - script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R") - output: - exons = os.path.join(config["output_dir"],"{organism}","exons.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","gtftobed.log") - log: - os.path.join(config["local_log"],"{organism}","gtftobed.log") - singularity: - "docker://zavolab/r-zavolab:3.5.1" - shell: - "(Rscript {input.script} --gtf {input.exons} -o {output.exons}) &> {log}" - -################################################################################# -### Create header for SAM file -################################################################################# - -rule create_header_genome: - input: - genome = 
os.path.join(config["output_dir"],"{organism}", "genome.processed.fa") - output: - header = os.path.join(config["output_dir"],"{organism}","headerOfCollapsedFasta.sam") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","create_header_genome.log") - log: - os.path.join(config["local_log"],"{organism}","create_header_genome.log") - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools dict -o {output.header} {input.genome}) &> {log}" - -################################################################################# -### Download miRNA annotation -################################################################################# - -rule mirna_anno: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa") - output: - anno = os.path.join(config["output_dir"],"{organism}","raw", "mirna.gff3") - params: - anno = lambda wildcards: config[ wildcards.organism ]["mirna_url"], - cluster_log = os.path.join(config["cluster_log"],"{organism}","mirna_anno.log"), - log: - os.path.join(config["local_log"],"{organism}","mirna_anno.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(wget {params.anno} -O {output.anno}) &> {log}" - -################################################################################# -### Download dictionary mapping chr -################################################################################# - -rule dict_chr: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa") - output: - map_chr = os.path.join(config["output_dir"],"{organism}", "UCSC2ensembl.txt") - params: - map_chr = lambda wildcards: config[ wildcards.organism ]["map_chr_url"], - cluster_log = os.path.join(config["cluster_log"],"{organism}","dict_chr.log"), - log: - os.path.join(config["local_log"],"{organism}","dict_chr.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(wget {params.map_chr} -O {output.map_chr}) &> {log}" - 
-################################################################################# -### Mapping chromosomes names, UCSC <-> ENSEMBL -################################################################################# - -rule map_chr_names: - input: - anno = os.path.join(config["output_dir"],"{organism}","raw", "mirna.gff3"), - script = os.path.join(config["scripts_dir"], "map_chromosomes.pl"), - map_chr = os.path.join(config["output_dir"],"{organism}", "UCSC2ensembl.txt") - output: - gff = os.path.join(config["output_dir"],"{organism}", "mirna_chr_mapped.gff3") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "map_chr_names.log"), - column = lambda wildcards: config[ wildcards.organism ]["column"], - delimiter = lambda wildcards: config[ wildcards.organism ]["delimiter"] - log: - os.path.join(config["local_log"],"{organism}","map_chr_names.log") - singularity: - "docker://zavolab/perl:5.28" - shell: - "(perl {input.script} {input.anno} \ - {params.column} \ - {params.delimiter} \ - {input.map_chr} \ - {output.gff}) &> {log}" - -################################################################################# -### Filtering _1 miR IDs -################################################################################# - -rule filter_mir_1_anno: - input: - gff = os.path.join(config["output_dir"],"{organism}", "mirna_chr_mapped.gff3") - output: - gff = os.path.join(config["output_dir"],"{organism}", "mirna_filtered.gff3") - params: - script = os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"), - cluster_log = os.path.join(config["cluster_log"],"{organism}","filter_mir_1_anno.log"), - log: - os.path.join(config["local_log"],"{organism}", "filter_mir_1_anno.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}" - -################################################################################# -### GFF to BED (improve intersect memory efficient allowing to use 
-sorted) -################################################################################# - -rule gfftobed: - input: - gff = os.path.join(config["output_dir"],"{organism}", "mirna_filtered.gff3") - output: - bed= os.path.join(config["output_dir"],"{organism}", "mirna_filtered.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "gfftobed.log"), - out_dir= os.path.join(config["output_dir"]) - log: - os.path.join(config["local_log"],"{organism}", "gfftobed.log") - singularity: - "docker://zavolab/bedops:2.4.35" - shell: - "(convert2bed -i gff < {input.gff} --sort-tmpdir={params.out_dir} > {output.bed}) &> {log}" - -################################################################################# -### Index genome fasta file -################################################################################# - -rule create_index_fasta: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa"), - output: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa.fai"), - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","create_index_fasta.log") - log: - os.path.join(config["local_log"],"{organism}","create_index_fasta.log") - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools faidx {input.genome}) &> {log}" - -################################################################################# -### Extract chromosome length -################################################################################# - -rule extract_chr_len: - input: - genome = os.path.join(config["output_dir"],"{organism}", "genome.processed.fa.fai"), - output: - chrsize = os.path.join(config["output_dir"],"{organism}", "chr_size.txt"), - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}","extract_chr_len.log") - log: - os.path.join(config["local_log"],"{organism}","extract_chr_len.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cut -f1,2 
{input.genome} > {output.chrsize}) &> {log}" - -################################################################################# -### Extract mature miRNA -################################################################################# - -rule filter_mature_mirs: - input: - bed= os.path.join(config["output_dir"],"{organism}", "mirna_filtered.bed"), - output: - bed= os.path.join(config["output_dir"],"{organism}", "mirna_mature_filtered.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "filter_mature_mirs.log"), - precursor="miRNA_primary_transcript" - log: - os.path.join(config["local_log"],"{organism}", "filter_mature_mirs.log") - singularity: - "docker://zavolab/ubuntu:18.04", - shell: - "(grep -v {params.precursor} {input.bed} > {output.bed}) &> {log}" - -################################################################################# -### Create isomirs annotation file from mature miRNA -################################################################################# - -rule iso_anno: - input: - bed= os.path.join(config["output_dir"],"{organism}", "mirna_mature_filtered.bed"), - chrsize = os.path.join(config["output_dir"],"{organism}", "chr_size.txt") - output: - bed= os.path.join(config["output_dir"],"{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.log"), - bp_5p = lambda wildcards: wildcards.bp_5p, - bp_3p = lambda wildcards: wildcards.bp_3p - log: - os.path.join(config["local_log"],"{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.log") - singularity: - "docker://zavolab/bedtools:2.28.0" - shell: - "(bedtools slop -i {input.bed} -g {input.chrsize} -l {params.bp_5p} -r {params.bp_3p} > {output.bed}) &> {log}" - -################################################################################# -### Change miRNA names to isomirs names -################################################################################# - -rule 
iso_anno_rename: - input: - bed= os.path.join(config["output_dir"],"{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.bed") - output: - bed= os.path.join(config["output_dir"],"{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log"), - bp_5p = lambda wildcards: wildcards.bp_5p, - bp_3p = lambda wildcards: wildcards.bp_3p - log: - os.path.join(config["local_log"],"{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "( sed 's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' {input.bed} > {output.bed}) &> {log}" - -################################################################################# -### Concatenate all isomirs annotation files -################################################################################# - -rule iso_anno_concat: - input: - bed = lambda wildcards: expand(os.path.join(config["output_dir"],"{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed"), - organism= config["organism"], - bp_3p= config['bp_3p'], - bp_5p= config['bp_5p']) - output: - bed= os.path.join(config["output_dir"],"{organism}", "iso_anno_concat.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "iso_anno_concat.log"), - prefix= os.path.join(config["output_dir"],"{organism}", "iso_anno_rename") - log: - os.path.join(config["local_log"],"{organism}", "iso_anno_concat.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "( cat {params.prefix}* > {output.bed}) &> {log}" - -################################################################################# -### Remove non changing isomirs (5p0_3p0) -################################################################################# - -rule iso_anno_final: - input: - bed= os.path.join(config["output_dir"],"{organism}", "iso_anno_concat.bed") - output: - bed= os.path.join(config["output_dir"],"{organism}", 
"isomirs_annotation.bed") - params: - cluster_log = os.path.join(config["cluster_log"],"{organism}", "iso_anno_final.log"), - pattern= "5p0_3p0" - log: - os.path.join(config["local_log"],"{organism}", "iso_anno_final.log") - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "( grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" -- GitLab