From 78fc6434afe37a933720df86b3fcc217ed8894db Mon Sep 17 00:00:00 2001
From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch>
Date: Fri, 20 Jan 2023 12:04:06 +0000
Subject: [PATCH] refactor: merge prepare, map & quantify workflows

---
 .gitlab-ci.yml                                |  18 ---
 test/config.yaml                              |  98 ++++++++++++++++
 test/config_map.yaml                          |  44 -------
 test/config_prepare.yaml                      |  33 ------
 test/config_quantify.yaml                     |  23 ----
 test/test_dag.sh                              |  28 +----
 test/test_rule_graph.sh                       |  28 +----
 test/test_workflow_local.sh                   |  48 ++------
 test/test_workflow_slurm.sh                   |  72 +-----------
 workflow/Snakefile                            | 108 ++++++++++++++++++
 workflow/config.yaml                          | 106 +++++++++++++++++
 workflow/rules/.gitkeep                       |   0
 workflow/{map/Snakefile => rules/map.smk}     |  22 +++-
 .../{prepare/Snakefile => rules/prepare.smk}  |  44 +++++--
 .../Snakefile => rules/quantify.smk}          |  32 ++++--
 15 files changed, 406 insertions(+), 298 deletions(-)
 delete mode 100644 .gitlab-ci.yml
 create mode 100644 test/config.yaml
 delete mode 100644 test/config_map.yaml
 delete mode 100644 test/config_prepare.yaml
 delete mode 100644 test/config_quantify.yaml
 create mode 100644 workflow/Snakefile
 create mode 100644 workflow/config.yaml
 create mode 100644 workflow/rules/.gitkeep
 rename workflow/{map/Snakefile => rules/map.smk} (98%)
 rename workflow/{prepare/Snakefile => rules/prepare.smk} (96%)
 rename workflow/{quantify/Snakefile => rules/quantify.smk} (93%)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
deleted file mode 100644
index 8a55175..0000000
--- a/.gitlab-ci.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-default:
-  tags:
-    - shell
-
-before_script:
-  - mamba env create --force -f environment.yml
-  - conda activate mirflowz
-  - echo $CONDA_DEFAULT_ENV
-  - snakemake --version
-
-test:
-  script:
-    - bash test/test_workflow_local.sh
-    - bash test/test_dag.sh
-    - bash test/test_rule_graph.sh
-
-after_script:
-  - bash test/test_cleanup.sh
diff --git a/test/config.yaml b/test/config.yaml
new file mode 100644
index 0000000..98b4422
--- /dev/null
+++ b/test/config.yaml
@@ -0,0 +1,98 @@
+---
+#### GLOBAL PARAMETERS #####
+
+# Directories
+# Usually there is no need to change these
+map_input_dir: "test_files" # For the mapping worflow
+quantify_input_dir: "results" # For the quantify worflow
+output_dir: "results"
+scripts_dir: "../scripts"
+local_log: "logs/local"
+cluster_log: "logs/cluster"
+
+# Inputs information
+sample: ["test_lib"]
+
+#######################################################################################################
+####
+#### PREPARE PARAMETERS
+####
+#######################################################################################################
+
+# Isomirs annotation 
+# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates.
+bp_5p: [-1, 0, +1]
+bp_3p: [-1, 0, +1]
+
+# List of inputs
+organism: ["homo_sapiens/chrY"]
+
+#### PARAMETERS SPECIFIC TO INPUTS #####
+
+homo_sapiens/chrY:
+  # URLs to genome, gene & miRNA annotations
+  genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz"
+  gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz"
+  mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3"
+
+  # Chromosome name mappings between UCSC <-> Ensembl
+  # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings
+  map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt"
+  # Chromosome name mapping parameters:
+  column: 1
+  delimiter: "TAB"
+
+#######################################################################################################
+####
+#### MAP PARAMETERS
+####
+#######################################################################################################
+
+# Resources: genome, transcriptome, genes, miRs
+# All of these are produced by the "prepare" workflow
+
+genome: "results/homo_sapiens/chrY/genome.processed.fa"
+gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf"
+transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa"
+transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx"
+genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx"
+exons: "results/homo_sapiens/chrY/exons.bed"
+header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam"
+
+# Tool parameters: quality filter
+q_value: 10
+p_value: 50
+
+# Tool parameters: adapter removal
+error_rate: 0.1
+minimum_length: 15
+overlap: 3
+max_n: 0
+
+# Tool parameters: mapping
+max_length_reads: 30
+nh: 100
+
+
+#### PARAMETERS SPECIFIC TO INPUTS ####
+
+test_lib:
+  adapter: "AACTGTAGGCACCATCAAT"
+  format: "fa"
+
+#######################################################################################################
+####
+#### QUANTIFY PARAMETERS
+####
+#######################################################################################################
+
+
+# Types of miRNAs to quantify
+#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"]
+mir_list: ["miRNA", "miRNA_primary_transcript"]
+
+# Resources: miR annotations, chromosome name mappings
+# All of these are produced by the "prepare" workflow
+mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed"
+isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed"
+...
\ No newline at end of file
diff --git a/test/config_map.yaml b/test/config_map.yaml
deleted file mode 100644
index 4ad569e..0000000
--- a/test/config_map.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
----
-#### GLOBAL PARAMETERS ####
-
-# Directories
-# Usually there is no need to change these
-output_dir: "results/"
-local_log:  "logs/local"
-cluster_log: "logs/cluster"
-scripts_dir: "../scripts"
-
-# Resources: genome, transcriptome, genes, miRs
-# All of these are produced by the "prepare" workflow
-genome: "results/homo_sapiens/chrY/genome.processed.fa"
-gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf"
-transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa"
-transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx"
-genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx"
-exons: "results/homo_sapiens/chrY/exons.bed"
-header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam"
-
-# Tool parameters: quality filter
-q_value: 10
-p_value: 50
-
-# Tool parameters: adapter removal
-error_rate: 0.1
-minimum_length: 15
-overlap: 3
-max_n: 0
-
-# Tool parameters: mapping
-max_length_reads: 30
-nh: 100
-
-# Inputs information
-input_dir: "test_files"
-sample: ["test_lib"]
-
-#### PARAMETERS SPECIFIC TO INPUTS ####
-
-test_lib:
-    adapter: "AACTGTAGGCACCATCAAT"
-    format: "fa"
-...
diff --git a/test/config_prepare.yaml b/test/config_prepare.yaml
deleted file mode 100644
index a74dba6..0000000
--- a/test/config_prepare.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
----
-#### GLOBAL PARAMETERS #####
-
-# Directories
-# Usually there is no need to change these
-output_dir: "results"
-scripts_dir: "../scripts"
-local_log: "logs/local"
-cluster_log: "logs/cluster"
-
-# Isomirs annotation file
-# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates.
-bp_5p: [-1, 0, +1]
-bp_3p: [-1, 0, +1]
-
-# List of inputs
-organism: ["homo_sapiens/chrY"]
-
-#### PARAMETERS SPECIFIC TO INPUTS #####
-
-homo_sapiens/chrY:
-  # URLs to genome, gene & miRNA annotations
-  genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz"
-  gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz"
-  mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3"
-
-  # Chromosome name mappings between UCSC <-> Ensembl
-  # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings
-  map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt"
-  # Chromosome name mapping parameters:
-  column: 1
-  delimiter: "TAB"
-...
diff --git a/test/config_quantify.yaml b/test/config_quantify.yaml
deleted file mode 100644
index 691cedf..0000000
--- a/test/config_quantify.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
----
-#### GLOBAL PARAMETERS ####
-
-# Directories
-# Usually there is no need to change these
-output_dir: "results"
-scripts_dir: "../scripts"
-local_log: "logs/local"
-cluster_log: "logs/cluster"
-
-# Types of miRNAs to quantify
-#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"]
-mir_list: ["miRNA", "miRNA_primary_transcript"]
-
-# Resources: miR annotations, chromosome name mappings
-# All of these are produced by the "prepare" workflow
-mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed"
-isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed"
-
-# Inputs information
-input_dir: "results"
-sample: ["test_lib"]  # put all samples, separated by comma
-...
diff --git a/test/test_dag.sh b/test/test_dag.sh
index f472b15..de93730 100755
--- a/test/test_dag.sh
+++ b/test/test_dag.sh
@@ -16,32 +16,12 @@ user_dir=$PWD
 script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
 cd $script_dir
 
-# Run test: prepare workflow
+# Run test
 snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
+    --snakefile="../workflow/Snakefile" \
+    --configfile="config.yaml" \
     --dag \
     --printshellcmds \
     --dryrun \
     --verbose \
-    | dot -Tsvg > "../images/workflow_dag_prepare.svg"
-
-# Run test: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --dag \
-    --printshellcmds \
-    --dryrun \
-    --verbose \
-    | dot -Tsvg > "../images/workflow_dag_map.svg"
-
-# Run test: quantify workflow
-snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
-    --dag \
-    --printshellcmds \
-    --dryrun \
-    --verbose \
-    | dot -Tsvg > "../images/workflow_dag_quantify.svg"
+    | dot -Tsvg > "../images/workflow_dag.svg"
\ No newline at end of file
diff --git a/test/test_rule_graph.sh b/test/test_rule_graph.sh
index 3b0b235..66ca559 100755
--- a/test/test_rule_graph.sh
+++ b/test/test_rule_graph.sh
@@ -16,32 +16,12 @@ user_dir=$PWD
 script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
 cd $script_dir
 
-# Run test: prepare workflow
+# Run test
 snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
+    --snakefile="../workflow/Snakefile" \
+    --configfile="config.yaml" \
     --rulegraph \
     --printshellcmds \
     --dryrun \
     --verbose \
-    | dot -Tsvg > "../images/rule_graph_prepare.svg"
-
-# Run test: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --rulegraph \
-    --printshellcmds \
-    --dryrun \
-    --verbose \
-    | dot -Tsvg > "../images/rule_graph_map.svg"
-
-# Run test: quantify workflow
-snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
-    --rulegraph \
-    --printshellcmds \
-    --dryrun \
-    --verbose \
-    | dot -Tsvg > "../images/rule_graph_quantify.svg"
+    | dot -Tsvg > "../images/rule_graph_.svg"
diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh
index e59a3f4..8c87878 100755
--- a/test/test_workflow_local.sh
+++ b/test/test_workflow_local.sh
@@ -5,6 +5,7 @@ cleanup () {
     rc=$?
     cd $user_dir
     echo "Exit status: $rc"
+    rm -rf .snakemake
 }
 trap cleanup EXIT
 
@@ -16,56 +17,21 @@ user_dir=$PWD
 script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
 cd $script_dir
 
-# Run test: prepare workflow
+# Run test
 snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
+    --snakefile="../workflow/Snakefile" \
+    --cores 4  \
     --use-singularity \
     --singularity-args "--bind ${PWD}/../" \
-    --cores=4 \
     --printshellcmds \
     --rerun-incomplete \
     --verbose
 
-# Run test: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --use-singularity \
-    --singularity-args "--bind ${PWD}/../" \
-    --cores=4 \
-    --printshellcmds \
-    --rerun-incomplete \
-    --verbose
-
-# Run test: quantify workflow
-snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
-    --use-singularity \
-    --singularity-args "--bind ${PWD}/../" \
-    --cores=4 \
-    --printshellcmds \
-    --rerun-incomplete \
-    --verbose 
-
-# Snakemake report: prepare workflow
-snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
-    --report="snakemake_report_prepare.html"
-
-# Snakemake report: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --report="snakemake_report_map.html"
 
-# Snakemake report: quantify workflow
+# Snakemake report
 snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
-    --report="snakemake_report_quantify.html"
+    --snakefile="../workflow/Snakefile" \
+    --report="snakemake_report.html"
 
 # Check md5 sum of some output files
 find results/ -type f -name \*\.gz -exec gunzip '{}' \;
diff --git a/test/test_workflow_slurm.sh b/test/test_workflow_slurm.sh
index 0c50de3..a1e1925 100755
--- a/test/test_workflow_slurm.sh
+++ b/test/test_workflow_slurm.sh
@@ -21,56 +21,10 @@ mkdir -p logs/cluster/{homo_sapiens/chrY,test_lib}
 mkdir -p logs/local/{homo_sapiens/chrY,test_lib}
 mkdir -p results/{homo_sapiens/chrY,test_lib}
 
-# Run test: prepare workflow
+# Run test
 snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
-    --cluster-config="cluster.json" \
-    --cluster "sbatch \
-        --cpus-per-task={cluster.threads} \
-        --mem={cluster.mem} \
-        --qos={cluster.queue} \
-        --time={cluster.time} \
-        --export=JOB_NAME={rule} \
-        -o {params.cluster_log} \
-        -p scicore \
-        --open-mode=append" \
-    --jobscript="../jobscript.sh" \
-    --jobs=20 \
-    --use-singularity \
-    --singularity-args="--no-home --bind ${PWD}/../" \
-    --cores=256 \
-    --printshellcmds \
-    --rerun-incomplete \
-    --verbose
-
-# Run test: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --cluster-config="cluster.json" \
-    --cluster "sbatch \
-        --cpus-per-task={cluster.threads} \
-        --mem={cluster.mem} \
-        --qos={cluster.queue} \
-        --time={cluster.time} \
-        --export=JOB_NAME={rule} \
-        -o {params.cluster_log} \
-        -p scicore \
-        --open-mode=append" \
-    --jobscript="../jobscript.sh" \
-    --jobs=20 \
-    --use-singularity \
-    --singularity-args="--no-home --bind ${PWD}/../" \
+    --snakefile="../workflow/Snakefile" \
     --cores=256 \
-    --printshellcmds \
-    --rerun-incomplete \
-    --verbose
-
-# Run test: quantify workflow
-snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
     --cluster-config="cluster.json" \
     --cluster "sbatch \
         --cpus-per-task={cluster.threads} \
@@ -84,29 +38,15 @@ snakemake \
     --jobscript="../jobscript.sh" \
     --jobs=20 \
     --use-singularity \
-    --singularity-args="--no-home --bind ${PWD}/../" \
-    --cores=256 \
+    --singularity-args="--bind ${PWD}/../" \
     --printshellcmds \
     --rerun-incomplete \
     --verbose
 
-# Snakemake report: prepare workflow
-snakemake \
-    --snakefile="../workflow/prepare/Snakefile" \
-    --configfile="config_prepare.yaml" \
-    --report="snakemake_report_prepare.html"
-
-# Snakemake report: map workflow
-snakemake \
-    --snakefile="../workflow/map/Snakefile" \
-    --configfile="config_map.yaml" \
-    --report="snakemake_report_map.html"
-
-# Snakemake report: quantify workflow
+# Snakemake report
 snakemake \
-    --snakefile="../workflow/quantify/Snakefile" \
-    --configfile="config_quantify.yaml" \
-    --report="snakemake_report_quantify.html"
+    --snakefile="../workflow/Snakefile" \
+    --report="snakemake_report.html"
 
 # Check md5 sum of some output files
 find results/ -type f -name \*\.gz -exec gunzip '{}' \;
diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..9467cd3
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,108 @@
+###############################################################################
+# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel
+# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu
+#
+# Snakemake workflow to:
+#   - download and prepare the necessary files for smallRNA-seq related workflows.
+#   - map small RNA-seq reads (e.g. from miRNA sequencing libraries)
+#   - quantify miRNAs, including isomiRs, from miRNA-seq alignments.
+#
+###############################################################################
+#
+# USAGE:
+# snakemake \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \
+#    --cores 4 \
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
+
+import os
+
+###############################################################################
+### Including subworkflows
+###############################################################################
+
+include: os.path.join("rules" , "prepare.smk")
+include: os.path.join("rules" , "map.smk")
+include: os.path.join("rules" , "quantify.smk")
+
+###############################################################################
+### Finish rule
+###############################################################################
+
+
+rule finish:
+    input:
+        idx_transcriptome=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "transcriptome_index_segemehl.idx",
+            ),
+            organism=config["organism"],
+        ),
+        idx_genome=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "genome_index_segemehl.idx",
+            ),
+            organism=config["organism"],
+        ),
+        exons=expand(
+            os.path.join(
+                config["output_dir"], 
+                "{organism}", 
+                "exons.bed",
+            ),
+            organism=config["organism"],
+        ),
+        header=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "headerOfCollapsedFasta.sam",
+            ),
+            organism=config["organism"],
+        ),
+        mirnafilt=expand(
+            os.path.join(
+                config["output_dir"], 
+                "{organism}", 
+                "mirna_filtered.bed",
+            ),
+            organism=config["organism"],
+        ),
+        isomirs=expand(
+            os.path.join(
+                config["output_dir"], 
+                "{organism}", 
+                "isomirs_annotation.bed",
+            ),
+            organism=config["organism"],
+        ),
+        maps=expand(
+            os.path.join(
+                config["output_dir"],
+                "{sample}",
+                "convertedSortedMappings_{sample}.bam.bai",
+            ),
+            sample=config["sample"],
+        ),
+        table1=expand(
+            os.path.join(
+                config["output_dir"], 
+                "TABLES", 
+                "counts.{mir}.tab"
+            ),
+            mir=config["mir_list"],
+        ),
+        table2=os.path.join(
+            config["output_dir"], 
+            "TABLES", 
+            "counts.isomirs.tab"
+        ),
\ No newline at end of file
diff --git a/workflow/config.yaml b/workflow/config.yaml
new file mode 100644
index 0000000..25e52d7
--- /dev/null
+++ b/workflow/config.yaml
@@ -0,0 +1,106 @@
+---
+#### GLOBAL PARAMETERS #####
+
+# Directories
+# Usually there is no need to change these
+map_input_dir: "path/to/map_input_directory" # For the mapping worflow
+quantify_input_dir: "path/to/quantify_input_directory" # For the quantify worflow
+output_dir: "results"
+scripts_dir: "../scripts"
+local_log: "logs/local"
+cluster_log: "logs/cluster"
+
+# Inputs information
+sample: ["sample_1", "sample_2"]  # put all sample names, separated by comma
+
+#######################################################################################################
+####
+#### PREPARE PARAMETERS
+####
+#######################################################################################################
+
+# Isomirs annotation file
+# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates.
+bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts
+bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts
+
+# List of inputs
+organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"]
+# this string specifies a path, and the "/" is important for this
+# "pre" specifies the assembly version
+
+
+#### PARAMETERS SPECIFIC TO INPUTS #####
+
+org/pre: # One section for each list item in "organism"; entry should match precisely what
+# is in the "organism" section above, one entry per list item above, omitting the ""
+  # URLs to genome, gene & miRNA annotations
+  genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style
+  # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz"
+  gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style
+  # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz"
+  mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style
+  # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3"
+
+  # Chromosome name mappings between UCSC <-> Ensembl
+  # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings
+  map_chr_url: # FTP/HTTP URL to mapping table
+  # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt"
+  # Chromosome name mapping parameters:
+  column: 1 # Column number from input file where to change chromosome name
+  delimiter: "TAB" # Delimiter of the input file
+
+#######################################################################################################
+####
+#### MAP PARAMETERS
+####
+#######################################################################################################
+
+# Resources: genome, transcriptome, genes, miRs
+# All of these are produced by the "prepare" workflow
+genome: "path/to/genome.processed.fa"
+gtf: "path/to/gene_annotations.filtered.gtf"
+transcriptome: "path/to/transcriptome_idtrim.fa"
+transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx"
+genome_index_segemehl: "path/to/genome_index_segemehl.idx"
+exons: "path/to/exons.bed"
+header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam"
+
+# Tool parameters: quality filter
+q_value: 10  # Q (Phred) score; minimum quality score to keep
+p_value: 50  # minimum % of bases that must have Q quality
+
+# Tool parameters: adapter removal
+error_rate: 0.1  # fraction of allowed errors
+minimum_length: 15  # discard processed reads shorter than the indicated length
+overlap: 3  # minimum overlap length of adapter and read to trim the bases
+max_n: 0  # discard reads containing more than the indicated number of N bases
+
+# Tool parameters: mapping
+max_length_reads: 30  # maximum length of processed reads to map with oligomap
+nh: 100  # discard reads with more mappings than the indicated number
+
+
+#### PARAMETERS SPECIFIC TO INPUTS ####
+
+sample_1:  # one section per list item in "sample"; names have to match
+    adapter: "XXXXXXXXXXXXXXXXXXXX"  # 3' adapter sequence to trim
+    format: "fa"  # file format; currently supported: "fa"
+
+
+#######################################################################################################
+####
+#### QUANTIFY PARAMETERS
+####
+#######################################################################################################
+
+
+# Types of miRNAs to quantify
+# Remove miRNA types you are not interested in
+mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"]
+
+# Resources: miR annotations, chromosome name mappings
+# All of these are produced by the "prepare" workflow
+mirnas_anno: "path/to/mirna_filtered.bed"
+isomirs_anno: "path/to/isomirs_annotation.bed"
+...
\ No newline at end of file
diff --git a/workflow/rules/.gitkeep b/workflow/rules/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/map/Snakefile b/workflow/rules/map.smk
similarity index 98%
rename from workflow/map/Snakefile
rename to workflow/rules/map.smk
index aace1f1..f5ef040 100644
--- a/workflow/map/Snakefile
+++ b/workflow/rules/map.smk
@@ -4,22 +4,33 @@
 #
 # Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries).
 ###############################################################################
-
+#
+# USAGE:
+# snakemake \
+#    --snakefile="map.smk" \
+#    -- cores 4 \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
 import os
 
+configfile: "config.yaml"
 
 # Rules that require internet connection for downloading files are included
 # in the localrules
 localrules:
-    finish,
-
+    finish_map,
 
 ###############################################################################
 ### Finish rule
 ###############################################################################
 
 
-rule finish:
+rule finish_map:
     input:
         maps=expand(
             os.path.join(
@@ -30,7 +41,6 @@ rule finish:
             sample=config["sample"],
         ),
 
-
 ###############################################################################
 ### Uncompress fastq files
 ###############################################################################
@@ -38,7 +48,7 @@ rule finish:
 
 rule uncompress_zipped_files:
     input:
-        reads=os.path.join(config["input_dir"], "{sample}.{format}.gz"),
+        reads=os.path.join(config["map_input_dir"], "{sample}.{format}.gz"),
     output:
         reads=os.path.join(
             config["output_dir"], "{sample}", "{format}", "reads.{format}"
diff --git a/workflow/prepare/Snakefile b/workflow/rules/prepare.smk
similarity index 96%
rename from workflow/prepare/Snakefile
rename to workflow/rules/prepare.smk
index c16afb8..b254a90 100644
--- a/workflow/prepare/Snakefile
+++ b/workflow/rules/prepare.smk
@@ -2,17 +2,30 @@
 # (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel
 # (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu
 #
-# Snakemake workflow to download and prepare the necessary files for
-# smallRNA-seq related workflows.
+# Snakemake workflow to download and prepare the necessary files 
+# for smallRNA-seq related workflows.
+#
 ###############################################################################
-
+#
+# USAGE:
+# snakemake \
+#    --snakefile="prepare.smk" \
+#    --cores 4 \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
 import os
 
+configfile: "config.yaml" 
 
 # Rules that require internet connection for downloading files are included
 # in the localrules
 localrules:
-    finish,
+    finish_prepare,
     genome_process,
     filter_anno_gtf,
     mirna_anno,
@@ -24,7 +37,7 @@ localrules:
 ###############################################################################
 
 
-rule finish:
+rule finish_prepare:
     input:
         idx_transcriptome=expand(
             os.path.join(
@@ -43,7 +56,11 @@ rule finish:
             organism=config["organism"],
         ),
         exons=expand(
-            os.path.join(config["output_dir"], "{organism}", "exons.bed"),
+            os.path.join(
+                config["output_dir"], 
+                "{organism}", 
+                "exons.bed",
+            ),
             organism=config["organism"],
         ),
         header=expand(
@@ -56,23 +73,26 @@ rule finish:
         ),
         mirnafilt=expand(
             os.path.join(
-                config["output_dir"], "{organism}", "mirna_filtered.bed"
+                config["output_dir"], 
+                "{organism}", 
+                "mirna_filtered.bed",
             ),
             organism=config["organism"],
         ),
         isomirs=expand(
             os.path.join(
-                config["output_dir"], "{organism}", "isomirs_annotation.bed"
+                config["output_dir"], 
+                "{organism}", 
+                "isomirs_annotation.bed",
             ),
             organism=config["organism"],
         ),
 
 
+
 ###############################################################################
 ### Download and process genome IDs
 ###############################################################################
-
-
 rule genome_process:
     input:
         script=os.path.join(config["scripts_dir"], "genome_process.sh"),
@@ -582,7 +602,7 @@ rule filter_mature_mirs:
 
 
 ###############################################################################
-### Create isomirs annotation file from mature miRNA
+### Create isomirs annotation file from mature miRNA 
 ###############################################################################
 
 
@@ -728,4 +748,4 @@ rule iso_anno_final:
     singularity:
         "docker://zavolab/ubuntu:18.04"
     shell:
-        "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}"
+        "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}"
\ No newline at end of file
diff --git a/workflow/quantify/Snakefile b/workflow/rules/quantify.smk
similarity index 93%
rename from workflow/quantify/Snakefile
rename to workflow/rules/quantify.smk
index b30667d..6eabb23 100644
--- a/workflow/quantify/Snakefile
+++ b/workflow/rules/quantify.smk
@@ -4,14 +4,27 @@
 #
 # Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments.
 ###############################################################################
+#
+# USAGE:
+# snakemake \
+#    --snakefile="quanitfy.smk" \
+#    --cores 4 \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \ 
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
 
 import os
 
+configfile: "config.yaml"
 
 # Rules that require internet connection for downloading files are included
 # in the localrules
 localrules:
-    finish,
+    finish_quantify,
 
 
 ###############################################################################
@@ -19,17 +32,22 @@ localrules:
 ###############################################################################
 
 
-rule finish:
+rule finish_quantify:
     input:
         table1=expand(
-            os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"),
+            os.path.join(
+                config["output_dir"], 
+                "TABLES", 
+                "counts.{mir}.tab",
+            ),
             mir=config["mir_list"],
         ),
         table2=os.path.join(
-            config["output_dir"], "TABLES", "counts.isomirs.tab"
+            config["output_dir"], 
+            "TABLES", 
+            "counts.isomirs.tab",
         ),
 
-
 ###############################################################################
 ### BAM to BED
 ###############################################################################
@@ -38,7 +56,7 @@ rule finish:
 rule bamtobed:
     input:
         alignment=os.path.join(
-            config["input_dir"],
+            config["quantify_input_dir"],
             "{sample}",
             "convertedSortedMappings_{sample}.bam",
         ),
@@ -314,4 +332,4 @@ rule merge_tables:
         --output_file {output.table} \
         --prefix {params.prefix} \
         --verbose \
-        ) &> {log}"
+        ) &> {log}"
\ No newline at end of file
-- 
GitLab