From 78fc6434afe37a933720df86b3fcc217ed8894db Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Fri, 20 Jan 2023 12:04:06 +0000 Subject: [PATCH] refactor: merge prepare, map & quantify workflows --- .gitlab-ci.yml | 18 --- test/config.yaml | 98 ++++++++++++++++ test/config_map.yaml | 44 ------- test/config_prepare.yaml | 33 ------ test/config_quantify.yaml | 23 ---- test/test_dag.sh | 28 +---- test/test_rule_graph.sh | 28 +---- test/test_workflow_local.sh | 48 ++------ test/test_workflow_slurm.sh | 72 +----------- workflow/Snakefile | 108 ++++++++++++++++++ workflow/config.yaml | 106 +++++++++++++++++ workflow/rules/.gitkeep | 0 workflow/{map/Snakefile => rules/map.smk} | 22 +++- .../{prepare/Snakefile => rules/prepare.smk} | 44 +++++-- .../Snakefile => rules/quantify.smk} | 32 ++++-- 15 files changed, 406 insertions(+), 298 deletions(-) delete mode 100644 .gitlab-ci.yml create mode 100644 test/config.yaml delete mode 100644 test/config_map.yaml delete mode 100644 test/config_prepare.yaml delete mode 100644 test/config_quantify.yaml create mode 100644 workflow/Snakefile create mode 100644 workflow/config.yaml create mode 100644 workflow/rules/.gitkeep rename workflow/{map/Snakefile => rules/map.smk} (98%) rename workflow/{prepare/Snakefile => rules/prepare.smk} (96%) rename workflow/{quantify/Snakefile => rules/quantify.smk} (93%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 8a55175..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,18 +0,0 @@ -default: - tags: - - shell - -before_script: - - mamba env create --force -f environment.yml - - conda activate mirflowz - - echo $CONDA_DEFAULT_ENV - - snakemake --version - -test: - script: - - bash test/test_workflow_local.sh - - bash test/test_dag.sh - - bash test/test_rule_graph.sh - -after_script: - - bash test/test_cleanup.sh diff --git a/test/config.yaml b/test/config.yaml new file mode 100644 index 0000000..98b4422 --- /dev/null +++ b/test/config.yaml @@ -0,0 +1,98 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +map_input_dir: "test_files" # For the mapping worflow +quantify_input_dir: "results" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Inputs information +sample: ["test_lib"] + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [-1, 0, +1] +bp_3p: [-1, 0, +1] + +# List of inputs +organism: ["homo_sapiens/chrY"] + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +homo_sapiens/chrY: + # URLs to genome, gene & miRNA annotations + genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" + gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" + mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 + delimiter: "TAB" + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow + +genome: "results/homo_sapiens/chrY/genome.processed.fa" +gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" +transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" +genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" +exons: "results/homo_sapiens/chrY/exons.bed" +header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 +p_value: 50 + +# Tool parameters: adapter removal +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 + +# Tool parameters: mapping +max_length_reads: 30 +nh: 100 + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +test_lib: + adapter: "AACTGTAGGCACCATCAAT" + format: "fa" + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] +mir_list: ["miRNA", "miRNA_primary_transcript"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" +isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/test/config_map.yaml b/test/config_map.yaml deleted file mode 100644 index 4ad569e..0000000 --- a/test/config_map.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results/" -local_log: "logs/local" -cluster_log: "logs/cluster" -scripts_dir: "../scripts" - -# Resources: genome, transcriptome, genes, miRs -# All of these are produced by the "prepare" workflow -genome: "results/homo_sapiens/chrY/genome.processed.fa" -gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" -transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" -genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" -exons: "results/homo_sapiens/chrY/exons.bed" -header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" - -# Tool parameters: quality filter -q_value: 10 -p_value: 50 - -# Tool parameters: adapter removal -error_rate: 0.1 -minimum_length: 15 -overlap: 3 -max_n: 0 - -# Tool parameters: mapping -max_length_reads: 30 -nh: 100 - -# Inputs information -input_dir: "test_files" -sample: ["test_lib"] - -#### PARAMETERS SPECIFIC TO INPUTS #### - -test_lib: - adapter: "AACTGTAGGCACCATCAAT" - format: "fa" -... diff --git a/test/config_prepare.yaml b/test/config_prepare.yaml deleted file mode 100644 index a74dba6..0000000 --- a/test/config_prepare.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -#### GLOBAL PARAMETERS ##### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Isomirs annotation file -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [-1, 0, +1] -bp_3p: [-1, 0, +1] - -# List of inputs -organism: ["homo_sapiens/chrY"] - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -homo_sapiens/chrY: - # URLs to genome, gene & miRNA annotations - genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" - gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" - mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" - - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 - delimiter: "TAB" -... diff --git a/test/config_quantify.yaml b/test/config_quantify.yaml deleted file mode 100644 index 691cedf..0000000 --- a/test/config_quantify.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Types of miRNAs to quantify -#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] -mir_list: ["miRNA", "miRNA_primary_transcript"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" -isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" - -# Inputs information -input_dir: "results" -sample: ["test_lib"] # put all samples, separated by comma -... diff --git a/test/test_dag.sh b/test/test_dag.sh index f472b15..de93730 100755 --- a/test/test_dag.sh +++ b/test/test_dag.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --dag \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/workflow_dag_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_quantify.svg" + | dot -Tsvg > "../images/workflow_dag.svg" \ No newline at end of file diff --git a/test/test_rule_graph.sh b/test/test_rule_graph.sh index 3b0b235..66ca559 100755 --- a/test/test_rule_graph.sh +++ b/test/test_rule_graph.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --rulegraph \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/rule_graph_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_quantify.svg" + | dot -Tsvg > "../images/rule_graph_.svg" diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh index e59a3f4..8c87878 100755 --- a/test/test_workflow_local.sh +++ b/test/test_workflow_local.sh @@ -5,6 +5,7 @@ cleanup () { rc=$? cd $user_dir echo "Exit status: $rc" + rm -rf .snakemake } trap cleanup EXIT @@ -16,56 +17,21 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --cores 4 \ --use-singularity \ --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ --printshellcmds \ --rerun-incomplete \ --verbose -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; diff --git a/test/test_workflow_slurm.sh b/test/test_workflow_slurm.sh index 0c50de3..a1e1925 100755 --- a/test/test_workflow_slurm.sh +++ b/test/test_workflow_slurm.sh @@ -21,56 +21,10 @@ mkdir -p logs/cluster/{homo_sapiens/chrY,test_lib} mkdir -p logs/local/{homo_sapiens/chrY,test_lib} mkdir -p results/{homo_sapiens/chrY,test_lib} -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ + --snakefile="../workflow/Snakefile" \ --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ --cluster-config="cluster.json" \ --cluster "sbatch \ --cpus-per-task={cluster.threads} \ @@ -84,29 +38,15 @@ snakemake \ --jobscript="../jobscript.sh" \ --jobs=20 \ --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ + --singularity-args="--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ --verbose -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" - -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000..9467cd3 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,108 @@ +############################################################################### +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Snakemake workflow to: +# - download and prepare the necessary files for smallRNA-seq related workflows. +# - map small RNA-seq reads (e.g. from miRNA sequencing libraries) +# - quantify miRNAs, including isomiRs, from miRNA-seq alignments. +# +############################################################################### +# +# USAGE: +# snakemake \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --cores 4 \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ + +import os + +############################################################################### +### Including subworkflows +############################################################################### + +include: os.path.join("rules" , "prepare.smk") +include: os.path.join("rules" , "map.smk") +include: os.path.join("rules" , "quantify.smk") + +############################################################################### +### Finish rule +############################################################################### + + +rule finish: + input: + idx_transcriptome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx", + ), + organism=config["organism"], + ), + idx_genome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "genome_index_segemehl.idx", + ), + organism=config["organism"], + ), + exons=expand( + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed", + ), + organism=config["organism"], + ), + header=expand( + os.path.join( + config["output_dir"], + "{organism}", + "headerOfCollapsedFasta.sam", + ), + organism=config["organism"], + ), + mirnafilt=expand( + os.path.join( + config["output_dir"], + "{organism}", + "mirna_filtered.bed", + ), + organism=config["organism"], + ), + isomirs=expand( + os.path.join( + config["output_dir"], + "{organism}", + "isomirs_annotation.bed", + ), + organism=config["organism"], + ), + maps=expand( + os.path.join( + config["output_dir"], + "{sample}", + "convertedSortedMappings_{sample}.bam.bai", + ), + sample=config["sample"], + ), + table1=expand( + os.path.join( + config["output_dir"], + "TABLES", + "counts.{mir}.tab" + ), + mir=config["mir_list"], + ), + table2=os.path.join( + config["output_dir"], + "TABLES", + "counts.isomirs.tab" + ), \ No newline at end of file diff --git a/workflow/config.yaml b/workflow/config.yaml new file mode 100644 index 0000000..25e52d7 --- /dev/null +++ b/workflow/config.yaml @@ -0,0 +1,106 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +map_input_dir: "path/to/map_input_directory" # For the mapping worflow +quantify_input_dir: "path/to/quantify_input_directory" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Inputs information +sample: ["sample_1", "sample_2"] # put all sample names, separated by comma + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + +# List of inputs +organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"] +# this string specifies a path, and the "/" is important for this +# "pre" specifies the assembly version + + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +org/pre: # One section for each list item in "organism"; entry should match precisely what +# is in the "organism" section above, one entry per list item above, omitting the "" + # URLs to genome, gene & miRNA annotations + genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" + mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style + # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: # FTP/HTTP URL to mapping table + # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 # Column number from input file where to change chromosome name + delimiter: "TAB" # Delimiter of the input file + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# Tool parameters: adapter removal +error_rate: 0.1 # fraction of allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# Tool parameters: mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +sample_1: # one section per list item in "sample"; names have to match + adapter: "XXXXXXXXXXXXXXXXXXXX" # 3' adapter sequence to trim + format: "fa" # file format; currently supported: "fa" + + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +# Remove miRNA types you are not interested in +mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "path/to/mirna_filtered.bed" +isomirs_anno: "path/to/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/workflow/rules/.gitkeep b/workflow/rules/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workflow/map/Snakefile b/workflow/rules/map.smk similarity index 98% rename from workflow/map/Snakefile rename to workflow/rules/map.smk index aace1f1..f5ef040 100644 --- a/workflow/map/Snakefile +++ b/workflow/rules/map.smk @@ -4,22 +4,33 @@ # # Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). ############################################################################### - +# +# USAGE: +# snakemake \ +# --snakefile="map.smk" \ +# -- cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, - + finish_map, ############################################################################### ### Finish rule ############################################################################### -rule finish: +rule finish_map: input: maps=expand( os.path.join( @@ -30,7 +41,6 @@ rule finish: sample=config["sample"], ), - ############################################################################### ### Uncompress fastq files ############################################################################### @@ -38,7 +48,7 @@ rule finish: rule uncompress_zipped_files: input: - reads=os.path.join(config["input_dir"], "{sample}.{format}.gz"), + reads=os.path.join(config["map_input_dir"], "{sample}.{format}.gz"), output: reads=os.path.join( config["output_dir"], "{sample}", "{format}", "reads.{format}" diff --git a/workflow/prepare/Snakefile b/workflow/rules/prepare.smk similarity index 96% rename from workflow/prepare/Snakefile rename to workflow/rules/prepare.smk index c16afb8..b254a90 100644 --- a/workflow/prepare/Snakefile +++ b/workflow/rules/prepare.smk @@ -2,17 +2,30 @@ # (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel # (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu # -# Snakemake workflow to download and prepare the necessary files for -# smallRNA-seq related workflows. +# Snakemake workflow to download and prepare the necessary files +# for smallRNA-seq related workflows. +# ############################################################################### - +# +# USAGE: +# snakemake \ +# --snakefile="prepare.smk" \ +# --cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, + finish_prepare, genome_process, filter_anno_gtf, mirna_anno, @@ -24,7 +37,7 @@ localrules: ############################################################################### -rule finish: +rule finish_prepare: input: idx_transcriptome=expand( os.path.join( @@ -43,7 +56,11 @@ rule finish: organism=config["organism"], ), exons=expand( - os.path.join(config["output_dir"], "{organism}", "exons.bed"), + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed", + ), organism=config["organism"], ), header=expand( @@ -56,23 +73,26 @@ rule finish: ), mirnafilt=expand( os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" + config["output_dir"], + "{organism}", + "mirna_filtered.bed", ), organism=config["organism"], ), isomirs=expand( os.path.join( - config["output_dir"], "{organism}", "isomirs_annotation.bed" + config["output_dir"], + "{organism}", + "isomirs_annotation.bed", ), organism=config["organism"], ), + ############################################################################### ### Download and process genome IDs ############################################################################### - - rule genome_process: input: script=os.path.join(config["scripts_dir"], "genome_process.sh"), @@ -582,7 +602,7 @@ rule filter_mature_mirs: ############################################################################### -### Create isomirs annotation file from mature miRNA +### Create isomirs annotation file from mature miRNA ############################################################################### @@ -728,4 +748,4 @@ rule iso_anno_final: singularity: "docker://zavolab/ubuntu:18.04" shell: - "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" + "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" \ No newline at end of file diff --git a/workflow/quantify/Snakefile b/workflow/rules/quantify.smk similarity index 93% rename from workflow/quantify/Snakefile rename to workflow/rules/quantify.smk index b30667d..6eabb23 100644 --- a/workflow/quantify/Snakefile +++ b/workflow/rules/quantify.smk @@ -4,14 +4,27 @@ # # Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments. ############################################################################### +# +# USAGE: +# snakemake \ +# --snakefile="quanitfy.smk" \ +# --cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, + finish_quantify, ############################################################################### @@ -19,17 +32,22 @@ localrules: ############################################################################### -rule finish: +rule finish_quantify: input: table1=expand( - os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"), + os.path.join( + config["output_dir"], + "TABLES", + "counts.{mir}.tab", + ), mir=config["mir_list"], ), table2=os.path.join( - config["output_dir"], "TABLES", "counts.isomirs.tab" + config["output_dir"], + "TABLES", + "counts.isomirs.tab", ), - ############################################################################### ### BAM to BED ############################################################################### @@ -38,7 +56,7 @@ rule finish: rule bamtobed: input: alignment=os.path.join( - config["input_dir"], + config["quantify_input_dir"], "{sample}", "convertedSortedMappings_{sample}.bam", ), @@ -314,4 +332,4 @@ rule merge_tables: --output_file {output.table} \ --prefix {params.prefix} \ --verbose \ - ) &> {log}" + ) &> {log}" \ No newline at end of file -- GitLab