diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 8a55175d254b70bbea33ee6c024c61c7f69b346a..0000000000000000000000000000000000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,18 +0,0 @@ -default: - tags: - - shell - -before_script: - - mamba env create --force -f environment.yml - - conda activate mirflowz - - echo $CONDA_DEFAULT_ENV - - snakemake --version - -test: - script: - - bash test/test_workflow_local.sh - - bash test/test_dag.sh - - bash test/test_rule_graph.sh - -after_script: - - bash test/test_cleanup.sh diff --git a/test/config.yaml b/test/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98b4422dbfc52ad49a6d237b9b840aa0650b34ae --- /dev/null +++ b/test/config.yaml @@ -0,0 +1,98 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +map_input_dir: "test_files" # For the mapping worflow +quantify_input_dir: "results" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Inputs information +sample: ["test_lib"] + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [-1, 0, +1] +bp_3p: [-1, 0, +1] + +# List of inputs +organism: ["homo_sapiens/chrY"] + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +homo_sapiens/chrY: + # URLs to genome, gene & miRNA annotations + genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" + gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" + mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 + delimiter: "TAB" + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow + +genome: "results/homo_sapiens/chrY/genome.processed.fa" +gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" +transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" +genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" +exons: "results/homo_sapiens/chrY/exons.bed" +header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 +p_value: 50 + +# Tool parameters: adapter removal +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 + +# Tool parameters: mapping +max_length_reads: 30 +nh: 100 + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +test_lib: + adapter: "AACTGTAGGCACCATCAAT" + format: "fa" + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] +mir_list: ["miRNA", "miRNA_primary_transcript"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" +isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/test/config_map.yaml b/test/config_map.yaml deleted file mode 100644 index 4ad569eca8f629df04c5144cfaf00a7b876d3e51..0000000000000000000000000000000000000000 --- a/test/config_map.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results/" -local_log: "logs/local" -cluster_log: "logs/cluster" -scripts_dir: "../scripts" - -# Resources: genome, transcriptome, genes, miRs -# All of these are produced by the "prepare" workflow -genome: "results/homo_sapiens/chrY/genome.processed.fa" -gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" -transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" -genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" -exons: "results/homo_sapiens/chrY/exons.bed" -header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" - -# Tool parameters: quality filter -q_value: 10 -p_value: 50 - -# Tool parameters: adapter removal -error_rate: 0.1 -minimum_length: 15 -overlap: 3 -max_n: 0 - -# Tool parameters: mapping -max_length_reads: 30 -nh: 100 - -# Inputs information -input_dir: "test_files" -sample: ["test_lib"] - -#### PARAMETERS SPECIFIC TO INPUTS #### - -test_lib: - adapter: "AACTGTAGGCACCATCAAT" - format: "fa" -... diff --git a/test/config_prepare.yaml b/test/config_prepare.yaml deleted file mode 100644 index a74dba6c1bb82d4f8debb794f02b8dc1ea28e3a4..0000000000000000000000000000000000000000 --- a/test/config_prepare.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -#### GLOBAL PARAMETERS ##### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Isomirs annotation file -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [-1, 0, +1] -bp_3p: [-1, 0, +1] - -# List of inputs -organism: ["homo_sapiens/chrY"] - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -homo_sapiens/chrY: - # URLs to genome, gene & miRNA annotations - genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" - gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" - mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" - - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 - delimiter: "TAB" -... diff --git a/test/config_quantify.yaml b/test/config_quantify.yaml deleted file mode 100644 index 691cedff5fc2ca15c48f6b965b2eefed0af039f2..0000000000000000000000000000000000000000 --- a/test/config_quantify.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Types of miRNAs to quantify -#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] -mir_list: ["miRNA", "miRNA_primary_transcript"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" -isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" - -# Inputs information -input_dir: "results" -sample: ["test_lib"] # put all samples, separated by comma -... diff --git a/test/test_dag.sh b/test/test_dag.sh index f472b158e19f62524831e927275cd284b7358ceb..de9373026370a07f246bfa47099157d3eea94c00 100755 --- a/test/test_dag.sh +++ b/test/test_dag.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --dag \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/workflow_dag_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_quantify.svg" + | dot -Tsvg > "../images/workflow_dag.svg" \ No newline at end of file diff --git a/test/test_rule_graph.sh b/test/test_rule_graph.sh index 3b0b235742ff1912fa22bb20384878b14fcf6cd5..66ca559eaa4c32d6e004d25ab157b67eb027cc32 100755 --- a/test/test_rule_graph.sh +++ b/test/test_rule_graph.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --rulegraph \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/rule_graph_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_quantify.svg" + | dot -Tsvg > "../images/rule_graph_.svg" diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh index e59a3f497f24ef06d1a2e5a57a678a1e914f1487..8c87878a9efc84389954fcbb8fa943d0b38ebdf8 100755 --- a/test/test_workflow_local.sh +++ b/test/test_workflow_local.sh @@ -5,6 +5,7 @@ cleanup () { rc=$? cd $user_dir echo "Exit status: $rc" + rm -rf .snakemake } trap cleanup EXIT @@ -16,56 +17,21 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --cores 4 \ --use-singularity \ --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ --printshellcmds \ --rerun-incomplete \ --verbose -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; diff --git a/test/test_workflow_slurm.sh b/test/test_workflow_slurm.sh index 0c50de3921ca1a60f8f3944da6557a312c031a76..a1e19254c2c2c003ed64c460e41fa1f486bdaabd 100755 --- a/test/test_workflow_slurm.sh +++ b/test/test_workflow_slurm.sh @@ -21,56 +21,10 @@ mkdir -p logs/cluster/{homo_sapiens/chrY,test_lib} mkdir -p logs/local/{homo_sapiens/chrY,test_lib} mkdir -p results/{homo_sapiens/chrY,test_lib} -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ + --snakefile="../workflow/Snakefile" \ --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ --cluster-config="cluster.json" \ --cluster "sbatch \ --cpus-per-task={cluster.threads} \ @@ -84,29 +38,15 @@ snakemake \ --jobscript="../jobscript.sh" \ --jobs=20 \ --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ + --singularity-args="--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ --verbose -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" - -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..9467cd318d8c6c696450898b17618f2692d3f626 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,108 @@ +############################################################################### +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Snakemake workflow to: +# - download and prepare the necessary files for smallRNA-seq related workflows. +# - map small RNA-seq reads (e.g. from miRNA sequencing libraries) +# - quantify miRNAs, including isomiRs, from miRNA-seq alignments. +# +############################################################################### +# +# USAGE: +# snakemake \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --cores 4 \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ + +import os + +############################################################################### +### Including subworkflows +############################################################################### + +include: os.path.join("rules" , "prepare.smk") +include: os.path.join("rules" , "map.smk") +include: os.path.join("rules" , "quantify.smk") + +############################################################################### +### Finish rule +############################################################################### + + +rule finish: + input: + idx_transcriptome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx", + ), + organism=config["organism"], + ), + idx_genome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "genome_index_segemehl.idx", + ), + organism=config["organism"], + ), + exons=expand( + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed", + ), + organism=config["organism"], + ), + header=expand( + os.path.join( + config["output_dir"], + "{organism}", + "headerOfCollapsedFasta.sam", + ), + organism=config["organism"], + ), + mirnafilt=expand( + os.path.join( + config["output_dir"], + "{organism}", + "mirna_filtered.bed", + ), + organism=config["organism"], + ), + isomirs=expand( + os.path.join( + config["output_dir"], + "{organism}", + "isomirs_annotation.bed", + ), + organism=config["organism"], + ), + maps=expand( + os.path.join( + config["output_dir"], + "{sample}", + "convertedSortedMappings_{sample}.bam.bai", + ), + sample=config["sample"], + ), + table1=expand( + os.path.join( + config["output_dir"], + "TABLES", + "counts.{mir}.tab" + ), + mir=config["mir_list"], + ), + table2=os.path.join( + config["output_dir"], + "TABLES", + "counts.isomirs.tab" + ), \ No newline at end of file diff --git a/workflow/config.yaml b/workflow/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25e52d7e71ae759c66091653bafbc978177d22a3 --- /dev/null +++ b/workflow/config.yaml @@ -0,0 +1,106 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +map_input_dir: "path/to/map_input_directory" # For the mapping worflow +quantify_input_dir: "path/to/quantify_input_directory" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Inputs information +sample: ["sample_1", "sample_2"] # put all sample names, separated by comma + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + +# List of inputs +organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"] +# this string specifies a path, and the "/" is important for this +# "pre" specifies the assembly version + + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +org/pre: # One section for each list item in "organism"; entry should match precisely what +# is in the "organism" section above, one entry per list item above, omitting the "" + # URLs to genome, gene & miRNA annotations + genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" + mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style + # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: # FTP/HTTP URL to mapping table + # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 # Column number from input file where to change chromosome name + delimiter: "TAB" # Delimiter of the input file + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# Tool parameters: adapter removal +error_rate: 0.1 # fraction of allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# Tool parameters: mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +sample_1: # one section per list item in "sample"; names have to match + adapter: "XXXXXXXXXXXXXXXXXXXX" # 3' adapter sequence to trim + format: "fa" # file format; currently supported: "fa" + + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +# Remove miRNA types you are not interested in +mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "path/to/mirna_filtered.bed" +isomirs_anno: "path/to/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/workflow/rules/.gitkeep b/workflow/rules/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflow/map/Snakefile b/workflow/rules/map.smk similarity index 98% rename from workflow/map/Snakefile rename to workflow/rules/map.smk index aace1f1eee65ee34d8ef121efb4dbff15cad0d0f..f5ef0401792744f59e004702b8a97cd2ca66cdc3 100644 --- a/workflow/map/Snakefile +++ b/workflow/rules/map.smk @@ -4,22 +4,33 @@ # # Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). ############################################################################### - +# +# USAGE: +# snakemake \ +# --snakefile="map.smk" \ +# -- cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, - + finish_map, ############################################################################### ### Finish rule ############################################################################### -rule finish: +rule finish_map: input: maps=expand( os.path.join( @@ -30,7 +41,6 @@ rule finish: sample=config["sample"], ), - ############################################################################### ### Uncompress fastq files ############################################################################### @@ -38,7 +48,7 @@ rule finish: rule uncompress_zipped_files: input: - reads=os.path.join(config["input_dir"], "{sample}.{format}.gz"), + reads=os.path.join(config["map_input_dir"], "{sample}.{format}.gz"), output: reads=os.path.join( config["output_dir"], "{sample}", "{format}", "reads.{format}" diff --git a/workflow/prepare/Snakefile b/workflow/rules/prepare.smk similarity index 96% rename from workflow/prepare/Snakefile rename to workflow/rules/prepare.smk index c16afb8ef831caace8b8e062efe53f7c1667eebe..b254a908a11c28c85b17d86bb467ebf271978727 100644 --- a/workflow/prepare/Snakefile +++ b/workflow/rules/prepare.smk @@ -2,17 +2,30 @@ # (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel # (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu # -# Snakemake workflow to download and prepare the necessary files for -# smallRNA-seq related workflows. +# Snakemake workflow to download and prepare the necessary files +# for smallRNA-seq related workflows. +# ############################################################################### - +# +# USAGE: +# snakemake \ +# --snakefile="prepare.smk" \ +# --cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, + finish_prepare, genome_process, filter_anno_gtf, mirna_anno, @@ -24,7 +37,7 @@ localrules: ############################################################################### -rule finish: +rule finish_prepare: input: idx_transcriptome=expand( os.path.join( @@ -43,7 +56,11 @@ rule finish: organism=config["organism"], ), exons=expand( - os.path.join(config["output_dir"], "{organism}", "exons.bed"), + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed", + ), organism=config["organism"], ), header=expand( @@ -56,23 +73,26 @@ rule finish: ), mirnafilt=expand( os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" + config["output_dir"], + "{organism}", + "mirna_filtered.bed", ), organism=config["organism"], ), isomirs=expand( os.path.join( - config["output_dir"], "{organism}", "isomirs_annotation.bed" + config["output_dir"], + "{organism}", + "isomirs_annotation.bed", ), organism=config["organism"], ), + ############################################################################### ### Download and process genome IDs ############################################################################### - - rule genome_process: input: script=os.path.join(config["scripts_dir"], "genome_process.sh"), @@ -582,7 +602,7 @@ rule filter_mature_mirs: ############################################################################### -### Create isomirs annotation file from mature miRNA +### Create isomirs annotation file from mature miRNA ############################################################################### @@ -728,4 +748,4 @@ rule iso_anno_final: singularity: "docker://zavolab/ubuntu:18.04" shell: - "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" + "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" \ No newline at end of file diff --git a/workflow/quantify/Snakefile b/workflow/rules/quantify.smk similarity index 93% rename from workflow/quantify/Snakefile rename to workflow/rules/quantify.smk index b30667d052b20cf60c8c3d002482b0f7122a0d6e..6eabb23b2b3b6c3f776e7d2b7e5193eab16f4612 100644 --- a/workflow/quantify/Snakefile +++ b/workflow/rules/quantify.smk @@ -4,14 +4,27 @@ # # Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments. ############################################################################### +# +# USAGE: +# snakemake \ +# --snakefile="quanitfy.smk" \ +# --cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ import os +configfile: "config.yaml" # Rules that require internet connection for downloading files are included # in the localrules localrules: - finish, + finish_quantify, ############################################################################### @@ -19,17 +32,22 @@ localrules: ############################################################################### -rule finish: +rule finish_quantify: input: table1=expand( - os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"), + os.path.join( + config["output_dir"], + "TABLES", + "counts.{mir}.tab", + ), mir=config["mir_list"], ), table2=os.path.join( - config["output_dir"], "TABLES", "counts.isomirs.tab" + config["output_dir"], + "TABLES", + "counts.isomirs.tab", ), - ############################################################################### ### BAM to BED ############################################################################### @@ -38,7 +56,7 @@ rule finish: rule bamtobed: input: alignment=os.path.join( - config["input_dir"], + config["quantify_input_dir"], "{sample}", "convertedSortedMappings_{sample}.bam", ), @@ -314,4 +332,4 @@ rule merge_tables: --output_file {output.table} \ --prefix {params.prefix} \ --verbose \ - ) &> {log}" + ) &> {log}" \ No newline at end of file