diff --git a/RUNS/JOB/cluster.json b/RUNS/JOB/cluster.json new file mode 100644 index 0000000000000000000000000000000000000000..0fbb1efd4a305d24925d93bca35cece3bebca118 --- /dev/null +++ b/RUNS/JOB/cluster.json @@ -0,0 +1,68 @@ +{ + "__default__" : + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "4G" + }, + + "cutadapt": + { + "threads":"{resources.threads}" + }, + + "mapping_genome_segemehl": + { + "mem":"{resources.mem}G" + }, + + "mapping_transcriptome_segemehl": + { + "mem":"{resources.mem}G" + }, + + "mapping_genome_oligomap": + { + "mem":"{resources.mem}G" + }, + + "mapping_transcriptome_oligomap": + { + "mem":"{resources.mem}G" + }, + + "sort_transcriptome_oligomap": + { + "threads":"{resources.threads}" + }, + + "sort_genome_oligomap": + { + "threads":"{resources.threads}" + }, + + "remove_inferiors": + { + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "generate_segemehl_index_transcriptome": + { + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "generate_segemehl_index_genome": + { + "threads":"{resources.threads}", + "mem":"{resources.mem}G" + }, + + "sort_alignment": + { + "mem":"{resources.mem}G", + "threads":"{resources.threads}" + } +} diff --git a/RUNS/JOB/input_files/samples_table.csv b/RUNS/JOB/input_files/samples_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..1e190fe1e712afb83a39e7b8cd92e3082fb2c7b2 --- /dev/null +++ b/RUNS/JOB/input_files/samples_table.csv @@ -0,0 +1,2 @@ +sample sample_file adapter format +sample_lib path/to/sample_library_file XXXXXXXXXXXXXXXXXX library_format diff --git a/RUNS/JOB/run_workflow_local.sh b/RUNS/JOB/run_workflow_local.sh new file mode 100755 index 0000000000000000000000000000000000000000..5580f1c9dd6d053e24e5026f0d4d80b476258d65 --- /dev/null +++ b/RUNS/JOB/run_workflow_local.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run test +snakemake \ + --snakefile="../../workflow/Snakefile" \ + --cores 4 \ + --use-singularity \ + --singularity-args "--bind ${PWD}/../" \ + --printshellcmds \ + --rerun-incomplete \ + --verbose + + +# Snakemake report +snakemake \ + --snakefile="../../workflow/Snakefile" \ + --report="snakemake_report.html" diff --git a/RUNS/JOB/run_workflow_slurm.sh b/RUNS/JOB/run_workflow_slurm.sh new file mode 100755 index 0000000000000000000000000000000000000000..277ed9bfe3f2c2bdac49880eadf8bb4588f21c2d --- /dev/null +++ b/RUNS/JOB/run_workflow_slurm.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Have to match directories indicated in config.yaml files +mkdir -p logs/cluster/{homo_sapiens/chrY,test_lib} +mkdir -p logs/local/{homo_sapiens/chrY,test_lib} +mkdir -p results/{homo_sapiens/chrY,test_lib} + +# Run test +snakemake \ + --snakefile="../../workflow/Snakefile" \ + --cores=256 \ + --cluster-config="cluster.json" \ + --cluster "sbatch \ + --cpus-per-task={cluster.threads} \ + --mem={cluster.mem} \ + --qos={cluster.queue} \ + --time={cluster.time} \ + --export=JOB_NAME={rule} \ + -o {params.cluster_log} \ + -p scicore \ + --open-mode=append" \ + --jobscript="../../jobscript.sh" \ + --jobs=20 \ + --use-singularity \ + --singularity-args="--bind ${PWD}/../" \ + --printshellcmds \ + --rerun-incomplete \ + --verbose + +# Snakemake report +snakemake \ + --snakefile="../../workflow/Snakefile" \ + --report="snakemake_report.html" diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7a3476f68a2d5baa06dbdc3fe5eea4be9f9a75b --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,91 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +samples: "test_files/samples_table.csv" # For the mapping worflow +quantify_input_dir: "results" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [-1, 0, +1] +bp_3p: [-1, 0, +1] + + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +organism: "homo_sapiens/chrY" + +homo_sapiens/chrY: + # URLs to genome, gene & miRNA annotations + genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" + gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" + mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 + delimiter: "TAB" + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +# All of these are produced by the "prepare" workflow +genome: "results/homo_sapiens/chrY/genome.processed.fa" +gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" +transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" +genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" +exons: "results/homo_sapiens/chrY/exons.bed" +header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" + + +#### TOOL PARAMETERS #### + +# quality filter +q_value: 10 +p_value: 50 + +# adapter removal +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 + +# mapping +max_length_reads: 30 +nh: 100 + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] +mir_list: ["miRNA", "miRNA_primary_transcript"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" +isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/config/config_template.yaml b/config/config_template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62b061a4df558f67d29dad5a1a878d1dfa083456 --- /dev/null +++ b/config/config_template.yaml @@ -0,0 +1,98 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +samples: "test_files/samples_table.csv" # For the mapping worflow +quantify_input_dir: "results" # For the quantify worflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation file +# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. +bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +organism: "org/pre" # e.g., "homo_sapiens/GRCh38.100" or "mus_musculus/GRCm37.98" +# this string specifies a path, and the "/" is important for this +# "pre" specifies the assembly version + + +org/pre: # One section for each list item in "organism"; entry should match precisely what +# is in the "organism" section above, one entry per list item above, omitting the "" + # URLs to genome, gene & miRNA annotations + genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" + mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style + # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: # FTP/HTTP URL to mapping table + # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 # Column number from input file where to change chromosome name + delimiter: "TAB" # Delimiter of the input file + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +#### TOOL PARAMETERS #### + +# quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# adapter removal +error_rate: 0.1 # fraction of allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +# Remove miRNA types you are not interested in +mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "path/to/mirna_filtered.bed" +isomirs_anno: "path/to/isomirs_annotation.bed" +... \ No newline at end of file diff --git a/test/config.yaml b/test/config.yaml index 98b4422dbfc52ad49a6d237b9b840aa0650b34ae..c5e23a221a244c731142db7fe29cb508cd75bbad 100644 --- a/test/config.yaml +++ b/test/config.yaml @@ -3,16 +3,13 @@ # Directories # Usually there is no need to change these -map_input_dir: "test_files" # For the mapping worflow +samples: "test_files/samples_table.csv" # For the mapping worflow quantify_input_dir: "results" # For the quantify worflow output_dir: "results" scripts_dir: "../scripts" local_log: "logs/local" cluster_log: "logs/cluster" -# Inputs information -sample: ["test_lib"] - ####################################################################################################### #### #### PREPARE PARAMETERS @@ -24,11 +21,11 @@ sample: ["test_lib"] bp_5p: [-1, 0, +1] bp_3p: [-1, 0, +1] -# List of inputs -organism: ["homo_sapiens/chrY"] #### PARAMETERS SPECIFIC TO INPUTS ##### +organism: "homo_sapiens/chrY" + homo_sapiens/chrY: # URLs to genome, gene & miRNA annotations genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" @@ -48,8 +45,7 @@ homo_sapiens/chrY: #### ####################################################################################################### -# Resources: genome, transcriptome, genes, miRs -# All of these are produced by the "prepare" workflow +#### PARAMETERS SPECIFIC TO INPUTS ##### genome: "results/homo_sapiens/chrY/genome.processed.fa" gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" @@ -59,27 +55,23 @@ genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" exons: "results/homo_sapiens/chrY/exons.bed" header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" -# Tool parameters: quality filter + +#### TOOL PARAMETERS #### + +# quality filter q_value: 10 p_value: 50 -# Tool parameters: adapter removal +# adapter removal error_rate: 0.1 minimum_length: 15 overlap: 3 max_n: 0 -# Tool parameters: mapping +# mapping max_length_reads: 30 nh: 100 - -#### PARAMETERS SPECIFIC TO INPUTS #### - -test_lib: - adapter: "AACTGTAGGCACCATCAAT" - format: "fa" - ####################################################################################################### #### #### QUANTIFY PARAMETERS diff --git a/test/test_files/samples_table.csv b/test/test_files/samples_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..efbdfab904da4bb8511549a6db143b5a359eaa4c --- /dev/null +++ b/test/test_files/samples_table.csv @@ -0,0 +1,2 @@ +sample sample_file adapter format +test_lib ../test/test_files/test_lib.fa.gz AACTGTAGGCACCATCAAT fa diff --git a/workflow/Snakefile b/workflow/Snakefile index 9467cd318d8c6c696450898b17618f2692d3f626..c8c544662ede821ac314a0e65625cd9c70ed6a45 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,6 +21,16 @@ ################################################################################ import os +import pandas as pd + +configfile: "../config/config.yaml" + +############################################################################### +### Global configuration +############################################################################### + +localrules: + finish, ############################################################################### ### Including subworkflows @@ -33,8 +43,6 @@ include: os.path.join("rules" , "quantify.smk") ############################################################################### ### Finish rule ############################################################################### - - rule finish: input: idx_transcriptome=expand( @@ -91,7 +99,7 @@ rule finish: "{sample}", "convertedSortedMappings_{sample}.bam.bai", ), - sample=config["sample"], + sample=pd.unique(samples_table.index.values), ), table1=expand( os.path.join( diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index f5ef0401792744f59e004702b8a97cd2ca66cdc3..414f16f7564cbfdfa80f4e2777e9f9af0bfad332 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -5,24 +5,62 @@ # Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). ############################################################################### # -# USAGE: +# USAGE (from the file's directory): +# # snakemake \ # --snakefile="map.smk" \ -# -- cores 4 \ +# --cores 4 \ # --use-singularity \ -# --singularity-args "--bind $PWD/../" \ +# --singularity-args "--bind $PWD/../" \ # --printshellcmds \ # --rerun-incomplete \ # --verbose # +# IMPORTANT when executing this file alone: +## * You must modify the config.yaml. +## * Uncomment the configfile line. ################################################################################ + import os +import pandas as pd + +#configfile: "../../config/config.yaml" + +############################################################################### +### Reading sample and resources tables +############################################################################### + +samples_table = pd.read_csv( + config["samples"], + header = 0, + index_col = 0, + comment = "#", + engine = "python", + sep = "\t", +) + +############################################################################### +### Funcitons get_sample and get_resource +############################################################################### +# Function to get relevant per sample information from samples and resources table -configfile: "config.yaml" +def get_sample(column_id, sample_id = None): + if sample_id: + return str( + samples_table[column_id][samples_table.index == sample_id][0] + ) + else: + return str( + samples_table[column_id][0] + ) +############################################################################### +### Global configuration +############################################################################### # Rules that require internet connection for downloading files are included # in the localrules localrules: + start, finish_map, ############################################################################### @@ -38,21 +76,28 @@ rule finish_map: "{sample}", "convertedSortedMappings_{sample}.bam.bai", ), - sample=config["sample"], + sample=pd.unique(samples_table.index.values), ), + + ############################################################################### -### Uncompress fastq files +### Start rule (get samples) ############################################################################### - -rule uncompress_zipped_files: +rule start: input: - reads=os.path.join(config["map_input_dir"], "{sample}.{format}.gz"), + reads=lambda wildcards: expand( + pd.Series(samples_table.loc[wildcards.sample, "sample_file"]).values, + format=get_sample("format"), + ), output: reads=os.path.join( - config["output_dir"], "{sample}", "{format}", "reads.{format}" - ), + config["output_dir"], + "{sample}", + "{format}", + "reads.{format}", + ) params: cluster_log=os.path.join( config["cluster_log"], @@ -139,7 +184,7 @@ rule fasta_formatter: reads=lambda wildcards: os.path.join( config["output_dir"], wildcards.sample, - config[wildcards.sample]["format"], + get_sample("format", wildcards.sample), "reads.fa", ), output: @@ -170,7 +215,7 @@ rule cutadapt: cluster_log=os.path.join( config["cluster_log"], "cutadapt_{sample}.log" ), - adapter=lambda wildcards: config[wildcards.sample]["adapter"], + adapter=lambda wildcards: get_sample("adapter", wildcards.sample), error_rate=config["error_rate"], minimum_length=config["minimum_length"], overlap=config["overlap"], diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index b254a908a11c28c85b17d86bb467ebf271978727..2e4c1913b58171b79d95f75b01ec9c9571107a7b 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -7,20 +7,24 @@ # ############################################################################### # -# USAGE: +# USAGE (from the file's directory): +# # snakemake \ # --snakefile="prepare.smk" \ # --cores 4 \ # --use-singularity \ -# --singularity-args "--bind $PWD/../" \ +# --singularity-args "--bind $PWD/../" \ # --printshellcmds \ # --rerun-incomplete \ # --verbose # +# IMPORTANT when executing this file alone: +## * You must modify the config.yaml. +## * Uncomment the configfile line. ################################################################################ import os -configfile: "config.yaml" +#configfile: "../../config/config.yaml" # Rules that require internet connection for downloading files are included # in the localrules @@ -35,8 +39,6 @@ localrules: ############################################################################### ### Finish rule ############################################################################### - - rule finish_prepare: input: idx_transcriptome=expand( diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 6eabb23b2b3b6c3f776e7d2b7e5193eab16f4612..90fd57223174711bf0d854f3693c2defdf4e0364 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -5,7 +5,8 @@ # Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments. ############################################################################### # -# USAGE: +# USAGE (from the file's directory): +# # snakemake \ # --snakefile="quanitfy.smk" \ # --cores 4 \ @@ -15,11 +16,28 @@ # --rerun-incomplete \ # --verbose # +# IMPORTANT when executing this file alone: +## * You must modify the config.yaml. +## * Uncomment the configfile line. ################################################################################ import os +import pandas as pd + +#configfile: "../../config/config.yaml" + +############################################################################### +### Reading samples' table +############################################################################### -configfile: "config.yaml" +samples_table = pd.read_csv( + config["samples"], + header = 0, + index_col = 0, + comment = "#", + engine = "python", + sep = "\t", +) # Rules that require internet connection for downloading files are included # in the localrules @@ -309,7 +327,7 @@ rule merge_tables: os.path.join( config["output_dir"], "TABLES", "{mir}_counts_{sample}" ), - sample=config["sample"], + sample=pd.unique(samples_table.index.values), mir=config["mir_list"], ), script=os.path.join(config["scripts_dir"], "merge_tables.R"),