diff --git a/config/config.yaml b/config/config.yaml index c7a3476f68a2d5baa06dbdc3fe5eea4be9f9a75b..ea03c216342e4feaf862ee86f2eb84b3a32ca402 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,91 +1,55 @@ --- -#### GLOBAL PARAMETERS ##### - -# Directories -# Usually there is no need to change these -samples: "test_files/samples_table.csv" # For the mapping worflow -quantify_input_dir: "results" # For the quantify worflow -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -####################################################################################################### -#### -#### PREPARE PARAMETERS -#### -####################################################################################################### - -# Isomirs annotation -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [-1, 0, +1] -bp_3p: [-1, 0, +1] - - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -organism: "homo_sapiens/chrY" - -homo_sapiens/chrY: - # URLs to genome, gene & miRNA annotations - genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" - gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" - mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" - - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 - delimiter: "TAB" - -####################################################################################################### -#### -#### MAP PARAMETERS -#### -####################################################################################################### - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -# All of these are produced by the "prepare" workflow -genome: "results/homo_sapiens/chrY/genome.processed.fa" -gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" -transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" -genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" -exons: "results/homo_sapiens/chrY/exons.bed" -header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" - - -#### TOOL PARAMETERS #### +############################# +#### REQUIRED PARAMETERS #### +############################# + +samples: test_files/samples_table.csv + +#### GENOME RESOURCES #### + + +genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" +gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" +mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" +map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + + + +############################### +#### "OPTIONAL" PARAMETERS #### +############################### + +#### DIRECTORIES ##### + +output_dir: results/ +local_log: logs/local/ +cluster_log: logs/cluster/ +scripts_dir: ../scripts/ + + +#### ISOMIR GENERATION PARAMETERS #### + +bp_5p: [-2, -1, 0, +1, +2] +bp_3p: [-2, -1, 0, +1, +2] + + +#### PROCESSING PARAMETERS #### # quality filter -q_value: 10 -p_value: 50 +q_value: 10 +p_value: 50 # adapter removal -error_rate: 0.1 -minimum_length: 15 -overlap: 3 -max_n: 0 +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 # mapping -max_length_reads: 30 -nh: 100 +max_length_reads: 30 +nh: 100 -####################################################################################################### -#### -#### QUANTIFY PARAMETERS -#### -####################################################################################################### +#### QUANTIFICATION PARAMETERS #### - -# Types of miRNAs to quantify -#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] mir_list: ["miRNA", "miRNA_primary_transcript"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" -isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" ... \ No newline at end of file diff --git a/config/config_template.yaml b/config/config_template.yaml index 62b061a4df558f67d29dad5a1a878d1dfa083456..0a2afb77a20aa920270a716f08bc50f70011370f 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -1,98 +1,84 @@ --- -#### GLOBAL PARAMETERS ##### - -# Directories -# Usually there is no need to change these -samples: "test_files/samples_table.csv" # For the mapping worflow -quantify_input_dir: "results" # For the quantify worflow -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" -####################################################################################################### -#### -#### PREPARE PARAMETERS -#### -####################################################################################################### - -# Isomirs annotation file -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts -bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts - - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -organism: "org/pre" # e.g., "homo_sapiens/GRCh38.100" or "mus_musculus/GRCm37.98" -# this string specifies a path, and the "/" is important for this -# "pre" specifies the assembly version - - -org/pre: # One section for each list item in "organism"; entry should match precisely what -# is in the "organism" section above, one entry per list item above, omitting the "" - # URLs to genome, gene & miRNA annotations - genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style - # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" - gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style - # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" - mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style - # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" - - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: # FTP/HTTP URL to mapping table - # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 # Column number from input file where to change chromosome name - delimiter: "TAB" # Delimiter of the input file - -####################################################################################################### -#### -#### MAP PARAMETERS -#### -####################################################################################################### - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -# Resources: genome, transcriptome, genes, miRs -# All of these are produced by the "prepare" workflow -genome: "path/to/genome.processed.fa" -gtf: "path/to/gene_annotations.filtered.gtf" -transcriptome: "path/to/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" -genome_index_segemehl: "path/to/genome_index_segemehl.idx" -exons: "path/to/exons.bed" -header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" - -#### TOOL PARAMETERS #### +############################# +#### REQUIRED PARAMETERS #### +############################# + +# All paths are relative to the current working directory unless noted otherwise + +samples: path/to/samples_table.csv + +#### GENOME RESOURCES ##### + +# All genome resources have to match the source/organism +# of all samples in the sample table + +# FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style +genome_url: # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + +# FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style +gtf_url: # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" + +# FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style +mirna_url: # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + +# Tab-separated mappings table between UCSC (column 1) +# and Ensembl (coulm 2) chromosome names +# Available at: https://github.com/dpryan79/ChromosomeMappings +map_chr_url: # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + + +############################### +#### "OPTIONAL" PARAMETERS #### +############################### + +# The below parameters only need to be changed if the default behavior of +# MIRFLOWZ is to be changed; however, they still need to be present! + +#### DIRECTORIES #### + +output_dir: results/ +local_log: logs/local/ +cluster_log: logs/cluster/ +scripts_dir: ../scripts/ + + +#### ISOMIR GENERATION PARAMETERS #### + +# Generate isomiR annotations with the indicated number of shifts relative to +# the start and end position of each annotated mature miRNA, as an array of +# relative positions +# Examples: +# - `bp_5p: [-2,0,+1]` and `bp_3p: [+1]` generates 3 isomiRs for each mature +# miRNA: one that starts two nucleotides before, one that starts exactly at +# and one that starts one nucleotide after the annotated mature miRNA; all +# isomiRs will stop one nucleotide after the end of the annotated mature +# miRNA; note that because `0` is not included in the `bp_3p` array, the +# annotated mature miRNAs will not be included in this example +# - Use `bp_5p: [0]` and `bp_3p: [0]` to only include the mature annotated +# miRNAs and no isomiRs + +bp_5p: [-2, -1, 0, +1, +2] +bp_3p: [-2, -1, 0, +1, +2] + +#### PROCESSING PARAMETERS #### # quality filter q_value: 10 # Q (Phred) score; minimum quality score to keep p_value: 50 # minimum % of bases that must have Q quality # adapter removal -error_rate: 0.1 # fraction of allowed errors +error_rate: 0.1 # fraction of allowed errors minimum_length: 15 # discard processed reads shorter than the indicated length -overlap: 3 # minimum overlap length of adapter and read to trim the bases -max_n: 0 # discard reads containing more than the indicated number of N bases +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases # mapping max_length_reads: 30 # maximum length of processed reads to map with oligomap -nh: 100 # discard reads with more mappings than the indicated number - -####################################################################################################### -#### -#### QUANTIFY PARAMETERS -#### -####################################################################################################### +nh: 100 # discard reads with more mappings than the indicated number +#### QUANTIFICATION PARAMETERS #### # Types of miRNAs to quantify # Remove miRNA types you are not interested in mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "path/to/mirna_filtered.bed" -isomirs_anno: "path/to/isomirs_annotation.bed" -... \ No newline at end of file +... diff --git a/test/config.yaml b/test/config.yaml index c5e23a221a244c731142db7fe29cb508cd75bbad..9308d853d0a6cdf0cc9cefc91034087017e9a726 100644 --- a/test/config.yaml +++ b/test/config.yaml @@ -1,90 +1,53 @@ --- -#### GLOBAL PARAMETERS ##### +############################# +#### REQUIRED PARAMETERS #### +############################# -# Directories -# Usually there is no need to change these -samples: "test_files/samples_table.csv" # For the mapping worflow -quantify_input_dir: "results" # For the quantify worflow -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" +samples: test_files/samples_table.csv -####################################################################################################### -#### -#### PREPARE PARAMETERS -#### -####################################################################################################### +#### GENOME RESOURCES #### -# Isomirs annotation -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. -bp_5p: [-1, 0, +1] -bp_3p: [-1, 0, +1] +genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" +gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" +mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" +map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" -#### PARAMETERS SPECIFIC TO INPUTS ##### +############################### +#### "OPTIONAL" PARAMETERS #### +############################### -organism: "homo_sapiens/chrY" +#### DIRECTORIES ##### -homo_sapiens/chrY: - # URLs to genome, gene & miRNA annotations - genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" - gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" - mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" +output_dir: results/ +local_log: logs/local/ +cluster_log: logs/cluster/ +scripts_dir: ../scripts/ - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 - delimiter: "TAB" -####################################################################################################### -#### -#### MAP PARAMETERS -#### -####################################################################################################### +#### ISOMIR GENERATION PARAMETERS #### -#### PARAMETERS SPECIFIC TO INPUTS ##### +bp_5p: [-2, -1, 0, +1, +2] +bp_3p: [-2, -1, 0, +1, +2] -genome: "results/homo_sapiens/chrY/genome.processed.fa" -gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" -transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" -genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" -exons: "results/homo_sapiens/chrY/exons.bed" -header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" - -#### TOOL PARAMETERS #### +#### PROCESSING PARAMETERS #### # quality filter -q_value: 10 -p_value: 50 +q_value: 10 +p_value: 50 # adapter removal -error_rate: 0.1 -minimum_length: 15 -overlap: 3 -max_n: 0 +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 # mapping -max_length_reads: 30 -nh: 100 - -####################################################################################################### -#### -#### QUANTIFY PARAMETERS -#### -####################################################################################################### +max_length_reads: 30 +nh: 100 +#### QUANTIFICATION PARAMETERS #### -# Types of miRNAs to quantify -#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] mir_list: ["miRNA", "miRNA_primary_transcript"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" -isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" ... \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index c8c544662ede821ac314a0e65625cd9c70ed6a45..83fb2c5afe28ad765e81c06b3979865f1daaa207 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -23,7 +23,7 @@ import os import pandas as pd -configfile: "../config/config.yaml" +configfile: "config.yaml" ############################################################################### ### Global configuration @@ -45,53 +45,29 @@ include: os.path.join("rules" , "quantify.smk") ############################################################################### rule finish: input: - idx_transcriptome=expand( - os.path.join( + idx_transcriptome=os.path.join( config["output_dir"], - "{organism}", "transcriptome_index_segemehl.idx", - ), - organism=config["organism"], ), - idx_genome=expand( - os.path.join( + idx_genome=os.path.join( config["output_dir"], - "{organism}", "genome_index_segemehl.idx", - ), - organism=config["organism"], ), - exons=expand( - os.path.join( + exons=os.path.join( config["output_dir"], - "{organism}", "exons.bed", - ), - organism=config["organism"], ), - header=expand( - os.path.join( + header=os.path.join( config["output_dir"], - "{organism}", "headerOfCollapsedFasta.sam", - ), - organism=config["organism"], ), - mirnafilt=expand( - os.path.join( + mirnafilt=os.path.join( config["output_dir"], - "{organism}", "mirna_filtered.bed", - ), - organism=config["organism"], ), - isomirs=expand( - os.path.join( + isomirs=os.path.join( config["output_dir"], - "{organism}", "isomirs_annotation.bed", - ), - organism=config["organism"], ), maps=expand( os.path.join( diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index 414f16f7564cbfdfa80f4e2777e9f9af0bfad332..659b301477b18b0cbe8bca8b2705428cc9379cc1 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -268,8 +268,8 @@ rule fastx_collapser: rule mapping_genome_segemehl: input: reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - genome=config["genome"], - genome_index_segemehl=config["genome_index_segemehl"], + genome=os.path.join(config["output_dir"], "genome.processed.fa"), + genome_index_segemehl=os.path.join(config["output_dir"], "genome_index_segemehl.idx"), output: gmap=os.path.join( config["output_dir"], "{sample}", "segemehlGenome_map.sam" @@ -306,8 +306,8 @@ rule mapping_genome_segemehl: rule mapping_transcriptome_segemehl: input: reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - transcriptome=config["transcriptome"], - transcriptome_index_segemehl=config["transcriptome_index_segemehl"], + transcriptome=os.path.join(config["output_dir"], "transcriptome_idtrim.fa"), + transcriptome_index_segemehl=os.path.join(config["output_dir"], "transcriptome_index_segemehl.idx"), output: tmap=os.path.join( config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam" @@ -379,7 +379,7 @@ rule mapping_genome_oligomap: reads=os.path.join( config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" ), - target=config["genome"], + target=os.path.join(config["output_dir"], "genome.processed.fa"), output: gmap=os.path.join( config["output_dir"], "{sample}", "oligoGenome_map.fa" @@ -496,7 +496,7 @@ rule mapping_transcriptome_oligomap: reads=os.path.join( config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" ), - target=config["transcriptome"], + target=os.path.join(config["output_dir"], "transcriptome_idtrim.fa"), output: tmap=os.path.join( config["output_dir"], "{sample}", "oligoTranscriptome_map.fa" @@ -808,7 +808,7 @@ rule trans_to_gen: "noheader_TranscriptomeMappings.sam", ), script=os.path.join(config["scripts_dir"], "sam_trx_to_sam_gen.pl"), - exons=config["exons"], + exons=os.path.join(config["output_dir"], "exons.bed"), output: genout=os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"), params: @@ -861,7 +861,7 @@ rule cat_mapping: rule add_header: input: - header=config["header_of_collapsed_fasta"], + header=os.path.join(config["output_dir"], "headerOfCollapsedFasta.sam"), catmaps=os.path.join( config["output_dir"], "{sample}", "catMappings.sam" ), diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index 2e4c1913b58171b79d95f75b01ec9c9571107a7b..2294d3f8d9631c7b754d50b4713069db950f0a06 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -41,53 +41,30 @@ localrules: ############################################################################### rule finish_prepare: input: - idx_transcriptome=expand( - os.path.join( + idx_transcriptome=os.path.join( config["output_dir"], - "{organism}", "transcriptome_index_segemehl.idx", - ), - organism=config["organism"], + ), - idx_genome=expand( - os.path.join( + idx_genome=os.path.join( config["output_dir"], - "{organism}", "genome_index_segemehl.idx", - ), - organism=config["organism"], ), - exons=expand( - os.path.join( + exons=os.path.join( config["output_dir"], - "{organism}", "exons.bed", - ), - organism=config["organism"], ), - header=expand( - os.path.join( + header=os.path.join( config["output_dir"], - "{organism}", "headerOfCollapsedFasta.sam", - ), - organism=config["organism"], ), - mirnafilt=expand( - os.path.join( + mirnafilt=os.path.join( config["output_dir"], - "{organism}", "mirna_filtered.bed", - ), - organism=config["organism"], ), - isomirs=expand( - os.path.join( + isomirs=os.path.join( config["output_dir"], - "{organism}", "isomirs_annotation.bed", - ), - organism=config["organism"], ), @@ -100,13 +77,13 @@ rule genome_process: script=os.path.join(config["scripts_dir"], "genome_process.sh"), output: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), params: - url=lambda wildcards: config[wildcards.organism]["genome_url"], - dir_out=os.path.join(config["output_dir"], "{organism}"), + url=config["genome_url"], + dir_out=config["output_dir"], log: - os.path.join(config["local_log"], "{organism}", "genome_process.log"), + os.path.join(config["local_log"], "genome_process.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -124,14 +101,13 @@ rule filter_anno_gtf: output: gtf=os.path.join( config["output_dir"], - "{organism}", "gene_annotations.filtered.gtf", ), params: - url=lambda wildcards: config[wildcards.organism]["gtf_url"], - dir_out=os.path.join(config["output_dir"], "{organism}"), + url=config["gtf_url"], + dir_out=config["output_dir"], log: - os.path.join(config["local_log"], "{organism}", "filter_anno_gtf.log"), + os.path.join(config["local_log"], "filter_anno_gtf.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -146,27 +122,25 @@ rule filter_anno_gtf: rule extract_transcriptome_seqs: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), gtf=os.path.join( config["output_dir"], - "{organism}", "gene_annotations.filtered.gtf", ), output: fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome.fa" + config["output_dir"], "transcriptome.fa" ), params: cluster_log=os.path.join( config["cluster_log"], - "{organism}", "extract_transcriptome_seqs.log", ), log: os.path.join( config["local_log"], - "{organism}", + "extract_transcriptome_seqs.log", ), singularity: @@ -183,19 +157,19 @@ rule extract_transcriptome_seqs: rule trim_fasta: input: fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome.fa" + config["output_dir"], "transcriptome.fa" ), script=os.path.join(config["scripts_dir"], "validation_fasta.py"), output: fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome_idtrim.fa" + config["output_dir"], "transcriptome_idtrim.fa" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "trim_fasta.log" + config["cluster_log"], "trim_fasta.log" ), log: - os.path.join(config["local_log"], "{organism}", "trim_fasta.log"), + os.path.join(config["local_log"], "trim_fasta.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -215,24 +189,21 @@ rule trim_fasta: rule generate_segemehl_index_transcriptome: input: fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome_idtrim.fa" + config["output_dir"], "transcriptome_idtrim.fa" ), output: idx=os.path.join( config["output_dir"], - "{organism}", "transcriptome_index_segemehl.idx", ), params: cluster_log=os.path.join( config["cluster_log"], - "{organism}", "generate_segemehl_index_transcriptome.log", ), log: os.path.join( config["local_log"], - "{organism}", "generate_segemehl_index_transcriptome.log", ), resources: @@ -253,22 +224,20 @@ rule generate_segemehl_index_transcriptome: rule generate_segemehl_index_genome: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), output: idx=os.path.join( - config["output_dir"], "{organism}", "genome_index_segemehl.idx" + config["output_dir"], "genome_index_segemehl.idx" ), params: cluster_log=os.path.join( config["cluster_log"], - "{organism}", "generate_segemehl_index_genome.log", ), log: os.path.join( config["local_log"], - "{organism}", "generate_segemehl_index_genome.log", ), resources: @@ -290,18 +259,17 @@ rule get_exons_gtf: input: gtf=os.path.join( config["output_dir"], - "{organism}", "gene_annotations.filtered.gtf", ), script=os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh"), output: - exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), + exons=os.path.join(config["output_dir"], "exons.gtf"), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "get_exons_gtf.log" + config["cluster_log"], "get_exons_gtf.log" ), log: - os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log"), + os.path.join(config["local_log"], "get_exons_gtf.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -321,16 +289,16 @@ rule get_exons_gtf: rule gtftobed: input: - exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), + exons=os.path.join(config["output_dir"], "exons.gtf"), script=os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R"), output: - exons=os.path.join(config["output_dir"], "{organism}", "exons.bed"), + exons=os.path.join(config["output_dir"], "exons.bed"), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "gtftobed.log" + config["cluster_log"], "gtftobed.log" ), log: - os.path.join(config["local_log"], "{organism}", "gtftobed.log"), + os.path.join(config["local_log"], "gtftobed.log"), singularity: "docker://zavolab/r-zavolab:3.5.1" shell: @@ -349,19 +317,19 @@ rule gtftobed: rule create_header_genome: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), output: header=os.path.join( - config["output_dir"], "{organism}", "headerOfCollapsedFasta.sam" + config["output_dir"], "headerOfCollapsedFasta.sam" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "create_header_genome.log" + config["cluster_log"], "create_header_genome.log" ), log: os.path.join( - config["local_log"], "{organism}", "create_header_genome.log" + config["local_log"], "create_header_genome.log" ), singularity: "docker://zavolab/samtools:1.8" @@ -377,19 +345,19 @@ rule create_header_genome: rule mirna_anno: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), output: anno=os.path.join( - config["output_dir"], "{organism}", "raw", "mirna.gff3" + config["output_dir"], "mirna.gff3" ), params: - anno=lambda wildcards: config[wildcards.organism]["mirna_url"], + anno=config["mirna_url"], cluster_log=os.path.join( - config["cluster_log"], "{organism}", "mirna_anno.log" + config["cluster_log"], "mirna_anno.log" ), log: - os.path.join(config["local_log"], "{organism}", "mirna_anno.log"), + os.path.join(config["local_log"], "mirna_anno.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -404,19 +372,19 @@ rule mirna_anno: rule dict_chr: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), output: map_chr=os.path.join( - config["output_dir"], "{organism}", "UCSC2ensembl.txt" + config["output_dir"], "UCSC2ensembl.txt" ), params: - map_chr=lambda wildcards: config[wildcards.organism]["map_chr_url"], + map_chr=config["map_chr_url"], cluster_log=os.path.join( - config["cluster_log"], "{organism}", "dict_chr.log" + config["cluster_log"], "dict_chr.log" ), log: - os.path.join(config["local_log"], "{organism}", "dict_chr.log"), + os.path.join(config["local_log"], "dict_chr.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -431,24 +399,24 @@ rule dict_chr: rule map_chr_names: input: anno=os.path.join( - config["output_dir"], "{organism}", "raw", "mirna.gff3" + config["output_dir"], "mirna.gff3" ), script=os.path.join(config["scripts_dir"], "map_chromosomes.pl"), map_chr=os.path.join( - config["output_dir"], "{organism}", "UCSC2ensembl.txt" + config["output_dir"], "UCSC2ensembl.txt" ), output: gff=os.path.join( - config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" + config["output_dir"], "mirna_chr_mapped.gff3" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "map_chr_names.log" + config["cluster_log"], "map_chr_names.log" ), - column=lambda wildcards: config[wildcards.organism]["column"], - delimiter=lambda wildcards: config[wildcards.organism]["delimiter"], + column=1, + delimiter="TAB", log: - os.path.join(config["local_log"], "{organism}", "map_chr_names.log"), + os.path.join(config["local_log"], "map_chr_names.log"), singularity: "docker://zavolab/perl:5.28" shell: @@ -469,20 +437,20 @@ rule map_chr_names: rule filter_mir_1_anno: input: gff=os.path.join( - config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" + config["output_dir"], "mirna_chr_mapped.gff3" ), output: gff=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.gff3" + config["output_dir"], "mirna_filtered.gff3" ), params: script=os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"), cluster_log=os.path.join( - config["cluster_log"], "{organism}", "filter_mir_1_anno.log" + config["cluster_log"], "filter_mir_1_anno.log" ), log: os.path.join( - config["local_log"], "{organism}", "filter_mir_1_anno.log" + config["local_log"], "filter_mir_1_anno.log" ), singularity: "docker://zavolab/ubuntu:18.04" @@ -498,19 +466,19 @@ rule filter_mir_1_anno: rule gfftobed: input: gff=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.gff3" + config["output_dir"], "mirna_filtered.gff3" ), output: bed=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" + config["output_dir"], "mirna_filtered.bed" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "gfftobed.log" + config["cluster_log"], "gfftobed.log" ), - out_dir=os.path.join(config["output_dir"]), + out_dir=config["output_dir"] log: - os.path.join(config["local_log"], "{organism}", "gfftobed.log"), + os.path.join(config["local_log"], "gfftobed.log"), singularity: "docker://zavolab/bedops:2.4.35" shell: @@ -528,19 +496,19 @@ rule gfftobed: rule create_index_fasta: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" + config["output_dir"], "genome.processed.fa" ), output: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa.fai" + config["output_dir"], "genome.processed.fa.fai" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "create_index_fasta.log" + config["cluster_log"], "create_index_fasta.log" ), log: os.path.join( - config["local_log"], "{organism}", "create_index_fasta.log" + config["local_log"], "create_index_fasta.log" ), singularity: "docker://zavolab/samtools:1.8" @@ -556,18 +524,18 @@ rule create_index_fasta: rule extract_chr_len: input: genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa.fai" + config["output_dir"], "genome.processed.fa.fai" ), output: chrsize=os.path.join( - config["output_dir"], "{organism}", "chr_size.txt" + config["output_dir"], "chr_size.txt" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "extract_chr_len.log" + config["cluster_log"], "extract_chr_len.log" ), log: - os.path.join(config["local_log"], "{organism}", "extract_chr_len.log"), + os.path.join(config["local_log"], "extract_chr_len.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -582,20 +550,20 @@ rule extract_chr_len: rule filter_mature_mirs: input: bed=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" + config["output_dir"], "mirna_filtered.bed" ), output: bed=os.path.join( - config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + config["output_dir"], "mirna_mature_filtered.bed" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "filter_mature_mirs.log" + config["cluster_log"], "filter_mature_mirs.log" ), precursor="miRNA_primary_transcript", log: os.path.join( - config["local_log"], "{organism}", "filter_mature_mirs.log" + config["local_log"], "filter_mature_mirs.log" ), singularity: "docker://zavolab/ubuntu:18.04" @@ -611,21 +579,19 @@ rule filter_mature_mirs: rule iso_anno: input: bed=os.path.join( - config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + config["output_dir"], "mirna_mature_filtered.bed" ), chrsize=os.path.join( - config["output_dir"], "{organism}", "chr_size.txt" + config["output_dir"], "chr_size.txt" ), output: bed=os.path.join( config["output_dir"], - "{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", ), params: cluster_log=os.path.join( config["cluster_log"], - "{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.log", ), bp_5p=lambda wildcards: wildcards.bp_5p, @@ -633,7 +599,6 @@ rule iso_anno: log: os.path.join( config["local_log"], - "{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.log", ), singularity: @@ -657,19 +622,16 @@ rule iso_anno_rename: input: bed=os.path.join( config["output_dir"], - "{organism}", "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", ), output: bed=os.path.join( config["output_dir"], - "{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", ), params: cluster_log=os.path.join( config["cluster_log"], - "{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", ), bp_5p=lambda wildcards: wildcards.bp_5p, @@ -677,7 +639,6 @@ rule iso_anno_rename: log: os.path.join( config["local_log"], - "{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", ), singularity: @@ -700,26 +661,24 @@ rule iso_anno_concat: bed=lambda wildcards: expand( os.path.join( config["output_dir"], - "{organism}", "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", ), - organism=config["organism"], bp_3p=config["bp_3p"], bp_5p=config["bp_5p"], ), output: bed=os.path.join( - config["output_dir"], "{organism}", "iso_anno_concat.bed" + config["output_dir"], "iso_anno_concat.bed" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "iso_anno_concat.log" + config["cluster_log"], "iso_anno_concat.log" ), prefix=os.path.join( - config["output_dir"], "{organism}", "iso_anno_rename" + config["output_dir"], "iso_anno_rename" ), log: - os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log"), + os.path.join(config["local_log"], "iso_anno_concat.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: @@ -734,19 +693,19 @@ rule iso_anno_concat: rule iso_anno_final: input: bed=os.path.join( - config["output_dir"], "{organism}", "iso_anno_concat.bed" + config["output_dir"], "iso_anno_concat.bed" ), output: bed=os.path.join( - config["output_dir"], "{organism}", "isomirs_annotation.bed" + config["output_dir"], "isomirs_annotation.bed" ), params: cluster_log=os.path.join( - config["cluster_log"], "{organism}", "iso_anno_final.log" + config["cluster_log"], "iso_anno_final.log" ), pattern="5p0_3p0", log: - os.path.join(config["local_log"], "{organism}", "iso_anno_final.log"), + os.path.join(config["local_log"], "iso_anno_final.log"), singularity: "docker://zavolab/ubuntu:18.04" shell: diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 90fd57223174711bf0d854f3693c2defdf4e0364..625eea9160c5a96403b0e26fbf9a84c6adcea78f 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -74,7 +74,7 @@ rule finish_quantify: rule bamtobed: input: alignment=os.path.join( - config["quantify_input_dir"], + config["output_dir"], "{sample}", "convertedSortedMappings_{sample}.bam", ), @@ -143,7 +143,9 @@ rule intersect_mirna: alignment=os.path.join( config["output_dir"], "{sample}", "sorted.alignments.bed12" ), - mirna=config["mirnas_anno"], + mirna=os.path.join( + config["output_dir"], "mirna_filtered.bed" + ), output: intersect=os.path.join( config["output_dir"], "{sample}", "intersect_mirna.bed" @@ -178,7 +180,9 @@ rule intersect_mirna: # alignment=os.path.join( # config["output_dir"], "{sample}", "sorted.alignments.bed12" # ), -# isomirs=config["isomirs_anno"], +# isomirs=os.path.join( +# config["output_dir"], "isomirs_annotation.bed", +# ), # output: # intersect=os.path.join( # config["output_dir"], "{sample}", "intersect_isomirs.bed"