Skip to content
Snippets Groups Projects

refactor: merge prepare, map & quantify workflows

Merged Iris Mestres Pascual requested to merge 25-merge-workflows into dev
1 file
+ 0
731
Compare changes
  • Side-by-side
  • Inline
+ 0
731
###############################################################################
# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel
# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu
#
# Snakemake workflow to download and prepare the necessary files for
# smallRNA-seq related workflows.
###############################################################################
import os
# Rules that require internet connection for downloading files are included
# in the localrules
localrules:
finish,
genome_process,
filter_anno_gtf,
mirna_anno,
dict_chr,
###############################################################################
### Finish rule
###############################################################################
rule finish:
input:
idx_transcriptome=expand(
os.path.join(
config["output_dir"],
"{organism}",
"transcriptome_index_segemehl.idx",
),
organism=config["organism"],
),
idx_genome=expand(
os.path.join(
config["output_dir"],
"{organism}",
"genome_index_segemehl.idx",
),
organism=config["organism"],
),
exons=expand(
os.path.join(config["output_dir"], "{organism}", "exons.bed"),
organism=config["organism"],
),
header=expand(
os.path.join(
config["output_dir"],
"{organism}",
"headerOfCollapsedFasta.sam",
),
organism=config["organism"],
),
mirnafilt=expand(
os.path.join(
config["output_dir"], "{organism}", "mirna_filtered.bed"
),
organism=config["organism"],
),
isomirs=expand(
os.path.join(
config["output_dir"], "{organism}", "isomirs_annotation.bed"
),
organism=config["organism"],
),
###############################################################################
### Download and process genome IDs
###############################################################################
rule genome_process:
input:
script=os.path.join(config["scripts_dir"], "genome_process.sh"),
output:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
params:
url=lambda wildcards: config[wildcards.organism]["genome_url"],
dir_out=os.path.join(config["output_dir"], "{organism}"),
log:
os.path.join(config["local_log"], "{organism}", "genome_process.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.dir_out} {log} {params.url})"
###############################################################################
### Download and filter gtf by transcript_level
###############################################################################
rule filter_anno_gtf:
input:
script=os.path.join(config["scripts_dir"], "filter_anno_gtf.sh"),
output:
gtf=os.path.join(
config["output_dir"],
"{organism}",
"gene_annotations.filtered.gtf",
),
params:
url=lambda wildcards: config[wildcards.organism]["gtf_url"],
dir_out=os.path.join(config["output_dir"], "{organism}"),
log:
os.path.join(config["local_log"], "{organism}", "filter_anno_gtf.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}"
###############################################################################
### Extract transcriptome sequences in FASTA from genome.
###############################################################################
rule extract_transcriptome_seqs:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
gtf=os.path.join(
config["output_dir"],
"{organism}",
"gene_annotations.filtered.gtf",
),
output:
fasta=os.path.join(
config["output_dir"], "{organism}", "transcriptome.fa"
),
params:
cluster_log=os.path.join(
config["cluster_log"],
"{organism}",
"extract_transcriptome_seqs.log",
),
log:
os.path.join(
config["local_log"],
"{organism}",
"extract_transcriptome_seqs.log",
),
singularity:
"docker://zavolab/cufflinks:2.2.1"
shell:
"(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
###############################################################################
### Trim transcript IDs from FASTA file
###############################################################################
rule trim_fasta:
input:
fasta=os.path.join(
config["output_dir"], "{organism}", "transcriptome.fa"
),
script=os.path.join(config["scripts_dir"], "validation_fasta.py"),
output:
fasta=os.path.join(
config["output_dir"], "{organism}", "transcriptome_idtrim.fa"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "trim_fasta.log"
),
log:
os.path.join(config["local_log"], "{organism}", "trim_fasta.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"""(awk \
-F" " \
"/^>/ {{print \$1; next}} 1" \
{input.fasta} \
> {output.fasta} \
) &> {log}"""
###############################################################################
### Generate segemehl index for transcripts
###############################################################################
rule generate_segemehl_index_transcriptome:
input:
fasta=os.path.join(
config["output_dir"], "{organism}", "transcriptome_idtrim.fa"
),
output:
idx=os.path.join(
config["output_dir"],
"{organism}",
"transcriptome_index_segemehl.idx",
),
params:
cluster_log=os.path.join(
config["cluster_log"],
"{organism}",
"generate_segemehl_index_transcriptome.log",
),
log:
os.path.join(
config["local_log"],
"{organism}",
"generate_segemehl_index_transcriptome.log",
),
resources:
mem=10,
threads=8,
time=6,
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
###############################################################################
### Generate segemehl index for genome
###############################################################################
rule generate_segemehl_index_genome:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
output:
idx=os.path.join(
config["output_dir"], "{organism}", "genome_index_segemehl.idx"
),
params:
cluster_log=os.path.join(
config["cluster_log"],
"{organism}",
"generate_segemehl_index_genome.log",
),
log:
os.path.join(
config["local_log"],
"{organism}",
"generate_segemehl_index_genome.log",
),
resources:
mem=60,
threads=8,
time=6,
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
###############################################################################
### GTF file of exons (genomic coordinates)
###############################################################################
rule get_exons_gtf:
input:
gtf=os.path.join(
config["output_dir"],
"{organism}",
"gene_annotations.filtered.gtf",
),
script=os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh"),
output:
exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "get_exons_gtf.log"
),
log:
os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash \
{input.script} \
-f {input.gtf} \
-c 3 \
-p exon \
-o {output.exons} \
) &> {log}"
###############################################################################
### Convert GTF file of exons to BED file
###############################################################################
rule gtftobed:
input:
exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"),
script=os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R"),
output:
exons=os.path.join(config["output_dir"], "{organism}", "exons.bed"),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "gtftobed.log"
),
log:
os.path.join(config["local_log"], "{organism}", "gtftobed.log"),
singularity:
"docker://zavolab/r-zavolab:3.5.1"
shell:
"(Rscript \
{input.script} \
--gtf {input.exons} \
-o {output.exons} \
) &> {log}"
###############################################################################
### Create header for SAM file
###############################################################################
rule create_header_genome:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
output:
header=os.path.join(
config["output_dir"], "{organism}", "headerOfCollapsedFasta.sam"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "create_header_genome.log"
),
log:
os.path.join(
config["local_log"], "{organism}", "create_header_genome.log"
),
singularity:
"docker://zavolab/samtools:1.8"
shell:
"(samtools dict -o {output.header} --uri=NA {input.genome}) &> {log}"
###############################################################################
### Download miRNA annotation
###############################################################################
rule mirna_anno:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
output:
anno=os.path.join(
config["output_dir"], "{organism}", "raw", "mirna.gff3"
),
params:
anno=lambda wildcards: config[wildcards.organism]["mirna_url"],
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "mirna_anno.log"
),
log:
os.path.join(config["local_log"], "{organism}", "mirna_anno.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(wget {params.anno} -O {output.anno}) &> {log}"
###############################################################################
### Download dictionary mapping chr
###############################################################################
rule dict_chr:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
output:
map_chr=os.path.join(
config["output_dir"], "{organism}", "UCSC2ensembl.txt"
),
params:
map_chr=lambda wildcards: config[wildcards.organism]["map_chr_url"],
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "dict_chr.log"
),
log:
os.path.join(config["local_log"], "{organism}", "dict_chr.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(wget {params.map_chr} -O {output.map_chr}) &> {log}"
###############################################################################
### Mapping chromosomes names, UCSC <-> ENSEMBL
###############################################################################
rule map_chr_names:
input:
anno=os.path.join(
config["output_dir"], "{organism}", "raw", "mirna.gff3"
),
script=os.path.join(config["scripts_dir"], "map_chromosomes.pl"),
map_chr=os.path.join(
config["output_dir"], "{organism}", "UCSC2ensembl.txt"
),
output:
gff=os.path.join(
config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "map_chr_names.log"
),
column=lambda wildcards: config[wildcards.organism]["column"],
delimiter=lambda wildcards: config[wildcards.organism]["delimiter"],
log:
os.path.join(config["local_log"], "{organism}", "map_chr_names.log"),
singularity:
"docker://zavolab/perl:5.28"
shell:
"(perl {input.script} \
{input.anno} \
{params.column} \
{params.delimiter} \
{input.map_chr} \
{output.gff} \
) &> {log}"
###############################################################################
### Filtering _1 miR IDs
###############################################################################
rule filter_mir_1_anno:
input:
gff=os.path.join(
config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
),
output:
gff=os.path.join(
config["output_dir"], "{organism}", "mirna_filtered.gff3"
),
params:
script=os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"),
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "filter_mir_1_anno.log"
),
log:
os.path.join(
config["local_log"], "{organism}", "filter_mir_1_anno.log"
),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}"
###############################################################################
### GFF to BED (improve intersect memory efficient allowing to use -sorted)
###############################################################################
rule gfftobed:
input:
gff=os.path.join(
config["output_dir"], "{organism}", "mirna_filtered.gff3"
),
output:
bed=os.path.join(
config["output_dir"], "{organism}", "mirna_filtered.bed"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "gfftobed.log"
),
out_dir=os.path.join(config["output_dir"]),
log:
os.path.join(config["local_log"], "{organism}", "gfftobed.log"),
singularity:
"docker://zavolab/bedops:2.4.35"
shell:
"(convert2bed -i gff < {input.gff} \
--sort-tmpdir={params.out_dir} \
> {output.bed} \
) &> {log}"
###############################################################################
### Index genome fasta file
###############################################################################
rule create_index_fasta:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa"
),
output:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa.fai"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "create_index_fasta.log"
),
log:
os.path.join(
config["local_log"], "{organism}", "create_index_fasta.log"
),
singularity:
"docker://zavolab/samtools:1.8"
shell:
"(samtools faidx {input.genome}) &> {log}"
###############################################################################
### Extract chromosome length
###############################################################################
rule extract_chr_len:
input:
genome=os.path.join(
config["output_dir"], "{organism}", "genome.processed.fa.fai"
),
output:
chrsize=os.path.join(
config["output_dir"], "{organism}", "chr_size.txt"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "extract_chr_len.log"
),
log:
os.path.join(config["local_log"], "{organism}", "extract_chr_len.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(cut -f1,2 {input.genome} > {output.chrsize}) &> {log}"
###############################################################################
### Extract mature miRNA
###############################################################################
rule filter_mature_mirs:
input:
bed=os.path.join(
config["output_dir"], "{organism}", "mirna_filtered.bed"
),
output:
bed=os.path.join(
config["output_dir"], "{organism}", "mirna_mature_filtered.bed"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "filter_mature_mirs.log"
),
precursor="miRNA_primary_transcript",
log:
os.path.join(
config["local_log"], "{organism}", "filter_mature_mirs.log"
),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(grep -v {params.precursor} {input.bed} > {output.bed}) &> {log}"
###############################################################################
### Create isomirs annotation file from mature miRNA
###############################################################################
rule iso_anno:
input:
bed=os.path.join(
config["output_dir"], "{organism}", "mirna_mature_filtered.bed"
),
chrsize=os.path.join(
config["output_dir"], "{organism}", "chr_size.txt"
),
output:
bed=os.path.join(
config["output_dir"],
"{organism}",
"iso_anno_5p{bp_5p}_3p{bp_3p}.bed",
),
params:
cluster_log=os.path.join(
config["cluster_log"],
"{organism}",
"iso_anno_5p{bp_5p}_3p{bp_3p}.log",
),
bp_5p=lambda wildcards: wildcards.bp_5p,
bp_3p=lambda wildcards: wildcards.bp_3p,
log:
os.path.join(
config["local_log"],
"{organism}",
"iso_anno_5p{bp_5p}_3p{bp_3p}.log",
),
singularity:
"docker://zavolab/bedtools:2.28.0"
shell:
"(bedtools slop \
-i {input.bed} \
-g {input.chrsize} \
-l {params.bp_5p} \
-r {params.bp_3p} \
> {output.bed} \
) &> {log}"
###############################################################################
### Change miRNA names to isomirs names
###############################################################################
rule iso_anno_rename:
input:
bed=os.path.join(
config["output_dir"],
"{organism}",
"iso_anno_5p{bp_5p}_3p{bp_3p}.bed",
),
output:
bed=os.path.join(
config["output_dir"],
"{organism}",
"iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed",
),
params:
cluster_log=os.path.join(
config["cluster_log"],
"{organism}",
"iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log",
),
bp_5p=lambda wildcards: wildcards.bp_5p,
bp_3p=lambda wildcards: wildcards.bp_3p,
log:
os.path.join(
config["local_log"],
"{organism}",
"iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log",
),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(sed \
's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' \
{input.bed} \
> {output.bed} \
) &> {log}"
###############################################################################
### Concatenate all isomirs annotation files
###############################################################################
rule iso_anno_concat:
input:
bed=lambda wildcards: expand(
os.path.join(
config["output_dir"],
"{organism}",
"iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed",
),
organism=config["organism"],
bp_3p=config["bp_3p"],
bp_5p=config["bp_5p"],
),
output:
bed=os.path.join(
config["output_dir"], "{organism}", "iso_anno_concat.bed"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "iso_anno_concat.log"
),
prefix=os.path.join(
config["output_dir"], "{organism}", "iso_anno_rename"
),
log:
os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(cat {params.prefix}* > {output.bed}) &> {log}"
###############################################################################
### Remove non changing isomirs (5p0_3p0)
###############################################################################
rule iso_anno_final:
input:
bed=os.path.join(
config["output_dir"], "{organism}", "iso_anno_concat.bed"
),
output:
bed=os.path.join(
config["output_dir"], "{organism}", "isomirs_annotation.bed"
),
params:
cluster_log=os.path.join(
config["cluster_log"], "{organism}", "iso_anno_final.log"
),
pattern="5p0_3p0",
log:
os.path.join(config["local_log"], "{organism}", "iso_anno_final.log"),
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}"
Loading