Iris Mestres Pascual
--- a/workflow/prepare/Snakefile deleted 100644 → 0

+ 0

− 731
+++ b/workflow/prepare/Snakefile deleted 100644 → 0

+ 0

− 731
-###############################################################################
-# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel
-# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu
-#
-# Snakemake workflow to download and prepare the necessary files for
-# smallRNA-seq related workflows.
-###############################################################################
-
-import os
-
-
-# Rules that require internet connection for downloading files are included
-# in the localrules
-localrules:
-    finish,
-    genome_process,
-    filter_anno_gtf,
-    mirna_anno,
-    dict_chr,
-
-
-###############################################################################
-### Finish rule
-###############################################################################
-
-
-rule finish:
-    input:
-        idx_transcriptome=expand(
-            os.path.join(
-                config["output_dir"],
-                "{organism}",
-                "transcriptome_index_segemehl.idx",
-            ),
-            organism=config["organism"],
-        ),
-        idx_genome=expand(
-            os.path.join(
-                config["output_dir"],
-                "{organism}",
-                "genome_index_segemehl.idx",
-            ),
-            organism=config["organism"],
-        ),
-        exons=expand(
-            os.path.join(config["output_dir"], "{organism}", "exons.bed"),
-            organism=config["organism"],
-        ),
-        header=expand(
-            os.path.join(
-                config["output_dir"],
-                "{organism}",
-                "headerOfCollapsedFasta.sam",
-            ),
-            organism=config["organism"],
-        ),
-        mirnafilt=expand(
-            os.path.join(
-                config["output_dir"], "{organism}", "mirna_filtered.bed"
-            ),
-            organism=config["organism"],
-        ),
-        isomirs=expand(
-            os.path.join(
-                config["output_dir"], "{organism}", "isomirs_annotation.bed"
-            ),
-            organism=config["organism"],
-        ),
-
-
-###############################################################################
-### Download and process genome IDs
-###############################################################################
-
-
-rule genome_process:
-    input:
-        script=os.path.join(config["scripts_dir"], "genome_process.sh"),
-    output:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    params:
-        url=lambda wildcards: config[wildcards.organism]["genome_url"],
-        dir_out=os.path.join(config["output_dir"], "{organism}"),
-    log:
-        os.path.join(config["local_log"], "{organism}", "genome_process.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {input.script} {params.dir_out} {log} {params.url})"
-
-
-###############################################################################
-### Download and filter gtf by transcript_level
-###############################################################################
-
-
-rule filter_anno_gtf:
-    input:
-        script=os.path.join(config["scripts_dir"], "filter_anno_gtf.sh"),
-    output:
-        gtf=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "gene_annotations.filtered.gtf",
-        ),
-    params:
-        url=lambda wildcards: config[wildcards.organism]["gtf_url"],
-        dir_out=os.path.join(config["output_dir"], "{organism}"),
-    log:
-        os.path.join(config["local_log"], "{organism}", "filter_anno_gtf.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}"
-
-
-###############################################################################
-### Extract transcriptome sequences in FASTA from genome.
-###############################################################################
-
-
-rule extract_transcriptome_seqs:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-        gtf=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "gene_annotations.filtered.gtf",
-        ),
-    output:
-        fasta=os.path.join(
-            config["output_dir"], "{organism}", "transcriptome.fa"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"],
-            "{organism}",
-            "extract_transcriptome_seqs.log",
-        ),
-    log:
-        os.path.join(
-            config["local_log"],
-            "{organism}",
-            "extract_transcriptome_seqs.log",
-        ),
-    singularity:
-        "docker://zavolab/cufflinks:2.2.1"
-    shell:
-        "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
-
-
-###############################################################################
-### Trim transcript IDs from FASTA file
-###############################################################################
-
-
-rule trim_fasta:
-    input:
-        fasta=os.path.join(
-            config["output_dir"], "{organism}", "transcriptome.fa"
-        ),
-        script=os.path.join(config["scripts_dir"], "validation_fasta.py"),
-    output:
-        fasta=os.path.join(
-            config["output_dir"], "{organism}", "transcriptome_idtrim.fa"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "trim_fasta.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "trim_fasta.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        """(awk \
-        -F" " \
-        "/^>/ {{print \$1; next}} 1" \
-        {input.fasta} \
-        > {output.fasta} \
-        ) &> {log}"""
-
-
-###############################################################################
-### Generate segemehl index for transcripts
-###############################################################################
-
-
-rule generate_segemehl_index_transcriptome:
-    input:
-        fasta=os.path.join(
-            config["output_dir"], "{organism}", "transcriptome_idtrim.fa"
-        ),
-    output:
-        idx=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "transcriptome_index_segemehl.idx",
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"],
-            "{organism}",
-            "generate_segemehl_index_transcriptome.log",
-        ),
-    log:
-        os.path.join(
-            config["local_log"],
-            "{organism}",
-            "generate_segemehl_index_transcriptome.log",
-        ),
-    resources:
-        mem=10,
-        threads=8,
-        time=6,
-    singularity:
-        "docker://zavolab/segemehl:0.2.0"
-    shell:
-        "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
-
-
-###############################################################################
-### Generate segemehl index for genome
-###############################################################################
-
-
-rule generate_segemehl_index_genome:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    output:
-        idx=os.path.join(
-            config["output_dir"], "{organism}", "genome_index_segemehl.idx"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"],
-            "{organism}",
-            "generate_segemehl_index_genome.log",
-        ),
-    log:
-        os.path.join(
-            config["local_log"],
-            "{organism}",
-            "generate_segemehl_index_genome.log",
-        ),
-    resources:
-        mem=60,
-        threads=8,
-        time=6,
-    singularity:
-        "docker://zavolab/segemehl:0.2.0"
-    shell:
-        "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
-
-
-###############################################################################
-### GTF file of exons (genomic coordinates)
-###############################################################################
-
-
-rule get_exons_gtf:
-    input:
-        gtf=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "gene_annotations.filtered.gtf",
-        ),
-        script=os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh"),
-    output:
-        exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "get_exons_gtf.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash \
-        {input.script} \
-        -f {input.gtf} \
-        -c 3 \
-        -p exon \
-        -o {output.exons} \
-        ) &> {log}"
-
-
-###############################################################################
-### Convert GTF file of exons to BED file
-###############################################################################
-
-
-rule gtftobed:
-    input:
-        exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"),
-        script=os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R"),
-    output:
-        exons=os.path.join(config["output_dir"], "{organism}", "exons.bed"),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "gtftobed.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "gtftobed.log"),
-    singularity:
-        "docker://zavolab/r-zavolab:3.5.1"
-    shell:
-        "(Rscript \
-        {input.script} \
-        --gtf {input.exons} \
-        -o {output.exons} \
-        ) &> {log}"
-
-
-###############################################################################
-### Create header for SAM file
-###############################################################################
-
-
-rule create_header_genome:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    output:
-        header=os.path.join(
-            config["output_dir"], "{organism}", "headerOfCollapsedFasta.sam"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "create_header_genome.log"
-        ),
-    log:
-        os.path.join(
-            config["local_log"], "{organism}", "create_header_genome.log"
-        ),
-    singularity:
-        "docker://zavolab/samtools:1.8"
-    shell:
-        "(samtools dict -o {output.header} --uri=NA {input.genome}) &> {log}"
-
-
-###############################################################################
-### Download miRNA annotation
-###############################################################################
-
-
-rule mirna_anno:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    output:
-        anno=os.path.join(
-            config["output_dir"], "{organism}", "raw", "mirna.gff3"
-        ),
-    params:
-        anno=lambda wildcards: config[wildcards.organism]["mirna_url"],
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "mirna_anno.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "mirna_anno.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(wget {params.anno} -O {output.anno}) &> {log}"
-
-
-###############################################################################
-### Download dictionary mapping chr
-###############################################################################
-
-
-rule dict_chr:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    output:
-        map_chr=os.path.join(
-            config["output_dir"], "{organism}", "UCSC2ensembl.txt"
-        ),
-    params:
-        map_chr=lambda wildcards: config[wildcards.organism]["map_chr_url"],
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "dict_chr.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "dict_chr.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(wget {params.map_chr} -O {output.map_chr}) &> {log}"
-
-
-###############################################################################
-### Mapping chromosomes names, UCSC <-> ENSEMBL
-###############################################################################
-
-
-rule map_chr_names:
-    input:
-        anno=os.path.join(
-            config["output_dir"], "{organism}", "raw", "mirna.gff3"
-        ),
-        script=os.path.join(config["scripts_dir"], "map_chromosomes.pl"),
-        map_chr=os.path.join(
-            config["output_dir"], "{organism}", "UCSC2ensembl.txt"
-        ),
-    output:
-        gff=os.path.join(
-            config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "map_chr_names.log"
-        ),
-        column=lambda wildcards: config[wildcards.organism]["column"],
-        delimiter=lambda wildcards: config[wildcards.organism]["delimiter"],
-    log:
-        os.path.join(config["local_log"], "{organism}", "map_chr_names.log"),
-    singularity:
-        "docker://zavolab/perl:5.28"
-    shell:
-        "(perl {input.script} \
-        {input.anno} \
-        {params.column} \
-        {params.delimiter} \
-        {input.map_chr} \
-        {output.gff} \
-        ) &> {log}"
-
-
-###############################################################################
-### Filtering _1 miR IDs
-###############################################################################
-
-
-rule filter_mir_1_anno:
-    input:
-        gff=os.path.join(
-            config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
-        ),
-    output:
-        gff=os.path.join(
-            config["output_dir"], "{organism}", "mirna_filtered.gff3"
-        ),
-    params:
-        script=os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"),
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "filter_mir_1_anno.log"
-        ),
-    log:
-        os.path.join(
-            config["local_log"], "{organism}", "filter_mir_1_anno.log"
-        ),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}"
-
-
-###############################################################################
-### GFF to BED (improve intersect memory efficient allowing to use -sorted)
-###############################################################################
-
-
-rule gfftobed:
-    input:
-        gff=os.path.join(
-            config["output_dir"], "{organism}", "mirna_filtered.gff3"
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "mirna_filtered.bed"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "gfftobed.log"
-        ),
-        out_dir=os.path.join(config["output_dir"]),
-    log:
-        os.path.join(config["local_log"], "{organism}", "gfftobed.log"),
-    singularity:
-        "docker://zavolab/bedops:2.4.35"
-    shell:
-        "(convert2bed -i gff < {input.gff} \
-        --sort-tmpdir={params.out_dir} \
-        > {output.bed} \
-        ) &> {log}"
-
-
-###############################################################################
-### Index genome fasta file
-###############################################################################
-
-
-rule create_index_fasta:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa"
-        ),
-    output:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa.fai"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "create_index_fasta.log"
-        ),
-    log:
-        os.path.join(
-            config["local_log"], "{organism}", "create_index_fasta.log"
-        ),
-    singularity:
-        "docker://zavolab/samtools:1.8"
-    shell:
-        "(samtools faidx {input.genome}) &> {log}"
-
-
-###############################################################################
-### Extract chromosome length
-###############################################################################
-
-
-rule extract_chr_len:
-    input:
-        genome=os.path.join(
-            config["output_dir"], "{organism}", "genome.processed.fa.fai"
-        ),
-    output:
-        chrsize=os.path.join(
-            config["output_dir"], "{organism}", "chr_size.txt"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "extract_chr_len.log"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "extract_chr_len.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(cut -f1,2 {input.genome} > {output.chrsize}) &> {log}"
-
-
-###############################################################################
-### Extract mature miRNA
-###############################################################################
-
-
-rule filter_mature_mirs:
-    input:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "mirna_filtered.bed"
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "mirna_mature_filtered.bed"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "filter_mature_mirs.log"
-        ),
-        precursor="miRNA_primary_transcript",
-    log:
-        os.path.join(
-            config["local_log"], "{organism}", "filter_mature_mirs.log"
-        ),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(grep -v {params.precursor} {input.bed} > {output.bed}) &> {log}"
-
-
-###############################################################################
-### Create isomirs annotation file from mature miRNA
-###############################################################################
-
-
-rule iso_anno:
-    input:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "mirna_mature_filtered.bed"
-        ),
-        chrsize=os.path.join(
-            config["output_dir"], "{organism}", "chr_size.txt"
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "iso_anno_5p{bp_5p}_3p{bp_3p}.bed",
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"],
-            "{organism}",
-            "iso_anno_5p{bp_5p}_3p{bp_3p}.log",
-        ),
-        bp_5p=lambda wildcards: wildcards.bp_5p,
-        bp_3p=lambda wildcards: wildcards.bp_3p,
-    log:
-        os.path.join(
-            config["local_log"],
-            "{organism}",
-            "iso_anno_5p{bp_5p}_3p{bp_3p}.log",
-        ),
-    singularity:
-        "docker://zavolab/bedtools:2.28.0"
-    shell:
-        "(bedtools slop \
-        -i {input.bed} \
-        -g {input.chrsize} \
-        -l {params.bp_5p} \
-        -r {params.bp_3p} \
-        > {output.bed} \
-        ) &> {log}"
-
-
-###############################################################################
-### Change miRNA names to isomirs names
-###############################################################################
-
-
-rule iso_anno_rename:
-    input:
-        bed=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "iso_anno_5p{bp_5p}_3p{bp_3p}.bed",
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"],
-            "{organism}",
-            "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed",
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"],
-            "{organism}",
-            "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log",
-        ),
-        bp_5p=lambda wildcards: wildcards.bp_5p,
-        bp_3p=lambda wildcards: wildcards.bp_3p,
-    log:
-        os.path.join(
-            config["local_log"],
-            "{organism}",
-            "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log",
-        ),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(sed \
-        's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' \
-        {input.bed} \
-        > {output.bed} \
-        ) &> {log}"
-
-
-###############################################################################
-### Concatenate all isomirs annotation files
-###############################################################################
-
-
-rule iso_anno_concat:
-    input:
-        bed=lambda wildcards: expand(
-            os.path.join(
-                config["output_dir"],
-                "{organism}",
-                "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed",
-            ),
-            organism=config["organism"],
-            bp_3p=config["bp_3p"],
-            bp_5p=config["bp_5p"],
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "iso_anno_concat.bed"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "iso_anno_concat.log"
-        ),
-        prefix=os.path.join(
-            config["output_dir"], "{organism}", "iso_anno_rename"
-        ),
-    log:
-        os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(cat {params.prefix}* > {output.bed}) &> {log}"
-
-
-###############################################################################
-### Remove non changing isomirs (5p0_3p0)
-###############################################################################
-
-
-rule iso_anno_final:
-    input:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "iso_anno_concat.bed"
-        ),
-    output:
-        bed=os.path.join(
-            config["output_dir"], "{organism}", "isomirs_annotation.bed"
-        ),
-    params:
-        cluster_log=os.path.join(
-            config["cluster_log"], "{organism}", "iso_anno_final.log"
-        ),
-        pattern="5p0_3p0",
-    log:
-        os.path.join(config["local_log"], "{organism}", "iso_anno_final.log"),
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}"