From e02c5e7c719dbdbbb162a1f4f154ba33b27eeb0c Mon Sep 17 00:00:00 2001
From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch>
Date: Tue, 17 Jan 2023 17:16:07 +0000
Subject: [PATCH 01/22] Upload Snakefile of the merged workflows

---
 workflow/Snakefile | 1941 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1941 insertions(+)
 create mode 100644 workflow/Snakefile

diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..2a37bcf
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,1941 @@
+"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab"""
+import os
+import pandas as pd
+import shutil
+import yaml
+from shlex import quote
+from typing import Tuple
+from snakemake.utils import validate
+
+
+configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml")
+
+
+## Preparations
+# Get sample table
+samples_table = pd.read_csv(
+    config["samples"],
+    header=0,
+    index_col=0,
+    comment="#",
+    engine="python",
+    sep="\t",
+)
+
+# dict for translation of "directionality parameters"
+directionality_dict = {
+    "SF": {
+        "kallisto": "--fr-stranded",
+        "alfa": "fr-secondstrand",
+        "alfa_plus": "str1",
+        "alfa_minus": "str2",
+    },
+    "SR": {
+        "kallisto": "--rf-stranded",
+        "alfa": "fr-firststrand",
+        "alfa_plus": "str2",
+        "alfa_minus": "str1",
+    },
+}
+# dict for ALFA output
+alfa_dict = {"MultimappersIncluded": "UniqueMultiple", "UniqueMappers": "Unique"}
+
+# Validate config
+validate(config, os.path.join("..", "resources", "config_schema.json"))
+logger.info(f"Config file after validation: {config}")
+
+# Parse YAML rule config file
+try:
+    with open(config["rule_config"]) as _file:
+        rule_config = yaml.safe_load(_file)
+    logger.info(f"Loaded rule_config from {config['rule_config']}.")
+except TypeError:
+    logger.error(
+        f'No string supplied at field "rule_config": got {type(config["rule_config"])} with content: {config["rule_config"]}'
+    )
+    raise
+except FileNotFoundError:
+    logger.error(
+        f"No rule config file found at {config['rule_config']}. Either provide the file or remove the rule_config parameter from config.yaml!"
+    )
+    raise
+except KeyError:
+    rule_config = {}
+    logger.warning("No rule config specified: using default values for all tools.")
+
+
+def parse_rule_config(
+    rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()
+):
+    """Get rule-specific parameters from the rule_config file"""
+
+    # If no rule config file is present, an empty string will be returned
+    if not rule_config:
+        logger.info("No rule config specified: using default values for all tools.")
+        return ""
+    # Same if current rule not specified in rule config
+    if current_rule not in rule_config or not rule_config[current_rule]:
+        logger.info(
+            f"No additional parameters for rule {current_rule} specified: using default settings."
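+            # As an illustration of the shape this parser expects, a minimal
+            # rule_config.yaml could look like the sketch below (the rule names
+            # exist in this Snakefile, but the options and values are
+            # hypothetical examples; an empty string marks a value-less flag):
+            #
+            #   fastqc:
+            #     --kmers: "7"
+            #   multiqc_report:
+            #     --no-ansi: ""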
+ ) + return "" + + # Subset only section for current rule + rule_config = rule_config[current_rule] + + # Build list of parameters and values + params_vals = [] + for param, val in rule_config.items(): + # Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table + if param in immutable: + raise ValueError( + f"The following parameter in rule {current_rule} is critical for the pipeline to " + f"function as expected and cannot be modified: {param}" + ) + # Accept only strings; this prevents unintended results potentially + # arising from users entering reserved YAML keywords or nested + # structures (lists, dictionaries) + if isinstance(val, str): + params_vals.append(str(param)) + # Do not include a value for flags (signified by empty strings) + if val: + params_vals.append(val) + else: + raise ValueError( + "Only string values allowed for tool parameters: Found type " + f"'{type(val).__name__}' for value of parameter '{param}'" + ) + # Return quoted string + add_params = " ".join(quote(item) for item in params_vals) + logger.info( + f"User specified additional parameters for rule {current_rule}:\n {add_params}" + ) + return add_params + + +# Global config +localrules: + start, + finish, + rename_star_rpm_for_alfa, + prepare_multiqc_config, + + +# Include subworkflows +include: os.path.join("rules", "common.smk") +include: os.path.join("rules", "paired_end.snakefile.smk") +include: os.path.join("rules", "single_end.snakefile.smk") + + +rule finish: + """ + Rule for collecting outputs + """ + input: + multiqc_report=os.path.join(config["output_dir"], "multiqc_summary"), + bigWig=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "bigWig", + "{renamed_unique}", + "{sample}_{renamed_unique}_{strand}.bw", + ), + sample=pd.unique(samples_table.index.values), + strand=["plus", "minus"], + renamed_unique=alfa_dict.keys(), + ), + salmon_merge_genes=expand( + os.path.join( + config["output_dir"], + "summary_salmon", + "quantmerge", + "genes_{salmon_merge_on}.tsv", + ), + salmon_merge_on=["tpm", "numreads"], + ), + salmon_merge_transcripts=expand( + os.path.join( + config["output_dir"], + "summary_salmon", + "quantmerge", + "transcripts_{salmon_merge_on}.tsv", + ), + salmon_merge_on=["tpm", "numreads"], + ), + kallisto_merge_transcripts=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv" + ), + kallisto_merge_genes=os.path.join( + config["output_dir"], "summary_kallisto", "genes_tpm.tsv" + ), + + +current_rule = "start" + + +rule start: + """ + Get samples + """ + input: + reads=lambda wildcards: expand( + pd.Series(samples_table.loc[wildcards.sample, wildcards.mate]).values + ), + output: + reads=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "start", + "{sample}.{mate}.fastq.gz", + ), + params: + cluster_log_path=config["cluster_log_dir"], + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{sample}}.{{mate}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{sample}}.{{mate}}.stdout.log", + ), + container: + "docker://ubuntu:focal-20210416" + shell: + "(cat {input.reads} > {output.reads}) \ + 1> {log.stdout} 2> {log.stderr} " + + +current_rule = "fastqc" + + +rule fastqc: + """ + A quality control tool for high throughput sequence data + """ + input: + reads=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "start", + "{sample}.{mate}.fastq.gz", 
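+            # {mate} resolves to the samples-table columns fq1/fq2, which the
+            # `start` rule above concatenates per sample. For orientation, a
+            # minimal samples.tsv row might look like this (columns inferred
+            # from the fields this Snakefile queries; not an exhaustive schema):
+            #
+            #   sample   seqmode  fq1            fq2            organism  libtype
+            #   sample1  pe       sample1_R1.fq  sample1_R2.fq  hsapiens  SR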
+ ), + output: + outdir=directory( + os.path.join( + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("--outdir",) + ), + threads: 1 + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + container: + "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" + conda: + os.path.join(workflow.basedir, "envs", "fastqc.yaml") + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{mate}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{mate}}.stdout.log", + ), + shell: + "(mkdir -p {output.outdir}; \ + fastqc --outdir {output.outdir} \ + --threads {threads} \ + {params.additional_params} \ + {input.reads}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "fastqc_trimmed" + + +rule fastqc_trimmed: + """ + A quality control tool for high throughput sequence data + """ + input: + reads=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.{mate}.{seqmode}.remove_polya.fastq.gz", + ), + output: + outdir=directory( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "fastqc_trimmed", + "{mate}_{seqmode}", + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("--outdir",) + ), + threads: 1 + container: + "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" + conda: + os.path.join(workflow.basedir, "envs", "fastqc.yaml") + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{seqmode}}_{{mate}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{seqmode}}_{{mate}}.stdout.log", + ), + shell: + "(mkdir -p {output.outdir}; \ + fastqc --outdir {output.outdir} \ + --threads {threads} \ + {params.additional_params} \ + {input.reads}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "create_index_star" + + +rule create_index_star: + """ + Create index for STAR alignments + """ + input: + genome=lambda wildcards: os.path.abspath( + get_sample("genome", search_id="organism", search_value=wildcards.organism) + ), + gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="organism", search_value=wildcards.organism) + ), + output: + chromosome_info=os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrNameLength.txt", + ), + chromosomes_names=os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrName.txt", + ), + params: + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.chromosome_info), + outFileNamePrefix=os.path.join( + config["star_indexes"], "{organism}", "{index_size}", "STAR_index/STAR_" + ), + sjdbOverhang="{index_size}", + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--runMode", + "--sjdbOverhang", + "--genomeDir", + "--genomeFastaFiles", + "--outFileNamePrefix", + "--sjdbGTFfile", + ), + ), + container: + "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" + conda: + os.path.join(workflow.basedir, "envs", "STAR.yaml") + threads: 12 + resources: + mem_mb=lambda wildcards, attempt: 32000 * attempt, + log: + stderr=os.path.join( + config["log_dir"], 
f"{current_rule}_{{organism}}_{{index_size}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.stdout.log" + ), + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) \ + {params.additional_params} \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "extract_transcriptome" + + +rule extract_transcriptome: + """ + Create transcriptome from genome and gene annotations + """ + input: + genome=lambda wildcards: get_sample( + "genome", search_id="organism", search_value=wildcards.organism + ), + gtf=lambda wildcards: get_sample( + "gtf", search_id="organism", search_value=wildcards.organism + ), + output: + transcriptome=temp( + os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "-w", + "-g", + ), + ), + container: + "docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1" + conda: + os.path.join(workflow.basedir, "envs", "gffread.yaml") + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + stderr=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.log"), + shell: + "(gffread \ + -w {output.transcriptome} \ + -g {input.genome} \ + {params.additional_params} \ + {input.gtf}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "concatenate_transcriptome_and_genome" + + +rule concatenate_transcriptome_and_genome: + """ + Concatenate genome and transcriptome + """ + input: + transcriptome=os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ), + genome=lambda wildcards: get_sample( + "genome", search_id="organism", search_value=wildcards.organism + ), + output: + genome_transcriptome=temp( + os.path.join( + config["output_dir"], + "transcriptome", + "{organism}", + "genome_transcriptome.fa", + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + container: + "docker://ubuntu:focal-20210416" + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stderr.log" + ), + shell: + "(cat {input.transcriptome} {input.genome} \ + 1> {output.genome_transcriptome}) \ + 2> {log.stderr}" + + +current_rule = "create_index_salmon" + + +rule create_index_salmon: + """ + Create index for Salmon quantification + """ + input: + genome_transcriptome=os.path.join( + config["output_dir"], + "transcriptome", + "{organism}", + "genome_transcriptome.fa", + ), + chr_names=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism"), + get_sample("index_size"), + "STAR_index", + "chrName.txt", + ), + output: + index=directory( + os.path.join( + config["salmon_indexes"], "{organism}", "{kmer}", "salmon.idx" + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + kmerLen="{kmer}", + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--transcripts", + "--decoys", + "--index", + "--kmerLen", + ), + ), + container: + 
"docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" + conda: + os.path.join(workflow.basedir, "envs", "salmon.yaml") + threads: 8 + resources: + mem_mb=lambda wildcards, attempt: 32000 * attempt, + log: + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stdout.log" + ), + shell: + "(salmon index \ + --transcripts {input.genome_transcriptome} \ + --decoys {input.chr_names} \ + --index {output.index} \ + --kmerLen {params.kmerLen} \ + --threads {threads}) \ + {params.additional_params} \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "create_index_kallisto" + + +rule create_index_kallisto: + """ + Create index for Kallisto quantification + """ + input: + transcriptome=os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ), + output: + index=os.path.join(config["kallisto_indexes"], "{organism}", "kallisto.idx"), + params: + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.index), + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("-i",) + ), + container: + "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2" + conda: + os.path.join(workflow.basedir, "envs", "kallisto.yaml") + resources: + mem_mb=lambda wildcards, attempt: 8192 * attempt, + log: + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stdout.log" + ), + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + kallisto index \ + {params.additional_params} \ + -i {output.index} \ + {input.transcriptome}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "extract_transcripts_as_bed12" + + +rule extract_transcripts_as_bed12: + """ + Convert transcripts to BED12 format + """ + input: + gtf=lambda wildcards: get_sample("gtf"), + output: + bed12=temp( + os.path.join(config["output_dir"], "full_transcripts_protein_coding.bed") + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--gtf", + "--bed12", + ), + ), + container: + "docker://quay.io/biocontainers/zgtf:0.1.1--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "zgtf.yaml") + threads: 1 + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + shell: + "(gtf2bed12 \ + --gtf {input.gtf} \ + --bed12 {output.bed12}); \ + {params.additional_params} \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "sort_genomic_alignment_samtools" + + +rule sort_genomic_alignment_samtools: + """ + Sort genome bamfile using samtools + """ + input: + bam=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.out.bam", + ), + output: + bam=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: + "docker://quay.io/biocontainers/samtools:1.9--h10a08f8_12" + conda: + os.path.join(workflow.basedir, 
"envs", "samtools.yaml") + threads: 8 + resources: + mem_mb=lambda wildcards, attempt: 8000 * attempt, + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}.{{seqmode}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}.{{seqmode}}.stdout.log", + ), + shell: + "(samtools sort \ + -o {output.bam} \ + -@ {threads} \ + {params.additional_params} \ + {input.bam}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "index_genomic_alignment_samtools" + + +rule index_genomic_alignment_samtools: + """ + Index genome bamfile using samtools + """ + input: + bam=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), + output: + bai=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: + "docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8" + conda: + os.path.join(workflow.basedir, "envs", "samtools.yaml") + threads: 1 + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}.{{seqmode}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}.{{seqmode}}.stdout.log", + ), + shell: + "(samtools index \ + {params.additional_params} \ + {input.bam} {output.bai};) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "calculate_TIN_scores" + + +rule calculate_TIN_scores: + """ + Calculate transcript integrity (TIN) score + """ + input: + bam=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), + bai=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), + transcripts_bed12=os.path.join( + config["output_dir"], "full_transcripts_protein_coding.bed" + ), + output: + TIN_score=temp( + os.path.join( + config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + sample="{sample}", + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "-i", + "-r", + "--names", + ), + ), + container: + "docker://quay.io/biocontainers/tin-score-calculation:0.6.3--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "tin-score-calculation.yaml") + threads: 8 + resources: + mem_mb=lambda wildcards, attempt, input: len(input.bam) * 1024 * attempt, + log: + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}.log" + ), + shell: + "(calculate-tin.py \ + -i {input.bam} \ + -r {input.transcripts_bed12} \ + --names {params.sample} \ + -p {threads} \ + {params.additional_params} \ + > {output.TIN_score};) 2> {log.stderr}" + + +current_rule = "salmon_quantmerge_genes" + + +rule salmon_quantmerge_genes: + """ + Merge 
+    gene quantifications into a single file
+    """
+    input:
+        salmon_in=expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "{sample}.salmon.{seqmode}",
+                "quant.sf",
+            ),
+            zip,
+            sample=pd.unique(samples_table.index.values),
+            seqmode=[
+                get_sample("seqmode", search_id="index", search_value=i)
+                for i in pd.unique(samples_table.index.values)
+            ],
+        ),
+    output:
+        salmon_out=os.path.join(
+            config["output_dir"],
+            "summary_salmon",
+            "quantmerge",
+            "genes_{salmon_merge_on}.tsv",
+        ),
+    params:
+        cluster_log_path=config["cluster_log_dir"],
+        salmon_in=expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "{sample}.salmon.{seqmode}",
+            ),
+            zip,
+            sample=[i for i in pd.unique(samples_table.index.values)],
+            seqmode=[
+                get_sample("seqmode", search_id="index", search_value=i)
+                for i in pd.unique(samples_table.index.values)
+            ],
+        ),
+        sample_name_list=expand(
+            "{sample}", sample=pd.unique(samples_table.index.values)
+        ),
+        salmon_merge_on="{salmon_merge_on}",
+        additional_params=parse_rule_config(
+            rule_config,
+            current_rule=current_rule,
+            immutable=(
+                "--quants",
+                "--genes",
+                "--transcripts",
+                "--names",
+                "--column",
+                "--output",
+            ),
+        ),
+    container:
+        "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
+    conda:
+        os.path.join(workflow.basedir, "envs", "salmon.yaml")
+    threads: 1
+    resources:
+        mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1024 * attempt,
+    log:
+        stderr=os.path.join(
+            config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log"
+        ),
+        stdout=os.path.join(
+            config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log"
+        ),
+    shell:
+        "(salmon quantmerge \
+        --quants {params.salmon_in} \
+        --genes \
+        --names {params.sample_name_list} \
+        --column {params.salmon_merge_on} \
+        --output {output.salmon_out} \
+        {params.additional_params}) \
+        1> {log.stdout} 2> {log.stderr}"
+
+
+current_rule = "salmon_quantmerge_transcripts"
+
+
+rule salmon_quantmerge_transcripts:
+    """
+    Merge transcript quantifications into a single file
+    """
+    input:
+        salmon_in=expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "{sample}.salmon.{seqmode}",
+                "quant.sf",
+            ),
+            zip,
+            sample=[i for i in pd.unique(samples_table.index.values)],
+            seqmode=[
+                get_sample("seqmode", search_id="index", search_value=i)
+                for i in pd.unique(samples_table.index.values)
+            ],
+        ),
+    output:
+        salmon_out=os.path.join(
+            config["output_dir"],
+            "summary_salmon",
+            "quantmerge",
+            "transcripts_{salmon_merge_on}.tsv",
+        ),
+    params:
+        cluster_log_path=config["cluster_log_dir"],
+        salmon_in=expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "{sample}.salmon.{seqmode}",
+            ),
+            zip,
+            sample=[i for i in pd.unique(samples_table.index.values)],
+            seqmode=[
+                get_sample("seqmode", search_id="index", search_value=i)
+                for i in pd.unique(samples_table.index.values)
+            ],
+        ),
+        sample_name_list=expand(
+            "{sample}", sample=pd.unique(samples_table.index.values)
+        ),
+        salmon_merge_on="{salmon_merge_on}",
+        additional_params=parse_rule_config(
+            rule_config,
+            current_rule=current_rule,
+            immutable=(
+                "--quants",
+                "--genes",
+                "--transcripts",
+                "--names",
+                "--column",
+                "--output",
+            ),
+        ),
+    container:
+        "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1"
+    conda:
+        os.path.join(workflow.basedir, "envs", "salmon.yaml")
+    threads: 1
+    resources:
+        mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1000 * attempt,
+    log:
+        stderr=os.path.join(
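+            # The merged table written by `salmon quantmerge` has one row per
+            # transcript (or per gene, in the rule above) and one column per
+            # sample, selected by --column, e.g. (values illustrative):
+            #
+            #   Name             sample1   sample2
+            #   ENST00000000001  12.4      0.0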
config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log" + ), + shell: + "(salmon quantmerge \ + --quants {params.salmon_in} \ + --names {params.sample_name_list} \ + --column {params.salmon_merge_on} \ + --output {output.salmon_out}) \ + {params.additional_params} \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "kallisto_merge_genes" + + +rule kallisto_merge_genes: + """ + Merge gene quantifications into single file + """ + input: + pseudoalignment=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), + zip, + sample=[i for i in pd.unique(samples_table.index.values)], + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + gtf=get_sample("gtf"), + output: + gn_tpm=os.path.join(config["output_dir"], "summary_kallisto", "genes_tpm.tsv"), + gn_counts=os.path.join( + config["output_dir"], "summary_kallisto", "genes_counts.tsv" + ), + params: + cluster_log_path=config["cluster_log_dir"], + dir_out=lambda wildcards, output: os.path.dirname(output.gn_counts), + tables=",".join( + expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "abundance.h5", + ), + sample=[i for i in pd.unique(samples_table.index.values)], + ) + ), + sample_name_list=",".join( + expand("{sample}", sample=pd.unique(samples_table.index.values)) + ), + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--input", + "--names", + "--txOut", + "--anno", + "--output", + ), + ), + container: + "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0" + conda: + os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml") + threads: 1 + resources: + mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) + * 1000 + * attempt, + log: + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + shell: + "(merge_kallisto.R \ + --input {params.tables} \ + --names {params.sample_name_list} \ + --txOut FALSE \ + --anno {input.gtf} \ + --output {params.dir_out} \ + {params.additional_params} ) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "kallisto_merge_transcripts" + + +rule kallisto_merge_transcripts: + """ + Merge transcript quantifications into a single files + """ + input: + pseudoalignment=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), + zip, + sample=[i for i in pd.unique(samples_table.index.values)], + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + output: + tx_tpm=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv" + ), + tx_counts=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_counts.tsv" + ), + params: + cluster_log_path=config["cluster_log_dir"], + dir_out=lambda wildcards, output: os.path.dirname(output.tx_counts), + tables=",".join( + expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "abundance.h5", + ), + sample=[i for i in pd.unique(samples_table.index.values)], + ) + ), + sample_name_list=",".join( + expand("{sample}", sample=pd.unique(samples_table.index.values)) 
+        ),
+        additional_params=parse_rule_config(
+            rule_config,
+            current_rule=current_rule,
+            immutable=(
+                "--input",
+                "--names",
+                "--txOut",
+                "--output",
+            ),
+        ),
+    container:
+        "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0"
+    conda:
+        os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml")
+    threads: 1
+    resources:
+        mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment)
+        * 1024
+        * attempt,
+    log:
+        stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"),
+        stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"),
+    shell:
+        "(merge_kallisto.R \
+        --input {params.tables} \
+        --names {params.sample_name_list} \
+        --output {params.dir_out} \
+        {params.additional_params}) \
+        1> {log.stdout} 2> {log.stderr}"
+
+
+current_rule = "pca_salmon"
+
+
+rule pca_salmon:
+    input:
+        tpm=os.path.join(
+            config["output_dir"], "summary_salmon", "quantmerge", "{molecule}_tpm.tsv"
+        ),
+    output:
+        out=directory(
+            os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}")
+        ),
+    params:
+        cluster_log_path=config["cluster_log_dir"],
+        additional_params=parse_rule_config(
+            rule_config,
+            current_rule=current_rule,
+            immutable=(
+                "--tpm",
+                "--out",
+            ),
+        ),
+    container:
+        "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0"
+    conda:
+        os.path.join(workflow.basedir, "envs", "zpca.yaml")
+    threads: 1
+    resources:
+        mem_mb=lambda wildcards, attempt: 4096 * attempt,
+    log:
+        stderr=os.path.join(
+            config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log"
+        ),
+        stdout=os.path.join(
+            config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log"
+        ),
+    shell:
+        "(zpca-tpm \
+        --tpm {input.tpm} \
+        --out {output.out} \
+        {params.additional_params}) \
+        1> {log.stdout} 2> {log.stderr}"
+
+
+current_rule = "pca_kallisto"
+
+
+rule pca_kallisto:
+    input:
+        tpm=os.path.join(config["output_dir"], "summary_kallisto", "{molecule}_tpm.tsv"),
+    output:
+        out=directory(
+            os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}")
+        ),
+    params:
+        cluster_log_path=config["cluster_log_dir"],
+        additional_params=parse_rule_config(
+            rule_config,
+            current_rule=current_rule,
+            immutable=(
+                "--tpm",
+                "--out",
+            ),
+        ),
+    container:
+        "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0"
+    conda:
+        os.path.join(workflow.basedir, "envs", "zpca.yaml")
+    threads: 1
+    resources:
+        mem_mb=lambda wildcards, attempt: 4096 * attempt,
+    log:
+        stderr=os.path.join(
+            config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log"
+        ),
+        stdout=os.path.join(
+            config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log"
+        ),
+    shell:
+        "(zpca-tpm \
+        --tpm {input.tpm} \
+        --out {output.out} \
+        {params.additional_params}) \
+        1> {log.stdout} 2> {log.stderr}"
+
+
+current_rule = "star_rpm"
+
+
+rule star_rpm:
+    """
+    Create stranded bedGraph coverage with STAR's RPM normalisation
+    """
+    input:
+        bam=lambda wildcards: expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "map_genome",
+                "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam",
+            ),
+            sample=wildcards.sample,
+            seqmode=get_sample(
+                "seqmode", search_id="index", search_value=wildcards.sample
+            ),
+        ),
+        bai=lambda wildcards: expand(
+            os.path.join(
+                config["output_dir"],
+                "samples",
+                "{sample}",
+                "map_genome",
+                "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai",
+            ),
+            sample=wildcards.sample,
+            seqmode=get_sample(
+                "seqmode", search_id="index", search_value=wildcards.sample
+            ),
+        ),
+    output:
+        str1=temp(
+            os.path.join(
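+                # STAR with --outWigType bedGraph writes four RPM-normalised
+                # tracks per sample, matched by the four outputs declared here:
+                #   Signal.Unique.str1.out.bg          unique mappers, strand 1
+                #   Signal.UniqueMultiple.str1.out.bg  incl. multimappers, strand 1
+                #   (plus the same pair for str2)
+                # str1/str2 are mapped to plus/minus per library type in
+                # rename_star_rpm_for_alfa below, via directionality_dict.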
config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.Unique.str1.out.bg", + ) + ), + str2=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.UniqueMultiple.str1.out.bg", + ) + ), + str3=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.Unique.str2.out.bg", + ) + ), + str4=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.UniqueMultiple.str2.out.bg", + ) + ), + shadow: + "full" + params: + cluster_log_path=config["cluster_log_dir"], + out_dir=lambda wildcards, output: os.path.dirname(output.str1), + prefix=lambda wildcards, output: os.path.join( + os.path.dirname(output.str1), f"{str(wildcards.sample)}_" + ), + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--runMode", + "--inputBAMfile", + "--outWigType", + "--outFileNamePrefix", + ), + ), + container: + "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" + conda: + os.path.join(workflow.basedir, "envs", "STAR.yaml") + threads: 4 + resources: + mem_mb=lambda wildcards, attempt: 8192 * attempt, + log: + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}_.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}_.stdout.log" + ), + shell: + "(mkdir -p {params.out_dir}; \ + chmod -R 777 {params.out_dir}; \ + STAR \ + --runMode inputAlignmentsFromBAM \ + --runThreadN {threads} \ + --inputBAMfile {input.bam} \ + --outWigType bedGraph \ + --outFileNamePrefix {params.prefix}) \ + {params.additional_params} \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "rename_star_rpm_for_alfa" + + +rule rename_star_rpm_for_alfa: + input: + plus=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.{unique}.{plus}.out.bg", + ), + sample=wildcards.sample, + unique=alfa_dict[wildcards.renamed_unique], + plus=get_directionality( + get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + "alfa_plus", + ), + ), + minus=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.{unique}.{minus}.out.bg", + ), + sample=wildcards.sample, + unique=alfa_dict[wildcards.renamed_unique], + minus=get_directionality( + get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + "alfa_minus", + ), + ), + output: + plus=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.{renamed_unique}.plus.bg", + ) + ), + minus=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.{renamed_unique}.minus.bg", + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + container: + "docker://ubuntu:focal-20210416" + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{renamed_unique}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{renamed_unique}}.stdout.log", + ), + shell: + "(cp {input.plus} {output.plus}; \ + cp {input.minus} {output.minus};) \ + 1>{log.stdout} 2>{log.stderr}" + + +current_rule = "generate_alfa_index" + + +rule generate_alfa_index: + """ Generate ALFA index files from sorted GTF file """ + input: + 
gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="organism", search_value=wildcards.organism) + ), + chr_len=os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrNameLength.txt", + ), + output: + index_stranded=os.path.join( + config["alfa_indexes"], + "{organism}", + "{index_size}", + "ALFA", + "sorted_genes.stranded.ALFA_index", + ), + index_unstranded=os.path.join( + config["alfa_indexes"], + "{organism}", + "{index_size}", + "ALFA", + "sorted_genes.unstranded.ALFA_index", + ), + temp_dir=temp( + directory( + os.path.join( + config["alfa_indexes"], + "{organism}", + "{index_size}", + "ALFA_temp", + ) + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + genome_index="sorted_genes", + out_dir=lambda wildcards, output: os.path.dirname(output.index_stranded), + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "-a", + "-g", + "--chr_len", + "-o", + ), + ), + container: + "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "alfa.yaml") + threads: 4 + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.log" + ), + shell: + "(mkdir -p {output.temp_dir}; \ + alfa -a {input.gtf} \ + -g {params.genome_index} \ + --chr_len {input.chr_len} \ + --temp_dir {output.temp_dir} \ + -p {threads} \ + -o {params.out_dir} \ + {params.additional_params}) \ + &> {log}" + + +current_rule = "alfa_qc" + + +rule alfa_qc: + """ + Run ALFA from stranded bedgraph files + """ + input: + plus=lambda wildcards: os.path.join( + config["output_dir"], + "samples", + wildcards.sample, + "ALFA", + wildcards.renamed_unique, + f"{wildcards.sample}.{wildcards.renamed_unique}.plus.bg", + ), + minus=lambda wildcards: os.path.join( + config["output_dir"], + "samples", + wildcards.sample, + "ALFA", + wildcards.renamed_unique, + f"{wildcards.sample}.{wildcards.renamed_unique}.minus.bg", + ), + gtf=lambda wildcards: os.path.join( + config["alfa_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", search_id="index", search_value=wildcards.sample), + "ALFA", + "sorted_genes.stranded.ALFA_index", + ), + output: + biotypes=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "ALFA_plots.Biotypes.pdf", + ) + ), + categories=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "ALFA_plots.Categories.pdf", + ) + ), + table=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.ALFA_feature_counts.tsv", + ), + temp_dir=temp( + directory( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "ALFA_temp", + ) + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + out_dir=lambda wildcards, output: os.path.dirname(output.biotypes), + genome_index=lambda wildcards, input: os.path.abspath( + os.path.join(os.path.dirname(input.gtf), "sorted_genes") + ), + plus=lambda wildcards, input: os.path.basename(input.plus), + minus=lambda wildcards, input: os.path.basename(input.minus), + name="{sample}", + alfa_orientation=lambda wildcards: get_directionality( + get_sample("libtype", search_id="index", search_value=wildcards.sample), + "alfa", + ), + temp_dir=lambda wildcards, output: os.path.abspath( + 
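+            # The -s value passed in the shell command below comes from
+            # directionality_dict at the top of this file, keyed on the
+            # samples-table 'libtype': e.g. libtype SR -> 'fr-firststrand',
+            # SF -> 'fr-secondstrand'.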
os.path.dirname(output.temp_dir) + ), + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "-g", + "--bedgraph", + "-s", + ), + ), + container: + "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "alfa.yaml") + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}.{{renamed_unique}}.log", + ), + shell: + "(mkdir -p {output.temp_dir};\ + cd {params.out_dir}; \ + alfa \ + -g {params.genome_index} \ + --bedgraph {params.plus} {params.minus} {params.name} \ + -s {params.alfa_orientation} \ + --temp_dir {params.temp_dir} \ + {params.additional_params}) \ + &> {log}" + + +current_rule = "prepare_multiqc_config" + + +rule prepare_multiqc_config: + """ + Prepare config for the MultiQC + """ + input: + script=os.path.join(workflow.basedir, "scripts", "zarp_multiqc_config.py"), + output: + multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), + params: + cluster_log_path=config["cluster_log_dir"], + logo_path=config["report_logo"] if "report_logo" in config else "", + multiqc_intro_text=config["report_description"], + url=config["report_url"] if "report_url" in config else "", + author_name=config["author_name"], + author_email=config["author_email"], + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--config", + "--intro-text", + "--custom-logo", + "--url", + ), + ), + log: + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + container: + "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + shell: + "(python {input.script} \ + --config {output.multiqc_config} \ + --intro-text '{params.multiqc_intro_text}' \ + --custom-logo '{params.logo_path}' \ + --url '{params.url}' \ + --author-name '{params.author_name}' \ + --author-email '{params.author_email}' \ + {params.additional_params}) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "multiqc_report" + + +rule multiqc_report: + """ + Create report with MultiQC + """ + input: + fastqc_se=expand( + os.path.join( + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" + ), + sample=pd.unique(samples_table.index.values), + mate="fq1", + ), + fastqc_pe=expand( + os.path.join( + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" + ), + sample=[ + i + for i in pd.unique( + samples_table[samples_table["seqmode"] == "pe"].index.values + ) + ], + mate="fq2", + ), + fastqc_trimmed_se=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "fastqc_trimmed", + "fq1_{seqmode}", + ), + zip, + sample=pd.unique(samples_table.index.values), + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + fastqc_trimmed_pe=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "fastqc_trimmed", + "{mate}_pe", + ), + sample=[ + i + for i in pd.unique( + samples_table[samples_table["seqmode"] == "pe"].index.values + ) + ], + mate="fq2", + ), + pseudoalignment=expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), + zip, 
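+            # expand(..., zip, ...) pairs the wildcard lists element-wise
+            # instead of taking their product, e.g.
+            #   sample=[s1, s2], seqmode=[pe, se]
+            #   -> s1.pe.kallisto.pseudo.sam, s2.se.kallisto.pseudo.sam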
+ sample=[i for i in pd.unique(samples_table.index.values)], + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + TIN_score=expand( + os.path.join( + config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" + ), + sample=pd.unique(samples_table.index.values), + ), + tables=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.ALFA_feature_counts.tsv", + ), + sample=pd.unique(samples_table.index.values), + renamed_unique=alfa_dict.keys(), + ), + zpca_salmon=expand( + os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}"), + molecule=["genes", "transcripts"], + ), + zpca_kallisto=expand( + os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}"), + molecule=["genes", "transcripts"], + ), + multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), + output: + multiqc_report=directory(os.path.join(config["output_dir"], "multiqc_summary")), + params: + cluster_log_path=config["cluster_log_dir"], + results_dir=lambda wildcards, output: os.path.split(output.multiqc_report)[0], + log_dir=config["log_dir"], + additional_params=parse_rule_config( + rule_config, + current_rule=current_rule, + immutable=( + "--outdir", + "--config", + ), + ), + container: + "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") + resources: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + log: + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + shell: + "(multiqc \ + --outdir {output.multiqc_report} \ + --config {input.multiqc_config} \ + {params.additional_params} \ + {params.results_dir} \ + {params.log_dir};) \ + 1> {log.stdout} 2> {log.stderr}" + + +current_rule = "sort_bed_4_big" + + +rule sort_bed_4_big: + """ + sort bedGraphs in order to work with bedGraphtobigWig + """ + input: + bg=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.{renamed_unique}.{strand}.bg", + ), + output: + sorted_bg=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "bigWig", + "{renamed_unique}", + "{sample}_{renamed_unique}_{strand}.sorted.bg", + ) + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("-i",) + ), + container: + "docker://quay.io/biocontainers/bedtools:2.27.1--h9a82719_5" + conda: + os.path.join(workflow.basedir, "envs", "bedtools.yaml") + resources: + mem_mb=lambda wildcards, attempt: 2048 * attempt, + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", + ), + shell: + "(sortBed \ + -i {input.bg} \ + {params.additional_params} \ + > {output.sorted_bg};) 2> {log.stderr}" + + +current_rule = "prepare_bigWig" + + +rule prepare_bigWig: + """ + bedGraphtobigWig, for viewing in genome browsers + """ + input: + sorted_bg=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "bigWig", + "{renamed_unique}", + "{sample}_{renamed_unique}_{strand}.sorted.bg", + ), + chr_sizes=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", 
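+            # bedGraphToBigWig needs a chromosome-sizes file; the STAR index's
+            # chrNameLength.txt ("name<TAB>length" per line) serves that role
+            # here. Hand-run sketch (hypothetical paths):
+            #   bedGraphToBigWig in.sorted.bg chrNameLength.txt out.bw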
search_id="index", search_value=wildcards.sample), + "STAR_index", + "chrNameLength.txt", + ), + output: + bigWig=os.path.join( + config["output_dir"], + "samples", + "{sample}", + "bigWig", + "{renamed_unique}", + "{sample}_{renamed_unique}_{strand}.bw", + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: + "docker://quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h0b8a92a_2" + conda: + os.path.join(workflow.basedir, "envs", "ucsc-bedgraphtobigwig.yaml") + resources: + mem_mb=lambda wildcards, attempt: 1024 * attempt, + log: + stderr=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", + ), + stdout=os.path.join( + config["log_dir"], + "samples", + "{sample}", + f"{current_rule}_{{renamed_unique}}_{{strand}}.stdout.log", + ), + shell: + "(bedGraphToBigWig \ + {params.additional_params} \ + {input.sorted_bg} \ + {input.chr_sizes} \ + {output.bigWig};) \ + 1> {log.stdout} 2> {log.stderr}" -- GitLab From 63a7dfb4a4077bbe6345b69b89cba11618bc9a73 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:16:26 +0000 Subject: [PATCH 02/22] Delete Snakefile --- workflow/Snakefile | 1941 -------------------------------------------- 1 file changed, 1941 deletions(-) delete mode 100644 workflow/Snakefile diff --git a/workflow/Snakefile b/workflow/Snakefile deleted file mode 100644 index 2a37bcf..0000000 --- a/workflow/Snakefile +++ /dev/null @@ -1,1941 +0,0 @@ -"""General purpose RNA-Seq analysis pipeline developed by the Zavolan Lab""" -import os -import pandas as pd -import shutil -import yaml -from shlex import quote -from typing import Tuple -from snakemake.utils import validate - - -configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml") - - -## Preparations -# Get sample table -samples_table = pd.read_csv( - config["samples"], - header=0, - index_col=0, - comment="#", - engine="python", - sep="\t", -) - -# dict for translation of "directionality parameters" -directionality_dict = { - "SF": { - "kallisto": "--fr-stranded", - "alfa": "fr-secondstrand", - "alfa_plus": "str1", - "alfa_minus": "str2", - }, - "SR": { - "kallisto": "--rf-stranded", - "alfa": "fr-firststrand", - "alfa_plus": "str2", - "alfa_minus": "str1", - }, -} -# dict for alfa output" -alfa_dict = {"MultimappersIncluded": "UniqueMultiple", "UniqueMappers": "Unique"} - -# Validate config -validate(config, os.path.join("..", "resources", "config_schema.json")) -logger.info(f"Config file after validation: {config}") - -# Parse YAML rule config file -try: - with open(config["rule_config"]) as _file: - rule_config = yaml.safe_load(_file) - logger.info(f"Loaded rule_config from {config['rule_config']}.") -except TypeError: - logger.error( - f'No string supplied at field "rule_config", but: {type(config["rule_config"])} with content: {config["rule_config"]}' - ) - raise -except FileNotFoundError: - logger.error( - f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! " - ) - raise -except KeyError: - rule_config = {} - logger.warning(f"No rule config specified: using default values for all tools.") - - -def parse_rule_config( - rule_config: dict, current_rule: str, immutable: Tuple[str, ...] 
-):
-    """Get rule-specific parameters from the rule_config file"""
-
-    # If no rule config file is present, an empty string will be returned
-    if not rule_config:
-        logger.info("No rule config specified: using default values for all tools.")
-        return ""
-    # Same if current rule not specified in rule config
-    if current_rule not in rule_config or not rule_config[current_rule]:
-        logger.info(
-            f"No additional parameters for rule {current_rule} specified: using default settings."
-        )
-        return ""
-
-    # Subset only section for current rule
-    rule_config = rule_config[current_rule]
-
-    # Build list of parameters and values
-    params_vals = []
-    for param, val in rule_config.items():
-        # Do not allow the user to change wiring-critical, fixed arguments, or arguments that are passed through samples table
-        if param in immutable:
-            raise ValueError(
-                f"The following parameter in rule {current_rule} is critical for the pipeline to "
-                f"function as expected and cannot be modified: {param}"
-            )
-        # Accept only strings; this prevents unintended results potentially
-        # arising from users entering reserved YAML keywords or nested
-        # structures (lists, dictionaries)
-        if isinstance(val, str):
-            params_vals.append(str(param))
-            # Do not include a value for flags (signified by empty strings)
-            if val:
-                params_vals.append(val)
-        else:
-            raise ValueError(
-                "Only string values allowed for tool parameters: Found type "
-                f"'{type(val).__name__}' for value of parameter '{param}'"
-            )
-    # Return quoted string
-    add_params = " ".join(quote(item) for item in params_vals)
-    logger.info(
-        f"User specified additional parameters for rule {current_rule}:\n {add_params}"
-    )
-    return add_params
-
-
-# Global config
-localrules:
-    start,
-    finish,
-    rename_star_rpm_for_alfa,
-    prepare_multiqc_config,
-
-
-# Include subworkflows
-include: os.path.join("rules", "common.smk")
-include: os.path.join("rules", "paired_end.snakefile.smk")
-include: os.path.join("rules", "single_end.snakefile.smk")
-
-
-rule finish:
-    """
-    Rule for collecting outputs
-    """
-    input:
-        multiqc_report=os.path.join(config["output_dir"], "multiqc_summary"),
-        bigWig=expand(
-            os.path.join(
-                config["output_dir"],
-                "samples",
-                "{sample}",
-                "bigWig",
-                "{renamed_unique}",
-                "{sample}_{renamed_unique}_{strand}.bw",
-            ),
-            sample=pd.unique(samples_table.index.values),
-            strand=["plus", "minus"],
-            renamed_unique=alfa_dict.keys(),
-        ),
-        salmon_merge_genes=expand(
-            os.path.join(
-                config["output_dir"],
-                "summary_salmon",
-                "quantmerge",
-                "genes_{salmon_merge_on}.tsv",
-            ),
-            salmon_merge_on=["tpm", "numreads"],
-        ),
-        salmon_merge_transcripts=expand(
-            os.path.join(
-                config["output_dir"],
-                "summary_salmon",
-                "quantmerge",
-                "transcripts_{salmon_merge_on}.tsv",
-            ),
-            salmon_merge_on=["tpm", "numreads"],
-        ),
-        kallisto_merge_transcripts=os.path.join(
-            config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv"
-        ),
-        kallisto_merge_genes=os.path.join(
-            config["output_dir"], "summary_kallisto", "genes_tpm.tsv"
-        ),
-
-
-current_rule = "start"
-
-
-rule start:
-    """
-    Get samples
-    """
-    input:
-        reads=lambda wildcards: expand(
-            pd.Series(samples_table.loc[wildcards.sample, wildcards.mate]).values
-        ),
-    output:
-        reads=os.path.join(
-            config["output_dir"],
-            "samples",
-            "{sample}",
-            "start",
-            "{sample}.{mate}.fastq.gz",
-        ),
-    params:
-        cluster_log_path=config["cluster_log_dir"],
-    log:
-        stderr=os.path.join(
-            config["log_dir"],
-            "samples",
-            "{sample}",
-            f"{current_rule}_{{sample}}.{{mate}}.stderr.log",
-        ),
-        stdout=os.path.join(
- config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{sample}}.{{mate}}.stdout.log", - ), - container: - "docker://ubuntu:focal-20210416" - shell: - "(cat {input.reads} > {output.reads}) \ - 1> {log.stdout} 2> {log.stderr} " - - -current_rule = "fastqc" - - -rule fastqc: - """ - A quality control tool for high throughput sequence data - """ - input: - reads=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "start", - "{sample}.{mate}.fastq.gz", - ), - output: - outdir=directory( - os.path.join( - config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=("--outdir",) - ), - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - container: - "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" - conda: - os.path.join(workflow.basedir, "envs", "fastqc.yaml") - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{mate}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{mate}}.stdout.log", - ), - shell: - "(mkdir -p {output.outdir}; \ - fastqc --outdir {output.outdir} \ - --threads {threads} \ - {params.additional_params} \ - {input.reads}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "fastqc_trimmed" - - -rule fastqc_trimmed: - """ - A quality control tool for high throughput sequence data - """ - input: - reads=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.{mate}.{seqmode}.remove_polya.fastq.gz", - ), - output: - outdir=directory( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "fastqc_trimmed", - "{mate}_{seqmode}", - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=("--outdir",) - ), - threads: 1 - container: - "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" - conda: - os.path.join(workflow.basedir, "envs", "fastqc.yaml") - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{seqmode}}_{{mate}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{seqmode}}_{{mate}}.stdout.log", - ), - shell: - "(mkdir -p {output.outdir}; \ - fastqc --outdir {output.outdir} \ - --threads {threads} \ - {params.additional_params} \ - {input.reads}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "create_index_star" - - -rule create_index_star: - """ - Create index for STAR alignments - """ - input: - genome=lambda wildcards: os.path.abspath( - get_sample("genome", search_id="organism", search_value=wildcards.organism) - ), - gtf=lambda wildcards: os.path.abspath( - get_sample("gtf", search_id="organism", search_value=wildcards.organism) - ), - output: - chromosome_info=os.path.join( - config["star_indexes"], - "{organism}", - "{index_size}", - "STAR_index", - "chrNameLength.txt", - ), - chromosomes_names=os.path.join( - config["star_indexes"], - "{organism}", - "{index_size}", - "STAR_index", - "chrName.txt", - ), - params: - cluster_log_path=config["cluster_log_dir"], - output_dir=lambda wildcards, output: os.path.dirname(output.chromosome_info), - outFileNamePrefix=os.path.join( - config["star_indexes"], "{organism}", "{index_size}", "STAR_index/STAR_" - ), - sjdbOverhang="{index_size}", - 
-        additional_params=parse_rule_config(
-            rule_config,
-            current_rule=current_rule,
-            immutable=(
-                "--runMode",
-                "--sjdbOverhang",
-                "--genomeDir",
-                "--genomeFastaFiles",
-                "--outFileNamePrefix",
-                "--sjdbGTFfile",
-            ),
-        ),
-    container:
-        "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"
-    conda:
-        os.path.join(workflow.basedir, "envs", "STAR.yaml")
-    threads: 12
-    resources:
-        mem_mb=lambda wildcards, attempt: 32000 * attempt,
-    log:
-        stderr=os.path.join(
-            config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.stderr.log"
-        ),
-        stdout=os.path.join(
-            config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.stdout.log"
-        ),
-    shell:
-        "(mkdir -p {params.output_dir}; \
-        chmod -R 777 {params.output_dir}; \
-        STAR \
-        --runMode genomeGenerate \
-        --sjdbOverhang {params.sjdbOverhang} \
-        --genomeDir {params.output_dir} \
-        --genomeFastaFiles {input.genome} \
-        --runThreadN {threads} \
-        --outFileNamePrefix {params.outFileNamePrefix} \
-        --sjdbGTFfile {input.gtf} \
-        {params.additional_params}) \
-        1> {log.stdout} 2> {log.stderr}"
-
-
-current_rule = "extract_transcriptome"
-
-
-rule extract_transcriptome:
-    """
-    Create transcriptome from genome and gene annotations
-    """
-    input:
-        genome=lambda wildcards: get_sample(
-            "genome", search_id="organism", search_value=wildcards.organism
-        ),
-        gtf=lambda wildcards: get_sample(
-            "gtf", search_id="organism", search_value=wildcards.organism
-        ),
-    output:
-        transcriptome=temp(
-            os.path.join(
-                config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa"
-            )
-        ),
-    params:
-        cluster_log_path=config["cluster_log_dir"],
-        additional_params=parse_rule_config(
-            rule_config,
-            current_rule=current_rule,
-            immutable=(
-                "-w",
-                "-g",
-            ),
-        ),
-    container:
-        "docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1"
-    conda:
-        os.path.join(workflow.basedir, "envs", "gffread.yaml")
-    resources:
-        mem_mb=lambda wildcards, attempt: 4096 * attempt,
-    log:
-        stderr=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.stderr.log"),
-        stdout=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.stdout.log"),
-    shell:
-        "(gffread \
-        -w {output.transcriptome} \
-        -g {input.genome} \
-        {params.additional_params} \
-        {input.gtf}) \
-        1> {log.stdout} 2> {log.stderr}"
-
-
-current_rule = "concatenate_transcriptome_and_genome"
-
-
-rule concatenate_transcriptome_and_genome:
-    """
-    Concatenate genome and transcriptome
-    """
-    input:
-        transcriptome=os.path.join(
-            config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa"
-        ),
-        genome=lambda wildcards: get_sample(
-            "genome", search_id="organism", search_value=wildcards.organism
-        ),
-    output:
-        genome_transcriptome=temp(
-            os.path.join(
-                config["output_dir"],
-                "transcriptome",
-                "{organism}",
-                "genome_transcriptome.fa",
-            )
-        ),
-    params:
-        cluster_log_path=config["cluster_log_dir"],
-    container:
-        "docker://ubuntu:focal-20210416"
-    resources:
-        mem_mb=lambda wildcards, attempt: 4096 * attempt,
-    log:
-        stderr=os.path.join(
-            config["log_dir"], f"{current_rule}_{{organism}}.stderr.log"
-        ),
-    shell:
-        "(cat {input.transcriptome} {input.genome} \
-        1> {output.genome_transcriptome}) \
-        2> {log.stderr}"
-
-
-current_rule = "create_index_salmon"
-
-
-rule create_index_salmon:
-    """
-    Create index for Salmon quantification
-    """
-    input:
-        genome_transcriptome=os.path.join(
-            config["output_dir"],
-            "transcriptome",
-            "{organism}",
-            "genome_transcriptome.fa",
-        ),
-        chr_names=lambda wildcards: os.path.join(
-            config["star_indexes"],
-            get_sample("organism"),
- get_sample("index_size"), - "STAR_index", - "chrName.txt", - ), - output: - index=directory( - os.path.join( - config["salmon_indexes"], "{organism}", "{kmer}", "salmon.idx" - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - kmerLen="{kmer}", - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--transcripts", - "--decoys", - "--index", - "--kmerLen", - ), - ), - container: - "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: - os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stdout.log" - ), - shell: - "(salmon index \ - --transcripts {input.genome_transcriptome} \ - --decoys {input.chr_names} \ - --index {output.index} \ - --kmerLen {params.kmerLen} \ - --threads {threads}) \ - {params.additional_params} \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "create_index_kallisto" - - -rule create_index_kallisto: - """ - Create index for Kallisto quantification - """ - input: - transcriptome=os.path.join( - config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" - ), - output: - index=os.path.join(config["kallisto_indexes"], "{organism}", "kallisto.idx"), - params: - cluster_log_path=config["cluster_log_dir"], - output_dir=lambda wildcards, output: os.path.dirname(output.index), - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=("-i",) - ), - container: - "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2" - conda: - os.path.join(workflow.basedir, "envs", "kallisto.yaml") - resources: - mem_mb=lambda wildcards, attempt: 8192 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{organism}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{organism}}.stdout.log" - ), - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - kallisto index \ - {params.additional_params} \ - -i {output.index} \ - {input.transcriptome}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "extract_transcripts_as_bed12" - - -rule extract_transcripts_as_bed12: - """ - Convert transcripts to BED12 format - """ - input: - gtf=lambda wildcards: get_sample("gtf"), - output: - bed12=temp( - os.path.join(config["output_dir"], "full_transcripts_protein_coding.bed") - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--gtf", - "--bed12", - ), - ), - container: - "docker://quay.io/biocontainers/zgtf:0.1.1--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "zgtf.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), - stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), - shell: - "(gtf2bed12 \ - --gtf {input.gtf} \ - --bed12 {output.bed12}); \ - {params.additional_params} \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "sort_genomic_alignment_samtools" - - -rule sort_genomic_alignment_samtools: - """ - Sort genome bamfile using samtools - """ - input: - bam=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - 
"{sample}.{seqmode}.Aligned.out.bam", - ), - output: - bam=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=() - ), - container: - "docker://quay.io/biocontainers/samtools:1.9--h10a08f8_12" - conda: - os.path.join(workflow.basedir, "envs", "samtools.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 8000 * attempt, - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}.{{seqmode}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}.{{seqmode}}.stdout.log", - ), - shell: - "(samtools sort \ - -o {output.bam} \ - -@ {threads} \ - {params.additional_params} \ - {input.bam}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "index_genomic_alignment_samtools" - - -rule index_genomic_alignment_samtools: - """ - Index genome bamfile using samtools - """ - input: - bam=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", - ), - output: - bai=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=() - ), - container: - "docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8" - conda: - os.path.join(workflow.basedir, "envs", "samtools.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}.{{seqmode}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}.{{seqmode}}.stdout.log", - ), - shell: - "(samtools index \ - {params.additional_params} \ - {input.bam} {output.bai};) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "calculate_TIN_scores" - - -rule calculate_TIN_scores: - """ - Calculate transcript integrity (TIN) score - """ - input: - bam=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", - ), - sample=wildcards.sample, - seqmode=get_sample( - "seqmode", search_id="index", search_value=wildcards.sample - ), - ), - bai=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", - ), - sample=wildcards.sample, - seqmode=get_sample( - "seqmode", search_id="index", search_value=wildcards.sample - ), - ), - transcripts_bed12=os.path.join( - config["output_dir"], "full_transcripts_protein_coding.bed" - ), - output: - TIN_score=temp( - os.path.join( - config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - sample="{sample}", - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "-i", - "-r", - "--names", - ), - ), - container: - "docker://quay.io/biocontainers/tin-score-calculation:0.6.3--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "tin-score-calculation.yaml") - threads: 8 - resources: - 
mem_mb=lambda wildcards, attempt, input: len(input.bam) * 1024 * attempt, - log: - stderr=os.path.join( - config["log_dir"], "samples", "{sample}", f"{current_rule}.log" - ), - shell: - "(calculate-tin.py \ - -i {input.bam} \ - -r {input.transcripts_bed12} \ - --names {params.sample} \ - -p {threads} \ - {params.additional_params} \ - > {output.TIN_score};) 2> {log.stderr}" - - -current_rule = "salmon_quantmerge_genes" - - -rule salmon_quantmerge_genes: - """ - Merge gene quantifications into a single file - """ - input: - salmon_in=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.{seqmode}", - "quant.sf", - ), - zip, - sample=pd.unique(samples_table.index.values), - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - output: - salmon_out=os.path.join( - config["output_dir"], - "summary_salmon", - "quantmerge", - "genes_{salmon_merge_on}.tsv", - ), - params: - cluster_log_path=config["cluster_log_dir"], - salmon_in=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.{seqmode}", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - sample_name_list=expand( - "{sample}", sample=pd.unique(samples_table.index.values) - ), - salmon_merge_on="{salmon_merge_on}", - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--quants", - "--genes", - "--transcripts", - "--names", - "--column", - "--output", - ), - ), - container: - "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: - os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1024 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log" - ), - shell: - "(salmon quantmerge \ - --quants {params.salmon_in} \ - --genes \ - --names {params.sample_name_list} \ - --column {params.salmon_merge_on} \ - --output {output.salmon_out};) \ - {params.additional_params} \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "salmon_quantmerge_transcripts" - - -rule salmon_quantmerge_transcripts: - """ - Merge transcript quantifications into a single file - """ - input: - salmon_in=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.{seqmode}", - "quant.sf", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - output: - salmon_out=os.path.join( - config["output_dir"], - "summary_salmon", - "quantmerge", - "transcripts_{salmon_merge_on}.tsv", - ), - params: - cluster_log_path=config["cluster_log_dir"], - salmon_in=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.{seqmode}", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - sample_name_list=expand( - "{sample}", sample=pd.unique(samples_table.index.values) - ), - salmon_merge_on="{salmon_merge_on}", - 
additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--quants", - "--genes", - "--transcripts", - "--names", - "--column", - "--output", - ), - ), - container: - "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: - os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1000 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log" - ), - shell: - "(salmon quantmerge \ - --quants {params.salmon_in} \ - --names {params.sample_name_list} \ - --column {params.salmon_merge_on} \ - --output {output.salmon_out}) \ - {params.additional_params} \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "kallisto_merge_genes" - - -rule kallisto_merge_genes: - """ - Merge gene quantifications into single file - """ - input: - pseudoalignment=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - gtf=get_sample("gtf"), - output: - gn_tpm=os.path.join(config["output_dir"], "summary_kallisto", "genes_tpm.tsv"), - gn_counts=os.path.join( - config["output_dir"], "summary_kallisto", "genes_counts.tsv" - ), - params: - cluster_log_path=config["cluster_log_dir"], - dir_out=lambda wildcards, output: os.path.dirname(output.gn_counts), - tables=",".join( - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "abundance.h5", - ), - sample=[i for i in pd.unique(samples_table.index.values)], - ) - ), - sample_name_list=",".join( - expand("{sample}", sample=pd.unique(samples_table.index.values)) - ), - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--input", - "--names", - "--txOut", - "--anno", - "--output", - ), - ), - container: - "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0" - conda: - os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) - * 1000 - * attempt, - log: - stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), - stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), - shell: - "(merge_kallisto.R \ - --input {params.tables} \ - --names {params.sample_name_list} \ - --txOut FALSE \ - --anno {input.gtf} \ - --output {params.dir_out} \ - {params.additional_params} ) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "kallisto_merge_transcripts" - - -rule kallisto_merge_transcripts: - """ - Merge transcript quantifications into a single files - """ - input: - pseudoalignment=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - output: - tx_tpm=os.path.join( - config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv" - ), - tx_counts=os.path.join( - config["output_dir"], 
"summary_kallisto", "transcripts_counts.tsv" - ), - params: - cluster_log_path=config["cluster_log_dir"], - dir_out=lambda wildcards, output: os.path.dirname(output.tx_counts), - tables=",".join( - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "abundance.h5", - ), - sample=[i for i in pd.unique(samples_table.index.values)], - ) - ), - sample_name_list=",".join( - expand("{sample}", sample=pd.unique(samples_table.index.values)) - ), - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--input", - "--names", - "--txOut", - "--output", - ), - ), - container: - "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0" - conda: - os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) - * 1024 - * attempt, - log: - stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), - stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), - shell: - "(merge_kallisto.R \ - --input {params.tables} \ - --names {params.sample_name_list} \ - --output {params.dir_out} \ - {params.additional_params}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "pca_salmon" - - -rule pca_salmon: - input: - tpm=os.path.join( - config["output_dir"], "summary_salmon", "quantmerge", "{molecule}_tpm.tsv" - ), - output: - out=directory( - os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}") - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--tpm", - "--out", - ), - ), - container: - "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "zpca.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log" - ), - shell: - "(zpca-tpm \ - --tpm {input.tpm} \ - --out {output.out} \ - {params.additional_params}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "pca_kallisto" - - -rule pca_kallisto: - input: - tpm=os.path.join(config["output_dir"], "summary_kallisto", "{molecule}_tpm.tsv"), - output: - out=directory( - os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}") - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--tpm", - "--out", - ), - ), - container: - "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "zpca.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - stderr=os.path.join( - config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log" - ), - shell: - "(zpca-tpm \ - --tpm {input.tpm} \ - --out {output.out} \ - {params.additional_params}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "star_rpm" - - -rule star_rpm: - """ - Create stranded bedgraph coverage with STARs RPM normalisation - """ - input: - bam=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - 
"{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", - ), - sample=wildcards.sample, - seqmode=get_sample( - "seqmode", search_id="index", search_value=wildcards.sample - ), - ), - bai=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", - ), - sample=wildcards.sample, - seqmode=get_sample( - "seqmode", search_id="index", search_value=wildcards.sample - ), - ), - output: - str1=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.Unique.str1.out.bg", - ) - ), - str2=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str1.out.bg", - ) - ), - str3=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.Unique.str2.out.bg", - ) - ), - str4=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str2.out.bg", - ) - ), - shadow: - "full" - params: - cluster_log_path=config["cluster_log_dir"], - out_dir=lambda wildcards, output: os.path.dirname(output.str1), - prefix=lambda wildcards, output: os.path.join( - os.path.dirname(output.str1), f"{str(wildcards.sample)}_" - ), - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--runMode", - "--inputBAMfile", - "--outWigType", - "--outFileNamePrefix", - ), - ), - container: - "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" - conda: - os.path.join(workflow.basedir, "envs", "STAR.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 8192 * attempt, - log: - stderr=os.path.join( - config["log_dir"], "samples", "{sample}", f"{current_rule}_.stderr.log" - ), - stdout=os.path.join( - config["log_dir"], "samples", "{sample}", f"{current_rule}_.stdout.log" - ), - shell: - "(mkdir -p {params.out_dir}; \ - chmod -R 777 {params.out_dir}; \ - STAR \ - --runMode inputAlignmentsFromBAM \ - --runThreadN {threads} \ - --inputBAMfile {input.bam} \ - --outWigType bedGraph \ - --outFileNamePrefix {params.prefix}) \ - {params.additional_params} \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "rename_star_rpm_for_alfa" - - -rule rename_star_rpm_for_alfa: - input: - plus=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.{unique}.{plus}.out.bg", - ), - sample=wildcards.sample, - unique=alfa_dict[wildcards.renamed_unique], - plus=get_directionality( - get_sample( - "libtype", search_id="index", search_value=wildcards.sample - ), - "alfa_plus", - ), - ), - minus=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.{unique}.{minus}.out.bg", - ), - sample=wildcards.sample, - unique=alfa_dict[wildcards.renamed_unique], - minus=get_directionality( - get_sample( - "libtype", search_id="index", search_value=wildcards.sample - ), - "alfa_minus", - ), - ), - output: - plus=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.{renamed_unique}.plus.bg", - ) - ), - minus=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.{renamed_unique}.minus.bg", - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - container: - "docker://ubuntu:focal-20210416" - 
log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{renamed_unique}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{renamed_unique}}.stdout.log", - ), - shell: - "(cp {input.plus} {output.plus}; \ - cp {input.minus} {output.minus};) \ - 1>{log.stdout} 2>{log.stderr}" - - -current_rule = "generate_alfa_index" - - -rule generate_alfa_index: - """ Generate ALFA index files from sorted GTF file """ - input: - gtf=lambda wildcards: os.path.abspath( - get_sample("gtf", search_id="organism", search_value=wildcards.organism) - ), - chr_len=os.path.join( - config["star_indexes"], - "{organism}", - "{index_size}", - "STAR_index", - "chrNameLength.txt", - ), - output: - index_stranded=os.path.join( - config["alfa_indexes"], - "{organism}", - "{index_size}", - "ALFA", - "sorted_genes.stranded.ALFA_index", - ), - index_unstranded=os.path.join( - config["alfa_indexes"], - "{organism}", - "{index_size}", - "ALFA", - "sorted_genes.unstranded.ALFA_index", - ), - temp_dir=temp( - directory( - os.path.join( - config["alfa_indexes"], - "{organism}", - "{index_size}", - "ALFA_temp", - ) - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - genome_index="sorted_genes", - out_dir=lambda wildcards, output: os.path.dirname(output.index_stranded), - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "-a", - "-g", - "--chr_len", - "-o", - ), - ), - container: - "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "alfa.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - os.path.join( - config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.log" - ), - shell: - "(mkdir -p {output.temp_dir}; \ - alfa -a {input.gtf} \ - -g {params.genome_index} \ - --chr_len {input.chr_len} \ - --temp_dir {output.temp_dir} \ - -p {threads} \ - -o {params.out_dir} \ - {params.additional_params}) \ - &> {log}" - - -current_rule = "alfa_qc" - - -rule alfa_qc: - """ - Run ALFA from stranded bedgraph files - """ - input: - plus=lambda wildcards: os.path.join( - config["output_dir"], - "samples", - wildcards.sample, - "ALFA", - wildcards.renamed_unique, - f"{wildcards.sample}.{wildcards.renamed_unique}.plus.bg", - ), - minus=lambda wildcards: os.path.join( - config["output_dir"], - "samples", - wildcards.sample, - "ALFA", - wildcards.renamed_unique, - f"{wildcards.sample}.{wildcards.renamed_unique}.minus.bg", - ), - gtf=lambda wildcards: os.path.join( - config["alfa_indexes"], - get_sample("organism", search_id="index", search_value=wildcards.sample), - get_sample("index_size", search_id="index", search_value=wildcards.sample), - "ALFA", - "sorted_genes.stranded.ALFA_index", - ), - output: - biotypes=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "ALFA_plots.Biotypes.pdf", - ) - ), - categories=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "ALFA_plots.Categories.pdf", - ) - ), - table=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.ALFA_feature_counts.tsv", - ), - temp_dir=temp( - directory( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "ALFA_temp", - ) - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - out_dir=lambda 
wildcards, output: os.path.dirname(output.biotypes), - genome_index=lambda wildcards, input: os.path.abspath( - os.path.join(os.path.dirname(input.gtf), "sorted_genes") - ), - plus=lambda wildcards, input: os.path.basename(input.plus), - minus=lambda wildcards, input: os.path.basename(input.minus), - name="{sample}", - alfa_orientation=lambda wildcards: get_directionality( - get_sample("libtype", search_id="index", search_value=wildcards.sample), - "alfa", - ), - temp_dir=lambda wildcards, output: os.path.abspath( - os.path.dirname(output.temp_dir) - ), - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "-g", - "--bedgraph", - "-s", - ), - ), - container: - "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "alfa.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}.{{renamed_unique}}.log", - ), - shell: - "(mkdir -p {output.temp_dir};\ - cd {params.out_dir}; \ - alfa \ - -g {params.genome_index} \ - --bedgraph {params.plus} {params.minus} {params.name} \ - -s {params.alfa_orientation} \ - --temp_dir {params.temp_dir} \ - {params.additional_params}) \ - &> {log}" - - -current_rule = "prepare_multiqc_config" - - -rule prepare_multiqc_config: - """ - Prepare config for the MultiQC - """ - input: - script=os.path.join(workflow.basedir, "scripts", "zarp_multiqc_config.py"), - output: - multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), - params: - cluster_log_path=config["cluster_log_dir"], - logo_path=config["report_logo"] if "report_logo" in config else "", - multiqc_intro_text=config["report_description"], - url=config["report_url"] if "report_url" in config else "", - author_name=config["author_name"], - author_email=config["author_email"], - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--config", - "--intro-text", - "--custom-logo", - "--url", - ), - ), - log: - stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), - stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), - container: - "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - shell: - "(python {input.script} \ - --config {output.multiqc_config} \ - --intro-text '{params.multiqc_intro_text}' \ - --custom-logo '{params.logo_path}' \ - --url '{params.url}' \ - --author-name '{params.author_name}' \ - --author-email '{params.author_email}' \ - {params.additional_params}) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "multiqc_report" - - -rule multiqc_report: - """ - Create report with MultiQC - """ - input: - fastqc_se=expand( - os.path.join( - config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" - ), - sample=pd.unique(samples_table.index.values), - mate="fq1", - ), - fastqc_pe=expand( - os.path.join( - config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" - ), - sample=[ - i - for i in pd.unique( - samples_table[samples_table["seqmode"] == "pe"].index.values - ) - ], - mate="fq2", - ), - fastqc_trimmed_se=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "fastqc_trimmed", - "fq1_{seqmode}", - ), - zip, - sample=pd.unique(samples_table.index.values), - seqmode=[ - get_sample("seqmode", 
search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - fastqc_trimmed_pe=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "fastqc_trimmed", - "{mate}_pe", - ), - sample=[ - i - for i in pd.unique( - samples_table[samples_table["seqmode"] == "pe"].index.values - ) - ], - mate="fq2", - ), - pseudoalignment=expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam", - ), - zip, - sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[ - get_sample("seqmode", search_id="index", search_value=i) - for i in pd.unique(samples_table.index.values) - ], - ), - TIN_score=expand( - os.path.join( - config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" - ), - sample=pd.unique(samples_table.index.values), - ), - tables=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.ALFA_feature_counts.tsv", - ), - sample=pd.unique(samples_table.index.values), - renamed_unique=alfa_dict.keys(), - ), - zpca_salmon=expand( - os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}"), - molecule=["genes", "transcripts"], - ), - zpca_kallisto=expand( - os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}"), - molecule=["genes", "transcripts"], - ), - multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), - output: - multiqc_report=directory(os.path.join(config["output_dir"], "multiqc_summary")), - params: - cluster_log_path=config["cluster_log_dir"], - results_dir=lambda wildcards, output: os.path.split(output.multiqc_report)[0], - log_dir=config["log_dir"], - additional_params=parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - "--outdir", - "--config", - ), - ), - container: - "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" - conda: - os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt, - log: - stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), - stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), - shell: - "(multiqc \ - --outdir {output.multiqc_report} \ - --config {input.multiqc_config} \ - {params.additional_params} \ - {params.results_dir} \ - {params.log_dir};) \ - 1> {log.stdout} 2> {log.stderr}" - - -current_rule = "sort_bed_4_big" - - -rule sort_bed_4_big: - """ - sort bedGraphs in order to work with bedGraphtobigWig - """ - input: - bg=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.{renamed_unique}.{strand}.bg", - ), - output: - sorted_bg=temp( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "bigWig", - "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.sorted.bg", - ) - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=("-i",) - ), - container: - "docker://quay.io/biocontainers/bedtools:2.27.1--h9a82719_5" - conda: - os.path.join(workflow.basedir, "envs", "bedtools.yaml") - resources: - mem_mb=lambda wildcards, attempt: 2048 * attempt, - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", - ), - shell: - "(sortBed \ - -i {input.bg} \ - {params.additional_params} \ - > 
{output.sorted_bg};) 2> {log.stderr}" - - -current_rule = "prepare_bigWig" - - -rule prepare_bigWig: - """ - bedGraphtobigWig, for viewing in genome browsers - """ - input: - sorted_bg=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "bigWig", - "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.sorted.bg", - ), - chr_sizes=lambda wildcards: os.path.join( - config["star_indexes"], - get_sample("organism", search_id="index", search_value=wildcards.sample), - get_sample("index_size", search_id="index", search_value=wildcards.sample), - "STAR_index", - "chrNameLength.txt", - ), - output: - bigWig=os.path.join( - config["output_dir"], - "samples", - "{sample}", - "bigWig", - "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.bw", - ), - params: - cluster_log_path=config["cluster_log_dir"], - additional_params=parse_rule_config( - rule_config, current_rule=current_rule, immutable=() - ), - container: - "docker://quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h0b8a92a_2" - conda: - os.path.join(workflow.basedir, "envs", "ucsc-bedgraphtobigwig.yaml") - resources: - mem_mb=lambda wildcards, attempt: 1024 * attempt, - log: - stderr=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", - ), - stdout=os.path.join( - config["log_dir"], - "samples", - "{sample}", - f"{current_rule}_{{renamed_unique}}_{{strand}}.stdout.log", - ), - shell: - "(bedGraphToBigWig \ - {params.additional_params} \ - {input.sorted_bg} \ - {input.chr_sizes} \ - {output.bigWig};) \ - 1> {log.stdout} 2> {log.stderr}" -- GitLab From 83496c92db17f7e4e428e111a5f28602f02879bd Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:17:58 +0000 Subject: [PATCH 03/22] Add "rules" directory with the three subworkflows --- workflow/rules/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 workflow/rules/.gitkeep diff --git a/workflow/rules/.gitkeep b/workflow/rules/.gitkeep new file mode 100644 index 0000000..e69de29 -- GitLab From 0200484018381c28158eb6e05f82a29c14d1321b Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:18:39 +0000 Subject: [PATCH 04/22] Snakefile of the map subworkflow --- workflow/rules/map.smk | 1017 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1017 insertions(+) create mode 100644 workflow/rules/map.smk diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk new file mode 100644 index 0000000..f5ef040 --- /dev/null +++ b/workflow/rules/map.smk @@ -0,0 +1,1017 @@ +############################################################################### +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). 
+###############################################################################
+#
+# USAGE:
+# snakemake \
+# --snakefile="map.smk" \
+# --cores 4 \
+# --use-singularity \
+# --singularity-args "--bind $PWD/../" \
+# --printshellcmds \
+# --rerun-incomplete \
+# --verbose
+#
+###############################################################################
+import os
+
+configfile: "config.yaml"
+
+# Rules that are cheap to execute and need no cluster resources are run
+# locally (localrules)
+localrules:
+    finish_map,
+
+###############################################################################
+### Finish rule
+###############################################################################
+
+
+rule finish_map:
+    input:
+        maps=expand(
+            os.path.join(
+                config["output_dir"],
+                "{sample}",
+                "convertedSortedMappings_{sample}.bam.bai",
+            ),
+            sample=config["sample"],
+        ),
+
+###############################################################################
+### Uncompress fastq files
+###############################################################################
+
+
+rule uncompress_zipped_files:
+    input:
+        reads=os.path.join(config["map_input_dir"], "{sample}.{format}.gz"),
+    output:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "{format}", "reads.{format}"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"],
+            "uncompress_zipped_files_{sample}_{format}.log",
+        ),
+    log:
+        os.path.join(
+            config["local_log"],
+            "uncompress_zipped_files_{sample}_{format}.log",
+        ),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(zcat {input.reads} > {output.reads}) &> {log}"
+
+
+###############################################################################
+### Quality filter
+###############################################################################
+
+
+rule fastq_quality_filter:
+    input:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "fastq", "reads.fastq"
+        ),
+    output:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "fastq_quality_filter_{sample}.log"
+        ),
+        p=config["p_value"],
+        q=config["q_value"],
+    log:
+        os.path.join(config["local_log"], "fastq_quality_filter_{sample}.log"),
+    singularity:
+        "docker://zavolab/fastx:0.0.14"
+    shell:
+        "(fastq_quality_filter \
+        -v \
+        -q {params.q} \
+        -p {params.p} \
+        -i {input.reads} \
+        > {output.reads} \
+        ) &> {log}"
+
+
+###############################################################################
+### Convert fastq to fasta
+###############################################################################
+
+
+rule fastq_to_fasta:
+    input:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq"
+        ),
+    output:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "fastq", "reads.fa"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "fastq_to_fasta_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "fastq_to_fasta_{sample}.log"),
+    singularity:
+        "docker://zavolab/fastx:0.0.14"
+    shell:
+        "(fastq_to_fasta -r -n -i {input.reads} > {output.reads}) &> {log}"
+
+
+###############################################################################
+### Format fasta file
+###############################################################################
+
+
+rule fasta_formatter:
+    input:
+        reads=lambda wildcards: os.path.join(
+            config["output_dir"],
+            wildcards.sample,
+            config[wildcards.sample]["format"],
+            "reads.fa",
+        ),
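
For reference, the fastq_quality_filter step above keeps a read only if at least p percent of its bases reach Phred quality q. A small, dependency-free sketch of roughly the same criterion:

    def passes_quality_filter(qual: str, min_q: int = 20, min_pct: int = 80) -> bool:
        # Phred scores, assuming the Sanger/Illumina 1.8+ ASCII offset of 33.
        scores = [ord(c) - 33 for c in qual]
        good = sum(1 for s in scores if s >= min_q)
        return good * 100 >= min_pct * len(scores)

    # A 10-base read with nine bases at Q40 and one at Q2 passes at p=80, q=20.
    assert passes_quality_filter("IIIIIIIII#", min_q=20, min_pct=80)
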
output: + reads=os.path.join(config["output_dir"], "{sample}", "formatted.fasta"), + params: + cluster_log=os.path.join( + config["cluster_log"], "fasta_formatter_{sample}.log" + ), + log: + os.path.join(config["local_log"], "fasta_formatter_{sample}.log"), + singularity: + "docker://zavolab/fastx:0.0.14" + shell: + "(fasta_formatter -w 0 -i {input.reads} > {output.reads}) &> {log}" + + +############################################################################### +### Remove adapters +############################################################################### + + +rule cutadapt: + input: + reads=os.path.join(config["output_dir"], "{sample}", "formatted.fasta"), + output: + reads=os.path.join(config["output_dir"], "{sample}", "cut.fasta"), + params: + cluster_log=os.path.join( + config["cluster_log"], "cutadapt_{sample}.log" + ), + adapter=lambda wildcards: config[wildcards.sample]["adapter"], + error_rate=config["error_rate"], + minimum_length=config["minimum_length"], + overlap=config["overlap"], + max_n=config["max_n"], + log: + os.path.join(config["local_log"], "cutadapt_{sample}.log"), + resources: + threads=8, + singularity: + "docker://zavolab/cutadapt:1.16" + shell: + "(cutadapt \ + -a {params.adapter} \ + --error-rate {params.error_rate} \ + --minimum-length {params.minimum_length} \ + --overlap {params.overlap} \ + --trim-n \ + --max-n {params.max_n} \ + --cores {resources.threads} \ + -o {output.reads} {input.reads}) &> {log}" + + +############################################################################### +### Collapse identical reads +############################################################################### + + +rule fastx_collapser: + input: + reads=os.path.join(config["output_dir"], "{sample}", "cut.fasta"), + output: + reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + params: + cluster_log=os.path.join( + config["cluster_log"], "fastx_collapser_{sample}.log" + ), + log: + os.path.join(config["local_log"], "fastx_collapser_{sample}.log"), + singularity: + "docker://zavolab/fastx:0.0.14" + shell: + "(fastx_collapser -i {input.reads} > {output.reads}) &> {log}" + + +############################################################################### +### Segemehl genome mapping +############################################################################### + + +rule mapping_genome_segemehl: + input: + reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + genome=config["genome"], + genome_index_segemehl=config["genome_index_segemehl"], + output: + gmap=os.path.join( + config["output_dir"], "{sample}", "segemehlGenome_map.sam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "mapping_genome_segemehl_{sample}.log" + ), + log: + os.path.join( + config["local_log"], "mapping_genome_segemehl_{sample}.log" + ), + resources: + mem=50, + time=12, + threads=8, + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x \ + -i {input.genome_index_segemehl} \ + -d {input.genome} \ + -t {threads} \ + -q {input.reads} \ + -outfile {output.gmap} \ + ) &> {log}" + + +############################################################################### +### Segemehl transcriptome mapping +############################################################################### + + +rule mapping_transcriptome_segemehl: + input: + reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + transcriptome=config["transcriptome"], + transcriptome_index_segemehl=config["transcriptome_index_segemehl"], + 
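
Both segemehl mapping rules consume collapsed.fasta, the output of fastx_collapser, which merges identical reads and records the copy number in the header (">rank-count", e.g. ">1-5" for the most frequent sequence seen five times; the exact format is an assumption here). The sam_uncollapse.pl step further down relies on that suffix to restore one alignment per original read. A sketch of the collapsing idea:

    from collections import Counter

    def collapse_reads(seqs):
        # Yield (header, sequence) pairs in fastx_collapser-style ">rank-count" form.
        for rank, (seq, n) in enumerate(Counter(seqs).most_common(), start=1):
            yield f">{rank}-{n}", seq

    print(list(collapse_reads(["ACGT", "ACGT", "TTTT"])))
    # [('>1-2', 'ACGT'), ('>2-1', 'TTTT')]
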
output: + tmap=os.path.join( + config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "mapping_transcriptome_segemehl_{sample}.log", + ), + log: + os.path.join( + config["local_log"], "mapping_transcriptome_segemehl_{sample}.log" + ), + resources: + mem=10, + time=12, + threads=8, + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x \ + -i {input.transcriptome_index_segemehl} \ + -d {input.transcriptome} \ + -t {threads} \ + -q {input.reads} \ + -outfile {output.tmap} \ + ) &> {log}" + + +############################################################################### +### Filter fasta for oligomap mapping +############################################################################### + + +rule filter_fasta_for_oligomap: + input: + reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), + script=os.path.join(config["scripts_dir"], "validation_fasta.py"), + output: + reads=os.path.join( + config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "filter_fasta_for_oligomap_{sample}.log" + ), + max_length_reads=config["max_length_reads"], + log: + os.path.join( + config["local_log"], "filter_fasta_for_oligomap_{sample}.log" + ), + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python {input.script} \ + -r {params.max_length_reads} \ + -i {input.reads} \ + -o {output.reads} \ + ) &> {log}" + + +############################################################################### +### Oligomap genome mapping +############################################################################### + + +rule mapping_genome_oligomap: + input: + reads=os.path.join( + config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" + ), + target=config["genome"], + output: + gmap=os.path.join( + config["output_dir"], "{sample}", "oligoGenome_map.fa" + ), + report=os.path.join( + config["output_dir"], "{sample}", "oligoGenome_report.txt" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "mapping_genome_oligomap_{sample}.log" + ), + log: + os.path.join( + config["local_log"], "mapping_genome_oligomap_{sample}.log" + ), + resources: + mem=50, + time=6, + threads=8, + singularity: + "docker://zavolab/oligomap:1.0" + shell: + "(oligomap \ + {input.target} \ + {input.reads} \ + -r {output.report} \ + > {output.gmap} \ + ) &> {log}" + + +############################################################################### +### Oligomap genome sorting +############################################################################### + + +rule sort_genome_oligomap: + input: + tmap=os.path.join( + config["output_dir"], "{sample}", "oligoGenome_map.fa" + ), + report=os.path.join( + config["output_dir"], "{sample}", "oligoGenome_report.txt" + ), + script=os.path.join(config["scripts_dir"], "blocksort.sh"), + output: + sort=os.path.join( + config["output_dir"], "{sample}", "oligoGenome_sorted.fa" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "sorting_genome_oligomap_{sample}.log" + ), + log: + os.path.join( + config["local_log"], "sorting_genome_oligomap_{sample}.log" + ), + resources: + threads=8, + time=6, + shell: + "(bash {input.script} \ + {input.tmap} \ + {resources.threads} \ + {output.sort} \ + ) &> {log}" + + +############################################################################### +### Oligomap genome mapping output to SAM 
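
Oligomap is only applied to short reads: the filter_fasta_for_oligomap rule above drops anything longer than max_length_reads before mapping. A trivial stand-in for that filter (the real validation_fasta.py script may do more, such as validating sequences):

    def filter_by_length(records, max_len: int = 30):
        # records: iterable of (header, sequence) tuples from a FASTA parser
        return [(h, s) for h, s in records if len(s) <= max_len]

    assert filter_by_length([(">1-2", "ACGT"), (">2-1", "A" * 50)]) == [(">1-2", "ACGT")]
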
+###############################################################################
+
+
+rule oligomap_genome_toSAM:
+    input:
+        report=os.path.join(
+            config["output_dir"], "{sample}", "oligoGenome_report.txt"
+        ),
+        sort=os.path.join(
+            config["output_dir"], "{sample}", "oligoGenome_sorted.fa"
+        ),
+        script=os.path.join(
+            config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py"
+        ),
+    output:
+        gmap=os.path.join(
+            config["output_dir"], "{sample}", "oligoGenome_converted.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "oligomap_genome_toSAM_{sample}.log"
+        ),
+        nh=config["nh"],
+    log:
+        os.path.join(config["local_log"], "oligomap_genome_toSAM_{sample}.log"),
+    resources:
+        time=1,
+        queue=1,
+    singularity:
+        "docker://zavolab/python:3.6.5"
+    shell:
+        "(python {input.script} \
+        -i {input.sort} \
+        -n {params.nh} \
+        > {output.gmap}) &> {log}"
+
+
+###############################################################################
+### Oligomap transcriptome mapping
+###############################################################################
+
+
+rule mapping_transcriptome_oligomap:
+    input:
+        reads=os.path.join(
+            config["output_dir"], "{sample}", "filtered_for_oligomap.fasta"
+        ),
+        target=config["transcriptome"],
+    output:
+        tmap=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_map.fa"
+        ),
+        report=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_report.txt"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"],
+            "mapping_transcriptome_oligomap_{sample}.log",
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "mapping_transcriptome_oligomap_{sample}.log"
+        ),
+    resources:
+        mem=10,
+        time=6,
+        threads=8,
+    singularity:
+        "docker://zavolab/oligomap:1.0"
+    shell:
+        "(oligomap \
+        {input.target} \
+        {input.reads} \
+        -s \
+        -r {output.report} \
+        > {output.tmap} \
+        ) &> {log}"
+
+
+###############################################################################
+### Oligomap transcriptome sorting
+###############################################################################
+
+
+rule sort_transcriptome_oligomap:
+    input:
+        tmap=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_map.fa"
+        ),
+        report=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_report.txt"
+        ),
+        script=os.path.join(config["scripts_dir"], "blocksort.sh"),
+    output:
+        sort=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"],
+            "sorting_transcriptome_oligomap_{sample}.log",
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "sorting_transcriptome_oligomap_{sample}.log"
+        ),
+    resources:
+        threads=8,
+    shell:
+        "(bash {input.script} \
+        {input.tmap} \
+        {resources.threads} \
+        {output.sort} \
+        ) &> {log}"
+
+
+###############################################################################
+### Oligomap transcriptome mapping output to SAM
+###############################################################################
+
+
+rule oligomap_transcriptome_toSAM:
+    input:
+        report=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_report.txt"
+        ),
+        sort=os.path.join(
+            config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa"
+        ),
+        script=os.path.join(
+            config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py"
+        ),
+    output:
+        tmap=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "oligoTranscriptome_converted.sam",
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"],
+            "oligomap_transcriptome_toSAM_{sample}.log"
+        ),
+        nh=config["nh"],
+    log:
+        os.path.join(
+            config["local_log"], "oligomap_transcriptome_toSAM_{sample}.log"
+        ),
+    singularity:
+        "docker://zavolab/python:3.6.5"
+    shell:
+        "(python {input.script} \
+        -i {input.sort} \
+        -n {params.nh} \
+        > {output.tmap} \
+        ) &> {log}"
+
+
+###############################################################################
+### Merge genome mappings
+###############################################################################
+
+
+rule merge_genome_maps:
+    input:
+        gmap1=os.path.join(
+            config["output_dir"], "{sample}", "segemehlGenome_map.sam"
+        ),
+        gmap2=os.path.join(
+            config["output_dir"], "{sample}", "oligoGenome_converted.sam"
+        ),
+    output:
+        gmaps=os.path.join(
+            config["output_dir"], "{sample}", "GenomeMappings.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "merge_genome_maps_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "merge_genome_maps_{sample}.log"),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(cat {input.gmap1} {input.gmap2} > {output.gmaps}) &> {log}"
+
+
+###############################################################################
+### Merge transcriptome mappings
+###############################################################################
+
+
+rule merge_transcriptome_maps:
+    input:
+        tmap1=os.path.join(
+            config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam"
+        ),
+        tmap2=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "oligoTranscriptome_converted.sam",
+        ),
+    output:
+        tmaps=os.path.join(
+            config["output_dir"], "{sample}", "TranscriptomeMappings.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "merge_transcriptome_maps_{sample}.log"
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "merge_transcriptome_maps_{sample}.log"
+        ),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(cat {input.tmap1} {input.tmap2} > {output.tmaps}) &> {log}"
+
+
+###############################################################################
+### Filter NH genome
+###############################################################################
+
+
+rule nh_filter_genome:
+    input:
+        gmaps=os.path.join(
+            config["output_dir"], "{sample}", "GenomeMappings.sam"
+        ),
+        script=os.path.join(config["scripts_dir"], "nh_filter.py"),
+    output:
+        gmaps=os.path.join(
+            config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "nh_filter_genome_{sample}.log"
+        ),
+        nh=config["nh"],
+    log:
+        os.path.join(config["local_log"], "nh_filter_genome_{sample}.log"),
+    singularity:
+        "docker://zavolab/python:3.6.5"
+    shell:
+        "(python {input.script} \
+        {input.gmaps} \
+        {params.nh} \
+        {output.gmaps} \
+        ) &> {log}"
+
+
+###############################################################################
+### Filter NH transcriptome
+###############################################################################
+
+
+rule filter_nh_transcriptome:
+    input:
+        tmaps=os.path.join(
+            config["output_dir"], "{sample}", "TranscriptomeMappings.sam"
+        ),
+        script=os.path.join(config["scripts_dir"], "nh_filter.py"),
+    output:
+        tmaps=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "nhfiltered_TranscriptomeMappings.sam",
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "filter_nh_transcriptome_{sample}.log"
+        ),
+        nh=config["nh"],
+    log:
+        os.path.join(
+            config["local_log"], "filter_nh_transcriptome_{sample}.log"
+        ),
+    singularity:
+        "docker://zavolab/python:3.6.5"
+    shell:
+        "(python {input.script} \
+        {input.tmaps} \
+        {params.nh} \
+        {output.tmaps} \
+        ) &> {log}"
+
+
+###############################################################################
+### Remove headers from genome mappings
+###############################################################################
+
+
+rule remove_headers_genome:
+    input:
+        gmap=os.path.join(
+            config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam"
+        ),
+    output:
+        gmap=os.path.join(
+            config["output_dir"], "{sample}", "noheader_GenomeMappings.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "remove_headers_genome_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "remove_headers_genome_{sample}.log"),
+    singularity:
+        "docker://zavolab/samtools:1.8"
+    shell:
+        "(samtools view {input.gmap} > {output.gmap}) &> {log}"
+
+
+###############################################################################
+### Remove headers from transcriptome mappings
+###############################################################################
+
+
+rule remove_headers_transcriptome:
+    input:
+        tmap=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "nhfiltered_TranscriptomeMappings.sam",
+        ),
+    output:
+        tmap=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "noheader_TranscriptomeMappings.sam",
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "remove_headers_transcriptome_{sample}.log"
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "remove_headers_transcriptome_{sample}.log"
+        ),
+    singularity:
+        "docker://zavolab/samtools:1.8"
+    shell:
+        "(samtools view {input.tmap} > {output.tmap}) &> {log}"
+
+
+###############################################################################
+### Transcriptome to genome coordinates
+###############################################################################
+
+
+rule trans_to_gen:
+    input:
+        tmap=os.path.join(
+            config["output_dir"],
+            "{sample}",
+            "noheader_TranscriptomeMappings.sam",
+        ),
+        script=os.path.join(config["scripts_dir"], "sam_trx_to_sam_gen.pl"),
+        exons=config["exons"],
+    output:
+        genout=os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "trans_to_gen_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "trans_to_gen_{sample}.log"),
+    singularity:
+        "docker://zavolab/perl:5.28"
+    shell:
+        "(perl {input.script} \
+        --in {input.tmap} \
+        --exons {input.exons} \
+        --out {output.genout} \
+        ) &> {log}"
+
+
+###############################################################################
+### Concatenate genome and transcriptome mappings
+###############################################################################
+
+
+rule cat_mapping:
+    input:
+        gmap1=os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"),
+        gmap2=os.path.join(
+            config["output_dir"], "{sample}", "noheader_GenomeMappings.sam"
+        ),
+    output:
+        catmaps=os.path.join(
+            config["output_dir"], "{sample}", "catMappings.sam"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "cat_mapping_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "cat_mapping_{sample}.log"),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(cat {input.gmap1} {input.gmap2} > {output.catmaps}) &> {log}"
+
+
+###############################################################################
+### Add header
+###############################################################################
+
+
+rule add_header:
+    input:
header=config["header_of_collapsed_fasta"], + catmaps=os.path.join( + config["output_dir"], "{sample}", "catMappings.sam" + ), + output: + concatenate=os.path.join( + config["output_dir"], + "{sample}", + "concatenated_header_catMappings.sam", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "add_header_{sample}.log" + ), + log: + os.path.join(config["local_log"], "add_header_{sample}.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {input.header} {input.catmaps} > {output.concatenate}) &> {log}" + + +############################################################################### +### Sort mapped file by IDs +############################################################################### + + +rule sort_id: + input: + concatenate=os.path.join( + config["output_dir"], + "{sample}", + "concatenated_header_catMappings.sam", + ), + output: + sort=os.path.join( + config["output_dir"], "{sample}", "header_sorted_catMappings.sam" + ), + params: + cluster_log=os.path.join(config["cluster_log"], "sort_id_{sample}.log"), + log: + os.path.join(config["local_log"], "sort_id_{sample}.log"), + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools sort -n -o {output.sort} {input.concatenate}) &> {log}" + + +############################################################################### +### Remove inferior mappings (keeping multimappers) +############################################################################### + + +rule remove_inferiors: + input: + sort=os.path.join( + config["output_dir"], "{sample}", "header_sorted_catMappings.sam" + ), + script=os.path.join( + config["scripts_dir"], + "sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl", + ), + output: + remove_inf=os.path.join( + config["output_dir"], "{sample}", "removeInferiors.sam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "remove_inferiors_{sample}.log" + ), + log: + os.path.join(config["local_log"], "remove_inferiors_{sample}.log"), + resources: + mem=15, + threads=4, + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} \ + --print-header \ + --keep-mm \ + --in {input.sort} \ + --out {output.remove_inf} \ + ) &> {log}" + + +############################################################################### +### Uncollapse reads +############################################################################### + + +rule uncollapse_reads: + input: + maps=os.path.join( + config["output_dir"], "{sample}", "removeInferiors.sam" + ), + script=os.path.join(config["scripts_dir"], "sam_uncollapse.pl"), + output: + maps=os.path.join( + config["output_dir"], "{sample}", "uncollapsedMappings.sam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "uncollapse_reads_{sample}.log" + ), + log: + os.path.join(config["local_log"], "uncollapse_reads_{sample}.log"), + singularity: + "docker://zavolab/perl:5.28" + shell: + "(perl {input.script} \ + --suffix \ + --in {input.maps} \ + --out {output.maps} \ + ) &> {log}" + + +############################################################################### +### Convert SAM to BAM +############################################################################### + + +rule convert_to_bam: + input: + maps=os.path.join( + config["output_dir"], "{sample}", "uncollapsedMappings.sam" + ), + output: + maps=os.path.join( + config["output_dir"], "{sample}", "mappingsConverted.bam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "convert_to_bam_{sample}.log" + ), + log: + 
os.path.join(config["local_log"], "convert_to_bam_{sample}.log"), + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools view -b {input.maps} > {output.maps}) &> {log}" + + +############################################################################### +### Sort by coordinate position +############################################################################### + + +rule sort_by_position: + input: + maps=os.path.join( + config["output_dir"], "{sample}", "mappingsConverted.bam" + ), + output: + maps=os.path.join( + config["output_dir"], + "{sample}", + "convertedSortedMappings_{sample}.bam", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "sort_by_position_{sample}.log" + ), + log: + os.path.join(config["local_log"], "sort_by_position_{sample}.log"), + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools sort {input.maps} > {output.maps}) &> {log}" + + +############################################################################### +### Create bam index +############################################################################### + + +rule index_bam: + input: + maps=os.path.join( + config["output_dir"], + "{sample}", + "convertedSortedMappings_{sample}.bam", + ), + output: + maps=os.path.join( + config["output_dir"], + "{sample}", + "convertedSortedMappings_{sample}.bam.bai", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "index_bam_{sample}.log" + ), + log: + os.path.join(config["local_log"], "index_bam_{sample}.log"), + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools index -b {input.maps} > {output.maps}) &> {log}" -- GitLab From c9a00a2155a0a1a957da8d77a6cfdd4ef899738b Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:19:19 +0000 Subject: [PATCH 05/22] Snakefile of the "prepare" subworkflow --- workflow/rules/prepare.smk | 751 +++++++++++++++++++++++++++++++++++++ 1 file changed, 751 insertions(+) create mode 100644 workflow/rules/prepare.smk diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk new file mode 100644 index 0000000..b254a90 --- /dev/null +++ b/workflow/rules/prepare.smk @@ -0,0 +1,751 @@ +############################################################################### +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Snakemake workflow to download and prepare the necessary files +# for smallRNA-seq related workflows. 
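+# Outputs collected by the "finish_prepare" rule below: segemehl genome and
+# transcriptome indices, exon annotations (BED), a SAM header for the
+# collapsed reads, and miRNA/isomiR annotations (BED).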
+# +############################################################################### +# +# USAGE: +# snakemake \ +# --snakefile="prepare.smk" \ +# --cores 4 \ +# --use-singularity \ +# --singularity-args "--bind $PWD/../" \ +# --printshellcmds \ +# --rerun-incomplete \ +# --verbose +# +################################################################################ +import os + +configfile: "config.yaml" + +# Rules that require internet connection for downloading files are included +# in the localrules +localrules: + finish_prepare, + genome_process, + filter_anno_gtf, + mirna_anno, + dict_chr, + + +############################################################################### +### Finish rule +############################################################################### + + +rule finish_prepare: + input: + idx_transcriptome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx", + ), + organism=config["organism"], + ), + idx_genome=expand( + os.path.join( + config["output_dir"], + "{organism}", + "genome_index_segemehl.idx", + ), + organism=config["organism"], + ), + exons=expand( + os.path.join( + config["output_dir"], + "{organism}", + "exons.bed", + ), + organism=config["organism"], + ), + header=expand( + os.path.join( + config["output_dir"], + "{organism}", + "headerOfCollapsedFasta.sam", + ), + organism=config["organism"], + ), + mirnafilt=expand( + os.path.join( + config["output_dir"], + "{organism}", + "mirna_filtered.bed", + ), + organism=config["organism"], + ), + isomirs=expand( + os.path.join( + config["output_dir"], + "{organism}", + "isomirs_annotation.bed", + ), + organism=config["organism"], + ), + + + +############################################################################### +### Download and process genome IDs +############################################################################### +rule genome_process: + input: + script=os.path.join(config["scripts_dir"], "genome_process.sh"), + output: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + params: + url=lambda wildcards: config[wildcards.organism]["genome_url"], + dir_out=os.path.join(config["output_dir"], "{organism}"), + log: + os.path.join(config["local_log"], "{organism}", "genome_process.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash {input.script} {params.dir_out} {log} {params.url})" + + +############################################################################### +### Download and filter gtf by transcript_level +############################################################################### + + +rule filter_anno_gtf: + input: + script=os.path.join(config["scripts_dir"], "filter_anno_gtf.sh"), + output: + gtf=os.path.join( + config["output_dir"], + "{organism}", + "gene_annotations.filtered.gtf", + ), + params: + url=lambda wildcards: config[wildcards.organism]["gtf_url"], + dir_out=os.path.join(config["output_dir"], "{organism}"), + log: + os.path.join(config["local_log"], "{organism}", "filter_anno_gtf.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}" + + +############################################################################### +### Extract transcriptome sequences in FASTA from genome. 
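+### (gffread assembles each transcript's spliced exon sequence from the
+### processed genome FASTA and the filtered GTF.)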
+############################################################################### + + +rule extract_transcriptome_seqs: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + gtf=os.path.join( + config["output_dir"], + "{organism}", + "gene_annotations.filtered.gtf", + ), + output: + fasta=os.path.join( + config["output_dir"], "{organism}", "transcriptome.fa" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "{organism}", + "extract_transcriptome_seqs.log", + ), + log: + os.path.join( + config["local_log"], + "{organism}", + "extract_transcriptome_seqs.log", + ), + singularity: + "docker://zavolab/cufflinks:2.2.1" + shell: + "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}" + + +############################################################################### +### Trim transcript IDs from FASTA file +############################################################################### + + +rule trim_fasta: + input: + fasta=os.path.join( + config["output_dir"], "{organism}", "transcriptome.fa" + ), + script=os.path.join(config["scripts_dir"], "validation_fasta.py"), + output: + fasta=os.path.join( + config["output_dir"], "{organism}", "transcriptome_idtrim.fa" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "trim_fasta.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "trim_fasta.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + """(awk \ + -F" " \ + "/^>/ {{print \$1; next}} 1" \ + {input.fasta} \ + > {output.fasta} \ + ) &> {log}""" + + +############################################################################### +### Generate segemehl index for transcripts +############################################################################### + + +rule generate_segemehl_index_transcriptome: + input: + fasta=os.path.join( + config["output_dir"], "{organism}", "transcriptome_idtrim.fa" + ), + output: + idx=os.path.join( + config["output_dir"], + "{organism}", + "transcriptome_index_segemehl.idx", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "{organism}", + "generate_segemehl_index_transcriptome.log", + ), + log: + os.path.join( + config["local_log"], + "{organism}", + "generate_segemehl_index_transcriptome.log", + ), + resources: + mem=10, + threads=8, + time=6, + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}" + + +############################################################################### +### Generate segemehl index for genome +############################################################################### + + +rule generate_segemehl_index_genome: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + output: + idx=os.path.join( + config["output_dir"], "{organism}", "genome_index_segemehl.idx" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "{organism}", + "generate_segemehl_index_genome.log", + ), + log: + os.path.join( + config["local_log"], + "{organism}", + "generate_segemehl_index_genome.log", + ), + resources: + mem=60, + threads=8, + time=6, + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}" + + +############################################################################### +### GTF file of exons (genomic coordinates) +############################################################################### + + +rule 
get_exons_gtf: + input: + gtf=os.path.join( + config["output_dir"], + "{organism}", + "gene_annotations.filtered.gtf", + ), + script=os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh"), + output: + exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "get_exons_gtf.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(bash \ + {input.script} \ + -f {input.gtf} \ + -c 3 \ + -p exon \ + -o {output.exons} \ + ) &> {log}" + + +############################################################################### +### Convert GTF file of exons to BED file +############################################################################### + + +rule gtftobed: + input: + exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), + script=os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R"), + output: + exons=os.path.join(config["output_dir"], "{organism}", "exons.bed"), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "gtftobed.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "gtftobed.log"), + singularity: + "docker://zavolab/r-zavolab:3.5.1" + shell: + "(Rscript \ + {input.script} \ + --gtf {input.exons} \ + -o {output.exons} \ + ) &> {log}" + + +############################################################################### +### Create header for SAM file +############################################################################### + + +rule create_header_genome: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + output: + header=os.path.join( + config["output_dir"], "{organism}", "headerOfCollapsedFasta.sam" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "create_header_genome.log" + ), + log: + os.path.join( + config["local_log"], "{organism}", "create_header_genome.log" + ), + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools dict -o {output.header} --uri=NA {input.genome}) &> {log}" + + +############################################################################### +### Download miRNA annotation +############################################################################### + + +rule mirna_anno: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + output: + anno=os.path.join( + config["output_dir"], "{organism}", "raw", "mirna.gff3" + ), + params: + anno=lambda wildcards: config[wildcards.organism]["mirna_url"], + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "mirna_anno.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "mirna_anno.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(wget {params.anno} -O {output.anno}) &> {log}" + + +############################################################################### +### Download dictionary mapping chr +############################################################################### + + +rule dict_chr: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa" + ), + output: + map_chr=os.path.join( + config["output_dir"], "{organism}", "UCSC2ensembl.txt" + ), + params: + map_chr=lambda wildcards: config[wildcards.organism]["map_chr_url"], + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "dict_chr.log" + ), + log: + 
os.path.join(config["local_log"], "{organism}", "dict_chr.log"),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(wget {params.map_chr} -O {output.map_chr}) &> {log}"
+
+
+###############################################################################
+### Mapping chromosome names, UCSC <-> ENSEMBL
+###############################################################################
+
+
+rule map_chr_names:
+    input:
+        anno=os.path.join(
+            config["output_dir"], "{organism}", "raw", "mirna.gff3"
+        ),
+        script=os.path.join(config["scripts_dir"], "map_chromosomes.pl"),
+        map_chr=os.path.join(
+            config["output_dir"], "{organism}", "UCSC2ensembl.txt"
+        ),
+    output:
+        gff=os.path.join(
+            config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "{organism}", "map_chr_names.log"
+        ),
+        column=lambda wildcards: config[wildcards.organism]["column"],
+        delimiter=lambda wildcards: config[wildcards.organism]["delimiter"],
+    log:
+        os.path.join(config["local_log"], "{organism}", "map_chr_names.log"),
+    singularity:
+        "docker://zavolab/perl:5.28"
+    shell:
+        "(perl {input.script} \
+        {input.anno} \
+        {params.column} \
+        {params.delimiter} \
+        {input.map_chr} \
+        {output.gff} \
+        ) &> {log}"
+
+
+###############################################################################
+### Filtering _1 miR IDs
+###############################################################################
+
+
+rule filter_mir_1_anno:
+    input:
+        gff=os.path.join(
+            config["output_dir"], "{organism}", "mirna_chr_mapped.gff3"
+        ),
+    output:
+        gff=os.path.join(
+            config["output_dir"], "{organism}", "mirna_filtered.gff3"
+        ),
+    params:
+        script=os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"),
+        cluster_log=os.path.join(
+            config["cluster_log"], "{organism}", "filter_mir_1_anno.log"
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "{organism}", "filter_mir_1_anno.log"
+        ),
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}"
+
+
+###############################################################################
+### GFF to BED (improves intersect memory efficiency by allowing use of -sorted)
+###############################################################################
+
+
+rule gfftobed:
+    input:
+        gff=os.path.join(
+            config["output_dir"], "{organism}", "mirna_filtered.gff3"
+        ),
+    output:
+        bed=os.path.join(
+            config["output_dir"], "{organism}", "mirna_filtered.bed"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "{organism}", "gfftobed.log"
+        ),
+        out_dir=os.path.join(config["output_dir"]),
+    log:
+        os.path.join(config["local_log"], "{organism}", "gfftobed.log"),
+    singularity:
+        "docker://zavolab/bedops:2.4.35"
+    shell:
+        "(convert2bed -i gff < {input.gff} \
+        --sort-tmpdir={params.out_dir} \
+        > {output.bed} \
+        ) &> {log}"
+
+
+###############################################################################
+### Index genome fasta file
+###############################################################################
+
+
+rule create_index_fasta:
+    input:
+        genome=os.path.join(
+            config["output_dir"], "{organism}", "genome.processed.fa"
+        ),
+    output:
+        genome=os.path.join(
+            config["output_dir"], "{organism}", "genome.processed.fa.fai"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "{organism}", "create_index_fasta.log"
+        ),
+    log:
+        os.path.join(
+            config["local_log"], "{organism}", "create_index_fasta.log"
+        ),
+    singularity:
+        
"docker://zavolab/samtools:1.8" + shell: + "(samtools faidx {input.genome}) &> {log}" + + +############################################################################### +### Extract chromosome length +############################################################################### + + +rule extract_chr_len: + input: + genome=os.path.join( + config["output_dir"], "{organism}", "genome.processed.fa.fai" + ), + output: + chrsize=os.path.join( + config["output_dir"], "{organism}", "chr_size.txt" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "extract_chr_len.log" + ), + log: + os.path.join(config["local_log"], "{organism}", "extract_chr_len.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cut -f1,2 {input.genome} > {output.chrsize}) &> {log}" + + +############################################################################### +### Extract mature miRNA +############################################################################### + + +rule filter_mature_mirs: + input: + bed=os.path.join( + config["output_dir"], "{organism}", "mirna_filtered.bed" + ), + output: + bed=os.path.join( + config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "filter_mature_mirs.log" + ), + precursor="miRNA_primary_transcript", + log: + os.path.join( + config["local_log"], "{organism}", "filter_mature_mirs.log" + ), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(grep -v {params.precursor} {input.bed} > {output.bed}) &> {log}" + + +############################################################################### +### Create isomirs annotation file from mature miRNA +############################################################################### + + +rule iso_anno: + input: + bed=os.path.join( + config["output_dir"], "{organism}", "mirna_mature_filtered.bed" + ), + chrsize=os.path.join( + config["output_dir"], "{organism}", "chr_size.txt" + ), + output: + bed=os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.log", + ), + bp_5p=lambda wildcards: wildcards.bp_5p, + bp_3p=lambda wildcards: wildcards.bp_3p, + log: + os.path.join( + config["local_log"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.log", + ), + singularity: + "docker://zavolab/bedtools:2.28.0" + shell: + "(bedtools slop \ + -i {input.bed} \ + -g {input.chrsize} \ + -l {params.bp_5p} \ + -r {params.bp_3p} \ + > {output.bed} \ + ) &> {log}" + + +############################################################################### +### Change miRNA names to isomirs names +############################################################################### + + +rule iso_anno_rename: + input: + bed=os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", + ), + output: + bed=os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", + ), + bp_5p=lambda wildcards: wildcards.bp_5p, + bp_3p=lambda wildcards: wildcards.bp_3p, + log: + os.path.join( + config["local_log"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", + ), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(sed \ + 's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' \ 
+ {input.bed} \ + > {output.bed} \ + ) &> {log}" + + +############################################################################### +### Concatenate all isomirs annotation files +############################################################################### + + +rule iso_anno_concat: + input: + bed=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "{organism}", + "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", + ), + organism=config["organism"], + bp_3p=config["bp_3p"], + bp_5p=config["bp_5p"], + ), + output: + bed=os.path.join( + config["output_dir"], "{organism}", "iso_anno_concat.bed" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "iso_anno_concat.log" + ), + prefix=os.path.join( + config["output_dir"], "{organism}", "iso_anno_rename" + ), + log: + os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(cat {params.prefix}* > {output.bed}) &> {log}" + + +############################################################################### +### Remove non changing isomirs (5p0_3p0) +############################################################################### + + +rule iso_anno_final: + input: + bed=os.path.join( + config["output_dir"], "{organism}", "iso_anno_concat.bed" + ), + output: + bed=os.path.join( + config["output_dir"], "{organism}", "isomirs_annotation.bed" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "{organism}", "iso_anno_final.log" + ), + pattern="5p0_3p0", + log: + os.path.join(config["local_log"], "{organism}", "iso_anno_final.log"), + singularity: + "docker://zavolab/ubuntu:18.04" + shell: + "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" \ No newline at end of file -- GitLab From 2e3a966c12c7e853e3fd865d1087183ec0a40421 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:19:58 +0000 Subject: [PATCH 06/22] Snakefile of the "quantify" subworkflow --- workflow/rules/quantify.smk | 335 ++++++++++++++++++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 workflow/rules/quantify.smk diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk new file mode 100644 index 0000000..6eabb23 --- /dev/null +++ b/workflow/rules/quantify.smk @@ -0,0 +1,335 @@ +############################################################################### +# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel +# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu +# +# Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments. 
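+# Per-sample counts are merged into one table per miRNA type by the
+# "merge_tables" rule at the end of this file.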
+###############################################################################
+#
+# USAGE:
+# snakemake \
+#    --snakefile="quantify.smk" \
+#    --cores 4 \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
+
+import os
+
+configfile: "config.yaml"
+
+# Rules that require internet connection for downloading files are included
+# in the localrules
+localrules:
+    finish_quantify,
+
+
+###############################################################################
+### Finish rule
+###############################################################################
+
+
+rule finish_quantify:
+    input:
+        table1=expand(
+            os.path.join(
+                config["output_dir"],
+                "TABLES",
+                "counts.{mir}.tab",
+            ),
+            mir=config["mir_list"],
+        ),
+        table2=os.path.join(
+            config["output_dir"],
+            "TABLES",
+            "counts.isomirs.tab",
+        ),
+
+###############################################################################
+### BAM to BED
+###############################################################################
+
+
+rule bamtobed:
+    input:
+        alignment=os.path.join(
+            config["quantify_input_dir"],
+            "{sample}",
+            "convertedSortedMappings_{sample}.bam",
+        ),
+    output:
+        alignment=os.path.join(
+            config["output_dir"], "{sample}", "alignments.bed12"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "bamtobed_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "bamtobed_{sample}.log"),
+    singularity:
+        "docker://zavolab/bedtools:2.27.0"
+    shell:
+        "(bedtools bamtobed \
+        -bed12 \
+        -tag NH \
+        -i {input.alignment} \
+        > {output.alignment} \
+        ) &> {log}"
+
+
+###############################################################################
+### Sort alignments
+###############################################################################
+
+
+rule sort_alignments:
+    input:
+        alignment=os.path.join(
+            config["output_dir"], "{sample}", "alignments.bed12"
+        ),
+    output:
+        alignment=os.path.join(
+            config["output_dir"], "{sample}", "sorted.alignments.bed12"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "sortalignment_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "sortalignment_{sample}.log"),
+    resources:
+        mem=4,
+        threads=8,
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(sort \
+        -k1,1 \
+        -k2,2n \
+        {input.alignment} \
+        > {output.alignment} \
+        ) &> {log}"
+
+
+###############################################################################
+### miRNAs intersection
+###############################################################################
+
+
+rule intersect_mirna:
+    input:
+        alignment=os.path.join(
+            config["output_dir"], "{sample}", "sorted.alignments.bed12"
+        ),
+        mirna=config["mirnas_anno"],
+    output:
+        intersect=os.path.join(
+            config["output_dir"], "{sample}", "intersect_mirna.bed"
+        ),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "intersection_mirna_{sample}.log"
+        ),
+    log:
+        os.path.join(config["local_log"], "intersection_mirna_{sample}.log"),
+    singularity:
+        "docker://zavolab/bedtools:2.27.0"
+    shell:
+        "(bedtools intersect \
+        -wao \
+        -s \
+        -F 1 \
+        -sorted \
+        -b {input.alignment} \
+        -a {input.mirna} \
+        > {output.intersect} \
+        ) &> {log}"
+
+
+###############################################################################
+### isomiRs intersection
+###############################################################################
+
+
+# rule 
intersect_isomirs: +# input: +# alignment=os.path.join( +# config["output_dir"], "{sample}", "sorted.alignments.bed12" +# ), +# isomirs=config["isomirs_anno"], +# output: +# intersect=os.path.join( +# config["output_dir"], "{sample}", "intersect_isomirs.bed" +# ), +# params: +# cluster_log=os.path.join( +# config["cluster_log"], "intersection_isomirs_{sample}.log" +# ), +# log: +# os.path.join(config["local_log"], "intersection_isomirs_{sample}.log"), +# singularity: +# "docker://zavolab/bedtools:2.27.0" +# shell: +# "(bedtools intersect \ +# -wao \ +# -s \ +# -F 1 \ +# -sorted \ +# -b {input.alignment} \ +# -a {input.isomirs} \ +# > {output.intersect} \ +# ) &> {log}" + + +############################################################################### +### miRNAs counting table - miRNA +############################################################################### + + +rule quant_mirna: + input: + intersect=os.path.join( + config["output_dir"], "{sample}", "intersect_mirna.bed" + ), + script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), + output: + table=os.path.join( + config["output_dir"], "TABLES", "miRNA_counts_{sample}" + ), + params: + cluster_log=os.path.join( + config["cluster_log"], "quant_mirna_miRNA_{sample}.log" + ), + prefix=os.path.join( + config["output_dir"], "TABLES", "miRNA_counts_{sample}" + ), + log: + os.path.join(config["local_log"], "quant_mirna_miRNA_{sample}.log"), + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python \ + {input.script} \ + -i {input.intersect} \ + --uniq=miRNA \ + -p={params.prefix} \ + ) &> {log}" + + +############################################################################### +### miRNAs counting table - miRNA_primary +############################################################################### + + +rule quant_mirna_pri: + input: + intersect=os.path.join( + config["output_dir"], "{sample}", "intersect_mirna.bed" + ), + script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), + output: + table=os.path.join( + config["output_dir"], + "TABLES", + "miRNA_primary_transcript_counts_{sample}", + ), + params: + cluster_log=os.path.join( + config["cluster_log"], + "quant_mirna_miRNA_primary_transcript_{sample}.log", + ), + prefix=os.path.join( + config["output_dir"], + "TABLES", + "miRNA_primary_transcript_counts_{sample}", + ), + log: + os.path.join( + config["local_log"], + "quant_mirna_miRNA_primary_transcript_{sample}.log", + ), + singularity: + "docker://zavolab/python:3.6.5" + shell: + "(python \ + {input.script} \ + -i {input.intersect} \ + --uniq=miRNA_primary_transcript \ + -p={params.prefix} \ + ) &> {log}" + + +############################################################################### +### isomiRs counting table +############################################################################### + + +# rule quant_isomirs: +# input: +# intersect=os.path.join( +# config["output_dir"], "{sample}", "intersect_isomirs.bed" +# ), +# script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), +# output: +# table=os.path.join( +# config["output_dir"], "TABLES", "isomirs_counts_{sample}" +# ), +# params: +# cluster_log=os.path.join( +# config["cluster_log"], "quant_isomirs_{sample}.log" +# ), +# prefix=os.path.join( +# config["output_dir"], "TABLES", "isomirs_counts_{sample}" +# ), +# log: +# os.path.join(config["local_log"], "quant_isomirs_{sample}.log"), +# singularity: +# "docker://zavolab/python:3.6.5" +# shell: +# "(python \ +# {input.script} \ +# -i {input.intersect} \ +# 
--uniq=miRNA \
+#        -p={params.prefix} \
+#        ) &> {log}"
+
+
+###############################################################################
+### Merge counting tables for all samples by mature/primary/isomirs forms.
+###############################################################################
+
+
+rule merge_tables:
+    input:
+        table=expand(
+            os.path.join(
+                config["output_dir"], "TABLES", "{mir}_counts_{sample}"
+            ),
+            sample=config["sample"],
+            mir=config["mir_list"],
+        ),
+        script=os.path.join(config["scripts_dir"], "merge_tables.R"),
+    output:
+        table=os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"),
+    params:
+        cluster_log=os.path.join(
+            config["cluster_log"], "merge_tables_{mir}.log"
+        ),
+        prefix="{mir}_counts_",
+        input_dir=os.path.join(config["output_dir"], "TABLES"),
+    log:
+        os.path.join(config["local_log"], "merge_tables_{mir}.log"),
+    singularity:
+        "docker://zavolab/r-tidyverse:3.5.3"
+    shell:
+        "(Rscript \
+        {input.script} \
+        --input_dir {params.input_dir} \
+        --output_file {output.table} \
+        --prefix {params.prefix} \
+        --verbose \
+        ) &> {log}"
\ No newline at end of file
-- 
GitLab


From ed1083842770f7524d9066bb4f9b0c908cd59d88 Mon Sep 17 00:00:00 2001
From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch>
Date: Tue, 17 Jan 2023 17:20:45 +0000
Subject: [PATCH 07/22] Main snakefile (merged workflows)

---
 workflow/Snakefile | 108 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 workflow/Snakefile

diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..9467cd3
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,108 @@
+###############################################################################
+# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel
+# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu
+#
+# Snakemake workflow to:
+# - download and prepare the necessary files for smallRNA-seq related workflows.
+# - map small RNA-seq reads (e.g. from miRNA sequencing libraries)
+# - quantify miRNAs, including isomiRs, from miRNA-seq alignments.
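+#   (The three steps are chained via the "include:" statements below.)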
+#
+###############################################################################
+#
+# USAGE:
+# snakemake \
+#    --use-singularity \
+#    --singularity-args "--bind $PWD/../" \
+#    --cores 4 \
+#    --printshellcmds \
+#    --rerun-incomplete \
+#    --verbose
+#
+################################################################################
+
+import os
+
+###############################################################################
+### Including subworkflows
+###############################################################################
+
+include: os.path.join("rules", "prepare.smk")
+include: os.path.join("rules", "map.smk")
+include: os.path.join("rules", "quantify.smk")
+
+###############################################################################
+### Finish rule
+###############################################################################
+
+
+rule finish:
+    input:
+        idx_transcriptome=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "transcriptome_index_segemehl.idx",
+            ),
+            organism=config["organism"],
+        ),
+        idx_genome=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "genome_index_segemehl.idx",
+            ),
+            organism=config["organism"],
+        ),
+        exons=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "exons.bed",
+            ),
+            organism=config["organism"],
+        ),
+        header=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "headerOfCollapsedFasta.sam",
+            ),
+            organism=config["organism"],
+        ),
+        mirnafilt=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "mirna_filtered.bed",
+            ),
+            organism=config["organism"],
+        ),
+        isomirs=expand(
+            os.path.join(
+                config["output_dir"],
+                "{organism}",
+                "isomirs_annotation.bed",
+            ),
+            organism=config["organism"],
+        ),
+        maps=expand(
+            os.path.join(
+                config["output_dir"],
+                "{sample}",
+                "convertedSortedMappings_{sample}.bam.bai",
+            ),
+            sample=config["sample"],
+        ),
+        table1=expand(
+            os.path.join(
+                config["output_dir"],
+                "TABLES",
+                "counts.{mir}.tab"
+            ),
+            mir=config["mir_list"],
+        ),
+        table2=os.path.join(
+            config["output_dir"],
+            "TABLES",
+            "counts.isomirs.tab"
+        ),
\ No newline at end of file
-- 
GitLab


From 47e2399fa403cb6e3286b2c064d36b1580924771 Mon Sep 17 00:00:00 2001
From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch>
Date: Tue, 17 Jan 2023 17:21:53 +0000
Subject: [PATCH 08/22] Template of the config.yaml

---
 workflow/config.yaml | 106 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 workflow/config.yaml

diff --git a/workflow/config.yaml b/workflow/config.yaml
new file mode 100644
index 0000000..25e52d7
--- /dev/null
+++ b/workflow/config.yaml
@@ -0,0 +1,106 @@
+---
+#### GLOBAL PARAMETERS #####
+
+# Directories
+# Usually there is no need to change these
+map_input_dir: "path/to/map_input_directory" # For the mapping workflow
+quantify_input_dir: "path/to/quantify_input_directory" # For the quantify workflow
+output_dir: "results"
+scripts_dir: "../scripts"
+local_log: "logs/local"
+cluster_log: "logs/cluster"
+
+# Inputs information
+sample: ["sample_1", "sample_2"] # put all sample names, separated by comma
+
+#######################################################################################################
+####
+#### PREPARE PARAMETERS
+####
+#######################################################################################################
+
+# Isomirs annotation file
+# Number of base pairs to add/subtract from 5' (start) and 3' (end) coordinates.
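+# For example, bp_5p: [-1, 0, 1] with bp_3p: [0] would generate one isomiR
+# annotation per combination, with the start coordinate contracted or extended
+# by one nt and the end coordinate left unchanged (coordinates are expanded
+# with "bedtools slop" in the "iso_anno" rule of the prepare subworkflow).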
+bp_5p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts +bp_3p: [0] # array of numbers, e.g., [-2,-1,0,+1], to include 2 upstream and 1 downstream nts + +# List of inputs +organism: ["org/pre"] # e.g., ["homo_sapiens/GRCh38.100", "mus_musculus/GRCm37.98"] +# this string specifies a path, and the "/" is important for this +# "pre" specifies the assembly version + + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +org/pre: # One section for each list item in "organism"; entry should match precisely what +# is in the "organism" section above, one entry per list item above, omitting the "" + # URLs to genome, gene & miRNA annotations + genome_url: # FTP/HTTP URL to gzipped genome in FASTA format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz" + gtf_url: # FTP/HTTP URL to gzipped gene annotations in GTF format, Ensembl style + # e.g. "ftp://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.chr.gtf.gz" + mirna_url: # FTP/HTTP URL to unzipped microRNA annotations in GFF format, miRBase style + # e.g. "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: # FTP/HTTP URL to mapping table + # e.g. "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 # Column number from input file where to change chromosome name + delimiter: "TAB" # Delimiter of the input file + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow +genome: "path/to/genome.processed.fa" +gtf: "path/to/gene_annotations.filtered.gtf" +transcriptome: "path/to/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "path/to/transcriptome_index_segemehl.idx" +genome_index_segemehl: "path/to/genome_index_segemehl.idx" +exons: "path/to/exons.bed" +header_of_collapsed_fasta: "path/to/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 # Q (Phred) score; minimum quality score to keep +p_value: 50 # minimum % of bases that must have Q quality + +# Tool parameters: adapter removal +error_rate: 0.1 # fraction of allowed errors +minimum_length: 15 # discard processed reads shorter than the indicated length +overlap: 3 # minimum overlap length of adapter and read to trim the bases +max_n: 0 # discard reads containing more than the indicated number of N bases + +# Tool parameters: mapping +max_length_reads: 30 # maximum length of processed reads to map with oligomap +nh: 100 # discard reads with more mappings than the indicated number + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +sample_1: # one section per list item in "sample"; names have to match + adapter: "XXXXXXXXXXXXXXXXXXXX" # 3' adapter sequence to trim + format: "fa" # file format; currently supported: "fa" + + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to 
quantify +# Remove miRNA types you are not interested in +mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "path/to/mirna_filtered.bed" +isomirs_anno: "path/to/isomirs_annotation.bed" +... \ No newline at end of file -- GitLab From 1fc05e9b0b8e07d78bc6b6b87100647f03eac033 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:24:18 +0000 Subject: [PATCH 09/22] Delete Snakefile --- workflow/map/Snakefile | 1007 ---------------------------------------- 1 file changed, 1007 deletions(-) delete mode 100644 workflow/map/Snakefile diff --git a/workflow/map/Snakefile b/workflow/map/Snakefile deleted file mode 100644 index aace1f1..0000000 --- a/workflow/map/Snakefile +++ /dev/null @@ -1,1007 +0,0 @@ -############################################################################### -# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel -# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu -# -# Workflow to map small RNA-seq reads (e.g. from miRNA sequencing libraries). -############################################################################### - -import os - - -# Rules that require internet connection for downloading files are included -# in the localrules -localrules: - finish, - - -############################################################################### -### Finish rule -############################################################################### - - -rule finish: - input: - maps=expand( - os.path.join( - config["output_dir"], - "{sample}", - "convertedSortedMappings_{sample}.bam.bai", - ), - sample=config["sample"], - ), - - -############################################################################### -### Uncompress fastq files -############################################################################### - - -rule uncompress_zipped_files: - input: - reads=os.path.join(config["input_dir"], "{sample}.{format}.gz"), - output: - reads=os.path.join( - config["output_dir"], "{sample}", "{format}", "reads.{format}" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "uncompress_zipped_files_{sample}_{format}.log", - ), - log: - os.path.join( - config["local_log"], - "uncompress_zipped_files_{sample}_{format}.log", - ), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(zcat {input.reads} > {output.reads}) &> {log}" - - -############################################################################### -### Quality filter -############################################################################### - - -rule fastq_quality_filter: - input: - reads=os.path.join( - config["output_dir"], "{sample}", "fastq", "reads.fastq" - ), - output: - reads=os.path.join( - config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "fastq_quality_filter_{sample}.log" - ), - p=config["p_value"], - q=config["q_value"], - log: - os.path.join(config["local_log"], "fastq_quality_filter_{sample}.log"), - singularity: - "docker://zavolab/fastx:0.0.14" - shell: - "(fastq_quality_filter \ - -v \ - -q {params.q} \ - -p {params.p} \ - -i {input.reads} \ - > {output.reads} \ - ) &> {log}" - - -############################################################################### -### Convert fastq to fasta -############################################################################### - - 
-rule fastq_to_fasta: - input: - reads=os.path.join( - config["output_dir"], "{sample}", "fastq", "filtered_reads.fastq" - ), - output: - reads=os.path.join( - config["output_dir"], "{sample}", "fastq", "reads.fa" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "fastq_to_fasta_{sample}.log" - ), - log: - os.path.join(config["local_log"], "fastq_to_fasta_{sample}.log"), - singularity: - "docker://zavolab/fastx:0.0.14" - shell: - "(fastq_to_fasta -r -n -i {input.reads} > {output.reads}) &> {log}" - - -############################################################################### -### Format fasta file -############################################################################### - - -rule fasta_formatter: - input: - reads=lambda wildcards: os.path.join( - config["output_dir"], - wildcards.sample, - config[wildcards.sample]["format"], - "reads.fa", - ), - output: - reads=os.path.join(config["output_dir"], "{sample}", "formatted.fasta"), - params: - cluster_log=os.path.join( - config["cluster_log"], "fasta_formatter_{sample}.log" - ), - log: - os.path.join(config["local_log"], "fasta_formatter_{sample}.log"), - singularity: - "docker://zavolab/fastx:0.0.14" - shell: - "(fasta_formatter -w 0 -i {input.reads} > {output.reads}) &> {log}" - - -############################################################################### -### Remove adapters -############################################################################### - - -rule cutadapt: - input: - reads=os.path.join(config["output_dir"], "{sample}", "formatted.fasta"), - output: - reads=os.path.join(config["output_dir"], "{sample}", "cut.fasta"), - params: - cluster_log=os.path.join( - config["cluster_log"], "cutadapt_{sample}.log" - ), - adapter=lambda wildcards: config[wildcards.sample]["adapter"], - error_rate=config["error_rate"], - minimum_length=config["minimum_length"], - overlap=config["overlap"], - max_n=config["max_n"], - log: - os.path.join(config["local_log"], "cutadapt_{sample}.log"), - resources: - threads=8, - singularity: - "docker://zavolab/cutadapt:1.16" - shell: - "(cutadapt \ - -a {params.adapter} \ - --error-rate {params.error_rate} \ - --minimum-length {params.minimum_length} \ - --overlap {params.overlap} \ - --trim-n \ - --max-n {params.max_n} \ - --cores {resources.threads} \ - -o {output.reads} {input.reads}) &> {log}" - - -############################################################################### -### Collapse identical reads -############################################################################### - - -rule fastx_collapser: - input: - reads=os.path.join(config["output_dir"], "{sample}", "cut.fasta"), - output: - reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - params: - cluster_log=os.path.join( - config["cluster_log"], "fastx_collapser_{sample}.log" - ), - log: - os.path.join(config["local_log"], "fastx_collapser_{sample}.log"), - singularity: - "docker://zavolab/fastx:0.0.14" - shell: - "(fastx_collapser -i {input.reads} > {output.reads}) &> {log}" - - -############################################################################### -### Segemehl genome mapping -############################################################################### - - -rule mapping_genome_segemehl: - input: - reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - genome=config["genome"], - genome_index_segemehl=config["genome_index_segemehl"], - output: - gmap=os.path.join( - config["output_dir"], "{sample}", "segemehlGenome_map.sam" - ), - params: - 
cluster_log=os.path.join( - config["cluster_log"], "mapping_genome_segemehl_{sample}.log" - ), - log: - os.path.join( - config["local_log"], "mapping_genome_segemehl_{sample}.log" - ), - resources: - mem=50, - time=12, - threads=8, - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x \ - -i {input.genome_index_segemehl} \ - -d {input.genome} \ - -t {threads} \ - -q {input.reads} \ - -outfile {output.gmap} \ - ) &> {log}" - - -############################################################################### -### Segemehl transcriptome mapping -############################################################################### - - -rule mapping_transcriptome_segemehl: - input: - reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - transcriptome=config["transcriptome"], - transcriptome_index_segemehl=config["transcriptome_index_segemehl"], - output: - tmap=os.path.join( - config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "mapping_transcriptome_segemehl_{sample}.log", - ), - log: - os.path.join( - config["local_log"], "mapping_transcriptome_segemehl_{sample}.log" - ), - resources: - mem=10, - time=12, - threads=8, - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x \ - -i {input.transcriptome_index_segemehl} \ - -d {input.transcriptome} \ - -t {threads} \ - -q {input.reads} \ - -outfile {output.tmap} \ - ) &> {log}" - - -############################################################################### -### Filter fasta for oligomap mapping -############################################################################### - - -rule filter_fasta_for_oligomap: - input: - reads=os.path.join(config["output_dir"], "{sample}", "collapsed.fasta"), - script=os.path.join(config["scripts_dir"], "validation_fasta.py"), - output: - reads=os.path.join( - config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "filter_fasta_for_oligomap_{sample}.log" - ), - max_length_reads=config["max_length_reads"], - log: - os.path.join( - config["local_log"], "filter_fasta_for_oligomap_{sample}.log" - ), - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python {input.script} \ - -r {params.max_length_reads} \ - -i {input.reads} \ - -o {output.reads} \ - ) &> {log}" - - -############################################################################### -### Oligomap genome mapping -############################################################################### - - -rule mapping_genome_oligomap: - input: - reads=os.path.join( - config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" - ), - target=config["genome"], - output: - gmap=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_map.fa" - ), - report=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_report.txt" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "mapping_genome_oligomap_{sample}.log" - ), - log: - os.path.join( - config["local_log"], "mapping_genome_oligomap_{sample}.log" - ), - resources: - mem=50, - time=6, - threads=8, - singularity: - "docker://zavolab/oligomap:1.0" - shell: - "(oligomap \ - {input.target} \ - {input.reads} \ - -r {output.report} \ - > {output.gmap} \ - ) &> {log}" - - -############################################################################### -### Oligomap genome sorting -############################################################################### - - 
-rule sort_genome_oligomap: - input: - tmap=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_map.fa" - ), - report=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_report.txt" - ), - script=os.path.join(config["scripts_dir"], "blocksort.sh"), - output: - sort=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_sorted.fa" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "sorting_genome_oligomap_{sample}.log" - ), - log: - os.path.join( - config["local_log"], "sorting_genome_oligomap_{sample}.log" - ), - resources: - threads=8, - time=6, - shell: - "(bash {input.script} \ - {input.tmap} \ - {resources.threads} \ - {output.sort} \ - ) &> {log}" - - -############################################################################### -### Oligomap genome mapping output to SAM -############################################################################### - - -rule oligomap_genome_toSAM: - input: - report=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_report.txt" - ), - sort=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_sorted.fa" - ), - script=os.path.join( - config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py" - ), - output: - gmap=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_converted.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "oligomap_genome_toSAM_{sample}.log" - ), - nh=config["nh"], - log: - os.path.join(config["local_log"], "oligomap_genome_toSAM_{sample}.log"), - resources: - time=1, - queue=1, - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python {input.script} \ - -i {input.sort} \ - -n {params.nh} \ - > {output.gmap}) &> {log}" - - -############################################################################### -### Oligomap trancriptome mapping -############################################################################### - - -rule mapping_transcriptome_oligomap: - input: - reads=os.path.join( - config["output_dir"], "{sample}", "filtered_for_oligomap.fasta" - ), - target=config["transcriptome"], - output: - tmap=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_map.fa" - ), - report=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_report.txt" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "mapping_transcriptome_oligomap_{sample}.log", - ), - log: - os.path.join( - config["local_log"], "mapping_transcriptome_oligomap_{sample}.log" - ), - resources: - mem=10, - time=6, - threads=8, - singularity: - "docker://zavolab/oligomap:1.0" - shell: - "(oligomap \ - {input.target} \ - {input.reads} \ - -s \ - -r {output.report} \ - > {output.tmap} \ - ) &> {log}" - - -############################################################################### -### Oligomap trancriptome sorting -############################################################################### - - -rule sort_transcriptome_oligomap: - input: - tmap=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_map.fa" - ), - report=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_report.txt" - ), - script=os.path.join(config["scripts_dir"], "blocksort.sh"), - output: - sort=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "sorting_transcriptome_oligomap_{sample}.log", - ), - log: - os.path.join( - config["local_log"], "sorting_transcriptome_oligomap_{sample}.log" - ), - resources: - 
threads=8, - shell: - "(bash {input.script} \ - {input.tmap} \ - {resources.threads} \ - {output.sort} \ - ) &> {log}" - - -############################################################################### -### Oligomap transcriptome mapping ouput to SAM -############################################################################### - - -rule oligomap_transcriptome_toSAM: - input: - report=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_report.txt" - ), - sort=os.path.join( - config["output_dir"], "{sample}", "oligoTranscriptome_sorted.fa" - ), - script=os.path.join( - config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py" - ), - output: - tmap=os.path.join( - config["output_dir"], - "{sample}", - "oligoTranscriptome_converted.sam", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "oligomap_transcriptome_toSAM_{sample}.log" - ), - nh=config["nh"], - log: - os.path.join( - config["local_log"], "oligomap_transcriptome_toSAM_{sample}.log" - ), - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python {input.script} \ - -i {input.sort} \ - -n {params.nh} \ - > {output.tmap} \ - ) &> {log}" - - -############################################################################### -### Merge genome mappings -############################################################################### - - -rule merge_genome_maps: - input: - gmap1=os.path.join( - config["output_dir"], "{sample}", "segemehlGenome_map.sam" - ), - gmap2=os.path.join( - config["output_dir"], "{sample}", "oligoGenome_converted.sam" - ), - output: - gmaps=os.path.join( - config["output_dir"], "{sample}", "GenomeMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "merge_genome_maps_{sample}.log" - ), - log: - os.path.join(config["local_log"], "merge_genome_maps_{sample}.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cat {input.gmap1} {input.gmap2} > {output.gmaps}) &> {log}" - - -############################################################################### -### Merge trancriptome mappings -############################################################################### - - -rule merge_transcriptome_maps: - input: - tmap1=os.path.join( - config["output_dir"], "{sample}", "segemehlTranscriptome_map.sam" - ), - tmap2=os.path.join( - config["output_dir"], - "{sample}", - "oligoTranscriptome_converted.sam", - ), - output: - tmaps=os.path.join( - config["output_dir"], "{sample}", "TranscriptomeMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "merge_transcriptome_maps_{sample}.log" - ), - log: - os.path.join( - config["local_log"], "merge_transcriptome_maps_{sample}.log" - ), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cat {input.tmap1} {input.tmap2} > {output.tmaps}) &> {log}" - - -############################################################################### -### Filter NH genome -############################################################################### - - -rule nh_filter_genome: - input: - gmaps=os.path.join( - config["output_dir"], "{sample}", "GenomeMappings.sam" - ), - script=os.path.join(config["scripts_dir"], "nh_filter.py"), - output: - gmaps=os.path.join( - config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "nh_filter_genome_{sample}.log" - ), - nh=config["nh"], - log: - os.path.join(config["local_log"], "nh_filter_genome_{sample}.log"), - singularity: - "docker://zavolab/python:3.6.5" - 
shell: - "(python {input.script} \ - {input.gmaps} \ - {params.nh} \ - {output.gmaps} \ - ) &> {log}" - - -############################################################################### -### Filter NH transcriptome -############################################################################### - - -rule filter_nh_transcriptome: - input: - tmaps=os.path.join( - config["output_dir"], "{sample}", "TranscriptomeMappings.sam" - ), - script=os.path.join(config["scripts_dir"], "nh_filter.py"), - output: - tmaps=os.path.join( - config["output_dir"], - "{sample}", - "nhfiltered_TranscriptomeMappings.sam", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "filter_nh_transcriptome_{sample}.log" - ), - nh=config["nh"], - log: - os.path.join( - config["local_log"], "filter_nh_transcriptome_{sample}.log" - ), - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python {input.script} \ - {input.tmaps} \ - {params.nh} \ - {output.tmaps} \ - ) &> {log}" - - -############################################################################### -### Remove header genome mappings -############################################################################### - - -rule remove_headers_genome: - input: - gmap=os.path.join( - config["output_dir"], "{sample}", "nhfiltered_GenomeMappings.sam" - ), - output: - gmap=os.path.join( - config["output_dir"], "{sample}", "noheader_GenomeMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "remove_headers_genome_{sample}.log" - ), - log: - os.path.join(config["local_log"], "remove_headers_genome_{sample}.log"), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "samtools view {input.gmap} > {output.gmap}" - - -############################################################################### -### Remove header transcriptome mappings -############################################################################### - - -rule remove_headers_transcriptome: - input: - tmap=os.path.join( - config["output_dir"], - "{sample}", - "nhfiltered_TranscriptomeMappings.sam", - ), - output: - tmap=os.path.join( - config["output_dir"], - "{sample}", - "noheader_TranscriptomeMappings.sam", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "remove_headers_transcriptome_{sample}.log" - ), - log: - os.path.join( - config["local_log"], "remove_headers_transcriptome_{sample}.log" - ), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "samtools view {input.tmap} > {output.tmap}" - - -############################################################################### -### Transcriptome to genome coordinates -############################################################################### - - -rule trans_to_gen: - input: - tmap=os.path.join( - config["output_dir"], - "{sample}", - "noheader_TranscriptomeMappings.sam", - ), - script=os.path.join(config["scripts_dir"], "sam_trx_to_sam_gen.pl"), - exons=config["exons"], - output: - genout=os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"), - params: - cluster_log=os.path.join( - config["cluster_log"], "trans_to_gen_{sample}.log" - ), - log: - os.path.join(config["local_log"], "trans_to_gen_{sample}.log"), - singularity: - "docker://zavolab/perl:5.28" - shell: - "(perl {input.script} \ - --in {input.tmap} \ - --exons {input.exons} \ - --out {output.genout} \ - ) &> {log}" - - -############################################################################### -### Concatenate genome and trancriptome mappings 
-############################################################################### - - -rule cat_mapping: - input: - gmap1=os.path.join(config["output_dir"], "{sample}", "TransToGen.sam"), - gmap2=os.path.join( - config["output_dir"], "{sample}", "noheader_GenomeMappings.sam" - ), - output: - catmaps=os.path.join( - config["output_dir"], "{sample}", "catMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "cat_mapping_{sample}.log" - ), - log: - os.path.join(config["local_log"], "cat_mapping_{sample}.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cat {input.gmap1} {input.gmap2} > {output.catmaps}) &> {log}" - - -############################################################################### -### Add header -############################################################################### - - -rule add_header: - input: - header=config["header_of_collapsed_fasta"], - catmaps=os.path.join( - config["output_dir"], "{sample}", "catMappings.sam" - ), - output: - concatenate=os.path.join( - config["output_dir"], - "{sample}", - "concatenated_header_catMappings.sam", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "add_header_{sample}.log" - ), - log: - os.path.join(config["local_log"], "add_header_{sample}.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cat {input.header} {input.catmaps} > {output.concatenate}) &> {log}" - - -############################################################################### -### Sort mapped file by IDs -############################################################################### - - -rule sort_id: - input: - concatenate=os.path.join( - config["output_dir"], - "{sample}", - "concatenated_header_catMappings.sam", - ), - output: - sort=os.path.join( - config["output_dir"], "{sample}", "header_sorted_catMappings.sam" - ), - params: - cluster_log=os.path.join(config["cluster_log"], "sort_id_{sample}.log"), - log: - os.path.join(config["local_log"], "sort_id_{sample}.log"), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools sort -n -o {output.sort} {input.concatenate}) &> {log}" - - -############################################################################### -### Remove inferior mappings (keeping multimappers) -############################################################################### - - -rule remove_inferiors: - input: - sort=os.path.join( - config["output_dir"], "{sample}", "header_sorted_catMappings.sam" - ), - script=os.path.join( - config["scripts_dir"], - "sam_remove_duplicates_inferior_alignments_multimappers.1_5.pl", - ), - output: - remove_inf=os.path.join( - config["output_dir"], "{sample}", "removeInferiors.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "remove_inferiors_{sample}.log" - ), - log: - os.path.join(config["local_log"], "remove_inferiors_{sample}.log"), - resources: - mem=15, - threads=4, - singularity: - "docker://zavolab/perl:5.28" - shell: - "(perl {input.script} \ - --print-header \ - --keep-mm \ - --in {input.sort} \ - --out {output.remove_inf} \ - ) &> {log}" - - -############################################################################### -### Uncollapse reads -############################################################################### - - -rule uncollapse_reads: - input: - maps=os.path.join( - config["output_dir"], "{sample}", "removeInferiors.sam" - ), - script=os.path.join(config["scripts_dir"], "sam_uncollapse.pl"), - output: - maps=os.path.join( - config["output_dir"], "{sample}", 
"uncollapsedMappings.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "uncollapse_reads_{sample}.log" - ), - log: - os.path.join(config["local_log"], "uncollapse_reads_{sample}.log"), - singularity: - "docker://zavolab/perl:5.28" - shell: - "(perl {input.script} \ - --suffix \ - --in {input.maps} \ - --out {output.maps} \ - ) &> {log}" - - -############################################################################### -### Convert SAM to BAM -############################################################################### - - -rule convert_to_bam: - input: - maps=os.path.join( - config["output_dir"], "{sample}", "uncollapsedMappings.sam" - ), - output: - maps=os.path.join( - config["output_dir"], "{sample}", "mappingsConverted.bam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "convert_to_bam_{sample}.log" - ), - log: - os.path.join(config["local_log"], "convert_to_bam_{sample}.log"), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools view -b {input.maps} > {output.maps}) &> {log}" - - -############################################################################### -### Sort by coordinate position -############################################################################### - - -rule sort_by_position: - input: - maps=os.path.join( - config["output_dir"], "{sample}", "mappingsConverted.bam" - ), - output: - maps=os.path.join( - config["output_dir"], - "{sample}", - "convertedSortedMappings_{sample}.bam", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "sort_by_position_{sample}.log" - ), - log: - os.path.join(config["local_log"], "sort_by_position_{sample}.log"), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools sort {input.maps} > {output.maps}) &> {log}" - - -############################################################################### -### Create bam index -############################################################################### - - -rule index_bam: - input: - maps=os.path.join( - config["output_dir"], - "{sample}", - "convertedSortedMappings_{sample}.bam", - ), - output: - maps=os.path.join( - config["output_dir"], - "{sample}", - "convertedSortedMappings_{sample}.bam.bai", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "index_bam_{sample}.log" - ), - log: - os.path.join(config["local_log"], "index_bam_{sample}.log"), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools index -b {input.maps} > {output.maps}) &> {log}" -- GitLab From 657972b93e30a3427b51066be1f69b3819d36e13 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:24:32 +0000 Subject: [PATCH 10/22] Delete Snakefile --- workflow/prepare/Snakefile | 731 ------------------------------------- 1 file changed, 731 deletions(-) delete mode 100644 workflow/prepare/Snakefile diff --git a/workflow/prepare/Snakefile b/workflow/prepare/Snakefile deleted file mode 100644 index c16afb8..0000000 --- a/workflow/prepare/Snakefile +++ /dev/null @@ -1,731 +0,0 @@ -############################################################################### -# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel -# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu -# -# Snakemake workflow to download and prepare the necessary files for -# smallRNA-seq related workflows. 
-############################################################################### - -import os - - -# Rules that require internet connection for downloading files are included -# in the localrules -localrules: - finish, - genome_process, - filter_anno_gtf, - mirna_anno, - dict_chr, - - -############################################################################### -### Finish rule -############################################################################### - - -rule finish: - input: - idx_transcriptome=expand( - os.path.join( - config["output_dir"], - "{organism}", - "transcriptome_index_segemehl.idx", - ), - organism=config["organism"], - ), - idx_genome=expand( - os.path.join( - config["output_dir"], - "{organism}", - "genome_index_segemehl.idx", - ), - organism=config["organism"], - ), - exons=expand( - os.path.join(config["output_dir"], "{organism}", "exons.bed"), - organism=config["organism"], - ), - header=expand( - os.path.join( - config["output_dir"], - "{organism}", - "headerOfCollapsedFasta.sam", - ), - organism=config["organism"], - ), - mirnafilt=expand( - os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" - ), - organism=config["organism"], - ), - isomirs=expand( - os.path.join( - config["output_dir"], "{organism}", "isomirs_annotation.bed" - ), - organism=config["organism"], - ), - - -############################################################################### -### Download and process genome IDs -############################################################################### - - -rule genome_process: - input: - script=os.path.join(config["scripts_dir"], "genome_process.sh"), - output: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - params: - url=lambda wildcards: config[wildcards.organism]["genome_url"], - dir_out=os.path.join(config["output_dir"], "{organism}"), - log: - os.path.join(config["local_log"], "{organism}", "genome_process.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {input.script} {params.dir_out} {log} {params.url})" - - -############################################################################### -### Download and filter gtf by transcript_level -############################################################################### - - -rule filter_anno_gtf: - input: - script=os.path.join(config["scripts_dir"], "filter_anno_gtf.sh"), - output: - gtf=os.path.join( - config["output_dir"], - "{organism}", - "gene_annotations.filtered.gtf", - ), - params: - url=lambda wildcards: config[wildcards.organism]["gtf_url"], - dir_out=os.path.join(config["output_dir"], "{organism}"), - log: - os.path.join(config["local_log"], "{organism}", "filter_anno_gtf.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {input.script} {params.dir_out} {log} {params.url}) &> {log}" - - -############################################################################### -### Extract transcriptome sequences in FASTA from genome. 
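The rule below delegates this extraction to gffread -w, which writes one FASTA record per transcript by stitching its exon sequences together. A simplified sketch of that core operation, using pysam for indexed FASTA access, follows; it assumes pre-parsed exon intervals, whereas gffread itself parses the GTF and handles many more cases.

import pysam

COMPLEMENT = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def transcript_sequence(fasta_path, chrom, exons, strand):
    """Concatenate exon sequences ((start, end) half-open genome
    intervals in genomic order) and reverse-complement the result
    for minus-strand transcripts."""
    with pysam.FastaFile(fasta_path) as fasta:
        seq = "".join(fasta.fetch(chrom, start, end) for start, end in exons)
    if strand == "-":
        seq = seq.translate(COMPLEMENT)[::-1]
    return seq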
-############################################################################### - - -rule extract_transcriptome_seqs: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - gtf=os.path.join( - config["output_dir"], - "{organism}", - "gene_annotations.filtered.gtf", - ), - output: - fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome.fa" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "{organism}", - "extract_transcriptome_seqs.log", - ), - log: - os.path.join( - config["local_log"], - "{organism}", - "extract_transcriptome_seqs.log", - ), - singularity: - "docker://zavolab/cufflinks:2.2.1" - shell: - "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}" - - -############################################################################### -### Trim transcript IDs from FASTA file -############################################################################### - - -rule trim_fasta: - input: - fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome.fa" - ), - script=os.path.join(config["scripts_dir"], "validation_fasta.py"), - output: - fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome_idtrim.fa" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "trim_fasta.log" - ), - log: - os.path.join(config["local_log"], "{organism}", "trim_fasta.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - """(awk \ - -F" " \ - "/^>/ {{print \$1; next}} 1" \ - {input.fasta} \ - > {output.fasta} \ - ) &> {log}""" - - -############################################################################### -### Generate segemehl index for transcripts -############################################################################### - - -rule generate_segemehl_index_transcriptome: - input: - fasta=os.path.join( - config["output_dir"], "{organism}", "transcriptome_idtrim.fa" - ), - output: - idx=os.path.join( - config["output_dir"], - "{organism}", - "transcriptome_index_segemehl.idx", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "{organism}", - "generate_segemehl_index_transcriptome.log", - ), - log: - os.path.join( - config["local_log"], - "{organism}", - "generate_segemehl_index_transcriptome.log", - ), - resources: - mem=10, - threads=8, - time=6, - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}" - - -############################################################################### -### Generate segemehl index for genome -############################################################################### - - -rule generate_segemehl_index_genome: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - output: - idx=os.path.join( - config["output_dir"], "{organism}", "genome_index_segemehl.idx" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "{organism}", - "generate_segemehl_index_genome.log", - ), - log: - os.path.join( - config["local_log"], - "{organism}", - "generate_segemehl_index_genome.log", - ), - resources: - mem=60, - threads=8, - time=6, - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}" - - -############################################################################### -### GTF file of exons (genomic coordinates) -############################################################################### - - -rule 
get_exons_gtf: - input: - gtf=os.path.join( - config["output_dir"], - "{organism}", - "gene_annotations.filtered.gtf", - ), - script=os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh"), - output: - exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "get_exons_gtf.log" - ), - log: - os.path.join(config["local_log"], "{organism}", "get_exons_gtf.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash \ - {input.script} \ - -f {input.gtf} \ - -c 3 \ - -p exon \ - -o {output.exons} \ - ) &> {log}" - - -############################################################################### -### Convert GTF file of exons to BED file -############################################################################### - - -rule gtftobed: - input: - exons=os.path.join(config["output_dir"], "{organism}", "exons.gtf"), - script=os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R"), - output: - exons=os.path.join(config["output_dir"], "{organism}", "exons.bed"), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "gtftobed.log" - ), - log: - os.path.join(config["local_log"], "{organism}", "gtftobed.log"), - singularity: - "docker://zavolab/r-zavolab:3.5.1" - shell: - "(Rscript \ - {input.script} \ - --gtf {input.exons} \ - -o {output.exons} \ - ) &> {log}" - - -############################################################################### -### Create header for SAM file -############################################################################### - - -rule create_header_genome: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - output: - header=os.path.join( - config["output_dir"], "{organism}", "headerOfCollapsedFasta.sam" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "create_header_genome.log" - ), - log: - os.path.join( - config["local_log"], "{organism}", "create_header_genome.log" - ), - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools dict -o {output.header} --uri=NA {input.genome}) &> {log}" - - -############################################################################### -### Download miRNA annotation -############################################################################### - - -rule mirna_anno: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - output: - anno=os.path.join( - config["output_dir"], "{organism}", "raw", "mirna.gff3" - ), - params: - anno=lambda wildcards: config[wildcards.organism]["mirna_url"], - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "mirna_anno.log" - ), - log: - os.path.join(config["local_log"], "{organism}", "mirna_anno.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(wget {params.anno} -O {output.anno}) &> {log}" - - -############################################################################### -### Download dictionary mapping chr -############################################################################### - - -rule dict_chr: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - output: - map_chr=os.path.join( - config["output_dir"], "{organism}", "UCSC2ensembl.txt" - ), - params: - map_chr=lambda wildcards: config[wildcards.organism]["map_chr_url"], - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "dict_chr.log" - ), - log: - 
os.path.join(config["local_log"], "{organism}", "dict_chr.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(wget {params.map_chr} -O {output.map_chr}) &> {log}" - - -############################################################################### -### Mapping chromosomes names, UCSC <-> ENSEMBL -############################################################################### - - -rule map_chr_names: - input: - anno=os.path.join( - config["output_dir"], "{organism}", "raw", "mirna.gff3" - ), - script=os.path.join(config["scripts_dir"], "map_chromosomes.pl"), - map_chr=os.path.join( - config["output_dir"], "{organism}", "UCSC2ensembl.txt" - ), - output: - gff=os.path.join( - config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "map_chr_names.log" - ), - column=lambda wildcards: config[wildcards.organism]["column"], - delimiter=lambda wildcards: config[wildcards.organism]["delimiter"], - log: - os.path.join(config["local_log"], "{organism}", "map_chr_names.log"), - singularity: - "docker://zavolab/perl:5.28" - shell: - "(perl {input.script} \ - {input.anno} \ - {params.column} \ - {params.delimiter} \ - {input.map_chr} \ - {output.gff} \ - ) &> {log}" - - -############################################################################### -### Filtering _1 miR IDs -############################################################################### - - -rule filter_mir_1_anno: - input: - gff=os.path.join( - config["output_dir"], "{organism}", "mirna_chr_mapped.gff3" - ), - output: - gff=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.gff3" - ), - params: - script=os.path.join(config["scripts_dir"], "filter_mir_1_anno.sh"), - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "filter_mir_1_anno.log" - ), - log: - os.path.join( - config["local_log"], "{organism}", "filter_mir_1_anno.log" - ), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(bash {params.script} -f {input.gff} -o {output.gff}) &> {log}" - - -############################################################################### -### GFF to BED (improve intersect memory efficient allowing to use -sorted) -############################################################################### - - -rule gfftobed: - input: - gff=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.gff3" - ), - output: - bed=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "gfftobed.log" - ), - out_dir=os.path.join(config["output_dir"]), - log: - os.path.join(config["local_log"], "{organism}", "gfftobed.log"), - singularity: - "docker://zavolab/bedops:2.4.35" - shell: - "(convert2bed -i gff < {input.gff} \ - --sort-tmpdir={params.out_dir} \ - > {output.bed} \ - ) &> {log}" - - -############################################################################### -### Index genome fasta file -############################################################################### - - -rule create_index_fasta: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa" - ), - output: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa.fai" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "create_index_fasta.log" - ), - log: - os.path.join( - config["local_log"], "{organism}", "create_index_fasta.log" - ), - singularity: - 
"docker://zavolab/samtools:1.8" - shell: - "(samtools faidx {input.genome}) &> {log}" - - -############################################################################### -### Extract chromosome length -############################################################################### - - -rule extract_chr_len: - input: - genome=os.path.join( - config["output_dir"], "{organism}", "genome.processed.fa.fai" - ), - output: - chrsize=os.path.join( - config["output_dir"], "{organism}", "chr_size.txt" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "extract_chr_len.log" - ), - log: - os.path.join(config["local_log"], "{organism}", "extract_chr_len.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cut -f1,2 {input.genome} > {output.chrsize}) &> {log}" - - -############################################################################### -### Extract mature miRNA -############################################################################### - - -rule filter_mature_mirs: - input: - bed=os.path.join( - config["output_dir"], "{organism}", "mirna_filtered.bed" - ), - output: - bed=os.path.join( - config["output_dir"], "{organism}", "mirna_mature_filtered.bed" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "filter_mature_mirs.log" - ), - precursor="miRNA_primary_transcript", - log: - os.path.join( - config["local_log"], "{organism}", "filter_mature_mirs.log" - ), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(grep -v {params.precursor} {input.bed} > {output.bed}) &> {log}" - - -############################################################################### -### Create isomirs annotation file from mature miRNA -############################################################################### - - -rule iso_anno: - input: - bed=os.path.join( - config["output_dir"], "{organism}", "mirna_mature_filtered.bed" - ), - chrsize=os.path.join( - config["output_dir"], "{organism}", "chr_size.txt" - ), - output: - bed=os.path.join( - config["output_dir"], - "{organism}", - "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "{organism}", - "iso_anno_5p{bp_5p}_3p{bp_3p}.log", - ), - bp_5p=lambda wildcards: wildcards.bp_5p, - bp_3p=lambda wildcards: wildcards.bp_3p, - log: - os.path.join( - config["local_log"], - "{organism}", - "iso_anno_5p{bp_5p}_3p{bp_3p}.log", - ), - singularity: - "docker://zavolab/bedtools:2.28.0" - shell: - "(bedtools slop \ - -i {input.bed} \ - -g {input.chrsize} \ - -l {params.bp_5p} \ - -r {params.bp_3p} \ - > {output.bed} \ - ) &> {log}" - - -############################################################################### -### Change miRNA names to isomirs names -############################################################################### - - -rule iso_anno_rename: - input: - bed=os.path.join( - config["output_dir"], - "{organism}", - "iso_anno_5p{bp_5p}_3p{bp_3p}.bed", - ), - output: - bed=os.path.join( - config["output_dir"], - "{organism}", - "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "{organism}", - "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", - ), - bp_5p=lambda wildcards: wildcards.bp_5p, - bp_3p=lambda wildcards: wildcards.bp_3p, - log: - os.path.join( - config["local_log"], - "{organism}", - "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.log", - ), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(sed \ - 's/;Derives/_5p{params.bp_5p}_3p{params.bp_3p};Derives/' \ 
- {input.bed} \ - > {output.bed} \ - ) &> {log}" - - -############################################################################### -### Concatenate all isomirs annotation files -############################################################################### - - -rule iso_anno_concat: - input: - bed=lambda wildcards: expand( - os.path.join( - config["output_dir"], - "{organism}", - "iso_anno_rename_5p{bp_5p}_3p{bp_3p}.bed", - ), - organism=config["organism"], - bp_3p=config["bp_3p"], - bp_5p=config["bp_5p"], - ), - output: - bed=os.path.join( - config["output_dir"], "{organism}", "iso_anno_concat.bed" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "iso_anno_concat.log" - ), - prefix=os.path.join( - config["output_dir"], "{organism}", "iso_anno_rename" - ), - log: - os.path.join(config["local_log"], "{organism}", "iso_anno_concat.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(cat {params.prefix}* > {output.bed}) &> {log}" - - -############################################################################### -### Remove non changing isomirs (5p0_3p0) -############################################################################### - - -rule iso_anno_final: - input: - bed=os.path.join( - config["output_dir"], "{organism}", "iso_anno_concat.bed" - ), - output: - bed=os.path.join( - config["output_dir"], "{organism}", "isomirs_annotation.bed" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "{organism}", "iso_anno_final.log" - ), - pattern="5p0_3p0", - log: - os.path.join(config["local_log"], "{organism}", "iso_anno_final.log"), - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(grep -v '{params.pattern}' {input.bed} > {output.bed}) &> {log}" -- GitLab From 0217211515a584f01a37b1baf182d2a85dba1fd9 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:24:43 +0000 Subject: [PATCH 11/22] Delete Snakefile --- workflow/quantify/Snakefile | 317 ------------------------------------ 1 file changed, 317 deletions(-) delete mode 100644 workflow/quantify/Snakefile diff --git a/workflow/quantify/Snakefile b/workflow/quantify/Snakefile deleted file mode 100644 index b30667d..0000000 --- a/workflow/quantify/Snakefile +++ /dev/null @@ -1,317 +0,0 @@ -############################################################################### -# (c) 2020 Paula Iborra, Zavolan Lab, Biozentrum, University of Basel -# (@) paula.iborradetoledo@unibas.ch / paula.iborra@alumni.esci.upf.edu -# -# Pipeline to quantify miRNAs, including isomiRs, from miRNA-seq alignments. 
-############################################################################### - -import os - - -# Rules that require internet connection for downloading files are included -# in the localrules -localrules: - finish, - - -############################################################################### -### Finish rule -############################################################################### - - -rule finish: - input: - table1=expand( - os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"), - mir=config["mir_list"], - ), - table2=os.path.join( - config["output_dir"], "TABLES", "counts.isomirs.tab" - ), - - -############################################################################### -### BAM to BED -############################################################################### - - -rule bamtobed: - input: - alignment=os.path.join( - config["input_dir"], - "{sample}", - "convertedSortedMappings_{sample}.bam", - ), - output: - alignment=os.path.join( - config["output_dir"], "{sample}", "alignments.bed12" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "bamtobed_{sample}.log" - ), - log: - os.path.join(config["local_log"], "bamtobed_{sample}.log"), - singularity: - "docker://zavolab/bedtools:2.27.0" - shell: - "(bedtools bamtobed \ - -bed12 \ - -tag NH \ - -i {input.alignment} \ - > {output.alignment} \ - ) &> {log}" - - -############################################################################### -### Sort alignments -############################################################################### - - -rule sort_alignments: - input: - alignment=os.path.join( - config["output_dir"], "{sample}", "alignments.bed12" - ), - output: - alignment=os.path.join( - config["output_dir"], "{sample}", "sorted.alignments.bed12" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "sortalignment_{sample}.log" - ), - log: - os.path.join(config["local_log"], "sortalignment_{sample}.log"), - resources: - mem=4, - threads=8, - singularity: - "docker://zavolab/ubuntu:18.04" - shell: - "(sort \ - -k1,1 \ - -k2,2n \ - {input.alignment} \ - > {output.alignment} \ - ) &> {log}" - - -############################################################################### -### miRNAs intersection -############################################################################### - - -rule intersect_mirna: - input: - alignment=os.path.join( - config["output_dir"], "{sample}", "sorted.alignments.bed12" - ), - mirna=config["mirnas_anno"], - output: - intersect=os.path.join( - config["output_dir"], "{sample}", "intersect_mirna.bed" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "intersection_mirna_{sample}.log" - ), - log: - os.path.join(config["local_log"], "intersection_mirna_{sample}.log"), - singularity: - "docker://zavolab/bedtools:2.27.0" - shell: - "(bedtools intersect \ - -wao \ - -s \ - -F 1 \ - -sorted \ - -b {input.alignment} \ - -a {input.mirna} \ - > {output.intersect} \ - ) &> {log}" - - -############################################################################### -### isomiRs intersection -############################################################################### - - -# rule intersect_isomirs: -# input: -# alignment=os.path.join( -# config["output_dir"], "{sample}", "sorted.alignments.bed12" -# ), -# isomirs=config["isomirs_anno"], -# output: -# intersect=os.path.join( -# config["output_dir"], "{sample}", "intersect_isomirs.bed" -# ), -# params: -# cluster_log=os.path.join( -# config["cluster_log"], 
"intersection_isomirs_{sample}.log" -# ), -# log: -# os.path.join(config["local_log"], "intersection_isomirs_{sample}.log"), -# singularity: -# "docker://zavolab/bedtools:2.27.0" -# shell: -# "(bedtools intersect \ -# -wao \ -# -s \ -# -F 1 \ -# -sorted \ -# -b {input.alignment} \ -# -a {input.isomirs} \ -# > {output.intersect} \ -# ) &> {log}" - - -############################################################################### -### miRNAs counting table - miRNA -############################################################################### - - -rule quant_mirna: - input: - intersect=os.path.join( - config["output_dir"], "{sample}", "intersect_mirna.bed" - ), - script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), - output: - table=os.path.join( - config["output_dir"], "TABLES", "miRNA_counts_{sample}" - ), - params: - cluster_log=os.path.join( - config["cluster_log"], "quant_mirna_miRNA_{sample}.log" - ), - prefix=os.path.join( - config["output_dir"], "TABLES", "miRNA_counts_{sample}" - ), - log: - os.path.join(config["local_log"], "quant_mirna_miRNA_{sample}.log"), - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python \ - {input.script} \ - -i {input.intersect} \ - --uniq=miRNA \ - -p={params.prefix} \ - ) &> {log}" - - -############################################################################### -### miRNAs counting table - miRNA_primary -############################################################################### - - -rule quant_mirna_pri: - input: - intersect=os.path.join( - config["output_dir"], "{sample}", "intersect_mirna.bed" - ), - script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), - output: - table=os.path.join( - config["output_dir"], - "TABLES", - "miRNA_primary_transcript_counts_{sample}", - ), - params: - cluster_log=os.path.join( - config["cluster_log"], - "quant_mirna_miRNA_primary_transcript_{sample}.log", - ), - prefix=os.path.join( - config["output_dir"], - "TABLES", - "miRNA_primary_transcript_counts_{sample}", - ), - log: - os.path.join( - config["local_log"], - "quant_mirna_miRNA_primary_transcript_{sample}.log", - ), - singularity: - "docker://zavolab/python:3.6.5" - shell: - "(python \ - {input.script} \ - -i {input.intersect} \ - --uniq=miRNA_primary_transcript \ - -p={params.prefix} \ - ) &> {log}" - - -############################################################################### -### isomiRs counting table -############################################################################### - - -# rule quant_isomirs: -# input: -# intersect=os.path.join( -# config["output_dir"], "{sample}", "intersect_isomirs.bed" -# ), -# script=os.path.join(config["scripts_dir"], "mirna_quantification.py"), -# output: -# table=os.path.join( -# config["output_dir"], "TABLES", "isomirs_counts_{sample}" -# ), -# params: -# cluster_log=os.path.join( -# config["cluster_log"], "quant_isomirs_{sample}.log" -# ), -# prefix=os.path.join( -# config["output_dir"], "TABLES", "isomirs_counts_{sample}" -# ), -# log: -# os.path.join(config["local_log"], "quant_isomirs_{sample}.log"), -# singularity: -# "docker://zavolab/python:3.6.5" -# shell: -# "(python \ -# {input.script} \ -# -i {input.intersect} \ -# --uniq=miRNA \ -# -p={params.prefix} \ -# ) &> {log}" - - -############################################################################### -### Merge counting tables for all samples by mature/primary/isomirs forms. 
-############################################################################### - - -rule merge_tables: - input: - table=expand( - os.path.join( - config["output_dir"], "TABLES", "{mir}_counts_{sample}" - ), - sample=config["sample"], - mir=config["mir_list"], - ), - script=os.path.join(config["scripts_dir"], "merge_tables.R"), - output: - table=os.path.join(config["output_dir"], "TABLES", "counts.{mir}.tab"), - params: - cluster_log=os.path.join( - config["cluster_log"], "merge_tables_{mir}.log" - ), - prefix="{mir}_counts_", - input_dir=os.path.join(config["output_dir"], "TABLES"), - log: - os.path.join(config["local_log"], "merge_tables_{mir}.log"), - singularity: - "docker://zavolab/r-tidyverse:3.5.3" - shell: - "(Rscript \ - {input.script} \ - --input_dir {params.input_dir} \ - --output_file {output.table} \ - --prefix {params.prefix} \ - --verbose \ - ) &> {log}" -- GitLab From 0e26684698a3555a328c919a936f4a3f1c2b8842 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:24:57 +0000 Subject: [PATCH 12/22] Delete config_map.yaml --- test/config_map.yaml | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 test/config_map.yaml diff --git a/test/config_map.yaml b/test/config_map.yaml deleted file mode 100644 index 4ad569e..0000000 --- a/test/config_map.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results/" -local_log: "logs/local" -cluster_log: "logs/cluster" -scripts_dir: "../scripts" - -# Resources: genome, transcriptome, genes, miRs -# All of these are produced by the "prepare" workflow -genome: "results/homo_sapiens/chrY/genome.processed.fa" -gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" -transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" -transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" -genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" -exons: "results/homo_sapiens/chrY/exons.bed" -header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" - -# Tool parameters: quality filter -q_value: 10 -p_value: 50 - -# Tool parameters: adapter removal -error_rate: 0.1 -minimum_length: 15 -overlap: 3 -max_n: 0 - -# Tool parameters: mapping -max_length_reads: 30 -nh: 100 - -# Inputs information -input_dir: "test_files" -sample: ["test_lib"] - -#### PARAMETERS SPECIFIC TO INPUTS #### - -test_lib: - adapter: "AACTGTAGGCACCATCAAT" - format: "fa" -... -- GitLab From 489f15ce3fbb638fa205bf32f4b019c02708c9d7 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:25:04 +0000 Subject: [PATCH 13/22] Delete config_prepare.yaml --- test/config_prepare.yaml | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 test/config_prepare.yaml diff --git a/test/config_prepare.yaml b/test/config_prepare.yaml deleted file mode 100644 index a74dba6..0000000 --- a/test/config_prepare.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -#### GLOBAL PARAMETERS ##### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Isomirs annotation file -# Number of base pairs to add/substract from 5' (start) and 3' (end) coordinates. 
-bp_5p: [-1, 0, +1] -bp_3p: [-1, 0, +1] - -# List of inputs -organism: ["homo_sapiens/chrY"] - -#### PARAMETERS SPECIFIC TO INPUTS ##### - -homo_sapiens/chrY: - # URLs to genome, gene & miRNA annotations - genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" - gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" - mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" - - # Chromosome name mappings between UCSC <-> Ensembl - # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings - map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" - # Chromosome name mapping parameters: - column: 1 - delimiter: "TAB" -... -- GitLab From 0d35e6e163dba066e5aff30a93f6a7acad184a83 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:25:10 +0000 Subject: [PATCH 14/22] Delete config_quantify.yaml --- test/config_quantify.yaml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 test/config_quantify.yaml diff --git a/test/config_quantify.yaml b/test/config_quantify.yaml deleted file mode 100644 index 691cedf..0000000 --- a/test/config_quantify.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -#### GLOBAL PARAMETERS #### - -# Directories -# Usually there is no need to change these -output_dir: "results" -scripts_dir: "../scripts" -local_log: "logs/local" -cluster_log: "logs/cluster" - -# Types of miRNAs to quantify -#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] -mir_list: ["miRNA", "miRNA_primary_transcript"] - -# Resources: miR annotations, chromosome name mappings -# All of these are produced by the "prepare" workflow -mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" -isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" - -# Inputs information -input_dir: "results" -sample: ["test_lib"] # put all samples, separated by comma -... -- GitLab From c848b44fa7d00b9a327b22fcd0691b745d6c7b23 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:25:51 +0000 Subject: [PATCH 15/22] Upload "config.yaml" file for the merged workflow --- test/config.yaml | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 test/config.yaml diff --git a/test/config.yaml b/test/config.yaml new file mode 100644 index 0000000..98b4422 --- /dev/null +++ b/test/config.yaml @@ -0,0 +1,98 @@ +--- +#### GLOBAL PARAMETERS ##### + +# Directories +# Usually there is no need to change these +map_input_dir: "test_files" # For the mapping workflow +quantify_input_dir: "results" # For the quantify workflow +output_dir: "results" +scripts_dir: "../scripts" +local_log: "logs/local" +cluster_log: "logs/cluster" + +# Inputs information +sample: ["test_lib"] + +####################################################################################################### +#### +#### PREPARE PARAMETERS +#### +####################################################################################################### + +# Isomirs annotation +# Number of base pairs to add/subtract from 5' (start) and 3' (end) coordinates.
+bp_5p: [-1, 0, +1] +bp_3p: [-1, 0, +1] + +# List of inputs +organism: ["homo_sapiens/chrY"] + +#### PARAMETERS SPECIFIC TO INPUTS ##### + +homo_sapiens/chrY: + # URLs to genome, gene & miRNA annotations + genome_url: "ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz" + gtf_url: "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" + mirna_url: "https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3" + + # Chromosome name mappings between UCSC <-> Ensembl + # Other organisms available at: https://github.com/dpryan79/ChromosomeMappings + map_chr_url: "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_UCSC2ensembl.txt" + # Chromosome name mapping parameters: + column: 1 + delimiter: "TAB" + +####################################################################################################### +#### +#### MAP PARAMETERS +#### +####################################################################################################### + +# Resources: genome, transcriptome, genes, miRs +# All of these are produced by the "prepare" workflow + +genome: "results/homo_sapiens/chrY/genome.processed.fa" +gtf: "results/homo_sapiens/chrY/gene_annotations.filtered.gtf" +transcriptome: "results/homo_sapiens/chrY/transcriptome_idtrim.fa" +transcriptome_index_segemehl: "results/homo_sapiens/chrY/transcriptome_index_segemehl.idx" +genome_index_segemehl: "results/homo_sapiens/chrY/genome_index_segemehl.idx" +exons: "results/homo_sapiens/chrY/exons.bed" +header_of_collapsed_fasta: "results/homo_sapiens/chrY/headerOfCollapsedFasta.sam" + +# Tool parameters: quality filter +q_value: 10 +p_value: 50 + +# Tool parameters: adapter removal +error_rate: 0.1 +minimum_length: 15 +overlap: 3 +max_n: 0 + +# Tool parameters: mapping +max_length_reads: 30 +nh: 100 + + +#### PARAMETERS SPECIFIC TO INPUTS #### + +test_lib: + adapter: "AACTGTAGGCACCATCAAT" + format: "fa" + +####################################################################################################### +#### +#### QUANTIFY PARAMETERS +#### +####################################################################################################### + + +# Types of miRNAs to quantify +#mir_list: ["miRNA", "miRNA_primary_transcript", "isomirs"] +mir_list: ["miRNA", "miRNA_primary_transcript"] + +# Resources: miR annotations, chromosome name mappings +# All of these are produced by the "prepare" workflow +mirnas_anno: "results/homo_sapiens/chrY/mirna_filtered.bed" +isomirs_anno: "results/homo_sapiens/chrY/isomirs_annotation.bed" +... 
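The bp_5p/bp_3p lists in the PREPARE section of this new config parameterise the iso_anno rule, which generates shifted copies of each mature miRNA interval with bedtools slop -l {bp_5p} -r {bp_3p}. Since the rule does not pass -s, the offsets are applied in genome orientation (subtracted from the start, added to the end) irrespective of the feature's strand. A plain-Python sketch of that interval arithmetic, clamped to chromosome bounds as slop clamps against the -g genome file:

def slop(start, end, bp_5p, bp_3p, chrom_size):
    """Mimic `bedtools slop -l bp_5p -r bp_3p` without -s on a 0-based
    half-open interval; negative values shrink instead of extend."""
    new_start = max(0, start - bp_5p)
    new_end = min(chrom_size, end + bp_3p)
    return new_start, new_end

# The nine (bp_5p, bp_3p) combinations from [-1, 0, +1] x [-1, 0, +1]
# yield nine shifted annotations per miRNA; the unshifted 5p0_3p0 copy
# is removed again by the iso_anno_final rule.
assert slop(100, 122, 1, -1, 5000) == (99, 121)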
\ No newline at end of file -- GitLab From 5f7e5d53b19af368e5b29304f26762acb16eb788 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:36:03 +0000 Subject: [PATCH 16/22] Replace test_workflow_local.sh --- test/test_workflow_local.sh | 47 +++++-------------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh index e59a3f4..b8efa82 100755 --- a/test/test_workflow_local.sh +++ b/test/test_workflow_local.sh @@ -16,56 +16,21 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --cores 4 \ --use-singularity \ --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ --printshellcmds \ --rerun-incomplete \ --verbose -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --use-singularity \ - --singularity-args "--bind ${PWD}/../" \ - --cores=4 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; -- GitLab From 08ecdc9a45398e8f562c8b6c7c3c5512c9834eb7 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:36:46 +0000 Subject: [PATCH 17/22] Replace test_workflow_slurm.sh --- test/test_workflow_slurm.sh | 72 ++++--------------------------------- 1 file changed, 6 insertions(+), 66 deletions(-) diff --git a/test/test_workflow_slurm.sh b/test/test_workflow_slurm.sh index 0c50de3..a1e1925 100755 --- a/test/test_workflow_slurm.sh +++ b/test/test_workflow_slurm.sh @@ -21,56 +21,10 @@ mkdir -p logs/cluster/{homo_sapiens/chrY,test_lib} mkdir -p logs/local/{homo_sapiens/chrY,test_lib} mkdir -p results/{homo_sapiens/chrY,test_lib} -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: 
map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --cluster-config="cluster.json" \ - --cluster "sbatch \ - --cpus-per-task={cluster.threads} \ - --mem={cluster.mem} \ - --qos={cluster.queue} \ - --time={cluster.time} \ - --export=JOB_NAME={rule} \ - -o {params.cluster_log} \ - -p scicore \ - --open-mode=append" \ - --jobscript="../jobscript.sh" \ - --jobs=20 \ - --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ --cores=256 \ - --printshellcmds \ - --rerun-incomplete \ - --verbose - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ --cluster-config="cluster.json" \ --cluster "sbatch \ --cpus-per-task={cluster.threads} \ --mem={cluster.mem} \ --qos={cluster.queue} \ --time={cluster.time} \ --export=JOB_NAME={rule} \ -o {params.cluster_log} \ -p scicore \ --open-mode=append" \ --jobscript="../jobscript.sh" \ --jobs=20 \ --use-singularity \ - --singularity-args="--no-home --bind ${PWD}/../" \ - --cores=256 \ + --singularity-args="--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ --verbose -# Snakemake report: prepare workflow -snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ - --report="snakemake_report_prepare.html" - -# Snakemake report: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --report="snakemake_report_map.html" - -# Snakemake report: quantify workflow +# Snakemake report snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --report="snakemake_report_quantify.html" + --snakefile="../workflow/Snakefile" \ + --report="snakemake_report.html" # Check md5 sum of some output files find results/ -type f -name \*\.gz -exec gunzip '{}' \; -- GitLab From 37f9af9be3734f2e84e63535439555d5a602a628 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:38:48 +0000 Subject: [PATCH 18/22] Replace test_rule_graph.sh --- test/test_rule_graph.sh | 28 ++++------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/test/test_rule_graph.sh b/test/test_rule_graph.sh index 3b0b235..66ca559 100755 --- a/test/test_rule_graph.sh +++ b/test/test_rule_graph.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --rulegraph \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/rule_graph_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --rulegraph \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/rule_graph_quantify.svg" + | dot -Tsvg > "../images/rule_graph.svg" -- GitLab From cb3756a133705ca00479ed71d9bd13747e886e86 Mon Sep 17 00:00:00 2001 From: Iris Mestres Pascual <iris.mestrespascual@unibas.ch> Date: Tue, 17 Jan 2023 17:40:04 +0000 Subject: [PATCH 19/22] Replace test_dag.sh --- test/test_dag.sh | 28 ++++------------------------ 1 file
changed, 4 insertions(+), 24 deletions(-) diff --git a/test/test_dag.sh b/test/test_dag.sh index f472b15..de93730 100755 --- a/test/test_dag.sh +++ b/test/test_dag.sh @@ -16,32 +16,12 @@ user_dir=$PWD script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" cd $script_dir -# Run test: prepare workflow +# Run test snakemake \ - --snakefile="../workflow/prepare/Snakefile" \ - --configfile="config_prepare.yaml" \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ --dag \ --printshellcmds \ --dryrun \ --verbose \ - | dot -Tsvg > "../images/workflow_dag_prepare.svg" - -# Run test: map workflow -snakemake \ - --snakefile="../workflow/map/Snakefile" \ - --configfile="config_map.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_map.svg" - -# Run test: quantify workflow -snakemake \ - --snakefile="../workflow/quantify/Snakefile" \ - --configfile="config_quantify.yaml" \ - --dag \ - --printshellcmds \ - --dryrun \ - --verbose \ - | dot -Tsvg > "../images/workflow_dag_quantify.svg" + | dot -Tsvg > "../images/workflow_dag.svg" \ No newline at end of file -- GitLab From 1d90d9ebead171a0d70d67168ad7ec935a9e9bec Mon Sep 17 00:00:00 2001 From: Alex Kanitz <alexander.kanitz@alumni.ethz.ch> Date: Thu, 19 Jan 2023 15:25:33 +0100 Subject: [PATCH 20/22] simple change to trigger CI --- test/test_workflow_local.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_workflow_local.sh b/test/test_workflow_local.sh index b8efa82..8c87878 100755 --- a/test/test_workflow_local.sh +++ b/test/test_workflow_local.sh @@ -5,6 +5,7 @@ cleanup () { rc=$? cd $user_dir echo "Exit status: $rc" + rm -rf .snakemake } trap cleanup EXIT -- GitLab From e8e8dec3010f5fc26290a9dbffd45509b166cdad Mon Sep 17 00:00:00 2001 From: Alex Kanitz <alexander.kanitz@alumni.ethz.ch> Date: Fri, 20 Jan 2023 13:02:17 +0100 Subject: [PATCH 21/22] temporarily deactivate CI --- .gitlab-ci.yml | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8a55175..69d0806 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,18 +1,18 @@ -default: - tags: - - shell - -before_script: - - mamba env create --force -f environment.yml - - conda activate mirflowz - - echo $CONDA_DEFAULT_ENV - - snakemake --version - -test: - script: - - bash test/test_workflow_local.sh - - bash test/test_dag.sh - - bash test/test_rule_graph.sh - -after_script: - - bash test/test_cleanup.sh +# default: +# tags: +# - shell +# +# before_script: +# - mamba env create --force -f environment.yml +# - conda activate mirflowz +# - echo $CONDA_DEFAULT_ENV +# - snakemake --version +# +# test: +# script: +# - bash test/test_workflow_local.sh +# - bash test/test_dag.sh +# - bash test/test_rule_graph.sh +# +# after_script: +# - bash test/test_cleanup.sh -- GitLab From b3c38a2164067729386329bd48cb6afd03977228 Mon Sep 17 00:00:00 2001 From: Alex Kanitz <alexander.kanitz@alumni.ethz.ch> Date: Fri, 20 Jan 2023 13:02:53 +0100 Subject: [PATCH 22/22] temporarily deactivate CI --- .gitlab-ci.yml | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 69d0806..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,18 +0,0 @@ -# default: -# tags: -# - shell -# -# before_script: -# - mamba env create --force -f environment.yml -# - conda activate mirflowz -# - echo $CONDA_DEFAULT_ENV -# - snakemake --version -# -# 
test: -# script: -# - bash test/test_workflow_local.sh -# - bash test/test_dag.sh -# - bash test/test_rule_graph.sh -# -# after_script: -# - bash test/test_cleanup.sh -- GitLab
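Both replaced test scripts end by gunzipping the compressed outputs and checking md5 sums of selected result files; the hunks above show only the gunzip step of that block. A generic sketch of such a verification against an md5sum-style manifest follows; the manifest file name here is hypothetical.

import hashlib
import sys

def verify_md5(manifest="expected_output.md5"):
    """Compare files against a manifest of '<hex digest>  <path>' lines,
    as produced by `md5sum`; return True only if every file matches."""
    ok = True
    with open(manifest) as handle:
        for line in handle:
            digest, path = line.split(maxsplit=1)
            path = path.strip()
            with open(path, "rb") as data:
                if hashlib.md5(data.read()).hexdigest() != digest:
                    print(f"md5 mismatch: {path}", file=sys.stderr)
                    ok = False
    return ok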