Skip to content
Snippets Groups Projects
Commit d99c1d48 authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

First commit

parents
No related branches found
No related tags found
No related merge requests found
# Generated annotation and result directories
snakemake/annotation/
snakemake/prepare_annotation/results/
snakemake/prepare_annotation/results/*
# Snakemake bookkeeping directory and per-run log folders
snakemake/process_data/.snakemake
snakemake/process_data/results/local_log
snakemake/process_data/results/cluster_log
# macOS Finder metadata
.DS_Store
._.DS_Store
##### BASE IMAGE #####
FROM debian:9.3

##### METADATA #####
LABEL base.image="debian:9.3"
LABEL version="1"
LABEL software="segemehl"
LABEL software.version="0.2.0"
LABEL software.description="short read mapping with gaps"
LABEL software.website="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
LABEL software.documentation="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
# TODO(review): placeholder — replace with the actual segemehl license URL/file
LABEL software.license="link to license(file) of original software"
LABEL software.tags="Genomics, Transcriptomics"
LABEL maintainer="foivos.gypas@unibas.ch"
LABEL maintainer.organisation="Biozentrum, University of Basel"
LABEL maintainer.location="Klingelbergstrasse 50/70, CH-4056 Basel, Switzerland"
LABEL maintainer.lab="Zavolan Lab"
LABEL maintainer.license="https://spdx.org/licenses/Apache-2.0"

##### VARIABLES #####
# Use variables for convenient updates/re-usability
ENV SOFTWARE_VERSION 0.2.0

##### INSTALL #####
# --no-install-recommends keeps the image lean; the apt package lists are
# removed in the SAME layer so they never persist in any image layer.
RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
     wget python build-essential libz-dev gcc libgl1-mesa-dev \
  && rm -rf /var/lib/apt/lists/*
## Do your install stuff, merge with '&&' whenever possible, don't create unnecessary layers
# && wget https://.../v${SOFTWARE_VERSION}/software-${SOFTWARE_VERSION}.tar.gz \
# && tar -zxvf software-${SOFTWARE_VERSION}.tar.gz \
# && cd software \
# && make \
## Copy binaries to location in PATH
# && cp bin/* /usr/bin \
## Remove files no longer needed
# && cd .. \
# && rm -r /intermediate/folder software-${SOFTWARE_VERSION}.tar.gz \
## Clean up
# && apt-get purge -y wget \
# && apt-get autoremove -y && apt-get clean \
# && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
#
###### SET PATH (if not in /usr/bin/) #####
#ENV PATH /path/to/binary:$PATH
#
###### WORKING DIRECTORY #####
#WORKDIR /data/
# Load workflow configuration (paths, sample list, per-sample adapters).
configfile: "config.yaml"
#from snakemake.utils import listfiles
# Lightweight rules executed on the submit host rather than as cluster jobs.
# BUGFIX: 'concat_samples' was removed from this list — the corresponding rule
# is commented out below, and declaring a non-existent rule in localrules
# makes newer Snakemake versions fail at workflow parse time.
localrules: create_output_and_log_directories, finish
#################################################################################
### Finish rule
#################################################################################
# Target rule: requests the final filtered FASTA for every configured sample,
# so a plain `snakemake` invocation drives the whole pipeline.
rule finish:
input:
reads = expand(os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), sample=config["sample"])
#################################################################################
### Create output and log directories
#################################################################################
# Create the results/log directory tree and touch a flag file; downstream
# rules depend on the flag so directories exist before any job runs.
rule create_output_and_log_directories:
output:
output_dir = config["output_dir"],
cluster_log = config["cluster_log"],
local_log = config["local_log"],
sample_dir = expand(os.path.join(config["output_dir"], "{sample}"), sample=config["sample"]),
# Flag file marking that the directory layout was created
flag = config["dir_created"]
threads: 1
shell:
"mkdir -p {output.output_dir}; \
mkdir -p {output.cluster_log}; \
mkdir -p {output.local_log}; \
mkdir -p {output.sample_dir}; \
touch {output.flag};"
#################################################################################
### Clipping reads
#################################################################################
# Clip the per-sample 3' adapter from the raw reads with fastx_clipper.
rule clip_reads:
input:
# Flag from create_output_and_log_directories guarantees dirs exist first
flag = config["dir_created"],
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
output:
reads = os.path.join(config["output_dir"], "{sample}/pro.clipped.fastq.gz"),
params:
v = "-v",  # verbose report
n = "-n",  # keep reads containing unknown (N) nucleotides
l = "20",  # discard reads shorter than 20 nt after clipping
qual = "-Q33",  # Phred+33 quality encoding
z = "-z",  # gzip-compress the output
# Adapter sequence is looked up per sample in the config
adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
# Cluster jobs log under cluster_log; rule log (below) under local_log
cluster_log = os.path.join(config["cluster_log"], "clip_reads_{sample}.log")
log:
os.path.join(config["local_log"], "clip_reads_{sample}.log")
singularity:
"docker://cjh4zavolab/fastx:0.0.14"
shell:
"(fastx_clipper \
{params.v} \
{params.n} \
-l {params.l} \
{params.qual} \
-a {params.adapter} \
{params.z} \
-i <(zcat {input.reads}) \
-o {output.reads}) &> {log}"
#################################################################################
### Trimming reads
#################################################################################
# Trim low-quality 3' bases from the clipped reads with fastq_quality_trimmer.
rule trim_reads:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.clipped.fastq.gz")
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.trimmed.fastq.gz"),
    params:
        v = "-v",       # verbose report
        l = "20",       # discard reads shorter than 20 nt after trimming
        t = "20",       # quality threshold for trimming
        qual = "-Q33",  # Phred+33 quality encoding
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "trim_reads_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "trim_reads_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_quality_trimmer \
        {params.v} \
        -l {params.l} \
        -t {params.t} \
        {params.qual} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
#################################################################################
### Filtering reads
#################################################################################
# Drop reads whose overall quality is too low with fastq_quality_filter.
rule filter_reads:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.trimmed.fastq.gz"),
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
    params:
        v = "-v",       # verbose report
        q = "20",       # minimum quality score to count a base as "good"
        p = "90",       # minimum percent of bases that must meet -q
        qual = "-Q33",  # Phred+33 quality encoding
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "filter_reads_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "filter_reads_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_quality_filter \
        {params.v} \
        -q {params.q} \
        -p {params.p} \
        {params.qual} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
#################################################################################
### Convert fastq to fasta
#################################################################################
# Convert the filtered FASTQ to FASTA (final pipeline output).
rule fastq_to_fasta:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"),
    params:
        v = "-v",       # verbose report
        qual = "-Q33",  # Phred+33 quality encoding
        n = "-n",       # keep reads containing unknown (N) nucleotides
        r = "-r",       # rename sequence identifiers to running numbers
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "fastq_to_fasta_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_to_fasta \
        {params.v} \
        {params.qual} \
        {params.n} \
        {params.r} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
# #################################################################################
# ### Trim 3' adapters
# #################################################################################
# rule trim_3p_adapter:
# input:
# flag = config["dir_created"],
# reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
# output:
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# params:
# adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
# error_rate = config["error_rate"],
# minimum_length = config["minimum_length"],
# overlap = config["overlap"],
# cluster_log = os.path.join(config["cluster_log"], "trim_3p_adapter_{sample}.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "trim_3p_adapter_{sample}.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cutadapt \
# --adapter {params.adapter} \
# --error-rate {params.error_rate} \
# --minimum-length {params.minimum_length} \
# --overlap {params.overlap} \
# {input.reads} | gzip > {output.reads}) 2> {log}"
# #################################################################################
# ### Concatenate fastq files
# #################################################################################
# rule concat_samples:
# input:
# reads = expand(os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),sample=config["sample"])
# output:
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz")
# params:
# cluster_log = os.path.join(config["cluster_log"], "concat_samples.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "concat_samples.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cat {input.reads} > {output.reads}) 2> {log}"
# #################################################################################
# ### Align reads STAR
# #################################################################################
# rule align_reads_STAR:
# input:
# index = config["STAR_index"],
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz"),
# gtf = config["gtf"]
# output:
# outputfile = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# params:
# outFileNamePrefix = os.path.join(config["output_dir"], "STAR_"),
# cluster_log = os.path.join(config["cluster_log"], "align_reads_STAR.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"align_reads_STAR.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# STAR --runMode alignReads \
# --twopassMode Basic \
# --runThreadN {threads} \
# --genomeDir {input.index} \
# --sjdbGTFfile {input.gtf} \
# --readFilesIn {input.reads} \
# --readFilesCommand zcat \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outSAMtype BAM Unsorted) 2> {log}"
# #################################################################################
# ### Sort alignment file
# #################################################################################
# rule sort_bam:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# output:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# params:
# cluster_log = os.path.join(config["cluster_log"], "sort_bam.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"sort_bam.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools sort -@ {threads} {input.bam} > {output.bam}) 2> {log}"
# #################################################################################
# ### Index alignment file
# #################################################################################
# rule samtools_index:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# output:
# bai = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam.bai")
# params:
# cluster_log = os.path.join(config["cluster_log"], "samtools_index.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"],"samtools_index.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools index {input.bam} > {output.bai}) 2> {log}"
# ##################################################################################
# ### Quantify
# ##################################################################################
# rule salmon_quant:
# input:
# flag = config["dir_created"],
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# gtf = config["filtered_gtf"],
# index = config["filtered_transcripts_salmon_index"]
# output:
# output = os.path.join(config["output_dir"], "{sample}/salmon_quant"),
# tr_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.sf"),
# gn_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.genes.sf")
# params:
# libType = "A",
# fldMean = "300",
# fldSD = "100",
# activate = config["activate"],
# env = config["env"],
# cluster_log = os.path.join(config["cluster_log"], "salmon_quant_{sample}.log")
# log:
# os.path.join(config["local_log"], "salmon_quant_{sample}.log")
# threads: 6
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# salmon quant \
# --index {input.index} \
# --libType {params.libType} \
# --unmatedReads {input.reads} \
# --seqBias \
# --geneMap {input.gtf} \
# --fldMean {params.fldMean} \
# --fldSD {params.fldSD} \
# --useVBOpt \
# --threads {threads} \
# --output {output.output}) > {log}"
# ###################################################################################
# #### Map reads to rRNA
# ###################################################################################
# #
# #rule map_reads_to_rRNA:
# # input:
# # rRNA = config["rRNA"],
# # rRNA_index = config["rRNA_index"],
# # reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
# # output:
# # mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
# # params:
# # activate = config["activate"],
# # env = config["env"],
# # cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
# # log:
# # os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
# # threads: 4
# # shell:
# # "(set +u; source {params.activate} {params.env}; set -u; \
# # segemehl.x \
# # -i {input.rRNA_index} \
# # -d {input.rRNA} \
# # -q {input.reads} \
# # --accuracy 90 \
# # --threads {threads} \
# # -o {output.mapped_reads_to_rRNA} \
# # -u {params.unmapped_reads_to_rRNA}; \
# # gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
# #
# ####################################################################################
# ##### Map reads to transcripts
# ####################################################################################
# ##
# ##rule map_reads_to_transcripts_segemehl:
# ## input:
# ## transcripts = config["filtered_transcripts"],
# ## index = config["segemehl_filtered_transcripts_index"],
# ## reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
# ## output:
# ## mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
# ## params:
# ## activate = config["activate"],
# ## env = config["env"],
# ## cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
# ## log:
# ## os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
# ## threads: 4
# ## shell:
# ## "(set +u; source {params.activate} {params.env}; set -u; \
# ## segemehl.x \
# ## -i {input.rRNA_index} \
# ## -d {input.rRNA} \
# ## -q {input.reads} \
# ## --accuracy 90 \
# ## --threads {threads} \
# ## -o {output.mapped_reads_to_rRNA} \
# ## -u {params.unmapped_reads_to_rRNA}; \
# ## gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
{
"__default__":
{
"queue":"6hours",
"time": "05:00:00",
"threads":"1",
"mem":"4G"
},
"align_reads_STAR":
{
"queue":"6hours",
"time": "06:00:00",
"threads":"8",
"mem":"50G"
},
"sort_bam":
{
"queue":"6hours",
"time": "03:00:00",
"threads":"8",
"mem":"50G"
},
"salmon_quant":
{
"queue":"30min",
"time": "00:30:00",
"threads":"6",
"mem":"30G"
},
"map_reads_to_rRNA":
{
"queue":"6hours",
"time": "05:00:00",
"threads":"4",
"mem":"50G"
}
}
---
##############################################################################
### Annotation
##############################################################################
##############################################################################
### Output and log directory
##############################################################################
output_dir: "results"
# Logs written by the rules themselves (the Snakefile's `log:` directives)
local_log: "results/local_log"
# Logs written by cluster (sbatch) jobs via params.cluster_log
cluster_log: "results/cluster_log"
# Flag file touched once the directory layout has been created
dir_created: "results/dir_created"
##############################################################################
### sample info
##############################################################################
# Input FASTQ files are expected at <input_dir>/<sample><input_reads_pattern>
input_dir: "samples"
input_reads_pattern: ".fastq.gz"
sample: ["example"]
# Per-sample settings; 'adapter' is the 3' adapter clipped by rule clip_reads
example: {adapter: GATCGGAAGAGCACA}
##############################################################################
### Parameters for independent rules/jobs
##############################################################################
...
snakemake --dag -np | dot -Tpng > dag.png
snakemake/process_data/dag.png

14.7 KiB

# set -e
# Submit the workflow to a SLURM cluster: per-rule resources come from
# cluster.json (threads/mem/queue/time), each job's stdout/stderr goes to the
# rule's params.cluster_log (suffixed with job id %j and node name %N), and
# rules run inside their Singularity images with $PWD bind-mounted.
snakemake \
--cluster-config cluster.json \
--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={params.cluster_log}-%j-%N -p scicore" \
--cores 256 \
-p \
--rerun-incomplete \
--use-singularity \
--singularity-args "--bind ${PWD}"
\ No newline at end of file
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment