Skip to content
Snippets Groups Projects
Commit d99c1d48 authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

First commit

parents
No related branches found
No related tags found
No related merge requests found
# Generated annotation and result directories
snakemake/annotation/
snakemake/prepare_annotation/results/
snakemake/prepare_annotation/results/*
# Snakemake bookkeeping directory and per-run log folders
snakemake/process_data/.snakemake
snakemake/process_data/results/local_log
snakemake/process_data/results/cluster_log
# macOS Finder metadata
.DS_Store
._.DS_Store
##### BASE IMAGE #####
FROM debian:9.3

##### METADATA #####
LABEL base.image="debian:9.3"
LABEL version="1"
LABEL software="segemehl"
LABEL software.version="0.2.0"
LABEL software.description="short read mapping with gaps"
LABEL software.website="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
LABEL software.documentation="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
# TODO(review): placeholder — replace with the actual segemehl license URL/file
LABEL software.license="link to license(file) of original software"
LABEL software.tags="Genomics, Transcriptomics"
LABEL maintainer="foivos.gypas@unibas.ch"
LABEL maintainer.organisation="Biozentrum, University of Basel"
LABEL maintainer.location="Klingelbergstrasse 50/70, CH-4056 Basel, Switzerland"
LABEL maintainer.lab="Zavolan Lab"
LABEL maintainer.license="https://spdx.org/licenses/Apache-2.0"

##### VARIABLES #####
# Use variables for convenient updates/re-usability
ENV SOFTWARE_VERSION 0.2.0

##### INSTALL #####
# --no-install-recommends keeps the image lean; the apt package lists are
# removed in the SAME layer so they never persist in any image layer.
RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
     wget python build-essential libz-dev gcc libgl1-mesa-dev \
  && rm -rf /var/lib/apt/lists/*
## Do your install stuff, merge with '&&' whenever possible, don't create unnecessary layers
# && wget https://.../v${SOFTWARE_VERSION}/software-${SOFTWARE_VERSION}.tar.gz \
# && tar -zxvf software-${SOFTWARE_VERSION}.tar.gz \
# && cd software \
# && make \
## Copy binaries to location in PATH
# && cp bin/* /usr/bin \
## Remove files no longer needed
# && cd .. \
# && rm -r /intermediate/folder software-${SOFTWARE_VERSION}.tar.gz \
## Clean up
# && apt-get purge -y wget \
# && apt-get autoremove -y && apt-get clean \
# && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
#
###### SET PATH (if not in /usr/bin/) #####
#ENV PATH /path/to/binary:$PATH
#
###### WORKING DIRECTORY #####
#WORKDIR /data/
# Load workflow configuration (paths, sample list, per-sample adapters).
configfile: "config.yaml"
#from snakemake.utils import listfiles
# Lightweight rules executed on the submit host rather than as cluster jobs.
# BUGFIX: 'concat_samples' was removed from this list — the corresponding rule
# is commented out below, and declaring a non-existent rule in localrules
# makes newer Snakemake versions fail at workflow parse time.
localrules: create_output_and_log_directories, finish
#################################################################################
### Finish rule
#################################################################################
# Target rule: requests the final filtered FASTA for every configured sample,
# so a plain `snakemake` invocation drives the whole pipeline.
rule finish:
input:
reads = expand(os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), sample=config["sample"])
#################################################################################
### Create output and log directories
#################################################################################
# Create the results/log directory tree and touch a flag file; downstream
# rules depend on the flag so directories exist before any job runs.
rule create_output_and_log_directories:
output:
output_dir = config["output_dir"],
cluster_log = config["cluster_log"],
local_log = config["local_log"],
sample_dir = expand(os.path.join(config["output_dir"], "{sample}"), sample=config["sample"]),
# Flag file marking that the directory layout was created
flag = config["dir_created"]
threads: 1
shell:
"mkdir -p {output.output_dir}; \
mkdir -p {output.cluster_log}; \
mkdir -p {output.local_log}; \
mkdir -p {output.sample_dir}; \
touch {output.flag};"
#################################################################################
### Clipping reads
#################################################################################
# Clip the per-sample 3' adapter from the raw reads with fastx_clipper.
rule clip_reads:
input:
# Flag from create_output_and_log_directories guarantees dirs exist first
flag = config["dir_created"],
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
output:
reads = os.path.join(config["output_dir"], "{sample}/pro.clipped.fastq.gz"),
params:
v = "-v",  # verbose report
n = "-n",  # keep reads containing unknown (N) nucleotides
l = "20",  # discard reads shorter than 20 nt after clipping
qual = "-Q33",  # Phred+33 quality encoding
z = "-z",  # gzip-compress the output
# Adapter sequence is looked up per sample in the config
adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
# Cluster jobs log under cluster_log; rule log (below) under local_log
cluster_log = os.path.join(config["cluster_log"], "clip_reads_{sample}.log")
log:
os.path.join(config["local_log"], "clip_reads_{sample}.log")
singularity:
"docker://cjh4zavolab/fastx:0.0.14"
shell:
"(fastx_clipper \
{params.v} \
{params.n} \
-l {params.l} \
{params.qual} \
-a {params.adapter} \
{params.z} \
-i <(zcat {input.reads}) \
-o {output.reads}) &> {log}"
#################################################################################
### Trimming reads
#################################################################################
# Trim low-quality 3' bases from the clipped reads with fastq_quality_trimmer.
rule trim_reads:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.clipped.fastq.gz")
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.trimmed.fastq.gz"),
    params:
        v = "-v",       # verbose report
        l = "20",       # discard reads shorter than 20 nt after trimming
        t = "20",       # quality threshold for trimming
        qual = "-Q33",  # Phred+33 quality encoding
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "trim_reads_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "trim_reads_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_quality_trimmer \
        {params.v} \
        -l {params.l} \
        -t {params.t} \
        {params.qual} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
#################################################################################
### Filtering reads
#################################################################################
# Drop reads whose overall quality is too low with fastq_quality_filter.
rule filter_reads:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.trimmed.fastq.gz"),
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
    params:
        v = "-v",       # verbose report
        q = "20",       # minimum quality score to count a base as "good"
        p = "90",       # minimum percent of bases that must meet -q
        qual = "-Q33",  # Phred+33 quality encoding
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "filter_reads_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "filter_reads_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_quality_filter \
        {params.v} \
        -q {params.q} \
        -p {params.p} \
        {params.qual} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
#################################################################################
### Convert fastq to fasta
#################################################################################
# Convert the filtered FASTQ to FASTA (final pipeline output).
rule fastq_to_fasta:
    input:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
    output:
        reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"),
    params:
        v = "-v",       # verbose report
        qual = "-Q33",  # Phred+33 quality encoding
        n = "-n",       # keep reads containing unknown (N) nucleotides
        r = "-r",       # rename sequence identifiers to running numbers
        z = "-z",       # gzip-compress the output
        # BUGFIX: cluster job logs belong under config["cluster_log"] — this
        # previously used config["local_log"], the opposite of rule clip_reads.
        cluster_log = os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log")
    log:
        # BUGFIX: the rule log belongs under config["local_log"]
        # (was config["cluster_log"]).
        os.path.join(config["local_log"], "fastq_to_fasta_{sample}.log")
    singularity:
        "docker://cjh4zavolab/fastx:0.0.14"
    shell:
        "(fastq_to_fasta \
        {params.v} \
        {params.qual} \
        {params.n} \
        {params.r} \
        {params.z} \
        -i <(zcat {input.reads}) \
        -o {output.reads}) &> {log}"
# #################################################################################
# ### Trim 3' adapters
# #################################################################################
# rule trim_3p_adapter:
# input:
# flag = config["dir_created"],
# reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
# output:
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# params:
# adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
# error_rate = config["error_rate"],
# minimum_length = config["minimum_length"],
# overlap = config["overlap"],
# cluster_log = os.path.join(config["cluster_log"], "trim_3p_adapter_{sample}.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "trim_3p_adapter_{sample}.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cutadapt \
# --adapter {params.adapter} \
# --error-rate {params.error_rate} \
# --minimum-length {params.minimum_length} \
# --overlap {params.overlap} \
# {input.reads} | gzip > {output.reads}) 2> {log}"
# #################################################################################
# ### Concatenate fastq files
# #################################################################################
# rule concat_samples:
# input:
# reads = expand(os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),sample=config["sample"])
# output:
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz")
# params:
# cluster_log = os.path.join(config["cluster_log"], "concat_samples.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "concat_samples.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cat {input.reads} > {output.reads}) 2> {log}"
# #################################################################################
# ### Align reads STAR
# #################################################################################
# rule align_reads_STAR:
# input:
# index = config["STAR_index"],
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz"),
# gtf = config["gtf"]
# output:
# outputfile = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# params:
# outFileNamePrefix = os.path.join(config["output_dir"], "STAR_"),
# cluster_log = os.path.join(config["cluster_log"], "align_reads_STAR.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"align_reads_STAR.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# STAR --runMode alignReads \
# --twopassMode Basic \
# --runThreadN {threads} \
# --genomeDir {input.index} \
# --sjdbGTFfile {input.gtf} \
# --readFilesIn {input.reads} \
# --readFilesCommand zcat \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outSAMtype BAM Unsorted) 2> {log}"
# #################################################################################
# ### Sort alignment file
# #################################################################################
# rule sort_bam:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# output:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# params:
# cluster_log = os.path.join(config["cluster_log"], "sort_bam.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"sort_bam.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools sort -@ {threads} {input.bam} > {output.bam}) 2> {log}"
# #################################################################################
# ### Index alignment file
# #################################################################################
# rule samtools_index:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# output:
# bai = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam.bai")
# params:
# cluster_log = os.path.join(config["cluster_log"], "samtools_index.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"],"samtools_index.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools index {input.bam} > {output.bai}) 2> {log}"
# ##################################################################################
# ### Quantify
# ##################################################################################
# rule salmon_quant:
# input:
# flag = config["dir_created"],
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# gtf = config["filtered_gtf"],
# index = config["filtered_transcripts_salmon_index"]
# output:
# output = os.path.join(config["output_dir"], "{sample}/salmon_quant"),
# tr_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.sf"),
# gn_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.genes.sf")
# params:
# libType = "A",
# fldMean = "300",
# fldSD = "100",
# activate = config["activate"],
# env = config["env"],
# cluster_log = os.path.join(config["cluster_log"], "salmon_quant_{sample}.log")
# log:
# os.path.join(config["local_log"], "salmon_quant_{sample}.log")
# threads: 6
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# salmon quant \
# --index {input.index} \
# --libType {params.libType} \
# --unmatedReads {input.reads} \
# --seqBias \
# --geneMap {input.gtf} \
# --fldMean {params.fldMean} \
# --fldSD {params.fldSD} \
# --useVBOpt \
# --threads {threads} \
# --output {output.output}) > {log}"
# ###################################################################################
# #### Map reads to rRNA
# ###################################################################################
# #
# #rule map_reads_to_rRNA:
# # input:
# # rRNA = config["rRNA"],
# # rRNA_index = config["rRNA_index"],
# # reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
# # output:
# # mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
# # params:
# # activate = config["activate"],
# # env = config["env"],
# # cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
# # log:
# # os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
# # threads: 4
# # shell:
# # "(set +u; source {params.activate} {params.env}; set -u; \
# # segemehl.x \
# # -i {input.rRNA_index} \
# # -d {input.rRNA} \
# # -q {input.reads} \
# # --accuracy 90 \
# # --threads {threads} \
# # -o {output.mapped_reads_to_rRNA} \
# # -u {params.unmapped_reads_to_rRNA}; \
# # gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
# #
# ####################################################################################
# ##### Map reads to transcripts
# ####################################################################################
# ##
# ##rule map_reads_to_transcripts_segemehl:
# ## input:
# ## transcripts = config["filtered_transcripts"],
# ## index = config["segemehl_filtered_transcripts_index"],
# ## reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
# ## output:
# ## mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
# ## params:
# ## activate = config["activate"],
# ## env = config["env"],
# ## cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
# ## log:
# ## os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
# ## threads: 4
# ## shell:
# ## "(set +u; source {params.activate} {params.env}; set -u; \
# ## segemehl.x \
# ## -i {input.rRNA_index} \
# ## -d {input.rRNA} \
# ## -q {input.reads} \
# ## --accuracy 90 \
# ## --threads {threads} \
# ## -o {output.mapped_reads_to_rRNA} \
# ## -u {params.unmapped_reads_to_rRNA}; \
# ## gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
{
"__default__":
{
"queue":"6hours",
"time": "05:00:00",
"threads":"1",
"mem":"4G"
},
"align_reads_STAR":
{
"queue":"6hours",
"time": "06:00:00",
"threads":"8",
"mem":"50G"
},
"sort_bam":
{
"queue":"6hours",
"time": "03:00:00",
"threads":"8",
"mem":"50G"
},
"salmon_quant":
{
"queue":"30min",
"time": "00:30:00",
"threads":"6",
"mem":"30G"
},
"map_reads_to_rRNA":
{
"queue":"6hours",
"time": "05:00:00",
"threads":"4",
"mem":"50G"
}
}
---
##############################################################################
### Annotation
##############################################################################
##############################################################################
### Output and log directory
##############################################################################
output_dir: "results"
# Logs written by the rules themselves (the Snakefile's `log:` directives)
local_log: "results/local_log"
# Logs written by cluster (sbatch) jobs via params.cluster_log
cluster_log: "results/cluster_log"
# Flag file touched once the directory layout has been created
dir_created: "results/dir_created"
##############################################################################
### sample info
##############################################################################
# Input FASTQ files are expected at <input_dir>/<sample><input_reads_pattern>
input_dir: "samples"
input_reads_pattern: ".fastq.gz"
sample: ["example"]
# Per-sample settings; 'adapter' is the 3' adapter clipped by rule clip_reads
example: {adapter: GATCGGAAGAGCACA}
##############################################################################
### Parameters for independent rules/jobs
##############################################################################
...
snakemake --dag -np | dot -Tpng > dag.png
snakemake/process_data/dag.png

14.7 KiB

# set -e
# Submit the workflow to a SLURM cluster: per-rule resources come from
# cluster.json (threads/mem/queue/time), each job's stdout/stderr goes to the
# rule's params.cluster_log (suffixed with job id %j and node name %N), and
# rules run inside their Singularity images with $PWD bind-mounted.
snakemake \
--cluster-config cluster.json \
--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={params.cluster_log}-%j-%N -p scicore" \
--cores 256 \
-p \
--rerun-incomplete \
--use-singularity \
--singularity-args "--bind ${PWD}"
\ No newline at end of file
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment