From 52c1de75dfc3122d2172d062afc924b7a3b19273 Mon Sep 17 00:00:00 2001
From: BIOPZ-Gypas Foivos <foivos.gypas@unibas.ch>
Date: Tue, 19 Feb 2019 18:50:02 +0100
Subject: [PATCH] Addition of basic steps for process data

---
 .gitignore                                 |   3 +-
 process_data/.gitignore                    |   7 +
 process_data/Snakefile                     | 182 +++++++++++++++++++++
 process_data/cluster.json                  |  41 +++++
 process_data/config.yaml                   |  24 +++
 process_data/create_snakemake_flowchart.sh |   1 +
 process_data/run_snakefile.sh              |  13 ++
 7 files changed, 269 insertions(+), 2 deletions(-)
 create mode 100644 process_data/.gitignore
 create mode 100644 process_data/Snakefile
 create mode 100644 process_data/cluster.json
 create mode 100644 process_data/config.yaml
 create mode 100755 process_data/create_snakemake_flowchart.sh
 create mode 100755 process_data/run_snakefile.sh

diff --git a/.gitignore b/.gitignore
index ea553e2..8b13789 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1 @@
-download_annotation
-process_data
+
diff --git a/process_data/.gitignore b/process_data/.gitignore
new file mode 100644
index 0000000..58c3d7f
--- /dev/null
+++ b/process_data/.gitignore
@@ -0,0 +1,7 @@
+.*
+Log.out
+results
+logs
+dag.png
+nohup.out
+samples
diff --git a/process_data/Snakefile b/process_data/Snakefile
new file mode 100644
index 0000000..edd9baa
--- /dev/null
+++ b/process_data/Snakefile
@@ -0,0 +1,182 @@
+configfile: "config.yaml"
+
+localrules: finish
+
+#################################################################################
+### Final rule
+#################################################################################
+
+rule finish:
+	input:
+		fastqc = expand(os.path.join(config["output_dir"], "{sample}", "fastqc"), sample=config["sample"]),
+		htseq_qa = expand(os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf"), sample=config["sample"]),
+		gn_estimates = expand(os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"), sample=config["sample"]),
+		bam = expand(os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam"), sample=config["sample"])
+
+##################################################################################
+### Fastqc
+##################################################################################
+
+rule fastqc:
+	input:
+		reads = os.path.join(config["input_dir"], "{sample}.fastq.gz")
+	output:
+		outdir = os.path.join(config["output_dir"], "{sample}", "fastqc")
+	singularity:
+		"docker://zavolab/fastqc:0.11.8"
+	log:
+		os.path.join(config["local_log"], "fastqc_{sample}.log")
+	shell:
+		"(mkdir -p {output.outdir}; \
+		fastqc \
+		--outdir {output.outdir} \
+		{input.reads}) &> {log}"
+
+##################################################################################
+### HTSeq quality assessment of the fastq file
+##################################################################################
+
+rule htseq_qa:
+	input:
+		reads = os.path.join(config["input_dir"], "{sample}.fastq.gz")
+	output:
+		qual_pdf = os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf")
+	singularity:
+		"docker://zavolab/python_htseq:3.6.5_0.10.0"
+	log:
+		os.path.join(config["local_log"], "htseq_qa_{sample}.log")
+	shell:
+		"(htseq-qa \
+		-t fastq \
+		-o {output.qual_pdf} \
+        {input.reads} ) &> {log}"
+
+##################################################################################
+### Map to other RNAs with Segemehl
+##################################################################################
+
+rule map_to_other_RNAs:
+	input:
+		reads = os.path.join(config["input_dir"], "{sample}.fastq.gz"),
+		index = config["other_RNAs_index"],
+		sequence = config["other_RNAs_sequence"]
+	output:
+		sam = os.path.join(config["output_dir"], "{sample}", "other_genes.mapped.sam"),
+		reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz")
+	params:
+		reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq"),
+		silent = "--silent",
+		accuracy = "90"
+	log:
+		os.path.join(config["local_log"], "map_to_other_genes_{sample}.log")
+	threads:	8
+	singularity:
+		"docker://zavolab/segemehl:0.2.0"
+	shell:
+		"(segemehl.x \
+		{params.silent} \
+		-i {input.index} \
+		-d {input.sequence} \
+		-q {input.reads} \
+		--accuracy {params.accuracy} \
+		--threads {threads} \
+		-o {output.sam} \
+		-u {params.reads}; \
+		gzip -c {params.reads} > {output.reads}; \
+		rm {params.reads}) &> {log}"
+
+##################################################################################
+### salmon quant
+##################################################################################
+
+rule salmon_quant:
+	input:
+		reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"),
+		gtf = config["annotation_filtered"],
+		index = config["salmon_index"]
+	output:
+		output_dir = os.path.join(config["output_dir"], "{sample}", "salmon_quant"),
+		gn_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"),
+		tr_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.sf")
+	params:
+		libType = lambda wildcards: config[wildcards.sample]['libType'],
+		fldMean = lambda wildcards: config[wildcards.sample]['fldMean'],
+		fldSD = lambda wildcards: config[wildcards.sample]['fldSD'],
+	log:
+		os.path.join(config["local_log"], "salmon_quant_{sample}.log")
+	threads:	6
+	singularity:
+		"docker://zavolab/salmon:0.11.0"
+	shell:
+		"(salmon quant \
+		--index {input.index} \
+		--libType {params.libType} \
+		--unmatedReads <(zcat {input.reads}) \
+		--seqBias \
+		--geneMap {input.gtf} \
+		--fldMean {params.fldMean} \
+		--fldSD {params.fldSD} \
+		--threads {threads} \
+		--output {output.output_dir}) &> {log}"
+
+#################################################################################
+### Align reads STAR
+#################################################################################
+
+rule align_reads_STAR:
+	input:
+		index = config["STAR_index"],
+		reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"),
+		gtf = config["annotation"]
+	output:
+		outputfile = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam")
+	params:
+		outFileNamePrefix = os.path.join(config["output_dir"], "{sample}", "STAR_")
+	log:
+		os.path.join(config["local_log"],"align_reads_STAR_{sample}.log")
+	threads:	8
+	singularity:
+		"docker://zavolab/star:2.6.0a"
+	shell:
+		"(STAR --runMode alignReads \
+		--twopassMode Basic \
+		--runThreadN {threads} \
+		--genomeDir {input.index} \
+		--sjdbGTFfile {input.gtf} \
+		--readFilesIn {input.reads} \
+		--readFilesCommand zcat \
+		--outFileNamePrefix {params.outFileNamePrefix} \
+		--outSAMtype BAM Unsorted) &> {log}"
+
+################################################################################
+### Sort alignment file
+################################################################################
+
+rule sort_bam:
+	input:
+		bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam")
+	output:
+		bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam")
+	threads:	8
+	log:
+		os.path.join(config["local_log"],"sort_bam_{sample}.log")
+	singularity:
+		"docker://zavolab/samtools:1.8"
+	shell:
+		"(samtools sort -@ {threads} {input.bam} > {output.bam}) &> {log}"
+
+################################################################################
+### Index alignment file
+################################################################################
+
+rule samtools_index:
+	input:
+		bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam")
+	output:
+		bai = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam.bai")
+	log:
+		os.path.join(config["local_log"],"samtools_index_{sample}.log")
+	singularity:
+		"docker://zavolab/samtools:1.8"
+	shell:
+		"(samtools index {input.bam} > {output.bai}) &> {log}"
diff --git a/process_data/cluster.json b/process_data/cluster.json
new file mode 100644
index 0000000..eb681fc
--- /dev/null
+++ b/process_data/cluster.json
@@ -0,0 +1,41 @@
+{
+  "__default__" :
+  {
+    "queue": "6hours",
+    "time": "01:00:00",
+    "threads": "1",
+    "mem": "4G",
+    "name": "{rule}.{wildcards}",
+    "out": "$PWD/logs/cluster_log/{rule}.{wildcards}-%j-%N.out"
+  },
+  "generate_segemehl_index_other_RNAs":
+  {
+    "time": "06:00:00",
+    "threads":"8",
+    "mem":"50G"
+  },
+  "map_to_other_RNAs":
+  {
+    "time": "06:00:00",
+    "threads":"8",
+    "mem":"50G"
+  },
+  "index_genome_STAR":
+  {
+    "time": "06:00:00",
+    "threads":"8",
+    "mem":"75G"
+  },
+  "align_reads_STAR":
+  {
+    "time": "06:00:00",
+    "threads":"8",
+    "mem":"75G"
+  },
+  "salmon_quant":
+  {
+    "time": "02:00:00",
+    "threads":"6",
+    "mem":"32G"
+  }
+}
diff --git a/process_data/config.yaml b/process_data/config.yaml
new file mode 100644
index 0000000..d2c34ba
--- /dev/null
+++ b/process_data/config.yaml
@@ -0,0 +1,24 @@
+---
+  ##############################################################################
+  ### Annotation
+  ##############################################################################
+  annotation: "../prepare_annotation/results/annotation.gtf"
+  genome: "../prepare_annotation/results/genome.fa"
+  annotation_filtered: "../prepare_annotation/results/filtered_transcripts.gtf"
+  STAR_index: "../prepare_annotation/results/STAR_index/"
+  other_RNAs_sequence: "../prepare_annotation/other.fa"
+  other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx"
+  salmon_index: "../prepare_annotation/results/filtered_transcripts_salmon.idx/"
+  ##############################################################################
+  ### Output and log directories
+  ##############################################################################
+  output_dir: "results"
+  local_log: "logs/local_log"
+  cluster_log: "logs/cluster_log"
+  ##############################################################################
+  ### Sample info
+  ##############################################################################
+  input_dir: "samples"
+  sample: ["test"]
+  test: {libType: A, fldMean: 300, fldSD: 100}
+...
diff --git a/process_data/create_snakemake_flowchart.sh b/process_data/create_snakemake_flowchart.sh
new file mode 100755
index 0000000..ce71a9a
--- /dev/null
+++ b/process_data/create_snakemake_flowchart.sh
@@ -0,0 +1 @@
+snakemake --dag -np | dot -Tpng > dag.png
diff --git a/process_data/run_snakefile.sh b/process_data/run_snakefile.sh
new file mode 100755
index 0000000..ceec8a8
--- /dev/null
+++ b/process_data/run_snakefile.sh
@@ -0,0 +1,13 @@
+# set -e
+
+mkdir -p logs/cluster_log
+mkdir -p logs/local_log
+
+snakemake \
+--cluster-config cluster.json \
+--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --job-name={cluster.name} -o {cluster.out} -p scicore" \
+--cores 256 \
+-p \
+--rerun-incomplete \
+--use-singularity \
+--singularity-args "--bind ${PWD}"
-- 
GitLab