Added mapping steps in snakemake process_data pipeline

9547714c · BIOPZ-Gypas Foivos · d99c1d48 · 9547714c · 9547714c · 9547714c
Commit 9547714c authored 7 years ago by BIOPZ-Gypas Foivos
--- a/snakemake/docker/segemehl/0.2.0/Dockerfile
+++ b/snakemake/docker/segemehl/0.2.0/Dockerfile
@@ -23,25 +23,15 @@ ENV SOFTWARE_VERSION 0.2.0
 ##### INSTALL #####
 RUN apt-get update -y \
-  && apt-get install -y wget python build-essential libz-dev gcc libgl1-mesa-dev
+  && apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \
+  && wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \
-## Do your install stuff, merge with '&&' whenever possible, don't create unnecessary layers
+  && tar xzvf segemehl_0_2_0.tar.gz \
-#  && wget https://.../v${SOFTWARE_VERSION}/software-${SOFTWARE_VERSION}.tar.gz \
+  && cd segemehl_0_2_0/segemehl/ \
-#  && tar -zxvf software-${SOFTWARE_VERSION}.tar.gz \
+  && make \
-#  && cd software \
+  && cd ../.. \
-#  && make \
+  && cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \
-## Copy binaries to location in PATH
+  && cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \
-#  && cp bin/* /usr/bin \
+  && rm -rf segemehl_0_2_0* segemehl_0_2_0* \
-## Remove files no longer needed
+  && apt-get purge -y wget \
-#  && cd .. \
+  && apt-get autoremove -y && apt-get clean \
-#  && rm -r /intermediate/folder software-${SOFTWARE_VERSION}.tar.gz \
+  && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-## Clean up
-#  && apt-get purge -y wget \
-#  && apt-get autoremove -y && apt-get clean \
-#  && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
-#
-###### SET PATH (if not in /usr/bin/) #####
-#ENV PATH /path/to/binary:$PATH
-#
-###### WORKING DIRECTORY #####
-#WORKDIR /data/
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
@@ -9,7 +9,7 @@ localrules: create_output_and_log_directories, concat_samples, finish
 rule finish:
 	input:
-		reads = expand(os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), sample=config["sample"])
+		sam = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"), sample=config["sample"])
 #################################################################################
 ### Create output and log directories
@@ -131,7 +131,7 @@ rule fastq_to_fasta:
 	input:
 		reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
 	output:
-		reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"),
+		reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
 	params:
 		v = "-v",
 		qual = "-Q33",
@@ -149,226 +149,69 @@ rule fastq_to_fasta:
 		{params.qual} \
 		{params.n} \
 		{params.r} \
-		{params.z} \
 		-i <(zcat {input.reads}) \
 		-o {output.reads}) &> {log}"
-# #################################################################################
+#################################################################################
-# ### Trim 3' adapters
+### Map reads to other genes (rRNA, tRNA, etc...)
-# #################################################################################
+#################################################################################
-# rule trim_3p_adapter:
-# 	input:
-# 		flag = config["dir_created"],
-# 		reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
-# 	output:
-# 		reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
-# 	params:
-# 		adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
-# 		error_rate = config["error_rate"],
-# 		minimum_length = config["minimum_length"],
-# 		overlap = config["overlap"],
-# 		cluster_log = os.path.join(config["cluster_log"], "trim_3p_adapter_{sample}.log"),
-# 		activate = config["activate"],
-# 		env = config["env"]
-# 	log:
-# 		os.path.join(config["local_log"], "trim_3p_adapter_{sample}.log")
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		cutadapt \
-# 		--adapter {params.adapter} \
-# 		--error-rate {params.error_rate} \
-# 		--minimum-length {params.minimum_length} \
-# 		--overlap {params.overlap} \
-# 		{input.reads} | gzip > {output.reads}) 2> {log}"
-# #################################################################################
-# ### Concatenate fastq files
-# #################################################################################
-# rule concat_samples:
-# 	input:
-# 		reads = expand(os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),sample=config["sample"])
-# 	output:
-# 		reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz")
-# 	params:
-# 		cluster_log = os.path.join(config["cluster_log"], "concat_samples.log"),
-# 		activate = config["activate"],
-# 		env = config["env"]
-# 	log:
-# 		os.path.join(config["local_log"], "concat_samples.log")
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		cat {input.reads} > {output.reads}) 2> {log}"
-# #################################################################################
-# ### Align reads STAR
-# #################################################################################
-# rule align_reads_STAR:
-# 	input:
-# 		index = config["STAR_index"],
-# 		reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz"),
-# 		gtf = config["gtf"]
-# 	output:
-# 		outputfile = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
-# 	params:
-# 		outFileNamePrefix = os.path.join(config["output_dir"], "STAR_"),
-# 		cluster_log = os.path.join(config["cluster_log"], "align_reads_STAR.log"),
-# 		activate = config["activate"],
-# 		env = config["env"]
-# 	threads:	8
-# 	log:
-# 		os.path.join(config["local_log"],"align_reads_STAR.log")
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		STAR --runMode alignReads \
-# 		--twopassMode Basic \
-# 		--runThreadN {threads} \
-# 		--genomeDir {input.index} \
-# 		--sjdbGTFfile {input.gtf} \
-# 		--readFilesIn {input.reads} \
-# 		--readFilesCommand zcat \
-# 		--outFileNamePrefix {params.outFileNamePrefix} \
-# 		--outSAMtype BAM Unsorted) 2> {log}"
-# #################################################################################
-# ### Sort alignment file
-# #################################################################################
-# rule sort_bam:
-# 	input:
-# 		bam = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
-# 	output:
-# 		bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
-# 	params:
-# 		cluster_log = os.path.join(config["cluster_log"], "sort_bam.log"),
-# 		activate = config["activate"],
-# 		env = config["env"]
-# 	threads:	8
-# 	log:
-# 		os.path.join(config["local_log"],"sort_bam.log")
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		samtools sort -@ {threads} {input.bam} > {output.bam}) 2> {log}"
-# #################################################################################
-# ### Index alignment file
-# #################################################################################
-# rule samtools_index:
-# 	input:
-# 		bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
-# 	output:
-# 		bai = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam.bai")
-# 	params:
-# 		cluster_log = os.path.join(config["cluster_log"], "samtools_index.log"),
-# 		activate = config["activate"],
-# 		env = config["env"]
-# 	log:
-# 		os.path.join(config["local_log"],"samtools_index.log")
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		samtools index {input.bam} > {output.bai}) 2> {log}"
-# ##################################################################################
+rule map_to_other_genes:
-# ### Quantify
+	input:
-# ##################################################################################
+		reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
+		index = config["other_RNAs_index"],
+		sequence = config["other_RNAs_sequence"]
+	output:
+		sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"),
+		reads = os.path.join(config["output_dir"], "{sample}/other_genes.unmapped.fasta")
+	params:
+		silent = "--silent",
+		accuracy = "90",
+		cluster_log = os.path.join(config["cluster_log"], "map_to_other_genes_{sample}.log")
+	log:
+		os.path.join(config["local_log"], "map_to_other_genes_{sample}.log")
+	threads:	8
+	singularity:
+		"docker://fgypas/segemehl:0.2.0"
+	shell:
+		"(segemehl.x \
+		{params.silent} \
+		-i {input.index} \
+		-d {input.sequence} \
+		-q {input.reads} \
+		--accuracy {params.accuracy} \
+		--threads {threads} \
+		-o {output.sam} \
+		-u {output.reads} ) &> {log}"
-# rule salmon_quant:
+#################################################################################
-# 	input:
+### Map reads to other genes (rRNA, tRNA, etc...)
-# 		flag = config["dir_created"],
+#################################################################################
-# 		reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
-# 		gtf = config["filtered_gtf"],
-# 		index = config["filtered_transcripts_salmon_index"]
-# 	output:
-# 		output = os.path.join(config["output_dir"], "{sample}/salmon_quant"),
-# 		tr_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.sf"),
-# 		gn_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.genes.sf")
-# 	params:
-# 		libType = "A",
-# 		fldMean = "300",
-# 		fldSD = "100",
-# 		activate = config["activate"],
-# 		env = config["env"],
-# 		cluster_log = os.path.join(config["cluster_log"], "salmon_quant_{sample}.log")
-# 	log:
-# 		os.path.join(config["local_log"], "salmon_quant_{sample}.log")
-# 	threads:	6
-# 	shell:
-# 		"(set +u; source {params.activate} {params.env}; set -u; \
-# 		salmon quant \
-# 		--index {input.index} \
-# 		--libType {params.libType} \
-# 		--unmatedReads {input.reads} \
-# 		--seqBias \
-# 		--geneMap {input.gtf} \
-# 		--fldMean {params.fldMean} \
-# 		--fldSD {params.fldSD} \
-# 		--useVBOpt \
-# 		--threads {threads} \
-# 		--output {output.output}) > {log}"
-# ###################################################################################
+rule map_to_transcripts:
-# #### Map reads to rRNA
+	input:
-# ###################################################################################
+		reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
-# #
+		index = config["transcripts_index"],
-# #rule map_reads_to_rRNA:
+		sequence = config["transcripts_sequence"]
-# #	input:
+	output:
-# #		rRNA = config["rRNA"],
+		sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"),
-# #		rRNA_index = config["rRNA_index"],
+		reads = os.path.join(config["output_dir"], "{sample}/transcripts.unmapped.fasta")
-# #		reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
+	params:
-# #	output:
+		silent = "--silent",
-# #		mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
+		accuracy = "90",
-# #		unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
+		cluster_log = os.path.join(config["cluster_log"], "map_to_transcripts_{sample}.log")
-# #	params:
+	log:
-# #		activate = config["activate"],
+		os.path.join(config["local_log"], "map_to_transcripts_{sample}.log")
-# #		env = config["env"],
+	threads:	8
-# #		cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
+	singularity:
-# #		unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
+		"docker://fgypas/segemehl:0.2.0"
-# #	log:
+	shell:
-# #		os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
+		"(segemehl.x \
-# #	threads:	4
+		{params.silent} \
-# #	shell:
+		-i {input.index} \
-# #		"(set +u; source {params.activate} {params.env}; set -u; \
+		-d {input.sequence} \
-# #		segemehl.x \
+		-q {input.reads} \
-# #		-i {input.rRNA_index} \
+		--accuracy {params.accuracy} \
-# #		-d {input.rRNA} \
+		--threads {threads} \
-# #		-q {input.reads} \
+		-o {output.sam} \
-# #		--accuracy 90 \
+		-u {output.reads} ) &> {log}"
-# #		--threads {threads} \
-# #		-o {output.mapped_reads_to_rRNA} \
-# #		-u {params.unmapped_reads_to_rRNA}; \
-# #		gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
-# #
-# ####################################################################################
-# ##### Map reads to transcripts
-# ####################################################################################
-# ##
-# ##rule map_reads_to_transcripts_segemehll:
-# ##	input:
-# ##		transcripts = config["filtered_transcripts"],
-# ##		index = config["segemehl_filtered_transcripts_index"],
-# ##		reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
-# ##	output:
-# ##		mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
-# ##		unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
-# ##	params:
-# ##		activate = config["activate"],
-# ##		env = config["env"],
-# ##		cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
-# ##		unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
-# ##	log:
-# ##		os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
-# ##	threads:	4
-# ##	shell:
-# ##		"(set +u; source {params.activate} {params.env}; set -u; \
-# ##		segemehl.x \
-# ##		-i {input.rRNA_index} \
-# ##		-d {input.rRNA} \
-# ##		-q {input.reads} \
-# ##		--accuracy 90 \
-# ##		--threads {threads} \
-# ##		-o {output.mapped_reads_to_rRNA} \
-# ##		-u {params.unmapped_reads_to_rRNA}; \
-# ##		gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
--- a/snakemake/process_data/config.yaml
+++ b/snakemake/process_data/config.yaml
@@ -2,7 +2,10 @@
  ##############################################################################
  ### Annotation
  ##############################################################################
+  other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
+  other_RNAs_index: "../annotation/txome_rRNAs_joao.idx"
+  transcripts_sequence: "../annotation/hg38_refseq_20171127_rep.fa"
+  transcripts_index: "../annotation/hg38_refseq_20171127_rep.idx"
  ##############################################################################
  ### Output and log directory
  ##############################################################################
@@ -17,8 +20,4 @@
  input_reads_pattern: ".fastq.gz"
  sample: ["example"]
  example: {adapter: GATCGGAAGAGCACA}
-  ##############################################################################
-  ### Parameters for independent rules/jobs
-  ##############################################################################
 ...
--- a/snakemake/process_data/dag.png
+++ b/snakemake/process_data/dag.png