Skip to content
Snippets Groups Projects
Commit 9547714c authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

Added mapping steps in snakemake process_data pipeline

parent d99c1d48
No related branches found
No related tags found
No related merge requests found
...@@ -23,25 +23,15 @@ ENV SOFTWARE_VERSION 0.2.0 ...@@ -23,25 +23,15 @@ ENV SOFTWARE_VERSION 0.2.0
##### INSTALL ##### ##### INSTALL #####
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y wget python build-essential libz-dev gcc libgl1-mesa-dev && apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \
&& wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \
## Do your install stuff, merge with '&&' whenever possible, don't create unnecessary layers && tar xzvf segemehl_0_2_0.tar.gz \
# && wget https://.../v${SOFTWARE_VERSION}/software-${SOFTWARE_VERSION}.tar.gz \ && cd segemehl_0_2_0/segemehl/ \
# && tar -zxvf software-${SOFTWARE_VERSION}.tar.gz \ && make \
# && cd software \ && cd ../.. \
# && make \ && cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \
## Copy binaries to location in PATH && cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \
# && cp bin/* /usr/bin \ && rm -rf segemehl_0_2_0* segemehl_0_2_0* \
## Remove files no longer needed && apt-get purge -y wget \
# && cd .. \ && apt-get autoremove -y && apt-get clean \
# && rm -r /intermediate/folder software-${SOFTWARE_VERSION}.tar.gz \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
## Clean up
# && apt-get purge -y wget \
# && apt-get autoremove -y && apt-get clean \
# && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
#
###### SET PATH (if not in /usr/bin/) #####
#ENV PATH /path/to/binary:$PATH
#
###### WORKING DIRECTORY #####
#WORKDIR /data/
...@@ -9,7 +9,7 @@ localrules: create_output_and_log_directories, concat_samples, finish ...@@ -9,7 +9,7 @@ localrules: create_output_and_log_directories, concat_samples, finish
rule finish: rule finish:
input: input:
reads = expand(os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), sample=config["sample"]) sam = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"), sample=config["sample"])
################################################################################# #################################################################################
### Create output and log directories ### Create output and log directories
...@@ -131,7 +131,7 @@ rule fastq_to_fasta: ...@@ -131,7 +131,7 @@ rule fastq_to_fasta:
input: input:
reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"), reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"),
output: output:
reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
params: params:
v = "-v", v = "-v",
qual = "-Q33", qual = "-Q33",
...@@ -149,226 +149,69 @@ rule fastq_to_fasta: ...@@ -149,226 +149,69 @@ rule fastq_to_fasta:
{params.qual} \ {params.qual} \
{params.n} \ {params.n} \
{params.r} \ {params.r} \
{params.z} \
-i <(zcat {input.reads}) \ -i <(zcat {input.reads}) \
-o {output.reads}) &> {log}" -o {output.reads}) &> {log}"
# ################################################################################# #################################################################################
# ### Trim 3' adapters ### Map reads to other genes (rRNA, tRNA, etc...)
# ################################################################################# #################################################################################
# rule trim_3p_adapter:
# input:
# flag = config["dir_created"],
# reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
# output:
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# params:
# adapter = lambda wildcards: config[ wildcards.sample ]['adapter'],
# error_rate = config["error_rate"],
# minimum_length = config["minimum_length"],
# overlap = config["overlap"],
# cluster_log = os.path.join(config["cluster_log"], "trim_3p_adapter_{sample}.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "trim_3p_adapter_{sample}.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cutadapt \
# --adapter {params.adapter} \
# --error-rate {params.error_rate} \
# --minimum-length {params.minimum_length} \
# --overlap {params.overlap} \
# {input.reads} | gzip > {output.reads}) 2> {log}"
# #################################################################################
# ### Concatenate fastq files
# #################################################################################
# rule concat_samples:
# input:
# reads = expand(os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),sample=config["sample"])
# output:
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz")
# params:
# cluster_log = os.path.join(config["cluster_log"], "concat_samples.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"], "concat_samples.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# cat {input.reads} > {output.reads}) 2> {log}"
# #################################################################################
# ### Align reads STAR
# #################################################################################
# rule align_reads_STAR:
# input:
# index = config["STAR_index"],
# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz"),
# gtf = config["gtf"]
# output:
# outputfile = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# params:
# outFileNamePrefix = os.path.join(config["output_dir"], "STAR_"),
# cluster_log = os.path.join(config["cluster_log"], "align_reads_STAR.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"align_reads_STAR.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# STAR --runMode alignReads \
# --twopassMode Basic \
# --runThreadN {threads} \
# --genomeDir {input.index} \
# --sjdbGTFfile {input.gtf} \
# --readFilesIn {input.reads} \
# --readFilesCommand zcat \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outSAMtype BAM Unsorted) 2> {log}"
# #################################################################################
# ### Sort alignment file
# #################################################################################
# rule sort_bam:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.bam")
# output:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# params:
# cluster_log = os.path.join(config["cluster_log"], "sort_bam.log"),
# activate = config["activate"],
# env = config["env"]
# threads: 8
# log:
# os.path.join(config["local_log"],"sort_bam.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools sort -@ {threads} {input.bam} > {output.bam}) 2> {log}"
# #################################################################################
# ### Index alignment file
# #################################################################################
# rule samtools_index:
# input:
# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam")
# output:
# bai = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam.bai")
# params:
# cluster_log = os.path.join(config["cluster_log"], "samtools_index.log"),
# activate = config["activate"],
# env = config["env"]
# log:
# os.path.join(config["local_log"],"samtools_index.log")
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# samtools index {input.bam} > {output.bai}) 2> {log}"
# ################################################################################## rule map_to_other_genes:
# ### Quantify input:
# ################################################################################## reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
index = config["other_RNAs_index"],
sequence = config["other_RNAs_sequence"]
output:
sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"),
reads = os.path.join(config["output_dir"], "{sample}/other_genes.unmapped.fasta")
params:
silent = "--silent",
accuracy = "90",
cluster_log = os.path.join(config["cluster_log"], "map_to_other_genes_{sample}.log")
log:
os.path.join(config["local_log"], "map_to_other_genes_{sample}.log")
threads: 8
singularity:
"docker://fgypas/segemehl:0.2.0"
shell:
"(segemehl.x \
{params.silent} \
-i {input.index} \
-d {input.sequence} \
-q {input.reads} \
--accuracy {params.accuracy} \
--threads {threads} \
-o {output.sam} \
-u {output.reads} ) &> {log}"
# rule salmon_quant: #################################################################################
# input: ### Map reads to other genes (rRNA, tRNA, etc...)
# flag = config["dir_created"], #################################################################################
# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),
# gtf = config["filtered_gtf"],
# index = config["filtered_transcripts_salmon_index"]
# output:
# output = os.path.join(config["output_dir"], "{sample}/salmon_quant"),
# tr_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.sf"),
# gn_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.genes.sf")
# params:
# libType = "A",
# fldMean = "300",
# fldSD = "100",
# activate = config["activate"],
# env = config["env"],
# cluster_log = os.path.join(config["cluster_log"], "salmon_quant_{sample}.log")
# log:
# os.path.join(config["local_log"], "salmon_quant_{sample}.log")
# threads: 6
# shell:
# "(set +u; source {params.activate} {params.env}; set -u; \
# salmon quant \
# --index {input.index} \
# --libType {params.libType} \
# --unmatedReads {input.reads} \
# --seqBias \
# --geneMap {input.gtf} \
# --fldMean {params.fldMean} \
# --fldSD {params.fldSD} \
# --useVBOpt \
# --threads {threads} \
# --output {output.output}) > {log}"
# ################################################################################### rule map_to_transcripts:
# #### Map reads to rRNA input:
# ################################################################################### reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"),
# # index = config["transcripts_index"],
# #rule map_reads_to_rRNA: sequence = config["transcripts_sequence"]
# # input: output:
# # rRNA = config["rRNA"], sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"),
# # rRNA_index = config["rRNA_index"], reads = os.path.join(config["output_dir"], "{sample}/transcripts.unmapped.fasta")
# # reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz") params:
# # output: silent = "--silent",
# # mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"), accuracy = "90",
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz") cluster_log = os.path.join(config["cluster_log"], "map_to_transcripts_{sample}.log")
# # params: log:
# # activate = config["activate"], os.path.join(config["local_log"], "map_to_transcripts_{sample}.log")
# # env = config["env"], threads: 8
# # cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"), singularity:
# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq") "docker://fgypas/segemehl:0.2.0"
# # log: shell:
# # os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log") "(segemehl.x \
# # threads: 4 {params.silent} \
# # shell: -i {input.index} \
# # "(set +u; source {params.activate} {params.env}; set -u; \ -d {input.sequence} \
# # segemehl.x \ -q {input.reads} \
# # -i {input.rRNA_index} \ --accuracy {params.accuracy} \
# # -d {input.rRNA} \ --threads {threads} \
# # -q {input.reads} \ -o {output.sam} \
# # --accuracy 90 \ -u {output.reads} ) &> {log}"
# # --threads {threads} \
# # -o {output.mapped_reads_to_rRNA} \
# # -u {params.unmapped_reads_to_rRNA}; \
# # gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
# #
# ####################################################################################
# ##### Map reads to transcripts
# ####################################################################################
# ##
# ##rule map_reads_to_transcripts_segemehll:
# ## input:
# ## transcripts = config["filtered_transcripts"],
# ## index = config["segemehl_filtered_transcripts_index"],
# ## reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz")
# ## output:
# ## mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz")
# ## params:
# ## activate = config["activate"],
# ## env = config["env"],
# ## cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"),
# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq")
# ## log:
# ## os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log")
# ## threads: 4
# ## shell:
# ## "(set +u; source {params.activate} {params.env}; set -u; \
# ## segemehl.x \
# ## -i {input.rRNA_index} \
# ## -d {input.rRNA} \
# ## -q {input.reads} \
# ## --accuracy 90 \
# ## --threads {threads} \
# ## -o {output.mapped_reads_to_rRNA} \
# ## -u {params.unmapped_reads_to_rRNA}; \
# ## gzip {params.unmapped_reads_to_rRNA}) 2> {log}"
...@@ -2,7 +2,10 @@ ...@@ -2,7 +2,10 @@
############################################################################## ##############################################################################
### Annotation ### Annotation
############################################################################## ##############################################################################
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
other_RNAs_index: "../annotation/txome_rRNAs_joao.idx"
transcripts_sequence: "../annotation/hg38_refseq_20171127_rep.fa"
transcripts_index: "../annotation/hg38_refseq_20171127_rep.idx"
############################################################################## ##############################################################################
### Output and log directory ### Output and log directory
############################################################################## ##############################################################################
...@@ -17,8 +20,4 @@ ...@@ -17,8 +20,4 @@
input_reads_pattern: ".fastq.gz" input_reads_pattern: ".fastq.gz"
sample: ["example"] sample: ["example"]
example: {adapter: GATCGGAAGAGCACA} example: {adapter: GATCGGAAGAGCACA}
##############################################################################
### Parameters for independent rules/jobs
##############################################################################
... ...
snakemake/process_data/dag.png

14.7 KiB | W: | H:

snakemake/process_data/dag.png

17.4 KiB | W: | H:

snakemake/process_data/dag.png
snakemake/process_data/dag.png
snakemake/process_data/dag.png
snakemake/process_data/dag.png
  • 2-up
  • Swipe
  • Onion skin
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment