diff --git a/snakemake/docker/segemehl/0.2.0/Dockerfile b/docker/segemehl/0.2.0/Dockerfile similarity index 53% rename from snakemake/docker/segemehl/0.2.0/Dockerfile rename to docker/segemehl/0.2.0/Dockerfile index 26d608d8c2908a9add9002649d87171c31f507ea..f9e20e2c93bdde13970aade578f08507ace32532 100644 --- a/snakemake/docker/segemehl/0.2.0/Dockerfile +++ b/docker/segemehl/0.2.0/Dockerfile @@ -23,25 +23,15 @@ ENV SOFTWARE_VERSION 0.2.0 ##### INSTALL ##### RUN apt-get update -y \ - && apt-get install -y wget python build-essential libz-dev gcc libgl1-mesa-dev - -## Do your install stuff, merge with '&&' whenever possible, don't create unnecessary layers -# && wget https://.../v${SOFTWARE_VERSION}/software-${SOFTWARE_VERSION}.tar.gz \ -# && tar -zxvf software-${SOFTWARE_VERSION}.tar.gz \ -# && cd software \ -# && make \ -## Copy binaries to location in PATH -# && cp bin/* /usr/bin \ -## Remove files no longer needed -# && cd .. \ -# && rm -r /intermediate/folder software-${SOFTWARE_VERSION}.tar.gz \ -## Clean up -# && apt-get purge -y wget \ -# && apt-get autoremove -y && apt-get clean \ -# && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ -# -###### SET PATH (if not in /usr/bin/) ##### -#ENV PATH /path/to/binary:$PATH -# -###### WORKING DIRECTORY ##### -#WORKDIR /data/ + && apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \ + && wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \ + && tar xzvf segemehl_0_2_0.tar.gz \ + && cd segemehl_0_2_0/segemehl/ \ + && make \ + && cd ../.. \ + && cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \ + && cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \ + && rm -rf segemehl_0_2_0* segemehl_0_2_0* \ + && apt-get purge -y wget \ + && apt-get autoremove -y && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile index 8923e9b0e4ac2996c9ed296b2f0d05b47d01b251..0f2528d30e2bc7fd5125f1b2402d0f7fea2d2ee5 100644 --- a/snakemake/process_data/Snakefile +++ b/snakemake/process_data/Snakefile @@ -9,7 +9,7 @@ localrules: create_output_and_log_directories, concat_samples, finish rule finish: input: - reads = expand(os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), sample=config["sample"]) + sam = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"), sample=config["sample"]) ################################################################################# ### Create output and log directories @@ -131,7 +131,7 @@ rule fastq_to_fasta: input: reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fastq.gz"), output: - reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta.gz"), + reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"), params: v = "-v", qual = "-Q33", @@ -149,226 +149,69 @@ rule fastq_to_fasta: {params.qual} \ {params.n} \ {params.r} \ - {params.z} \ -i <(zcat {input.reads}) \ -o {output.reads}) &> {log}" -# ################################################################################# -# ### Trim 3' adapters -# ################################################################################# - -# rule trim_3p_adapter: -# input: -# flag = config["dir_created"], -# reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]), -# output: -# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"), -# params: -# adapter = lambda wildcards: config[ wildcards.sample ]['adapter'], -# error_rate = config["error_rate"], -# minimum_length = config["minimum_length"], -# overlap = config["overlap"], -# cluster_log = os.path.join(config["cluster_log"], "trim_3p_adapter_{sample}.log"), -# activate = config["activate"], -# env = config["env"] -# log: -# os.path.join(config["local_log"], "trim_3p_adapter_{sample}.log") -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# cutadapt \ -# --adapter {params.adapter} \ -# --error-rate {params.error_rate} \ -# --minimum-length {params.minimum_length} \ -# --overlap {params.overlap} \ -# {input.reads} | gzip > {output.reads}) 2> {log}" - -# ################################################################################# -# ### Concatenate fastq files -# ################################################################################# - -# rule concat_samples: -# input: -# reads = expand(os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"),sample=config["sample"]) -# output: -# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz") -# params: -# cluster_log = os.path.join(config["cluster_log"], "concat_samples.log"), -# activate = config["activate"], -# env = config["env"] -# log: -# os.path.join(config["local_log"], "concat_samples.log") -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# cat {input.reads} > {output.reads}) 2> {log}" - -# ################################################################################# -# ### Align reads STAR -# ################################################################################# - -# rule align_reads_STAR: -# input: -# index = config["STAR_index"], -# reads = os.path.join(config["output_dir"],"merged_reads.fastq.gz"), -# gtf = config["gtf"] -# output: -# outputfile = os.path.join(config["output_dir"], "STAR_Aligned.out.bam") -# params: -# outFileNamePrefix = os.path.join(config["output_dir"], "STAR_"), -# cluster_log = os.path.join(config["cluster_log"], "align_reads_STAR.log"), -# activate = config["activate"], -# env = config["env"] -# threads: 8 -# log: -# os.path.join(config["local_log"],"align_reads_STAR.log") -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# STAR --runMode alignReads \ -# --twopassMode Basic \ -# --runThreadN {threads} \ -# --genomeDir {input.index} \ -# --sjdbGTFfile {input.gtf} \ -# --readFilesIn {input.reads} \ -# --readFilesCommand zcat \ -# --outFileNamePrefix {params.outFileNamePrefix} \ -# --outSAMtype BAM Unsorted) 2> {log}" - -# ################################################################################# -# ### Sort alignment file -# ################################################################################# - -# rule sort_bam: -# input: -# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.bam") -# output: -# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam") -# params: -# cluster_log = os.path.join(config["cluster_log"], "sort_bam.log"), -# activate = config["activate"], -# env = config["env"] -# threads: 8 -# log: -# os.path.join(config["local_log"],"sort_bam.log") -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# samtools sort -@ {threads} {input.bam} > {output.bam}) 2> {log}" - -# ################################################################################# -# ### Index alignment file -# ################################################################################# - -# rule samtools_index: -# input: -# bam = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam") -# output: -# bai = os.path.join(config["output_dir"], "STAR_Aligned.out.sorted.bam.bai") -# params: -# cluster_log = os.path.join(config["cluster_log"], "samtools_index.log"), -# activate = config["activate"], -# env = config["env"] -# log: -# os.path.join(config["local_log"],"samtools_index.log") -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# samtools index {input.bam} > {output.bai}) 2> {log}" +################################################################################# +### Map reads to other genes (rRNA, tRNA, etc...) +################################################################################# -# ################################################################################## -# ### Quantify -# ################################################################################## +rule map_to_other_genes: + input: + reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"), + index = config["other_RNAs_index"], + sequence = config["other_RNAs_sequence"] + output: + sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"), + reads = os.path.join(config["output_dir"], "{sample}/other_genes.unmapped.fasta") + params: + silent = "--silent", + accuracy = "90", + cluster_log = os.path.join(config["cluster_log"], "map_to_other_genes_{sample}.log") + log: + os.path.join(config["local_log"], "map_to_other_genes_{sample}.log") + threads: 8 + singularity: + "docker://fgypas/segemehl:0.2.0" + shell: + "(segemehl.x \ + {params.silent} \ + -i {input.index} \ + -d {input.sequence} \ + -q {input.reads} \ + --accuracy {params.accuracy} \ + --threads {threads} \ + -o {output.sam} \ + -u {output.reads} ) &> {log}" -# rule salmon_quant: -# input: -# flag = config["dir_created"], -# reads = os.path.join(config["output_dir"], "{sample}/trim_3p_adapter.fastq.gz"), -# gtf = config["filtered_gtf"], -# index = config["filtered_transcripts_salmon_index"] -# output: -# output = os.path.join(config["output_dir"], "{sample}/salmon_quant"), -# tr_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.sf"), -# gn_estimates = os.path.join(config["output_dir"], "{sample}/salmon_quant/quant.genes.sf") -# params: -# libType = "A", -# fldMean = "300", -# fldSD = "100", -# activate = config["activate"], -# env = config["env"], -# cluster_log = os.path.join(config["cluster_log"], "salmon_quant_{sample}.log") -# log: -# os.path.join(config["local_log"], "salmon_quant_{sample}.log") -# threads: 6 -# shell: -# "(set +u; source {params.activate} {params.env}; set -u; \ -# salmon quant \ -# --index {input.index} \ -# --libType {params.libType} \ -# --unmatedReads {input.reads} \ -# --seqBias \ -# --geneMap {input.gtf} \ -# --fldMean {params.fldMean} \ -# --fldSD {params.fldSD} \ -# --useVBOpt \ -# --threads {threads} \ -# --output {output.output}) > {log}" +################################################################################# +### Map reads to other genes (rRNA, tRNA, etc...) +################################################################################# -# ################################################################################### -# #### Map reads to rRNA -# ################################################################################### -# # -# #rule map_reads_to_rRNA: -# # input: -# # rRNA = config["rRNA"], -# # rRNA_index = config["rRNA_index"], -# # reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz") -# # output: -# # mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"), -# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz") -# # params: -# # activate = config["activate"], -# # env = config["env"], -# # cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"), -# # unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq") -# # log: -# # os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log") -# # threads: 4 -# # shell: -# # "(set +u; source {params.activate} {params.env}; set -u; \ -# # segemehl.x \ -# # -i {input.rRNA_index} \ -# # -d {input.rRNA} \ -# # -q {input.reads} \ -# # --accuracy 90 \ -# # --threads {threads} \ -# # -o {output.mapped_reads_to_rRNA} \ -# # -u {params.unmapped_reads_to_rRNA}; \ -# # gzip {params.unmapped_reads_to_rRNA}) 2> {log}" -# # -# #################################################################################### -# ##### Map reads to transcripts -# #################################################################################### -# ## -# ##rule map_reads_to_transcripts_segemehll: -# ## input: -# ## transcripts = config["filtered_transcripts"], -# ## index = config["segemehl_filtered_transcripts_index"], -# ## reads = os.path.join(config["output_dir"], "{sample}/trim_quality.fastq.gz") -# ## output: -# ## mapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/mapped_reads_to_rRNA.sam"), -# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq.gz") -# ## params: -# ## activate = config["activate"], -# ## env = config["env"], -# ## cluster_log = os.path.join(config["cluster_log"], "map_reads_to_rRNA_{sample}.log"), -# ## unmapped_reads_to_rRNA = os.path.join(config["output_dir"], "{sample}/unmapped_reads_to_rRNA.fq") -# ## log: -# ## os.path.join(config["local_log"], "map_reads_to_rRNA_{sample}.log") -# ## threads: 4 -# ## shell: -# ## "(set +u; source {params.activate} {params.env}; set -u; \ -# ## segemehl.x \ -# ## -i {input.rRNA_index} \ -# ## -d {input.rRNA} \ -# ## -q {input.reads} \ -# ## --accuracy 90 \ -# ## --threads {threads} \ -# ## -o {output.mapped_reads_to_rRNA} \ -# ## -u {params.unmapped_reads_to_rRNA}; \ -# ## gzip {params.unmapped_reads_to_rRNA}) 2> {log}" +rule map_to_transcripts: + input: + reads = os.path.join(config["output_dir"], "{sample}/pro.filtered.fasta"), + index = config["transcripts_index"], + sequence = config["transcripts_sequence"] + output: + sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam"), + reads = os.path.join(config["output_dir"], "{sample}/transcripts.unmapped.fasta") + params: + silent = "--silent", + accuracy = "90", + cluster_log = os.path.join(config["cluster_log"], "map_to_transcripts_{sample}.log") + log: + os.path.join(config["local_log"], "map_to_transcripts_{sample}.log") + threads: 8 + singularity: + "docker://fgypas/segemehl:0.2.0" + shell: + "(segemehl.x \ + {params.silent} \ + -i {input.index} \ + -d {input.sequence} \ + -q {input.reads} \ + --accuracy {params.accuracy} \ + --threads {threads} \ + -o {output.sam} \ + -u {output.reads} ) &> {log}" diff --git a/snakemake/process_data/config.yaml b/snakemake/process_data/config.yaml index 324a1bf9e498593430d263a89d3d69bdb7559db2..10a986d619f36e8f2623c46bd583bc7ec4149551 100644 --- a/snakemake/process_data/config.yaml +++ b/snakemake/process_data/config.yaml @@ -2,7 +2,10 @@ ############################################################################## ### Annotation ############################################################################## - + other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" + other_RNAs_index: "../annotation/txome_rRNAs_joao.idx" + transcripts_sequence: "../annotation/hg38_refseq_20171127_rep.fa" + transcripts_index: "../annotation/hg38_refseq_20171127_rep.idx" ############################################################################## ### Output and log directory ############################################################################## @@ -17,8 +20,4 @@ input_reads_pattern: ".fastq.gz" sample: ["example"] example: {adapter: GATCGGAAGAGCACA} - ############################################################################## - ### Parameters for independent rules/jobs - ############################################################################## - ... diff --git a/snakemake/process_data/dag.png b/snakemake/process_data/dag.png index 972eeb28d9a2651dff35aae195a955d3326737e8..2adc7095107d6deb94fd8a170a15ae20b466b454 100644 Binary files a/snakemake/process_data/dag.png and b/snakemake/process_data/dag.png differ