Addition of prepare_annotation pipeline. Small changes in process_data pipeline.

716ae90f · BIOPZ-Gypas Foivos · 3cf3a11e · 3cf3a11e · 716ae90f · 716ae90f
Commit 716ae90f authored 7 years ago by BIOPZ-Gypas Foivos
--- a/docker/segemehl/0.2.0/Dockerfile
+++ b/docker/segemehl/0.2.0/Dockerfile
-##### BASE IMAGE #####
-FROM debian:9.3
-##### METADATA #####
-LABEL base.image="debian:9.3"
-LABEL version="1"
-LABEL software="segemehl"
-LABEL software.version="0.2.0"
-LABEL software.description="short read mapping with gaps"
-LABEL software.website="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
-LABEL software.documentation="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
-LABEL software.license="link to license(file) of original software"
-LABEL software.tags="Genomics, Transcriptomics"
-LABEL maintainer="foivos.gypas@unibas.ch"
-LABEL maintainer.organisation="Biozentrum, University of Basel"
-LABEL maintainer.location="Klingelbergstrasse 50/70, CH-4056 Basel, Switzerland"
-LABEL maintainer.lab="Zavolan Lab"
-LABEL maintainer.license="https://spdx.org/licenses/Apache-2.0"
-##### VARIABLES #####
-# Use variables for convenient updates/re-usability
-ENV SOFTWARE_VERSION 0.2.0
-##### INSTALL #####
-RUN apt-get update -y \
-  && apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \
-  && wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \
-  && tar xzvf segemehl_0_2_0.tar.gz \
-  && cd segemehl_0_2_0/segemehl/ \
-  && make \
-  && cd ../.. \
-  && cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \
-  && cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \
-  && rm -rf segemehl_0_2_0* segemehl_0_2_0* \
-  && apt-get purge -y wget \
-  && apt-get autoremove -y && apt-get clean \
-  && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--- a/snakemake/prepare_annotation/Snakefile
+++ b/snakemake/prepare_annotation/Snakefile
+configfile: "config.yaml"
+localrules: create_output_and_log_directories, finish
+#################################################################################
+### Finish rule
+#################################################################################
+rule finish:
+	input:
+		idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"),
+		idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
+#################################################################################
+### Create output and log directories
+#################################################################################
+rule create_output_and_log_directories:
+	output:
+		output_dir = config["output_dir"],
+		cluster_log = config["cluster_log"],
+		local_log = config["local_log"],
+		flag = config["dir_created"]
+	threads:	1
+	shell:
+		"mkdir -p {output.output_dir}; \
+		mkdir -p {output.cluster_log}; \
+		mkdir -p {output.local_log}; \
+		touch {output.flag};"
+#################################################################################
+### Select longest protein coding transcripts
+#################################################################################
+rule select_longest_coding_transcripts:
+	input:
+		flag = config["dir_created"],
+		gtf = config["gtf"],
+		script = os.path.join(config["scripts"], "find_longest_coding_transcripts.py")
+	output:
+		gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf")
+	params:
+		cluster_log =  os.path.join(config["cluster_log"], "select_longest_coding_transcript.log")
+	log:
+		os.path.join(config["local_log"], "select_longest_coding_transcript.log")
+	singularity:
+		"docker://zavolab/python_htseq:3.6.5_0.10.0"
+	shell:
+		"({input.script} \
+		--gtf {input.gtf} \
+		--out {output.gtf} \
+		) &> {log}"
+#################################################################################
+### Generate segemehl index for other RNAs
+#################################################################################
+rule generate_segemehl_index_other_RNAs:
+	input:
+		flag = config["dir_created"],
+		sequence = config["other_RNAs_sequence"]
+	output:
+		idx = os.path.join(config["output_dir"], "other_RNAs_sequence.idx")
+	params:
+		cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_other_RNAs.log")
+	log:
+		os.path.join(config["local_log"], "generate_segemehl_index_other_RNAs.log")
+	singularity:
+		"docker://zavolab/segemehl:0.2.0"
+	shell:
+		"(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}"
+#################################################################################
+### Extract transcripts
+#################################################################################
+rule extract_transcript_sequences:
+	input:
+		gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"),
+		genome = config["genome"]
+	output:
+		transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa")
+	params:
+		cluster_log = os.path.join(config["cluster_log"], "extract_transcript_sequences.log")
+	log:
+		os.path.join(config["local_log"], "extract_transcript_sequences.log")
+	singularity:
+		"docker://zavolab/cufflinks:2.2.1"
+	shell:
+		"(gffread {input.gtf} \
+		-g {input.genome} \
+		-w {output.transcripts}) &> {log}"
+#################################################################################
+### Generate segemehl index for transcripts
+#################################################################################
+rule generate_segemehl_index_transcripts:
+	input:
+		sequence = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa")
+	output:
+		idx = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
+	params:
+		cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_transcripts.log")
+	log:
+		os.path.join(config["local_log"], "generate_segemehl_index_transcripts.log")
+	singularity:
+		"docker://zavolab/segemehl:0.2.0"
+	shell:
+		"(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}"
--- a/snakemake/prepare_annotation/cluster.json
+++ b/snakemake/prepare_annotation/cluster.json
+{
+"__default__":
+{
+"queue":"6hours",
+"time": "05:00:00",
+"threads":"1",
+"mem":"4G"
+},
+"generate_segemehl_index_other_RNAs":
+{
+"queue":"6hours",
+"time": "06:00:00",
+"threads":"8",
+"mem":"50G"
+},
+"generate_segemehl_index_transcripts":
+{
+"queue":"6hours",
+"time": "06:00:00",
+"threads":"8",
+"mem":"50G"
+}
+}
--- a/snakemake/prepare_annotation/config.yaml
+++ b/snakemake/prepare_annotation/config.yaml
+---
+  ##############################################################################
+  ### Annotation
+  ##############################################################################
+  genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
+  gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf"
+  other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
+  ##############################################################################
+  ### Output and log directory
+  ##############################################################################
+  output_dir: "results"
+  local_log: "results/local_log"
+  cluster_log: "results/cluster_log"
+  dir_created: "results/dir_created"
+  scripts: "scripts"
+  ##############################################################################
+  ### sample info
+  ##############################################################################
+  input_dir: "samples"
+  input_reads_pattern: ".fastq.gz"
+  sample: ["example"]
+  example: {adapter: GATCGGAAGAGCACA}
+...
--- a/snakemake/prepare_annotation/create_snakemake_flowchart.sh
+++ b/snakemake/prepare_annotation/create_snakemake_flowchart.sh
+snakemake --dag -np | dot -Tpng > dag.png
--- a/snakemake/prepare_annotation/dag.png
+++ b/snakemake/prepare_annotation/dag.png
--- a/snakemake/prepare_annotation/run_snakefile.sh
+++ b/snakemake/prepare_annotation/run_snakefile.sh
+# set -e
+snakemake \
+--cluster-config cluster.json \
+--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={params.cluster_log}-%j-%N -p scicore" \
+--cores 256 \
+-p \
+--rerun-incomplete \
+--use-singularity \
+--singularity-args "--bind ${PWD}"
\ No newline at end of file
--- a/snakemake/prepare_annotation/scripts/find_longest_coding_transcripts.py
+++ b/snakemake/prepare_annotation/scripts/find_longest_coding_transcripts.py
+#!/usr/bin/env python
+__version__ = "0.1"
+__author__ = "Foivos Gypas"
+__contact__ = "foivos.gypas@unibas.ch"
+__doc__ = "Parse a gtf file and find the longest expressed protein " + \
+          "coding transcripts for each gene."
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# import needed (external) modules
+# -----------------------------------------------------------------------------
+import sys
+import os
+import HTSeq
+from argparse import ArgumentParser, RawTextHelpFormatter
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Custom classes
+# -----------------------------------------------------------------------------
+class Gene(object):
+    """Gene object"""
+    def __init__(self, gene_id):
+        """Init object"""
+        self.gene_id = gene_id
+        self.transcripts = []
+    def add_transcript(self, transcript):
+        """Add transcript in the list of transcripts"""
+        if transcript not in self.transcripts:
+            self.transcripts.append(transcript)
+    def get_all_transcripts(self):
+        """Get all transcripts of that gene"""
+        return self.transcripts
+    def get_longest_coding_transcript(self):
+        """"""
+        selected_transcript = None
+        for transcript in self.transcripts:
+            if selected_transcript is None:
+                selected_transcript = transcript
+                continue
+            if (transcript.get_CDS_length() > selected_transcript.get_CDS_length()):
+                selected_transcript = transcript
+        return selected_transcript
+class Transcript(object):
+    """Transcript object"""
+    def __init__(self, transcript_id, gene_id):
+        """Init object"""
+        self.transcript_id = transcript_id
+        self.gene_id = gene_id
+        self.CDS_length = 0
+    def get_CDS_length(self):
+        """Get CDS length of the transcript"""
+        return self.CDS_length
+    def update_CDS_length(self, length):
+        """Update CDS length"""
+        self.CDS_length += length
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Main function
+# -----------------------------------------------------------------------------
+def main():
+    """ Main function """
+    parser = ArgumentParser(
+        description=__doc__,
+        formatter_class=RawTextHelpFormatter
+    )
+    parser.add_argument(
+        "--gtf",
+        dest="gtf",
+        help="Input GTF file in ENSEMBL format",
+        required=True,
+        metavar="FILE"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="GTF output file",
+        required=True,
+        metavar="FILE"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        default=False,
+        required=False,
+        help="Verbose"
+    )
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=__version__
+    )
+    # _________________________________________________________________________
+    # -------------------------------------------------------------------------
+    # get the arguments
+    # -------------------------------------------------------------------------
+    try:
+        options = parser.parse_args()
+    except(Exception):
+        parser.print_help()
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+    if options.verbose:
+        sys.stdout.write(
+            "Parsing gtf file: {} {}".format(
+                options.gtf,
+                os.linesep
+            )
+        )
+    # dictionary of genes
+    genes = {}
+    # dictionary of transcripts
+    transcripts = {}
+    # parse gtf file
+    gtf_file = HTSeq.GFF_Reader(options.gtf)
+    for gtf_line in gtf_file:
+        if gtf_line.type == 'CDS':
+            transcript_id = gtf_line.attr['transcript_id']
+            gene_id = gtf_line.attr['gene_id']
+            length = abs(gtf_line.iv.end - gtf_line.iv.start)
+            if gene_id not in genes.keys():
+                genes[gene_id] = Gene(gene_id=gene_id)
+            if transcript_id not in transcripts.keys():
+                transcripts[transcript_id] = Transcript(transcript_id, gene_id)
+                genes[gene_id].transcripts.append(transcripts[transcript_id])
+            transcripts[transcript_id].update_CDS_length(length)
+    # keep the longest coding transcripts per gene
+    selected_transcsripts_ids = {}
+    for gene_id, gene_object in genes.items():
+        if options.verbose:
+            print(gene_object.gene_id)
+            for tr in gene_object.transcripts:
+                print(tr.transcript_id, tr.get_CDS_length())
+        longest_transcript = gene_object.get_longest_coding_transcript()
+        selected_transcsripts_ids[longest_transcript.transcript_id] = \
+            longest_transcript.transcript_id
+    # parse again the gtf file and keep only the longest transcripts that
+    # were selected previously
+    if options.verbose:
+        sys.stdout.write(
+            "Re-parse gtf file: {} and write out: {} {}".format(
+                options.gtf,
+                options.out,
+                os.linesep
+            )
+        )
+    w = open(options.out, 'w')
+    for gtf_line in gtf_file:
+        if (
+            gtf_line.type == 'transcript' or
+            gtf_line.type == 'exon' or
+            gtf_line.type == 'CDS' or
+            gtf_line.type == 'start_codon' or
+            gtf_line.type == 'stop_codon'
+        ):
+            if (gtf_line.attr['transcript_id'] in selected_transcsripts_ids):
+                w.write(gtf_line.get_gff_line())
+    w.close()
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Call the Main function and catch Keyboard interrups
+# -----------------------------------------------------------------------------
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!" + os.linesep)
+        sys.exit(0)
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
@@ -51,7 +51,7 @@ rule clip_reads:
 	log:
 		os.path.join(config["local_log"], "clip_reads_{sample}.log")
 	singularity:
-		"docker://cjh4zavolab/fastx:0.0.14"
+		"docker://zavolab/fastx:0.0.14"
 	shell:
 		"(fastx_clipper \
 		{params.v} \
@@ -82,7 +82,7 @@ rule trim_reads:
 	log:
 		os.path.join(config["cluster_log"], "trim_reads_{sample}.log")
 	singularity:
-		"docker://cjh4zavolab/fastx:0.0.14"
+		"docker://zavolab/fastx:0.0.14"
 	shell:
 		"(fastq_quality_trimmer \
 		{params.v} \
@@ -112,7 +112,7 @@ rule filter_reads:
 	log:
 		os.path.join(config["cluster_log"], "filter_reads_{sample}.log")
 	singularity:
-		"docker://cjh4zavolab/fastx:0.0.14"
+		"docker://zavolab/fastx:0.0.14"
 	shell:
 		"(fastq_quality_filter \
 		{params.v} \
@@ -142,7 +142,7 @@ rule fastq_to_fasta:
 	log:
 		os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log")
 	singularity:
-		"docker://cjh4zavolab/fastx:0.0.14"
+		"docker://zavolab/fastx:0.0.14"
 	shell:
 		"(fastq_to_fasta \
 		{params.v} \
@@ -172,7 +172,7 @@ rule map_to_other_genes:
 		os.path.join(config["local_log"], "map_to_other_genes_{sample}.log")
 	threads:	8
 	singularity:
-		"docker://fgypas/segemehl:0.2.0"
+		"docker://zavolab/segemehl:0.2.0"
 	shell:
 		"(segemehl.x \
 		{params.silent} \
@@ -204,7 +204,7 @@ rule map_to_transcripts:
 		os.path.join(config["local_log"], "map_to_transcripts_{sample}.log")
 	threads:	8
 	singularity:
-		"docker://fgypas/segemehl:0.2.0"
+		"docker://zavolab/segemehl:0.2.0"
 	shell:
 		"(segemehl.x \
 		{params.silent} \