diff --git a/docker/segemehl/0.2.0/Dockerfile b/docker/segemehl/0.2.0/Dockerfile deleted file mode 100644 index f9e20e2c93bdde13970aade578f08507ace32532..0000000000000000000000000000000000000000 --- a/docker/segemehl/0.2.0/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -##### BASE IMAGE ##### -FROM debian:9.3 - -##### METADATA ##### -LABEL base.image="debian:9.3" -LABEL version="1" -LABEL software="segemehl" -LABEL software.version="0.2.0" -LABEL software.description="short read mapping with gaps" -LABEL software.website="http://www.bioinf.uni-leipzig.de/Software/segemehl/" -LABEL software.documentation="http://www.bioinf.uni-leipzig.de/Software/segemehl/" -LABEL software.license="link to license(file) of original software" -LABEL software.tags="Genomics, Transcriptomics" -LABEL maintainer="foivos.gypas@unibas.ch" -LABEL maintainer.organisation="Biozentrum, University of Basel" -LABEL maintainer.location="Klingelbergstrasse 50/70, CH-4056 Basel, Switzerland" -LABEL maintainer.lab="Zavolan Lab" -LABEL maintainer.license="https://spdx.org/licenses/Apache-2.0" - -##### VARIABLES ##### -# Use variables for convenient updates/re-usability -ENV SOFTWARE_VERSION 0.2.0 - -##### INSTALL ##### -RUN apt-get update -y \ - && apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \ - && wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \ - && tar xzvf segemehl_0_2_0.tar.gz \ - && cd segemehl_0_2_0/segemehl/ \ - && make \ - && cd ../.. \ - && cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \ - && cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \ - && rm -rf segemehl_0_2_0* segemehl_0_2_0* \ - && apt-get purge -y wget \ - && apt-get autoremove -y && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/snakemake/prepare_annotation/Snakefile b/snakemake/prepare_annotation/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..23deb5da77820088139d5c2647e7fab8bac23551 --- /dev/null +++ b/snakemake/prepare_annotation/Snakefile @@ -0,0 +1,110 @@ +configfile: "config.yaml" + +localrules: create_output_and_log_directories, finish + +################################################################################# +### Finish rule +################################################################################# + +rule finish: + input: + idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"), + idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx") + +################################################################################# +### Create output and log directories +################################################################################# + +rule create_output_and_log_directories: + output: + output_dir = config["output_dir"], + cluster_log = config["cluster_log"], + local_log = config["local_log"], + flag = config["dir_created"] + threads: 1 + shell: + "mkdir -p {output.output_dir}; \ + mkdir -p {output.cluster_log}; \ + mkdir -p {output.local_log}; \ + touch {output.flag};" + +################################################################################# +### Select longest protein coding transcripts +################################################################################# + +rule select_longest_coding_transcripts: + input: + flag = config["dir_created"], + gtf = config["gtf"], + script = os.path.join(config["scripts"], "find_longest_coding_transcripts.py") + output: + gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf") + params: + cluster_log = os.path.join(config["cluster_log"], "select_longest_coding_transcript.log") + log: + os.path.join(config["local_log"], "select_longest_coding_transcript.log") + singularity: + "docker://zavolab/python_htseq:3.6.5_0.10.0" + shell: + "({input.script} \ + --gtf {input.gtf} \ + --out {output.gtf} \ + ) &> {log}" + +################################################################################# +### Generate segemehl index for other RNAs +################################################################################# + +rule generate_segemehl_index_other_RNAs: + input: + flag = config["dir_created"], + sequence = config["other_RNAs_sequence"] + output: + idx = os.path.join(config["output_dir"], "other_RNAs_sequence.idx") + params: + cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_other_RNAs.log") + log: + os.path.join(config["local_log"], "generate_segemehl_index_other_RNAs.log") + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}" + +################################################################################# +### Extract transcripts +################################################################################# + +rule extract_transcript_sequences: + input: + gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"), + genome = config["genome"] + output: + transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa") + params: + cluster_log = os.path.join(config["cluster_log"], "extract_transcript_sequences.log") + log: + os.path.join(config["local_log"], "extract_transcript_sequences.log") + singularity: + "docker://zavolab/cufflinks:2.2.1" + shell: + "(gffread {input.gtf} \ + -g {input.genome} \ + -w {output.transcripts}) &> {log}" + +################################################################################# +### Generate segemehl index for transcripts +################################################################################# + +rule generate_segemehl_index_transcripts: + input: + sequence = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa") + output: + idx = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx") + params: + cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_transcripts.log") + log: + os.path.join(config["local_log"], "generate_segemehl_index_transcripts.log") + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}" diff --git a/snakemake/prepare_annotation/cluster.json b/snakemake/prepare_annotation/cluster.json new file mode 100644 index 0000000000000000000000000000000000000000..7845df8acc0b69927ac1448d13bb01ec70647771 --- /dev/null +++ b/snakemake/prepare_annotation/cluster.json @@ -0,0 +1,23 @@ +{ +"__default__": +{ +"queue":"6hours", +"time": "05:00:00", +"threads":"1", +"mem":"4G" +}, +"generate_segemehl_index_other_RNAs": +{ +"queue":"6hours", +"time": "06:00:00", +"threads":"8", +"mem":"50G" +}, +"generate_segemehl_index_transcripts": +{ +"queue":"6hours", +"time": "06:00:00", +"threads":"8", +"mem":"50G" +} +} diff --git a/snakemake/prepare_annotation/config.yaml b/snakemake/prepare_annotation/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f104b8e8b2d0ca1f19ae241514c0f107c346db4 --- /dev/null +++ b/snakemake/prepare_annotation/config.yaml @@ -0,0 +1,23 @@ +--- + ############################################################################## + ### Annotation + ############################################################################## + genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa" + gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf" + other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" + ############################################################################## + ### Output and log directory + ############################################################################## + output_dir: "results" + local_log: "results/local_log" + cluster_log: "results/cluster_log" + dir_created: "results/dir_created" + scripts: "scripts" + ############################################################################## + ### sample info + ############################################################################## + input_dir: "samples" + input_reads_pattern: ".fastq.gz" + sample: ["example"] + example: {adapter: GATCGGAAGAGCACA} +... diff --git a/snakemake/prepare_annotation/create_snakemake_flowchart.sh b/snakemake/prepare_annotation/create_snakemake_flowchart.sh new file mode 100755 index 0000000000000000000000000000000000000000..ce71a9a3dcfcd940c55d9568bff4e10bfa670f45 --- /dev/null +++ b/snakemake/prepare_annotation/create_snakemake_flowchart.sh @@ -0,0 +1 @@ +snakemake --dag -np | dot -Tpng > dag.png diff --git a/snakemake/prepare_annotation/dag.png b/snakemake/prepare_annotation/dag.png new file mode 100644 index 0000000000000000000000000000000000000000..2bd5f04eaaf11d89e3c5bb55dd5bfd49f74cebea Binary files /dev/null and b/snakemake/prepare_annotation/dag.png differ diff --git a/snakemake/prepare_annotation/run_snakefile.sh b/snakemake/prepare_annotation/run_snakefile.sh new file mode 100755 index 0000000000000000000000000000000000000000..6791403020a28256d19de15e0efa43447fe8ee58 --- /dev/null +++ b/snakemake/prepare_annotation/run_snakefile.sh @@ -0,0 +1,10 @@ +# set -e + +snakemake \ +--cluster-config cluster.json \ +--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={params.cluster_log}-%j-%N -p scicore" \ +--cores 256 \ +-p \ +--rerun-incomplete \ +--use-singularity \ +--singularity-args "--bind ${PWD}" \ No newline at end of file diff --git a/snakemake/prepare_annotation/scripts/find_longest_coding_transcripts.py b/snakemake/prepare_annotation/scripts/find_longest_coding_transcripts.py new file mode 100755 index 0000000000000000000000000000000000000000..dcf3e41ef1ec44666e47d44ea1cef9ebd2f99f3d --- /dev/null +++ b/snakemake/prepare_annotation/scripts/find_longest_coding_transcripts.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python + +__version__ = "0.1" +__author__ = "Foivos Gypas" +__contact__ = "foivos.gypas@unibas.ch" +__doc__ = "Parse a gtf file and find the longest expressed protein " + \ + "coding transcripts for each gene." + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# import needed (external) modules +# ----------------------------------------------------------------------------- + +import sys +import os +import HTSeq +from argparse import ArgumentParser, RawTextHelpFormatter + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Custom classes +# ----------------------------------------------------------------------------- + + +class Gene(object): + + """Gene object""" + + def __init__(self, gene_id): + """Init object""" + self.gene_id = gene_id + self.transcripts = [] + + def add_transcript(self, transcript): + """Add transcript in the list of transcripts""" + if transcript not in self.transcripts: + self.transcripts.append(transcript) + + def get_all_transcripts(self): + """Get all transcripts of that gene""" + return self.transcripts + + def get_longest_coding_transcript(self): + """""" + selected_transcript = None + for transcript in self.transcripts: + if selected_transcript is None: + selected_transcript = transcript + continue + if (transcript.get_CDS_length() > selected_transcript.get_CDS_length()): + selected_transcript = transcript + return selected_transcript + + +class Transcript(object): + + """Transcript object""" + + def __init__(self, transcript_id, gene_id): + """Init object""" + self.transcript_id = transcript_id + self.gene_id = gene_id + self.CDS_length = 0 + + def get_CDS_length(self): + """Get CDS length of the transcript""" + return self.CDS_length + + def update_CDS_length(self, length): + """Update CDS length""" + self.CDS_length += length + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Main function +# ----------------------------------------------------------------------------- + + +def main(): + """ Main function """ + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter + ) + + parser.add_argument( + "--gtf", + dest="gtf", + help="Input GTF file in ENSEMBL format", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--out", + dest="out", + help="GTF output file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + default=False, + required=False, + help="Verbose" + ) + + parser.add_argument( + '--version', + action='version', + version=__version__ + ) + + # _________________________________________________________________________ + # ------------------------------------------------------------------------- + # get the arguments + # ------------------------------------------------------------------------- + try: + options = parser.parse_args() + except(Exception): + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + if options.verbose: + sys.stdout.write( + "Parsing gtf file: {} {}".format( + options.gtf, + os.linesep + ) + ) + + # dictionary of genes + genes = {} + + # dictionary of transcripts + transcripts = {} + + # parse gtf file + gtf_file = HTSeq.GFF_Reader(options.gtf) + + for gtf_line in gtf_file: + + if gtf_line.type == 'CDS': + + transcript_id = gtf_line.attr['transcript_id'] + gene_id = gtf_line.attr['gene_id'] + length = abs(gtf_line.iv.end - gtf_line.iv.start) + + if gene_id not in genes.keys(): + genes[gene_id] = Gene(gene_id=gene_id) + + if transcript_id not in transcripts.keys(): + transcripts[transcript_id] = Transcript(transcript_id, gene_id) + genes[gene_id].transcripts.append(transcripts[transcript_id]) + + transcripts[transcript_id].update_CDS_length(length) + + # keep the longest coding transcripts per gene + selected_transcsripts_ids = {} + + for gene_id, gene_object in genes.items(): + + if options.verbose: + print(gene_object.gene_id) + for tr in gene_object.transcripts: + print(tr.transcript_id, tr.get_CDS_length()) + + longest_transcript = gene_object.get_longest_coding_transcript() + + selected_transcsripts_ids[longest_transcript.transcript_id] = \ + longest_transcript.transcript_id + + # parse again the gtf file and keep only the longest transcripts that + # were selected previously + if options.verbose: + sys.stdout.write( + "Re-parse gtf file: {} and write out: {} {}".format( + options.gtf, + options.out, + os.linesep + ) + ) + + w = open(options.out, 'w') + for gtf_line in gtf_file: + if ( + gtf_line.type == 'transcript' or + gtf_line.type == 'exon' or + gtf_line.type == 'CDS' or + gtf_line.type == 'start_codon' or + gtf_line.type == 'stop_codon' + ): + + if (gtf_line.attr['transcript_id'] in selected_transcsripts_ids): + w.write(gtf_line.get_gff_line()) + w.close() + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile index 0f2528d30e2bc7fd5125f1b2402d0f7fea2d2ee5..9c791fcf29f23a4e71f5cb9bb9a8761c9388d640 100644 --- a/snakemake/process_data/Snakefile +++ b/snakemake/process_data/Snakefile @@ -51,7 +51,7 @@ rule clip_reads: log: os.path.join(config["local_log"], "clip_reads_{sample}.log") singularity: - "docker://cjh4zavolab/fastx:0.0.14" + "docker://zavolab/fastx:0.0.14" shell: "(fastx_clipper \ {params.v} \ @@ -82,7 +82,7 @@ rule trim_reads: log: os.path.join(config["cluster_log"], "trim_reads_{sample}.log") singularity: - "docker://cjh4zavolab/fastx:0.0.14" + "docker://zavolab/fastx:0.0.14" shell: "(fastq_quality_trimmer \ {params.v} \ @@ -112,7 +112,7 @@ rule filter_reads: log: os.path.join(config["cluster_log"], "filter_reads_{sample}.log") singularity: - "docker://cjh4zavolab/fastx:0.0.14" + "docker://zavolab/fastx:0.0.14" shell: "(fastq_quality_filter \ {params.v} \ @@ -142,7 +142,7 @@ rule fastq_to_fasta: log: os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log") singularity: - "docker://cjh4zavolab/fastx:0.0.14" + "docker://zavolab/fastx:0.0.14" shell: "(fastq_to_fasta \ {params.v} \ @@ -172,7 +172,7 @@ rule map_to_other_genes: os.path.join(config["local_log"], "map_to_other_genes_{sample}.log") threads: 8 singularity: - "docker://fgypas/segemehl:0.2.0" + "docker://zavolab/segemehl:0.2.0" shell: "(segemehl.x \ {params.silent} \ @@ -204,7 +204,7 @@ rule map_to_transcripts: os.path.join(config["local_log"], "map_to_transcripts_{sample}.log") threads: 8 singularity: - "docker://fgypas/segemehl:0.2.0" + "docker://zavolab/segemehl:0.2.0" shell: "(segemehl.x \ {params.silent} \