Skip to content
Snippets Groups Projects
Commit 716ae90f authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

Addition of prepare_annotation pipeline. Small changes in process_data pipeline.

parent 3cf3a11e
No related branches found
No related tags found
No related merge requests found
##### BASE IMAGE #####
FROM debian:9.3
##### METADATA #####
LABEL base.image="debian:9.3"
LABEL version="1"
LABEL software="segemehl"
LABEL software.version="0.2.0"
LABEL software.description="short read mapping with gaps"
LABEL software.website="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
LABEL software.documentation="http://www.bioinf.uni-leipzig.de/Software/segemehl/"
LABEL software.license="link to license(file) of original software"
LABEL software.tags="Genomics, Transcriptomics"
LABEL maintainer="foivos.gypas@unibas.ch"
LABEL maintainer.organisation="Biozentrum, University of Basel"
LABEL maintainer.location="Klingelbergstrasse 50/70, CH-4056 Basel, Switzerland"
LABEL maintainer.lab="Zavolan Lab"
LABEL maintainer.license="https://spdx.org/licenses/Apache-2.0"
##### VARIABLES #####
# Use variables for convenient updates/re-usability
ENV SOFTWARE_VERSION 0.2.0
##### INSTALL #####
RUN apt-get update -y \
&& apt-get install -y wget make gcc build-essential zlib1g-dev libncurses5-dev samtools msmtp zlib1g-dev make libncurses5-dev libxml2-dev \
&& wget http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_2_0.tar.gz \
&& tar xzvf segemehl_0_2_0.tar.gz \
&& cd segemehl_0_2_0/segemehl/ \
&& make \
&& cd ../.. \
&& cp segemehl_0_2_0/segemehl/segemehl.x /usr/local/bin \
&& cp segemehl_0_2_0/segemehl/lack.x /usr/local/bin \
&& rm -rf segemehl_0_2_0* segemehl_0_2_0* \
&& apt-get purge -y wget \
&& apt-get autoremove -y && apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
configfile: "config.yaml"
localrules: create_output_and_log_directories, finish
#################################################################################
### Finish rule
#################################################################################
rule finish:
input:
idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"),
idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
#################################################################################
### Create output and log directories
#################################################################################
rule create_output_and_log_directories:
output:
output_dir = config["output_dir"],
cluster_log = config["cluster_log"],
local_log = config["local_log"],
flag = config["dir_created"]
threads: 1
shell:
"mkdir -p {output.output_dir}; \
mkdir -p {output.cluster_log}; \
mkdir -p {output.local_log}; \
touch {output.flag};"
#################################################################################
### Select longest protein coding transcripts
#################################################################################
rule select_longest_coding_transcripts:
input:
flag = config["dir_created"],
gtf = config["gtf"],
script = os.path.join(config["scripts"], "find_longest_coding_transcripts.py")
output:
gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf")
params:
cluster_log = os.path.join(config["cluster_log"], "select_longest_coding_transcript.log")
log:
os.path.join(config["local_log"], "select_longest_coding_transcript.log")
singularity:
"docker://zavolab/python_htseq:3.6.5_0.10.0"
shell:
"({input.script} \
--gtf {input.gtf} \
--out {output.gtf} \
) &> {log}"
#################################################################################
### Generate segemehl index for other RNAs
#################################################################################
rule generate_segemehl_index_other_RNAs:
input:
flag = config["dir_created"],
sequence = config["other_RNAs_sequence"]
output:
idx = os.path.join(config["output_dir"], "other_RNAs_sequence.idx")
params:
cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_other_RNAs.log")
log:
os.path.join(config["local_log"], "generate_segemehl_index_other_RNAs.log")
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}"
#################################################################################
### Extract transcripts
#################################################################################
rule extract_transcript_sequences:
input:
gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"),
genome = config["genome"]
output:
transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa")
params:
cluster_log = os.path.join(config["cluster_log"], "extract_transcript_sequences.log")
log:
os.path.join(config["local_log"], "extract_transcript_sequences.log")
singularity:
"docker://zavolab/cufflinks:2.2.1"
shell:
"(gffread {input.gtf} \
-g {input.genome} \
-w {output.transcripts}) &> {log}"
#################################################################################
### Generate segemehl index for transcripts
#################################################################################
rule generate_segemehl_index_transcripts:
input:
sequence = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa")
output:
idx = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
params:
cluster_log = os.path.join(config["cluster_log"], "generate_segemehl_index_transcripts.log")
log:
os.path.join(config["local_log"], "generate_segemehl_index_transcripts.log")
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.sequence}) &> {log}"
{
"__default__":
{
"queue":"6hours",
"time": "05:00:00",
"threads":"1",
"mem":"4G"
},
"generate_segemehl_index_other_RNAs":
{
"queue":"6hours",
"time": "06:00:00",
"threads":"8",
"mem":"50G"
},
"generate_segemehl_index_transcripts":
{
"queue":"6hours",
"time": "06:00:00",
"threads":"8",
"mem":"50G"
}
}
---
##############################################################################
### Annotation
##############################################################################
genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf"
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
##############################################################################
### Output and log directory
##############################################################################
output_dir: "results"
local_log: "results/local_log"
cluster_log: "results/cluster_log"
dir_created: "results/dir_created"
scripts: "scripts"
##############################################################################
### sample info
##############################################################################
input_dir: "samples"
input_reads_pattern: ".fastq.gz"
sample: ["example"]
example: {adapter: GATCGGAAGAGCACA}
...
snakemake --dag -np | dot -Tpng > dag.png
snakemake/prepare_annotation/dag.png

24.1 KiB

# set -e
snakemake \
--cluster-config cluster.json \
--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={params.cluster_log}-%j-%N -p scicore" \
--cores 256 \
-p \
--rerun-incomplete \
--use-singularity \
--singularity-args "--bind ${PWD}"
\ No newline at end of file
#!/usr/bin/env python
__version__ = "0.1"
__author__ = "Foivos Gypas"
__contact__ = "foivos.gypas@unibas.ch"
__doc__ = "Parse a gtf file and find the longest expressed protein " + \
"coding transcripts for each gene."
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
import sys
import os
import HTSeq
from argparse import ArgumentParser, RawTextHelpFormatter
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Custom classes
# -----------------------------------------------------------------------------
class Gene(object):
"""Gene object"""
def __init__(self, gene_id):
"""Init object"""
self.gene_id = gene_id
self.transcripts = []
def add_transcript(self, transcript):
"""Add transcript in the list of transcripts"""
if transcript not in self.transcripts:
self.transcripts.append(transcript)
def get_all_transcripts(self):
"""Get all transcripts of that gene"""
return self.transcripts
def get_longest_coding_transcript(self):
""""""
selected_transcript = None
for transcript in self.transcripts:
if selected_transcript is None:
selected_transcript = transcript
continue
if (transcript.get_CDS_length() > selected_transcript.get_CDS_length()):
selected_transcript = transcript
return selected_transcript
class Transcript(object):
"""Transcript object"""
def __init__(self, transcript_id, gene_id):
"""Init object"""
self.transcript_id = transcript_id
self.gene_id = gene_id
self.CDS_length = 0
def get_CDS_length(self):
"""Get CDS length of the transcript"""
return self.CDS_length
def update_CDS_length(self, length):
"""Update CDS length"""
self.CDS_length += length
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"--gtf",
dest="gtf",
help="Input GTF file in ENSEMBL format",
required=True,
metavar="FILE"
)
parser.add_argument(
"--out",
dest="out",
help="GTF output file",
required=True,
metavar="FILE"
)
parser.add_argument(
"--verbose",
action="store_true",
dest="verbose",
default=False,
required=False,
help="Verbose"
)
parser.add_argument(
'--version',
action='version',
version=__version__
)
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except(Exception):
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write(
"Parsing gtf file: {} {}".format(
options.gtf,
os.linesep
)
)
# dictionary of genes
genes = {}
# dictionary of transcripts
transcripts = {}
# parse gtf file
gtf_file = HTSeq.GFF_Reader(options.gtf)
for gtf_line in gtf_file:
if gtf_line.type == 'CDS':
transcript_id = gtf_line.attr['transcript_id']
gene_id = gtf_line.attr['gene_id']
length = abs(gtf_line.iv.end - gtf_line.iv.start)
if gene_id not in genes.keys():
genes[gene_id] = Gene(gene_id=gene_id)
if transcript_id not in transcripts.keys():
transcripts[transcript_id] = Transcript(transcript_id, gene_id)
genes[gene_id].transcripts.append(transcripts[transcript_id])
transcripts[transcript_id].update_CDS_length(length)
# keep the longest coding transcripts per gene
selected_transcsripts_ids = {}
for gene_id, gene_object in genes.items():
if options.verbose:
print(gene_object.gene_id)
for tr in gene_object.transcripts:
print(tr.transcript_id, tr.get_CDS_length())
longest_transcript = gene_object.get_longest_coding_transcript()
selected_transcsripts_ids[longest_transcript.transcript_id] = \
longest_transcript.transcript_id
# parse again the gtf file and keep only the longest transcripts that
# were selected previously
if options.verbose:
sys.stdout.write(
"Re-parse gtf file: {} and write out: {} {}".format(
options.gtf,
options.out,
os.linesep
)
)
w = open(options.out, 'w')
for gtf_line in gtf_file:
if (
gtf_line.type == 'transcript' or
gtf_line.type == 'exon' or
gtf_line.type == 'CDS' or
gtf_line.type == 'start_codon' or
gtf_line.type == 'stop_codon'
):
if (gtf_line.attr['transcript_id'] in selected_transcsripts_ids):
w.write(gtf_line.get_gff_line())
w.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
...@@ -51,7 +51,7 @@ rule clip_reads: ...@@ -51,7 +51,7 @@ rule clip_reads:
log: log:
os.path.join(config["local_log"], "clip_reads_{sample}.log") os.path.join(config["local_log"], "clip_reads_{sample}.log")
singularity: singularity:
"docker://cjh4zavolab/fastx:0.0.14" "docker://zavolab/fastx:0.0.14"
shell: shell:
"(fastx_clipper \ "(fastx_clipper \
{params.v} \ {params.v} \
...@@ -82,7 +82,7 @@ rule trim_reads: ...@@ -82,7 +82,7 @@ rule trim_reads:
log: log:
os.path.join(config["cluster_log"], "trim_reads_{sample}.log") os.path.join(config["cluster_log"], "trim_reads_{sample}.log")
singularity: singularity:
"docker://cjh4zavolab/fastx:0.0.14" "docker://zavolab/fastx:0.0.14"
shell: shell:
"(fastq_quality_trimmer \ "(fastq_quality_trimmer \
{params.v} \ {params.v} \
...@@ -112,7 +112,7 @@ rule filter_reads: ...@@ -112,7 +112,7 @@ rule filter_reads:
log: log:
os.path.join(config["cluster_log"], "filter_reads_{sample}.log") os.path.join(config["cluster_log"], "filter_reads_{sample}.log")
singularity: singularity:
"docker://cjh4zavolab/fastx:0.0.14" "docker://zavolab/fastx:0.0.14"
shell: shell:
"(fastq_quality_filter \ "(fastq_quality_filter \
{params.v} \ {params.v} \
...@@ -142,7 +142,7 @@ rule fastq_to_fasta: ...@@ -142,7 +142,7 @@ rule fastq_to_fasta:
log: log:
os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log") os.path.join(config["cluster_log"], "fastq_to_fasta_{sample}.log")
singularity: singularity:
"docker://cjh4zavolab/fastx:0.0.14" "docker://zavolab/fastx:0.0.14"
shell: shell:
"(fastq_to_fasta \ "(fastq_to_fasta \
{params.v} \ {params.v} \
...@@ -172,7 +172,7 @@ rule map_to_other_genes: ...@@ -172,7 +172,7 @@ rule map_to_other_genes:
os.path.join(config["local_log"], "map_to_other_genes_{sample}.log") os.path.join(config["local_log"], "map_to_other_genes_{sample}.log")
threads: 8 threads: 8
singularity: singularity:
"docker://fgypas/segemehl:0.2.0" "docker://zavolab/segemehl:0.2.0"
shell: shell:
"(segemehl.x \ "(segemehl.x \
{params.silent} \ {params.silent} \
...@@ -204,7 +204,7 @@ rule map_to_transcripts: ...@@ -204,7 +204,7 @@ rule map_to_transcripts:
os.path.join(config["local_log"], "map_to_transcripts_{sample}.log") os.path.join(config["local_log"], "map_to_transcripts_{sample}.log")
threads: 8 threads: 8
singularity: singularity:
"docker://fgypas/segemehl:0.2.0" "docker://zavolab/segemehl:0.2.0"
shell: shell:
"(segemehl.x \ "(segemehl.x \
{params.silent} \ {params.silent} \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment