diff --git a/README.md b/README.md
index 3c0ea33db568787880606f9f6015fb29b58c829a..182330ab8e7971e7554a61e25ef684ada5a880e6 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,10 @@ package manager. We recommend that you install
 
 For improved reproducibility and reusability of the workflow, as well as an
 easy means to run it on a high performance computing (HPC) cluster managed,
-e.g., by [Slurm][slurm], almost all steps of the workflow run in their own
-container. As a consequence, running this workflow has very few individual
-dependencies. It does, however, require that the container engine
-[Singularity][singularity] is installed.
+e.g., by [Slurm][slurm], all steps of the workflow run in their own container.
+As a consequence, running this workflow has very few individual dependencies. It
+does, however, require that the container engine [Singularity][singularity] is
+installed.
 
 Create and activate the environment with necessary dependencies with conda:
 
diff --git a/scripts/split_file_by_colum_value.sh b/scripts/split_file_by_colum_value.sh
deleted file mode 100755
index 62169059c0a052fec957bbd092cb7ce42b6a2906..0000000000000000000000000000000000000000
--- a/scripts/split_file_by_colum_value.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# Setting to strict mode
-set -euo pipefail
-IFS=$'\n\t'
-
-#### FUNCTIONS
-
-usage()
-{
-    echo "usage: split_file_by_colum_value.sh [[[-f file] [-c column] [-s separator]] | [-h]]"
-}
-
-
-
-#### MAIN
-# Test whether all required inputs are present
-if [[ $1 == -h ]] || [[ $# != 6 ]]
-then
-    usage
-    exit
-fi
-
-# Get parameters
-while [ $# -gt 0 ]; do
-    case $1 in
-        -f | --file )       shift
-                            filename=$1
-                            ;;
-        -c | --column )     shift
-                            column=$1
-                            ;;
-        -s | --separator )  shift
-                            separator=$1
-                            ;;
-        -h | --help )       usage
-                            exit
-                            ;;
-        * )                 usage
-                            exit 1
-    esac
-    shift
-done
-
-printf "\nFile splited by colum %i values.\n" \
-    ${column}\
-
-# Split file by values in requested column
-awk -v col="${column}" -v sep="${separator}" -F'$sep' '{print > $col".txt"}' ${filename}
-
-printf "\nDONE!\n"
diff --git a/scripts/validation_fasta.py b/scripts/validation_fasta.py
deleted file mode 100755
index f39059e37e2f0c1c119ed7205742d368a6185cab..0000000000000000000000000000000000000000
--- a/scripts/validation_fasta.py
+++ /dev/null
@@ -1,170 +0,0 @@
-#!/usr/bin/env python
-import sys
-import re
-import gzip
-from argparse import ArgumentParser, RawTextHelpFormatter
-
-
-### Created: Mar 5, 2019
-### Author: Paula Iborra
-### Company: Zavolan Group, Biozentrum, University of Basel
-
-
-### ARGUMENTS ###
-
-parser = ArgumentParser(
-    description="Script to filter FASTA files"
-    )
-parser.add_argument(
-    '-v','--version',
-    action='version',
-    version='%(prog)s 1.0',
-    help="Show program's version number and exit"
-    )
-parser.add_argument(
-    '--trim',
-    help="Character's used to trim the ID. Remove anything that follows the character's. Write \\ infront of \'.\' and \'-\'(i.e trim=\"$\\.\\-|_\"). Default: first white space",
-    type=str,
-    nargs='?',
-    default=""
-    )
-parser.add_argument(
-    '--idlist',
-    help="Generate text file with the sequences IDs. One ID per line."
-    )
-parser.add_argument(
-    '-f','--filter',
-    help="Input ID list. Filter IDs and sequences from FASTA file with the mode selected. Filter file must contain ONE ID per line",
-    )
-parser.add_argument(
-    '-m', '--mode',
-    help="Type of filtering fasta file: keep (k) or discard (d) IDs contained in the ID list file.",
-    choices=('k', 'd')
-    )
-parser.add_argument(
-    '-r','--remove',
-    help="Remove sequences from FASTA file longer than specified length.",
-    type=int
-    )
-parser.add_argument(
-    '-i','--input',
-    required=True,
-    help="Input FASTA file",
-    type=str
-    )
-parser.add_argument(
-    '-o','--output',
-    help="Output FASTA file"
-    )
-
-args = parser.parse_args()
-
-if args.filter and not args.mode:
-    sys.exit("ERROR! Mode argument required when using filter option. (--mode, -m). See --help option.")
-
-### PARSE FASTA FILE ###
-
-class Seq:
-    def __init__(self):
-        self.id=""
-    def __init__(self):
-        self.seq=""
-    def __init__(self):
-        self.features=""
-
-# open files
-if args.input.endswith('.gz'):
-    f = gzip.open(args.input, 'rt')
-else:
-    f = open(args.input)
-
-record=[] #list of records
-nrec=-1
-inseq=0
-
-# parse fasta file
-sys.stdout.write("Parsing FASTA file...")
-for line in f:
-    if re.match(r'^>' , line):
-        nrec+=1
-        record.append(Seq())
-
-        # define id of the record
-        if not args.trim:
-            mobj=re.match(r'^>(\S*)(.*)', line)
-        else:
-            mobj=re.match(r'^>([^%s]*)(.*)'%args.trim , line)
-
-        # add id and features
-        if (mobj):
-            record[nrec].id=mobj.group(1)
-            record[nrec].features=mobj.group(2)
-        inseq=0
-    else :
-        if inseq==0 :
-            inseq=1
-            record[nrec].seq = line
-        else:
-            cstring=record[nrec].seq+line
-            record[nrec].seq = cstring
-sys.stdout.write("DONE\n")
-
-## ID FILTER LIST ##
-
-if (args.filter):
-    sys.stdout.write("Filtering FASTA file...")
-    id_filter=[line.rstrip('\n') for line in open(args.filter)]
-    sys.stdout.write("DONE\n")
-
-
-## OUTPUT FASTA FILE ##
-
-if (args.output):
-    sys.stdout.write("Writing FASTA file...")
-    with open(args.output, 'w') as output:
-        if (args.filter) and args.mode == 'k':
-            if (args.remove):
-                for x in range(0,nrec+1):
-                    if record[x].id in id_filter and (len(record[x].seq)-1 <= args.remove):
-                        output.write(">%s\n%s"%(record[x].id, record[x].seq))
-            else:
-                for x in range(0,nrec+1):
-                    if record[x].id in id_filter:
-                        output.write(">%s\n%s"%(record[x].id, record[x].seq))
-        elif (args.filter) and args.mode == 'd':
-            if (args.remove):
-                for x in range(0,nrec+1):
-                    if record[x].id not in id_filter and (len(record[x].seq)-1 <= args.remove):
-                        output.write(">%s\n%s"%(record[x].id, record[x].seq))
-            else:
-                for x in range(0,nrec+1):
-                    if record[x].id not in id_filter:
-                        output.write(">%s\n%s"%(record[x].id, record[x].seq))
-        else:
-            if (args.remove):
-                for x in range(0,nrec+1):
-                    if (len(record[x].seq)-1 <= args.remove):
-                        output.write(">%s\n%s"%(record[x].id, record[x].seq))
-            else:
-                for x in range(0,nrec+1):
-                    output.write(">%s\n%s"%(record[x].id, record[x].seq))
-    output.close()
-    sys.stdout.write("DONE\n")
-
-
-## OUTPUT LIST IDs ##
-
-idlist=[]
-if (args.idlist):
-    sys.stdout.write("Creating IDs list from FASTA file...")
-    fasta = open(args.output, 'r')
-    with open(args.idlist, 'w') as id_list:
-        for line in fasta:
-            if line.startswith('>'):
-                idlist.append(line[1:])
-        idlist.sort()
-        id_list.write(''.join(idlist))
-    id_list.close()
-    sys.stdout.write("DONE\n")
-
-
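Note: the header-trimming behaviour of the deleted `validation_fasta.py` (cutting each FASTA ID at the first whitespace) is taken over by the awk one-liner in the `trim_fasta` rule further down in this patch. A minimal stand-alone sketch of that behaviour, with `transcriptome.fa` as a hypothetical input file:

```bash
# Header lines (starting with ">") are cut at the first whitespace, so
# ">ENST0001 some description" becomes ">ENST0001"; sequence lines
# (everything not starting with ">") pass through unchanged via the bare "1".
awk '/^>/ {print $1; next} 1' transcriptome.fa > transcriptome_idtrim.fa
```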
diff --git a/scripts/validation_gtf.py b/scripts/validation_gtf.py
deleted file mode 100755
index 515bcd7a427f30b268760620c2bc003d82b24f29..0000000000000000000000000000000000000000
--- a/scripts/validation_gtf.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import re
-import HTSeq
-from argparse import ArgumentParser
-
-### Created: Mar 5, 2019
-### Author: Paula Iborra
-### Company: Zavolan Group, Biozentrum, University of Basel
-
-
-### ARGUMENTS ###
-
-parser = ArgumentParser(
-    description='Script to filter GTF files'
-    )
-parser.add_argument(
-    '-v','--version',
-    action='version',
-    version='%(prog)s 1.0',
-    help='Show program\'s version number and exit'
-    )
-parser.add_argument(
-    '--idfield',
-    help='Name of field that contains ID of interest. Default: transcript_id',
-    default='transcript_id',
-    choices=('transcript_id','gene_id', 'exon_id'),
-    )
-parser.add_argument(
-    '--idlist',
-    help='Generate text file with one ID per line'
-    )
-parser.add_argument(
-    '-f','--filter',
-    help='Remove IDs from GTF file missing in filter file. Filter file must contain ONE ID per line'
-    )
-parser.add_argument(
-    '-i','--input',
-    required=True,
-    help='Input GTF file',
-    type=str
-    )
-parser.add_argument(
-    '-o','--output',
-    required=True,
-    help='Output GTF file'
-    )
-
-args = parser.parse_args()
-
-#list filtered IDs
-ids = []
-
-### PARSE GTF FILE ###
-
-sys.stdout.write('Parsing GTF file...')
-
-# parse gtf file
-gtf_file = HTSeq.GFF_Reader(args.input)
-sys.stdout.write('DONE\n')
-
-### FILTER GTF ###
-# keep everyline not containing the id field, and the ones that contains the id field if id in id_filter list.
-
-if (args.filter):
-    sys.stdout.write('Filtering GTF file - Step 1...')
-
-    #open filter file and make it a list
-    id_filter=[line.rstrip('\n') for line in open(args.filter)]
-
-    pregtf = open('pregtf.gtf', 'w')
-
-    for gtf_line in gtf_file:
-        if gtf_line.type in ['gene', 'transcript', 'exon']:
-            ## filter by gene_id
-            if args.idfield == 'gene_id':
-                if gtf_line.attr['gene_id'] in id_filter:
-                    pregtf.write(gtf_line.get_gff_line())
-
-            ## filter by anything else
-            else:
-                if args.idfield not in gtf_line.attr.keys():
-                    pregtf.write(gtf_line.get_gff_line())
-
-                else:
-                    if gtf_line.attr[args.idfield] in id_filter:
-                        pregtf.write(gtf_line.get_gff_line())
-
-    pregtf.close()
-    sys.stdout.write('DONE\n')
-
-### OUTPUT GTF FILE ###
-# remove those lines with no childs (empty genes / transcripts)
-
-    sys.stdout.write('Filtering/Writing GTF file - Step 2...')
-    pregtf = open('pregtf.gtf', 'r')
-    gtf_output = open(args.output, 'w')
-    previous = []
-
-    if args.idfield == 'gene_id':
-        for line in pregtf:
-            gtf_output.write(line)
-
-    elif args.idfield == 'transcript_id':
-        # store first line
-        if pregtf:
-            first_line = pregtf.readline().split('\t')
-            previous.append(first_line)
-
-        for line in pregtf:
-            line=line.split('\t')
-
-            if line[2] == previous[-1][2]:
-                if line[2] != 'gene':
-                    gtf_output.write('\t'.join(previous[-1]))
-                    del previous[-1]
-                    previous.append(line)
-                else:
-                    del previous[-1]
-                    previous.append(line)
-            else:
-                gtf_output.write('\t'.join(previous[-1]))
-                del previous[-1]
-                previous.append(line)
-
-        #for last line
-        if previous[-1][2] == 'exon':
-            gtf_output.write('\t'.join(previous[-1]))
-
-    elif args.idfield == 'exon_id':
-        for line in reversed(open('pregtf.gtf','r').readlines()):
-            line = line.split('\t')
-            if line[2] == 'exon':
-                previous.append(line)
-            else:
-                if len(previous) == 0:
-                    continue
-                else:
-                    if line[2] == 'transcript':
-                        if previous[-1][2] == 'exon':
-                            previous.append(line)
-                    elif line[2] == 'gene':
-                        if previous[-1][2] == 'transcript':
-                            previous.append(line)
-                    else:
-                        print('FILTER ERROR')
-                        break
-        for i in previous[::-1]:
-            gtf_output.write('\t'.join(i))
-
-    gtf_output.close()
-    sys.stdout.write('DONE\n')
-
-
-### OUTPUT IDs GTF FILE ###
-
-if (args.idlist):
-    sys.stdout.write('Creating IDs list from GTF file...')
-    ids_list = open(args.idlist, 'w')
-    t = (re.match(r'([^_]*)',args.idfield)).group()
-
-    # creating list of ids from gtf output filtered
-    if (args.filter):
-        gtf_output = HTSeq.GFF_Reader(args.output)
-        for line in gtf_output:
-            if line.type == t:
-                ids_list.write(line.attr[args.idfield]+'\n')
-        ids_list.close()
-
-    # creating list of ids from input gtf file
-    else:
-        for line in gtf_file:
-            if line.type == t:
-                ids_list.write(line.attr[args.idfield]+'\n')
-        ids_list.close()
-
-    sys.stdout.write('DONE\n')
-
-
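Note: `validation_gtf.py` was only referenced by the commented-out rules removed at the end of this patch; in the Snakefile below, annotation filtering is handled by the `filter_anno_gtf.sh` helper, and exon records are pulled out by `get_exons_gtf` via `get_lines_w_pattern.sh` with `-c 3 -p exon`. A rough shell equivalent of that extraction, assuming a standard tab-separated GTF with the feature type in column 3 (the helper script itself is not shown in this patch):

```bash
# Keep only records whose feature type (column 3) is exactly "exon".
awk -F'\t' '$3 == "exon"' gene_annotations.filtered.gtf > exons.gtf
```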
diff --git a/snakemake/Snakefile b/snakemake/Snakefile
index 43009bd56f0b59b35db87a18adf9cc56066d2362..069b8ade6f20eee8e76dd3487b27443724455f53 100644
--- a/snakemake/Snakefile
+++ b/snakemake/Snakefile
@@ -7,36 +7,36 @@ localrules: finish, genome_process, filter_anno_gtf
 #################################################################################
 
 rule finish:
-    input:
-        idx_transcriptome = expand(os.path.join(config["output_dir"], "{organism}", "{prefix_name}","transcriptome_index_segemehl.idx"),
-            organism=config["organism"], prefix_name=config["prefix_name"]),
-        idx_genome = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "genome_index_segemehl.idx"),
-            organism=config["organism"], prefix_name=config["prefix_name"]),
-        exons = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "exons.bed"),
-            organism=config["organism"], prefix_name=config["prefix_name"]),
-        header = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}","headerOfCollapsedFasta.sam"),
-            organism=config["organism"], prefix_name=config["prefix_name"]),
+    input:
+        idx_transcriptome = expand(os.path.join(config["output_dir"], "{organism}", "{prefix_name}","transcriptome_index_segemehl.idx"),
+            organism=config["organism"], prefix_name=config["prefix_name"]),
+        idx_genome = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "genome_index_segemehl.idx"),
+            organism=config["organism"], prefix_name=config["prefix_name"]),
+        exons = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "exons.bed"),
+            organism=config["organism"], prefix_name=config["prefix_name"]),
+        header = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}","headerOfCollapsedFasta.sam"),
+            organism=config["organism"], prefix_name=config["prefix_name"]),
 
 
 #################################################################################
 ### Download and process genome IDs
 #################################################################################
 
 rule genome_process:
-    input:
-        script = os.path.join(config["scripts_dir"],"genome_process.sh"),
-    output:
-        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","genome_process.log"),
-        prefix = config["prefix_name"],
-        url = config["genome_url"],
-        organism=config["organism"]
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","genome_process.log")
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
+    input:
+        script = os.path.join(config["scripts_dir"],"genome_process.sh"),
+    output:
+        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","genome_process.log"),
+        prefix = config["prefix_name"],
+        url = config["genome_url"],
+        organism=config["organism"]
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","genome_process.log")
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
 
 
 #################################################################################
@@ -44,40 +44,40 @@ rule genome_process:
 #################################################################################
 
 rule filter_anno_gtf:
-    input:
-        script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"),
-    output:
-        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","download_filter_gtf.log"),
-        prefix = config["prefix_name"],
-        url = config["gtf_url"],
-        organism=config["organism"]
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","filter_anno_gtf.log")
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
+    input:
+        script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"),
+    output:
+        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","download_filter_gtf.log"),
+        prefix = config["prefix_name"],
+        url = config["gtf_url"],
+        organism=config["organism"]
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","filter_anno_gtf.log")
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
 
 
 #################################################################################
 ### Extract transcriptome sequences in FASTA from genome.
 #################################################################################
 
 rule extract_transcriptome_seqs:
-    input:
-        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa"),
-        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
-    output:
-        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome.fa")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
-    singularity:
-        "docker://zavolab/cufflinks:2.2.1"
-    shell:
-        "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
+    input:
+        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa"),
+        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
+    output:
+        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome.fa")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
+    singularity:
+        "docker://zavolab/cufflinks:2.2.1"
+    shell:
+        "(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
 
 
 ################################################################################
@@ -85,17 +85,18 @@ rule extract_transcriptome_seqs:
 ################################################################################
 
 rule trim_fasta:
-    input:
-        fasta = os.path.join(config["output_dir"], "{organism}","{prefix_name}","transcriptome.fa"),
-        script = os.path.join(config["scripts_dir"], "validation_fasta.py")
-    output:
-        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","trim_fasta.log")
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","trim_fasta.log")
-    shell:
-        "(python {input.script} --trim -i {input.fasta} -o {output.fasta}) &> {log}"
+    input:
+        fasta = os.path.join(config["output_dir"], "{organism}","{prefix_name}","transcriptome.fa"),
+    output:
+        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","trim_fasta.log")
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","trim_fasta.log")
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        """(awk -F" " "/^>/ {{print \$1; next}} 1" {input.fasta} > {output.fasta}) &> {log}"""
 
 
 #################################################################################
@@ -103,22 +104,22 @@ rule trim_fasta:
 #################################################################################
 
 rule generate_segemehl_index_transcriptome:
-    input:
-        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
-    output:
-        idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_index_segemehl.idx")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log"),
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log")
-    resources:
-        mem = 10,
-        threads = 8,
-        time = 6
-    singularity:
-        "docker://zavolab/segemehl:0.2.0"
-    shell:
-        "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
+    input:
+        fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
+    output:
+        idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_index_segemehl.idx")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log"),
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log")
+    resources:
+        mem = 10,
+        threads = 8,
+        time = 6
+    singularity:
+        "docker://zavolab/segemehl:0.2.0"
+    shell:
+        "(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
 
 
 #################################################################################
@@ -126,24 +127,24 @@ rule generate_segemehl_index_transcriptome:
 #################################################################################
 
 rule generate_segemehl_index_genome:
-    input:
-        #genome = config["genome"]
-        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
-
-    output:
-        idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","genome_index_segemehl.idx")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log"),
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log")
-    resources:
-        mem = 60,
-        threads = 8,
-        time = 6
-    singularity:
-        "docker://zavolab/segemehl:0.2.0"
-    shell:
-        "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
+    input:
+        #genome = config["genome"]
+        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
+
+    output:
+        idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","genome_index_segemehl.idx")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log"),
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log")
+    resources:
+        mem = 60,
+        threads = 8,
+        time = 6
+    singularity:
+        "docker://zavolab/segemehl:0.2.0"
+    shell:
+        "(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
 
 
 #################################################################################
@@ -151,19 +152,19 @@ rule generate_segemehl_index_genome:
 #################################################################################
 
 rule get_exons_gtf:
-    input:
-        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf"),
-        script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh")
-    output:
-        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","get_exons_gtf.log")
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}", "get_exons_gtf.log")
-    singularity:
-        "docker://zavolab/ubuntu:18.04"
-    shell:
-        "(bash {input.script} -f {input.gtf} -c 3 -p exon -o {output.exons} ) &> {log}"
+    input:
+        gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf"),
+        script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh")
+    output:
+        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","get_exons_gtf.log")
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}", "get_exons_gtf.log")
+    singularity:
+        "docker://zavolab/ubuntu:18.04"
+    shell:
+        "(bash {input.script} -f {input.gtf} -c 3 -p exon -o {output.exons} ) &> {log}"
 
 
 #################################################################################
@@ -171,19 +172,19 @@ rule get_exons_gtf:
 #################################################################################
 
 rule gtftobed:
-    input:
-        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf"),
-        script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R")
-    output:
-        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.bed")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","gtftobed.log")
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","gtftobed.log")
-    singularity:
-        "docker://zavolab/r-zavolab:3.5.1"
-    shell:
-        "(Rscript {input.script} --gtf {input.exons} -o {output.exons}) &> {log}"
+    input:
+        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf"),
+        script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R")
+    output:
+        exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.bed")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","gtftobed.log")
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","gtftobed.log")
+    singularity:
+        "docker://zavolab/r-zavolab:3.5.1"
+    shell:
+        "(Rscript {input.script} --gtf {input.exons} -o {output.exons}) &> {log}"
 
 
 #################################################################################
@@ -191,16 +192,16 @@ rule gtftobed:
 #################################################################################
 
 rule create_header_genome:
-    input:
-        #genome = config["genome"]
-        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
-    output:
-        header = os.path.join(config["output_dir"],"{organism}","{prefix_name}","headerOfCollapsedFasta.sam")
-    params:
-        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","create_header_genome.log")
-    log:
-        os.path.join(config["local_log"],"{organism}","{prefix_name}","create_header_genome.log")
-    singularity:
-        "docker://zavolab/samtools:1.8"
-    shell:
-        "(samtools dict -o {output.header} {input.genome}) &> {log}"
+    input:
+        #genome = config["genome"]
+        genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
+    output:
+        header = os.path.join(config["output_dir"],"{organism}","{prefix_name}","headerOfCollapsedFasta.sam")
+    params:
+        cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","create_header_genome.log")
+    log:
+        os.path.join(config["local_log"],"{organism}","{prefix_name}","create_header_genome.log")
+    singularity:
+        "docker://zavolab/samtools:1.8"
+    shell:
+        "(samtools dict -o {output.header} {input.genome}) &> {log}"
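Note: the two segemehl indexing rules are the only ones that declare `resources` (by the conventions used here, `mem` in GB, `threads`, and `time` in hours); Snakemake only enforces these when they are wired into the cluster submission command. A hedged sketch of such wiring for Slurm (the sbatch flags and units are assumptions, not taken from this repository):

```bash
# Hypothetical Slurm submission string; adjust flags to your site's setup.
snakemake \
    --snakefile snakemake/Snakefile \
    --cluster "sbatch --cpus-per-task={resources.threads} --mem={resources.mem}G --time={resources.time}:00:00" \
    --jobs 8
```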
diff --git a/snakemake/snakemake_unused_rules b/snakemake/snakemake_unused_rules
deleted file mode 100644
index f34d654900148210095340a25c5f64d88123bae1..0000000000000000000000000000000000000000
--- a/snakemake/snakemake_unused_rules
+++ /dev/null
@@ -1,56 +0,0 @@
-#################################################################################
-### DOWNLOAD DATA
-#################################################################################
-
-# rule download_files:
-#     input:
-#         files = config["input_files"]
-#     output:
-#         files = config["download_directory"]
-#     params:
-#         type = config["download_type"]
-#     log:
-#         os.path.join(config["local_log"], "download_files.log")
-#     shell:
-#         "(python scripts/download_files.py {}) &> {log}"
-
-
-#################################################################################
-### Filter GTF and get ID list
-#################################################################################
-
-# rule filter_gtf:
-#     input:
-#         idlist = os.path.join(config["output_dir"], "idlist_fasta.txt"),
-#         gtf = config["gtf"],
-#         script = os.path.join(config["scripts_dir"], "validation_gtf.py")
-#     output:
-#         idlist = os.path.join(config["output_dir"], "idlist_gtf.txt"),
-#         gtf = os.path.join(config["output_dir"], "gtf_filtered.gtf")
-#     params:
-#         cluster_log = os.path.join(config["cluster_log"], "filter_gtf.log")
-#     singularity:
-#         "docker://zavolab/python_htseq:3.6.5_0.10.0"
-#     log:
-#         os.path.join(config["local_log"], "filter_gtf.log")
-#     shell:
-#         "(python {input.script} --idlist {output.idlist} -f {input.idlist} -i {input.gtf} -o {output.gtf}) &> {log}"
-
-
-#################################################################################
-### Filter FASTA file
-#################################################################################
-
-# rule filter_fasta:
-#     input:
-#         fasta = os.path.join(config["output_dir"], "transcriptome_idtrim.fa"),
-#         idlist = os.path.join(config["output_dir"], "idlist_gtf.txt"),
-#         script = os.path.join(config["scripts_dir"], "validation_fasta.py")
-#     output:
-#         fasta = os.path.join(config["output_dir"], "transcriptome_filtered.fa")
-#     params:
-#         cluster_log = os.path.join(config["cluster_log"], "filter_fasta.log")
-#     log:
-#         os.path.join(config["local_log"], "filter_fasta.log")
-#     shell:
-#         "(python {input.script} --filter {input.idlist} -i {input.fasta} -o {output.fasta}) &> {log}"
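With `snakemake_unused_rules` gone and every remaining rule bound to a container image, a run only needs Snakemake and Singularity installed, as the updated README states. A minimal sketch of a local invocation (the config file name is an assumption, not taken from this patch):

```bash
# Run all steps inside their declared Singularity containers.
snakemake \
    --snakefile snakemake/Snakefile \
    --configfile config.yaml \
    --use-singularity \
    --cores 4
```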