Skip to content
Snippets Groups Projects
Commit 0b9c8ff3 authored by Alex Kanitz's avatar Alex Kanitz
Browse files

refactor: use awk to trim FASTA identifiers

parent 2983adfc
No related branches found
No related tags found
1 merge request!6refactor: use awk to trim FASTA identifiers
Pipeline #14405 passed
......@@ -31,10 +31,10 @@ package manager. We recommend that you install
For improved reproducibility and reusability of the workflow, as well as an
easy means to run it on a high performance computing (HPC) cluster managed,
e.g., by [Slurm][slurm], almost all steps of the workflow run in their own
container. As a consequence, running this workflow has very few individual
dependencies. It does, however, require that the container engine
[Singularity][singularity] is installed.
e.g., by [Slurm][slurm], all steps of the workflow run in their own container.
As a consequence, running this workflow has very few individual dependencies. It
does, however, require that the container engine [Singularity][singularity] is
installed.
Create and activate the environment with necessary dependencies with conda:
......
#!/bin/bash
# Split a delimited text file into one output file per distinct value of a
# given column: each input line is appended to "<value>.txt" in the current
# working directory.

# Setting to strict mode: abort on errors, unset variables, pipe failures.
set -euo pipefail
IFS=$'\n\t'

#### FUNCTIONS

# Print the CLI synopsis.
usage()
{
    echo "usage: split_file_by_colum_value.sh [[[-f file] [-c column] [-s separator]] | [-h]]"
}

#### MAIN

# Test whether all required inputs are present.
# ${1:-} avoids an "unbound variable" abort under `set -u` when no
# arguments are given.
if [[ ${1:-} == -h ]] || [[ $# -ne 6 ]]
then
    usage
    exit
fi

# Get parameters
while [ $# -gt 0 ]; do
    case $1 in
        -f | --file )      shift
                           filename=$1
                           ;;
        -c | --column )    shift
                           column=$1
                           ;;
        -s | --separator ) shift
                           separator=$1
                           ;;
        -h | --help )      usage
                           exit
                           ;;
        * )                usage
                           exit 1
    esac
    shift
done

printf "\nFile split by column %i values.\n" "${column}"

# Split file by values in requested column.
# NOTE: the field separator must be passed in double quotes — with single
# quotes ('$sep') awk received the literal string "$sep" instead of the
# user-supplied separator. The redirection target is parenthesized because
# an unparenthesized concatenation after ">" is ambiguous in awk.
awk -v col="${column}" -F"${separator}" '{print > ($col ".txt")}' "${filename}"

printf "\nDONE!\n"
#!/usr/bin/env python
import sys
import re
import gzip
from argparse import ArgumentParser, RawTextHelpFormatter
### Created: Mar 5, 2019
### Author: Paula Iborra
### Company: Zavolan Group, Biozentrum, University of Basel
### ARGUMENTS ###
# Command-line interface: filtering/trimming options for FASTA files.
parser = ArgumentParser(
    description="Script to filter FASTA files"
)
parser.add_argument(
    '-v', '--version',
    action='version',
    version='%(prog)s 1.0',
    help="Show program's version number and exit"
)
parser.add_argument(
    '--trim',
    # Typos fixed vs. the original help text ("Character's", "infront").
    help=(
        "Characters used to trim the ID. Remove anything that follows the "
        "characters. Write \\ in front of '.' and '-' "
        "(i.e. trim=\"$\\.\\-|_\"). Default: first white space"
    ),
    type=str,
    nargs='?',
    default=""
)
parser.add_argument(
    '--idlist',
    help="Generate text file with the sequences IDs. One ID per line."
)
parser.add_argument(
    '-f', '--filter',
    help=(
        "Input ID list. Filter IDs and sequences from FASTA file with the "
        "mode selected. Filter file must contain ONE ID per line"
    ),
)
parser.add_argument(
    '-m', '--mode',
    help=(
        "Type of filtering fasta file: keep (k) or discard (d) IDs "
        "contained in the ID list file."
    ),
    choices=('k', 'd')
)
parser.add_argument(
    '-r', '--remove',
    help="Remove sequences from FASTA file longer than specified length.",
    type=int
)
parser.add_argument(
    '-i', '--input',
    required=True,
    help="Input FASTA file",
    type=str
)
parser.add_argument(
    '-o', '--output',
    help="Output FASTA file"
)
args = parser.parse_args()

# --filter requires an explicit --mode. parser.error() prints usage plus the
# message to stderr and exits with status 2 (argparse convention), instead
# of the ad-hoc sys.exit() the original used.
if args.filter and not args.mode:
    parser.error(
        "Mode argument required when using filter option. (--mode, -m). "
        "See --help option."
    )
### PARSE FASTA FILE ###
class Seq:
    """Container for one FASTA record: identifier, sequence and features.

    The original class defined ``__init__`` three times; only the last
    definition took effect, so instances started with only ``seq`` set.
    All three attributes are now initialized in a single constructor.
    """

    def __init__(self):
        self.id = ""        # sequence identifier (text after '>')
        self.seq = ""       # sequence body, newlines included
        self.features = ""  # remainder of the header line after the ID
# Open plain-text or gzip-compressed input transparently.
if args.input.endswith('.gz'):
    f = gzip.open(args.input, 'rt')
else:
    f = open(args.input)

record = []  # list of parsed Seq records
nrec = -1    # index of the record currently being filled
inseq = 0    # 1 while inside the sequence body of a record

# Parse the FASTA file into `record`.
sys.stdout.write("Parsing FASTA file...")
for line in f:
    if re.match(r'^>', line):
        # Header line: start a new record.
        nrec += 1
        record.append(Seq())
        # Define the ID of the record: up to the first whitespace by
        # default, or up to the first occurrence of any --trim character.
        if not args.trim:
            mobj = re.match(r'^>(\S*)(.*)', line)
        else:
            mobj = re.match(r'^>([^%s]*)(.*)' % args.trim, line)
        # Add ID and features.
        if mobj:
            record[nrec].id = mobj.group(1)
            record[nrec].features = mobj.group(2)
            inseq = 0
    else:
        # Sequence line; the trailing newline is kept (length checks
        # downstream subtract 1 to compensate).
        if inseq == 0:
            inseq = 1
            record[nrec].seq = line
        else:
            record[nrec].seq = record[nrec].seq + line
# Close the input handle (the original leaked it).
f.close()
sys.stdout.write("DONE\n")
## ID FILTER LIST ##
if args.filter:
    sys.stdout.write("Filtering FASTA file...")
    # One ID per line; the with-block closes the handle (the original
    # left it open).
    with open(args.filter) as filter_file:
        id_filter = [line.rstrip('\n') for line in filter_file]
    sys.stdout.write("DONE\n")
## OUTPUT FASTA FILE ##
# The original had four near-identical loops (filter-keep/filter-discard ×
# with/without --remove) plus a redundant output.close() inside the
# with-block; the conditions are merged into a single pass here.
if args.output:
    sys.stdout.write("Writing FASTA file...")
    with open(args.output, 'w') as output:
        for rec in record:
            # ID filter: keep (mode 'k') or discard (mode 'd') records
            # whose ID appears in the filter list.
            if args.filter:
                in_list = rec.id in id_filter
                keep = in_list if args.mode == 'k' else not in_list
                if not keep:
                    continue
            # Length filter: drop sequences longer than --remove. The
            # stored sequence keeps its trailing newline, hence the -1.
            # As in the original, --remove 0 means "no length filter".
            if args.remove and len(rec.seq) - 1 > args.remove:
                continue
            output.write(">%s\n%s" % (rec.id, rec.seq))
    sys.stdout.write("DONE\n")
## OUTPUT LIST IDs ##
idlist = []
if args.idlist:
    sys.stdout.write("Creating IDs list from FASTA file...")
    # NOTE(review): IDs are read back from the *output* FASTA, so --idlist
    # only works together with --output — confirm this is intended.
    # Both files are managed by with-blocks (the original leaked the
    # FASTA handle and called close() redundantly inside a with-block).
    with open(args.output, 'r') as fasta:
        for line in fasta:
            if line.startswith('>'):
                idlist.append(line[1:])
    idlist.sort()
    with open(args.idlist, 'w') as id_list:
        id_list.write(''.join(idlist))
    sys.stdout.write("DONE\n")
#!/usr/bin/env python
import sys
import re
import HTSeq
from argparse import ArgumentParser
### Created: Mar 5, 2019
### Author: Paula Iborra
### Company: Zavolan Group, Biozentrum, University of Basel
### ARGUMENTS ###
# Build the command-line interface for GTF filtering.
parser = ArgumentParser(description='Script to filter GTF files')
parser.add_argument(
    '-v', '--version',
    action='version',
    version='%(prog)s 1.0',
    help='Show program\'s version number and exit',
)
parser.add_argument(
    '--idfield',
    default='transcript_id',
    choices=('transcript_id', 'gene_id', 'exon_id'),
    help='Name of field that contains ID of interest. Default: transcript_id',
)
parser.add_argument(
    '--idlist',
    help='Generate text file with one ID per line',
)
parser.add_argument(
    '-f', '--filter',
    help='Remove IDs from GTF file missing in filter file. Filter file must contain ONE ID per line',
)
parser.add_argument(
    '-i', '--input',
    required=True,
    type=str,
    help='Input GTF file',
)
parser.add_argument(
    '-o', '--output',
    required=True,
    help='Output GTF file',
)
args = parser.parse_args()

# List of filtered IDs (kept for compatibility; not referenced below).
ids = []

### PARSE GTF FILE ###
sys.stdout.write('Parsing GTF file...')
# GFF_Reader is lazy — actual parsing happens when the reader is iterated.
gtf_file = HTSeq.GFF_Reader(args.input)
sys.stdout.write('DONE\n')
### FILTER GTF ###
# Keep every line not containing the ID field, and the lines that do carry
# it when the ID is present in the filter list.
if args.filter:
    sys.stdout.write('Filtering GTF file - Step 1...')
    # Load the filter file into a list, one ID per line. The with-block
    # closes the handle (the original leaked it).
    with open(args.filter) as filter_file:
        id_filter = [line.rstrip('\n') for line in filter_file]
    # NOTE(review): intermediate results go to a hard-coded temp file in
    # the current working directory; step 2 below reads it back.
    with open('pregtf.gtf', 'w') as pregtf:
        for gtf_line in gtf_file:
            # Only gene/transcript/exon features are considered.
            if gtf_line.type not in ['gene', 'transcript', 'exon']:
                continue
            if args.idfield == 'gene_id':
                # Filter by gene_id.
                if gtf_line.attr['gene_id'] in id_filter:
                    pregtf.write(gtf_line.get_gff_line())
            elif args.idfield not in gtf_line.attr.keys():
                # Lines without the ID field pass through unfiltered.
                pregtf.write(gtf_line.get_gff_line())
            elif gtf_line.attr[args.idfield] in id_filter:
                pregtf.write(gtf_line.get_gff_line())
    sys.stdout.write('DONE\n')
### OUTPUT GTF FILE ###
# Second pass: drop parents with no children (empty genes / transcripts),
# using a one-element `previous` stack to look at adjacent feature types.
# NOTE(review): this runs even when --filter was not given, in which case
# 'pregtf.gtf' may not exist — confirm intended usage.
sys.stdout.write('Filtering/Writing GTF file - Step 2...')
pregtf = open('pregtf.gtf', 'r')
gtf_output = open(args.output, 'w')
previous = []
if args.idfield == 'gene_id':
    # Gene-level filtering needs no pruning: copy everything through.
    for line in pregtf:
        gtf_output.write(line)
elif args.idfield == 'transcript_id':
    # store first line
    # NOTE(review): `if pregtf:` is always true for an open file object;
    # presumably meant to guard against an empty file — confirm.
    if pregtf:
        first_line = pregtf.readline().split('\t')
        previous.append(first_line)
    for line in pregtf:
        line = line.split('\t')
        # Column 3 (index 2) is the GTF feature type.
        if line[2] == previous[-1][2]:
            if line[2] != 'gene':
                # Two consecutive non-gene features of the same type:
                # the previous one has a child, so emit it.
                gtf_output.write('\t'.join(previous[-1]))
                del previous[-1]
                previous.append(line)
            else:
                # Two consecutive 'gene' lines: the first gene is empty,
                # silently drop it.
                del previous[-1]
                previous.append(line)
        else:
            # Feature type changed: the previous line is kept.
            gtf_output.write('\t'.join(previous[-1]))
            del previous[-1]
            previous.append(line)
    # for last line
    if previous[-1][2] == 'exon':
        gtf_output.write('\t'.join(previous[-1]))
elif args.idfield == 'exon_id':
    # Walk the file bottom-up so a parent is only kept when a child was
    # already seen; results are reversed back into file order at the end.
    for line in reversed(open('pregtf.gtf', 'r').readlines()):
        line = line.split('\t')
        if line[2] == 'exon':
            previous.append(line)
        else:
            if len(previous) == 0:
                # No children collected yet: drop this parent.
                continue
            else:
                if line[2] == 'transcript':
                    # Keep a transcript only if directly above an exon.
                    if previous[-1][2] == 'exon':
                        previous.append(line)
                elif line[2] == 'gene':
                    # Keep a gene only if directly above a kept transcript.
                    if previous[-1][2] == 'transcript':
                        previous.append(line)
                else:
                    print('FILTER ERROR')
                    break
    for i in previous[::-1]:
        gtf_output.write('\t'.join(i))
gtf_output.close()
sys.stdout.write('DONE\n')
### OUTPUT IDs GTF FILE ###
if args.idlist:
    sys.stdout.write('Creating IDs list from GTF file...')
    # Feature type matching the ID field: 'transcript_id' -> 'transcript',
    # 'gene_id' -> 'gene', 'exon_id' -> 'exon'.
    t = re.match(r'([^_]*)', args.idfield).group()
    # Read IDs from the filtered output when filtering was requested,
    # otherwise from the original input. The original duplicated the whole
    # loop in both branches and leaked the output handle.
    source = HTSeq.GFF_Reader(args.output) if args.filter else gtf_file
    with open(args.idlist, 'w') as ids_list:
        for line in source:
            if line.type == t:
                ids_list.write(line.attr[args.idfield] + '\n')
    sys.stdout.write('DONE\n')
......@@ -7,36 +7,36 @@ localrules: finish, genome_process, filter_anno_gtf
#################################################################################
rule finish:
input:
idx_transcriptome = expand(os.path.join(config["output_dir"], "{organism}", "{prefix_name}","transcriptome_index_segemehl.idx"),
organism=config["organism"], prefix_name=config["prefix_name"]),
idx_genome = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "genome_index_segemehl.idx"),
organism=config["organism"], prefix_name=config["prefix_name"]),
exons = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "exons.bed"),
organism=config["organism"], prefix_name=config["prefix_name"]),
header = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}","headerOfCollapsedFasta.sam"),
organism=config["organism"], prefix_name=config["prefix_name"]),
input:
idx_transcriptome = expand(os.path.join(config["output_dir"], "{organism}", "{prefix_name}","transcriptome_index_segemehl.idx"),
organism=config["organism"], prefix_name=config["prefix_name"]),
idx_genome = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "genome_index_segemehl.idx"),
organism=config["organism"], prefix_name=config["prefix_name"]),
exons = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}", "exons.bed"),
organism=config["organism"], prefix_name=config["prefix_name"]),
header = expand(os.path.join(config["output_dir"],"{organism}", "{prefix_name}","headerOfCollapsedFasta.sam"),
organism=config["organism"], prefix_name=config["prefix_name"]),
#################################################################################
### Download and process genome IDs
#################################################################################
rule genome_process:
input:
script = os.path.join(config["scripts_dir"],"genome_process.sh"),
output:
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","genome_process.log"),
prefix = config["prefix_name"],
url = config["genome_url"],
organism=config["organism"]
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","genome_process.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
input:
script = os.path.join(config["scripts_dir"],"genome_process.sh"),
output:
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","genome_process.log"),
prefix = config["prefix_name"],
url = config["genome_url"],
organism=config["organism"]
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","genome_process.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
#################################################################################
......@@ -44,40 +44,40 @@ rule genome_process:
#################################################################################
rule filter_anno_gtf:
input:
script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"),
output:
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","download_filter_gtf.log"),
prefix = config["prefix_name"],
url = config["gtf_url"],
organism=config["organism"]
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","filter_anno_gtf.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
input:
script = os.path.join(config["scripts_dir"],"filter_anno_gtf.sh"),
output:
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","download_filter_gtf.log"),
prefix = config["prefix_name"],
url = config["gtf_url"],
organism=config["organism"]
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","filter_anno_gtf.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} {params.prefix} {params.organism} {params.url}) &> {log}"
#################################################################################
### Extract transcriptome sequences in FASTA from genome.
#################################################################################
rule extract_transcriptome_seqs:
input:
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa"),
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
output:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
singularity:
"docker://zavolab/cufflinks:2.2.1"
shell:
"(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
input:
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa"),
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf")
output:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","extract_transcriptome_seqs.log")
singularity:
"docker://zavolab/cufflinks:2.2.1"
shell:
"(gffread -w {output.fasta} -g {input.genome} {input.gtf}) &> {log}"
################################################################################
......@@ -85,17 +85,18 @@ rule extract_transcriptome_seqs:
################################################################################
rule trim_fasta:
input:
fasta = os.path.join(config["output_dir"], "{organism}","{prefix_name}","transcriptome.fa"),
script = os.path.join(config["scripts_dir"], "validation_fasta.py")
output:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","trim_fasta.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","trim_fasta.log")
shell:
"(python {input.script} --trim -i {input.fasta} -o {output.fasta}) &> {log}"
input:
fasta = os.path.join(config["output_dir"], "{organism}","{prefix_name}","transcriptome.fa"),
output:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","trim_fasta.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","trim_fasta.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"""(awk -F" " "/^>/ {{print \$1; next}} 1" {input.fasta} > {output.fasta}) &> {log}"""
#################################################################################
......@@ -103,22 +104,22 @@ rule trim_fasta:
#################################################################################
rule generate_segemehl_index_transcriptome:
input:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
output:
idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_index_segemehl.idx")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log"),
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log")
resources:
mem = 10,
threads = 8,
time = 6
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
input:
fasta = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_idtrim.fa")
output:
idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","transcriptome_index_segemehl.idx")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log"),
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_transcriptome.log")
resources:
mem = 10,
threads = 8,
time = 6
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.fasta}) &> {log}"
#################################################################################
......@@ -126,24 +127,24 @@ rule generate_segemehl_index_transcriptome:
#################################################################################
rule generate_segemehl_index_genome:
input:
#genome = config["genome"]
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
output:
idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","genome_index_segemehl.idx")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log"),
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log")
resources:
mem = 60,
threads = 8,
time = 6
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
input:
#genome = config["genome"]
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
output:
idx = os.path.join(config["output_dir"],"{organism}","{prefix_name}","genome_index_segemehl.idx")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log"),
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","generate_segemehl_index_genome.log")
resources:
mem = 60,
threads = 8,
time = 6
singularity:
"docker://zavolab/segemehl:0.2.0"
shell:
"(segemehl.x -x {output.idx} -d {input.genome}) &> {log}"
#################################################################################
......@@ -151,19 +152,19 @@ rule generate_segemehl_index_genome:
#################################################################################
rule get_exons_gtf:
input:
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf"),
script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh")
output:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","get_exons_gtf.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}", "get_exons_gtf.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} -f {input.gtf} -c 3 -p exon -o {output.exons} ) &> {log}"
input:
gtf = os.path.join(config["output_dir"],"{organism}","{prefix_name}","gene_annotations.filtered.gtf"),
script = os.path.join(config["scripts_dir"], "get_lines_w_pattern.sh")
output:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","get_exons_gtf.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}", "get_exons_gtf.log")
singularity:
"docker://zavolab/ubuntu:18.04"
shell:
"(bash {input.script} -f {input.gtf} -c 3 -p exon -o {output.exons} ) &> {log}"
#################################################################################
......@@ -171,19 +172,19 @@ rule get_exons_gtf:
#################################################################################
rule gtftobed:
input:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf"),
script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R")
output:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.bed")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","gtftobed.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","gtftobed.log")
singularity:
"docker://zavolab/r-zavolab:3.5.1"
shell:
"(Rscript {input.script} --gtf {input.exons} -o {output.exons}) &> {log}"
input:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.gtf"),
script = os.path.join(config["scripts_dir"], "gtf_exons_bed.1.1.2.R")
output:
exons = os.path.join(config["output_dir"],"{organism}","{prefix_name}","exons.bed")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","gtftobed.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","gtftobed.log")
singularity:
"docker://zavolab/r-zavolab:3.5.1"
shell:
"(Rscript {input.script} --gtf {input.exons} -o {output.exons}) &> {log}"
#################################################################################
......@@ -191,16 +192,16 @@ rule gtftobed:
#################################################################################
rule create_header_genome:
input:
#genome = config["genome"]
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
output:
header = os.path.join(config["output_dir"],"{organism}","{prefix_name}","headerOfCollapsedFasta.sam")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","create_header_genome.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","create_header_genome.log")
singularity:
"docker://zavolab/samtools:1.8"
shell:
"(samtools dict -o {output.header} {input.genome}) &> {log}"
input:
#genome = config["genome"]
genome = os.path.join(config["output_dir"],"{organism}","{prefix_name}", "genome.processed.fa")
output:
header = os.path.join(config["output_dir"],"{organism}","{prefix_name}","headerOfCollapsedFasta.sam")
params:
cluster_log = os.path.join(config["cluster_log"],"{organism}","{prefix_name}","create_header_genome.log")
log:
os.path.join(config["local_log"],"{organism}","{prefix_name}","create_header_genome.log")
singularity:
"docker://zavolab/samtools:1.8"
shell:
"(samtools dict -o {output.header} {input.genome}) &> {log}"
#################################################################################
### DOWNLOAD DATA
#################################################################################
# rule download_files:
# input:
# files = config["input_files"]
# output:
# files = config["download_directory"]
# params:
# type = config["download_type"]
# log:
# os.path.join(config["local_log"], "download_files.log")
# shell:
# "(python scripts/download_files.py {}) &> {log}"
#################################################################################
### Filter GTF and get ID list
#################################################################################
# rule filter_gtf:
# input:
# idlist = os.path.join(config["output_dir"], "idlist_fasta.txt"),
# gtf = config["gtf"],
# script = os.path.join(config["scripts_dir"], "validation_gtf.py")
# output:
# idlist = os.path.join(config["output_dir"], "idlist_gtf.txt"),
# gtf = os.path.join(config["output_dir"], "gtf_filtered.gtf")
# params:
# cluster_log = os.path.join(config["cluster_log"], "filter_gtf.log")
# singularity:
# "docker://zavolab/python_htseq:3.6.5_0.10.0"
# log:
# os.path.join(config["local_log"], "filter_gtf.log")
# shell:
# "(python {input.script} --idlist {output.idlist} -f {input.idlist} -i {input.gtf} -o {output.gtf}) &> {log}"
#################################################################################
### Filter FASTA file
#################################################################################
# rule filter_fasta:
# input:
# fasta = os.path.join(config["output_dir"], "transcriptome_idtrim.fa"),
# idlist = os.path.join(config["output_dir"], "idlist_gtf.txt"),
# script = os.path.join(config["scripts_dir"], "validation_fasta.py")
# output:
# fasta = os.path.join(config["output_dir"], "transcriptome_filtered.fa")
# params:
# cluster_log = os.path.join(config["cluster_log"], "filter_fasta.log")
# log:
# os.path.join(config["local_log"], "filter_fasta.log")
# shell:
# "(python {input.script} --filter {input.idlist} -i {input.fasta} -o {output.fasta}) &> {log}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment