#!/usr/bin/env Rscript
#==================#
# HEADER START #
#==================#
### Created: Sep 29, 2019
### Author: Alexander Kanitz
### Affiliation: Zavolan Group, Biozentrum, University of Basel
### Email: alexander.kanitz@alumni.ethz.ch
#==================#
# HEADER END #
#==================#
#==========================#
# PRE-REQUISITES START #
#==========================#
#---> LOAD OPTION PARSER <---#
# 'optparse' is needed before anything else so that the CLI can be defined;
# require() returns FALSE (rather than erroring) when the package is missing,
# which lets us emit a tailored error message.
if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { stop("Package 'optparse' required!\nExecution aborted.") }
#---> GET SCRIPT NAME <---#
# Recover this script's filename from the '--file=<path>' argument that
# Rscript passes to the interpreter; used in the parser's usage string.
args.all <- commandArgs(trailingOnly = FALSE)
name.field <- "--file="
script <- basename(sub(name.field, "", args.all[grep(name.field, args.all)]))
#---> DESCRIPTION <---#
# Metadata assembled into the description shown by --help.
description <- "Generates an ASCII-style pileup of read alignments in one or more BAM files
against one or more regions specified in a BED file.\n"
author <- "Author: Alexander Kanitz"
affiliation <- "Affiliation: Biozentrum, University of Basel"
email <- "Email: alexander.kanitz@alumni.ethz.ch"
version <- "1.1.0"
version_formatted <- paste("Version:", version, sep=" ")
# Packages loaded later in MAIN; also listed in the --help output.
requirements <- c("optparse", "rtracklayer", "GenomicAlignments", "tools")
requirements_txt <- paste("Requires:", paste(requirements, collapse=", "), sep=" ")
msg <- paste(description, author, affiliation, email, version_formatted, requirements_txt, sep="\n")
# Epilogue text appended below the option descriptions in the --help output
# (passed to OptionParser() as `epilogue` further down).
notes <- "Notes:
- For the input queries, consider the `--maximum-region-width` parameter, which
is provided for safety. While it is possible to increase it, wide regions may
require excessive memory.
"
#---> COMMAND-LINE ARGUMENTS <---#
## List of allowed/recognized arguments
# NOTE(review): option values are read back below via their hyphenated long
# flag names (e.g. cli$options[["output-directory"]]); this relies on the
# installed optparse version keeping hyphens in destination names — confirm.
option_list <- list(
make_option(
"--reference",
action="store",
type="character",
default=NULL,
help="Reference genome sequence in FASTA format. The file *MUST* be compressed with
BGZIP. If supplied, the reference sequence for the query region(s) will be added to
the output. Note that on the first run with a specific reference genome file, an FAI
index is generated which will take some time.",
metavar="file"
),
make_option(
"--annotations",
action="store",
type="character",
default=NULL,
help="Annotation file in GFF/GTF format used to annotate sequences. If supplied,
features overlapping the query region(s) will be visualized in the output. Ensure that
the argument to option `annotation-name-field` corresponds to a field in the
annotations, otherwise the script will fail.",
metavar="file"
),
make_option(
"--output-directory",
action="store",
type="character",
default=getwd(),
help="Output directory. One output file will be created for each region in `--bed` and
the filenames will be generated from the basenames of the supplied BAM file(s) and the
name field (4th column) of the BED file. [default \"%default\"]",
metavar="dir"
),
make_option(
"--maximum-region-width",
action="store",
type="integer",
default=200,
help="Maximum input region width. Use with care as wide regions will use excessive
resources. [default %default]",
metavar="int"
),
# Boolean switch; MAIN negates it into `collapse` (collapse is the default).
make_option(
"--do-not-collapse-alignments",
action="store_true",
type="logical",
default=FALSE,
help="Show alignments of reads with identical sequences individually."
),
make_option(
"--minimum-count",
action="store",
type="integer",
default=1,
help="Alignments of reads with less copies than the specified number will not be
printed. Option is not considered if `do-not-collapse-alignments` is set.
[default %default]",
metavar="int"
),
make_option(
"--annotation-name-field",
action="store",
type="character",
default="Name",
help="Annotation field used to populate the `name` column in the output.
[default \"%default\"]",
metavar="str"
),
make_option(
"--padding-character",
action="store",
default=".",
help="Character used for padding alignments. [default \"%default\"]",
metavar="char"
),
make_option(
"--indel-character",
action="store",
default="-",
help="Character to denote insertions and deletions in alignments. [default \"%default\"]",
metavar="char"
),
# Custom help flag; the parser's built-in one is disabled below
# (add_help_option=FALSE) so the two do not clash.
make_option(
c("-h", "--help"),
action="store_true",
default=FALSE,
help="Show this information and die."
),
make_option(
c("-v", "--verbose"),
action="store_true",
default=FALSE,
help="Print log messages to STDOUT."
)
)
## Parse options
opt_parser <- OptionParser(
usage=paste(script, "[--help] [--verbose] [OPTIONS] BED BAM [BAM2 ...]\n"),
option_list=option_list,
add_help_option=FALSE,
description=msg,
epilogue=notes
)
# Positional arguments: at least 2 (one BED file plus one or more BAM files).
cli <- parse_args(opt_parser, positional_arguments=c(2, Inf))
# Re-assign CLI arguments
fl.query <- cli$args[[1]]
fl.bam <- cli$args[2:length(cli$args)]
fl.ref <- cli$options[["reference"]]
fl.anno <- cli$options[["annotations"]]
dir.out <- cli$options[["output-directory"]]
width.max <- cli$options[["maximum-region-width"]]
# Invert the switch: `collapse` is TRUE by default (identical reads merged).
collapse <- ! cli$options[["do-not-collapse-alignments"]]
count.min <- cli$options[["minimum-count"]]
char.pad <- cli$options[["padding-character"]]
char.indel <- cli$options[["indel-character"]]
field.name.anno <- cli$options[["annotation-name-field"]]
verb <- cli$options[["verbose"]]
#==========================#
# PRE-REQUISITES END #
#==========================#
#================#
# MAIN START #
#================#
#---> START MESSAGE <---#
if (verb) cat("Starting '", script, "'...\n", sep="")
#---> LOAD REQUIRED LIBRARIES <---#
# Attach every package listed in `requirements`; abort with a clear message
# as soon as any one of them is unavailable.
if (verb) cat("Loading libraries...\n", sep="")
for (req in requirements) {
    loaded <- suppressWarnings(suppressPackageStartupMessages(require(req, character.only=TRUE)))
    if (!loaded) {
        stop("Package '", req, "' required!")
    }
}
#---> IMPORT FILES <---#
if (verb) cat("Importing input files...\n", sep="")
# The BED file defines the query regions; the reference genome and the
# annotations are only opened when the corresponding options were supplied.
bed <- import(con=fl.query)
if (!is.null(fl.ref)) {
    ref <- FaFile(fl.ref)
}
if (!is.null(fl.anno)) {
    anno <- import(con=fl.anno)
}
# Output filename prefix: basenames of all BAM files, joined by dots
fl.prefix <- paste(basename(file_path_sans_ext(fl.bam)), collapse=".")
#---> <---#
# Print status message
if (verb) cat("Iterating over regions in BED file...\n")
# Iterate over input regions; one output file is written per region.
for(index in seq_along(bed)) {
#---> PREPARE STACKING OF ALIGNMENTS <---#
# Assign current region
region <- bed[index]
# Print status message
if (verb) cat("Processing region '", mcols(region)[["name"]], "'...\n", sep="")
# Exit with error if region is too wide
# NOTE(review): the message names the internal variable `width.max`; the
# user-facing flag is `--maximum-region-width` — consider aligning the text.
if (width(region) > width.max) {stop("Supplied region too large. Consider increasing the `width.max` parameter, but note that for very large regions, the memory footprint may be excessive.")}
# Initialize DNAStringSet container object
seq.out <- DNAStringSet()
#---> ADD READ ALIGNMENTS <---#
# Print status message
if (verb) cat("Iterating over BAM files...\n")
# Iterate over BAM files
for (bam in fl.bam) {
# Print status message
if (verb) cat("Adding read alignments for file '", bam,"'...\n", sep="")
# Get alignments for current BAM file; reads are laid out against the region,
# with deletions/skips shown as `char.indel` and flanks padded with `char.pad`.
seq.stack <- stackStringsFromBam(
bam,
param=region,
D.letter=char.indel,
N.letter=char.indel,
Lpadding.letter=char.pad,
Rpadding.letter=char.pad
)
# Add to container
seq.out <- append(seq.out, seq.stack)
}
# Get complement if on minus strand (reversal is deferred until after sorting)
if (as.character(strand(region))[[1]] == "-") {
seq.out <- complement(seq.out)
}
# Convert to character
seq.out <- as.character(seq.out)
# Convert to dataframe for writing output; `count` stays NA unless collapsing
df <- data.frame(seq=seq.out, count=rep(NA, length(seq.out)), stringsAsFactors=FALSE)
#---> COLLAPSE READ ALIGNMENTS <---#
if (collapse) {
# Print status message
if (verb) cat("Collapsing identical reads/alignments...\n")
# Get unique alignments; table() yields (sequence, frequency) pairs
df <- data.frame(table(df[["seq"]]), stringsAsFactors=FALSE)
# table() on empty input collapses to fewer columns; normalize to empty 2-col
if (! ncol(df) == 2) df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(df) <- c("seq", "count")
df[["seq"]] <- as.character(df[["seq"]])
# Filter out any alignments that do not make the specified minimum count cutoff
df <- df[df[["count"]] >= count.min, ]
}
#---> SORTING ALIGNMENTS <---#
# Print status message
if (verb) cat("Sorting alignments...\n")
# Reverse sequence if on minus strand, so that "first nucleotide" below means
# the biologically 5' end of the alignment
if (as.character(strand(region))[[1]] == "-") {
df[["seq"]] <- reverse(df[["seq"]])
}
# Sort by position of first nucleotide, count and position of last nucleotide
if (nrow(df)) {
# All stacked strings share the region's width, so row 1's length suffices
last_char <- nchar(df[["seq"]][[1]])
# NOTE(review): `char.pad` is spliced into a regex character class; padding
# characters that are regex metacharacters (e.g. `]`, `^`) would break the
# pattern — only the default `.` is explicitly escaped here.
pos.nuc.first <- regexpr(paste0("[^", char.pad, "\\.]"), df[["seq"]])
pos.nuc.last <- last_char - regexpr(paste0("[^", char.pad, "\\.]"), unlist(lapply(df[["seq"]], reverse))) + 1
# NOTE(review): a vector-valued `decreasing` is only honored by order()'s
# radix method; confirm the inputs always dispatch to it.
df <- df[order(
pos.nuc.first, df[["count"]], pos.nuc.last,
decreasing=c(FALSE, TRUE, FALSE)
), ]
}
# Reverse sequence again if on minus strand (restore display orientation)
if (as.character(strand(region))[[1]] == "-") {
df[["seq"]] <- reverse(df[["seq"]])
}
#---> ADD REFERENCE SEQUENCE <---#
if (! is.null(fl.ref)) {
# Print status message
if (verb) cat("Adding reference sequence...\n")
# Generate name for reference sequence ("chrom:start-end:strand"), shown in
# the output's count column for the reference row
name.ref <- paste(
seqnames(region),
paste(start(region), end(region), sep="-"),
strand(region), sep=":"
)
# Get sequence
row.ref <- getSeq(ref, region) # will create .fai index if not present
# Get reverse if on minus strand (reads above were complemented, not
# reverse-complemented, so only a reversal is needed to match)
if (as.character(strand(region))[[1]] == "-") {
row.ref <- reverse(row.ref)
}
# Compile row and add to dataframe; reference goes on top of the alignments
df.ref <- data.frame(seq=as.character(row.ref), count=name.ref, stringsAsFactors=FALSE)
df <- rbind(df.ref, df)
}
#---> ADD ANNOTATIONS FOR REGION <---#
# Add annotations
if (! is.null(fl.anno)) {
# Print status message
if (verb) cat("Adding overlapping features...\n")
# Find overlapping features
features <- anno[to(findOverlaps(region, anno))]
# Set order of addition from most upstream to most downstream
if (as.character(strand(region)) == "+") {
order.features <- order(start(features))
} else {
order.features <- rev(order(end(features)))
}
# Order features
features <- features[order.features]
# Iterate over features, rendering each as a row of '>'/'<' markers padded to
# the region width.
# NOTE(review): with zero overlapping features sapply() returns an empty
# list, and the data.frame() call below would fail — unverified edge case.
seqs <- sapply(seq_along(features), function(index) {
feat <- features[index]
# Set markup character depending on strand
char.anno <- ifelse(strand(region) == "+", ">", "<")
# Calculate lengths of feature (in region) & left/right padding; features
# extending beyond the region are clipped to the region's bounds
diff.start <- start(feat) - start(region)
n.padl <- max(0, diff.start)
n.anno <- min(min(0, diff.start) + width(feat), width(region) - n.padl)
n.padr <- width(region) - n.padl - n.anno
# Generate string encompassing feature
str.padl <- paste(rep(char.pad, n.padl), collapse="")
str.anno <- paste(rep(char.anno, n.anno), collapse="")
str.padr <- paste(rep(char.pad, n.padr), collapse="")
# str.final is the last evaluated expression and thus the return value
str.final <- paste0(str.padl, str.anno, str.padr)
})
df.anno <- data.frame(seq=seqs, count=mcols(features)[[field.name.anno]], stringsAsFactors=FALSE)
df <- rbind(df.anno, df)
}
#---> WRITE OUTPUT <---#
# Filename encodes the BAM basenames, region name and collapse/count settings
if (collapse) {
name.file <- paste(fl.prefix, mcols(region)[["name"]], "min", as.character(count.min), "pileup", "tab", sep=".")
} else {
name.file <- paste(fl.prefix, mcols(region)[["name"]], "uncollapsed", "pileup", "tab", sep=".")
}
fl.out <- file.path(dir.out, name.file)
# Print status message
if (verb) cat("Writing output to file '", fl.out, "'...\n", sep="")
# Write tab-separated output
write.table(df, fl.out, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE)
}
#---> END MESSAGE <---#
# In verbose mode, confirm completion and dump the session info for
# reproducibility.
if (verb) {
    cat("Done.\n\nSession info:\n")
    print(sessionInfo())
}
#================#
# MAIN END #
#================#