ascii_alignment_pileup.R

#!/usr/bin/env Rscript

#==================#
#   HEADER START   #
#==================#
### Created: Sep 29, 2019
### Author: Alexander Kanitz
### Affiliation: Zavolan Group, Biozentrum, University of Basel
### Email: alexander.kanitz@alumni.ethz.ch
#==================#
#    HEADER END    #
#==================#


#==========================#
#   PRE-REQUISITES START   #
#==========================#
#---> LOAD OPTION PARSER <---#
if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { stop("Package 'optparse' required!\nExecution aborted.") }

#---> GET SCRIPT NAME <---#
args.all <- commandArgs(trailingOnly = FALSE)
name.field <- "--file="
script <- basename(sub(name.field, "", args.all[grep(name.field, args.all)]))

#---> DESCRIPTION <---#
description <- "Generates an ASCII-style pileup of read alignments in one or more BAM files
against one or more regions specified in a BED file.\n"
author <- "Author: Alexander Kanitz"
affiliation <- "Affiliation: Biozentrum, University of Basel"
email <- "Email: alexander.kanitz@alumni.ethz.ch"
version <- "1.1.0"
version_formatted <- paste("Version:", version, sep=" ")
requirements <- c("optparse", "rtracklayer", "GenomicAlignments", "tools")
requirements_txt <- paste("Requires:", paste(requirements, collapse=", "), sep=" ")
msg <- paste(description, author, affiliation, email, version_formatted, requirements_txt, sep="\n")
notes <- "Notes:
- For the input queries, consider the `--maximum-region-width` parameter, which
  is provided for safety. While it is possible to increase it, wide regions may
  require excessive memory.
"

#---> COMMAND-LINE ARGUMENTS <---#
## List of allowed/recognized arguments
option_list <- list(
        make_option(
            "--reference",
            action="store",
            type="character",
            default=NULL,
            help="Reference genome sequence in FASTA format. The file *MUST* be compressed with
            BGZIP. If supplied, the reference sequence for the query region(s) will be added to
            the output. Note that on the first run with a specific reference genome file, an FAI
            index is generated which will take some time.",
            metavar="file"
        ),
        make_option(
            "--annotations",
            action="store",
            type="character",
            default=NULL,
            help="Annotation file in GFF/GTF format used to annotate sequences. If supplied,
            features overlapping the query region(s) will be visualized in the output. Ensure that
            the argument to option `annotation-name-field` corresponds to a field in the
            annotations, otherwise the script will fail.",
            metavar="file"
        ),
        make_option(
            "--output-directory",
            action="store",
            type="character",
            default=getwd(),
            help="Output directory. One output file will be created for each region in `--bed` and
            the filenames will be generated from the basenames of the supplied BAM file(s) and the
            name field (4th column) of the BED file. [default \"%default\"]",
            metavar="dir"
        ),
        make_option(
            "--maximum-region-width",
            action="store",
            type="integer",
            default=200,
            help="Maximum input region width. Use with care as wide regions will use excessive
            resources. [default %default]",
            metavar="int"
        ),
        make_option(
            "--do-not-collapse-alignments",
            action="store_true",
            type="logical",
            default=FALSE,
            help="Show alignments of reads with identical sequences individually."
        ),
        make_option(
            "--minimum-count",
            action="store",
            type="integer",
            default=1,
            help="Alignments of reads with less copies than the specified number will not be
            printed. Option is not considered if `do-not-collapse-alignments` is set.
            [default %default]",
            metavar="int"
        ),
        make_option(
            "--annotation-name-field",
            action="store",
            type="character",
            default="Name",
            help="Annotation field used to populate the `name` column in the output.
            [default \"%default\"]",
            metavar="str"
        ),
        make_option(
            "--padding-character",
            action="store",
            default=".",
            help="Character used for padding alignments. [default \"%default\"]",
            metavar="char"
        ),
        make_option(
            "--indel-character",
            action="store",
            default="-",
            help="Character to denote insertions and deletions in alignments. [default \"%default\"]",
            metavar="char"
        ),
        make_option(
            c("-h", "--help"),
            action="store_true",
            default=FALSE,
            help="Show this information and die."
        ),
        make_option(
            c("-v", "--verbose"),
            action="store_true",
            default=FALSE,
            help="Print log messages to STDOUT."
        )
)

## Parse options
opt_parser <- OptionParser(
    usage=paste(script, "[--help] [--verbose] [OPTIONS] BED BAM [BAM2 ...]\n"),
    option_list=option_list,
    add_help_option=FALSE,
    description=msg,
    epilogue=notes
)
cli <- parse_args(opt_parser, positional_arguments=c(2, Inf))

# Re-assign CLI arguments
fl.query <- cli$args[[1]]
fl.bam <- cli$args[2:length(cli$args)]
fl.ref <- cli$options[["reference"]]
fl.anno <- cli$options[["annotations"]]
dir.out <- cli$options[["output-directory"]]
width.max <- cli$options[["maximum-region-width"]]
collapse <- ! cli$options[["do-not-collapse-alignments"]]
count.min <- cli$options[["minimum-count"]]
char.pad <- cli$options[["padding-character"]]
char.indel <- cli$options[["indel-character"]]
field.name.anno <- cli$options[["annotation-name-field"]]
verb <- cli$options[["verbose"]]
#==========================#
#    PRE-REQUISITES END    #
#==========================#


#================#
#   MAIN START   #
#================#
#---> START MESSAGE <---#
if (verb) cat("Starting '", script, "'...\n", sep="")

#---> LOAD REQUIRED LIBRARIES <---#
if (verb) cat("Loading libraries...\n", sep="")
for (req in requirements) {
    if ( suppressWarnings(suppressPackageStartupMessages(require(req, character.only=TRUE))) == FALSE ) { stop("Package '", req, "' required!") }
}

#---> IMPORT FILES <---#
# Print status message
if (verb) cat("Importing input files...\n", sep="")
# Load files
bed <- import(con=fl.query)
if (! is.null(fl.ref)) {ref <- FaFile(fl.ref)}
if (! is.null(fl.anno)) {anno <- import(con=fl.anno)}

# Get file prefix from BAM files
fl.prefix <- paste(basename(file_path_sans_ext(fl.bam)), collapse=".")

#--->   <---#
# Print status message
if (verb) cat("Iterating over regions in BED file...\n")
# Iterate over input regions
for(index in seq_along(bed)) {

    #---> PREPARE STACKING OF ALIGNMENTS  <---#
    # Assign current region
    region <- bed[index]
    # Print status message
    if (verb) cat("Processing region '", mcols(region)[["name"]], "'...\n", sep="")
    # Exit with error if region is too wide
    if (width(region) > width.max) {stop("Supplied region too large. Consider increasing the `width.max` parameter, but note that for very large regions, the memory footprint may be excessive.")}
    # Initialize DNAStringSet container object
    seq.out <- DNAStringSet()

    #---> ADD READ ALIGNMENTS  <---#
    # Print status message
    if (verb) cat("Iterating over BAM files...\n")
    # Iterate over BAM files
    for (bam in fl.bam) {
        # Print status message
        if (verb) cat("Adding read alignments for file '", bam,"'...\n", sep="")
        # Get alignments for current BAM file
        seq.stack <- stackStringsFromBam(
            bam,
            param=region,
            D.letter=char.indel,
            N.letter=char.indel,
            Lpadding.letter=char.pad,
            Rpadding.letter=char.pad
        )
        # Add to container
        seq.out <- append(seq.out, seq.stack)
    }
    # Get complement if on minus strand
    if (as.character(strand(region))[[1]] == "-") {
        seq.out <- complement(seq.out)
    }
    # Convert to character
    seq.out <- as.character(seq.out)
    # Convert to dataframe for writing output
    df <- data.frame(seq=seq.out, count=rep(NA, length(seq.out)), stringsAsFactors=FALSE)

    #---> COLLAPSE READ ALIGNMENTS <---#
    if (collapse) {
        # Print status message
        if (verb) cat("Collapsing identical reads/alignments...\n")
        # Get unique alignments
        df <- data.frame(table(df[["seq"]]), stringsAsFactors=FALSE)
        if (! ncol(df) == 2) df <- data.frame(matrix(ncol = 2, nrow = 0))
        colnames(df) <- c("seq", "count")
        df[["seq"]] <- as.character(df[["seq"]])
        # Filter out any alignments that do not make the specified minimum count cutoff
        df <- df[df[["count"]] >= count.min, ]
    }

    #---> SORTING ALIGNMENTS  <---#
    # Print status message
    if (verb) cat("Sorting alignments...\n")
    # Reverse sequence if on minus strand
    if (as.character(strand(region))[[1]] == "-") {
        df[["seq"]] <- reverse(df[["seq"]])
    }
    # Sort by position of first nucleotide, count and position of last nucleotide
    if (nrow(df)) {
        last_char <- nchar(df[["seq"]][[1]])
        pos.nuc.first <- regexpr(paste0("[^", char.pad, "\\.]"), df[["seq"]])
        pos.nuc.last <- last_char - regexpr(paste0("[^", char.pad, "\\.]"), unlist(lapply(df[["seq"]], reverse))) + 1
        df <- df[order(
            pos.nuc.first, df[["count"]], pos.nuc.last,
            decreasing=c(FALSE, TRUE, FALSE)
        ), ]
    }
    # Reverse sequence again if on minus strand
    if (as.character(strand(region))[[1]] == "-") {
        df[["seq"]] <- reverse(df[["seq"]])
    }

    #---> ADD REFERENCE SEQUENCE  <---#
    if (! is.null(fl.ref)) {
        # Print status message
        if (verb) cat("Adding reference sequence...\n")
        # Generate name for reference sequence
        name.ref <- paste(
            seqnames(region),
            paste(start(region), end(region), sep="-"),
            strand(region), sep=":"
        )
        # Get sequence
        row.ref <- getSeq(ref, region)  # will create .fai index if not present
        # Get reverse if on minus strand
        if (as.character(strand(region))[[1]] == "-") {
            row.ref <- reverse(row.ref)
        }
        # Compile row and add to dataframe
        df.ref <- data.frame(seq=as.character(row.ref), count=name.ref, stringsAsFactors=FALSE)
        df <- rbind(df.ref, df)
    }

    #---> ADD ANNOTATIONS FOR REGION <---#
    # Add annotations
    if (! is.null(fl.anno)) {
        # Print status message
        if (verb) cat("Adding overlapping features...\n")
        # Find overlapping features
        features <- anno[to(findOverlaps(region, anno))]
        # Set order of addition from most upstream to most downstream
        if (as.character(strand(region)) == "+") {
            order.features <- order(start(features))
        } else {
            order.features <- rev(order(end(features)))
        }
        # Order features
        features <- features[order.features]
        # Iterate over features
        seqs <- sapply(seq_along(features), function(index) {
            feat <- features[index]
            # Set markup character depending on strand
            char.anno <- ifelse(strand(region) == "+", ">", "<")
            # Calculate lengths of feature (in region) & left/right padding
            diff.start <- start(feat) - start(region)
            n.padl <- max(0, diff.start)
            n.anno <- min(min(0, diff.start) + width(feat), width(region) - n.padl)
            n.padr <- width(region) - n.padl - n.anno
            # Generate string encompassing feature
            str.padl <- paste(rep(char.pad, n.padl), collapse="")
            str.anno <- paste(rep(char.anno, n.anno), collapse="")
            str.padr <- paste(rep(char.pad, n.padr), collapse="")
            str.final <- paste0(str.padl, str.anno, str.padr)
        })
        df.anno <- data.frame(seq=seqs, count=mcols(features)[[field.name.anno]], stringsAsFactors=FALSE)
        df <- rbind(df.anno, df)
    }

    #---> WRITE OUTPUT  <---#
    if (collapse) {
        name.file <- paste(fl.prefix, mcols(region)[["name"]], "min", as.character(count.min), "pileup", "tab", sep=".")
    } else {
        name.file <- paste(fl.prefix, mcols(region)[["name"]], "uncollapsed", "pileup", "tab", sep=".")
    }
    fl.out <- file.path(dir.out, name.file)
    # Print status message
    if (verb) cat("Writing output to file '", fl.out, "'...\n", sep="")
    # Write tab-separated output
    write.table(df, fl.out, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE)
}

#---> END MESSAGE <---#
if (verb) cat("Done.\n\nSession info:\n")
if (verb) print(sessionInfo())
#================#
#    MAIN END    #
#================#