universalmotif: Import, Modify, and Export Motifs with R

Documented in motif_peaks

#' Look for overrepresented motif position peaks in a set of sequences.
#'
#' Using the motif position data from [scan_sequences()] (or elsewhere),
#' test whether certain positions in the sequences have significantly higher
#' motif density.
#'
#' @param hits `numeric` A vector of sequence positions indicating motif sites.
#' @param seq.length `numeric(1)` Length of sequences. Only one number is
#'    allowed, as all sequences must be of identical length. If missing,
#'    then the largest number from `hits` is used.
#' @param seq.count `numeric(1)` Number of sequences with motif sites. If
#'    missing, then the number of unique values in `hits` is used.
#' @param bandwidth `numeric(1)` Peak smoothing parameter. Smaller numbers
#'    will result in skinnier peaks, larger numbers will result in wider
#'    peaks. Leaving this empty will cause [motif_peaks()] to generate one
#'    by itself (see 'details').
#' @param max.p `numeric(1)` Maximum P-value allowed for finding significant
#'    motif site peaks.
#' @param peak.width `numeric(1)` Minimum peak width. A peak is defined as
#'    as the highest point within the value set by `peak.width`.
#' @param nrand `numeric(1)` Number of random permutations for generating a
#'    null distribution. In order to calculate P-values, a set of random
#'    motif site positions are generated `nrand` times.
#' @param plot `logical(1)` Will create a `ggplot2` object displaying motif
#'    peaks.
#' @param BP `logical(1)` Allows for the use of \pkg{BiocParallel} within
#'    [motif_peaks()]. See [BiocParallel::register()] to change the
#'    default backend. Setting `BP = TRUE` is only recommended for
#'    exceptionally large jobs. Keep in mind that this function will not
#'    attempt to limit its memory usage.
#'
#' @details
#'    Kernel smoothing is used to calculate motif position density. The
#'    implementation for this process is based on code from the
#'    \pkg{KernSmooth} R package (Wand 2015). These
#'    density estimates are used to
#'    determine peak locations and heights. To calculate the P-values of
#'    these peaks, a null distribution is calculated from peak heights of
#'    randomly generated motif positions.
#'
#'    If the `bandwidth` option is not supplied, then the following code is used
#'    (from \pkg{KernSmooth}):
#'
#'    `del0 <- (1 / (4 * pi))^(1 / 10)`
#'
#'    `bandwidth <- del0 * (243 / (35 * length(hits)))^(1 / 5) * sqrt(var(hits))`
#'
#' @return A `DataFrame` with peak positions and P-values. If `plot = TRUE`,
#'    then a list is returned with the `DataFrame` as the first item and
#'    the `ggplot2` object as the second item.
#'
#' @references
#'
#' Wand M (2015). *KernSmooth: Functions for Kernel Smoothing
#' Supporting Wand and Jones (1995)*. R package version 2.23-15,
#' <URL: https://CRAN.R-project.org/package=KernSmooth>.
#'
#' @examples
#' data(ArabidopsisMotif)
#' data(ArabidopsisPromoters)
#' if (R.Version()$arch != "i386") {
#' hits <- scan_sequences(ArabidopsisMotif, ArabidopsisPromoters, RC = FALSE)
#' res <- motif_peaks(as.vector(hits$start), 1000, 50)
#' # View plot:
#' res$Plot
#'
#' # The raw plot data can be found in:
#' res$Plot$data
#' }
#'
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @seealso [scan_sequences()]
#' @export
motif_peaks <- function(hits, seq.length, seq.count, bandwidth, max.p = 1e-6,
                        peak.width = 3, nrand = 100, plot = TRUE, BP = FALSE) {

# TODO: vignette section + stop peaks from showing up in flat sections

# Plans:
#  - Compare peak locations to a set of bkg peaks
#  - Look for peak co-occurence between multiple motifs

  # param check --------------------------------------------
  args <- as.list(environment())
  num_check <- check_fun_params(list(hits = args$hits,
                                     seq.length = args$seq.length,
                                     seq.count = args$seq.count,
                                     bandwidth = args$bandwidth,
                                     max.p = args$max.p,
                                     peak.width = args$peak.width,
                                     nrand = args$nrand),
                                c(0, rep(1, 6)),
                                c(FALSE, TRUE, TRUE, TRUE,
                                  FALSE, FALSE, FALSE), TYPE_NUM)
  logi_check <- check_fun_params(list(plot = args$plot, BP = args$BP),
                                 numeric(), logical(), TYPE_LOGI)
  all_checks <- c(num_check, logi_check)
  if (length(all_checks) > 0) stop(all_checks_collapse(all_checks))
  #---------------------------------------------------------

  if (missing(seq.length)) seq.length <- max(hits)
  if (missing(seq.count)) seq.count <- length(unique(hits))
  if (missing(bandwidth)) {
    del0 <- (1 / (4 * pi))^(1 / 10)
    bandwidth <- del0 * (243 / (35 * length(hits)))^(1 / 5) * sqrt(var(hits))
  }

  hit.count.perseq <- length(hits) / seq.count

  rand.hits <- lapply(seq_len(nrand),
                      function(x) sample.int(seq.length, length(hits),
                                             replace = TRUE))
  # this is the slowest step
  rand.kern <- lapply_(rand.hits, function(x) kern_fun(x, bandwidth, seq.length,
                                                       seq.length),
                       BP = BP, PB = FALSE)
  # second slowest
  rand.peaks <- lapply_(rand.kern, function(x) x$y[peakfinder_cpp(x$y, peak.width)],
                        BP = BP, PB = FALSE)
  rand.peaks <- do.call(c, rand.peaks)

  data.kern <- kern_fun(hits, bandwidth, seq.length, seq.length)
  data.loc <- peakfinder_cpp(data.kern$y, peak.width)
  data.peaks <- data.kern$y[data.loc]

  pv <- fitdistr(rand.peaks, "normal")
  peak.pvals <- pnorm(data.peaks, pv$estimate["mean"], pv$estimate["sd"],
                      lower.tail = FALSE)

  if (plot) {
    pval.lim <- qnorm(max.p, pv$estimate["mean"], pv$estimate["sd"],
                      lower.tail = FALSE)
    kern.df <- data.frame(x = data.kern$x, y = data.kern$y)
    p <- ggplot(kern.df, aes(.data$x, .data$y)) +
           geom_line() +
           geom_point(data = data.frame(x = data.kern$x[data.loc],
                                        y = data.kern$y[data.loc]),
                      colour = "red") +
           xlab("Sequence location") +
           ylab("Kernel density") +
           geom_hline(yintercept = pval.lim, colour = "blue") +
           geom_text(data = data.frame(x = 0, y = pval.lim), aes(.data$x, .data$y),
                     hjust = 0, vjust = -1,
                     label = paste("Cutoff for P-value =<", max.p)) +
           theme_bw()
  }

  out <- DataFrame(Peak = data.kern$x[data.loc], Pval = peak.pvals)
  out <- out[order(out$Pval), ]
  out <- out[out$Pval <= max.p, ]
  if (nrow(out) == 0) {
    message("No significant peaks found.")
    if (plot) return(p) else return(invisible(NULL))
  }
  rownames(out) <- NULL
  if (plot) return(list(Peaks = out, Plot = p)) else return(out)

}

kern_fun <- function(x, bandwidth, gridsize, range.x) {

  # modified from KernSmooth::bkde

  n <- length(x)
  M <- gridsize
  tau <- 4
  h <- bandwidth
  a <- 1
  b <- range.x

  gpoints <- seq(a, b, length = M)
  gcounts <- linbin_cpp(x, gpoints)

  delta <- (b - a) / (h * (M - 1L))
  L <- min(floor(tau / delta), M)

  lvec <- seq(0L, L, 1)
  kappa <- dnorm(lvec * delta) / (n * h)

  P <- 2^(ceiling(log(M + L + 1L) / log(2)))
  kappa <- c(kappa, rep(0, P - 2L * L - 1L), rev(kappa[-1L]))
  tot <- sum(kappa) * (b - a) / (M - 1L) * n
  gcounts <- c(gcounts, rep(0, P - M))
  kappa <- fft(kappa / tot)
  gcounts <- fft(gcounts)

  list(x = gpoints, y = (Re(fft(kappa * gcounts, TRUE)) / P)[seq_len(M)])

}