R/plotPredictive.R

#' Plot samples drawn from the posterior predictive density.
#'
#' When the observed random variable is multivariate, one individual plot per dimension will be drawn (i.e. the analysis will be carried out marginally). As with all visualizations from this package, colors are fully customizable via a \code{\link{theme}}. Note that this method is only available for full Bayesian estimation run via Markov-chain Monte Carlo.
#'
#' @param stanfit An object returned by either \code{\link{fit}} or \code{\link{draw_samples}}.
#' @param type An optional vector of character strings with the names of the analysis to include in the plot. These may be \emph{density} (kernel density curves of the observation variable), \emph{cumulative} (kernel cumulative density curves of the observation variable), \emph{boxplot} (boxplots of the observation variable), \emph{histogram} (histogram of the values returned by \emph{fun}), \emph{scatterplot} (scatterplot of the values returned by \emph{fun1} and \emph{fun2}), \emph{kl} (histogram of the empirical Kullback-Leibler divergence), \emph{ks} (histogram of the Kolmogorov-Smirnov statistic). All plots show the actual sample and many draws from the posterior predictive density. Note that more than one type may be plotted at the same time. It allows for partial matching. It defaults to none.
#' @param r An optional numeric vector with the numbers of the dimensions of the observation vector to be plotted. It defaults to plotting all variables.
#' @param subset An optional boolean vector indicating with TRUE which elements of the observation vector should be included in the analysis (some plots may become slow with large datasets). It defaults to NULL, meaning all the sample units.
#' @param chain An optional integer number between 1 and the number of chains M. Only the samples generated by the selected chain are considered. It defaults to the first chain.
#' @param fun An optional function summarizing the whole sample into a single value (e.g. \code{\link{median}}). This function is applied ot each sample (actual and generated) and the values returned are plotted in the histogram.
#' @param fun1 An optional function summarizing the whole sample into a single value (e.g. \code{\link{mean}}). This function is applied ot each sample (actual and generated) and the values returned are plotted in the horizontal axis of the scatterplot.
#' @param fun2 An optional function summarizing the whole sample into a single value (e.g. \code{\link{sd}}). This function is applied ot each sample (actual and generated) and the values returned are plotted in the vertical axis of the scatterplot.
#' @param boxplotControl Arguments to be passed to \code{\link{boxplot}} if the argument \emph{type} includes "boxplot".
#' @param cumulativeControl Arguments to be passed to \code{\link{quantile}} if the argument \emph{type} includes "cumulative" (e.g. use `probs` to set a range if the default is innapropiate).
#' @param densityControl Arguments to be passed to \code{\link{density}} if the argument \emph{type} includes "density".
#' @param funControl Arguments to be passed to `fun` if the argument \emph{type} includes "histogram".
#' @param fun1Control Arguments to be passed to `fun1` if the argument \emph{type} includes "scatterplot".
#' @param fun2Control Arguments to be passed to `fun2` if the argument \emph{type} includes "scatterplot".
#' @param ksControl Arguments to be passed to \code{\link{ks.test}} if the argument \emph{type} includes "ks".
#' @param klControl Arguments to compute the empirical Kullback-Leibler divergence if the argument \emph{type} includes "kl". It may include two-sized vectors with the lower and upper bounds of the discretization range (`yRange` and `yPredRange`) and the number of bins (`yNBins` and `yPredNBins`).
#' @param main An optional character string with the overall title for the plot.
#' @note If the user was surprised to find the Kolmogorov-Smirnov statistic in a Bayesian software, simply note that we use it as a measure of similarity between two samples from continuous distributions. Data analysis and modeling decissions do not rely on hypothesis testing.
#' @family visualization functions
#' @examples
#' \dontrun{
#' plot_ppredictive(
#'   myFit,
#'   type = c("density", "cumulative", "boxplot",
#'            "histogram", "scatterplot", "ks"),
#'   fun = mean, fun1 = mean, fun2 = sd
#' )
#'
#' plot_ppredictive(
#'   myFit,
#'   type = c("density", "cumulative", "ks", "kl"),
#'   klControl = list(yNBins = 20, yPredNBins = 20)
#' )
#' }
plot_ppredictive <- function(stanfit, type = "", r = NULL, subset = NULL, chain = 1,
           fun = NULL, fun1 = NULL, fun2 = NULL,
           boxplotControl = NULL, cumulativeControl = NULL, densityControl = NULL,
           funControl = NULL, fun1Control = NULL, fun2Control = NULL, ksControl = NULL,
           klControl = NULL, main = NULL) {
  UseMethod("plot_ppredictive", stanfit)
}

#' @keywords internal
#' @inherit plot_ppredictive
plot_ppredictive.stanfit <- function(stanfit, type = "", r = NULL, subset = NULL, chain = 1,
                             fun = NULL, fun1 = NULL, fun2 = NULL,
                             boxplotControl = NULL, cumulativeControl = NULL, densityControl = NULL,
                             funControl = NULL, fun1Control = NULL, fun2Control = NULL,
                             ksControl = NULL, klControl = NULL, main = NULL) {
  theme <- getOption("BayesHMM.theme")

  R     <- extract_R(stanfit)
  y     <- extract_y(stanfit)
  yPred <- extract_ypred(stanfit, chain = chain)
  if (R == 1) { dim(yPred) <- c(dim(yPred), 1) }
  # ^ extract_ypred drops index for [iteration, chain, r] when r = 1
  if (!is.null(subset)) { yPred <- yPred[subset, , , drop = FALSE] }

  rSeq  <- if (is.null(r)) { seq_len(R) } else { r }

  funList <- list(
    density     = function(y, yPred) {
      plot_ppredictive_density(
        y, yPred, densityControl,
        NULL, NULL, NULL, FALSE
      )
    },
    cumulative  = function(y, yPred) {
      plot_ppredictive_cumulative(
        y, yPred, cumulativeControl,
        NULL, NULL, NULL, FALSE
      )
    },
    histogram   = function(y, yPred) {
      plot_ppredictive_hist(
        y, yPred, fun, funControl,
        NULL, NULL, NULL, FALSE
      )
    },
    boxplot     = function(y, yPred) {
      plot_ppredictive_boxplot(
        y, yPred, boxplotControl,
        NULL, NULL, NULL, FALSE
      )
    },
    scatterplot = function(y, yPred) {
      plot_ppredictive_scatter(
        y, yPred, fun1, fun2, fun1Control, fun2Control,
        NULL, NULL, NULL, FALSE
      )
    },
    kl          = function(y, yPred) {
      plot_ppredictive_kl(
        y, yPred, klControl,
        NULL, NULL, NULL, FALSE
      )
    },
    ks          = function(y, yPred) {
      plot_ppredictive_ks(
        y, yPred, ksControl,
        NULL, NULL, NULL, FALSE
      )
    }
  )

  funSeq <- funList[which(names(funList) %in% type)]

  opar   <- par(
    oma   = c(2, 0, 0, 0),
    mfrow = rev(n2mfrow(length(rSeq) * length(funSeq)))
  )
  on.exit(par(opar))

  for (r in rSeq) {
    for (f in funSeq) {
      f(y[, r], yPred[, , r])
      if (R != 1) {
        title(colnames(y)[r], adj = 1, line = 0.5, cex.main = 1)
        # title(sprintf("Variable %d", r), adj = 1, line = 0.5, cex.main = 1)
      }
    }
  }

  add_title_overlay(
    main = if (is.null(main)) { "Posterior Predictive Checks" } else { main },
    line = -1.5
  )

  add_legend_overlay(
    x = "bottom",
    legend = c("Observed sample", "Posterior predictive samples"),
    border = c(theme$densityY, theme$densityYPred),
    fill   = c(theme$densityY, theme$densityYPred),
    bty    = "n",
    horiz  = TRUE
  )
}

# Internal undocumented methods -------------------------------------------

plot_ppredictive_scatter <- function(y, yPred, fun1 = NULL, fun2 = NULL,
                                     control1 = NULL, control2 = NULL,
                                     main = NULL, xlab = NULL, ylab = NULL,
                                     addLegend = TRUE) {

  theme <- getOption("BayesHMM.theme")

  myFun <- function(x) {
    if (is.null(control1)) { control1 <- list() }
    if (is.null(control2)) { control2 <- list() }
    cbind(
      do.call(fun1, c(list(x = x), control1)),
      do.call(fun2, c(list(x = x), control2))
    )
  }

  yFun     <- myFun(y)
  yPredFun <- t(apply(yPred, 1, myFun))
  yAll     <- rbind(yFun, yPredFun)

  plot(
    NULL,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { funinvarName(fun1) } else { xlab },
    ylab = if (is.null(ylab)) { funinvarName(fun2) } else { ylab },
    xlim = quantile(yAll[, 1], probs = c(0, 1)),
    ylim = quantile(yAll[, 2], probs = c(0, 1))
  )

  points(
    yPredFun,
    pch = 21,
    col = theme$scatterYPred,
    bg  = theme$scatterYPred
  )

  points(
    yFun,
    pch = 21,
    bg  = theme$scatterY,
    col = theme$scatterY
  )

  if (addLegend) {
    legend(
      x = "topright",
      legend = c("Observed sample", "Posterior predictive samples"),
      pch = 21,
      bg  = c(theme$scatterY, theme$scatterYPred),
      col = c(theme$scatterY, theme$scatterYPred),
      bty = "n"
    )
  }
}

plot_ppredictive_hist <- function(y, yPred, fun, control = NULL, main = NULL,
                                  xlab = NULL, ylab = NULL,
                                  addLegend = TRUE) {

  theme <- getOption("BayesHMM.theme")

  myFun <- function(x) {
    if (is.null(control)) { control <- list() }
    do.call(fun, c(list(x = x), control))
  }

  yFun     <- myFun(y)
  yPredFun <- apply(yPred, 1, myFun)

  hist(
    yPredFun,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { toproper(funinvarName(fun)) } else { xlab },
    ylab = if (is.null(ylab)) { "Frequency" } else { ylab },
    breaks = "FD",
    col    = theme$histCol,
    border = theme$histBorder
  )

  abline(v = yFun, col = theme$histLine)

  if (addLegend) {
    legend(
      x = "topright",
      legend = c("Observed sample", "Posterior predictive samples"),
      lwd = 2,
      col = c(theme$histLine, theme$histCol),
      bty = "n"
    )
  }
}

plot_ppredictive_ks <- function(y, yPred, control = NULL, main = NULL,
                                xlab = NULL, ylab = NULL, addLegend = TRUE) {

  theme <- getOption("BayesHMM.theme")
  if (is.null(control)) { control <- list() }

  yPredFun <- suppressWarnings(
    apply(yPred, 1, function(x) {
      do.call(ks.test, c(list(x = x, y = y), control))$statistic
    })
  )

  hist(
    yPredFun,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { "Kolmogorov-Smirnov Statistic" } else { xlab },
    ylab = if (is.null(ylab)) { "Frequency" } else { ylab },
    breaks = "FD",
    col    = theme$histCol,
    border = theme$histBorder
  )
}

plot_ppredictive_kl <- function(y, yPred, control = NULL, main = NULL,
                                xlab = NULL, ylab = NULL, addLegend = TRUE) {

  KLDivergence <- function(y, yPred,
                           yRange     = range(y),     yNBins = 10,
                           yPredRange = range(yPred), yPredNBins = 10) {
    yBins     <- seq(from = yRange[1],     to = yRange[2],     length.out = yNBins + 1)
    yPredBins <- seq(from = yPredRange[1], to = yPredRange[2], length.out = yPredNBins + 1)
    freqs  <- prop.table(
      table(
        cut(y,     breaks = yBins    , include.lowest = TRUE),
        cut(yPred, breaks = yPredBins, include.lowest = TRUE)
      )
    )

    # Look at all these nifty hacks, gah
    P <- rowSums(freqs) # discretized marginal distribution of x
    P <- P + ifelse(P == 0, 1E-30, 0)
    P <- P / sum(P)

    Q <- colSums(freqs) # discretized marginal distribution of y
    Q <- Q + ifelse(Q == 0, 1E-30, 0)
    Q <- Q / sum(Q)

    sum(P * log(P/Q))
  }

  theme <- getOption("BayesHMM.theme")
  yPredFun <- suppressWarnings(
    apply(yPred, 1, function(x) {
      do.call(KLDivergence, c(list(y = y, yPred = x), control))
    })
  )

  hist(
    yPredFun,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { "Kullback-Leibler divergence" } else { xlab },
    ylab = if (is.null(ylab)) { "Frequency" } else { ylab },
    breaks = "FD",
    col    = theme$histCol,
    border = theme$histBorder
  )
}

plot_ppredictive_boxplot <- function(y, yPred, control = NULL, main = NULL,
                                     xlab = NULL, ylab = NULL,
                                     addLegend = TRUE) {

  theme <- getOption("BayesHMM.theme")

  argList <- list(
    x    = cbind(y, t(yPred)),
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { "Sample" } else { xlab },
    ylab = if (is.null(ylab)) { "Observation" } else { ylab },
    border = c(theme$boxY, rep(theme$boxYPred, nrow(yPred))),
    names  = c("y", seq_len(nrow(yPred)))
  )

  if (is.null(control)) { control <- list() }

  do.call(boxplot, c(argList, control))
}

plot_ppredictive_cumulative <- function(y, yPred, control = NULL, main = NULL,
                                        xlab = NULL, ylab = NULL,
                                        addLegend = TRUE) {
  theme <- getOption("BayesHMM.theme")

  if (is.null(control)) { control <- list() }
  if (!("probs" %in% names(control))) {
    control$probs <- seq(from = 0, to = 1, by = 0.01)
  }

  myCumulative    <- function(x) {
    do.call(quantile, c(list(x = x), control))
  }

  yCumulative     <- myCumulative(y)
  yPredCumulative <- t(apply(yPred, 1, myCumulative))
  xlim <- quantile(c(as.numeric(y), as.numeric(yPred)), probs = c(0, 1))

  plot(
    NULL,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { "Cumulative Density" } else { xlab },
    ylab = if (is.null(ylab)) { "Observation" } else { ylab },
    xlim = xlim,
    ylim = c(0, 1)
  )

  for (yrow in seq_len(nrow(yPredCumulative))) {
    lines(x = yPredCumulative[yrow, ], y = control$probs, col = theme$cumulativeYPred)
  }

  lines(x = yCumulative, y = control$probs, col = theme$cumulativeY)

  if (addLegend) {
    legend(
      x = "bottomright",
      legend = c("Observed sample", "Posterior predictive samples"),
      lwd = 2,
      col = c(theme$cumulativeY, theme$cumulativeYPred),
      bty = "n"
    )
  }
}

plot_ppredictive_density <- function(y, yPred, control = NULL, main = NULL,
                                     xlab = NULL, ylab = NULL,
                                     addLegend = TRUE) {
  theme <- getOption("BayesHMM.theme")

  myDensity <- function(x) {
    if (is.null(control)) { control <- list(bw = "SJ") }
    do.call(density, c(list(x = x), control))
  }

  yDensity     <- myDensity(y)
  yPredDensity <- apply(yPred, 2, myDensity)
  yExtremes <- sapply(
    c(list(yDensity), yPredDensity), function(l) {
      c(
        quantile(l$x, probs = c(0, 1)),
        quantile(l$y, probs = c(0, 1))
      )
    }
  )

  plot(
    NULL,
    main = if (is.null(main)) { "" } else { main },
    xlab = if (is.null(xlab)) { "Observation" } else { xlab },
    ylab = if (is.null(ylab)) { "Density" } else { ylab },
    xlim = c(min(yExtremes[1, ]), max(yExtremes[2, ])),
    ylim = c(min(yExtremes[3, ]), max(yExtremes[4, ]))
  )

  for (yp in yPredDensity) {
    lines(yp, col = theme$densityYPred)
  }

  lines(yDensity, col = theme$densityY)

  if (addLegend) {
    legend(
      x = "topright",
      legend = c("Observed sample", "Posterior predictive samples"),
      lwd = 2,
      col = c(theme$densityY, theme$densityYPred),
      bty = "n"
    )
  }
}
luisdamiano/BayesHMM documentation built on May 20, 2019, 2:59 p.m.