tidymv: Tidy Model Visualisation for Generalised Additive Models

Documented in find_difference get_difference summary_data

# Functions imported from itsadug. The copyright holders are Jacolien van Rij
# and Martijn Wieling.

#' Get model predictions for differences between conditions.
#'
#' @import mgcv
#' @import stats
#' @param model A gam object, produced by [mgcv::gam()] or
#' [mgcv::bam()].
#' @param comp A named list with the two levels to compare.
#' @param cond A named list of the values to use for the other predictor
#' terms. Variables omitted from this list will have the closest observed
#' value to the median for continuous variables, or the reference level for
#' factors.
#' @param rm.ranef Logical: whether or not to remove random effects.
#' Default is TRUE. Alternatively a vector of numbers with the
#' model term number of the random effect(s) to remove.
#' (See notes.)
#' @param se Logical: whether or not to return the confidence interval or
#' standard error around the estimates.
#' @param sim.ci Logical: Using simultaneous confidence intervals or not
#' (default set to FALSE). The implementation of simultaneous CIs follows
#' Gavin Simpson's blog of December 15, 2016:
#' <https://fromthebottomoftheheap.net/2016/12/15/simultaneous-interval-revisited/>.
#' This interval is calculated from simulations based.
#' Please specify a seed (e.g., `set.seed(123)`) for reproducible results.
#' In addition, make sure to specify at least 200 points for each smooth
#' for the simulations when using simultaneous CI.
#' Note: in contrast with Gavin Simpson's code, here the Bayesian posterior
#' covariance matrix of the parameters is uncertainty corrected
#' (`unconditional=TRUE`) to reflect the uncertainty on the estimation of
#' smoothness parameters.
#' @param f A number to scale the standard error. Defaults to 1.96, resulting
#' in 95\% confidence intervals. For 99\% confidence intervals use a value of
#' 2.58.
#' @param return.n.posterior Numeric: N samples from
#' the posterior distribution of the fitted model are returned.
#' Default value is 0 (no samples returned).
#' Only works when `sim.ci=TRUE`.
#' @param print.summary Logical: whether or not to print a summary of the
#' values selected for each predictor.
#' Default set to the print info messages option
#' (see `infoMessages`).
#' @return Returns a data frame with the estimates of the difference and
#' optionally the confidence intervals around that estimate.
#' @section Notes:
#' Other, not specified effects and random effects are generally cancelled
#' out, when calculating the difference. When the predictors that
#' specify the conditions to compare are involved in other interactions
#' or included as random slopes, it may be useful to specify the values
#' of other predictors with `cond` or remove the random effects with
#' `rm.ranef`.
#' @author Jacolien van Rij, Martijn Wieling
#' @export
#' @keywords internal
#' @importFrom magrittr "%>%"
get_difference <- function(model, comp, cond = NULL, rm.ranef = TRUE, se = TRUE, sim.ci = FALSE, f = 1.96,
                           return.n.posterior = 0, print.summary = FALSE) {
  if (!"lm" %in% class(model)) {
    stop("This function does not work for class %s models.", class(model)[1])
  } else {
    newd <- NULL
    su <- model$var.summary
    dat <- model$model
    # check comp
    if (is.null(names(comp))) {
      stop("Predictor specified in 'comp' unknown. Please provide a named list for 'comp', in the form of 'comp=list(Predictor=c('level1', 'level2'))'.")
    }
    if (all(names(comp) %in% colnames(dat))) {
      for (i in 1:length(comp)) {
        if (length(comp[[i]]) < 2) {
          stop(sprintf("Provide two levels for %s to calculate difference.", names(comp)[i]))
        } else if (length(comp[[i]]) > 2) {
          warning(sprintf("More than two levels provided for predictor %s. Only first two levels are being used.",
                          names(comp)[i]))
        }
      }
    } else {
      errname <- paste(which(!names(comp) %in% colnames(dat)), collapse = ", ")
      stop(sprintf("Grouping predictor(s) not found in model: %s.", errname))
    }
    if (any(names(cond) %in% names(comp))) {
      for (i in names(cond)[names(cond) %in% names(comp)]) {
        cond[[i]] <- NULL
        warning(sprintf("Predictor %s specified in comp and cond. (The value in cond will be ignored.)",
                        i))
      }
    }
    new.cond1 <- list()
    new.cond2 <- list()
    for (i in names(su)) {
      if (i %in% names(comp)) {
        new.cond1[[i]] <- comp[[i]][1]
        new.cond2[[i]] <- comp[[i]][2]
      } else if (i %in% names(cond)) {
        new.cond1[[i]] <- new.cond2[[i]] <- cond[[i]]
      } else {
        if (inherits(su[[i]], "factor")) {
          new.cond1[[i]] <- as.character(su[[i]][1])
          new.cond2[[i]] <- as.character(su[[i]][1])
        } else if (inherits(su[[i]], "numeric")) {
          new.cond1[[i]] <- su[[i]][2]
          new.cond2[[i]] <- su[[i]][2]
        }
      }
    }
    newd1 <- expand.grid(new.cond1)
    newd2 <- expand.grid(new.cond2)
    p1 <- mgcv::predict.gam(model, newd1, type = "lpmatrix")
    p2 <- mgcv::predict.gam(model, newd2, type = "lpmatrix")
    newd <- as.data.frame(newd1, stringsAsFactors = TRUE)
    newd.names <- colnames(newd)
    for (nn in newd.names) {
      if (nn %in% names(comp)) {
        newd[, nn] <- NULL
      }

    }
    mysummary <- summary_data(newd, print = FALSE)
    # Check for random effects:
    if (inherits(rm.ranef, "logical")) {
      if (rm.ranef[1] == FALSE) {
        rm.ranef <- NULL
      }
    }
    if (!is.null(rm.ranef)) {
      # get random effects columns:
      smoothlabels.table <- as.data.frame(do.call("rbind", lapply(model$smooth, function(x) {
        data.frame(Label = x[["label"]], Dim = x[["null.space.dim"]], Class = attr(x, "class")[1],
                   stringsAsFactors = FALSE)
      })), stringsAsFactors = FALSE)
      # smoothlabels <- as.vector( smoothlabels.table[smoothlabels.table$Class %in%
      # c('random.effect','fs.interaction'), 'Label'] )
      smoothlabels <- as.vector(smoothlabels.table[smoothlabels.table$Dim == 0, "Label"])
      if (inherits(rm.ranef, "logical")) {
        if (rm.ranef[1] == TRUE) {
          rm.ranef <- smoothlabels
        } else {
          rm.ranef <- ""
        }
      } else if (inherits(rm.ranef, c("numeric", "integer"))) {
        smoothlabels.table <- smoothlabels.table[rm.ranef, ]
        smoothlabels <- as.vector(smoothlabels.table[smoothlabels.table$Class %in% c("random.effect",
                                                                                     "fs.interaction"), "Label"])
      }
      rm.col <- unlist(lapply(rm.ranef, function(x) {
        colnames(p1)[grepl(x, colnames(p1), fixed = TRUE)]
      }))
      rm.col <- unlist(lapply(smoothlabels, function(x) {
        rm.col[grepl(x, rm.col, fixed = TRUE)]
      }))
      # cancel random effects
      p1[, rm.col] <- 0
      p2[, rm.col] <- 0
      # find terms that only occur in random effects:
      predictors <- do.call("rbind", lapply(model$smooth, function(x) {
        data.frame(Label = x[["label"]], Terms = x[["term"]], stringsAsFactors = TRUE)
      }))
      test <- table(predictors$Terms) - table(predictors[predictors$Label %in% rm.ranef, ]$Terms)
      for (pred in names(test[test == 0])) {
        if (pred %in% names(mysummary)) {
          mysummary[[pred]] <- paste(mysummary[[pred]], "(Might be cancelled as random effect, check below.)")
        }
      }
      if (length(rm.col) > 0) {
        mysummary[["NOTE"]] = sprintf("The following random effects columns are cancelled: %s\n", paste(smoothlabels,
                                                                                                       collapse = ","))
      } else {
        mysummary[["NOTE"]] = "No random effects in the model to cancel.\n"
        # warning('No random effects to cancel.\n')
      }
    }
    # calculate the difference:
    p <- p1 - p2
    newd$difference <- as.vector(p %*% coef(model))
    if (se) {
      newd$SE <- sqrt(rowSums((p %*% vcov(model)) * p))
      newd$CI <- f * sqrt(rowSums((p %*% vcov(model)) * p))
    }
    # simultaneous CI See http://www.fromthebottomoftheheap.net/2016/12/15/simultaneous-interval-revisited/
    stackFits <- NULL
    if (sim.ci == TRUE) {
      Vb <- vcov(model, freq = FALSE, unconditional = TRUE)
      se.fit <- sqrt(rowSums((p %*% Vb) * p))
      sim <- mgcv::rmvn(10000, mu = rep(0, nrow(Vb)), V = Vb)
      # Cg <- predict(model, newd, type='lpmatrix') Cg replaced by p
      simDev <- p %*% t(sim)
      # Evaluate the basis function at g and compute the deviations between the fitted and true parameters. Then
      # we find the absolute values of the standardized deviations from the true model:
      absDev <- abs(sweep(simDev, 1, se.fit, FUN = "/"))
      # maximum of the absolute standardized deviations:
      masd <- apply(absDev, 2L, max)
      # set simultaneous X% CI on the basis of original se multiplication specified by f:
      crit <- quantile(masd, prob = 1 - round(2 * (1 - pnorm(f)), 2), type = 8)
      newd$sim.CI <- crit * se.fit

      if ((return.n.posterior > 0) | (print.summary == TRUE)) {
        # coverage pointwise and simultaneous CIs:
        sims <- rmvn(max(10000, return.n.posterior), mu = coef(model), V = Vb)
        fits <- p %*% t(sims)
        inCI <- function(x, upr, lwr) {
          all(x >= lwr & x <= upr)
        }
        fitsInPCI <- apply(fits, 2L, inCI, upr = newd$fit + newd$CI, lwr = newd$fit - newd$CI)
        fitsInSCI <- apply(fits, 2L, inCI, upr = newd$fit + newd$sim.CI, lwr = newd$fit - newd$sim.CI)
        mysummary[[paste("Simultaneous ", 100 * (1 - round(2 * (1 - pnorm(f)), 2)), "%-CI used", sep = "")]] = sprintf("\n\t\t%s\n\t\t%s\n\t\t%s\n",
                                                                                                                       paste("Critical value: ", round(crit, 3), sep = ""), paste("Proportion posterior simulations in pointwise CI: ",
                                                                                                                                                                                  round(sum(fitsInPCI)/length(fitsInPCI), 2), " (10000 samples)", sep = ""), paste("Proportion posterior simulations in simultaneous CI: ",
                                                                                                                                                                                                                                                                   round(sum(fitsInSCI)/length(fitsInSCI), 2), " (10000 samples)", sep = ""))

        if (return.n.posterior > 0) {
          rnd <- sample(max(10000, return.n.posterior), return.n.posterior)
          fits <- utils::stack(as.data.frame(fits[, rnd], stringsAsFactors = FALSE))[, 1]
          stackFits <- newd[rep(1:nrow(newd), length(rnd)), colnames(newd)[!colnames(newd) %in% c("fit",
                                                                                                  "CI", "sim.CI", "rm.ranef")]]
          row.names(stackFits) <- NULL
          stackFits$posterior.fit <- fits
          stackFits$draw <- rep(rnd, each = nrow(newd))
        }
      }
    }
    # print summary of chosen values
    # if (print.summary == TRUE) {
    #   print_summary(mysummary)
    # }

    if (return.n.posterior > 0) {
      return(list(newd = newd, posterior.fit = stackFits))
    } else {
      return(newd)
    }
  }
}


#' Return the regions in which the smooth is significantly different from zero.
#'
#' @import grDevices
#' @import graphics
#' @param mean A vector with smooth predictions.
#' @param se A vector with the standard error on the smooth predictions.
#' @param xVals Optional vector with x values for the smooth.
#' When `xVals` is provided, the regions are returned in terms of x-
#' values, otherwise as indices.
#' @param f A number to multiply the `se` with, to convert the `se`
#' into confidence intervals. Use 1.96 for 95% CI and 2.58 for 99%CI.
#' @param as.vector Logical: whether or not to return the data points as
#' vector, or not. Default is FALSE, and a list with start and end points will
#'  be returned.
#' @return The function returns a list with start points of each region
#' (`start`) and end points of each region (`end`). The logical
#' `xVals` indicates whether the returned values are on the x-scale
#' (TRUE) or indices (FALSE).
#' @author Jacolien van Rij
#' @keywords internal
#' @export
find_difference <- function(mean, se, xVals = NULL, f = 1, as.vector = FALSE) {
  if (length(mean) != length(se)) {
    stop("The vectors mean and se are not equal in length.")
  } else {
    ub <- mean + f * se
    lb <- mean - f * se

    n <- which(!(ub >= 0 & lb <= 0))
    if (as.vector) {
      if (length(n) == 0) {
        return(rep(FALSE, length(mean)))
      } else {
        out <- rep(FALSE, length(mean))
        out[n] <- TRUE
        return(out)
      }
    } else {
      if (length(n) == 0) {
        return(NULL)
      } else {
        n_prev <- c(NA, n[1:(length(n) - 1)])
        n_next <- c(n[2:length(n)], NA)
        if (!is.null(xVals) & (length(xVals) == length(mean))) {
          return(list(start = xVals[n[which(is.na(n - n_prev) | (n - n_prev) > 1)]], end = xVals[n[which(is.na(n_next -
                                                                                                                 n) | (n_next - n) > 1)]], xVals = TRUE))
        } else {
          return(list(start = n[which(is.na(n - n_prev) | (n - n_prev) > 1)], end = n[which(is.na(n_next -
                                                                                                    n) | (n_next - n) > 1)], xVals = FALSE))
        }

      }
    }
  }
}

#' Print a descriptive summary of a data frame.
#'
#' @export
#' @keywords internal
#' @description The function prints a summary of the data.
#' Similar to the function [utils::str()], but easier readable.
#' @param data A data frame.
#' @param print Logical: whether or not to print the summary.
#' @param n Number: maximum number of values being mentioned in the summary.
#' If NULL all values are being mentioned. Defaults to 10.
#' @return Optionally returns a named list with info.
#' @author Jacolien van Rij
summary_data <- function(data, print = TRUE, n = 10) {

  labelColumns <- function(x, data) {
    out <- NULL
    cn <- ifelse(is.numeric(x), colnames(data)[x], x)
    cl <- class(data[, cn])
    if (inherits(data[, cn], "factor")) {
      vals <- sort(unique(as.character(data[, x])))
      n.cur <- length(vals) + 1
      if (!is.null(n)) {
        n.cur <- n
      }
      if (length(vals) > n.cur) {
        out <- sprintf("factor with %d values; set to the value(s): %s, ...", length(vals), paste(vals[1:n.cur],
                                                                                                  collapse = ", "))
      } else {
        out <- sprintf("factor; set to the value(s): %s.", paste(vals, collapse = ", "))
      }
    } else if (inherits(data[, cn], "numeric")) {
      if (length(unique(data[, x])) > 2) {
        out <- sprintf("numeric predictor; with %d values ranging from %f to %f.", length(unique(data[,
                                                                                                      x])), min(data[, x], na.rm = TRUE), max(data[, x], na.rm = TRUE))
      } else {
        out <- sprintf("numeric predictor; set to the value(s): %s.", paste(unique(data[, x]), collapse = ", "))
      }
    } else if (inherits(data[, cn], "matrix")) {
      if (length(unique(data[, x])) > 2) {
        out <- sprintf("a matrix predictor; with %d values ranging from %f to %f.", length(unique(data[,
                                                                                                       x])), min(data[, x], na.rm = TRUE), max(data[, x], na.rm = TRUE))
      } else {
        out <- sprintf("matrix predictor; set to the value(s): %s.", paste(unique(data[, x]), collapse = ", "))
      }
    } else {
      vals <- sort(unique(data[, x]))
      n.cur <- length(vals) + 1
      if (!is.null(n)) {
        n.cur <- n
      }
      if (length(vals) > n.cur) {
        out <- sprintf("%s vector with %d values; set to the value(s): %s, ...", class(data[, cn])[1],
                       length(vals), paste(vals[1:n.cur], collapse = ", "))
      } else {
        out <- sprintf("%s vector; set to the value(s): %s.", class(data[, cn])[1], paste(vals, collapse = ", "))
      }
    }
    return(out)
  }
  mysummary <- sapply(colnames(data), function(x) {
    labelColumns(x, data)
  })
  # if (print) {
  #   print_summary(mysummary)
  # }
  invisible(mysummary)
}