R/make_predictions.R


#' @rdname make_predictions
#' @export

make_predictions <- function(model, ...) {
  
  UseMethod("make_predictions")
  
}

# Helper that assembles the final return object: either the prediction tibble
# alone or a list that also includes the original (or partialized) data
prepare_return_data <- function(model, data, return.orig.data, 
                                partial.residuals, pm, pred, at, 
                                center, set.offset, formula = NULL, ...) {
  if (return.orig.data == FALSE && partial.residuals == FALSE) {
    o <- tibble::as_tibble(pm)
  } else {
    # Want to make sure I return the full original data if it's already given, 
    # otherwise fetch it again
    if (is.null(data)) {
      if (is.null(formula)) {
        formula <- get_formula(model)
      }
      suppressMessages(d <- get_data(model, formula = formula))
    } else if (!is.null(data)) {
      if (!is.null(formula)) {
        # I'll assume formula was specified so that I would fetch more variables
        suppressMessages(d <- get_data(model, formula = formula))
      } else {
        formula <- get_formula(model)
        d <- data
      }
    }
    if (return.orig.data == TRUE && partial.residuals == FALSE) {
      o <- list(predictions = tibble::as_tibble(pm), data = d)
      if ("is_dpar" %in% names(attributes(formula))) {return(o)}
      resp <- as.character(deparse(get_lhs(formula)))
      # If left-hand side is transformed, make new column in original data for
      # the transformed version and evaluate it
      if (is_lhs_transformed(formula)) {
        if (!check_two_col(model)) {
          o[[2]][[resp]] <- eval(get_lhs(formula), o[[2]])
        } 
      }
      # For binomial family, character/logical DVs can cause problems
      if (get_family(model, ...)$family == "binomial" &&
          !is.numeric(o[[2]][[resp]]) && !check_two_col(model)) {
        if (is.logical(o[[2]][[resp]])) {
          o[[2]][[resp]] <- as.numeric(o[[2]][[resp]])
        } else {
          o[[2]][[resp]] <- 
            as.numeric(o[[2]][[resp]] != zero_or_base(o[[2]][[resp]]))
        }
      }
    } else {
      if ("is_dpar" %in% names(attributes(formula))) {
        stop_wrap("Partial residuals cannot be calculated for distributional
                  dependent variables.")
      }
      o <- list(predictions = tibble::as_tibble(pm), data = 
                  suppressMessages(
                    partialize(model, vars = pred, at = at, data = data,
                               center = center, set.offset = set.offset,
                               formula = formula, ...)
                  )
                )
    }
  }
  return(o)
}
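
# A minimal sketch (illustrative only, not package code) of the two return
# shapes prepare_return_data() produces, as seen through the public interface.
# Wrapped in `if (FALSE)` so it never executes when this file is sourced.
if (FALSE) {
  fit <- lm(mpg ~ hp + wt, data = mtcars)
  # Default: a single tibble of predictions
  make_predictions(fit, pred = "hp")
  # With return.orig.data = TRUE: a list with elements "predictions" and "data"
  out <- make_predictions(fit, pred = "hp", return.orig.data = TRUE)
  names(out)
}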

#### Default method ########################################################

#' @title Generate predicted data for plotting results of regression models
#'
#' @description This is an alternate interface to the underlying tools that
#'   make up [effect_plot()] as well as `interactions::interact_plot()` and
#'   `interactions::cat_plot()` from the `interactions` package.
#'   `make_predictions()` creates the data to be plotted and adds information
#'   to the original data to make it more amenable for plotting with the
#'   predicted data.
#'   
#' @inheritParams make_new_data
#' @inheritParams effect_plot
#' 
#' @param data Optional, default is NULL. You may provide the data used to
#'   fit the model. This can be a better way to get mean values for centering
#'   and can be crucial for models with variable transformations in the formula
#'   (e.g., `log(x)`) or polynomial terms (e.g., `poly(x, 2)`). If you do not
#'   provide the data, the function will attempt to retrieve the original data
#'   from the global environment and will warn you if it detects problems that
#'   would likely be solved by supplying the data with this argument.
#'
#' @param interval Logical. If \code{TRUE}, plots confidence/prediction
#'   intervals around the line using \code{\link[ggplot2]{geom_ribbon}}.
#'
#' @param int.type Type of interval to plot. Options are "confidence" or
#'  "prediction". Default is confidence interval.
#'
#' @param int.width How wide should the interval be? The default, .95,
#'   corresponds to roughly 1.96 standard errors and a .05 alpha level for
#'   values outside the range; in other words, .95 yields a 95% confidence
#'   interval.
#'
#' @param outcome.scale For nonlinear models (i.e., GLMs), should the outcome
#'   variable be plotted on the link scale (e.g., log odds for logit models) or
#'   the original scale (e.g., predicted probabilities for logit models)? The
#'   default is \code{"response"}, which is the original scale. For the link
#'   scale, which will show straight lines rather than curves, use
#'   \code{"link"}.
#'
#' @param vcov Optional. You may supply the variance-covariance matrix of the
#'  coefficients yourself. This is useful if you are using some method for
#'  robust standard error calculation not supported by the \pkg{sandwich}
#'  package.
#'
#' @param set.offset For models with an offset (e.g., Poisson models), sets an
#'   offset for the predicted values. All predicted values will have the same
#'   offset. By default, this is set to 1, which makes the predicted values a
#'   proportion. See details for more about offset support.
#'
#' @param robust Should robust standard errors be used to find confidence
#'   intervals for supported models? Default is FALSE, but you should specify
#'   the type of sandwich standard errors if you'd like to use them (i.e.,
#'   `"HC0"`, `"HC1"`, and so on). If `TRUE`, defaults to `"HC3"` standard
#'   errors.
#'
#' @param cluster For clustered standard errors, provide the column name of
#'   the cluster variable in the input data frame (as a string). Alternately,
#'   provide a vector of clusters.
#'   
#' @param return.orig.data Instead of returning just the predicted data frame,
#'  should the original data be returned as well? If so, a list will be
#'  returned with both the predicted data (as the first element) and the
#'  original data (as the second element). Default is FALSE.
#'
#' @param partial.residuals If `return.orig.data` is TRUE, should the observed
#'   dependent variable be replaced with the partial residual? This makes a 
#'   call to [partialize()], where you can find more details.
#'   
#' @param new_data If you would prefer to generate your own hypothetical
#'   (or not hypothetical) data rather than have the function make a call to
#'   [make_new_data()], you can provide it.
#'   
#' @param ... Ignored.
#'
#' @family plotting tools
#' @importFrom stats coef coefficients lm predict sd qnorm getCall model.offset
#' @importFrom stats median ecdf quantile get_all_vars complete.cases qt
#' @rdname make_predictions
#' @export
#' 
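#' @examples
#' # A minimal illustration (a sketch, not an exhaustive demo): a linear model
#' # fit to a base R dataset, with predictions generated across the observed
#' # range of `hp` while the other predictor is held at a central value.
#' fit <- lm(mpg ~ hp + wt, data = mtcars)
#' make_predictions(fit, pred = "hp", interval = TRUE)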

make_predictions.default <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, interval = TRUE,
  int.type = c("confidence", "prediction"), int.width = .95,
  outcome.scale = "response", robust = FALSE, cluster = NULL, vcov = NULL,
  set.offset = NULL, new_data = NULL, return.orig.data = FALSE,
  partial.residuals = FALSE, ...) {
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset)
  } else {pm <- new_data}
  
  
  link_or_lm <- ifelse(get_family(model)$link == "identity",
                       yes = "response", no = "link")
  
  # Do the predictions using built-in prediction method if robust is FALSE
  if (robust == FALSE) {
    predicted <- as.data.frame(predict(model, newdata = pm,
                                       se.fit = interval,
                                       interval = int.type[1],
                                       type = link_or_lm))
  } else { # Use my custom robust predictions function
    if (is.null(vcov)) {
      the_vcov <- do_robust(model, robust, cluster, data)$vcov
    } else {
      the_vcov <- vcov
    }
    predicted <- as.data.frame(predict_rob(model, newdata = pm,
                                           se.fit = interval,
                                           interval = int.type[1],
                                           type = link_or_lm,
                                           .vcov = the_vcov))
  }
  
  pm[[get_response_name(model)]] <- predicted[[1]]
  
  ## Convert the confidence percentile to a number of S.E. to multiply by
  intw <- 1 - ((1 - int.width)/2)
  ## Try to get the residual degrees of freedom to get the critical value
  r.df <- try({
    df.residual(model)
  }, silent = TRUE)
  if (is.numeric(r.df)) {
    ses <- qt(intw, r.df)
  } else {
    message(wrap_str("Could not find residual degrees of freedom for this
                       model. Using confidence intervals based on normal
                       distribution instead."))
    ses <- qnorm(intw, 0, 1)
  }
  
  # Set minimum and maximum values for plotting intervals
  if (interval == TRUE) { # only create interval columns if they are requested
    if ("fit.lwr" %nin% names(predicted)) { # predict() gave SEs, not bounds
      if (int.type[1] == "prediction") {
        stop_wrap("R does not compute prediction intervals for this kind of
                  model.")
      }
      pm[["ymax"]] <- pm[[get_response_name(model)]] + (predicted[["se.fit"]]) * ses
      pm[["ymin"]] <- pm[[get_response_name(model)]] - (predicted[["se.fit"]]) * ses
    } else {
      pm[["ymin"]] <- predicted[["fit.lwr"]]
      pm[["ymax"]] <- predicted[["fit.upr"]]
    }
  }
  
  # Back-convert the predictions to the response scale
  if (outcome.scale == "response") {
    pm[[get_response_name(model)]] <-
      get_family(model)$linkinv(pm[[get_response_name(model)]])
    if (interval == TRUE) {
      pm[["ymax"]] <- get_family(model)$linkinv(pm[["ymax"]])
      pm[["ymin"]] <- get_family(model)$linkinv(pm[["ymin"]])
    }
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset)
  return(o)
  
}
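
# A hedged sketch of the `robust` and `vcov` arguments (assumes the sandwich
# package is installed). Note that, per the code above, `vcov` is only
# consulted when `robust` is not FALSE. Guarded so it never runs on load.
if (FALSE) {
  fit <- lm(mpg ~ hp + wt, data = mtcars)
  # Sandwich-type standard errors by name
  make_predictions(fit, pred = "hp", robust = "HC3")
  # Or supply a variance-covariance matrix directly
  make_predictions(fit, pred = "hp", robust = TRUE,
                   vcov = sandwich::vcovHC(fit, type = "HC3"))
}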

### svyglm method #############################################################
#' @export

make_predictions.svyglm <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, interval = TRUE, 
  int.type = c("confidence", "prediction"), int.width = .95, 
  outcome.scale = "response", set.offset = NULL, new_data = NULL,
  return.orig.data = FALSE, partial.residuals = FALSE, ...) {
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset)
  } else {pm <- new_data}
  
  link_or_lm <- ifelse(family(model)$link == "identity",
                       yes = "response", no = "link")
  
  # Do the predictions using the built-in prediction method
  predicted <- as.data.frame(predict(model, newdata = pm,
                                     se.fit = TRUE, interval = int.type[1],
                                     type = link_or_lm))
  
  pm[[get_response_name(model)]] <- predicted[[1]]
  
  ## Convert the confidence percentile to a number of S.E. to multiply by
  intw <- 1 - ((1 - int.width)/2)
  ## Try to get the residual degrees of freedom to get the critical value
  r.df <- try({
    df.residual(model)
  }, silent = TRUE)
  if (is.numeric(r.df)) {
    ses <- qt(intw, r.df)
  } else {
    message(wrap_str("Could not find residual degrees of freedom for this
                       model. Using confidence intervals based on normal
                       distribution instead."))
    ses <- qnorm(intw, 0, 1)
  }
  
  # Set minimum and maximum values for plotting intervals
  if (interval == TRUE) { # only create interval columns if they are needed
    pm[["ymax"]] <- pm[[get_response_name(model)]] + (predicted[["SE"]]) * ses
    pm[["ymin"]] <- pm[[get_response_name(model)]] - (predicted[["SE"]]) * ses
  }
  
  
  # Back-convert the predictions to the response scale
  if (outcome.scale == "response") {
    pm[[get_response_name(model)]] <-
      family(model)$linkinv(pm[[get_response_name(model)]])
    if (interval == TRUE) {
      pm[["ymax"]] <- family(model)$linkinv(pm[["ymax"]])
      pm[["ymin"]] <- family(model)$linkinv(pm[["ymin"]])
    }
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset)
  return(o)
  
}
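
# A hedged sketch for survey-weighted GLMs (assumes the survey package and its
# bundled `api` data). Guarded so it never runs when this file is sourced.
if (FALSE) {
  library(survey)
  data(api)
  dstrat <- svydesign(id = ~1, strata = ~stype, weights = ~pw,
                      data = apistrat, fpc = ~fpc)
  m <- svyglm(api00 ~ ell + meals, design = dstrat)
  make_predictions(m, pred = "meals", interval = TRUE)
}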

### merMod method #############################################################
#' @export
make_predictions.merMod <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, interval = TRUE, 
  int.type = c("confidence", "prediction"), int.width = .95,
  outcome.scale = "response", re.form = ~0, add.re.variance = FALSE,
  boot = FALSE, sims = 1000, progress = "txt", set.offset = NULL, 
  new_data = NULL, return.orig.data = FALSE, partial.residuals = FALSE,
  message = TRUE, ...) {
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset)
  } else {pm <- new_data}
  
  resp <- get_response_name(model)
  link_or_lm <- ifelse(family(model)$link == "identity",
                       yes = "response", no = "link")
  
  if (interval == TRUE && boot == FALSE && message == TRUE) {
    msg_wrap("Confidence intervals for merMod models is an experimental
              feature. The intervals reflect only the variance of the
              fixed effects, not the random effects.")
  }
  
  # Use the built-in prediction method if no interval is needed and there is no offset
  if (interval == FALSE && is.null(model.offset(model.frame(model)))) {
    predicted <- as.data.frame(predict(model, newdata = pm,
                                       type = link_or_lm,
                                       re.form = re.form,
                                       allow.new.levels = FALSE))
    
    pm[[get_response_name(model)]] <- predicted[[1]]
    
  } else { # Use my custom predictions function
    
    if (interactive() && boot == TRUE && progress != "none") {
      cat("Bootstrap progress:\n")
    }
    predicted <- predict_mer(model, newdata = pm, use_re_var = add.re.variance,
                             se.fit = TRUE, allow.new.levels = TRUE, 
                             type = link_or_lm, re.form = re.form,
                             boot = boot, sims = sims, prog_arg = progress, ...)
    
    if (boot == TRUE) {
      raw_boot <- predicted
      
      ## Convert the interval width into upper/lower quantile probabilities
      intw <- 1 - ((1 - int.width) / 2)
      # Set the predicted values at the median
      fit <- sapply(as.data.frame(raw_boot), median)
      upper <- sapply(as.data.frame(raw_boot), quantile, probs = intw)
      lower <- sapply(as.data.frame(raw_boot), quantile, probs = 1 - intw)
      
      # Add to predicted frame
      pm[[resp]] <- fit
      pm[["ymax"]] <- upper
      pm[["ymin"]] <- lower
      
      # Drop the cases that should be missing if I had done it piecewise
      pm <- pm[complete.cases(pm),]
    } else {
      ## Convert the confidence percentile to a number of S.E. to multiply by
      intw <- 1 - ((1 - int.width)/2)
      ## Try to get the residual degrees of freedom to get the critical value
      r.df <- try({
        df.residual(model)
      }, silent = TRUE)
      if (is.numeric(r.df)) {
        ses <- qt(intw, r.df)
      } else {
        message(wrap_str("Could not find residual degrees of freedom for this
                       model. Using confidence intervals based on normal
                       distribution instead."))
        ses <- qnorm(intw, 0, 1)
      }
      pm[[get_response_name(model)]] <- predicted[[1]]
      pm[["ymax"]] <-
        pm[[get_response_name(model)]] + (predicted[["se.fit"]]) * ses
      pm[["ymin"]] <-
        pm[[get_response_name(model)]] - (predicted[["se.fit"]]) * ses
    }
  }
  
  
  # Back-convert the predictions to the response scale
  if (outcome.scale == "response") {
    pm[[resp]] <- family(model)$linkinv(pm[[resp]])
    if (interval == TRUE) {
      pm[["ymax"]] <- family(model)$linkinv(pm[["ymax"]])
      pm[["ymin"]] <- family(model)$linkinv(pm[["ymin"]])
    }
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset)
  return(o)
  
}
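
# A hedged sketch for lme4 models (assumes lme4 and its sleepstudy data).
# Guarded so it never runs when this file is sourced.
if (FALSE) {
  library(lme4)
  fm <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
  # Fixed-effects-only predictions with approximate intervals
  make_predictions(fm, pred = "Days", interval = TRUE)
  # Bootstrapped intervals (slower); `sims` sets the number of simulations
  make_predictions(fm, pred = "Days", interval = TRUE, boot = TRUE, sims = 100)
}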


### stanreg method ###########################################################
#' @export

make_predictions.stanreg <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, estimate = c("mean", "median"), interval = TRUE,
  int.width = .95, re.form = ~0,  set.offset = NULL, new_data = NULL,
  return.orig.data = FALSE, partial.residuals = FALSE, ...) {
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset)
  } else {pm <- new_data}
  
  predicted <- 
    rstanarm::posterior_predict(model, 
                                newdata = pm %not% get_response_name(model), 
                                re.form = re.form)
  
  # posterior_predict() returns a 'ppd' pseudo-matrix that does not coerce
  # cleanly to a data frame, so summarize it column by column instead
  if (estimate[1] == "mean") {
    pm[[get_response_name(model)]] <- colMeans(predicted)
  } else if (estimate[1] == "median") {
    pm[[get_response_name(model)]] <- apply(predicted, 2, median)
  }
  
  if (interval == TRUE) {
    
    ints <- rstanarm::predictive_interval(predicted, prob = int.width)
    
    pm[["ymax"]] <- ints[,2]
    pm[["ymin"]] <- ints[,1]
    
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset)
  return(o)
  
}
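
# A hedged sketch for rstanarm models (assumes rstanarm is installed; MCMC
# sampling is slow, so the block is guarded and meant for interactive use).
if (FALSE) {
  library(rstanarm)
  m <- stan_glm(mpg ~ hp + wt, data = mtcars, refresh = 0)
  # Posterior-predictive medians with a 95% predictive interval
  make_predictions(m, pred = "hp", estimate = "median", int.width = 0.95)
}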

### brmsfit method ###########################################################
#' @export
make_predictions.brmsfit <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, estimate = c("mean", "median"), interval = TRUE,
  int.width = .95, re.form = ~0,  set.offset = NULL, new_data = NULL,
  return.orig.data = FALSE, partial.residuals = FALSE, dpar = NULL, resp = NULL,
  outcome.scale = c("response", "linear"), ...) {
  
  # Translate to brms's interface
  if (outcome.scale[1] == "link") {outcome.scale <- "linear"}
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset,
                        dpar = dpar, resp = resp)
  } else {pm <- new_data}
  
  intw <- c(((1 - int.width)/2), 1 - ((1 - int.width)/2))
  
  # fitted() with summary = TRUE returns a matrix of estimates and interval
  # bounds; coerce it to a data frame for easier column access
  predicted <- as.data.frame(fitted(model,
                               newdata = pm, re_formula = re.form,
                               robust = estimate[1] == "median",
                               probs = intw, summary = TRUE,
                               resp = get_response_name(model, resp = resp),
                               dpar = dpar, scale = outcome.scale))
  pm[[get_response_name(model, resp = resp, dpar = dpar)]] <- predicted[[1]]
  
  if (interval == TRUE) {
    pm[["ymax"]] <- predicted[[4]]
    pm[["ymin"]] <- predicted[[3]]
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset, 
                           formula = get_formula(model, dpar = dpar, resp = resp),
                           dpar = dpar, resp = resp)
  return(o)
  
}
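
# A hedged sketch for brms models (assumes brms is installed and a C++
# compiler is available; fitting is slow, so the block is guarded).
if (FALSE) {
  library(brms)
  m <- brm(count ~ zAge + zBase * Trt, data = epilepsy, family = poisson())
  # Fitted means on the response scale across `zBase`
  make_predictions(m, pred = "zBase", interval = TRUE)
  # Or on the linear predictor (link) scale
  make_predictions(m, pred = "zBase", outcome.scale = "link")
}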


### quantile regression #######################################################
#' @export

make_predictions.rq <- function(model, pred, pred.values = NULL, at = NULL,
  data = NULL, center = TRUE, interval = TRUE, int.width = .95,
  se = c("nid", "iid", "ker"), set.offset = NULL, new_data = NULL,
  return.orig.data = FALSE, partial.residuals = FALSE, ...) {
  
  se <- match.arg(se, c("nid", "iid", "ker"), several.ok = FALSE)
  
  # Check if user provided own new_data
  if (is.null(new_data)) {
    # Get the data ready with make_new_data()
    pm <- make_new_data(model, pred, pred.values = pred.values, at = at, 
                        data = data, center = center, set.offset = set.offset)
  } else {pm <- new_data}
  
  predicted <- as.data.frame(predict(model, newdata = pm,
                                     interval = "confidence",
                                     se = se, level = int.width))
  
  pm[[get_response_name(model)]] <- predicted[[1]]
  
  if (interval == TRUE) {
    pm[["ymax"]] <- predicted[["higher"]]
    pm[["ymin"]] <- predicted[["lower"]]
  }
  
  # Use helper function to prepare the final return object
  o <- prepare_return_data(model = model, data = data,
                           return.orig.data = return.orig.data, 
                           partial.residuals = partial.residuals,
                           pm = pm, pred = pred, at = at, center = center,
                           set.offset = set.offset)
  return(o)
  
}
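
# A hedged sketch for quantile regression (assumes quantreg and its engel
# data). Guarded so it never runs when this file is sourced.
if (FALSE) {
  library(quantreg)
  data(engel)
  m <- rq(foodexp ~ income, tau = 0.5, data = engel)
  # Median-regression predictions with "nid" standard errors
  make_predictions(m, pred = "income", interval = TRUE, se = "nid")
}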
