#' \code{Imputation}
#' @description This is a wrapper for \code{\link{mice}}.
#' @param data A \code{\link{data.frame}}.
#' @param formula A \code{\link{formula}}. Where the formula contains a dependent variable,
#' observations with missing values on this variable are deleted after the imputation (von Hippel 2007).
#' @param method "mice" applies multivariate imputation by chained equations
#' (predictive mean matching) with the \code{\link[mice]{mice}} package. "hot deck" applies the \code{\link[hot.deck:hot.deck]{hot.deck}} method.
#' The default setting is "try mice", which first applies the \code{\link[mice]{mice}} method and,if an error
#' occurs, falls back to \code{\link[hot.deck]{hot.deck}}.
#' @param m Number of imputation samples.
#' @param seed Seed used in random number generation.
#' @references von Hippel, Paul T. 2007. "Regression With Missing Y's: An
#' Improved Strategy for Analyzing Multiply Imputed Data." Sociological Methodology 37:83-117.
#' Skyler J. Cranmer and Jeff Gill (2013). We Have to Be Discrete About This: A Non-Parametric Imputation Technique for Missing Categorical Data. British Journal of Political Science, 43, pp 425-449.
#' Stef van Buuren and Karin Groothuis-Oudshoorn (2011), "mice: Multivariate
#' Imputation by Chained Equations in R", Journal of Statistical Software, 45:3, 1-67.
#' @details Variables with class \code{"POSIXct"} are converted to numeric and
#' scaled prior to imputation. Variables with class \code{"character"} are converted
#' to factor prior to imputation with blank (0-character) entries considered missing.
#' @importFrom mice mice complete
#' @importFrom flipU OutcomeName CopyAttributes AnyNA AllVariablesNames
#' @importFrom hot.deck hot.deck
#' @export
Imputation <- function(data = NULL, formula = NULL, method = "try mice", m = 1, seed = 12321)
{
method <- tolower(method)
.errorInImputation <- function(imputed.data, formula)
{
if (any("try-error" %in% class(imputed.data)))
return(TRUE)
for (i.data in imputed.data){
if(anyNA(i.data))
return(TRUE)
}
FALSE
}
## Log warnings so they can be throw only in the event of an error
## mice and hot.deck can throw uninformative error messages when given
## bad data, but they usually throw an informative warning in these cases.
## Warnings from mice are logged in a data.frame called loggedEvents in the
## output returned by mice().
## See https://stat.ethz.ch/pipermail/r-help/2010-December/262626.html
.withWarnings <- function(expr) {
warnings <- NULL
wHandler <- function(w) {
warnings <<- c(warnings, list(w))
invokeRestart("muffleWarning")
}
val <- withCallingHandlers(expr, warning = wHandler)
list(value = val, warnings = warnings)
}
single.var <- NCOL(data) == 1L
pdata <- preProcessData(data)
if(!anyNA(pdata))
return(lapply(seq(m), function(x) data))
outcome.name <- if (is.null(formula)) NULL else OutcomeName(formula)
if (!is.null(outcome.name))
{
temp.data <- data[, AllVariablesNames(formula, data = data)]
temp.data <- data[!is.na(temp.data[, outcome.name]), , drop = FALSE]
if(!any(is.na(temp.data)))
{
warning("Imputation has been selected, but the data has no missing ",
"values in the predictors, so nothing has been imputed.")
temp.data <- CopyAttributes(temp.data, data)
return(lapply(seq(m), function(x) temp.data))
}
}
hot.deck.used <- FALSE
if (method != "hot deck")
{
imputed.data <- suppressWarnings(try(
{
set.seed(seed)
mice.method <- if (single.var) c("sample", "") # else NULL
mice.setup <- mice(pdata, m = m, seed = seed, printFlag = FALSE,
method = mice.method)
data.sets <- vector("list", m)
for (i in 1:m)
data.sets[[i]] <- complete(mice.setup, action = i)
data.sets
}
, silent = TRUE))
if (method == "mice" && .errorInImputation(imputed.data, formula))
{
if (exists("mice.setup"))
{
.logEventStr <- function(log.events)
{
if (is.null(log.events) || nrow(log.events) == 0)
return("")
out.str <- "Log Events:\n"
for (i in nrow(log.events))
out.str <- paste0(out.str,
paste(names(log.events), log.events[i,],
sep = " = ", collapse = ", "),
"\n")
return(out.str)
}
extra.warnings <- paste0(" See ?mice::mids for interpretation of ",
"the following logged events to diagnose ",
"the issue.\n",
.logEventStr(mice.setup$loggedEvents))
}else
extra.warnings <- ""
stop("Mice imputation failed.", extra.warnings)
}
}
if (method != "mice" && (method == "hot deck" ||
.errorInImputation(imputed.data, formula)))
{
set.seed(seed)
imputed.data <- .withWarnings(try(hot.deck(pdata, m = m), silent = TRUE))
warnings <- imputed.data$warnings
imputed.data <- imputed.data$value
failed <- inherits(imputed.data, "try-error")
if (failed || .errorInImputation(imputed.data$data, formula))
stop("Imputation has failed.\n",
if (failed)
paste0(attr(imputed.data, "condition")$message, "\n"),
vapply(warnings, function(x)x$message, ""))
imputed.data <- imputed.data$data
all.na.rows <- nrow(imputed.data[[1]]) != nrow(data)
hot.deck.used <- TRUE
}
imputation.method <- ifelse(hot.deck.used, "hot decking",
"chained equations (predictive mean matching)")
for (i in 1:m)
{
imputed.data[[i]] <- postProcessData(pdata, imputed.data[[i]], data)
attr(imputed.data[[i]], "imputation.method") <- imputation.method
}
if (!is.null(formula))
{
if (!is.null(outcome.name))
{
for (i in 1:m)
{
valid.dependent <- !is.na(data[, outcome.name])
imp.data <- imputed.data[[i]][valid.dependent, ] # Excluding observations with missing values.
imp.data <- CopyAttributes(imp.data, imputed.data[[i]]) # Copying labels
attr(imp.data, "imputation.method") <- attr(imputed.data[[i]], "imputation.method") # Data file attributes
imputed.data[[i]] <- imp.data
}
}
}
imputed.data
}
#' 1) CE-437: Drops levels from integer and numeric columns created by
#' calling unclass() on factors contain levels, which confuses mice
#' and causes it to crash (and display cryptic warnings).
#' 2) Converts date variables to numeric
#' 3) relabels factors with duplicate levels so they are unique
#' 4) Converts text variables to factors with blank entries converted to NA
#' @noRd
preProcessData <- function(data)
{
orig.classes <- lapply(data, class)
## orig.attrs <- lapply(data, attributes)
for (nms in names(data))
{
clss <- class(data[[nms]])
if (any(c("integer", "numeric") %in% clss))
attr(data[[nms]], "levels") <- NULL
else if ("character" %in% clss)
data[[nms]] <- factor(data[[nms]], exclude = c("", NA))
else if ("factor" %in% clss)
levels(data[[nms]]) <- make.unique(levels(data[[nms]]))
else if (inherits(data[[nms]], c("QDate", "POSIXlt", "POSIXct")))
data[[nms]] <- as.numeric(data[[nms]])/1e9
}
## add dummy extra variable to allow mice/hotdeck to run with a single variable
if (NCOL(data) == 1L)
data <- cbind(data, DUMMY__VAR__ = 1)
## need to replace names to avoid errors in mice v3.0.0
attr(data, "orig.names") <- colnames(data)
colnames(data) <- paste0("A", 1:ncol(data))
return(data)
}
#' Converts text and date variables back to their original types
#' Converts de-duplicated factor levels back to their original state
#' @noRd
postProcessData <- function(preprocessed.data, imputed.data, orig.data)
{
## orig.classes <- preprocessed.data[["orig.classes"]]
## orig.attrs <- preprocessed.data[["orig.attrs"]]
colnames(imputed.data) <- colnames(orig.data)
orig.classes <- lapply(orig.data, class)
if (NCOL(orig.data) == 1) # drop dummy variable needed to run mice with 1 var.
imputed.data <- imputed.data[, 1, drop = FALSE]
if (nrow(imputed.data) != nrow(orig.data))
imputed.data <- imputeAllMissingRows(imputed.data, preprocessed.data)
for (nms in colnames(imputed.data))
{
orig.class <- orig.classes[[nms]]
if ("character" %in% orig.class)
imputed.data[[nms]] <- as.character(imputed.data[[nms]])
else if ("factor" %in% orig.class) # preserve duplicate levels to match Displayr
imputed.data[[nms]] <- structure(as.integer(imputed.data[[nms]]),
.Label = levels(orig.data[[nms]]),
class = class(orig.data[[nms]]))
else if (any(orig.class %in% c("QDate", "POSIXct")))
{
imputed.data[[nms]] <- as.POSIXct(1e9*imputed.data[[nms]],
origin = "1970-01-01")
class(imputed.data[[nms]]) <- c(class(imputed.data[[nms]]), "QDate")
}
}
imputed.data <- CopyAttributes(imputed.data, orig.data,
attr.to.not.copy = c("dimnames", "names", "dim", "class", "levels"))
## colnames(imputed.data) <- attr(preprocessed.data, "orig.names")
return(imputed.data)
}
#' The hot deck method drops rows from the data that are entirely missing. This
#' function uses mice to impute values for rows with entirely missing data by
#' randomly sampling the observed values
imputeAllMissingRows <- function(imputed.data, processed.data, seed = 585)
{
out.dat <- processed.data
all.na.rows <- which(apply(processed.data, 1, function(x) all(is.na(x))))
out.dat[-all.na.rows, ] <- imputed.data
mice.method <- rep("sample", ncol(out.dat))
try(
{
mice.out <- mice(out.dat, m = 1, seed = seed, printFlag = FALSE,
method = mice.method)
out.dat <- complete(mice.out, action = 1)
}, TRUE)
return(out.dat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.