#' imputeForest
#'
#' Missing Value Imputation through randomForest Proximity Matrix
#'
#' @export
#'
#' @importFrom stats predict
#'
#' @author David Navega
#'
#' @param x a data.frame.
#' @param y a numeric or factor vector.
#' @param method type of procedure for imputation. Default is "proximity" for
#' proximity based imputation. "missForest" triggers a hybrid technique using
#' missForest and proximity based imputation.
#' @param initial a character defining how initial missing values are imputed.
#' Default is "mode" for mode imputation. "random" for random imputation from
#' marginal distribution of each feature.
#' @param mtry number of varibles to try at each split.
#' @param ntree number of trees.
#' @param iterations number of iterations used to update imputed values.
#' @param mcar amount of missing values to induced completely at random. Used
#' compute cross-validation imputation error.
#' @param crossvalidation a logical indicating whether cross-validation should
#' be made or not. Default is TRUE.
#' @param kfold number of fold in cross-validation. Default is 5.
#' @param verbose a logical indicating if output should be more detailed.
#' @param seed a positive integer defining the seed for the random number generator.
#' @param ... additional arguments for randomForest
#'
#' @return an object of class imputeForest
#'
#' @seealso \link{randomForest}
#'
#' @examples
#' x <- iris[, -5]
#' y <- iris$Species
#' x_na <- induce_missing(x)
#' print(imputeForest(x = x_na, y = y))
#
imputeForest <- function(x, y = NULL,
method = "missForest",
initial = "mode",
mtry = ceiling(sqrt(ncol(x))),
ntree = 300,
iterations = 5,
mcar = 0.10,
crossvalidation = T,
kfold = 5,
verbose = F,
seed = 1984,
...
) {
# RNG
set.seed(seed)
# initialize and assess class of each variable
x_names <- colnames(x)
# assess if x is data.frame tbl_df are coerced
condition <- "data.frame" %in% class(x)
if(condition) {
x <- as.data.frame(x)
} else {
stop("[-] x must be a 'data.frame' object")
}
x_class <- lapply(x, class)
x_class <- named_apply(x_class, numeric_factor_class, simplify = T)
x_dimensions <- dim(x)
n <- x_dimensions[1]
m <- x_dimensions[2]
condition <- is.null(y)
if(condition) {
# unsupervised imputation ----
type <- "unsupervised"
switch(method,
"proximity" = {
# step i - mode imputation ----
condition <- verbose
if(condition) {
cat("\nUnsupervised Imputation Forest\n")
cat("\nProximity based Imputation\n")
cat("Step I - Mode Imputation", "\n",sep = "")
}
# mode imputation ---
imputed_x <- switch (initial,
mode = {
mode_imputation(x)
},
random = {
random_imputation(x)
},
{
stop("[-] initial should be 'mode' or 'random'.")
}
)
unsupervised_forest <- unsupervisedForest(
x = imputed_x,
ntree = ntree,
mtry = mtry,
...
)
# proximity imputation (1) ---
proximity_matrix <- unsupervised_forest$proximity
proximity_imputed <- proximity_imputation(
x = x,
proximity = proximity_matrix
)
# step ii - proximity imputation (iterate) ----
n_iterations <- 0
while(n_iterations < iterations) {
unsupervised_forest <- unsupervisedForest(
x = proximity_imputed,
ntree = ntree,
mtry = mtry,
...
)
proximity_matrix <- unsupervised_forest$proximity
proximity_imputed <- proximity_imputation(
x = x,
proximity = proximity_matrix
)
condition <- verbose
if(condition) {
cat("Step II - Iterations:", n_iterations + 1, "of", iterations, "\n")
}
condition <- all(imputed_x == proximity_imputed)
if(condition) {
break
}
imputed_x <- proximity_imputed
n_iterations <- n_iterations + 1
}
},
"missForest" = {
condition <- verbose
if(condition) {
cat("\nUnsupervised Imputation Forest\n")
cat("missForest | Proximity based Imputation Hybrid", "\n", sep = "")
}
# missForest imputation ---
condition <- verbose
if(condition) {
miss_forest <- missForest::missForest(x)
} else {
condition <- Sys.info()["sysname"] == "Windows"
if(condition){
sink("NUL")
miss_forest <- missForest::missForest(x)
sink()
} else {
sink("/dev/null")
sink()
miss_forest <- missForest::missForest(x)
sink()
}
}
imputed_x <- miss_forest$ximp
# proximity forest for external mapping ---
unsupervised_forest <- unsupervisedForest(
x = imputed_x,
ntree = ntree,
mtry = mtry,
...
)
n_iterations <- iterations
},
{stop("[-] method must be 'proximity' or 'missForest'.")}
)
} else {
type <- "supervised"
# supervised imputation ----
switch(method,
"proximity" = {
# step i - mode imputation ----
condition <- verbose
if(condition) {
cat("\nSupervised Imputation Forest\n")
cat("\nProximity based Imputation\n")
cat("Step I - Mode Imputation", "\n", sep = "")
}
# mode imputation ---
imputed_x <- switch (initial,
mode = {
mode_imputation(x)
},
random = {
random_imputation(x)
},
{
stop("[-] initial should be 'mode' or 'random'.")
}
)
supervised_forest <- randomForest::randomForest(
x = imputed_x,
y = y,
ntree = ntree,
mtry = mtry,
proximity = T,
oob.prox = T,
...
)
proximity_matrix <- supervised_forest$proximity
proximity_imputed <- proximity_imputation(
x = x,
proximity = proximity_matrix
)
# supervised error ---
task <- supervised_forest$type
switch(task,
regression = {
s_error <- sqrt(supervised_forest$mse[ntree])
},
classification = {
s_error <- 1 - supervised_forest$err.rate[ntree, 1]
}
)
condition <- verbose
if(condition) {
cat("Supervised Accuracy: ", round(s_error, 4), "\n",sep = "")
}
# step ii - proximity imputation (iterate) ---
n_iterations <- 0
while(n_iterations < iterations) {
supervised_forest <- randomForest::randomForest(
x = proximity_imputed,
y = y,
mtry = mtry,
ntree = ntree,
proximity = T,
oob.prox = T
)
proximity_matrix <- supervised_forest$proximity
proximity_imputed <- proximity_imputation(
x = x,
proximity = proximity_matrix
)
# supervised error ---
task <- supervised_forest$type
switch(task,
regression = {
s_error <- sqrt(supervised_forest$mse[ntree])
},
classification = {
s_error <- 1 - supervised_forest$err.rate[ntree, 1]
}
)
condition <- verbose
if(condition) {
cat("Iterations: ", n_iterations + 1, " of ", iterations, "\n", sep = "")
cat("Supervised Accuracy: ", round(s_error, 4), "\n", sep = "")
}
condition <- all(imputed_x == proximity_imputed)
if(condition) {
break
}
imputed_x <- proximity_imputed
n_iterations <- n_iterations + 1
}
},
"missForest" = {
condition <- verbose
if(condition) {
cat("\nSupervised Imputation Forest\n")
cat("missForest | Proximity based Imputation Hybrid", "\n", sep = "")
}
# missForest imputation ---
condition <- verbose
if(condition) {
miss_forest <- missForest::missForest(x)
} else {
condition <- Sys.info()["sysname"] == "Windows"
if(condition){
sink("NUL")
miss_forest <- missForest::missForest(x)
sink()
} else {
sink("/dev/null")
miss_forest <- missForest::missForest(x)
sink()
}
}
imputed_x <- miss_forest$ximp
# proximity forest for external mapping ---
supervised_forest <- randomForest::randomForest(
x = imputed_x,
y = y,
ntree = ntree,
mtry = mtry,
proximity = T,
oob.prox = T,
...
)
# supervised error ---
task <- supervised_forest$type
switch(task,
regression = {
s_error <- sqrt(supervised_forest$mse[ntree])
},
classification = {
s_error <- 1 - supervised_forest$err.rate[ntree, 1]
}
)
condition <- verbose
if(condition) {
cat("Supervised Accuracy: ", round(s_error, 4), "\n", sep = "")
}
n_iterations <- iterations
},
{stop("[-] method must be 'proximity' or 'missForest'.")}
)
}
object <- list(
x = imputed_x,
na = is.na(x),
type = type,
method = method,
accuracy = switch(type, unsupervised = {NULL}, supervised = {s_error}),
iterations = n_iterations,
crossvalidation = crossvalidation,
mcar = mcar,
forest = if(!is.null(y)) {supervised_forest} else {unsupervised_forest}
)
class(object) <- "imputeForest"
# k-fold cross-validation ----
condition <- crossvalidation
if(condition) {
condition <- verbose
if(condition) {
cat("\nCross-validation (K-Fold), K = ", kfold, ", MCAR = ", mcar,"\n", sep = "")
}
kfold_matrix <- kfold_crossvalidation(n = n, k = kfold, seed = seed)
cv_imputed_x <- imputed_x
cv_missing_x <- imputed_x
cv_true_x <- imputed_x
oob_accuracy <- vector()
condition <- !is.null(y)
if(condition) {
y_cv <- y
}
for(kth in seq_len(kfold)) {
condition <- verbose
if(condition) {
cat("\nFold:", kth, "of", kfold)
}
# setup k-fold partition ---
fold_index <- kfold_matrix[, kth]
train_index <- which(!fold_index)
test_index <- which(fold_index)
# setup data ---
x_train <- imputed_x[train_index, ]
x_test <- imputed_x[test_index, ]
# induce missing ---
x_train_miss <- induce_missing(x = x_train, amount = mcar)
x_test_miss <- induce_missing(x = x_test, amount = mcar)
condition <- is.null(y)
if(condition) {
y_train <- NULL
} else {
y_train <- y[train_index]
}
# generate forest ---
kfold_forest <- imputeForest(
x = x_train_miss,
y = y_train,
mtry = mtry,
ntree = ntree,
method = method,
iterations = iterations,
verbose = verbose,
crossvalidation = F,
seed = seed
)
oob_accuracy <- c(oob_accuracy, kfold_forest$accuracy)
# predict (impute) missing values ---
x_test_imputed <- predict(object = kfold_forest, newdata = x_test_miss)
condition <- exists("y_cv")
if(condition) {
y_cv[test_index] <- predict(kfold_forest$forest, x_test_imputed)
}
# update
cv_imputed_x[test_index, ] <- x_test_imputed
cv_missing_x[test_index, ] <- x_test_miss
}
# cross_validation information ---
cv_object <- list(
ximp = cv_imputed_x,
xmiss = cv_missing_x,
xtrue = cv_true_x
)
condition <- exists("y_cv")
if(condition) {
condition <- is.numeric(y_cv)
if(condition) {
cv_accuracy <- sqrt(mean((y - y_cv) ^ 2))
} else {
cv_accuracy <- sum(diag(m_estimator(x = y, y = y_cv)))
}
}
error_table <- sapply(x_names, function(name) {
switch(x_class[name],
numeric = {
# na percent ---
percent_na <- (sum(is.na(x[[name]])) / length(x[[name]])) * 100
is_na <- is.na(cv_object$xmiss[[name]])
imputed <- cv_object$ximp[[name]][is_na]
known <- cv_object$xtrue[[name]][is_na]
# median absolute error ---
medae <- stats::median(abs(known - imputed))
# median symmetric accuracy ---
msa <- 100 * (exp(stats::median(abs(log(imputed / known)))) - 1)
# symmetric signed percent bias
MdLQ <- stats::median(log(imputed / known))
sspb <- 100 * sign(MdLQ) * (exp(abs(MdLQ)) - 1)
# variation due to imputation
esquares <- sum((known - imputed) ^ 2)
tsquares <- sum((mean(x[[name]], na.rm = T) - x[[name]]) ^ 2, na.rm = T)
imp_var <- (esquares / tsquares) * 100
# return ---
rout <- c(percent_na, msa, sspb, medae, imp_var, 0)
return(rout)
},
factor = {
# na percent ---
percent_na <- (sum(is.na(x[[name]])) / length(x[[name]])) * 100
is_na <- is.na(cv_object$xmiss[[name]])
imputed <- cv_object$ximp[[name]][is_na]
known <- cv_object$xtrue[[name]][is_na]
n <- length(known)
# probability of correct classification
pcc <- sum(known == imputed) / n
# adjusted probability of correct classification
px <- m_estimator(known)
py <- m_estimator(imputed)
pe <- sum(px * py)
apcc <- (pcc - pe) / (1 - pe)
# probability of random classification
purc <- 1 / nlevels(known)
# balanced probability
bpcc <- diag(m_estimator(known, imputed)) / m_estimator(known)
bpcc[is.infinite(bpcc) | is.na(bpcc)] <- 0
bpcc <- mean(bpcc)
# p-value ----
binomial_test <- stats::binom.test(
x = as.integer(pcc * n), n = n, p = pe, alternative = "greater"
)
p.value <- binomial_test$p.value
# normalized entropy (known)---
p <- m_estimator(x[[name]])
logp <- log(p, base = 2)
logp[is.infinite(logp)] <- 0
hx <- (-sum(logp * p) / log(nlevels(known), base = 2))
# normalized entropy (imputed)---
p <- m_estimator(imputed)
logp <- log(p, base = 2)
logp[is.infinite(logp)] <- 0
hy <- (-sum(logp * p) / log(nlevels(known), base = 2))
# return ---
rout <- c(percent_na, hy, purc, pcc, bpcc, p.value)
return(rout)
}
)
})
error_table <- round(t(error_table), digits = 4)
error_frame <- data.frame(
Feature = x_names,
Index = seq_len(m),
error_table,
row.names = NULL
)
num_frame <- error_frame[which(x_class == "numeric"), -ncol(error_frame)]
fct_frame <- error_frame[which(x_class == "factor"), ]
colnames(num_frame) <- c(
"Feature", "Index", "% NA", "MSA", "SSPB", "MedAE", "Variance"
)
colnames(fct_frame) <- c(
"Feature", "Index", "% NA","H(X')","PuRC", "PCC", "bPCC", "p-value"
)
error_list <- list(
"numeric" = num_frame,
"factor" = fct_frame
)
condition <- !is.null(y)
accuracy <- if(condition) {
round(
x = c("MCAR (OOB)" = mean(oob_accuracy), "MCAR (CV)" = cv_accuracy),
digits = 4
)
} else {
NULL
}
object$error <- error_list
object$accuracy <- c(OOB = unname(object$accuracy), accuracy)
}
# return ----
rout <- object
return(rout)
}
#'Predict method for imputeForest class
#'
#' @export
#'
#' @author David Navega
#'
#' @param object an imputeForest object
#' @param newdata data.frame where missing values should be predicted
#' @param ... ...
#'
#' @return a data.frame with imputed missing values
#'
predict.imputeForest <- function(object, newdata, ...) {
# has_missing function ----
has_missing <- function(x, index = F) {
condition <- index
if(condition) {
which(!stats::complete.cases(x))
} else {
!stats::complete.cases(x)
}
}
# proximity imputer function ----
proximity_imputer <- function(x, proximity) {
# initialize and assess class of each variable
x_names <- colnames(x)
x_class <- lapply(x, class)
x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)
x_dimensions <- dim(x)
n <- x_dimensions[1]
m <- x_dimensions[2]
# reoconstruct values using proximity
proximity_reconstructed <- named_apply(x_names, function(current) {
current_x <- x[[current]]
switch(x_class[current],
factor = {
x_levels <- levels(current_x)
p_vector <- split(proximity, current_x)
p_imputed <- which.max(sapply(p_vector, sum))
reconstructed <- factor(x_levels[p_imputed], levels = x_levels)
# return
rout <- reconstructed
return(rout)
},
numeric = {
not_na <- !is.na(current_x)
p_vector <- proximity[not_na]
reconstructed <- weighted_mean(
x = current_x[not_na],
weights = p_vector,
na.rm = TRUE
)
# return
rout <- reconstructed
return(rout)
}
)
})
# return ----
rout <- proximity_reconstructed
return(rout)
}
# proximity imputation function ----
proximity_imputation <- function(data) {
proximity_matrix <- switch(object$type,
unsupervised = {
predicted <- predict(object = object$forest, newdata = data)
proximity <- predicted$proximity
rownames(proximity) <- NULL
colnames(proximity) <- NULL
proximity
},
supervised = {
predicted <- predict(
object = object$forest,
newdata = dplyr::bind_rows(data, x),
proximity = T
)
proximity <- predicted$proximity
proximity <- proximity[seq_len(n_with_missing), -seq_len(n_with_missing)]
rownames(proximity) <- NULL
colnames(proximity) <- NULL
proximity
}
)
proximity_matrix <- rbind(proximity_matrix)
variables_imputed <- lapply(seq_len(n_with_missing), function(index) {
proximity_vector <- proximity_matrix[index, ]
as.data.frame(proximity_imputer(
x = x[, variable_missing[[index]], drop = F],
proximity = proximity_vector
))
})
proximity_imputed <- newdata
for(index in seq_len(n_with_missing)) {
missing <- variable_missing[[index]]
missing_index <- with_missing[index]
proximity_imputed[missing_index, missing] <- variables_imputed[[index]]
}
# return
rout <- proximity_imputed
return(rout)
}
# initialize parameters,data and assess class of each variable ----
x <- object$x
x_names <- colnames(x)
x_class <- lapply(x, class)
x_class <- named_apply(x_class, numeric_factor_class, simplify = T)
x_dimensions <- dim(x)
n_x <- x_dimensions[1]
m_x <- x_dimensions[2]
z <- dplyr::as_data_frame(rbind(newdata))
z_names <- colnames(z)
z_class <- lapply(z, class)
z_class <- named_apply(z_class, numeric_factor_class, simplify = T)
z_dimensions <- dim(x)
n_z <- z_dimensions[1]
m_z <- z_dimensions[2]
iterations <- object$iterations
condition <- any(x_class != z_class)
if(condition) {
stop("[-] One of more variables in newdata do not match variable class")
}
condition <- m_x != m_z
if(condition) {
stop("[-] Number of variables in newdata do not match trained forest")
}
with_missing <- has_missing(z, index = T)
is_missing <- dplyr::as_data_frame(is.na(z)[with_missing, ])
z <- z[with_missing, ]
n_with_missing <- length(with_missing)
variable_missing <- apply(is_missing, 1, function(x) z_names[which(x)])
# strawman imputation / roughfix ----
# compute strawman imputation values (initial guesses)
# mode (factor) and median (numeric)
strawman_value <- named_apply(x_names, function(name) {
switch(x_class[name],
factor = {
discrete_mode(x[[name]])
},
numeric = {
stats::median(x[[name]],na.rm = T)
}
)
})
strawman_imputed <- dplyr::as_data_frame(named_apply(z_names, function(name){
strawman_variable <- z[[name]]
strawman_variable[is_missing[[name]]] <- strawman_value[[name]]
return(strawman_variable)
}))
proximity_imputed <- proximity_imputation(strawman_imputed)
n_iterations <- 1
while(n_iterations <= iterations) {
proximity_imputed <- proximity_imputation(proximity_imputed[with_missing, ])
n_iterations <- n_iterations + 1
}
# return ----
rout <- proximity_imputed
return(rout)
}
#' Print method for imputeForest class
#'
#' @noRd
#'
#' @author David Navega
#'
#' @export
#'
#' @param x an imputeForest x
#' @param ... ...
#'
print.imputeForest <- function(x, ...) {
cat("\nimputeForest\n")
cat("Missing value imputation model using randomForest \n")
condition <- x$method == "missForest"
if(condition) {
cat("\nMethod:", x$method, "(proximity hybrid)")
} else {
cat("\nMethod:", x$method)
}
cat("\nIterations:", x$iterations)
cat("\nForest:", x$type)
condition <- x$type == "supervised"
if(condition) {
cat("\nTask:", x$forest$type)
cat("\n\nSupervised Accuracy\n\n")
print(x$accuracy)
}
condition <- x$crossvalidation
if(condition) {
cat("\n\nImputation Accuracy\nMCAR =", x$mcar,"\n\n")
cat("Numeric\n\n")
print(x$error$numeric)
cat("\nFactor\n\n")
print(x$error$factor)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.