#' Proximity Imputation
#'
#' @noRd
#'
#' @author David Navega
#'
#' @param x a data.frame containing missing values
#' @param proximity a n-by-n matrix with the proximity values as computed by a
#' randomForest model.
#'
proximity_imputation <- function(x, proximity) {
# initialize and assess class of each variable
x_names <- colnames(x)
x_class <- lapply(x, class)
x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)
x_dimensions <- dim(x)
n <- x_dimensions[1]
m <- x_dimensions[2]
# reconstruct values using proximity
proximity_reconstructed <- named_apply(x_names, function(current) {
current_x <- x[[current]]
switch(x_class[current],
factor = {
x_levels <- levels(current_x)
sapply(seq_len(n), function(i) {
p_vector <- split(proximity[-i, i], current_x[-i])
p_imputed <- which.max(sapply(p_vector, sum))
reconstructed <- factor(x_levels[p_imputed], levels = x_levels)
# return
rout <- reconstructed
return(rout)
})
},
numeric = {
sapply(seq_len(n), function(i) {
not_na <- !is.na(current_x[-i])
p_vector <- proximity[-i, i][not_na]
reconstructed <- weighted_mean(
x = current_x[-i][not_na],
weights = p_vector,
na.rm = TRUE
)
# return
rout <- reconstructed
return(rout)
})
}
)
})
proximity_imputed <- as.data.frame(proximity_reconstructed)
colnames(proximity_imputed) <- x_names
# substitute missing value
x_imputed <- named_apply(x_names, function(name) {
current_variable <- x[[name]]
is_missing <- is.na(current_variable)
current_variable[is_missing] <- proximity_imputed[[name]][is_missing]
# return
rout <- current_variable
return(rout)
})
x_imputed <- as.data.frame(x_imputed)
colnames(x_imputed) <- x_names
# return ----
rout <- x_imputed
return(rout)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.