Nothing
#' Impute in classes
#'
#' Apply an imputation function inside imputation classes
#'
#' @template impute
#'
#' @details Imputation classes (sometimes also called adjustment cells) are
#' build using cross-validation of all `cols_class`. The classes are
#' collapsed, if they do not satisfy all of the criteria defined by
#' `min_objs_in_class`, `min_obs_comp`, `min_obs_per_col` and `donor_limit`.
#' Collapsing starts from the last value of `cols_class`. Internally, a mixture
#' of collapsing and early stopping is used for the construction of the
#' classes.
#'
#' @param cols_class Columns that are used for constructing the imputation
#' classes.
#' @param FUN An imputation function that is applied to impute the missing
#' values.
#' @param breaks Number of intervals / levels a column is broken into (see
#' [cut()], which is used internally for cutting numeric columns). If `breaks
#' = Inf` (the default), every unique value of a column can be in a separate
#' class (if no other restrictions apply).
#' @param use_quantiles Should quantiles be used for cutting numeric vectors?
#' Normally, [cut()] divides the range of an vector into equal spaced
#' intervals. If `use_quantiles = TRUE`, the classes will be of roughly equal
#' content.
#' @param min_objs_in_class Minimum number of objects (rows) in an imputation class.
#' @param min_obs_comp Minimum number of completely observed objects (rows) in an
#' imputation class.
#' @param min_obs_per_col Minimum number of observed values in every column of
#' an imputation class.
#' @param donor_limit Minimum odds between incomplete and complete values in a
#' column, if `dl_type = cols_seq`. If `dl_type = sim_comp`, minimum odds
#' between incomplete and complete rows.
#' @param dl_type See `donor_limit`.
#' @param add_imputation_classes Should imputation classes be added as
#' attributes to the imputed dataset?
#' @param ... Arguments passed to `FUN`.
#'
#'
#' @export
#'
#' @references Andridge, R.R. and Little, R.J.A. (2010), A Review of Hot Deck
#' Imputation for Survey Non-response. International Statistical Review, 78:
#' 40-64. doi:10.1111/j.1751-5823.2010.00103.x
#'
#' @examples
#' # Mean imputation in classes
#' impute_in_classes(data.frame(X = 1:5, Y = c(NA, 12:15)), "X",
#' impute_mean,
#' min_obs_per_col = 2
#' )
#' @md
impute_in_classes <- function(ds, cols_class, FUN, breaks = Inf,
use_quantiles = FALSE,
min_objs_in_class = 1,
min_obs_comp = 0,
min_obs_per_col = 1,
donor_limit = Inf, dl_type = "cols_seq",
add_imputation_classes = FALSE,
...) {
# Check for missing argument cols_class
# because subsetting "works" with missing argument...
if (missing(cols_class)) {
stop("cols_class must be specified")
}
FUN <- match.fun(FUN)
imp_classes <- find_classes(
ds = ds, cols_class = cols_class, breaks = breaks,
use_quantiles = use_quantiles,
min_objs_in_class = min_objs_in_class,
min_obs_comp = min_obs_comp,
min_obs_per_col = min_obs_per_col,
donor_limit = donor_limit, dl_type = dl_type
)
# Apply imputation function for every imputation class separate
for (imp_cl in imp_classes) {
if (anyNA(ds[imp_cl, ])) {
ds[imp_cl, ] <- FUN(ds[imp_cl, ], ...)
}
}
if (add_imputation_classes) {
ds <- structure(ds, imputation_classes = imp_classes)
}
ds
}
#' Hot deck imputation in imputation classes
#'
#' Impute missing values in a data frame or a matrix using a hot deck within
#' imputation classes
#'
#' @template impute
#'
#' @details
#' This function is a combination of [impute_in_classes()] and [impute_sRHD()].
#' It applies [impute_sRHD()] inside of imputation classes (adjustment cells),
#' which are constructed via [impute_in_classes()]. More details can be found in
#' these two functions.
#'
#' @inheritParams impute_in_classes
#' @param donor_limit Minimum odds between incomplete and complete values in a
#' column, if `type = cols_seq`. If `type = sim_comp`, minimum odds between
#' incomplete and complete rows. For `type = sim_part` the donor limit option
#' is not implemented and `donor_limit` should be `Inf`.
#' @param type The type of hot deck (for details, see [impute_sRHD()]).
#'
#' @seealso
#' [impute_in_classes()], which is used for the construction of the imputation
#' classes.
#'
#' [impute_sRHD()], which is used for the imputation.
#'
#' @export
#'
#' @references
#' Andridge, R.R. and Little, R.J.A. (2010), A Review of Hot Deck Imputation for
#' Survey Non-response. International Statistical Review, 78: 40-64.
#' doi:10.1111/j.1751-5823.2010.00103.x
#'
#' @examples
#' impute_hot_deck_in_classes(data.frame(
#' X = c(rep("A", 10), rep("B", 10)),
#' Y = c(rep(NA, 5), 106:120)
#' ),
#' "X",
#' donor_limit = 1
#' )
#' @md
impute_hot_deck_in_classes <- function(ds, cols_class, type = "cols_seq",
breaks = Inf, use_quantiles = FALSE,
min_objs_in_class = 1,
min_obs_comp = 0,
min_obs_per_col = 1,
donor_limit = Inf,
add_imputation_classes = FALSE) {
impute_in_classes(ds, cols_class,
FUN = impute_sRHD,
breaks = breaks, use_quantiles = use_quantiles,
min_objs_in_class = min_objs_in_class,
min_obs_comp = min_obs_comp,
min_obs_per_col = min_obs_per_col,
donor_limit = donor_limit,
dl_type = type,
add_imputation_classes = add_imputation_classes,
type = type
)
}
## Helpers for impute_in_classes() --------------------------------------------
find_classes <- function(ds, cols_class, breaks = Inf, use_quantiles = FALSE,
min_objs_in_class = 0,
min_obs_comp = 0,
min_obs_per_col = 0,
donor_limit = Inf, dl_type = "cols_seq") {
# check for NA in cols_class
if (anyNA(ds[, cols_class])) {
stop("No NAs in ds[, cols_class] allowed")
}
find_classes_recursive(ds, cols_class,
breaks = breaks, use_quantiles = use_quantiles,
min_objs_in_class = min_objs_in_class,
min_obs_comp = min_obs_comp,
min_obs_per_col = min_obs_per_col,
donor_limit = donor_limit, dl_type = dl_type,
cols_act = seq_len(nrow(ds)),
lvls_act = NULL,
imp_classes = list(),
M = is.na(ds)
)
}
## No defaults for the recursive function
## all arguments should be handed over by the calling function
## This will automatically throw an error, if a new argument is added to the
## function, but not in the calling statement(s)
find_classes_recursive <- function(ds, cols_class, breaks, use_quantiles,
min_objs_in_class,
min_obs_comp,
min_obs_per_col,
donor_limit, dl_type,
cols_act, lvls_act = NULL, imp_classes,
M) {
# first check for fast return (no columns in cols_act or no more cols_class)
if (length(cols_act) == 0L) { # no object in new class -> eliminate class
return(imp_classes)
} else if (length(cols_class) == 0L) { # no more columns to form classes
if (is.null(lvls_act)) { # just one class for all
lvls_act <- "everything"
}
imp_classes[[lvls_act]] <- cols_act
return(imp_classes)
}
# no fast return:
# we have objects and at least one column to form classes
# we select only the objects from the cols_act
grouping_factor <- cut_vector(ds[cols_act, cols_class[1], drop = TRUE],
breaks = breaks,
use_quantiles = use_quantiles
)
# check if all new formed classes are okay
# if not -> join the problematic class(es)
repeat {
lvls <- levels(grouping_factor)
new_classes <- list()
new_lvls <- list()
for (i in seq_along(lvls)) {
new_classes[[i]] <- cols_act[grouping_factor == lvls[i]]
new_lvls[[i]] <- ifelse(is.null(lvls_act), as.character(lvls[i]),
paste(lvls_act, lvls[i], sep = ".")
)
}
# remove empty classes
empty_classes <- 0L == vapply(new_classes, length, integer(1))
new_classes[empty_classes] <- NULL
new_lvls[empty_classes] <- NULL
okay_classes <- are_classes_okay(ds, new_classes,
min_objs_in_class, min_obs_comp,
min_obs_per_col = min_obs_per_col,
donor_limit = donor_limit, dl_type = dl_type,
M = M
)
if (all(okay_classes)) { # everything okay -> leave repeat loop
break
} else { # join first not okay_class and try again
levels(grouping_factor) <- merge_lvls(
grouping_factor,
lvls[which(!okay_classes)[1]]
)
}
}
# call find_classes_recursive() for all new formed classes
for (i in seq_along(new_classes)) {
imp_classes <- find_classes_recursive(ds, cols_class[-1],
use_quantiles = use_quantiles,
breaks = breaks,
min_objs_in_class = min_objs_in_class,
min_obs_comp = min_obs_comp,
min_obs_per_col = min_obs_per_col,
donor_limit = donor_limit, dl_type = dl_type,
cols_act = new_classes[[i]],
lvls_act = new_lvls[[i]],
imp_classes = imp_classes,
M = M
)
}
imp_classes
}
cut_vector <- function(x, breaks, use_quantiles = FALSE) {
if (!is.finite(breaks)) { # breaks is infinite, no merging of lvls or cutting
return(as.factor(x))
}
if (is.numeric(x)) { # cuts are ordered for possible later merging
if (use_quantiles) {
x <- cut(x,
breaks = stats::quantile(
x,
seq(from = 0, to = 1, length.out = breaks + 1)
),
include.lowest = TRUE, ordered_result = TRUE
)
} else { # equal-sized classes
x <- cut(x, breaks, ordered_result = TRUE)
}
} else { # not a numeric vector
x <- as.factor(x)
while (length(levels(x)) > breaks) {
levels(x) <- merge_lvls(x)
}
}
x
}
are_classes_okay <- function(ds, new_classes,
min_objs_in_class = 1,
min_obs_comp = 0,
min_obs_per_col = 0,
donor_limit = Inf,
dl_type = "cols_seq",
M = is.na(ds)) {
res <- rep(TRUE, length(new_classes))
for (i in seq_along(new_classes)) {
M_class_i <- M[new_classes[[i]], , drop = FALSE]
## check min_objs_in_class, if > 1 --------------------
if (min_objs_in_class > 1) {
if (length(new_classes[[i]]) < min_objs_in_class) {
res[i] <- FALSE
}
}
## check min_obs_comp, if > 0 -------------------------
if (min_obs_comp > 0) {
n_incomp_obs_i <- sum(apply(M_class_i, 1, any))
n_comp_obs_i <- length(new_classes[[i]]) - n_incomp_obs_i
if (n_comp_obs_i < min_obs_comp) {
res[i] <- FALSE
}
}
## check min_obs_per_col, if > 0 -------------------------
if (min_obs_per_col > 0) {
obs_per_col <- apply(M_class_i, 2, function(x) sum(!x))
if (any(obs_per_col < min_obs_per_col)) {
res[i] <- FALSE
}
}
## check donor_limit, if donor_limit is finite ---------
if (is.finite(donor_limit)) {
if (min_donor_limit(M_class_i, dl_type) > donor_limit) {
res[i] <- FALSE
}
}
}
res
}
merge_lvls <- function(grouping_factor, merging_lvl_1 = NULL) {
lvls <- levels(grouping_factor)
if (length(lvls) < 2L) {
stop("merging only possible for two or more levels")
}
lvls_freq <- tabulate(match(grouping_factor, lvls))
if (is.null(merging_lvl_1)) {
merging_lvl_1 <- lvls[which.min(lvls_freq)]
}
if (is.ordered(grouping_factor)) { # join left or right, if possible
pos_lvl_1 <- which(merging_lvl_1 == lvls)
if (pos_lvl_1 == 1L) { # first level -> merge with second
merging_lvl_2 <- lvls[2L]
} else if (pos_lvl_1 == length(lvls)) { # last level -> merge with second to last
merging_lvl_2 <- lvls[length(lvls) - 1L]
} else { # neither first nor last level -> left or right
lvls_freq_left <- lvls_freq[pos_lvl_1 - 1L]
lvls_freq_right <- lvls_freq[pos_lvl_1 + 1L]
merging_lvl_2 <- pos_lvl_1 + ifelse(lvls_freq_left < lvls_freq_right, -1L, +1L)
merging_lvl_2 <- lvls[merging_lvl_2]
}
} else { # unordered factor
lvls_freq_without_lvl_1 <- lvls_freq[-which(merging_lvl_1 == lvls)]
lvls_without_lvl_1 <- lvls[-which(merging_lvl_1 == lvls)]
merging_lvl_2 <- lvls_without_lvl_1[which.min(lvls_freq_without_lvl_1)]
}
new_lvl <- paste(merging_lvl_1, merging_lvl_2, sep = "_and_")
lvls[lvls %in% c(merging_lvl_1, merging_lvl_2)] <- new_lvl
lvls
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.