Nothing
#' Simple random hot deck imputation
#'
#' Impute missing values in a data frame or a matrix using a simple random hot
#' deck
#'
#' @template impute
#'
#' @details
#' There are three types of simple random hot decks implemented. They can be
#' selected via \code{type}:
#' \itemize{
#' \item{"cols_seq" (the default): Each variable (column) is handled
#' separately. If an object (row) has a missing value in a variable (column),
#' then one of the observed values in the same variable is chosen randomly and
#' the missing value is replaced with this chosen value. This is done for all
#' missing values.}
#' \item{"sim_comp": All missing variables (columns) of an object are imputed together
#' ("simultaneous"). For every object with missing values (such an object is
#' called a recipient in hot deck terms), one complete object is chosen randomly
#' and all missing values of the recipient are imputed with the values from the
#' complete object. A complete object used for imputation is called a donor.}
#' \item{"sim_part": All missing variables (columns) of an object are imputed
#' together ("simultaneous"). For every object with missing values (recipient)
#' one donor is chosen. The donor must have observed values in all the variables
#' that are missing in the recipient. The donor is allowed to have unobserved
#' values in the non-missing parts of the recipient. So, in contrast to
#' "sim_comp", the donor can be partly incomplete.} }
#'
#' The parameter \code{donor_limit} controls how often an object can be a donor.
#' This parameter is only implemented for types "cols_seq" and "sim_comp". If
#' \code{type = "sim_part"} and \code{donor_limit} is not \code{Inf}, then an
#' error will be thrown. For "sim_comp" the default value (\code{Inf}) allows
#' every object to be a donor for an infinite number of times (there is no
#' restriction on the times an object can be a donor). If a numeric value less
#' than \code{Inf} is chosen, then every object can be a donor at most
#' \code{donor_limit} times. For example \code{donor_limit = 1} ensures that
#' every object donates at most one time. If there are only few complete objects
#' and \code{donor_limit} is set too low, then an imputation might not be
#' possible with the chosen \code{donor_limit}. In this case, the
#' \code{donor_limit} will be adjusted (see examples). Setting \code{donor_limit
#' = "min"} chooses automatically the minimum value for \code{donor_limit} that
#' allows imputation of all missing values. For \code{type = "cols_seq"} the
#' donor limit is applied for every column separately.
#'
#'
#' @param type The type of hot deck; the default ("cols_seq") is a random hot
#' deck that imputes each column separately. Other choices are "sim_comp" and
#' "sim_part". Both impute all missing values in an object (row)
#' simultaneously using a single donor object. The difference between the two
#' types is the choice of objects that can act as donors. "sim_comp:" only
#' completely observed objects can be donors. "sim_part": all objects that
#' have no missing values in the missing parts of a recipient can be donors.
#' @param donor_limit Numeric of length one or "min"; how many times an object
#' can be a donor. Default is \code{Inf} (no restriction).
#'
#' @references Andridge, R. R., & Little, R. J. (2010). A review of hot deck
#' imputation for survey non-response. \emph{International statistical review},
#' 78(1), 40-64.
#'
#' @export
#'
#' @examples
#' ds <- data.frame(X = 1:20, Y = 101:120)
#' ds_mis <- delete_MCAR(ds, 0.2)
#' ds_imp <- impute_sRHD(ds_mis)
#' \donttest{
#' # Warning: donor limit to low
#' ds_mis_one_donor <- ds
#' ds_mis_one_donor[1:19, "X"] <- NA
#' impute_sRHD(ds_mis_one_donor, donor_limit = 3)
#' }
impute_sRHD <- function(ds, type = "cols_seq", donor_limit = Inf) {
type <- match.arg(type, c("cols_seq", "sim_comp", "sim_part"))
# missing data indicator matrix M and short M check -----
M <- is.na(ds) # missing data indicator matrix
completely_missing_col <- which(apply(M, 2, all))
if (length(completely_missing_col) != 0) {
stop(
"imputation not possible: column(s) ",
paste(completely_missing_col, collapse = ", "),
" are completely missing (all entries are NA)"
)
}
# check donor_limit -------------------------------------
# and convert donor_limit = "min" to appropriate limit
if (type == "sim_part") {
if (length(donor_limit) != 1 | !is.infinite(donor_limit)) {
stop("donor_limit is not implemented for type = sim_part")
}
} else { # types: cols_seq or sim_comp
if (is.numeric(donor_limit)) {
if (length(donor_limit) != 1L) {
stop("donor_limit must be of length 1")
} else if (donor_limit < 1) {
stop("donor_limit must be a number >= 1 or the string 'min'")
} else { # donor_limit high enough?
theo_min_donor_lim <- min_donor_limit(M, type)
if (donor_limit < theo_min_donor_lim) {
warning(
"donor_limit = ", donor_limit,
" is to low to impute all missing values; ",
"it was set to ", theo_min_donor_lim
)
donor_limit <- theo_min_donor_lim
}
}
} else if (is.character(donor_limit) & donor_limit == "min") {
donor_limit <- min_donor_limit(M, type) # convert "min"
message(paste("donor_limit is set to:", donor_limit))
} else {
stop("donor_limit must be a number >= 1 or the string 'min'")
}
}
# imputation --------------------------------------------
if (type == "cols_seq") { # cols_seq --------------------
ds <- impute_sRHD_cols_seq(ds, M, donor_limit)
} else if (type == "sim_comp") { # sim_comp -------------
ds <- impute_sRHD_sim_comp(ds, M, donor_limit)
} else if (type == "sim_part") { # sim_part ---------------
ds <- impute_sRHD_sim_part(ds, M)
}
ds
}
impute_sRHD_cols_seq <- function(ds, M = is.na(ds), donor_limit) {
if (is.infinite(donor_limit)) { # Inf donor_limit -> easy/faster implementation
for (k in seq_len(ncol(ds))) {
ds[M[, k], k] <- resample(ds[!M[, k], k, drop = TRUE],
sum(M[, k]),
replace = TRUE
)
}
} else { # finite donor_limit ----------------------------
for (k in seq_len(ncol(ds))) {
recipients <- M[, k]
pot_donors <- which(!recipients)
recipients <- which(recipients)
donor_limit_obj <- rep(donor_limit, nrow(ds)) # donor_limit for every col separate
for (i in seq_along(recipients)) {
pot_donors <- pot_donors[donor_limit_obj[pot_donors] > 0]
donor <- resample(pot_donors, 1)
ds[recipients[i], k] <- ds[donor, k]
donor_limit_obj[donor] <- donor_limit_obj[donor] - 1
}
}
}
ds
}
impute_sRHD_sim_comp <- function(ds, M = is.na(ds), donor_limit) {
recipients <- apply(M, 1, any)
pot_donors <- which(!recipients)
recipients <- which(recipients)
# if (length(pot_donors) == 0) { # error is thrown in fun min_donor_limit
# stop("imputation not possible: there is no completely observed object")
# }
if (is.infinite(donor_limit) || isTRUE(all.equal(donor_limit, 1))) {
# important special cases: no donor limit and sampling without replacement
replace_donors <- ifelse(is.infinite(donor_limit), TRUE, FALSE)
donors_match <- resample(pot_donors, length(recipients), replace = replace_donors)
for (i in seq_along(recipients)) {
ds[recipients[i], M[recipients[i], ]] <- ds[donors_match[i], M[recipients[i], ]]
}
# vectorized but ugly and does not work for tibbles!
# ds[recipients, ][M[recipients, ]] <- ds[donors_match, ][M[recipients, ]]
} else {
donor_limit_obj <- rep(donor_limit, nrow(ds))
for (i in seq_along(recipients)) {
pot_donors <- pot_donors[donor_limit_obj[pot_donors] > 0]
donor <- resample(pot_donors, 1)
ds[recipients[i], M[recipients[i], ]] <- ds[donor, M[recipients[i], ]]
donor_limit_obj[donor] <- donor_limit_obj[donor] - 1
}
}
ds
}
impute_sRHD_sim_part <- function(ds, M = is.na(ds)) {
md_patterns <- find_md_patterns(M)
pattern_matrix <- md_patterns$pattern_matrix
pattern_obj <- md_patterns$pattern_obj
rm(md_patterns)
for (pat_ind in seq_len(nrow(pattern_matrix))) {
if (any(pattern_matrix[pat_ind, ])) { # if not -> completely observed
pot_donor_pattern_nrs <- find_pot_donor_pattern_nrs(
pattern_matrix,
pattern_matrix[pat_ind, ]
)
if (length(pot_donor_pattern_nrs) == 0L) {
stop(
"there is no appropriate donor for the object(s) ",
paste(pattern_obj[pat_ind], collapse = ", ")
)
}
pot_donors <- unlist(pattern_obj[pot_donor_pattern_nrs])
recipients <- pattern_obj[[pat_ind]]
donors_match <- resample(pot_donors, length(recipients), replace = TRUE)
for (i in seq_along(recipients)) {
ds[recipients[i], M[recipients[i], ]] <- ds[donors_match[i], M[recipients[i], ]]
}
}
}
ds
}
# helper: min_donor_limit ---------------------------------
min_donor_limit <- function(M, type) {
if (type == "sim_comp") {
nr_incomp_obs <- sum(apply(M, 1, any))
nr_comp_obs <- nrow(M) - nr_incomp_obs
if (nr_comp_obs > 0) {
min_limit <- ceiling(nr_incomp_obs / nr_comp_obs)
} else {
stop(
"calculation of min donor_limit and Imputation with type = ",
type,
" not possible: there is no completely observed object"
)
}
} else if (type == "cols_seq") {
max_nr_incomp_in_one_col <- max(apply(M, 2, sum))
min_nr_comp_in_one_col <- nrow(M) - max_nr_incomp_in_one_col
if (min_nr_comp_in_one_col > 0) {
min_limit <- ceiling(max_nr_incomp_in_one_col / min_nr_comp_in_one_col)
} else {
stop(
"calculation of min donor_limit and Imputation with type = ",
type,
" not possible: there are completely unobserved variables"
)
}
} else {
stop("type = ", type, " not implemented")
}
min_limit
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.