R/model_donor.R

Defines functions predict_donor model_donor

Documented in model_donor predict_donor

#' Model for donor-based imputation
#'
#' This function is intended to be used inside of [impute_unsupervised()] as
#' `model_fun`.
#'
#' @inheritParams impute_supervised
#' @param i Index for row of `ds` or `NULL`
#' @param model_arg A list with two named elements (missing elements will be
#'   replaced by default values):
#'   * `selection` How to select the donors? Possible choices are:
#'     `complete_rows` (default), `partly_complete_rows`, `knn_complete_rows`,
#'     `knn_partly_complete_rows`
#' * `k` number of selected closest donor (default: 10), only used for knn
#'   `selection`s
#'
#' @return A "model" for [predict_donor()] which is merely a data frame.
#' @export
#' @seealso [predict_donor()]
#' @importFrom gower gower_topn
#' @importFrom stats complete.cases
#' @examples
#' set.seed(123)
#' ds_mis <- data.frame(X = rnorm(10), Y = rnorm(10))
#' ds_mis[2:4, 1] <- NA
#' ds_mis[4:6, 2] <- NA
#' # default returns only complete rows
#' model_donor(ds_mis)
#' # with partly_complete and knn returned objects depends on i
#' model_donor(ds_mis,
#'   i = 2,
#'   model_arg = list(selection = "partly_complete_rows")
#' )
#' model_donor(ds_mis,
#'   i = 4,
#'   model_arg = list(selection = "partly_complete_rows")
#' )
#' model_donor(ds_mis,
#'   i = 5,
#'   model_arg = list(selection = "partly_complete_rows")
#' )
#' model_donor(ds_mis,
#'   i = 5,
#'   model_arg = list(selection = "knn_partly_complete_rows", k = 2)
#' )
model_donor <- function(ds, M = is.na(ds), i = NULL, model_arg = NULL) {
  stopifnot(
    "model_arg must be a list or NULL" =
      is.list(model_arg) || is.null(model_arg)
  )
  model_arg <- set_defaults_for_missing(
    model_arg, list(selection = "complete_rows", k = 10)
  )
  if (is.null(i) && model_arg$selection != "complete_rows") {
    stop("only donor selection \"complete_rows\"' is possible for this case")
  }
  if (model_arg$selection %in% c("complete_rows", "knn_complete_rows")) {
    suitable_rows <- complete.cases(ds)
  } else if (model_arg$selection %in%
    c("partly_complete_rows", "knn_partly_complete_rows")) {
    suitable_rows <- apply(M, 1, function(x) !any(M[i, ] & x))
  } else {
    stop(paste0(
      "'", model_arg$selection, "' is not a valid option for donor selection"
    ))
  }
  if (model_arg$selection %in%
    c("knn_complete_rows", "knn_partly_complete_rows")) {
    suitable_rows_ind <- which(suitable_rows)
    best_k <- gower::gower_topn(ds[i, ], ds[suitable_rows, ], n = model_arg$k)
    best_k <- best_k$index[, 1]
    suitable_rows <- suitable_rows_ind[best_k]
  }
  return(structure(ds[suitable_rows, ], donor_selection = model_arg$selection))
}

#' Prediction for donor-based imputation
#'
#' This function is intended to be used inside of [impute_unsupervised()] as
#' `predict_fun`.
#'
#' @inheritParams model_donor
#' @param ds_donors Data set with donors, normally generated by [model_donor()]
#' @param i Index of row of `ds` which should be imputed
#' @param donor_aggregation Type of donor aggregation. Can be one of
#'   'choose_random' and 'average'.
#'
#' @return The imputation values for row `i`.
#' @seealso [model_donor()]
#' @export
#' @examples
#' set.seed(123)
#' ds_mis <- data.frame(X = rnorm(10), Y = rnorm(10))
#' ds_mis[2:4, 1] <- NA
#' ds_mis[4:6, 2] <- NA
#' # default for ds_donors and predict_donors
#' ds_donors <- model_donor(ds_mis)
#' predict_donor(ds_donors, ds_mis, i = 2)
#' predict_donor(ds_donors, ds_mis, i = 4)
#' # with partly_complete, knn and average of neighbors
#' ds_donors <- model_donor(
#'   ds_mis,
#'   i = 5, model_arg = list(selection = "knn_partly_complete_rows", k = 2)
#' )
#' ds_donors
#' predict_donor(ds_donors, ds_mis, i = 5, donor_aggregation = "average")
predict_donor <- function(ds_donors, ds, M = is.na(ds), i,
                          donor_aggregation = "choose_random") {
  if (donor_aggregation == "choose_random") {
    return(ds_donors[sample.int(nrow(ds_donors), 1), M[i, ]])
  } else if (donor_aggregation == "average") {
    return(colMeans(ds_donors[, M[i, ], drop = FALSE]))
  }
}

Try the imputeGeneric package in your browser

Any scripts or data that you put into this service are public.

imputeGeneric documentation built on March 18, 2022, 6:35 p.m.