R/proximity_imputation.R

#' Proximity Imputation
#'
#' @noRd
#'
#' @author David Navega
#'
#' @param x a data.frame containing missing values
#' @param proximity a n-by-n matrix with the proximity values as computed by a
#' randomForest model.
#'
proximity_imputation <- function(x, proximity) {

  # initialize and assess class of each variable
  x_names <- colnames(x)
  x_class <- lapply(x, class)
  x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)

  x_dimensions <- dim(x)
  n <- x_dimensions[1]
  m <- x_dimensions[2]

  # reconstruct values using proximity
  proximity_reconstructed <- named_apply(x_names, function(current) {

    current_x <- x[[current]]

    switch(x_class[current],

      factor = {

        x_levels <- levels(current_x)
        sapply(seq_len(n), function(i) {

          p_vector <- split(proximity[-i, i], current_x[-i])
          p_imputed <- which.max(sapply(p_vector, sum))
          reconstructed <- factor(x_levels[p_imputed], levels = x_levels)

          # return
          rout <- reconstructed
          return(rout)

        })

      },

      numeric = {

        sapply(seq_len(n), function(i) {

          not_na <- !is.na(current_x[-i])
          p_vector <- proximity[-i, i][not_na]
          reconstructed <- weighted_mean(
            x = current_x[-i][not_na],
            weights = p_vector,
            na.rm = TRUE
          )

          # return
          rout <- reconstructed
          return(rout)

        })
      }

    )


  })

  proximity_imputed <- as.data.frame(proximity_reconstructed)
  colnames(proximity_imputed) <- x_names

  # substitute missing value
  x_imputed <- named_apply(x_names, function(name) {

    current_variable <- x[[name]]
    is_missing <- is.na(current_variable)
    current_variable[is_missing] <- proximity_imputed[[name]][is_missing]

    # return
    rout <- current_variable
    return(rout)

  })

  x_imputed <- as.data.frame(x_imputed)
  colnames(x_imputed) <- x_names

  # return ----
  rout <- x_imputed
  return(rout)

}
dsnavega/imputeForest documentation built on May 8, 2019, 2:43 p.m.