R/imputeForestS3.R

#' imputeForest
#'
#' Missing Value Imputation through randomForest Proximity Matrix
#'
#' @export
#'
#' @importFrom stats predict
#'
#' @author David Navega
#'
#' @param x a data.frame.
#' @param y a numeric or factor vector.
#' @param method type of procedure for imputation. Default is "proximity" for
#' proximity based imputation. "missForest" triggers a hybrid technique using
#' missForest and proximity based imputation.
#' @param initial a character defining how initial missing values are imputed.
#' Default is "mode" for mode imputation. "random" for random imputation from
#' marginal distribution of each feature.
#' @param mtry number of varibles to try at each split.
#' @param ntree number of trees.
#' @param iterations number of iterations used to update imputed values.
#' @param mcar amount of missing values to induced completely at random. Used
#' compute cross-validation imputation error.
#' @param crossvalidation a logical indicating whether cross-validation should
#' be made or not. Default is TRUE.
#' @param kfold number of fold in cross-validation. Default is 5.
#' @param verbose a logical indicating if output should be more detailed.
#' @param seed a positive integer defining the seed for the random number generator.
#' @param ... additional arguments for randomForest
#'
#' @return an object of class imputeForest
#'
#' @seealso \link{randomForest}
#'
#' @examples
#' x <- iris[, -5]
#' y <- iris$Species
#' x_na <- induce_missing(x)
#' print(imputeForest(x = x_na, y = y))
#
imputeForest <- function(x, y = NULL,
                         method = "missForest",
                         initial = "mode",
                         mtry = ceiling(sqrt(ncol(x))),
                         ntree = 300,
                         iterations = 5,
                         mcar = 0.10,
                         crossvalidation = T,
                         kfold = 5,
                         verbose = F,
                         seed = 1984,
                         ...
) {

  # RNG
  set.seed(seed)

  # initialize and assess class of each variable
  x_names <- colnames(x)

  # assess if x is data.frame tbl_df are coerced
  condition <- "data.frame" %in% class(x)
  if(condition) {
    x <- as.data.frame(x)
  } else {
    stop("[-] x must be a 'data.frame' object")
  }

  x_class <- lapply(x, class)
  x_class <- named_apply(x_class, numeric_factor_class, simplify = T)

  x_dimensions <- dim(x)
  n <- x_dimensions[1]
  m <- x_dimensions[2]

  condition <- is.null(y)
  if(condition) {

    # unsupervised imputation ----
    type <- "unsupervised"

    switch(method,
           "proximity" = {

             # step i - mode imputation ----
             condition <- verbose
             if(condition) {
               cat("\nUnsupervised Imputation Forest\n")
               cat("\nProximity based Imputation\n")
               cat("Step I - Mode Imputation", "\n",sep = "")
             }

             # mode imputation ---
             imputed_x <- switch (initial,
                                  mode = {
                                    mode_imputation(x)
                                  },
                                  random = {
                                    random_imputation(x)
                                  },
                                  {
                                    stop("[-] initial should be 'mode' or 'random'.")
                                  }
             )


             unsupervised_forest <- unsupervisedForest(
               x = imputed_x,
               ntree = ntree,
               mtry = mtry,
               ...
             )

             # proximity imputation (1) ---
             proximity_matrix <- unsupervised_forest$proximity

             proximity_imputed <- proximity_imputation(
               x = x,
               proximity = proximity_matrix
             )

             # step ii - proximity imputation (iterate) ----
             n_iterations <- 0
             while(n_iterations < iterations) {

               unsupervised_forest <- unsupervisedForest(
                 x = proximity_imputed,
                 ntree = ntree,
                 mtry = mtry,
                 ...
               )

               proximity_matrix <- unsupervised_forest$proximity

               proximity_imputed <- proximity_imputation(
                 x = x,
                 proximity = proximity_matrix
               )

               condition <- verbose
               if(condition) {
                 cat("Step II - Iterations:", n_iterations + 1, "of", iterations, "\n")
               }

               condition <- all(imputed_x == proximity_imputed)
               if(condition) {
                 break
               }

               imputed_x <- proximity_imputed
               n_iterations <- n_iterations + 1

             }

           },

           "missForest" = {

             condition <- verbose
             if(condition) {
               cat("\nUnsupervised Imputation Forest\n")
               cat("missForest | Proximity based Imputation Hybrid", "\n", sep = "")
             }

             # missForest imputation ---
             condition <- verbose
             if(condition) {
               miss_forest <- missForest::missForest(x)
             } else {

               condition <- Sys.info()["sysname"] == "Windows"
               if(condition){


                 sink("NUL")
                 miss_forest <- missForest::missForest(x)
                 sink()

               } else {

                 sink("/dev/null")
                 sink()
                 miss_forest <- missForest::missForest(x)
                 sink()

               }

             }

             imputed_x <- miss_forest$ximp

             # proximity forest for external mapping ---
             unsupervised_forest <- unsupervisedForest(
               x = imputed_x,
               ntree = ntree,
               mtry = mtry,
               ...
             )

             n_iterations <- iterations

           },

           {stop("[-] method must be 'proximity' or 'missForest'.")}
    )

  } else {

    type  <- "supervised"
    # supervised imputation ----

    switch(method,
           "proximity" = {
             # step i - mode imputation ----
             condition <- verbose
             if(condition) {
               cat("\nSupervised Imputation Forest\n")
               cat("\nProximity based Imputation\n")
               cat("Step I - Mode Imputation", "\n", sep = "")
             }

             # mode imputation ---
             imputed_x <- switch (initial,
                                  mode = {
                                    mode_imputation(x)
                                  },
                                  random = {
                                    random_imputation(x)
                                  },
                                  {
                                    stop("[-] initial should be 'mode' or 'random'.")
                                  }
             )

             supervised_forest <- randomForest::randomForest(
               x = imputed_x,
               y = y,
               ntree = ntree,
               mtry = mtry,
               proximity = T,
               oob.prox = T,
               ...
             )

             proximity_matrix <- supervised_forest$proximity

             proximity_imputed <- proximity_imputation(
               x = x,
               proximity = proximity_matrix
             )

             # supervised error ---
             task <- supervised_forest$type
             switch(task,
                    regression = {
                      s_error <- sqrt(supervised_forest$mse[ntree])
                    },
                    classification = {
                      s_error <- 1 - supervised_forest$err.rate[ntree, 1]
                    }
             )

             condition <- verbose
             if(condition) {
               cat("Supervised Accuracy: ", round(s_error, 4), "\n",sep = "")
             }

             # step ii - proximity imputation (iterate) ---
             n_iterations <- 0
             while(n_iterations < iterations) {

               supervised_forest <- randomForest::randomForest(
                 x =  proximity_imputed,
                 y = y,
                 mtry = mtry,
                 ntree = ntree,
                 proximity = T,
                 oob.prox = T
               )

               proximity_matrix <- supervised_forest$proximity

               proximity_imputed <- proximity_imputation(
                 x = x,
                 proximity = proximity_matrix
               )

               # supervised error ---
               task <- supervised_forest$type
               switch(task,

                      regression = {
                        s_error <- sqrt(supervised_forest$mse[ntree])
                      },

                      classification = {
                        s_error <- 1 - supervised_forest$err.rate[ntree, 1]
                      }
               )

               condition <- verbose
               if(condition) {
                 cat("Iterations: ", n_iterations + 1, " of ", iterations, "\n", sep = "")
                 cat("Supervised Accuracy: ", round(s_error, 4), "\n", sep = "")
               }

               condition <- all(imputed_x == proximity_imputed)
               if(condition) {
                 break
               }

               imputed_x <- proximity_imputed
               n_iterations <- n_iterations + 1

             }

           },

           "missForest" = {

             condition <- verbose
             if(condition) {
               cat("\nSupervised Imputation Forest\n")
               cat("missForest | Proximity based Imputation Hybrid", "\n", sep = "")
             }

             # missForest imputation ---
             condition <- verbose
             if(condition) {
               miss_forest <- missForest::missForest(x)
             } else {

               condition <- Sys.info()["sysname"] == "Windows"
               if(condition){
                 sink("NUL")
                 miss_forest <- missForest::missForest(x)
                 sink()
               } else {
                 sink("/dev/null")
                 miss_forest <- missForest::missForest(x)
                 sink()
               }

             }

             imputed_x <- miss_forest$ximp

             # proximity forest for external mapping ---
             supervised_forest <- randomForest::randomForest(
               x = imputed_x,
               y = y,
               ntree = ntree,
               mtry = mtry,
               proximity = T,
               oob.prox = T,
               ...
             )

             # supervised error ---
             task <- supervised_forest$type
             switch(task,

                    regression = {
                      s_error <- sqrt(supervised_forest$mse[ntree])
                    },

                    classification = {
                      s_error <- 1 - supervised_forest$err.rate[ntree, 1]
                    }
             )

             condition <- verbose
             if(condition) {
               cat("Supervised Accuracy: ", round(s_error, 4), "\n", sep = "")
             }

             n_iterations <- iterations

           },

           {stop("[-] method must be 'proximity' or 'missForest'.")}

    )

  }

  object <- list(
    x = imputed_x,
    na = is.na(x),
    type = type,
    method = method,
    accuracy = switch(type, unsupervised = {NULL}, supervised = {s_error}),
    iterations = n_iterations,
    crossvalidation = crossvalidation,
    mcar = mcar,
    forest = if(!is.null(y)) {supervised_forest} else {unsupervised_forest}
  )
  class(object) <- "imputeForest"

  # k-fold cross-validation ----
  condition <- crossvalidation
  if(condition) {

    condition <- verbose
    if(condition) {
      cat("\nCross-validation (K-Fold), K = ", kfold, ", MCAR = ", mcar,"\n", sep = "")
    }

    kfold_matrix <- kfold_crossvalidation(n = n, k = kfold, seed = seed)

    cv_imputed_x <- imputed_x
    cv_missing_x <- imputed_x
    cv_true_x <- imputed_x
    oob_accuracy <- vector()

    condition <- !is.null(y)
    if(condition) {
      y_cv <- y
    }

    for(kth in seq_len(kfold)) {

      condition <- verbose
      if(condition) {
        cat("\nFold:", kth, "of", kfold)
      }

      # setup k-fold partition ---
      fold_index <- kfold_matrix[, kth]
      train_index <- which(!fold_index)
      test_index <- which(fold_index)

      # setup data ---
      x_train <- imputed_x[train_index, ]
      x_test <- imputed_x[test_index, ]

      # induce missing ---
      x_train_miss <- induce_missing(x = x_train, amount = mcar)
      x_test_miss <- induce_missing(x = x_test, amount = mcar)

      condition <- is.null(y)
      if(condition) {
        y_train <- NULL
      } else {
        y_train <- y[train_index]
      }

      # generate forest ---
      kfold_forest <- imputeForest(
        x = x_train_miss,
        y = y_train,
        mtry = mtry,
        ntree = ntree,
        method = method,
        iterations = iterations,
        verbose = verbose,
        crossvalidation = F,
        seed = seed
      )

      oob_accuracy <- c(oob_accuracy, kfold_forest$accuracy)

      # predict (impute) missing values ---
      x_test_imputed <- predict(object = kfold_forest, newdata = x_test_miss)

      condition <- exists("y_cv")
      if(condition) {
        y_cv[test_index] <- predict(kfold_forest$forest, x_test_imputed)
      }

      # update
      cv_imputed_x[test_index, ] <- x_test_imputed
      cv_missing_x[test_index, ] <- x_test_miss

    }

    # cross_validation information ---
    cv_object <- list(
      ximp = cv_imputed_x,
      xmiss = cv_missing_x,
      xtrue = cv_true_x
    )

    condition <- exists("y_cv")
    if(condition) {

      condition <- is.numeric(y_cv)
      if(condition) {
        cv_accuracy <- sqrt(mean((y - y_cv) ^ 2))
      } else {
        cv_accuracy <- sum(diag(m_estimator(x = y, y = y_cv)))
      }

    }

    error_table <- sapply(x_names, function(name) {

      switch(x_class[name],
             numeric = {

               # na percent ---
               percent_na <- (sum(is.na(x[[name]])) / length(x[[name]])) * 100

               is_na <- is.na(cv_object$xmiss[[name]])
               imputed <- cv_object$ximp[[name]][is_na]
               known <- cv_object$xtrue[[name]][is_na]

               # median absolute error ---
               medae <- stats::median(abs(known - imputed))

               # median symmetric accuracy ---
               msa <- 100 * (exp(stats::median(abs(log(imputed / known)))) - 1)

               # symmetric signed percent bias
               MdLQ <- stats::median(log(imputed / known))
               sspb <- 100 * sign(MdLQ) * (exp(abs(MdLQ)) - 1)

               # variation due to imputation
               esquares <- sum((known - imputed) ^ 2)
               tsquares <- sum((mean(x[[name]], na.rm = T) - x[[name]]) ^ 2, na.rm = T)
               imp_var <- (esquares / tsquares) * 100

               # return ---
               rout <- c(percent_na, msa, sspb, medae, imp_var, 0)
               return(rout)

             },

             factor = {

               # na percent ---
               percent_na <- (sum(is.na(x[[name]])) / length(x[[name]])) * 100

               is_na <- is.na(cv_object$xmiss[[name]])
               imputed <- cv_object$ximp[[name]][is_na]
               known <- cv_object$xtrue[[name]][is_na]

               n <- length(known)

               # probability of correct classification
               pcc <- sum(known == imputed) / n

               # adjusted probability of correct classification
               px <- m_estimator(known)
               py <- m_estimator(imputed)
               pe <- sum(px * py)
               apcc <- (pcc - pe) / (1 - pe)

               # probability of random classification
               purc <- 1 / nlevels(known)

               # balanced probability
               bpcc <- diag(m_estimator(known, imputed)) / m_estimator(known)
               bpcc[is.infinite(bpcc) | is.na(bpcc)] <- 0
               bpcc <- mean(bpcc)

               # p-value ----
               binomial_test <- stats::binom.test(
                 x = as.integer(pcc * n), n = n, p = pe, alternative = "greater"
               )
               p.value <- binomial_test$p.value

               # normalized entropy (known)---
               p <- m_estimator(x[[name]])
               logp <- log(p, base = 2)
               logp[is.infinite(logp)] <- 0
               hx <- (-sum(logp * p) / log(nlevels(known), base = 2))

               # normalized entropy (imputed)---
               p <- m_estimator(imputed)
               logp <- log(p, base = 2)
               logp[is.infinite(logp)] <- 0
               hy <- (-sum(logp * p) / log(nlevels(known), base = 2))

               # return ---
               rout <- c(percent_na, hy, purc, pcc, bpcc, p.value)
               return(rout)

             }
      )

    })

    error_table <- round(t(error_table), digits = 4)

    error_frame <- data.frame(
      Feature = x_names,
      Index = seq_len(m),
      error_table,
      row.names = NULL
    )

    num_frame <- error_frame[which(x_class == "numeric"), -ncol(error_frame)]
    fct_frame <- error_frame[which(x_class == "factor"), ]

    colnames(num_frame) <- c(
      "Feature", "Index", "% NA", "MSA", "SSPB", "MedAE", "Variance"
    )

    colnames(fct_frame) <- c(
      "Feature", "Index", "% NA","H(X')","PuRC", "PCC", "bPCC", "p-value"
    )

    error_list <- list(
      "numeric" = num_frame,
      "factor" = fct_frame
    )

    condition <- !is.null(y)
    accuracy <- if(condition) {
      round(
        x = c("MCAR (OOB)" = mean(oob_accuracy), "MCAR (CV)" = cv_accuracy),
        digits = 4
      )
    } else {
      NULL
    }

    object$error <- error_list
    object$accuracy <- c(OOB = unname(object$accuracy), accuracy)

  }

  # return ----
  rout <- object
  return(rout)

}


#'Predict method for imputeForest class
#'
#' @export
#'
#' @author David Navega
#'
#' @param object an imputeForest object
#' @param newdata data.frame where missing values should be predicted
#' @param ... ...
#'
#' @return a data.frame with imputed missing values
#'
predict.imputeForest <- function(object, newdata, ...) {

  # has_missing function ----
  has_missing <- function(x, index = F) {

    condition <- index
    if(condition) {
      which(!stats::complete.cases(x))
    } else {
      !stats::complete.cases(x)
    }

  }

  # proximity imputer function ----
  proximity_imputer <- function(x, proximity) {


    # initialize and assess class of each variable
    x_names <- colnames(x)
    x_class <- lapply(x, class)
    x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)

    x_dimensions <- dim(x)
    n <- x_dimensions[1]
    m <- x_dimensions[2]

    # reoconstruct values using proximity
    proximity_reconstructed <- named_apply(x_names, function(current) {

      current_x <- x[[current]]

      switch(x_class[current],

             factor = {

               x_levels <- levels(current_x)
               p_vector <- split(proximity, current_x)
               p_imputed <- which.max(sapply(p_vector, sum))
               reconstructed <- factor(x_levels[p_imputed], levels = x_levels)

               # return
               rout <- reconstructed
               return(rout)
             },

             numeric = {

               not_na <- !is.na(current_x)
               p_vector <- proximity[not_na]
               reconstructed <- weighted_mean(
                 x = current_x[not_na],
                 weights = p_vector,
                 na.rm = TRUE
               )

               # return
               rout <- reconstructed
               return(rout)

             }

      )


    })

    # return ----
    rout <- proximity_reconstructed
    return(rout)

  }

  # proximity imputation function ----
  proximity_imputation <- function(data) {

    proximity_matrix <- switch(object$type,

                               unsupervised = {
                                 predicted <- predict(object = object$forest, newdata = data)
                                 proximity <- predicted$proximity
                                 rownames(proximity) <- NULL
                                 colnames(proximity) <- NULL
                                 proximity
                               },

                               supervised = {
                                 predicted <- predict(
                                   object = object$forest,
                                   newdata = dplyr::bind_rows(data, x),
                                   proximity = T
                                 )
                                 proximity <- predicted$proximity
                                 proximity <- proximity[seq_len(n_with_missing), -seq_len(n_with_missing)]
                                 rownames(proximity) <- NULL
                                 colnames(proximity) <- NULL
                                 proximity

                               }

    )
    proximity_matrix <- rbind(proximity_matrix)

    variables_imputed <- lapply(seq_len(n_with_missing), function(index) {
      proximity_vector <- proximity_matrix[index, ]

      as.data.frame(proximity_imputer(
        x = x[, variable_missing[[index]], drop = F],
        proximity = proximity_vector
      ))

    })

    proximity_imputed <- newdata
    for(index in seq_len(n_with_missing)) {

      missing <- variable_missing[[index]]
      missing_index <- with_missing[index]
      proximity_imputed[missing_index, missing] <- variables_imputed[[index]]

    }

    # return
    rout <- proximity_imputed
    return(rout)

  }

  # initialize parameters,data and assess class of each variable ----
  x <- object$x
  x_names <- colnames(x)
  x_class <- lapply(x, class)
  x_class <- named_apply(x_class, numeric_factor_class, simplify = T)

  x_dimensions <- dim(x)
  n_x <- x_dimensions[1]
  m_x <- x_dimensions[2]

  z <- dplyr::as_data_frame(rbind(newdata))
  z_names <- colnames(z)
  z_class <- lapply(z, class)
  z_class <- named_apply(z_class, numeric_factor_class, simplify = T)

  z_dimensions <- dim(x)
  n_z <- z_dimensions[1]
  m_z <- z_dimensions[2]

  iterations <- object$iterations

  condition <- any(x_class != z_class)
  if(condition) {
    stop("[-] One of more variables in newdata do not match variable class")
  }

  condition <- m_x != m_z
  if(condition) {
    stop("[-] Number of variables in newdata do not match trained forest")
  }

  with_missing <- has_missing(z, index = T)
  is_missing <- dplyr::as_data_frame(is.na(z)[with_missing, ])
  z <- z[with_missing, ]
  n_with_missing <- length(with_missing)
  variable_missing <- apply(is_missing, 1, function(x) z_names[which(x)])

  # strawman imputation / roughfix ----
  # compute strawman imputation values (initial guesses)
  # mode (factor) and median (numeric)

  strawman_value <- named_apply(x_names, function(name) {

    switch(x_class[name],
           factor = {
             discrete_mode(x[[name]])
           },
           numeric = {
             stats::median(x[[name]],na.rm = T)
           }
    )

  })

  strawman_imputed <- dplyr::as_data_frame(named_apply(z_names, function(name){

    strawman_variable <- z[[name]]
    strawman_variable[is_missing[[name]]] <- strawman_value[[name]]
    return(strawman_variable)

  }))

  proximity_imputed <- proximity_imputation(strawman_imputed)

  n_iterations <- 1
  while(n_iterations <= iterations) {

    proximity_imputed <- proximity_imputation(proximity_imputed[with_missing, ])

    n_iterations <- n_iterations + 1

  }

  # return ----
  rout <- proximity_imputed
  return(rout)

}


#' Print method for imputeForest class
#'
#' @noRd
#'
#' @author David Navega
#'
#' @export
#'
#' @param  x an imputeForest x
#' @param ... ...
#'
print.imputeForest <- function(x, ...) {

  cat("\nimputeForest\n")
  cat("Missing value imputation model using randomForest \n")

  condition <- x$method == "missForest"
  if(condition) {
    cat("\nMethod:", x$method,  "(proximity hybrid)")
  } else {
    cat("\nMethod:", x$method)
  }

  cat("\nIterations:", x$iterations)
  cat("\nForest:", x$type)
  condition <- x$type == "supervised"
  if(condition) {
    cat("\nTask:", x$forest$type)
    cat("\n\nSupervised Accuracy\n\n")
    print(x$accuracy)
  }

  condition <- x$crossvalidation
  if(condition) {
    cat("\n\nImputation Accuracy\nMCAR =", x$mcar,"\n\n")

    cat("Numeric\n\n")
    print(x$error$numeric)
    cat("\nFactor\n\n")
    print(x$error$factor)

  }

}
dsnavega/imputeForest documentation built on May 8, 2019, 2:43 p.m.