R/unsupervisedForestS3.R

#' unsupervisedForest
#'
#' Wrapper for creating unsupervised randomForest objects with prediction capabilities
#'
#' @export
#'
#' @author David Navega
#'
#' @param x a data.frame.
#' @param mtry an integer, defines number of variables to try at each split.
#' @param ntree number of trees composing the forest.
#' @param type the type of envelope used to generate random data. Default is
#' "data". type can be "data" or "uniform".
#' @param ... additional parameters to the randomForest
#'
#' @return an unsupervisedForest object
#'
#' @seealso \link{randomForest}
#'
unsupervisedForest <- function(x, mtry,
  ntree = 500,
  type = "data",
  ...
  ) {

  # initialize and assess class of each variable
  x_names <- colnames(x)
  x_class <- lapply(x, class)
  x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)

  x_dimensions <- dim(x)
  n <- x_dimensions[1]
  m <- x_dimensions[2]

  condition <- missing(mtry)
  if(condition) {
    mtry <- sqrt(ncol(x))
  }

  # mode imputation
  condition <- any(is.na(x))
  if(condition) {
    x <- randomForest::na.roughfix(x)
    warning("\n[-] NA values detected - randomForest::na.roughfix(x) applied.")
  }

  # sythethic data ----
  distribution_envelope <- create_envelope(x, type = type)
  synthetic <- sample_envelope(distribution_envelope, n = n)

  # data for unsupervised tree
  forest_output <- factor(rep(c(1, 2), each = n))
  forest_data <- dplyr::bind_rows(x, synthetic)

  unsupervised_forest <- randomForest::randomForest(
    x = forest_data,
    y = forest_output,
    mtry = mtry,
    ntree = ntree,
    proximity = T,
    oob.prox = T,
    ...
  )

  full_proximity <- unsupervised_forest$proximity
  proximity <- full_proximity[seq_len(n), seq_len(n)]
  unsupervised_forest$proximity <- proximity

  object <- list(
    forest = unsupervised_forest,
    data = x,
    proximity = proximity,
    full_proximity = full_proximity
  )
  class(object) <- "unsupervisedForest"

  # return
  rout <- object
  return(rout)

}

#' Predict method for unsupervisedForest class
#'
#' @export
#'
#' @author David Navega
#'
#' @param object an unsupervisedForest object,
#' @param newdata data to predict proximity and probability based from an
#' unsupervisedForest
#' @param ... ...
#'
#' @return a list with the following components
#' \item{proximity}{proximity vector for newdata}
#' \item{probability}{probability of belonging to normal or synthethic data}
#'
predict.unsupervisedForest <- function(object, newdata, ...) {

  data <- object$data
  forest <- object$forest

  forest_data <- dplyr::bind_rows(newdata, object$data)
  n <- NROW(newdata)

  forest_output <- predict(
    object = forest,
    newdata = forest_data,
    type = "prob",
    proximity = T
  )

  probability <- rbind(forest_output$pred[seq_len(n), ])
  proximity <- forest_output$proximity[seq_len(n),-seq_len(n)]

  # return
  rout <- list(
    proximity = (rbind(proximity)),
    probability = probability
  )
  return(rout)

}

#' Print method for unsurpersivedForest class
#'
#' @noRd
#'
#' @export
#'
#' @author David Navega
#'
#' @param x an unsupervisedForest object
#' @param ... ...
#'
#'
print.unsupervisedForest <- function(x, ...) {
  error <- x$forest$err.rate[x$forest$ntree,1:2]
  cat("unsupervisedForest\n\n")
  cat("OOB Accuracy (%): ", round(1 - error[1], 4) * 100, sep = "")
  cat("\nOOB Detection Rate (%): ", round(1 - error[2], 4) * 100, sep = "")
}
dsnavega/imputeForest documentation built on May 8, 2019, 2:43 p.m.