R/predict.R

Defines functions predict.outForest

Documented in predict.outForest

#' Out-of-Sample Application
#'
#' Identify outliers in new data set based on previously fitted 'outForest' object. The result of \code{predict} is again an object of type 'outForest'. All its methods can be applied to it.
#'
#' @importFrom stats predict
#' @param object An object of class "outForest".
#' @param newdata A new \code{data.frame} to be assessed for numeric outliers.
#' @param replace Should outliers be replaced by predicting mean matching (from the original non-outliers) on the predictions ("pmm", the default), by predictions ("predictions"), by \code{NA} ("NA"). Use "no" to keep outliers as they are.
#' @param pmm.k For \code{replace = "pmm"}, how many nearest prediction neighbours (from the original non-outliers) be considered to sample observed values from?
#' @param threshold Threshold above which an outlier score is considered an outlier.
#' @param max_n_outliers Maximal number of outliers to identify. Will be used in combination with \code{threshold} and \code{max_prop_outliers}.
#' @param max_prop_outliers Maximal relative count of outliers. Will be used in combination with \code{threshold} and \code{max_n_outliers}.
#' @param seed Integer random seed.
#' @param ... Further arguments passed from other methods.
#' @return An object of type \code{outForest}.
#' @export
#' @method predict outForest
#' @examples
#' (out <- outForest(iris, allow_predictions = TRUE))
#' iris1 <- iris[1, ]
#' iris1$Sepal.Length <- -1
#' pred <- predict(out, newdata = iris1)
#' outliers(pred)
#' Data(pred)
#' plot(pred)
#' plot(pred, what = "scores")
#' @seealso \code{\link{outForest}}, \code{\link{outliers}}, \code{\link{Data}}.
predict.outForest <- function(object, newdata, replace = c("pmm", "predictions", "NA", "no"),
                              pmm.k = 3, threshold = object$threshold, max_n_outliers = Inf,
                              max_prop_outliers = 1, seed = NULL, ...) {
  replace <- match.arg(replace)
  newdata <- as.data.frame(newdata)

  # Initial check
  stopifnot(is.data.frame(newdata),
            (n <- nrow(newdata)) >= 1L)
  if (!object$allow_predictions) {
    stop("Use 'allow_predictions = TRUE' when creating 'outForest' object.")
  }

  # Initialization
  if (!is.null(seed)) {
    set.seed(seed)
  }
  v <- object$v
  predData <- newdata[, v, drop = FALSE]

  # Currently, can deal only with NA in one single v
  all_vars <- union(v, object$used_to_check)
  any_NA <- names(which(0 < colSums(is.na(newdata[, all_vars, drop = FALSE]))))
  if (length(any_NA) <= 1L && all(any_NA %in% v)) {
    if (length(any_NA) == 1L) {
      .s <- is.na(newdata[[any_NA]])
      newdata[.s, any_NA] <- predict(object$forests[[any_NA]], data = newdata[.s, ])$predictions
    }
  } else {
    stop("Missing values only in one v variable allowed.")
  }

  # Check each variable
  for (vv in v) {
    if (length(setdiff(object$used_to_check, vv))) {
      predData[[vv]] <- predict(object$forests[[vv]], data = newdata)$predictions
    } else {
      predData[[vv]] <- object$mu[vv]
    }
  }
  # Calculate outlier scores and status
  scores <- scale(newdata[, v, drop = FALSE] - predData, center = FALSE, scale = object$rmse)
  if (length(any_NA)) {
    scores[.s, any_NA] <- 0
    newdata[.s, any_NA] <- NA
  }
  out <- process_scores(data = newdata, scores = scores, predData = predData,
                        v = v, rmse = object$rmse, replace = replace, pmm.k = pmm.k,
                        threshold = threshold, max_n_outliers = max_n_outliers,
                        max_prop_outliers = max_prop_outliers,
                        allow_predictions = FALSE, obj = object)
  out <- c(out, list(forests = NULL), object[c("used_to_check", "mu")])
  class(out) <- c("outForest", "list")
  out
}

Try the outForest package in your browser

Any scripts or data that you put into this service are public.

outForest documentation built on Jan. 31, 2022, 9:07 a.m.