R/print.bagged.outlieroutputs.R

Defines functions print.bagged.outlieroutputs

Documented in print.bagged.outlieroutputs

#' @title Print outliers in human-readable format
#' @description Pretty-prints outliers as output by the `predict` function from a Bagged OutlierTrees
#' model (as generated by function `bagged.outliertrees`).
#' @param x Outliers as returned by predict method on an object from `bagged.outliertrees`.
#' @param outliers_print Maximum number of outliers to print.
#' @param ... No use.
#' @return The same input `x` that was passed (as `invisible`).
#' @seealso \link{bagged.outliertrees} \link{predict.bagged.outliertrees}
#' @examples
#' library(bagged.outliertrees)
#'
#' ### example dataset with interesting outliers
#' data(hypothyroid)
#'
#' ### fit a Bagged OutlierTrees model
#' model <- bagged.outliertrees(hypothyroid,
#'   ntrees = 10,
#'   subsampling_rate = 0.5,
#'   z_outlier = 6,
#'   nthreads = 1
#' )
#'
#' ### use the fitted model to find outliers in the training dataset
#' outliers <- predict(model,
#'   newdata = hypothyroid,
#'   min_outlier_score = 0.5,
#'   nthreads = 1
#' )
#'
#' ### print the top-10 outliers in human-readable format
#' print(outliers, outliers_print = 10)
#' @export
print.bagged.outlieroutputs <- function(x, outliers_print = 15, ...) {
  outliers_print <- min(outliers_print, length(x))
  cat(sprintf(
    "Reporting top %d outliers [out of %d found]\n\n",
    outliers_print,
    length(x)
  ))

  for (i in names(x[1:outliers_print])) {

    ### print suspicious value
    cat(sprintf("row [%s] - suspicious column: [%s] - ", i, x[[i]]$suspicious_value$column))
    cat(sprintf("suspicious value: [%s]\n", x[[i]]$suspicious_value$value))

    ### print distribution
    if (!is.na(as.numeric(x[[i]]$suspicious_value$value))) {
      if (as.numeric(x[[i]]$suspicious_value$value) > as.numeric(x[[i]]$group_statistics$mean)) {
        cat(sprintf(
          "\tdistribution: %.2f%% <= [%s] - [mean: %s] - [sd: %s] - [norm. obs: %s]\n",
          as.numeric(x[[i]]$group_statistics$pct) * 100.,
          x[[i]]$group_statistics$thr,
          x[[i]]$group_statistics$mean,
          x[[i]]$group_statistics$sd,
          x[[i]]$group_statistics$n_obs
        ))
      } else {
        cat(sprintf(
          "\tdistribution: %.2f%% >= [%s] - [mean: %s] - [sd: %s] - [norm. obs: %s]\n",
          as.numeric(x[[i]]$group_statistics$pct) * 100.,
          x[[i]]$group_statistics$thr,
          x[[i]]$group_statistics$mean,
          x[[i]]$group_statistics$sd,
          x[[i]]$group_statistics$n_obs
        ))
      }
    } else {
      cat(sprintf(
        "\tdistribution: %.2f%% in [%s]\n",
        as.numeric(x[[i]]$group_statistics$pct) * 100.,
        x[[i]]$group_statistics$thr
      ))

      cat(sprintf(
        "\t( [norm. obs: %s] - [prior_prob: %.2f%%] - [next smallest: %.2f%%] )\n",
        x[[i]]$group_statistics$n_obs,
        as.numeric(x[[i]]$group_statistics$sd) * 100.,
        as.numeric(x[[i]]$group_statistics$mean) * 100.
      ))
    }

    ### print conditions
    if (length(x[[i]]$conditions) > 0) {
      conditions <- as.data.frame(x[[i]]$conditions)

      for (j in nrow(conditions)) {
        switch(conditions$comparison[j],
          "is NA" = {
            cat(sprintf("\t\t[%s] is NA\n", conditions$column[j]))
          },
          "<=" = {
            cat(sprintf(
              "\t\t[%s] <= [%s] (value: %s)\n",
              conditions$column[j], conditions$value_comp[j], conditions$value_this[j]
            ))
          },
          ">" = {
            cat(sprintf(
              "\t\t[%s] > [%s] (value: %s)\n",
              conditions$column[j], conditions$value_comp[j], conditions$value_this[j]
            ))
          },
          "=" = {
            cat(sprintf("\t\t[%s] = [%s]\n", conditions$column[j], conditions$value_comp[j]))
          },
          "!=" = {
            cat(sprintf(
              "\t\t[%s] != [%s] (value: %s)\n",
              conditions$column, conditions$value_comp[j], conditions$value_this[j]
            ))
          },
          "in" = {
            cat(sprintf(
              "\t\t[%s] in [%s] (value: %s)\n",
              conditions$column[j], conditions$value_comp[j], conditions$value_this[j]
            ))
          }
        )
      }
    }

    cat("\n\n")
  }
}

Try the bagged.outliertrees package in your browser

Any scripts or data that you put into this service are public.

bagged.outliertrees documentation built on July 6, 2021, 9:06 a.m.