R/discretize.R

Defines functions get_signif_digits equal_freq_bin customBreaksControl equalsizeControl mdlControl discretize.numeric discretize.data.frame discretize.formula discretize.default discretize

Documented in customBreaksControl discretize equalsizeControl mdlControl

#' Discretization
#'
#' Discretize a range of numeric attributes in the dataset into nominal
#' attributes. \code{Minimum Description Length} (MDL) method is set as the default
#' control. There is also available \code{equalsizeControl} method.
#'
#' @param x Explanatory continuous variables to be discretized or a \link{formula}.
#' @param y Dependent variable for supervised discretization or a \link{data.frame} when \code{x} ia a \link{formula}.
#' @param control \code{discretizationControl} object containing the parameters for
#'   discretization algorithm. Possible inputs are \code{mdlControl} or \code{equalsizeControl}, so far. If passed as a list, the first element is used.
#' @param all Logical indicating if a returned \link{data.frame} should contain other features that were not discretized.
#' (Example: should \code{Sepal.Width} be returned, when you pass \code{iris} and discretize \code{Sepal.Length, Petal.Length, Petal.Width}.)
#' @param discIntegers logical value.
#' If true (default), then integers are treated as numeric vectors and they are discretized.
#' If false integers are treated as factors and they are left as is.
#' @param call Keep as \code{NULL}. Inner method parameter for consistency.
#'
#' @references U. M. Fayyad and K. B. Irani. Multi-Interval Discretization of
#'   Continuous-Valued Attributes for Classification Learning. In 13th
#'   International Joint Conference on Uncertainly in Artificial
#'   Intelligence(IJCAI93), pages 1022-1029, 1993.
#'
#' @examples
#'
#' # vectors
#' discretize(x = iris[[1]], y = iris[[5]])
#'
#' # list and vector
#' head(discretize(x = list(iris[[1]], iris$Sepal.Width), y = iris$Species))
#'
#' # formula input
#' head(discretize(x = Species ~ ., y = iris))
#' head(discretize(Species ~ ., iris))
#'
#' # use different methods for specific columns
#' ir1 <- discretize(Species ~ Sepal.Length, iris)
#' ir2 <- discretize(Species ~ Sepal.Width, ir1, control = equalsizeControl(3))
#' ir3 <- discretize(Species ~ Petal.Length, ir2, control = equalsizeControl(5))
#' head(ir3)
#'
#' # custom breaks
#' ir <- discretize(Species ~ Sepal.Length, iris,
#'   control = customBreaksControl(breaks = c(0, 2, 5, 7.5, 10)))
#' head(ir)
#'
#' \dontrun{
#' # Same results
#' library(RWeka)
#' Rweka_disc_out <- RWeka::Discretize(Species ~ Sepal.Length, iris)[, 1]
#' FSelectorRcpp_disc_out <- FSelectorRcpp::discretize(Species ~ Sepal.Length,
#'                                                     iris)[, 1]
#' table(Rweka_disc_out, FSelectorRcpp_disc_out)
#' # But faster method
#' library(microbenchmark)
#' microbenchmark(FSelectorRcpp::discretize(Species ~ Sepal.Length, iris),
#'                RWeka::Discretize(Species ~ Sepal.Length, iris))
#'
#' }
#'
#' @author Zygmunt Zawadzki \email{zygmunt@zstat.pl}
#' @importFrom stats formula
#' @export
discretize <- function(x, y, control = list(mdlControl(), equalsizeControl()),
                       all = TRUE, discIntegers = TRUE, call = NULL) {
  UseMethod("discretize", x)
}

#' @export
discretize.default <- function(x, y,
  control = list(mdlControl(), equalsizeControl()),
  all = TRUE, discIntegers = TRUE, call = NULL) {

  stop(sprintf("Object of class %s is not supported!", class(x)[1]))
}

#' @export
discretize.formula <- function(x, y,
                               control = list(mdlControl(), equalsizeControl()),
                               all = TRUE, discIntegers = TRUE, call = NULL) {
  formula <- formula2names(x, y)
  data <- y
  yy <- y[[formula$y]]

  if (!("discretizationControl" %in% class(control))) {
    control <- control[[1]]

    if (!"discretizationControl" %in% class(control)) {
      stop(paste("control is not a subclass of discretizationControl.",
        paste("  Please use mdlControl(), customBreaksControl()",
        "or equalsizeControl() functions."),
                 sep = "\n"))
    }
  }

  fnc <- if (discIntegers) is.numeric else is.double
  colClasses <- sapply(data, fnc)
  colClasses <- colClasses[formula$x]

  if (all(!colClasses)) {
    if (discIntegers) {
      stop(
        "There are no columns that contain the numeric values."
      )
    } else {
      stop(
        "There are no columns that contain the double values.\n",
        "Note that discIntegers is set to FALSE, so all columns ",
        "that contain the integers are not discretized."
        )
    }
  } else if (any(!colClasses)) {

    if (!all) {
      warning(
        paste("Columns with classes other than numeric will be skipped!\n",
              "\n",
              " Skipped columns:",
              paste(names(colClasses)[!colClasses], collapse = ", "))
      )
    }

    colClasses <- colClasses[colClasses]
  }

  columnsToDiscretize <- names(colClasses)

  splitPointsList <- list()

  for (col in columnsToDiscretize) {

    if (class(control)[1] == "customBreaksControl") {

      signifDigits <- get_signif_digits(control$breaks)
      res <- cut(data[[col]], control$breaks,
                 ordered_result = TRUE, dig.lab = signifDigits)
      attr(res, "SplitValues") <- control$breaks

    } else {
      res <- discretize_cpp(data[[col]], yy, control)
    }

    if (anyNA(data[[col]])) {
      res[res == 0] <- NA
    }
    class(res) <- c("ordered", "factor")

    if (!is.null(attr(res, "SplitValues"))) {

      splitVals <- attr(res, "SplitValues")
      signifDigits <- get_signif_digits(splitVals)
      levels(res) <- levels(cut(
        splitVals, splitVals,
        dig.lab = signifDigits, ordered_result = TRUE))

      splitPointsList[[col]] <- splitVals
    } else {
      # in case of no split points
      warning(paste(
        sprintf("Cannot find any split points for `%s`.", col),
        "Drops this column.",
        sep = " "
      ))
      res <- NULL
      splitPointsList[[col]] <- NA
    }

    data[[col]] <- res
  }

  if (!all) {
    data <- data[, c(formula$y, columnsToDiscretize)]
  }

  attr(data, "fsSplitPointsList") <- c(
    attr(data, "fsSplitPointsList"),
    splitPointsList
  )

  return(data)
}

#' @export
discretize.data.frame <- function(x, y,
  control = list(mdlControl(), equalsizeControl()),
  all = TRUE, discIntegers = TRUE, call = match.call()) {

  if (class(y)[[1]] == "formula") {
    discretize.formula(
      x = y, y = x, control = control, all = all,
      call = call, discIntegers = discIntegers)
  } else {
    if (!is.data.frame(y)) {
      y <- format_handler(call$y, y)
    }
    y <- cbind(y, x)
    if (any(table(colnames(y)) != 1)) {
      stop("Duplicated columns!")
    }
    x <- formula(paste0(colnames(y)[1], "~.")) #nolint

    if (!("discretizationControl" %in% class(control))) {
      control <- control[[1]]
    }

    discretize.formula(
      x = x, y = y, control = control,
      all = all, discIntegers = discIntegers,
      call = call)
  }
}
#' @export
discretize.numeric <- function(x, y,
  control = list(mdlControl(), equalsizeControl()),
  all = TRUE, discIntegers = TRUE, call = NULL) {
  call <- match.call()
  x <- format_handler(call$x, x)

  if (!("discretizationControl" %in% class(control))) {
    control <- control[[1]]
  }

  discretize.data.frame(
    x = x, y = y, control = control,
    all = all, call = call,
    discIntegers = discIntegers)
}

#' @export
discretize.list <- discretize.numeric

#' @export
discretize.matrix <- discretize.numeric

#' @rdname discretize
#' @export
mdlControl <- function() {
  params <- list(method = "MDL")
  attr(params, "class") <- c("mdlControl", "discretizationControl", "list")
  params
}

#' @rdname discretize
#' @param k Number of partitions.
#' @export
equalsizeControl <- function(k = 10) {
  stopifnot(k > 0)
  k <- floor(k)
  params <- list(method = "EQUAL_SIZE", k = k)
  attr(params, "class") <- c("equalsizeControl",
                             "discretizationControl",
                             "list")
  params
}

#' @rdname discretize
#' @param breaks custom breaks used for partitioning.
#' @export
customBreaksControl <- function(breaks) {
  stopifnot(is.numeric(breaks))
  params <- list(method = "CUSTOM_BREAKS", breaks = breaks)
  attr(params, "class") <- c("customBreaksControl",
                             "discretizationControl",
                             "list")
  params
}

#' Discretize numeric dependent variable
#'
#' Returns discretized dependent variable
#'
#' The observations are segregated into bins in a way that every bin has the
#' same number of them. This function is based on the original one from
#' FSelector package.
#'
#' @noRd
equal_freq_bin <- function(data, bins) {
  bins <- as.integer(bins)

  if (!is.numeric(data)) {
    stop("Data must be numeric!")
  }

  if (bins < 1) {
    stop("Number of bins must be greater than 1!")
  }

  complete <- complete.cases(data)
  ord <- order(data)
  len <- length(data[complete])
  blen <- len / bins
  new_data <- data
  p1 <- 0
  p2 <- 0

  for (i in 1:bins) {
    p1 <- p2 + 1
    p2 <- round(i * blen)
    new_data[ord[p1:min(p2, len)]] <- i
  }

  new_data
}


get_signif_digits <- function(x) {
  x <- x[!is.infinite(x)]
  before_dot <- max(nchar(format(round(x), scientific = FALSE)))

  charx <- format(x, scientific = FALSE, digits = before_dot + 7)
  charx <- charx[grep(x, pattern = "\\.")]
  after_dot <- vapply(
    strsplit(charx, split = "\\."), "", FUN = "[[", 2)
  after_dot <- nchar(after_dot)
  if (length(after_dot) == 0) after_dot <- 0

  before_dot + min(pmax(after_dot, 0), 6)
}

Try the FSelectorRcpp package in your browser

Any scripts or data that you put into this service are public.

FSelectorRcpp documentation built on April 28, 2023, 5:07 p.m.