R/discretize.R

Defines functions .rediscretizeDF discretizeDF discretize

Documented in discretize discretizeDF

#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta, 
#                       Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


#' Convert a Continuous Variable into a Categorical Variable
#' 
#' This function implements several basic unsupervised methods to convert a
#' continuous variable into a categorical variable (factor) using different
#' binning strategies. For convenience, a whole data.frame can be discretized
#' (i.e., all numeric columns are discretized).
#' 
#' Discretize calculates breaks between intervals using various methods and
#' then uses [base::cut()] to convert the numeric values into intervals
#' represented as a factor.
#' 
#' Discretization may fail for several reasons. Some reasons are 
#' \itemize{
#' \item A variable contains only a single value. In this case, the variable
#' should be dropped or directly converted into a factor with a single level
#' (see [factor]). 
#' \item Some calculated breaks are not unique.
#' This can happen for method frequency with very skewed data (e.g., a large
#' portion of the values is 0). In this case, non-unique breaks are dropped
#' with a warning. It would be probably better to look at the histogram of the
#' data and decide on breaks for the method fixed. }
#' 
#' `discretize` only implements unsupervised discretization. See
#' [arulesCBA::discretizeDF.supervised()] in package \pkg{arulesCBA}
#' for supervised discretization.
#' 
#' `discretizeDF()` applies discretization to each numeric column.
#' Individual discretization parameters can be specified in the form:
#' `methods = list(column_name1 = list(method = ,...), column_name2 = list(...))`. 
#' If no discretization method is specified for a column, then the
#' discretization in default is applied (`NULL` invokes the default method
#' in `discretize()`). The special method `"none"` can be specified
#' to suppress discretization for a column.
#' 
#' @aliases discretize binning
#' @family preprocessing
#' 
#' @param x a numeric vector (continuous variable).
#' @param method discretization method. Available are: `"interval"` (equal
#' interval width), `"frequency"` (equal frequency), `"cluster"`
#' (k-means clustering) and `"fixed"` (categories specifies interval
#' boundaries).  Note that equal frequency does not achieve perfect equally
#' sized groups if the data contains duplicated values.
#' @param breaks,categories either number of categories or a vector with boundaries for
#' discretization (all values outside the boundaries will be set to NA). 
#' \bold{`categories` is deprecated, use `breaks` instead.}
#' @param labels character vector; labels for the levels of the resulting
#' category. By default, labels are constructed using "(a,b]" interval
#' notation. If `labels = FALSE`, simple integer codes are returned
#' instead of a factor..
#' @param include.lowest logical; should the first interval be closed to the
#' left?
#' @param right logical; should the intervals be closed on the right (and open
#' on the left) or vice versa?
#' @param dig.lab integer; number of digits used to create labels.
#' @param ordered_result logical; return a ordered factor?
#' @param infinity logical; should the first/last break boundary changed to
#' +/-Inf?
#' @param onlycuts logical; return only computed interval boundaries?
#' @param \dots for method "cluster" further arguments are passed on to
#' `kmeans`.
#' @param df data.frame; each numeric column in the data.frame is discretized.
#' @param methods named list of lists or a data.frame; the named list contains
#' lists of discretization parameters (see parameters of `discretize()`) for
#' each numeric column (see details). If no discretization is
#' specified for a column, then the default settings for `discretize()` are
#' used.  Note: the names have to match exactly.  If a data.frame is specified,
#' then the discretization breaks in this data.frame are applied to `df`.
#' @param default named list; parameters for `discretize()` used for all
#' columns not specified in `methods`.
#' @return `discretize()` returns a factor representing the 
#' categorized continuous variable with
#' attribute `"discretized:breaks"` indicating the used breaks or and
#' `"discretized:method"` giving the used method. If `onlycuts = TRUE` 
#' is used, a vector with the calculated interval boundaries is returned.
#' 
#' `discretizeDF()` returns a discretized data.frame.
#' @author Michael Hahsler
#' @seealso [base::cut()],
#' [arulesCBA::discretizeDF.supervised()].
#' @keywords manip
#' @examples
#' data(iris)
#' x <- iris[,1]
#' 
#' ### look at the distribution before discretizing
#' hist(x, breaks = 20, main = "Data")
#' 
#' def.par <- par(no.readonly = TRUE) # save default
#' layout(mat = rbind(1:2,3:4))
#' 
#' ### convert continuous variables into categories (there are 3 types of flowers)
#' ### the default method is equal frequency
#' table(discretize(x, breaks = 3))
#' hist(x, breaks = 20, main = "Equal Frequency")
#' abline(v = discretize(x, breaks = 3, 
#'   onlycuts = TRUE), col = "red")
#' # Note: the frequencies are not exactly equal because of ties in the data 
#' 
#' ### equal interval width
#' table(discretize(x, method = "interval", breaks = 3))
#' hist(x, breaks = 20, main = "Equal Interval length")
#' abline(v = discretize(x, method = "interval", breaks = 3, 
#'   onlycuts = TRUE), col = "red")
#' 
#' ### k-means clustering 
#' table(discretize(x, method = "cluster", breaks = 3))
#' hist(x, breaks = 20, main = "K-Means")
#' abline(v = discretize(x, method = "cluster", breaks = 3, 
#'   onlycuts = TRUE), col = "red")
#' 
#' ### user-specified (with labels)
#' table(discretize(x, method = "fixed", breaks = c(-Inf, 6, Inf), 
#'     labels = c("small", "large")))
#' hist(x, breaks = 20, main = "Fixed")
#' abline(v = discretize(x, method = "fixed", breaks = c(-Inf, 6, Inf), 
#'     onlycuts = TRUE), col = "red")
#' 
#' par(def.par)  # reset to default
#' 
#' ### prepare the iris data set for association rule mining
#' ### use default discretization
#' irisDisc <- discretizeDF(iris)
#' head(irisDisc)
#' 
#' ### discretize all numeric columns differently
#' irisDisc <- discretizeDF(iris, default = list(method = "interval", breaks = 2, 
#'   labels = c("small", "large")))
#' head(irisDisc)
#' 
#' ### specify discretization for the petal columns and don't discretize the others
#' irisDisc <- discretizeDF(iris, methods = list(
#'   Petal.Length = list(method = "frequency", breaks = 3, 
#'     labels = c("short", "medium", "long")),
#'   Petal.Width = list(method = "frequency", breaks = 2, 
#'     labels = c("narrow", "wide"))
#'   ),
#'   default = list(method = "none")
#'   )
#' head(irisDisc)
#' 
#' ### discretize new data using the same discretization scheme as the
#' ###   data.frame supplied in methods. Note: NAs may occure if a new 
#' ###   value falls outside the range of values observed in the 
#' ###   originally discretized table (use argument infinity = TRUE in 
#' ###   discretize to prevent this case.) 
#' discretizeDF(iris[sample(1:nrow(iris), 5),], methods = irisDisc)
#' @export discretize
discretize <- function(x, method = "frequency", breaks = 3, 
  labels = NULL, include.lowest = TRUE, right = FALSE, dig.lab = 3,
  ordered_result = FALSE, infinity = FALSE, onlycuts = FALSE, categories = NULL, ...) {
  
  if(!is.null(categories)) {
    warning("Parameter categories is deprecated. Use breaks instead! Also, the default method is now frequency!")
    breaks <- categories
  }
  
  methods <- c("interval", "frequency", "cluster", "fixed")
  
  method <- methods[pmatch(tolower(method), methods)]
  if(is.na(method)) stop("Unknown method!")
  
  if(method == "fixed" && length(breaks) < 2) 
    stop("fixed needs at least two values for breaks.")
  if(method != "fixed" && (length(breaks) != 1 || breaks < 1))
    stop("breaks needs to be a single positive integer for this method.")
  
  breaks <- switch(method,
    interval = seq(from=min(x, na.rm=TRUE), to=max(x, na.rm=TRUE), 
      length.out=breaks+1),
    
    frequency = stats::quantile(x, probs = seq(0,1, length.out = breaks+1), 
      na.rm = TRUE),
    
    cluster = {
      cl <-  stats::kmeans(stats::na.omit(x), breaks, ...)
      centers <- sort(cl$centers[,1])
      as.numeric(c(min(x, na.rm=TRUE), head(centers, 
        length(centers)-1) + diff(centers)/2, max(x, na.rm=TRUE)))
    },
    
    fixed = breaks
  )
  
  if(any(duplicated(breaks))){ 
    warning("The calculated breaks are: ", paste(breaks, collapse = ", "), "\n  Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.")
    
    breaks <- unique(breaks)
    if(length(breaks) < 2) stop("Less than 2 uniques breaks left. Maybe the variable has only one value!")
  }

  
  ### fix first and last to -/+Inf
  if(infinity) {
    breaks[1] <- -Inf
    breaks[length(breaks)] <- Inf
  }
  
  if(onlycuts) return(as.vector(breaks))
  
  structure(
    cut(x, breaks = breaks, labels = labels, 
      include.lowest = include.lowest, right = right, dig.lab = dig.lab,
      ordered_result = ordered_result),
    "discretized:breaks" = as.vector(breaks),
    "discretized:method" = method
  )  
}

#' @rdname discretize
discretizeDF <- function(df, methods = NULL, default = NULL) {
  
  ### methods is a data.frame to get the discretization info from
  if(is.data.frame(methods)) return(.rediscretizeDF(methods, df))
  
  for(i in colnames(df)) {
    if(!is.numeric(df[[i]])) next
    args <- if(is.null(methods[[i]])) default else methods[[i]]
    
    ### skip columns with method na
    if(!is.null(args) && (is.null(args$method) || args$method == "none")) next
    
    if(is(err <- try(
      df[[i]] <- do.call("discretize", c(list(x = df[[i]]), args)), 
      silent = TRUE), "try-error")) stop("Problem with column ", i, "\n", err)
  }
  
  df
}

.rediscretizeDF <- function(data, newdata) {
  
  if(!all(colnames(data) == colnames(newdata))) stop("column names in the new data are not the same as in the discretized data.")
  
  cps <- lapply(data, FUN = function(x) {
    breaks <- attr(x, "discretized:breaks")
    if(is.null(breaks)) NULL
    else list(breaks = breaks, method = "fixed", labels = levels(x))
  })
  
  discretizeDF(newdata, methods = cps, default = list(method = "none"))
}
mhahsler/arules documentation built on March 19, 2024, 5:45 p.m.