R/binning.R

Defines functions isBinningFeasible createEqualWidthBins createEqualFreqBins executeBinning

Documented in createEqualFreqBins createEqualWidthBins executeBinning

# utility functions for binning numerical data
# written by Kenji Kondo @ 2019/1/1


# Inner function used in executeBinning
isBinningFeasible <- function(x_vec) {
  return(is.integer(x_vec) | is.numeric(x_vec) | is.ordered(x_vec))
}


#' Create bins by Equal Width Binning
#'
#' @param left left value of the original interval.
#' @param right right value of the original interval.
#' @param nbin number of bins to be created.
#'
#' @return a numeric vector which indicates the boundaries of bins, with `nbin` elements.
#'
#' @export
#' @importFrom assertthat assert_that
createEqualWidthBins <- function(left, right, nbin){
  nbin <- as.integer(nbin)
  assert_that(length(nbin) == 1 & nbin > 1)

  breaks <- (0:(nbin - 1)) * ((right - left) / (nbin - 1)) + left
  return(breaks)
}


#' Create bins by Equal Freq Binning
#'
#' @param x_vec A reference integer or numeric or ordered vector to be binned.
#' @param nbin.max An integer value which indicates the maximum counts of bins.
#'   Note that this function makes `min(nbin.max, length(x_vec))` counts of bins.
#'
#' @return a numeric vector which indicates the boundaries of bins, with `nbin` elements.
#'
#' @export
#' @importFrom assertthat assert_that
createEqualFreqBins <- function(x_vec, nbin.max) {
  nbin.max <- as.integer(nbin.max)
  assert_that(nbin.max > 0 & length(x_vec) > 0)

  nbin <- min(nbin.max, length(x_vec))
  percents <- seq(0, 1, 1 / (nbin - 1))
  if (is.ordered(x_vec)) {
    breaks <- unique(quantile(x_vec, percents, type=1))
    names(breaks) <- NULL
  } else { # integer or numeric cases
    breaks <- unique(as.numeric(quantile(x_vec, percents)))
  }

  return(breaks)
}


#' Execute binning for numerical data.
#'
#' @param x_vec an integer or numeric or ordered vector, to be binned.
#' @param breaks a numeric vector which indicates the boundaries of bins, of length (number of bins - 1).
#'   If NULL is set, bins are generated using the method which is specified by the `method` argument.
#' @param nbin.max a maximum number of bins which is generated by this function. Only used when `breaks` is not set.
#' @param method used for specifying binning method. "freq": equal freq binning (default), "width": equal width binning.
#'   Ignore if `breaks` is set.
#'
#' @return a list which has two members `labels` and `breaks`.
#' * `labels`: an integer vector of `length(x_vec)`.
#'   `(labels[i]==k)` indicates the i-th element of x_vec is in the k-th bin.
#' * `breaks`: a numeric vector which indicates the boundaries of bins, of length (number of bins - 1).
#'
#' @export
#' @importFrom assertthat assert_that
executeBinning <- function(x_vec, breaks=NULL, nbin.max=100, method="freq") {
  # Check arguments
  assert_that(isBinningFeasible(x_vec))

  # If breaks is NULL, generate bins by self.
  if (is.null(breaks)) {
    left <- min(x_vec)
    right <- max(x_vec)
    if (method == "freq") {
      breaks <- createEqualFreqBins(x_vec, nbin.max)
    } else if (method == "width") {
      breaks <- createEqualWidthBins(left, right, nbin.max)
    } else {
      assert_that(FALSE, msg="wrong 'method' argument.")
    }
  }

  if (is.ordered(x_vec)) {
    labels <- as.integer(rep(1, length(x_vec)))
    for (i in seq(length(breaks))) {
      labels[x_vec >= breaks[i]] <- i + 1
    }
  } else { # integer or numeric cases
    # Calc labels for each element
    labels <- cut(x_vec, breaks=c(-Inf, breaks, Inf), labels=FALSE, right=FALSE)
  }

  return(list(labels=labels, breaks=breaks))
}
kazuzowo/aglm-LVar documentation built on Dec. 10, 2019, 12:13 a.m.