# R/gen_buckets.R

#' Supervised discretization with 'smbinning'
#'
#' Generate buckets for a numerical feature using 'smbinning', falling back to
#' quintile buckets when 'smbinning' yields fewer than three cut points.
#'
#' Only rows whose \code{x} value lies inside \code{fitLimits} (both ends
#' inclusive) are used; rows where \code{x} is missing are dropped. If
#' 'smbinning' does not return a list (it is expected to signal failures such
#' as "No significant splits" with a character message -- see the 'smbinning'
#' documentation), a warning carrying that message is raised and the quintile
#' fallback is used instead of failing.
#'
#' @param data A data frame containing the target and the numerical variable
#'             to be binned. It is coerced with \code{as.data.frame()} because
#'             "smbinning::smbinning()" won't accept, say, tibbles.
#' @param target Character string giving the name of the target variable.
#' @param x Character string giving the name of the continuous numerical
#'          variable.
#' @param fitLimits Vector of two numbers giving the model fitting limits of
#'                  \code{x}. A missing (\code{NA}) limit is replaced by the
#'                  corresponding end of the observed, non-missing range of
#'                  \code{x}.
#' @param minNumCuts Integer representing the minimum number of cut points
#'                   required/wanted. NOTE: currently not consulted; the
#'                   fallback is triggered by a fixed threshold of fewer than
#'                   three cuts.
#' @param minBktPct Number giving the minimum share of observations required
#'                  in each 'smbinning' generated bucket. Passed on as
#'                  parameter 'p' of function 'smbinning'.
#'
#' @return The object returned by 'smbinning'. When it holds fewer than three
#'         cuts, its \code{bands} and \code{cuts} elements are replaced by the
#'         0/20/.../100\% and 20/40/60/80\% quantiles of the in-range values
#'         of \code{x}. When 'smbinning' returns no list at all, a list with
#'         elements \code{x}, \code{bands} and \code{cuts} is returned.
#'
#' @export
#'
#' @examples
#' BinsFICO_UsedPlat <- gen_buckets(data = Data_UsedPlat, target = 'BadInd',
#'     x = 'CreditScore', fitLimits = c(450, 900), minNumCuts = 5, minBktPct = 0.15)
gen_buckets <- function(data, target, x, fitLimits = range(data[, x]),
                        minNumCuts = 4, minBktPct = 0.10) {
  # Coerce first: the lazily evaluated default of `fitLimits` reads `data`.
  data <- as.data.frame(data)
  values <- data[, x]

  # The default range() is NA as soon as `x` has one missing value; fall back
  # to the observed non-missing range for any limit that is NA.
  missing_limit <- is.na(fitLimits)
  if (any(missing_limit)) {
    fitLimits[missing_limit] <- range(values, na.rm = TRUE)[missing_limit]
  }

  # which() drops NA comparisons, so rows with a missing `x` are excluded
  # instead of being turned into all-NA rows by logical NA indexing.
  in_range <- which(values >= fitLimits[1] & values <= fitLimits[2])
  fit_values <- values[in_range]

  bin_obj <- smbinning(df = data[in_range, , drop = FALSE], y = target,
                       x = x, p = minBktPct)

  # A non-list result cannot be indexed with `$`; start from a minimal list so
  # the quantile fallback below can still produce usable buckets.
  if (!is.list(bin_obj)) {
    warning("smbinning() returned no binning object (",
            paste(bin_obj, collapse = "; "),
            "); using quantile buckets instead.", call. = FALSE)
    bin_obj <- list(x = x)
  }

  if (length(bin_obj$cuts) < 3) {
    bin_obj$bands <- quantile(fit_values, probs = seq(0, 1, 0.20),
                              names = FALSE)
    bin_obj$cuts <- quantile(fit_values, probs = seq(0.20, 0.80, 0.20),
                             names = FALSE)
  }

  bin_obj
}
# hongqi0314/PRAuto.PMML documentation built on May 6, 2019, 11:30 a.m.