Nothing
#' Split bins all
#'
#' \code{split_bins} is for transforming data to bins.
#' The \code{split_bins_all} function is a simpler wrapper for \code{split_bins}.
#' @param dat A data.frame with independent variables.
#' @param x_list A list of x variables.
#' @param ex_cols Names of excluded variables. Regular expressions can also be used to match variable names. Default is NULL.
#' @param breaks_list A list contains breaks of variables. it is generated by code{\link{get_breaks_all}},code{\link{get_breaks}}
#' @param bins_no Number the generated bins. Default is TRUE.
#' @param note Logical, outputs info. Default is TRUE.
#' @param return_x Logical, return data.frame containing only variables in x_list.
#' @param char_free Logical, if TRUE, characters are not splitted.
#' @param save_data Logical, save results in locally specified folder. Default is TRUE
#' @param file_name The name for periodically saved woe file. Default is "dat_woe".
#' @param dir_path The path for periodically saved woe file Default is "./data"
#' @param ... Additional parameters.
#' @return A data.frame with splitted bins.
#' @seealso \code{\link{get_tree_breaks}}, \code{\link{cut_equal}}, \code{\link{select_best_class}}, \code{\link{select_best_breaks}}
#' @examples
#' sub = cv_split(UCICreditCard, k = 30)[[1]]
#' dat = UCICreditCard[sub,]
#' dat = re_name(dat, "default.payment.next.month", "target")
#' dat = data_cleansing(dat, target = "target", obs_id = "ID", occur_time = "apply_date",
#' miss_values = list("", -1))
#'
#' train_test = train_test_split(dat, split_type = "OOT", prop = 0.7,
#' occur_time = "apply_date")
#' dat_train = train_test$train
#' dat_test = train_test$test
#' #get breaks of all predictive variables
#' x_list = c("PAY_0", "LIMIT_BAL", "PAY_AMT5", "EDUCATION", "PAY_3", "PAY_2")
#' breaks_list = get_breaks_all(dat = dat_train, target = "target",
#' x_list = x_list, occur_time = "apply_date", ex_cols = "ID",
#' save_data = FALSE, note = FALSE)
#' #woe transform
#' train_bins = split_bins_all(dat = dat_train,
#' breaks_list = breaks_list,
#' woe_name = FALSE)
#' test_bins = split_bins_all(dat = dat_test,
#' breaks_list = breaks_list,
#' note = FALSE)
#'
#' @export
split_bins_all = function(dat, x_list = NULL, ex_cols = NULL,
breaks_list = NULL, bins_no = TRUE,
note = FALSE,return_x = FALSE,
char_free = FALSE,
save_data = FALSE,
file_name = NULL,
dir_path = tempdir(), ...) {
dat = checking_data(dat)
if (note) cat_line("-- Transforming variables to bins", col = love_color("deep_green"))
opt = options(scipen = 200, stringsAsFactors = FALSE) #
if (is.null(x_list)) {
if (!is.null(breaks_list)) {
x_list = unique(unlist(breaks_list[, "Feature"]))
} else {
x_list = get_names(dat = dat,
types = c('factor', 'character', 'numeric', 'integer', 'double'),
ex_cols = ex_cols, get_ex = FALSE)
}
}
if(class(breaks_list) == 'list'){
dat[, x_list] = lapply(x_list, function(x) split_bins(dat = dat, x = x,
breaks = if (is.null(breaks_list)) NULL else breaks_list[[x]],
bins_no = bins_no))
}else{
dat[, x_list] = lapply(x_list, function(x) split_bins(dat = dat, x = x,
breaks = if (is.null(breaks_list)) NULL else breaks_list[which(as.character(breaks_list[, "Feature"]) ==
x), "cuts"],
bins_no = bins_no))
}
if(return_x){
dat = dat[,x_list]
}
if (save_data) {
dir_path = ifelse(!is.character(dir_path),
tempdir(), dir_path)
if (!dir.exists(dir_path)) dir.create(dir_path)
if (!is.character(file_name)) file_name = NULL
save_data(dat, file_name = ifelse(is.null(file_name), "dat.bins", paste(file_name, "dat.bins", sep = ".")), dir_path = dir_path, note = note)
}
return(dat)
options(opt) # reset
}
#' split_bins
#'
#' \code{split_bins} is for binning using breaks.
#' @param dat A data.frame with independent variables.
#' @param x The name of an independent variable.
#' @param breaks Breaks for binning.
#' @param bins_no Number the generated bins. Default is TRUE.
#' @param as_factor Whether to convert to factor type.
#' @param labels Labels of bins.
#' @param use_NA Whether to process NAs.
#' @param char_free Logical, if TRUE, characters are not splitted.
#' @return A data.frame with Bined x.
#' @examples
#' bins = split_bins(dat = UCICreditCard,
#' x = "PAY_AMT1", breaks = NULL, bins_no = TRUE)
#' @export
split_bins = function (dat, x, breaks = NULL, bins_no = TRUE, as_factor = FALSE,
labels = NULL, use_NA = TRUE,char_free = FALSE) {
opt = options(scipen = 200, stringsAsFactors = FALSE,digits = 4)
dat = checking_data(dat)
if (length(breaks) < 1) {
breaks = get_breaks(dat, x, target = NULL, best = FALSE,
equal_bins = TRUE, g = 5, note = FALSE)
}
sp_value_num = sp_value_char = NULL
type_x = class(dat[, x])[1]
if (any(c("integer", "numeric", "double","Date") ==
type_x )) {
dat[, x] = as.numeric(dat[, x])
if(any(c("Date") == type_x)){
breaks = sort(unlist(unique(c(-Inf, as.numeric(as.Date(breaks)), Inf))))
}else{
breaks = sort(unlist(unique(c(-Inf, breaks, Inf))))
}
bins_1 = cut(dat[, x], breaks = unique(breaks), dig.lab = 12,
ordered = TRUE, include.lowest = FALSE, right = TRUE)
if (bins_no) {
if(use_NA){
bins_0 = ifelse(is.na(bins_1), "00", paste("0", as.numeric(bins_1), sep = ""))
bins = paste(bins_0, bins_1, sep = ".")
}else{
bins = ifelse(is.na(bins_1), NA, paste(paste("0", as.numeric(bins_1), sep = ""),
bins_1,sep="."))
}
bins[which(as.numeric(bins_1) >= 10)] = paste(as.numeric(bins_1[which(as.numeric(bins_1) >=10)]),
bins_1[which(as.numeric(bins_1) >= 10)],sep = ".")
}else{
if(use_NA){
bins = ifelse(is.na(bins_1), ".NA", as.character(bins_1))
}else{
bins = as.character(bins_1)
}
}
if(any(c("Date") == type_x)){
for(i in 1:length((breaks[-c(1,length(breaks))]))){
bins = gsub(breaks[-c(1,length(breaks))][i],as.Date(breaks[-c(1,length(breaks))][i],
origin = '1970-01-01'), bins)
}
}
if(!is.null(labels)){
if(length(breaks) -1 == length(labels)){
if(use_NA){
if(any(bins == "00.NA")){
levels = sort(names(table(bins)))
labels = c("00.NA",labels)
}else{
if(any(bins == ".NA")){
levels = sort(names(table(bins)))
labels = c(labels,".NA")
}
}
}else{
levels = sort(names(table(bins,useNA = "no")))
bins[which(bins == "00.NA"|bins == ".NA")] = NA
}
if(length(levels) == length(labels)){
bins = factor(bins,labels = labels,levels = levels )
}else{
stop(paste("'levels' is not right: length", length(levels) ,"should be", length(labels)))
}
}else{
stop(paste("'labels' is not right: length", length(labels) ,"should be", length(breaks) -1))
}
}
} else {
if (any(grepl("\\|", breaks))) {
breaks_s = sapply(breaks,function(s)strsplit(s, split="\\|"))
} else {
breaks_s = breaks
}
dat[, x] = as.character(dat[, x])
if(!char_free){
if (length(breaks_s) > 0) {
for (i in 1:length(breaks_s)) {
if (length(which(dat[, x] %in% unlist(c(breaks_s[[i]])))) >
1) {
if (i < 10) {
split_ind = which(dat[, x] %in% unlist(c(breaks_s[[i]])))
if (length(split_ind) > 0) {
if(use_NA){
dat[split_ind, x] = ifelse(bins_no, paste(paste0("0", i),
paste(breaks_s[[i]], collapse = ";"),
sep = "."), paste(breaks_s[[i]],
collapse = ";"))
dat[which(is.na(dat[,x])), x] = ifelse(bins_no,"00.NA",".NA")
}else{
dat[split_ind, x] = ifelse(is.na(dat[split_ind,x]), NA, ifelse(bins_no, paste(paste0("0", i),
paste(breaks_s[[i]], collapse = ";"),
sep = "."), paste(breaks_s[[i]],
collapse = ";")))
}
}
}else{
split_ind = which(dat[, x] %in% unlist(breaks_s[[i]]))
if (length(split_ind) > 0) {
if(use_NA){
dat[split_ind, x] = ifelse(bins_no, paste(ifelse(is.na(dat[split_ind, x]), "00",
paste0(i)), paste(breaks_s[[i]],
collapse = ";"),
sep = "."),
paste(breaks_s[[i]], collapse = ";"))
dat[which(is.na(dat[,x])), x] = ifelse(bins_no,"00.NA",".NA")
}else{
dat[split_ind, x] = ifelse(bins_no, paste(ifelse(is.na(dat[split_ind, x]), "00",
paste0(i)), paste(breaks_s[[i]],
collapse = ";"),
sep = "."),
paste(breaks_s[[i]], collapse = ";"))
}
}
}
}
}
}
bins = dat[,x]
if(!is.null(labels)){
if(length(breaks_s) == length(labels)){
if(use_NA){
if(any(bins == "00.NA")){
levels = sort(names(table(bins)))
labels = c("00.NA",labels)
}else{
if(any(bins == ".NA")){
levels = sort(names(table(bins)))
labels = c(labels,".NA")
}else{
levels = sort(names(table(bins)))
}
}
}else{
levels = sort(names(table(bins,useNA = "no")))
bins[which(bins == "00.NA"|bins == ".NA")] = NA
}
if(length(levels) == length(labels)){
bins = factor(bins,labels = labels )
}else{
stop(paste("'levels' is not right: length", length(levels) ,"should be", length(labels)))
}
}else{
stop(paste("'labels' is not right: length", length(labels) ,"should be", length(breaks) -1))
}
}
}
}
if(as_factor){
bins = as.factor(bins)
}else{
bins = as.character(bins)
}
return(bins)
options(opt)
}
#' Generates Best Breaks for Binning
#'
#' \code{get_breaks} is for generating optimal binning for numerical and nominal variables.
#' The \code{get_breaks_all} is a simpler wrapper for \code{get_breaks}.
#' @param dat A data frame with x and target.
#' @param target The name of target variable.
#' @param sp_values A list of missing values.
#' @param x_list A list of x variables.
#' @param ex_cols A list of excluded variables. Default is NULL.
#' @param pos_flag The value of positive class of target variable, default: "1".
#' @param occur_time The name of the variable that represents the time at which each observation takes place.
#' @param oot_pct Percentage of observations retained for overtime test (especially to calculate PSI). Defualt is 0.7
#' @param best Logical, if TRUE, merge initial breaks to get optimal breaks for binning.
#' @param equal_bins Logical, if TRUE, equal sample size initial breaks generates.If FALSE , tree breaks generates using desison tree.
#' @param cut_bin A string, if equal_bins is TRUE, 'equal_depth' or 'equal_width', default is 'equal_depth'.
#' @param g Integer, number of initial bins for equal_bins.
#' @param tree_control the list of tree parameters.
#' \itemize{
#' \item \code{p} the minimum percent of observations in any terminal <leaf> node. 0 < p< 1; 0.01 to 0.1 usually work.
#' \item \code{cp} complexity parameter. the larger, the more conservative the algorithm will be. 0 < cp< 1 ; 0.0001 to 0.0000001 usually work.
#' \item \code{xval} number of cross-validations.Default: 5
#' \item \code{max_depth} maximum depth of a tree. Default: 10
#' }
#' @param bins_control the list of parameters.
#' \itemize{
#' \item \code{bins_num} The maximum number of bins. 5 to 10 usually work. Default: 10
#' \item \code{bins_pct} The minimum percent of observations in any bins. 0 < bins_pct < 1 , 0.01 to 0.1 usually work. Default: 0.02
#' \item \code{b_chi} The minimum threshold of chi-square merge. 0 < b_chi< 1; 0.01 to 0.1 usually work. Default: 0.02
#' \item \code{b_odds} The minimum threshold of odds merge. 0 < b_odds < 1; 0.05 to 0.2 usually work. Default: 0.1
#' \item \code{b_psi} The maximum threshold of PSI in any bins. 0 < b_psi < 1 ; 0 to 0.1 usually work. Default: 0.05
#' \item \code{b_or} The maximum threshold of G/B index in any bins. 0 < b_or < 1 ; 0.05 to 0.3 usually work. Default: 0.15
#' \item \code{odds_psi} The maximum threshold of Training and Testing G/B index PSI in any bins. 0 < odds_psi < 1 ; 0.01 to 0.3 usually work. Default: 0.1
#' \item \code{mono} Monotonicity of all bins, the larger, the more nonmonotonic the bins will be. 0 < mono < 0.5 ; 0.2 to 0.4 usually work. Default: 0.2
#' \item \code{kc} number of cross-validations. 1 to 5 usually work. Default: 1
#' }
#' @param parallel Logical, parallel computing or not. Default is FALSE.
#' @param save_data Logical, save results in locally specified folder. Default is TRUE
#' @param file_name File name that save results in locally specified folder. Default is "breaks_list".
#' @param dir_path Path to save results. Default is "./variable"
#' @param note Logical.Outputs info.Default is TRUE.
#' @param ... Additional parameters.
#'
#' @return A table containing a list of splitting points for each independent variable.
#' @seealso \code{\link{get_tree_breaks}}, \code{\link{cut_equal}}, \code{\link{select_best_class}}, \code{\link{select_best_breaks}}
#' @examples
#' #controls
#' tree_control = list(p = 0.02, cp = 0.000001, xval = 5, maxdepth = 10)
#' bins_control = list(bins_num = 10, bins_pct = 0.02, b_chi = 0.02, b_odds = 0.1,
#' b_psi = 0.05, b_or = 15, mono = 0.2, odds_psi = 0.1, kc = 5)
#' # get categrory variable breaks
#' b = get_breaks(dat = UCICreditCard[1:1000,], x = "MARRIAGE",
#' target = "default.payment.next.month",
#' occur_time = "apply_date",
#' sp_values = list(-1, "missing"),
#' tree_control = tree_control, bins_control = bins_control)
#' # get numeric variable breaks
#' b2 = get_breaks(dat = UCICreditCard[1:1000,], x = "PAY_2",
#' target = "default.payment.next.month",
#' occur_time = "apply_date",
#' sp_values = list(-1, "missing"),
#' tree_control = tree_control, bins_control = bins_control)
#' # get breaks of all predictive variables
#' b3 = get_breaks_all(dat = UCICreditCard[1:1000,], target = "default.payment.next.month",
#' x_list = c("MARRIAGE","PAY_2"),
#' occur_time = "apply_date", ex_cols = "ID",
#' sp_values = list(-1, "missing"),
#' tree_control = tree_control, bins_control = bins_control,
#' save_data = FALSE)
#'
#' @export
get_breaks_all = function(dat, target = NULL, x_list = NULL, ex_cols = NULL,
pos_flag = NULL, occur_time = NULL, oot_pct = 0.7,
best = TRUE, equal_bins = FALSE,
cut_bin = 'equal_depth', g = 10, sp_values = NULL,
tree_control = list(p = 0.05, cp = 0.000001, xval = 5, maxdepth = 10),
bins_control = list(bins_num = 10, bins_pct = 0.05,
b_chi = 0.05, b_odds = 0.1,
b_psi = 0.05, b_or = 0.15, mono = 0.3,
odds_psi = 0.2, kc = 1),
parallel = FALSE, note = FALSE, save_data = FALSE,
file_name = NULL, dir_path = tempdir(), ...) {
opt = options(scipen = 200, stringsAsFactors = FALSE)
dat = checking_data(dat = dat, target = target, pos_flag = pos_flag)
if (note) {
cat_line("-- Getting optimal binning breaks", col = love_color("deep_green"))
}
if (is.null(x_list)) {
x_list = get_names(dat = dat, types = c('factor', 'character', 'numeric', 'integer', 'double'),
ex_cols = c(target, occur_time, ex_cols), get_ex = FALSE)
}
breaks_list = loop_function(func = get_breaks,
x_list = x_list,
args = list(dat = dat, target = target,
occur_time = occur_time, oot_pct = oot_pct,
pos_flag = pos_flag, best = best,
tree_control = tree_control,
sp_values = sp_values, equal_bins = equal_bins,
g = g, bins_control = bins_control, note = note),
as_list = TRUE, bind = "cbind", parallel = parallel)
breaks_list1 = lapply(1:length(breaks_list), function(i) data.frame(Feature = names(breaks_list)[i],
cuts = t(data.frame(t(breaks_list[[i]]))), row.names = NULL))
breaks_list2 = as.data.frame(rbindlist(breaks_list1))
if (save_data) {
dir_path = ifelse(!is.character(dir_path),
tempdir(), dir_path)
if (!dir.exists(dir_path)) dir.create(dir_path)
if (!is.character(file_name)) file_name = NULL
save_data(breaks_list2, file_name = ifelse(is.null(file_name), "breaks_list", paste(file_name, "breaks_list", sep = ".")),
dir_path = dir_path, note = note, as_list = FALSE)
}
options(opt) # reset
return(breaks_list2)
}
#' @param x The Name of an independent variable.
#' @rdname get_breaks_all
#' @export
get_breaks = function(dat, x, target = NULL, pos_flag = NULL,
best = TRUE, equal_bins = FALSE, cut_bin = 'equal_depth', g = 10,
sp_values = NULL, occur_time = NULL, oot_pct = 0.7,
tree_control = NULL, bins_control = NULL, note = FALSE, ...) {
dat = checking_data(dat = dat, target = target, pos_flag = pos_flag, occur_time = occur_time)
if (equal_bins | is.null(target)) {
tree_breaks = cut_equal(dat_x = dat[, x], g = g, sp_values = sp_values, cut_bin = cut_bin)
} else {
tree_breaks = get_tree_breaks(dat = dat, x = x, target = target, pos_flag = pos_flag,
tree_control = tree_control, sp_values = sp_values)
}
if (!is.null(target) && best) {
if (any(c("integer", "numeric", "double") == class(dat[, x]))) {
breaks = select_best_breaks(dat = dat, x = x, target = target,
occur_time = occur_time, oot_pct = oot_pct,
breaks = tree_breaks, pos_flag = pos_flag,
sp_values = sp_values,
bins_control = bins_control)
} else {
if (any(c("Date") == class(dat[, x]))) {
breaks = tree_breaks
}else{
breaks = select_best_class(dat = dat, x = x, target = target,
occur_time = occur_time, oot_pct = oot_pct,
breaks = tree_breaks, pos_flag = pos_flag,
sp_values = sp_values,
bins_control = bins_control)
}
}
} else {
breaks = tree_breaks
}
if (note) cat_bullet(paste0(format(x), ": ", paste(breaks, sep = ",", collapse = ",")), col = "darkgrey")
return(breaks)
}
#' Getting the breaks for terminal nodes from decision tree
#'
#' \code{get_tree_breaks} is for generating initial braks by decision tree for a numerical or nominal variable.
#' The \code{get_breaks} function is a simpler wrapper for \code{get_tree_breaks}.
#' @param dat A data frame with x and target.
#' @param x name of variable to cut breaks by tree.
#' @param target The name of target variable.
#' @param sp_values A list of special value. Default: NULL.
#' @param pos_flag The value of positive class of target variable, default: "1".
#' @param tree_control the list of parameters to control cutting initial breaks by decision tree.
#' \itemize{
#' \item \code{p} the minimum percent of observations in any terminal <leaf> node. 0 < p< 1; 0.01 to 0.1 usually work.
#' \item \code{cp} complexity parameter. the larger, the more conservative the algorithm will be. 0 < cp< 1 ; 0.0001 to 0.0000001 usually work.
#' \item \code{xval} number of cross-validations.Default: 5
#' \item \code{max_depth} maximum depth of a tree. Default: 10
#' }
#' @seealso \code{\link{get_breaks}}, \code{\link{get_breaks_all}}
#' @examples
#' #tree breaks
#' tree_control = list(p = 0.02, cp = 0.000001, xval = 5, maxdepth = 10)
#' tree_breaks = get_tree_breaks(dat = UCICreditCard, x = "MARRIAGE",
#' target = "default.payment.next.month", tree_control = tree_control)
#' @importFrom rpart rpart rpart.control path.rpart
#' @export
get_tree_breaks = function(dat, x, target, pos_flag = NULL,
tree_control = list(p = 0.02, cp = 0.000001, xval = 5, maxdepth = 10),
sp_values = NULL) {
dat = checking_data(dat = dat, target = target, pos_flag = pos_flag)
digits_x = ifelse(is.numeric(dat[, x]), digits_num(dat[, x]), 4)
opt = options(scipen = 200, stringsAsFactors = FALSE, digits = digits_x + 1) #
sp_value_char = sp_value_num = tree_breaks = NULL
x_miss = any(dat[, x] %in% sp_values)
if (!is.null(sp_values) && x_miss) {
sp_value_char = unlist(sp_values[sapply(sp_values, is.character)])
sp_value_num = unlist(sp_values[sapply(sp_values, is.numeric)])
dat1 = dat[!(dat[, x] %in% sp_values),]
} else {
dat1 = dat
}
if (length(unique(dat1[, x])) > 1) {
cp = ifelse(!is.null(tree_control[["cp"]]), tree_control[["cp"]], 0.000001)
xval = ifelse(!is.null(tree_control[["xval"]]), tree_control[["xval"]], 5)
maxdepth = ifelse(!is.null(tree_control[["maxdepth"]]), tree_control[["maxdepth"]], 10)
p = ifelse(!is.null(tree_control[["p"]]), tree_control[["p"]], 0.03)
trcontrol = rpart.control(minbucket = round(nrow(dat1) * p),
cp = cp,
xval = xval,
maxdepth = maxdepth)
Formula = as.formula(paste(target, names(dat1[x]), sep = ' ~ '))
set.seed(46)
fit = rpart(data = dat1, formula = Formula
, control = trcontrol
, parms = list(split = "information"))
if (any(is.element(class(dat1[, x]), c("integer", "numeric", "double")))) {
if (is.null(fit$splits[, 4])) {
tree_breaks = cut_equal(dat1[, x], g = 20, sp_values = sp_values)
} else {
tree_breaks = sort(fit$splits[, 4])
tree_breaks = floor(tree_breaks * 10 ^ (digits_x)) / 10 ^ (digits_x)
}
if (!is.null(sp_values) && x_miss && length(sp_value_num) > 0) {
tree_breaks = sort(unique(append(c(tree_breaks, Inf), sp_value_num, 0)))
}
} else {
if (is.null(fit$splits[, 4])) {
tree_breaks = cut_equal(dat1[, x], g = 10, sp_values = sp_values)
} else {
rpart.rules = path.rpart(fit,
rownames(fit$frame)[fit$frame$var == "<leaf>"],
print.it = FALSE)
tree_breaks = list()
for (i in 1:length(rpart.rules)) {
tree_breaks[i] = strsplit(gsub(paste0(x, "="), "",
rpart.rules[[i]][length(rpart.rules[[i]])]),
split = ",")
}
}
if (!is.null(sp_values) && x_miss && length(sp_value_char) > 0) {
tree_breaks = unique(append(sp_value_char, tree_breaks, 0))
}
}
} else {
tree_breaks = as.list(unique(dat[, x]))
}
rm(dat1)
return(unique(c(tree_breaks)))
options(opt) # reset
}
#' Generating Initial Equal Size Sample Bins
#'
#' \code{cut_equal} is used to generate initial breaks for equal frequency binning.
#' @param dat_x A vector of an variable x.
#' @param g numeric, number of initial bins for equal_bins.
#' @param cut_bin A string, 'equal_depth' or 'equal_width', default is 'equal_depth'.
#' @param sp_values a list of special value. Default: list(-1, "missing")
#' @seealso \code{\link{get_breaks}}, \code{\link{get_breaks_all}},\code{\link{get_tree_breaks}}
#' @examples
#' #equal sample size breaks
#' equ_breaks = cut_equal(dat = UCICreditCard[, "PAY_AMT2"], g = 10)
#'
#' @export
#' @importFrom stats aggregate approx
cut_equal = function(dat_x, g = 10, sp_values = NULL, cut_bin = 'equal_depth') {
dat_x = unlist(dat_x)
if(any(is.element(class(dat_x), c("integer", "numeric", "double")))){
digits_x = digits_num(dat_x)
}else{
digits_x = 2
}
opt = options(scipen = 200, stringsAsFactors = FALSE, digits = digits_x + 1) #
sp_value_char = sp_value_num = NULL
x_miss = any(dat_x %in% sp_values)
if (!is.null(sp_values) && x_miss) {
sp_value_char = unlist(sp_values[sapply(sp_values, is.character)])
sp_value_num = unlist(sp_values[sapply(sp_values, is.numeric)])
dat_x = dat_x[!(dat_x %in% sp_values)]
}
if (any(is.element(class(dat_x), c("integer", "numeric", "double")))) {
dat_x = round(dat_x, digits_x)
if (length(unique(dat_x)) < 2) {
cuts = unique(dat_x)
} else {
if (cut_bin == 'equal_depth') {
none_na_num = sum(!is.na(dat_x))
if (g < 1) stop("g must be >=1")
tbl_x = table(dat_x)
x_unique_value = as.numeric(names(tbl_x))
cum_sum = sort(cumsum(tbl_x))
cuts_sum = approx(cum_sum, x_unique_value, xout = (1:g) * round(none_na_num / g),
method = "constant", ties = "ordered", rule = 2, f = 1)$y
n_cuts = table(cuts_sum)
max_n_cuts = as.numeric(names(n_cuts)[which.max(n_cuts)])
cuts_unique = unique(cuts_sum)
cuts = cuts_unique[-which.max(cuts_unique)]
if (length(cuts_unique) <= 2 & length(x_unique_value) > 1) {
x_unique_ncuts = x_unique_value[which(x_unique_value != max_n_cuts)]
min_x_unique_value = x_unique_ncuts[which.min(x_unique_ncuts)]
cuts = append(cuts, min_x_unique_value, 1)
}
} else {
labs = levels(cut(dat_x, breaks = g, include.lowest = FALSE, right = TRUE, dig.lab = digits_x))
cuts = as.numeric(sub("\\((.+),.*", "\\1", labs))[-1]
}
}
cuts = round(cuts, digits = digits_x)
if (!is.null(sp_values) && x_miss && length(sp_value_num) > 0) {
cuts = sort(unique(append(c(cuts, Inf), sp_value_num, 0)))
} else {
cuts = sort(unique(c(cuts, Inf)))
}
} else {
if(any(is.element(class(dat_x), c("Date")))){
if (cut_bin == 'equal_depth') {
cuts = c()
g = 9
p = 1/g*1:(g-1)
for(i in 1:(g-1)){
cuts[i] = date_cut(dat_time = dat_x,pct = p[i],g = g)
}
cuts = as.Date(cuts)
}else{
labs = levels(cut(dat_x, breaks = g, include.lowest = FALSE, right = TRUE, dig.lab = digits_x))
cuts = sub("\\((.+),.*", "\\1", labs)[-1]
}
}else{
if (length(unique(dat_x)) < 2) {
cuts = unique(dat_x)
} else {
cuts = as.list(names(table(dat_x)))
}
if (!is.null(sp_values) && x_miss && length(sp_value_char) > 0) {
cuts = unique(append(sp_value_char, cuts, 0))
} else {
cuts = unique(cuts)
}
}
}
rm(dat_x)
options(opt) # reset
return(unique(c(cuts)))
}
#' Generates Best Binning Breaks
#'
#'
#' \code{select_best_class} & \code{select_best_breaks} are for merging initial breaks of variables using chi-square, odds-ratio,PSI,G/B index and so on.
#' The \code{get_breaks} is a simpler wrapper for \code{select_best_class} & \code{select_best_class}.
#' @param dat A data frame with x and target.
#' @param target The name of target variable.
#' @param breaks Splitting points for an independent variable. Default is NULL.
#' @param sp_values A list of special value.
#' @param x The name of variable to process.
#' @param pos_flag The value of positive class of target variable, default: "1".
#' @param occur_time The name of the variable that represents the time at which each observation takes place.
#' @param oot_pct The percentage of Actual and Expected set for PSI calculating.
#' @param bins_control the list of parameters.
#' \itemize{
#' \item \code{bins_num} The maximum number of bins. 5 to 10 usually work. Default: 10
#' \item \code{bins_pct} The minimum percent of observations in any bins. 0 < bins_pct < 1 , 0.01 to 0.1 usually work. Default: 0.02.
#' \item \code{b_chi} The minimum threshold of chi-square merge. 0 < b_chi< 1; 0.01 to 0.1 usually work. Default: 0.02.
#' \item \code{b_odds} The minimum threshold of odds merge. 0 < b_odds < 1; 0.05 to 0.2 usually work. Default: 0.1.
#' \item \code{b_psi} The maximum threshold of PSI in any bins. 0 < b_psi < 1 ; 0 to 0.1 usually work. Default: 0.05.
#' \item \code{b_or} The maximum threshold of G/B index in any bins. 0 < b_or < 1 ; 0.05 to 0.3 usually work. Default: 0.15.
#' \item \code{odds_psi} The maximum threshold of Training and Testing G/B index PSI in any bins. 0 < odds_psi < 1 ; 0.01 to 0.3 usually work. Default: 0.1.
#' \item \code{mono} Monotonicity of all bins, the larger, the more nonmonotonic the bins will be. 0 < mono < 0.5 ; 0.2 to 0.4 usually work. Default: 0.2.
#' \item \code{kc} number of cross-validations. 1 to 5 usually work. Default: 1.
#' }
#' @param ... Other parameters.
#' @return A list of breaks for x.
#' @details
#' The folloiwing is the list of Reference Principles
#' \itemize{
#' \item 1.The increasing or decreasing trend of variables is consistent with the actual business experience.(The percent of Non-monotonic intervals of which are not head or tail is less than 0.35)
#' \item 2.Maximum 10 intervals for a single variable.
#' \item 3.Each interval should cover more than 2% of the model development samples.
#' \item 4.Each interval needs at least 30 or 1% positive samples. .
#' \item 5.Combining the values of blank, missing or other special value into the same interval called missing.
#' \item 6.The difference of Chi effect size between intervals should be at least 0.02 or more.
#' \item 7.The difference of absolute odds ratio between intervals should be at least 0.1 or more.
#' \item 8.The difference of positive rate between intervals should be at least 1/10 of the total positive rate.
#' \item 9.The difference of G/B index between intervals should be at least 15 or more.
#' \item 10.The PSI of each interval should be less than 0.1.
#' }
#' @seealso
#' \code{\link{get_tree_breaks}},
#' \code{\link{cut_equal}},
#' \code{\link{get_breaks}}
#' @examples
#' #equal sample size breaks
#' equ_breaks = cut_equal(dat = UCICreditCard[, "PAY_AMT2"], g = 10)
#'
#' # select best bins
#' bins_control = list(bins_num = 10, bins_pct = 0.02, b_chi = 0.02,
#' b_odds = 0.1, b_psi = 0.05, b_or = 0.15, mono = 0.3, odds_psi = 0.1, kc = 1)
#' select_best_breaks(dat = UCICreditCard, x = "PAY_AMT2", breaks = equ_breaks,
#' target = "default.payment.next.month", occur_time = "apply_date",
#' sp_values = NULL, bins_control = bins_control)
#' @export
select_best_class = function(dat, x, target, breaks = NULL, occur_time = NULL, oot_pct = 0.7,
pos_flag = NULL, bins_control = NULL, sp_values = NULL, ...) {
dat = checking_data(dat = dat, target = target, pos_flag = pos_flag)
if (is.null(breaks) || any(is.na(breaks)) || length(breaks) < 1) {
stop("breaks is missing.\n")
}
if (!is.character(dat[, x])) {
stop(paste(x, "must be a character.\n"))
}
if (length(breaks) > 2) {
break_class = sp_value_char = NULL
x_miss = any(dat[, x] %in% sp_values)
if (!is.null(sp_values) && x_miss) {
sp_value_char = unlist(sp_values[sapply(sp_values, is.character)])
miss_class = unlist(breaks[sapply(breaks, function(x) any(sp_value_char %in% x))])
non_miss_class = breaks[!sapply(breaks, function(x) any(sp_value_char %in% x))]
breaks = unique(non_miss_class)
dat = dat[dat[, x] %in% unlist(breaks),]
} else {
breaks = unique(breaks)
}
b_chi = ifelse(!is.null(bins_control[["b_chi"]]), bins_control[["b_chi"]], 0.02) / 2
b_odds = ifelse(!is.null(bins_control[["b_odds"]]), bins_control[["b_odds"]], 0.05) / 2
bins_num = ifelse(!is.null(bins_control[["bins_num"]]), bins_control[["bins_num"]], 10)
bins_pct = ifelse(!is.null(bins_control[["bins_pct"]]), bins_control[["bins_pct"]], 0.02)
b_psi = ifelse(!is.null(bins_control[["b_psi"]]), bins_control[["b_psi"]], 0.1) * 1.5
b_or = ifelse(!is.null(bins_control[["b_or"]]), bins_control[["b_or"]], 0.1) / 2
odds_psi = ifelse(!is.null(bins_control[["odds_psi"]]), bins_control[["odds_psi"]], 0.2) * 1.5
kc = ifelse(!is.null(bins_control[["kc"]]), bins_control[["kc"]], 1)
if (!is.null(kc) && kc > 1) {
cv_list = cv_split(dat, k = kc, occur_time = occur_time, seed = 46)
} else {
cv_list = cv_split(dat, k = 1 / (1 - oot_pct), occur_time = occur_time, seed = 46)
kc = 1
}
breaks_cv = iv_list_sub = psi_list_sub = list()
for (k in 1:kc) {
dat_train = dat[-cv_list[[k]],]
dat_test = dat[cv_list[[k]],]
break_class = breaks
dt_bins = NULL
while (TRUE) {
if (length(unique(break_class)) <= 1 | length(dat_train) < 1 | length(dat_test) < 1) break
dt_bins = get_psi_iv(dat = dat_train, x = x, dat_test = dat_test, target = target,
pos_flag = pos_flag, breaks = break_class, breaks_list = NULL, occur_time = occur_time,
oot_pct = oot_pct, bins_total = FALSE, note = FALSE, bins_no = TRUE)
gb = dt_bins[, c("expected_0", "expected_1")]
cut_psi = dt_bins[, "PSIi"]
bins_odds_ratio_s = dt_bins[, "odds_ratio_s"]
gb_index = dt_bins[, "odds_ratio"]
gb_percent = dt_bins[, "%expected"]
dif_gb = c()
for (brk in 1:(dim(gb)[1] - 1)) {
cross_table = rbind(gb[brk,], gb[brk + 1,])
a = cross_table[1, 1]
b = cross_table[1, 2]
c = cross_table[2, 1]
d = cross_table[2, 2]
if (any(is.na(cross_table)) | any(cross_table < 30) |
any(cross_table[, 2] / (cross_table[, 1] + cross_table[, 2]) < 0.01)) {
dif_gb[brk] = 0
} else {
dif_gb[brk] = gb_index[brk] - gb_index[brk + 1]
}
}
if (length(break_class) <= 2) {
break
} else {
if (any(abs(dif_gb) < b_or)) {
if (which.min(abs(dif_gb)) < length(break_class)) {
break_class[[which.min(abs(dif_gb)) + 1]] = append(unlist(break_class[which.min(abs(dif_gb))]),
unlist(break_class[which.min(abs(dif_gb)) + 1]), 0)
} else {
break_class[[which.min(abs(dif_gb)) - 1]] = append(unlist(break_class[which.min(abs(dif_gb))]),
unlist(break_class[which.min(abs(dif_gb)) - 1]), 0)
}
break_class = break_class[-which.min(abs(dif_gb))]
} else {
if (length(break_class) > bins_num) {
if (which.min(abs(dif_gb)) < length(break_class)) {
break_class[[which.min(abs(dif_gb)) + 1]] = append(unlist(break_class[which.min(abs(dif_gb))]),
unlist(break_class[which.min(abs(dif_gb)) + 1]), 0)
} else {
break_class[[which.min(abs(dif_gb)) - 1]] = append(unlist(break_class[which.min(abs(dif_gb))]),
unlist(break_class[which.min(abs(dif_gb)) - 1]), 0)
}
break_class = break_class[-which.min(abs(dif_gb))]
} else {
if (length(gb_percent) > 2 & any(gb_percent < bins_pct)) {
bins_pct_cut = dt_bins[, "cuts"][[which.min(gb_percent)]]
max_psi_bin = dt_bins[, "bins"][[which.min(gb_percent)]]
min_pct_m = which(dt_bins[, "bins"] %islike% max_psi_bin)
if (length(min_pct_m) > 0 ){
if ( min_pct_m < length(break_class)) {
break_class[[min_pct_m + 1]] = append(bins_pct_cut,
break_class[[min_pct_m + 1]], 0)
} else {
break_class[[min_pct_m - 1]] = append(bins_pct_cut,
break_class[[min_pct_m - 1]], 0)
}
break_class = break_class[-min_pct_m]
}
} else {
if (length(cut_psi) > 2 & any(cut_psi > b_psi)) {
max_psi_cut = dt_bins[, "cuts"][[which.max(cut_psi)]]
max_psi_bin = dt_bins[, "bins"][[which.max(cut_psi)]]
max_psi_m = which(dt_bins[, "bins"] %islike% max_psi_bin)
if(length(max_psi_m) > 0 ){
if ( max_psi_m < length(break_class)) {
break_class[[max_psi_m + 1]] = append(max_psi_cut,
break_class[[max_psi_m + 1]], 0)
} else {
break_class[[max_psi_m - 1]] = append(max_psi_cut,
break_class[[max_psi_m - 1]], 0)
}
break_class = break_class[-max_psi_m]
}
} else {
if (length(bins_odds_ratio_s) > 2 & any(bins_odds_ratio_s > odds_psi)) {
max_odds_psi_cuts = dt_bins[, "cuts"][[which.max(bins_odds_ratio_s)]]
max_odds_psi_bin = dt_bins[, "bins"][[which.max(bins_odds_ratio_s)]]
max_odds_psi_m = which(unlist(dt_bins[, "bins"]) %islike% max_odds_psi_bin)
if (length(max_odds_psi_m) > 0){
if (max_odds_psi_m < length(break_class)) {
break_class[[max_odds_psi_m + 1]] = append(max_odds_psi_cuts,
break_class[[max_odds_psi_m + 1]], 0)
} else {
break_class[[max_odds_psi_m - 1]] = append(max_odds_psi_cuts,
break_class[[max_odds_psi_m - 1]], 0)
}
break_class = break_class[-max_odds_psi_m]
}
} else {
break
}
}
}
}
}
}
}
dt_iv = get_iv(dat = dat_test, x = x, target = target, pos_flag = pos_flag,
breaks = break_class, note = FALSE)
iv_list_sub[[k]] = unlist(dt_iv[, "IV"])
breaks_cv[[k]] = break_class
}
max_iv = ifelse(max(unlist(iv_list_sub)) == 0, unlist(iv_list_sub), unlist(iv_list_sub) / max(unlist(iv_list_sub)))
best_ind = which.max(max_iv)
best_class = unique(breaks_cv[[best_ind]])
if (length(best_ind) > 0) {
best_class = unique(breaks_cv[[best_ind]])
} else {
best_class = unique(unlist(breaks_cv))
}
if (!is.null(sp_values) && x_miss && length(miss_class) > 0) {
best_class = unique(append(c(best_class), miss_class, 0))
}
} else {
best_class = breaks
}
return(best_class)
}
#' @rdname select_best_class
#' @export
select_best_breaks = function(dat, x, target, breaks = NULL, pos_flag = NULL,
sp_values = NULL, occur_time = NULL, oot_pct = 0.7,
bins_control = NULL, ...) {
opt = options(scipen = 200, stringsAsFactors = FALSE, digits = 6) #
dat = checking_data(dat = dat, target = target, pos_flag = pos_flag)
if (is.null(breaks) || any(is.na(breaks)) || length(breaks) < 1) {
stop("breaks is missing")
}
if (!any(c("integer", "numeric", "double") == class(dat[, x]))) {
stop("x must be numeric.")
}
if (length(breaks) > 2) {
break_points = sp_value_num = NULL
x_miss = any(dat[, x] %in% sp_values)
if (!is.null(sp_values) & x_miss) {
sp_value_num = unlist(sp_values[sapply(sp_values, is.numeric)])
miss_num = unlist(breaks[sapply(breaks, function(x) any(sp_value_num %in% x))])
breaks = breaks[!sapply(breaks, function(x) any(sp_value_num %in% x))]
dat = dat[!(dat[, x] %in% miss_num | is.na(dat[, x])),]
}
b_chi = ifelse(!is.null(bins_control[["b_chi"]]), bins_control[["b_chi"]], 0.01)
b_odds = ifelse(!is.null(bins_control[["b_odds"]]), bins_control[["b_odds"]], 0.05)
bins_num = ifelse(!is.null(bins_control[["bins_num"]]), bins_control[["bins_num"]], 10)
bins_pct = ifelse(!is.null(bins_control[["bins_pct"]]), bins_control[["bins_pct"]], 0.02)
b_psi = ifelse(!is.null(bins_control[["b_psi"]]), bins_control[["b_psi"]], 0.1)
b_or = ifelse(!is.null(bins_control[["b_or"]]), bins_control[["b_or"]], 0.1)
odds_psi = ifelse(!is.null(bins_control[["odds_psi"]]), bins_control[["odds_psi"]], 0.3)
kc = ifelse(!is.null(bins_control[["kc"]]), bins_control[["kc"]], 1)
mono = ifelse(!is.null(bins_control[["mono"]]), bins_control[["mono"]], 0.3)
# kc = 1
if (!is.null(kc) && kc > 1) {
cv_list = cv_split(dat, k = kc, occur_time = occur_time, seed = 46)
} else {
cv_list = cv_split(dat, k = 1 / (1 - oot_pct), occur_time = occur_time, seed = 46)
kc = 1
}
breaks_cv = iv_list_sub = psi_list_sub = list()
for (k in 1:kc) {
dat_train = dat[-cv_list[[k]],]
dat_test = dat[cv_list[[k]],]
break_points = unique(unlist(c(breaks, Inf)))
dt_bins = NULL
while (TRUE) {
if (length(unique(break_points)) <= 2 | length(dat_train) < 1 | length(dat_test) < 1) break
dt_bins = get_psi_iv(dat = dat_train, dat_test = dat_test, x = x, target = target,
pos_flag = pos_flag, breaks = break_points, breaks_list = NULL,
occur_time = occur_time, oot_pct = oot_pct,
bins_total = FALSE, note = FALSE, bins_no = TRUE)
gb = dt_bins[, c("expected_0", "expected_1")]
cut_psi = dt_bins[, "PSIi"]
bins_odds_ratio_s = dt_bins[, "odds_ratio_s"]
gb_index = dt_bins[, "odds_ratio"]
gb_percent = dt_bins[, "%expected"]
effect_sz = odds = dif_gb = c()
for (brk in 1:(dim(gb)[1] - 1)) {
cross_table = rbind(gb[brk,], gb[brk + 1,])
a = cross_table[1, 1]
b = cross_table[1, 2]
c = cross_table[2, 1]
d = cross_table[2, 2]
if (any(is.na(cross_table)) | any(cross_table < 30) | any(cross_table[, 2] / (cross_table[, 1] + cross_table[, 2]) < 0.01)) {
effect_sz[brk] = 0
odds[brk] = 1
dif_gb[brk] = 0
} else {
effect_sz[brk] = ((a * d - b * c) / 10000) / ((sqrt((a + b)) / 10000) * sqrt((c + d)) * sqrt((a + c)) * sqrt((b + d)))
odds[brk] = (a * d / 10000) / (b * c / 10000)
dif_gb[brk] = gb_index[brk] - gb_index[brk + 1]
}
}
gb_single = non_mono_break = gb_single_break = gb_single_pct = c()
if (length(gb_index) > 3) {
for (i in 2:(length(gb_index) - 1)) {
gb_single[i] = (gb_index[i] > gb_index[i - 1] &
gb_index[i] > gb_index[i + 1]) | (gb_index[i] < gb_index[i - 1] &
gb_index[i] < gb_index[i + 1])
}
gb_single_break = which(gb_single)
gb_single_pct = length(gb_single_break) / (length(gb_index))
if (length(gb_single_break) > 0 && gb_single_pct > mono) {
non_mono_break = sort(gb_single_break)
# non_mono_break = gb_single_break
}
}
if (length(break_points) <= 2) {
break
} else {
if (any(abs(effect_sz) < b_chi)) {
min_chi_bin = which.min(abs(effect_sz))
break_points = break_points[-c(min_chi_bin)]
} else {
if (any(abs(odds - 1) < b_odds)) {
min_odds_bin = which.min(abs(odds - 1))
break_points = break_points[-c(min_odds_bin)]
} else {
if (any(abs(dif_gb) < b_or)) {
min_gb_bin = which.min(abs(dif_gb))
break_points = break_points[-min_gb_bin]
} else {
if (length(non_mono_break) > 0) {
min_bins_gb = min(abs(dif_gb[non_mono_break]))[1]
min_gb_bin = which(abs(dif_gb) == min_bins_gb)
break_points = break_points[-min_gb_bin]
} else {
if (length(break_points) > bins_num) {
min_gb_bin = which.min(abs(dif_gb))
break_points = break_points[-min_gb_bin]
} else {
if (length(gb_percent) > 2 & any(gb_percent < bins_pct)) {
bins_pct_bin = which.min(gb_percent)
if (length(bins_pct_bin) > 0 &&
bins_pct_bin == length(gb_percent)) {
break_points = break_points[-c(bins_pct_bin - 1)]
} else {
break_points = break_points[-bins_pct_bin]
}
} else {
if (length(cut_psi) > 2 & any(cut_psi > b_psi)) {
max_psi_bin = which.max(cut_psi)
if (length(max_psi_bin) > 0 &&
max_psi_bin == length(cut_psi)) {
break_points = break_points[-c(max_psi_bin - 1)]
} else {
break_points = break_points[-max_psi_bin]
}
} else {
if (length(bins_odds_ratio_s) > 2 & any(bins_odds_ratio_s > odds_psi)) {
max_odds_psi_bin = which.max(bins_odds_ratio_s)
if (length(max_odds_psi_bin) > 0 &&
max_odds_psi_bin == length(bins_odds_ratio_s)) {
break_points = break_points[-c(max_odds_psi_bin - 1)]
} else {
break_points = break_points[-max_odds_psi_bin]
}
} else {
break
}
}
}
}
}
}
}
}
}
}
dt_iv = get_iv(dat = dat_test, x = x, target = target, pos_flag = pos_flag,
breaks = break_points, note = FALSE)
iv_list_sub[[k]] = unlist(dt_iv[, "IV"])
breaks_cv[[k]] = break_points
}
max_iv = ifelse(max(unlist(iv_list_sub)) == 0, unlist(iv_list_sub), unlist(iv_list_sub) / max(unlist(iv_list_sub)))
best_ind = which.max(max_iv)
best_class = unique(breaks_cv[[best_ind]])
if (length(best_ind) > 0) {
best_breaks = sort(unlist(unique(c(breaks_cv[[best_ind]], Inf))))
} else {
best_breaks = sort(unique(unlist(unique(c(breaks_cv, Inf)))))
}
if (!is.null(sp_values) && x_miss && length(miss_num) > 0) {
best_breaks = sort(unlist(unique(append(best_breaks, miss_num, 0))))
}
} else {
best_breaks = sort(unique(unlist(c(breaks, Inf))))
}
return(best_breaks)
options(opt) # reset
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.