# scBinning
#
# Author : Tri Le <lmtleminh@gmail.com>
#
#' @importFrom magrittr "%>%"
#' @importFrom foreach "%do%" "%dopar%"
NULL
#' Numeric binning for Credit Scoring
#'
#' This is an easy numerical binning solution for credit scorecard build. It is designed to
#' choose the optimal binning solution by utilizing the \href{https://cran.r-project.org/package=partykit}{Recursive Partitioning}.
#' This will only bin numeric or integer variables and ignore factor or character variables.
#'
#' @param data A data frame which contains target varible as well as predictor variables.
#' @param target Target variable name.
#' @param n Number of bootstrap iterations. Default 10 times.
#' @param p The minimum percentage of observation per bin. Default 3\%.
#' @param thres Threshold differences of target between bins. Default 0.5\%.
#' @param freqCut Utilizing \code{\link[caret:nearZeroVar]{nearZeroVar}} function. The cutoff for the ratio of the most common value to the second most common value. Default 95/5.
#' @param uniqueCut Utilizing \code{\link[caret:nearZeroVar]{nearZeroVar}} function. The cutoff for the percentage of distinct values out of the number of total samples. Default 10\%.
#' @param best A logical scalar. Use different methods which maximize IV. Default TRUE.
#' @param parallel A logical scalar. Use parallel backend. Default FALSE. In case of Error with pending connections, run the \code{\link{sc.unregister}} function to close all current connections.
#'
#' @return The output is a list of cut plan which can be applied to the orginal data frame via
#' the \code{\link{predict}} function.
#' The user can also update the cut plan via the \code{\link{update}} function.
#'
#' @examples
#' \dontrun{
#' # Load library
#' library(easysc)
#'
#' # Generate a cut plan which maximize IV via 500 bootstrap resampling
#' cut.plan <- sc.binning(data = df, target = BAD, n = 500, p = 5, best = TRUE, parallel = TRUE)
#' # Update the cut plan
#' update(cut.plan, AGE = c(20, 30, 40))
#' # Apply to the data frame
#' predict(cut.plan, df, keepTarget = TRUE)
#' }
#woe calculation
woeZ <- function(X, y) {
df <- tibble::tibble(X, yZ = y) %>%
dplyr::group_by(X) %>%
dplyr::summarise(
pct_bad = sum(yZ)/sum(y),
pct_good = sum(1-yZ)/sum(1-y),
woe = log(pct_good/pct_bad),
iv = (pct_good - pct_bad)*woe
)
return(sum(df$iv))
}
#spb method
superbin <- function(X, y, n = 10, p = 3, thres = .5, parallel = FALSE) {
data <- data.frame(X, y)
tree_bin <- function(seed, data) {
if (seed == 0) b <- 1:nrow(data)
else {
set.seed(seed)
b <- sample(nrow(data), nrow(data), replace = T)
}
t <- party::ctree(factor(y) ~ ., data = data[b,],
controls = party::ctree_control(maxdepth = 2))
c(t@tree$psplit$splitpoint,
t@tree$left$psplit$splitpoint,
t@tree$right$psplit$splitpoint)
}
seeds <- c(0, round(runif(n) * as.numeric(paste('1e', ceiling(log10(n)) + 2, sep = '')), 0))
x <- c()
if (parallel) {
mc <- parallel::detectCores() - 1
cl <- parallel::makeCluster(mc)
doParallel::registerDoParallel(cl)
foreach::foreach(s = seeds, .combine = 'c') %dopar% {
tree_bin(s, data)
} -> x
parallel::stopCluster(cl)
} else {
foreach::foreach(s = seeds, .combine = 'c') %do% {
tree_bin(s, data)
} -> x
}
m <- table(x) %>% as.matrix()
if (!is.null(m) & length(m) > 1) {
m <- data.frame(x = as.numeric(rownames(m)), Freq = m)
m[['Check']] <- 1
for (i in 1:(nrow(m)-1)) {
if (m[i + 1,'x'] == m[i,'x'] + 1 & m[i + 1, 'Freq'] >= m[i, 'Freq'])
m[i,'Check'] <- 0
if (i >= 2) {
if (m[i - 1,'x'] == m[i,'x'] - 1 & m[i - 1, 'Freq'] >= m[i, 'Freq'])
m[i,'Check'] <- 0
}
}
m <- m[m[['Check']] == 1,]
m[['w']] <- (m[['Freq']] - min(m[['Freq']])) / (max(m[['Freq']]) - min(m[['Freq']]))
m[['w']][is.na(m[['w']])] <- 1
m <- m[m[['w']] >=.2, 'x']
woeC <- woeZ(cut(X, breaks = c(-Inf, m, Inf)), y)
len <- length(m)
findMonotonic <- function(m) {
l1 <- length(m)
tbl <- data %>%
dplyr::mutate(X = cut(X, breaks = c(-Inf, m, Inf))) %>%
dplyr::group_by(X) %>%
dplyr::summarise(
pct_bad = mean(y) * 100,
pct_n = n()/nrow(data) * 100
)
tbl$dif <- c(diff(tbl$pct_bad), 0)
tbl$sign <- sign(tbl$dif)
tbl$m <- c(m, m[length(m)])
tbl$rk <- sign(sum(tbl$sign)) * 1:nrow(tbl)
reg <- isoreg(tbl$rk, tbl$pct_bad)
cut <- knots(as.stepfun(reg))
m <- unique(tbl$m[tbl$rk %in% cut])
l2 <- length(m)
if (l1 == l2) {
m <- unique(tbl$m[abs(tbl$dif) >= thres])
l2 <- length(m)
}
if (l1 == l2) {
m <- unique(tbl$m[tbl$pct_n >= p])
}
return(m)
}
while(length(m) > length(findMonotonic(m))) {
m <- findMonotonic(m)
woeC <- c(woeC, woeZ(cut(X, breaks = c(-Inf, m, Inf)), y))
len <- c(len, length(m))
}
} else {
m <- rownames(m)
}
return(m)
}
#bpb method - based on this post https://statcompute.wordpress.com/2018/11/25/improving-binning-by-bootstrap-bumping/
bump_bin <- function(X, y, n, p, parallel = FALSE) {
n1 <- round(p * length(y), 0)
n2 <- 10
#set.seed(2019)
seeds <- c(0, round(runif(n) * as.numeric(paste('1e', ceiling(log10(n)) + 2, sep = '')), 0))
df1 <- data.frame(X, y)
df2 <- df1[!is.na(df1[, 'X']), c('X', 'y')]
cor <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
### MONOTONIC BINNING WITH BOOTSTRAP SAMPLES ###
bin <- function(seed, df2) {
if (seed == 0) b <- 1:nrow(df2)
else {
set.seed(seed)
b <- sample(nrow(df2), nrow(df2), replace = T)
}
smp <- df2
reg <- isoreg(smp[, 1], cor / abs(cor) * smp[, 2])
cut <- knots(as.stepfun(reg))
df2[['cut']] <- cut(df2[['X']], breaks = unique(cut), include.lowest = T)
df3 <- Reduce(rbind,
lapply(split(df2, df2[['cut']]),
function(x) data.frame(n = nrow(x), b = sum(x[['y']]), g = sum(1 - x[['y']]),
maxx = max(x[['X']]), minx = min(x[['X']]))))
df4 <- df3[which(df3[["n"]] > n1 & df3[["b"]] > n2 & df3[["g"]] > n2), ]
df2[['good']] <- 1 - df2[['y']]
out <- smbinning::smbinning.custom(df2, "good", 'X', cuts = df4[['maxx']][-nrow(df4)])
if (out == "No Bins") return(NULL)
out <- out[['ivtable']]
return(data.frame(iv = out[['IV']][length(out[['IV']])], nbin = nrow(out) - 2,
cuts = I(list(df4[['maxx']][-nrow(df4)])),
abs_cor = abs(cor(as.numeric(row.names(out)[1:(nrow(out) - 2)]),
out[['WoE']][1:(nrow(out) - 2)], method = "spearman"))))
}
if (parallel) {
mc <- parallel::detectCores() - 1
cl <- parallel::makeCluster(mc)
doParallel::registerDoParallel(cl)
foreach::foreach(s = seeds, .combine = 'rbind') %dopar% {
bin(s, df2)
} -> bump_out
parallel::stopCluster(cl)
} else {
foreach::foreach(s = seeds, .combine = 'rbind') %do% {
bin(s, df2)
} -> bump_out
}
#bump_out <- Reduce(rbind, lapply(seeds, bin))
### FIND THE CUT MAXIMIZING THE INFORMATION VALUE ###
if (is.null(bump_out)) return(NULL)
cut2 <- bump_out[order(-bump_out["abs_cor"], -bump_out["iv"], bump_out["nbin"]), ]$cuts[[1]]
return(cut2)
}
#establish threshold
thresd <- function(data, m, thres) {
tbl <- data %>%
dplyr::mutate(X = cut(X, breaks = c(-Inf, m, Inf))) %>%
dplyr::group_by(X) %>%
dplyr::summarise(
pct_bad = mean(y) * 100,
pct_n = n()/nrow(data) * 100
)
tbl$dif <- c(diff(tbl$pct_bad), 0)
tbl$m <- c(m, m[length(m)])
return(unique(tbl$m[abs(tbl$dif) >= thres]))
}
#unregistering foreach backend
#' @export
sc.unregister <- function(parallel = FALSE) {
if (parallel) {
env <- foreach:::.foreachGlobals
rm(list = ls(name = env), pos = env)
}
}
#for numeric binning
#' @export
sc.binning <- function(data, target, n = 10, p = 3, thres = .5, freqCut = 95/5, uniqueCut = 10, best = TRUE, parallel = FALSE) {
start_time <- Sys.time()
target <- deparse(substitute(target))
if (!(target %in% names(data)))
stop(paste0(target, ' is not exist!'))
y <- data[[target]]
data[,target] <- NULL
bestbin <- function(X, y, n = 10, p = 3, thres = .5, name = NULL, best = T, parallel = FALSE) {
print(paste0(name, '...'))
if (!(class(X) %in% c('numeric', 'integer')) | length(unique(X)) <= 4) {
return(NULL)
}
spb <- superbin(X, y, n, p, thres, parallel)
if (!is.null(spb)) attr(spb, 'method') <- 'spb'
finalBin <- spb
if (best) {
smb <- smbinning::smbinning(data.frame(X, y), 'y', 'X', p = p / 100)
#fixing smb give no split
if (length(smb) == 1) {
smb.cuts <- 0
smb.iv <- -1
} else {
smb.cuts <- thresd(data.frame(X, y), smb[['cuts']], thres)
smb.iv <- woeZ(cut(X, breaks = c(-Inf, smb.cuts, Inf)), y)
attr(smb.cuts, 'method') <- 'smb'
}
bpb <- bump_bin(X, y, n, p = p / 100, parallel)
if (is.null(bpb)) {
bpb.iv <- -1
} else {
bpb <- thresd(data.frame(X, y), bpb, thres)
bpb.iv <- woeZ(cut(X, breaks = c(-Inf, bpb, Inf)), y)
attr(bpb, 'method') <- 'bpb'
}
iv <- c(spb = woeZ(cut(X, breaks = c(-Inf, spb, Inf)), y),
smb = smb.iv,
bpb = bpb.iv)
finalBin <- switch(names(which.max(iv)),
spb = spb,
smb = smb.cuts,
bpb = bpb)
}
sc.unregister(parallel)
print(finalBin)
return(finalBin)
}
nzv <- caret::nearZeroVar(data, freqCut = freqCut, uniqueCut = uniqueCut,
foreach = parallel, allowParallel = parallel)
data[, nzv] <- 0
cut_plan <- lapply(names(data), function(x)
bestbin(data[[x]], y, n, p, thres, x, best, parallel)
)
sc.unregister(parallel)
names(cut_plan) <- names(data)
end_time <- Sys.time()
diff = end_time - start_time
print(diff)
attr(cut_plan, 'target') <- target
structure(cut_plan, class = 'cut.plan')
}
#' @method update cut.plan
#' @export
#for manual updating
update.cut.plan <- function(cut_plan, ...) {
un_arg <- list(...)
if (class(cut_plan) != 'cut.plan') {
stop('Not a cut plan!')
} else if (length(un_arg) != 0) {
for (i in 1:length(un_arg)) {
if (!is.na(names(cut_plan[names(un_arg)[i]]))) {
cut_plan[[names(un_arg)[i]]] <- un_arg[[i]]
attr(cut_plan[names(un_arg)[i]], 'method') <- 'manual'
} else
stop('Column names are not matched!')
}
} else {
print('Nothing happens!')
}
return(cut_plan)
}
#' @method predict cut.plan
#' @export
#for applying to data frame
predict.cut.plan <- function (cut_plan, data, keepTarget = FALSE) {
if (class(cut_plan) != 'cut.plan') {
stop('Not a cut plan!')
} else {
NewData <- data.frame(A = rep(0, nrow(data)))
for (i in 1:length(cut_plan)) {
if (!is.null(cut_plan[[i]])) {
if (class(data[[names(cut_plan)[i]]]) %in% c('numeric', 'integer')) {
col <- cut(data[[names(cut_plan)[i]]], breaks = c(-Inf, cut_plan[[i]], Inf))
} else {
stop(paste0(names(cut_plan)[i], ' is not numeric'))
}
} else if (names(cut_plan)[i] %in% colnames(data)){
col <- data[,names(cut_plan)[i]]
} else {
col <- rep(NA, nrow(data))
}
NewData <- cbind(NewData, col)
}
NewData['A'] <- NULL
names(NewData) <- names(cut_plan)
if (keepTarget & attr(cut_plan, 'target') %in% names(data))
NewData[attr(cut_plan, 'target')] <- data[attr(cut_plan, 'target')]
if (!(attr(cut_plan, 'target') %in% names(data)))
warning(paste0(attr(cut_plan, 'target'), ' is not exist!'))
return(NewData)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.