# woebin woebin_plot woebin_ply woebin_adj
# converting vector (breaks & special_values) to data frame
split_vec2df = function(vec) {
value = . = bin_chr = V1 = NULL
if (!is.null(vec)) data.table(
value=vec, bin_chr=vec
)[, rowid := .I
][, strsplit(as.character(value), "%,%", fixed=TRUE), by = .(rowid, bin_chr)
][, .(rowid, bin_chr, value = ifelse(V1=="missing", NA, as.character(V1)) )]
}
# add missing to spl_val if there is na in dtm$value and
# missing is not specified in breaks and spl_val
add_missing_spl_val = function(dtm, breaks, spl_val) {
value = NULL
if (dtm[,any(is.na(value))]) {
no_missing = !any(grepl('missing', c(breaks, spl_val)))
if (no_missing) {
spl_val = c('missing',spl_val)
}
}
return(spl_val)
}
# split dtm into bin_sv and dtm (without speical_values)
dtm_binning_sv = function(dtm, breaks, spl_val) {
binning_sv = value = . = y = variable = count = neg = pos = bin = bin_chr = NULL
# spl_val
spl_val = add_missing_spl_val(dtm, breaks, spl_val)
if (!is.null(spl_val)) {
# special_values from vector to data frame
sv_df = split_vec2df(spl_val)
# dtm_sv & dtm
dtm_sv = setDT(dtm)[value %in% sv_df$value]
dtm = setDT(dtm)[!(value %in% sv_df$value)]
# if (nrow(dtm_sv) == 0) return(list(binning_sv=NULL, dtm=dtm))
# binning_sv
binning_sv = merge(
dtm_sv[, .(count=.N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = value][,value:=as.character(value)],
sv_df[,value:=as.character(value)],
all.x = TRUE, by='value'
)[, value:=ifelse(is.na(value), "missing", as.character(value))
][, .(bin=unique(bin_chr), count=sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), by=rowid
][, .(variable, bin, count, neg, pos)]
}
return(list(binning_sv=binning_sv, dtm=dtm))
}
# check empty bins for unmeric variable
check_empty_bins = function(dtm, binning, bin_close_right) {
. = bin = value = variable = y = NULL
# check empty bins
## break points from bin
breaks_list = lapply(
list(left="\\1", right="\\2"),
function(x) setdiff(sub(binpattern('leftright_brkp', bin_close_right), x, unique(binning$bin)), c("Inf","-Inf")) )
## if there are empty bins
if (!setequal(breaks_list$left, breaks_list$right)) {
bstbrks = unique(c(-Inf, unique(breaks_list$right), Inf))
binning = dtm[
, bin := cut(value, bstbrks, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
][, .(count=.N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , keyby = .(bin)
]
# warning( paste0("The break points are modified into \'", paste0(breaks_list$right, collapse = ", "), "\'. There are empty bins based on the provided break points." ) )
}
return(binning)
}
# check zero in neg pos, remove bins that have zeros in neg or pos column
check_zero_negpos = function(dtm, binning, count_distr_limit = NULL, bin_close_right ) {
brkp = neg = pos = count = merge_tolead = count_lag = count_lead = brkp2 = . = variable = bin = posprob = value = NULL
while (binning[!is.na(brkp)][neg==0 | pos==0,.N] > 0) {
# brkp needs to be removed if neg==0 or pos==0
rm_brkp = binning[!is.na(brkp)][
,count := neg+pos
][,`:=`(
count_lag=shift(count,type="lag", fill=nrow(dtm)+1),
count_lead=shift(count,type="lead", fill=nrow(dtm)+1)
)][, merge_tolead := count_lag > count_lead
][neg == 0 | pos == 0][count == min(count)]
# set brkp to lead's or lag's
shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
binning = binning[
,brkp2 := shift(brkp,type=shift_type)
][brkp == rm_brkp[1,brkp], brkp := brkp2]
# groupby brkp
binning = binning[
,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), neg=sum(neg), pos=sum(pos)), by=brkp
][, posprob:=pos/(neg+pos)]
}
# format bin
if (is.numeric(dtm[,value])) {
binning = binning[
grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)
][bin == 'missing', brkp := NA
][bin != 'missing', brkp := get_brkp_bin(bin, bin_close_right)]
}
return(binning)
}
# check count distri, remove bins that count_distribution rate less than count_distr_limit
check_count_distri = function(dtm, binning, count_distr_limit, bin_close_right ) {
count_distr = count = neg = pos = brkp = merge_tolead = count_lag = count_lead = brkp2 = . = variable = bin = value = NULL
if (!('count' %in% names(binning))) binning[, count := neg + pos]
binning[, count_distr := (count)/sum(count)]
while (binning[!is.na(brkp)][count_distr<count_distr_limit,.N] > 0) {
# brkp needs to be removed if neg==0 or pos==0
rm_brkp = binning[!is.na(brkp)][
,count_distr := (count)/sum(count)
][,`:=`(
count_lag=shift(count_distr,type="lag", fill=nrow(dtm)+1),
count_lead=shift(count_distr,type="lead", fill=nrow(dtm)+1)
)][, merge_tolead := count_lag > count_lead
][count_distr<count_distr_limit][count_distr == min(count_distr)]
# set brkp to lead's or lag's
shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
binning = binning[
,brkp2 := shift(brkp,type=shift_type)
][brkp == rm_brkp[1,brkp], brkp := brkp2]
# groupby brkp
binning = binning[
,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), count=sum(count), neg=sum(neg), pos=sum(pos)), by=brkp
][, count_distr := (count)/sum(count)]
}
# format bin
if (is.numeric(dtm[,value])) {
binning = binning[
grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)
][bin == 'missing', brkp := NA
][bin != 'missing', brkp := get_brkp_bin(bin, bin_close_right)]
}
return(binning)
}
# required in woebin2 # return binning if breaks provided
#' @import data.table
woebin2_breaks = function(dtm, breaks, spl_val, bin_close_right ) {
# global variables or functions
value = bin = . = y = variable = count = pos = neg = V1 = posprob = bksv_list = bin_chr = NULL
# breaks from vector to data frame
bk_df = split_vec2df(breaks)
# dtm $ binning_sv
dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
dtm = dtm_binsv_list$dtm
binning_sv = dtm_binsv_list$binning_sv
if (dtm[,.N] == 0 || is.null(dtm)) return(list(binning_sv=binning_sv, binning=NULL))
# binning
if (is.numeric(dtm[,value])) {
bstbrks = brk_numx_init(unique(bk_df$value), dtm$value, bin_close_right)
binning = dtm[
, bin := cut(value, bstbrks, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
][, .(count = .N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = .(bin)
][order(bin)]
# check empty bins
binning = check_empty_bins(dtm, binning, bin_close_right=bin_close_right)
# merge binning with bk_df
if (bk_df[is.na(value),.N] == 1) {
binning = merge(
binning[, value:=sub(binpattern('leftright_brkp', bin_close_right),"\\2",bin)],
bk_df,
all.x = TRUE, by="value"
)[order(rowid,value)][, bin:=ifelse(is.na(bin), "missing", as.character(bin))
][, .(bin=paste0(bin,collapse="%,%"), count = sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), by=rowid
][order(rowid)]
}
} else if (is.factor(dtm[,value]) || is.character(dtm[,value])) {
dtm = dtm[,value := as.character(value)]
# the values not specified in breaks_list
diff_dt_brk = setdiff(dtm[,unique(value)], bk_df[,value])
if (length(diff_dt_brk) > 0) {
warning(sprintf('The categorical values (`%s`) are not specified in `breaks_list` for the column `%s`.', paste0(diff_dt_brk, collapse = ', '), dtm[1,variable]) )
stop()
}
# merge binning with bk_df
binning = merge(
dtm, bk_df[,bin:=bin_chr], all.x = TRUE
)[order(rowid, bin)][, .(count = .N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = .(rowid, bin)]
}
# # remove rowid column in binning data frame
binning = binning[,rowid:=1][,rowid:=NULL]
# # bind binning_sv and binning
# if (setDT(binning_sv)[,.N] > 0) binning = rbind(binning_sv, binning)
return(list(binning_sv=binning_sv, binning=binning))
}
# required in woebin2 # return initial binning
woebin2_init_bin = function(dtm, init_count_distr, breaks, spl_val, bin_close_right ) {
# global variables or functions
. = pos = posprob = bin = brkp = count = neg = value = variable = y = NULL
# dtm $ binning_sv
dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
dtm = dtm_binsv_list$dtm
binning_sv = dtm_binsv_list$binning_sv
if (is.null(dtm) || dtm[,.N]==0) return(list(binning_sv=binning_sv, initial_binning=NULL))
# binning
if (is.numeric(dtm[,value])) {
# numeric variable ------
xvalue = dtm[, value]
# breaks vector & outlier
iqr = IQR(xvalue, na.rm = TRUE)
if (iqr == 0) {
prob_down = 0.01
prob_up = 0.99
} else {
prob_down = 0.25
prob_up = 0.75
}
xrng = quantile(xvalue, probs = c(prob_down, prob_up), na.rm = TRUE)
xvalue_rm_outlier = xvalue[which(xvalue >= xrng[1]-3*iqr & xvalue <= xrng[2]+3*iqr)]
# number of initial binning
n = trunc(1/init_count_distr)
len_uniq_x = length(setdiff(unique(xvalue_rm_outlier), c(NA,Inf,-Inf)))
if (len_uniq_x < n) n = len_uniq_x
# initial breaks
if (len_uniq_x < 10) {
brk = setdiff(unique(xvalue_rm_outlier), c(NA, Inf, -Inf))
} else {
brk = suppressWarnings(pretty(xvalue_rm_outlier, n))
ndigits = data.table(n=nchar(sub('.*\\.', '', as.character(brk))))[, .N, by=n][, max(n)+1]
brk = unique(round(brk, digits=ndigits))
}
brk = brk_numx_init(brk, xvalue, bin_close_right)
if (anyNA(xvalue)) brk = c(brk, NA)
# initial binning datatable
init_bin = dtm[
, bin := cut(value, brk, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
][, .(neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = bin
][order(bin)]
# check empty bins
init_bin = check_empty_bins(dtm, init_bin, bin_close_right = bin_close_right)
init_bin = init_bin[
, `:=`(brkp = get_brkp_bin(bin, bin_close_right), posprob = pos/(neg+pos))
][, .(variable, bin, brkp, neg, pos, posprob)]
} else if ( is.logical(dtm[,value]) || is.factor(dtm[,value]) || is.character(dtm[,value]) ) {
# other variable ------
# initial binning datatable
init_bin = dtm[
, .(variable = unique(variable), neg = sum(y==0), pos = sum(y==1)), by=value
][, posprob := pos/(neg+pos)]
# order by bin if is.factor, or by posprob if is.character
if (is.logical(dtm[,value]) || is.factor(dtm[,value])) {
init_bin = init_bin[
order(value)
][, brkp := ifelse(is.na(value), NA, .I)
][, .(variable, bin=value, brkp, neg, pos, posprob)]
} else {
init_bin = init_bin[
order(posprob)
# next 3 lines make NA located at the last rows
][, brkp := ifelse(is.na(value), NA, .I)
][order(brkp)
][, brkp := ifelse(is.na(value), NA, .I)
][, .(variable, bin=value, brkp, neg, pos, posprob)]
}
}
# remove brkp that neg == 0 or pos == 0 ------
init_bin = check_zero_negpos(dtm, init_bin, bin_close_right=bin_close_right)[, count := neg + pos]
return(list(binning_sv=binning_sv, initial_binning=init_bin))
}
# required in woebin2_tree # add 1 best break for tree-like binning
woebin2_tree_add_1brkp = function(dtm, initial_binning, count_distr_limit, bestbreaks=NULL, bin_close_right ) {
# global variables or functions
brkp = patterns = . = neg = pos = variable = count_distr = value = min_count_distr = bstbin = min_count_distr = total_iv = bstbin = brkp = bin = NULL
# total_iv for all best breaks
total_iv_all_breaks = function(initial_binning, bestbreaks, dtm_rows) {
# best breaks set
breaks_set = setdiff( initial_binning[,brkp], c(bestbreaks, -Inf, Inf, NA) )
init_bin_all_breaks = copy(initial_binning)
# loop on breaks_set
for (i in breaks_set) {
# best break + i
bestbreaks_i = sort(c(bestbreaks, i))
# best break datatable
init_bin_all_breaks = init_bin_all_breaks[
, paste0("bstbin",i) := cut(brkp, c(-Inf, bestbreaks_i, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE) ]
}
# best break dt
total_iv_all_brks = melt(
init_bin_all_breaks, id = c("variable", "neg", "pos"), variable.name = "bstbin", measure = patterns("bstbin.+")
)[, .(neg = sum(neg), pos = sum(pos), variable = unique(variable))
, by=.(bstbin, value)
][, count_distr := (neg+pos)/dtm_rows, by=bstbin
][!is.na(value), min_count_distr := min(count_distr), by=bstbin
][, .(total_iv = iv_01(neg, pos), variable = unique(variable), min_count_distr = min(min_count_distr,na.rm=TRUE)), by=bstbin
][, bstbin := as.numeric(sub("bstbin(.+)", "\\1", bstbin))][]
return(total_iv_all_brks)
}
# binning add 1best break
binning_add_1bst = function(initial_binning, bestbreaks) {
value = bstbin = . = neg = pos = variable = woe = bin_iv = total_iv = bstbrkp = posprob = NULL # no visible binding for global variable
if ( is.numeric(dtm[,value]) ) {
binning_1bst_brk = initial_binning[
, bstbin := cut(brkp, c(-Inf, bestbreaks, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
][, .(variable=unique(variable), bin=unique(bstbin), neg = sum(neg), pos = sum(pos)) , by = bstbin
]
} else if (is.logical(dtm[,value]) || is.factor(dtm[,value]) || is.character(dtm[,value]) ) {
bestbreaks = setdiff(bestbreaks, min(initial_binning[,brkp]))
binning_1bst_brk = initial_binning[
, bstbin := cut(brkp, c(-Inf, bestbreaks, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
][, .(variable=unique(variable), bin = paste0(bin, collapse = "%,%"), neg = sum(neg), pos = sum(pos)), by = bstbin ]
}
binning_1bst_brk = binning_1bst_brk[
order(bstbin)
][, total_iv := iv_01(neg, pos)
][, bstbrkp := get_brkp_bin(bstbin, bin_close_right)
][, .(variable, bin, bstbin, bstbrkp, neg, pos, total_iv)]
return(binning_1bst_brk)
}
# adding 1 best breakpoint
dtm_rows = nrow(dtm)
total_iv_all_brks = total_iv_all_breaks(initial_binning, bestbreaks, dtm_rows)
# bestbreaks: total_iv == max(total_iv) & min(count_distr) >= count_distr_limit
if (total_iv_all_brks[min_count_distr >= count_distr_limit, .N > 0]) {
bstbrk_max_iv = total_iv_all_brks[min_count_distr >= count_distr_limit][total_iv==max(total_iv)][, bstbin]
# add 1best break to bestbreaks
bestbreaks = unique(c(bestbreaks, bstbrk_max_iv[1]))
}
bin_add_1bst = binning_add_1bst(initial_binning, bestbreaks)
return(bin_add_1bst)
}
# required in woebin2 # return tree-like binning
woebin2_tree = function(
dtm,
init_count_distr = 0.02,
count_distr_limit = 0.05,
stop_limit = 0.1,
bin_num_limit = 8,
breaks = NULL,
spl_val = NULL,
bin_close_right
) {
# global variables or functions
brkp = bstbrkp = total_iv = count = neg = pos = NULL
# initial binning
bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
initial_binning = bin_list$initial_binning
binning_sv = bin_list$binning_sv
if (nrow(initial_binning)<=1 || is.null(initial_binning)) {
return(list(binning_sv=binning_sv, binning=initial_binning))
}
# initialize parameters
## length all breaks
len_brks = initial_binning[!is.na(brkp), .N]
## param
bestbreaks = NULL ## best breaks
IVt1 = IVt2 = 1e-10
IVchg = 1 ## IV gain ratio
step_num = 1
# best breaks from three to n+1 bins
binning_tree = NULL
while ( (IVchg >= stop_limit) & (step_num+1 <= min(bin_num_limit, len_brks)) ) {
binning_tree = woebin2_tree_add_1brkp(dtm, copy(initial_binning), count_distr_limit, bestbreaks, bin_close_right=bin_close_right)
# print(binning_tree)
# update parameters
## best breaks
bestbreaks = binning_tree[!(bstbrkp %in% c(-Inf, Inf)) & !is.na(bstbrkp), bstbrkp]
## information value
IVt2 = binning_tree[1, total_iv]
IVchg = IVt2/IVt1-1 ## ratio gain
IVt1 = IVt2
# print(IVchg)
step_num = step_num + 1
}
if (is.null(binning_tree)) binning_tree = initial_binning
bin_list = list(binning_sv=binning_sv, binning=binning_tree[, count := neg + pos])
return(bin_list)
# return(binning_tree)
}
# examples
# system.time( binning_list <- woebin2_init_bin(dtm, init_count_distr=0.02, breaks =NULL, spl_val=NULL) )
# initial_binning=binning_list$initial_binning
# binning_sv = binning_list$binning_sv
# system.time( woebin2_tree_add_1brkp(dtm, initial_binning, count_distr_limit=0.05) )
# system.time( woebin2_tree(dtm, initial_binning, count_distr_limit=0.05) )
# required in woebin2 # return chimerge binning
#' @importFrom stats qchisq
woebin2_chimerge = function(
dtm,
init_count_distr = 0.02,
count_distr_limit = 0.05,
stop_limit = 0.1,
bin_num_limit = 8,
breaks = NULL,
spl_val = NULL,
bin_close_right
) {
.= a= a_colsum= a_lag= a_lag_rowsum= a_rowsum= a_sum= pos= bin= brkp= brkp2= chisq= count= count_distr= e= e_lag= chisq_lead= neg= negpos= merge_tolead =value= variable= NULL
# [chimerge](http://blog.csdn.net/qunxingvip/article/details/50449376)
# [ChiMerge:Discretization of numeric attributs](http://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)
# chisq = function(a11, a12, a21, a22) {
# A = list(a1 = c(a11, a12), a2 = c(a21, a22))
# Adf = do.call(rbind, A)
#
# Edf =
# matrix(rowSums(Adf), ncol = 1) %*%
# matrix(colSums(Adf), nrow = 1) /
# sum(Adf)
#
# sum((Adf-Edf)^2/Edf)
# }
# initial binning
bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
initial_binning = bin_list$initial_binning
binning_sv = bin_list$binning_sv
if (nrow(initial_binning)<=1 || is.null(initial_binning)) {
return(list(binning_sv=binning_sv, binning=initial_binning))
}
# function to create a chisq column in initial_binning
add_chisq = function(initial_binning) {
chisq_df = melt(initial_binning[!is.na(brkp)], id.vars = c("brkp", "variable", "bin"), measure.vars = c("neg", "pos"), variable.name = "negpos", value.name = "a"
)[order(brkp)
][, a_lag := shift(a, type="lag"), by=.(negpos)
][, `:=`(
a_rowsum = sum(a),
a_lag_rowsum = sum(a_lag),
a_colsum = a+a_lag,
a_sum = sum(a+a_lag)), by='bin'
][, `:=`(
e = (a_rowsum*a_colsum)/a_sum,
e_lag = prod(a_lag_rowsum, a_colsum, na.rm = TRUE) / a_sum
)][, .(chisq=sum((a-e)^2/e + (a_lag-e_lag)^2/e_lag)), by='bin']
return(merge(initial_binning[,count:=neg+pos], chisq_df, all.x = TRUE, sort = FALSE))
}
# dtm_rows
dtm_rows = nrow(dtm)
# chisq limit
chisq_limit = qchisq(1-stop_limit,1)
# binning with chisq column
binning_chisq = add_chisq(initial_binning)
# param
bin_chisq_min = binning_chisq[!is.na(chisq), min(chisq)]
bin_count_distr_min = binning_chisq[!is.na(brkp), min((neg+pos)/dtm_rows)]
bin_nrow = binning_chisq[,.N]
# remove brkp if chisq < chisq_limit
while (
bin_chisq_min < chisq_limit ||
bin_count_distr_min < count_distr_limit ||
bin_nrow > bin_num_limit) {
# brkp needs to be removed
if (bin_count_distr_min < count_distr_limit) {
rm_brkp = binning_chisq[,`:=`(
count_distr = count/sum(count),
chisq_lead = shift(chisq, type = "lead", fill = Inf)
)][,merge_tolead := ifelse(is.na(chisq), TRUE, chisq > chisq_lead)
][!is.na(brkp)][order(count_distr)][1,]
} else if (bin_chisq_min < chisq_limit) {
rm_brkp = binning_chisq[, merge_tolead := FALSE][order(chisq, count)][1,]
} else if (bin_nrow > bin_num_limit) {
rm_brkp = binning_chisq[, merge_tolead := FALSE][order(chisq, count)][1,]
} else break
# groupby brkp
shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
binning_chisq = binning_chisq[
,brkp2 := shift(brkp,type=shift_type)
][brkp == rm_brkp[1,brkp], brkp := brkp2
][,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), neg=sum(neg), pos=sum(pos)), by=brkp
]#[, posprob:=pos/(neg+pos)]
# update
## add chisq to new binning data frame
binning_chisq = add_chisq(binning_chisq)
## param
bin_nrow = binning_chisq[,.N]
if (bin_nrow == 1 || binning_chisq[!is.na(chisq), .N==0]) break
bin_chisq_min = binning_chisq[!is.na(chisq), min(chisq)]
bin_count_distr_min = binning_chisq[!is.na(brkp), min((neg+pos)/dtm_rows)]
}
# format bin # remove (.+\\)%,%\\[.+,)
if (is.numeric(dtm[,value])) {
binning_chisq = binning_chisq[grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)]
}
bin_list = list(binning_sv=binning_sv, binning=binning_chisq[, count := neg + pos])
return(bin_list)
# return(binning_chisq)
}
# required in woebin2 # return equal binning, supports numerical variables only
woebin2_equal = function(
dtm, init_count_distr=0.02, count_distr_limit=0.05, stop_limit=0.1, bin_num_limit=8, breaks=NULL, spl_val=NULL, method='freq', bin_close_right ) {
count = value = group = . = minv = maxv = bin = y = variable = pos = neg = posprob = NULL
# dtm $ binning_sv
dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
dtm = dtm_binsv_list$dtm
binning_sv = dtm_binsv_list$binning_sv
if (is.null(dtm) || dtm[,.N]==0) return(list(binning_sv=binning_sv, binning=NULL))
# dt_sl = dtm[,.(label=y, datset=variable, score=value)]
# dtm = dt_sl[,.(y=label, variable=datset, value=score)]
# breaks
unique_xvalue = dtm[, unique(value)]
if (bin_num_limit >= length(unique_xvalue) ) {
# in each value
brkp = unique_xvalue
} else {
if (method == 'freq') {
brkp = copy(dtm)[order(value)
][, group := ceiling(.I/(.N/bin_num_limit))]
if (bin_close_right) {
brkp = brkp[, .(value = value[.N]), by = group][,value]
} else {
brkp = brkp[, .(value = value[ 1]), by = group][,value]
}
} else if (method == 'width') {
brkp = seq(min(unique_xvalue, na.rm = TRUE), max(unique_xvalue, na.rm = TRUE), length.out = bin_num_limit+1)
brkp = brkp[-c(1, length(brkp))]
}
}
brkp = brk_numx_init(brkp, unique_xvalue, bin_close_right)
binning_equal = dtm[, bin := cut(value, unique(brkp), right = bin_close_right, dig.lab = 10, ordered_result = F)
][, .(neg = sum(y==0), pos = sum(y==1), count = .N), keyby = .(variable, bin)
][, `:=`(brkp = get_brkp_bin(bin, bin_close_right), posprob = pos/(neg+pos))
][, .(variable, bin, brkp, count, neg, pos, posprob)]
# create binning
binning_equal = check_empty_bins(dtm, binning_equal, bin_close_right=bin_close_right)
binning_equal = check_zero_negpos(dtm, binning_equal, bin_close_right=bin_close_right)
binning_equal = check_count_distri(dtm, binning_equal, count_distr_limit, bin_close_right=bin_close_right)
bin_list = list(binning_sv=binning_sv, binning=binning_equal)
return(bin_list)
}
# required in woebin2 # # format binning output
binning_format = function(binning, bin_close_right ) {
# global variables or functions
. = pos = posprob = bin = bin_iv = neg = total_iv = variable = woe = is_sv = count = NULL
# required columns in input binning: variable, bin, neg, pos
if (!('count' %in% names(binning))) binning[, count := neg+pos]
binning = binning[
, posprob:=pos/(neg+pos)
][, woe := lapply(.SD, woe_01, pos), .SDcols = "neg"
][, bin_iv := lapply(.SD, miv_01, pos), .SDcols = "neg"
][, total_iv := sum(bin_iv)
][, bin := ifelse(is.na(bin) | bin=="NA", "missing", as.character(bin)) # replace NA by missing
][, .(variable, bin, count, count_distr=(neg+pos)/sum(neg+pos), neg, pos, posprob, woe, bin_iv, total_iv, breaks = sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin), is_special_values=is_sv)]
# move missing from last row to first
if ( "missing" %in% binning$bin ) {
binning = rbind(binning[bin=="missing"], binning[bin != "missing"])
}
return(binning[])
}
binlst_missing_merge = function(bin_list, missing_join) {
. = bin = count = neg = pos = value = variable = N = nr = NULL
if (missing_join == 'left') { # left
bin_list$binning = rbindlist(
list(
bin_list$binning[, rowid := .I],
bin_list$binning_sv[bin == 'missing'][, rowid := 1]
), fill = TRUE
)[, .(bin=paste0(bin,collapse="%,%"), count = sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), keyby=rowid
][]
bin_list$binning_sv = bin_list$binning_sv[bin != 'missing'][]
} else if (missing_join == 'right') { # right
bin_list$binning = rbindlist(
list(
bin_list$binning[, rowid := .I],
bin_list$binning_sv[bin == 'missing'][, rowid := nrow(bin_list$binning)]
), fill = TRUE
)[, .(bin=paste0(bin,collapse="%,%"), count = sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), keyby=rowid
][]
bin_list$binning_sv = bin_list$binning_sv[bin != 'missing'][]
}
return(bin_list)
}
# woebin2
# This function provides woe binning for only two columns (one x and one y) data frame.
woebin2 = function(
dtm,
breaks = NULL,
spl_val = NULL,
missing_join = NULL,
init_count_distr = 0.02,
count_distr_limit = 0.05,
stop_limit = 0.1,
bin_num_limit = 8,
bin_close_right = FALSE,
method = "tree", ...
) {
# global variables or functions
. = pos = posprob = bin = bin_iv = neg = total_iv = variable = woe = is_sv = count = value = NULL
# binning
if (!anyNA(breaks) & !is.null(breaks)) {
# 1.return binning if breaks provided
bin_list = woebin2_breaks(dtm=dtm, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
} else {
if (stop_limit == "N") {
# binning of initial & specialvalues
bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
} else {
if (method == "tree") {
# 2.tree-like optimal binning
bin_list = woebin2_tree(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
} else if (method == "chimerge") {
# 2.chimerge optimal binning
bin_list = woebin2_chimerge(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
} else if (method %in% c('freq','width')) {
# 3. in equal freq or width
bin_list = woebin2_equal(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, method = method, bin_close_right=bin_close_right)
}
# missing join
missingrate = dtm[, mean(is.na(value))]
if (missingrate > 0 & missingrate < count_distr_limit & any(missing_join %in% c('left', 'right'))) {
bin_list = binlst_missing_merge(bin_list=bin_list, missing_join=missing_join)
}
}
}
# # binding binning_sv and binning
if (any(sapply(bin_list, is.null))) {
binning = rbindlist(bin_list)[, is_sv := names(bin_list)[!sapply(bin_list, is.null)]]
} else {
binning = rbindlist(bin_list, use.names = TRUE, fill = TRUE, idcol = 'is_sv')
}
binning = binning[, is_sv := is_sv == 'binning_sv']
binning = binning_format(binning, bin_close_right=bin_close_right)
return(binning)
}
# converting breaks list to bins_breakslist
breaks_list2binbrklst = function(breaks_list) {
bins_breakslist = setDT(
as.data.frame(t(setDT(lapply(breaks_list, function(x) paste(sprintf('"%s"', x), collapse = ', '))))),
keep.rownames = TRUE)[]
setnames(bins_breakslist, c('variable', 'x_breaks'))
return(bins_breakslist)
}
# converting bins to bins_breakslist
bins2binbrklst = function(bins, dt, breaks_list=NULL, bin_close_right=FALSE) {
.= bin= bin2= is_special_values= variable= x_breaks= x_class = NULL
# bins # if (is.list(bins)) rbindlist(bins)
bins = check_bincard(bins)
# x variables
xs_all = bins[,unique(variable)]
# class of variables
vars_class = data.table(
variable = xs_all,
x_class = dt[,sapply(.SD, class), .SDcols = xs_all]
)
# breaks
bins_breakslist = bins[
, bin2 := sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin)
][!(bin2 %in% c("-Inf","Inf","missing")) & !is_special_values
][vars_class, on="variable"
][, .(
x_breaks = paste0(paste0("\"",bin2,"\""), collapse=", "),
# x_breaks = paste(ifelse(x_class=="numeric", bin2, paste0("\"",bin2,"\"")), collapse=", "),
x_class=unique(x_class)
), by=variable]
if (!is.null(breaks_list)) {
bins_breakslist = rbind(breaks_list2binbrklst(breaks_list), bins_breakslist, fill = TRUE)[, .SD[.N], by = 'variable']
}
return(bins_breakslist)
}
# converting bins_breakslist to text
binbrklst2txt = function(bins_breakslist, header = FALSE, bin_close_right) {
variable = x_breaks = NULL
brklst_char = paste0(bins_breakslist[
, sprintf("`%s`=c(%s)", variable, x_breaks)
# paste0(variable, "=c(", x_breaks, ")")
], collapse = ", \n ")
brklst_char = paste0(c("list(", brklst_char, ")"), collapse = "\n ")
if (isTRUE(header)) brklst_char = sprintf("# %s \noptions(scorecard.bin_close_right = %s) \n%s", Sys.time(), bin_close_right, brklst_char)
return(brklst_char)
}
# saving breaks list
brklst_save = function(bins_breakslist, save_as=NULL, ...) {
backup = list(...)$backup
if (is.null(backup)) backup = TRUE
if (is.null(save_as)) save_as = sprintf('brklst_%s', format(Sys.time(),'%Y%m%d_%H%M%S'))
save_as = sprintf('%s.R', save_as)
if (isTRUE(backup)) {
if (file.exists(save_as)) {
brklst_ori = readLines(save_as)[-1]
brklst_new = unlist(strsplit(bins_breakslist, '\n'))[-1]
if (!identical(brklst_ori, brklst_new)) {
new_name = sub('\\.R$', sprintf('_bck@%s.R', format(Sys.time(),'%Y%m%d_%H%M%S')), save_as)
file.rename(save_as, new_name)
cli_inform(c(i = sprintf("The existed breaks_list file '%s' is renamed as '%s'\n", save_as, new_name)))
writeLines(bins_breakslist, save_as, useBytes = TRUE)
cli_inform(c(i = sprintf("The breaks_list is saved as '%s'", save_as)))
}
} else {
writeLines(bins_breakslist, save_as, useBytes = TRUE)
cli_inform(c(i = sprintf("The breaks_list is saved as '%s'", save_as)))
}
}
return(invisible())
}
# @param init_count_distr The minimum percentage of initial binning class number over total. Accepted range: 0.01-0.2; Defaults to 0.02, which means initial cut into 50 fine bins for continuous variables.
#' WOE Binning
#'
#' \code{woebin} generates optimal binning for numerical, factor and categorical variables using methods including tree-like segmentation or chi-square merge. \code{woebin} can also customizing breakpoints if the `breaks_list` was provided. The default `woe` is defined as ln(Pos_i/Neg_i). If you prefer ln(Neg_i/Pos_i), please set the argument `positive` as negative value, such as '0' or 'good'. If there is a zero frequency class when calculating woe, the zero will replaced by 0.99 to make the woe calculable.
#'
#' @param dt A data frame with both x (predictor/feature) and y (response/label) variables.
#' @param y Name of y variable.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y and var_skip are counted as x variables.
#' @param var_skip Name of variables that will skip for binning. Defaults to NULL.
#' @param breaks_list List of break points, Defaults to NULL. If it is not NULL, variable binning will based on the provided breaks.
#' @param special_values the values specified in special_values will be in separate bins. Defaults to NULL.
#' @param missing_join missing values join with the left non-missing bin if its share is lower than the threshold. Accepted values include 'left' and 'right'. If it sets to NULL, the missing values will be placed in a separate bin.
#' @param stop_limit Stop binning segmentation when information value gain ratio less than the 'stop_limit' if using tree method; or stop binning merge when the chi-square of each neighbor bins are larger than the threshold under significance level of 'stop_limit' and freedom degree of 1 if using chimerge method. Accepted range: 0-0.5; Defaults to 0.1. If it is 'N', each x value is a bin.
# 'qchisq(1-stoplimit, 1)'
#' @param count_distr_limit The minimum count distribution percentage. Accepted range: 0.01-0.2; Defaults to 0.05.
#' @param bin_num_limit Integer. The maximum number of binning. Defaults to 8.
#' @param positive Value of positive class, defaults to "bad|1".
#' @param no_cores Number of CPU cores for parallel computation. Defaults to 2, if it sets to NULL then 90 percent of total cpu cores will be used.
#' @param print_step A non-negative integer. Defaults to 1. If print_step>0, print variable names by each print_step-th iteration. If print_step=0 or no_cores>1, no message is print.
#' @param method Four methods are provided, "tree" and "chimerge" for optimal binning that support both numerical and categorical variables, and 'width' and 'freq' for equal binning that support numerical variables only. Defaults to "tree".
#' @param ignore_const_cols Logical. Ignore constant columns. Defaults to TRUE.
#' @param ignore_datetime_cols Logical. Ignore datetime columns. Defaults to TRUE.
#' @param check_cate_num Logical. Check whether the number of unique values in categorical columns larger than 50. It might make the binning process slow if there are too many unique categories. Defaults to TRUE.
#' @param replace_blank_inf Logical. Replace blank values with NA and infinite with -1. Defaults to TRUE.
#' @param save_as A string. The file name to save breaks_list. Defaults to None.
#' @param ... Additional parameters.
#'
#' @return A list of data frames include binning information for each x variables.
#'
#' @seealso \code{\link{woebin_ply}}, \code{\link{woebin_plot}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # load germancredit data
#' data(germancredit)
#'
#' # Example I
#' # binning of two variables in germancredit dataset
#' # using tree method
#' bins2_tree = woebin(germancredit, y="creditability",
#' x=c("credit.amount","housing"), method="tree")
#' bins2_tree
#'
#' \dontrun{
#' # using chimerge method
#' bins2_chi = woebin(germancredit, y="creditability",
#' x=c("credit.amount","housing"), method="chimerge")
#'
#' # binning in equal freq/width # only supports numerical variables
#' numeric_cols = c("duration.in.month", "credit.amount",
#' "installment.rate.in.percentage.of.disposable.income", "present.residence.since",
#' "age.in.years", "number.of.existing.credits.at.this.bank",
#' "number.of.people.being.liable.to.provide.maintenance.for")
#' bins_freq = woebin(germancredit, y="creditability", x=numeric_cols, method="freq")
#' bins_width = woebin(germancredit, y="creditability", x=numeric_cols, method="width")
#'
#' # y can be NULL if no label column in dataset
#' bins_freq_noy = woebin(germancredit, y=NULL, x=numeric_cols)
#'
#' # Example II
#' # setting of stop_limit
#' # stop_limit = 0.1 (by default)
#' bins_x1 = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 0.1)
#' # stop_limit = 'N', each x value is a bin
#' bins_x1_N = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 'N')
#'
#' # Example III
#' # binning of the germancredit dataset
#' bins_germ = woebin(germancredit, y = "creditability")
#' # converting bins_germ into a data frame
#' # bins_germ_df = data.table::rbindlist(bins_germ)
#'
#' # Example IV
#' # customizing the breakpoints of binning
#' library(data.table)
#' dat = rbind(
#' setDT(germancredit),
#' data.table(creditability=sample(c("good","bad"),10,replace=TRUE)),
#' fill=TRUE)
#'
#' breaks_list = list(
#' age.in.years = c(26, 35, 37, "Inf%,%missing"),
#' housing = c("own", "for free%,%rent")
#' )
#'
#' special_values = list(
#' credit.amount = c(2600, 9960, "6850%,%missing"),
#' purpose = c("education", "others%,%missing")
#' )
#'
#' bins_cus_brk = woebin(dat, y="creditability",
#' x=c("age.in.years","credit.amount","housing","purpose"),
#' breaks_list=breaks_list, special_values=special_values)
#'
#' # Example V
#' # save breaks_list as a R file
#' bins2 = woebin(germancredit, y="creditability",
#' x=c("credit.amount","housing"), save_as='breaks_list')
#'
#' # Example VI
#' # setting bin closed on the right
#' options(scorecard.bin_close_right = TRUE)
#' binsRight = woebin(germancredit, y = 'creditability', x = 'age.in.years')
#' binsRight
#' # setting bin close on the left, the default setting
#' options(scorecard.bin_close_right = FALSE)
#' }
#'
#' @import data.table foreach
#' @importFrom stats IQR quantile setNames
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom parallel detectCores makeCluster stopCluster
#' @export
woebin = function(
dt, y, x=NULL, var_skip=NULL, breaks_list=NULL, special_values=NULL, missing_join='left',
stop_limit=0.1, count_distr_limit=0.05, bin_num_limit=8,
positive="bad|1", no_cores=2, print_step=0L, method="tree",
ignore_const_cols=TRUE, ignore_datetime_cols=TRUE, check_cate_num=TRUE, replace_blank_inf=TRUE,
save_as=NULL, ...) {
# global variable
i = NULL
# start time
start_time = proc.time()
# arguments ------
kwargs = list(...)
# print_info
print_info = kwargs[['print_info']]
if (is.null(print_info)) print_info = TRUE
if (print_info) cli_inform(c(i='Creating woe binning ...'))
# method
method = try(match.arg(method, c("tree", "chimerge", 'freq', 'width')), silent = TRUE)
if (inherits(method, 'try-error')) {
warning("Incorrect inputs; method should be tree or chimerge. Parameter was set to default (tree).")
method = "tree"
}
if (is.null(y) & !(method %in% c('freq', 'width'))) method = 'freq'
if (!is.null(missing_join)) {
missing_join = match.arg(missing_join, c("left", "right"))
}
# init_count_distr
min_perc_fine_bin = kwargs[['min_perc_fine_bin']]
init_count_distr = kwargs[['init_count_distr']]
if (is.null(init_count_distr)) {
init_count_distr <- ifelse(!is.null(min_perc_fine_bin), min_perc_fine_bin, 0.02)
}
# count_distr_limit
min_perc_coarse_bin = kwargs[['min_perc_coarse_bin']]
if (!is.null(min_perc_coarse_bin)) count_distr_limit = min_perc_coarse_bin
# bin_num_limit
max_num_bin = kwargs[['max_num_bin']]
if (!is.null(max_num_bin)) bin_num_limit = max_num_bin
# bin_close_right
bin_close_right = getarg('bin_close_right')
if (print_info & !is.null(breaks_list)) cli_inform(c(i = sprintf("The option bin_close_right was set to %s.", bin_close_right)), col = 'grey')
# save_breaks_list
save_breaks_list = kwargs[['save_breaks_list']]
if (!is.null(save_breaks_list)) save_as = save_breaks_list
#set dt as data.table
dt = setDT(copy(dt)) #copy(setDT(dt))
if (!is.null(x)) dt = dt[, c(y,x), with=FALSE]
# check y
if (!is.null(y)) dt = check_y(dt, y, positive)
# remove constant columns
if (ignore_const_cols) dt = check_const_cols(dt)
# remove date/time columns
if (ignore_datetime_cols) dt = check_datetime_cols(dt)
# check categorical columns' unique values
if (check_cate_num) check_cateCols_uniqueValues(dt, var_skip)
# replace black with na
if (replace_blank_inf) dt = rep_blank_na(dt)
# x variable names
xs = x_variable(dt, y, x, var_skip, method)
xs_len = length(xs)
# print_step
print_step = check_print_step(print_step)
# breaks_list
breaks_list = check_breaks_list(breaks_list, xs)
# special_values
special_values = check_special_values(special_values, xs)
# stop_limit
stop_limit = check_stop_limit(stop_limit, xs)
# init_count_distr range
if ( init_count_distr<0.01 || init_count_distr>0.2 || !is.numeric(init_count_distr) ) {
warning("Incorrect parameter specification; accepted init_count_distr parameter range is 0.01-0.2. Parameter was set to default (0.02).")
init_count_distr = 0.02
}
# count_distr_limit
if ( count_distr_limit<0.01 || count_distr_limit>0.2 || !is.numeric(count_distr_limit) ) {
warning("Incorrect parameter specification; accepted count_distr_limit parameter range is 0.01-0.2. Parameter was set to default (0.05).")
count_distr_limit = 0.05
}
# bin_num_limit
if (!is.numeric(bin_num_limit)) {
warning("Incorrect inputs; bin_num_limit should be numeric variable. Parameter was set to default (8).")
bin_num_limit = 8
}
# binning ------
# loop on xs
# https://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
# https://privefl.github.io/blog/a-guide-to-parallelism-in-r/
no_cores = check_no_cores(no_cores, xs_len)
bins = list()
if (!is.null(y)) {
ycol = dt[[y]]
} else ycol = NA
args1list = list(
missing_join = missing_join,
init_count_distr = init_count_distr,
count_distr_limit = count_distr_limit,
bin_num_limit = bin_num_limit,
bin_close_right = bin_close_right,
method = method
)
if (no_cores == 1) {
for (i in 1:xs_len) {
x_i = xs[i]
dtm = data.table(y=ycol, variable=x_i, value=dt[[x_i]])
# print xs
if (print_step>0 & i %% print_step == 0) cat_bullet(sprintf('%s/%s %s', i, xs_len, x_i), bullet = "tick", bullet_col = "green")
# woebining on one variable
bins[[x_i]] <-
try(do.call('woebin2', args = c(
list(
dtm = dtm,
breaks = breaks_list[[x_i]],
spl_val = special_values[[x_i]],
stop_limit = stop_limit[[x_i]]
), args1list
)), silent = TRUE)
}
} else {
# type_psock_fork = ifelse(Sys.info()["sysname"] == 'Windows', 'PSOCK', 'FORK')
cl = makeCluster(no_cores)#, type = type_psock_fork)
registerDoParallel(cl)
# registerDoParallel(no_cores)
# run
bins <-
foreach(
i = seq_len(xs_len),
.combine = 'rbind',
# .final = function(bs) {
# if (xs_len==1) bs = list(bs)
# setNames(bs, xs)
# },
.inorder = FALSE,
.multicombine = TRUE,
# .maxcombine = xs_len+1,
.errorhandling = "remove"#,
# .packages = 'data.table',
# .verbose = TRUE
# .export = c('dt', 'xs', 'ycol', 'breaks_list', 'special_values', 'init_count_distr', 'count_distr_limit', 'stop_limit', 'bin_num_limit', 'bin_close_right', 'method')
) %dopar% {
x_i = xs[i]
dtm = data.table(y=ycol, variable=x_i, value=dt[[x_i]])
# woebining on one variable
try(do.call('woebin2', args = c(
list(
dtm = dtm,
breaks = breaks_list[[x_i]],
spl_val = special_values[[x_i]],
stop_limit = stop_limit[[x_i]]
), args1list
)), silent = TRUE)
}
# finish
stopCluster(cl)
# stopImplicitCluster()
bins = split(bins, by = 'variable')
}
# if (inherits(bins, 'data.frame')) bins = split(bins, by = 'variable')
# check errors in binning
bins = bins[! sapply(bins, function(x) inherits(x, 'try-error'))]
error_variables = setdiff(xs, names(bins))
if (length(error_variables) > 0) {
warning(sprintf('The following columns are removed from binning results due to errors:\n%s', paste0(error_variables, collapse=', ')))
bins = bins[setdiff(names(bins), error_variables)]
}
# running time
rs = proc.time() - start_time
# hms
if (print_info) cat_bullet(
sprintf("Binning on %s rows and %s columns in %s",nrow(dt),ncol(dt),sec_to_hms(rs[3])),
bullet = "tick", bullet_col = "green", col = 'grey'
)
# save breaks_list
if (!is.null(save_as)) {
bins_breakslist = bins2binbrklst(bins, dt, bin_close_right=bin_close_right)
brklst_save(
binbrklst2txt(bins_breakslist, header=TRUE, bin_close_right = bin_close_right),
save_as=save_as, ...
)
}
return(bins)
}
#' @import data.table
woepoints_ply1 = function(dtx, binx, x_i, woe_points, bin_close_right ) {
# woe_points: "woe" "points"
. = V1 = bin = NULL
# binx
bycols = unique(c('bin',woe_points))
binx = binx[
, bin:=as.character(bin)
][, .(unlist(strsplit(bin, "%,%", fixed=TRUE))), by=bycols]
# dtx
## cut numeric variable
if ( is.numeric(dtx[[x_i]]) ) {
binx_sv = binx[!grepl(binpattern('isbin', bin_close_right),V1)]
binx_other = binx[grepl(binpattern('isbin', bin_close_right),V1)]
dtx[[x_i]] = ifelse(
dtx[[x_i]] %in% binx_sv$V1,
dtx[[x_i]],
as.character(cut(dtx[[x_i]], unique(c(-Inf, binx_other[bin != "missing", get_brkp_bin(V1, bin_close_right)], Inf)), right = bin_close_right, dig.lab = 10, ordered_result = FALSE))
)
}
## to charcarter, na to missing
dtx[[x_i]] = as.character(dtx[[x_i]])
dtx[[x_i]] = ifelse(is.na(dtx[[x_i]]), "missing", dtx[[x_i]])
## add rowid column
dtx = setDT(dtx)[, rowid := .I]
# rename binx
setnames(binx, c('V1', woe_points), c(x_i, paste(x_i, woe_points, sep="_")))
# merge
dtx_suffix = merge(dtx, binx, by=x_i, all.x = TRUE)
dtx_suffix = setDT(dtx_suffix)[order(rowid)][, (intersect(names(dtx_suffix), c("rowid", "bin", x_i))) := NULL][]
return(dtx_suffix)
}
#' WOE/BIN Transformation
#'
#' \code{woebin_ply} converts original values of input data into woe or bin based on the binning information generated from \code{woebin}.
#'
#' @param dt A data frame.
#' @param bins Binning information generated from \code{woebin}.
#' @param to Converting original values to woe or bin. Defaults to woe.
#' @param no_cores Number of CPU cores for parallel computation. Defaults to 2, if it sets to NULL then 90 percent of total cpu cores will be used.
#' @param print_step A non-negative integer. Defaults to 1. If print_step>0, print variable names by each print_step-th iteration. If print_step=0 or no_cores>1, no message is print.
#' @param replace_blank_inf Logical. Replace blank values with NA and infinite with -1. Defaults to TRUE. This argument should be the same with \code{woebin}'s.
#' @param ... Additional parameters.
#'
#' @return A data frame with columns for variables converted into woe values.
#'
#' @seealso \code{\link{woebin}}, \code{\link{woebin_plot}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # load germancredit data
#' data(germancredit)
#'
#' # Example I
#' dt = germancredit[, c("creditability", "credit.amount", "purpose")]
#'
#' # binning for dt
#' bins = woebin(dt, y = "creditability")
#'
#' # converting to woe
#' dt_woe = woebin_ply(dt, bins=bins)
#' str(dt_woe)
#'
#' # converting to bin
#' dt_bin = woebin_ply(dt, bins=bins, to = 'bin')
#' str(dt_bin)
#'
#' \donttest{
#' # Example II
#' # binning for germancredit dataset
#' bins_germancredit = woebin(germancredit, y="creditability")
#'
#' # converting the values in germancredit to woe
#' # bins is a list which generated from woebin()
#' germancredit_woe = woebin_ply(germancredit, bins_germancredit)
#'
#' # bins is a data frame
#' bins_df = data.table::rbindlist(bins_germancredit)
#' germancredit_woe = woebin_ply(germancredit, bins_df)
#'
#' }
#'
#' @import data.table
#' @export
#'
woebin_ply = function(dt, bins, to='woe', no_cores=2, print_step=0L, replace_blank_inf=TRUE, ...) {
# global variables or functions
. = V1 = bin = variable = woe = i = databc_colomun_placeholder = NULL
# start time
start_time = proc.time()
# arguments
kwargs = list(...)
# print info
print_info = kwargs[['print_info']]
if (is.null(print_info)) print_info = TRUE
if (print_info) cli_inform(c(i='Converting into woe values ...'))
# to woe/bin
if (!is.null(kwargs[['value']])) to = kwargs[['value']]
if (is.null(to) || !(to %in% c('woe', 'bin'))) to = 'woe'
# set dt as data.table
dt = setDT(copy(dt))
# # remove date/time col
# dt = rmcol_datetime_unique1(dt)
# replace "" by NA
if (replace_blank_inf) dt = rep_blank_na(dt)
# print_step
print_step = check_print_step(print_step)
# bins # if (is.list(bins)) rbindlist(bins)
bins = check_bincard(bins)
# bin_close_right
bin_close_right = check_bcr(bins)
# x variables
xs_bin = bins[,unique(variable)]
xs_dt = names(dt)
xs = intersect(xs_bin, xs_dt)
# loop on x variables
xs_len = length(xs)
# initial data set
n = 0
while (paste0('col_init',n) %in% xs) n = n+1
dt_init = copy(dt)[, (paste0('col_init',n)) := 1][,(xs) := NULL]
# the databc_colomun_placeholder will be remove in the result, in case dt_init is an empty dataframe
# loop on xs # https://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
no_cores = check_no_cores(no_cores, xs_len = xs_len)
if (no_cores == 1) {
dat = dt_init
for (i in 1:xs_len) {
x_i = xs[i]
# print x
if (print_step > 0 & i %% print_step == 0) cat_bullet(sprintf('%s/%s %s', i, xs_len, x_i), bullet = "tick", bullet_col = "green")
binx = bins[variable==x_i]
dtx = dt[, x_i, with=FALSE]
dat = cbind(dat, woepoints_ply1(dtx, binx, x_i, woe_points=to, bin_close_right = bin_close_right))
}
} else {
# type_psock_fork = ifelse(Sys.info()["sysname"] == 'Windows', 'PSOCK', 'FORK')
cl = makeCluster(no_cores)#, type = type_psock_fork)
registerDoParallel(cl)
# registerDoParallel(no_cores)
# run
dat <-
foreach(
i = 1:xs_len,
.combine=cbind,
.init = dt_init,
.inorder = FALSE,
.errorhandling = "pass"#,
# .export = c('dt', 'bins', 'xs')
) %dopar% {
x_i = xs[i]
binx = bins[variable==x_i]
dtx = dt[, x_i, with=FALSE]
woepoints_ply1(dtx, binx, x_i, woe_points=to, bin_close_right = bin_close_right)
}
# finish
stopCluster(cl)
# stopImplicitCluster()
}
# running time
rs = proc.time() - start_time
# hms
if (print_info) cat_bullet(
sprintf("Woe transformating on %s rows and %s columns in %s",nrow(dt),xs_len,sec_to_hms(rs[3])),
bullet = "tick", bullet_col = "green", col = 'grey'
)
return(dat[, (paste0('col_init',n)) := NULL])
}
# required in woebin_plot
#' @import data.table ggplot2
plot_bin = function(bin, title, show_iv, show_lineval = TRUE, show_barval = TRUE, line_value = 'posprob', line_color = 'blue', bar_color = NULL, ...) {
# global variables or functions
. = pos = posprob = posprob2 = count = count_distr = count_distr2 = count_num = neg = negpos = total_iv = value = variable = woe = lin_val = lin_val2 = lin_val_label = NULL
if ( all(is.na(bin$neg)) || all(is.na(bin$pos)) ) return(NULL)
# data
## y_right_max
if (line_value == 'posprob') {
y_right_max = ceiling(max(bin[[line_value]], na.rm=T)*10)
if (y_right_max %% 2 ==1) y_right_max=y_right_max+1
if (y_right_max - max(bin[[line_value]], na.rm=T)*10 <= 0.3) y_right_max = y_right_max+2
y_right_max = y_right_max/10
if (y_right_max>1 || y_right_max<=0 || is.na(y_right_max) || is.null(y_right_max)) y_right_max=1
} else {
y_right_max = max(bin[[line_value]])
}
## y_right_min
y_right_min = ifelse(line_value == 'posprob', 0, min(bin[[line_value]]))
## y_right_name
y_right_name = ifelse(line_value == 'posprob', 'Positive probability', 'woe')
## y_left_max
y_left_max = ceiling(max(bin$count_distr, na.rm=T)*10)/10
if (y_left_max>1 || y_left_max<=0 || is.na(y_left_max) || is.null(y_left_max)) y_left_max = 1
## data set
bin = setDT(bin)
dat = bin[,.(
variable, bin, count_num=count, count_distr2=count_distr, count_distr, neg=neg/sum(count), pos=pos/sum(count), posprob, woe
)][, `:=`(
bin = ifelse(is.na(bin), "NA", bin),
lin_val2 = (get(line_value)-y_right_min)*(y_left_max/(y_right_max-y_right_min)),
lin_val = round(get(line_value),4),
rowid = .I
)][, `:=`(
bin = factor(bin, levels = bin),
lin_val_label = paste0(round(lin_val*100, 1), "%")
)][]
if (line_value == 'woe') dat = dat[, lin_val_label := round(lin_val, 2)]
dat_melt = melt(dat, id.vars = c("variable", "bin","rowid"), measure.vars =c("neg", "pos"), variable.name = "negpos")[
,negpos:=factor(negpos, levels=c( "pos", "neg")) ]
# title
if (!is.null(title)) title = paste0(title, "-")
if (show_iv) {
title_string = paste0(title, dat[1, variable]," (iv:",bin[1,round(total_iv,4)],")")
} else {
title_string = paste0(title, dat[1, variable])
}
# plot
p_bin = ggplot() +
# geom_text(aes(label="@shichen.name/getpedr", x=dat[, x[.N], by=symbol][,V1[1]], y=Inf), vjust = -0.5, hjust = 1, color = "#F0F0F0") +
# coord_cartesian(clip = 'off') +
geom_bar(data=dat_melt, aes(x=bin, y=value, fill=negpos), stat="identity") +
geom_line(data=dat, aes(x = rowid, y = lin_val2), colour = line_color) +
geom_point(data=dat, aes(x = rowid, y = lin_val2), colour = line_color, shape=21, fill="white") +
scale_y_continuous(limits = c(0, y_left_max), sec.axis = sec_axis(~./(y_left_max/(y_right_max-y_right_min))+y_right_min, name = y_right_name)) +
labs(title = title_string, x=NULL, y = "Count distribution", fill=NULL) +
theme_bw() +
theme(
legend.position="bottom", legend.direction="horizontal",
axis.title.y.right = element_text(colour = line_color),
axis.text.y.right = element_text(colour = line_color, angle = 90, hjust = 0.5),
axis.text.y = element_text(angle = 90, hjust = 0.5) )
if (show_barval) p_bin = p_bin + geom_text(data=dat, aes(x = bin, y = count_distr2, label = paste0(round(count_distr2*100, 1), "%, ", count_num) ), vjust = 0.5)
if (show_lineval) p_bin = p_bin + geom_text(data=dat, aes(x = rowid, y = lin_val2, label = lin_val_label), colour = line_color, vjust = -0.5)
if (!is.null(bar_color)) p_bin = p_bin + scale_fill_manual(values= bar_color)
return(p_bin)
}
#' WOE Binning Visualization
#'
#' \code{woebin_plot} create plots of count distribution and positive probability for each bin. The binning informations are generates by \code{woebin}.
#'
#' @name woebin_plot
#' @param bins A list of data frames. Binning information generated by \code{woebin}.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y are counted as x variables.
#' @param title String added to the plot title. Defaults to NULL.
#' @param show_iv Logical. Defaults to TRUE, which means show information value in the plot title.
#' @param line_value The value displayed as line. Accepted values are 'posprob' and 'woe'. Defaults to positive probability.
#' @param ... Additional parameters
#'
#' @return A list of binning graphics.
#'
#' @seealso \code{\link{woebin}}, \code{\link{woebin_ply}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # Load German credit data
#' data(germancredit)
#'
#' # Example I
#' bins1 = woebin(germancredit, y="creditability", x="credit.amount")
#'
#' p1 = woebin_plot(bins1)
#' print(p1)
#'
#' # modify line value
#' p1_w = woebin_plot(bins1, line_value = 'woe')
#' print(p1_w)
#'
#' # modify colors
#' p1_c = woebin_plot(bins1, line_color='#FC8D59', bar_color=c('#FFFFBF', '#99D594'))
#' print(p1_c)
#'
#' # show iv, line value, bar value
#' p1_iv = woebin_plot(bins1, show_iv = FALSE)
#' print(p1_iv)
#' p1_lineval = woebin_plot(bins1, show_lineval = FALSE)
#' print(p1_lineval)
#' p1_barval = woebin_plot(bins1, show_barval = FALSE)
#' print(p1_barval)
#'
#' \donttest{
#' # Example II
#' bins = woebin(germancredit, y="creditability")
#' plotlist = woebin_plot(bins)
#' print(plotlist$credit.amount)
#'
#' # # save binning plot
#' # for (i in 1:length(plotlist)) {
#' # ggplot2::ggsave(
#' # paste0(names(plotlist[i]), ".png"), plotlist[[i]],
#' # width = 15, height = 9, units="cm" )
#' # }
#' }
#'
#' @import data.table ggplot2
#' @export
#'
woebin_plot = function(bins, x=NULL, title=NULL, show_iv = TRUE, line_value = 'posprob', ...) {
# global variables or functions
variable = NULL
xs = x
# kwargs = list(...)
# line value
if (!(line_value %in% c('posprob', 'woe'))) line_value = 'posprob'
# line bar colors
# line_color = kwargs[['line_color']]
# if (is.null(line_color)) line_color = 'blue'
# bar_color = kwargs[['bar_color']]
# bins # if (is.list(bins)) rbindlist(bins)
bins = check_bincard(bins)
# x variable names
if (is.null(xs)) xs = bins[,unique(variable)]
# plot export
plotlist = list()
for (i in xs) plotlist[[i]] = plot_bin(bins[variable==i], title, show_iv, line_value = line_value, ...)
return(plotlist)
}
# print basic information in woebin_adj
#' @import cli
woebin_adj_print_basic_info = function(dt, y, xs_adj, i, bins, bins_breakslist, ...) {
x_i = xs_adj[i]
xs_len = length(xs_adj)
variable = x_breaks = NULL
cli::cat_rule(sprintf("%s/%s %s (%s)", i, xs_len, x_i, class(dt[[x_i]])), col = 'cyan')
## class
# cat(paste0("> class(",x_i,"): "),"\n",class(dt[[x_i]]),"\n","\n")
## summary
cli::cat_line(sprintf("> summary(%s)", x_i))
# cat(paste0("> summary(",x_i,"): "),"\n")
print(summary(dt[[x_i]]))
cat("\n")
## table
if (length(table(dt[[x_i]])) < 10 || !is.numeric(dt[[x_i]])) {
cli::cat_line(sprintf("> table(%s)", x_i))
# cat(paste0("> table(",x_i,"): "))
print(table(dt[[x_i]]))
cat("\n")
} else {
if ( is.numeric(dt[[x_i]])) {
ht = hist(dt[[x_i]], plot = FALSE)
plot(ht, main = x_i, xlab = NULL)
}
}
## current breaks
breaks_bin = bins_breakslist[variable == x_i, x_breaks]
# cat("> Current breaks: \n", breaks_bin,"\n \n")
cli::cat_line("> Current breaks")
cat(breaks_bin,"\n \n")
## woebin plotting
brklst = list()
brklst[x_i] = list(brk_txt2vector(breaks_bin))
plist = woebin_plot(woebin(dt, y = y, x=x_i, breaks_list = brklst, print_info = FALSE, no_cores = 1, ...), ...)
print(plist[[1]])
}
# plot adjusted binning in woebin_adj
woebin_adj_break_plot = function(dt, y, x_i, breaks, bin_close_right, ...) {
# stop_limit, svlst, method,
bin_adj = brk_lst = spc_val = NULL
brk_lst[x_i] = list(brk_txt2vector(breaks))
# if (stop_limit != 'N') stop_limit = 0.1
# text_woebin = paste0("bin_adj=woebin(dt[,c(\"",x_i,"\",\"",y,"\"),with=F], \"",y,"\", breaks_list=list(",x_i,"=c(",breaks,")), special_values =list(",x_i,"=c(", sv_i, ")), ", ifelse(stop_limit=="N","stop_limit = \"N\", ",NULL), "print_step=0L, print_info=FALSE, method=\"",method,"\")")
# eval(parse(text = text_woebin))
bin_adj = woebin(dt = dt, y = y, x = x_i, breaks_list = brk_lst, print_step = 0L, print_info=FALSE, no_cores = 1, ...)
## print adjust breaks
breaks_bin = setdiff(sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin_adj[[1]]$bin), c("-Inf","Inf","missing"))
breaks_bin = paste0(paste0("\"",breaks_bin,"\""), collapse=", ")
# ifelse(
# is.numeric(dt[[x_i]]),
# paste0(breaks_bin, collapse=", "),
# paste0(paste0("\"",breaks_bin,"\""), collapse=", "))
# cat("> Current breaks: ","\n",breaks_bin,"\n","\n")
cli::cat_line("> Current breaks")
cat(breaks_bin,"\n \n")
# print bin_adj
print(woebin_plot(bin_adj, ...)[[1]])
# # breaks
# if (breaks == "" || is.null(breaks))
breaks = breaks_bin
return(breaks)
}
#' WOE Binning Adjustment
#'
#' \code{woebin_adj} interactively adjust the binning breaks.
#'
#' @param dt A data frame.
#' @param y Name of y variable.
#' @param bins A list of data frames. Binning information generated from \code{woebin}.
#' @param breaks_list List of break points, Defaults to NULL. If it is not NULL, variable binning will based on the provided breaks.
#' @param adj_all_var Logical, whether to show variables have monotonic woe trends. Defaults to TRUE
#' @param to Adjusting bins into breaks_list or bins_list. Defaults to breaks_list.
#' @param save_as A string. The file name to save breaks_list. Defaults to None.
#' @param ... Additional parameters.
#'
#' @return A list of modified break points of each x variables.
#'
#' @seealso \code{\link{woebin}}, \code{\link{woebin_ply}}, \code{\link{woebin_plot}}
#'
#' @examples
#' \dontrun{
#' # Load German credit data
#' data(germancredit)
#'
#' # Example I
#' dt = germancredit[, c("creditability", "age.in.years", "credit.amount")]
#' bins = woebin(dt, y="creditability")
#' breaks_adj = woebin_adj(dt, y="creditability", bins)
#' bins_final = woebin(dt, y="creditability",
#' breaks_list=breaks_adj)
#'
#' # Example II adjust two variables' breaks in brklst
#' binsII = woebin(germancredit, y="creditability", save_as = 'breaks')
#' brklst = source('breaks.R')$value
#' # update break list file
#' brklst_adj = woebin_adj(germancredit, "creditability", binsII[1:2],
#' breaks_list = brklst, save_as = 'breaks')
#' }
#'
#' @import data.table
#' @importFrom utils menu
#' @importFrom graphics hist plot
#' @export
woebin_adj = function(dt, y, bins, breaks_list=NULL, adj_all_var=TRUE, to='breaks_list', save_as=NULL, ...) {
# global variables or functions
. = V1 = posprob = posprob_chg = bin2 = bin = bin_adj = count_distr = variable = x_breaks = x_class = NULL
dt = setDT(copy(dt))
# bins # if (is.list(bins)) rbindlist(bins)
bins = check_bincard(bins)
# args
kwargs = list(...)
# count_distr_limit
count_distr_limit = kwargs$count_distr_limit
if (is.null(count_distr_limit)) count_distr_limit = 0.05
# save_breaks_list
save_breaks_list = kwargs[['save_breaks_list']]
if (!is.null(save_breaks_list)) save_as = save_breaks_list
# x variables
xs_all = bins[,unique(variable)]
if (adj_all_var == FALSE) {
xs_adj = unique(c(
bins[count_distr < count_distr_limit, unique(variable)],
bins[
bin != "missing"
][, posprob_chg := posprob >= shift(posprob, type = "lag"), by=variable
][!is.na(posprob_chg), length(unique(posprob_chg)), by=variable
][V1 > 1, variable]
))
} else {
xs_adj = xs_all
}
# length of adjusting variables
xs_len = length(xs_adj)
# to
to = match.arg(to, c('breaks_list', 'bins_list'))
# bin_close_right
bin_close_right = check_bcr(bins)
# bin_close_right = getarg('bin_close_right')
# breakslist of bins
bins_breakslist = bins2binbrklst(bins, dt, breaks_list = breaks_list, bin_close_right=bin_close_right)
# loop on adjusting variables
if (xs_len == 0) {
warning("The binning breaks of all variables are perfect according to default settings.")
brklst_char = binbrklst2txt(bins_breakslist, bin_close_right = bin_close_right)
return(brklst_char)
}
i = 1
breaks_list = NULL
while (i <= xs_len) {
# x variable
breaks = NULL
x_i = xs_adj[i]
# basic information of x_i variable ------
woebin_adj_print_basic_info(dt, y, xs_adj, i, bins, bins_breakslist, ...)
# adjusting breaks ------
adj_brk = menu2(choices = c("next", "yes", "back"), title=paste0("> Adjust breaks for (", i, "/", xs_len, ") ", x_i, "?"))
while (adj_brk == 2 || adj_brk == 'save') {
if (adj_brk == 2) {
# modify breaks adj_brk == 2
breaks = readline("> Enter modified breaks: ")
breaks = gsub("^[,\\.]+|[,\\.]+$", "", breaks)
# if (breaks == "N") {
# stp_lmt = "N"
# breaks = NULL
# }
breaks <- try(woebin_adj_break_plot(dt, y, x_i, breaks, bin_close_right = bin_close_right, ...), silent = TRUE)
} else { # adj_brk == 'save'
# go next adj_brk == 1
if (!(is.null(breaks) || breaks == "")) bins_breakslist = bins_breakslist[variable == x_i, x_breaks := breaks]
brklst_save(
binbrklst2txt(bins_breakslist, header = TRUE, bin_close_right = bin_close_right),
save_as = save_as, ...
)
}
adj_brk = menu2(c("next", "yes", "back"), title=paste0("> Adjust breaks for (", i, "/", xs_len, ") ", x_i, "?"))
}
if (adj_brk == 3) {
# go back
i = ifelse(i > 1, i-1, i)
} else if (adj_brk == 1) {
# go next
if (!(is.null(breaks) || breaks == "")) bins_breakslist = bins_breakslist[variable == x_i, x_breaks := breaks]
i = i + 1
} else if (grepl('^go[1-9][0-9]*$', adj_brk)) {
# go x
i = as.integer(sub('go','', adj_brk))
}
}
# cat(sprintf("options(scorecard.bin_close_right = %s) \n", bin_close_right))
brklst_char = binbrklst2txt(bins_breakslist, bin_close_right = bin_close_right)
# cat(brklst_char,"\n")
if (!is.null(save_as)) brklst_save(
binbrklst2txt(bins_breakslist, header = TRUE, bin_close_right = bin_close_right),
save_as = save_as, ...
)
if (to == 'breaks_list') {
return(brklst_char)
} else if (to == 'bins_list') {
bins = woebin(dt, y, x = xs_all, breaks_list = brklst_char, ...)
return(bins)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.