#' Scorecard Modeling Report
#'
#' \code{report} creates a scorecard modeling report and save it as a xlsx file.
#'
#' @param dt A data frame or a list of data frames that have both x (predictor/feature) and y (response/label) variables. If there are multiple data frames are provided, only the first data frame would be used for training, and the others would be used for testing/validation.
#' @param y Name of y variable.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y are counted as x variables.
#' @param breaks_list A list of break points. It can be extracted from \code{woebin} and \code{woebin_adj} via the argument save_breaks_list.
#' @param x_name A vector of x variables' name.
#' @param special_values The values specified in special_values will be in separate bins. Defaults to NULL.
#' @param seed A random seed to split input data frame. Defaults to 618. If it is NULL, input dt will not split into two datasets.
#' @param save_report The name of xlsx file where the report is to be saved. Defaults to 'report'.
#' @param positive Value of positive class, default "bad|1".
#' @param ... Additional parameters.
#'
#' @examples
#' \dontrun{
#' data("germancredit")
#'
#' y = 'creditability'
#' x = c(
#' "status.of.existing.checking.account",
#' "duration.in.month",
#' "credit.history",
#' "purpose",
#' "credit.amount",
#' "savings.account.and.bonds",
#' "present.employment.since",
#' "installment.rate.in.percentage.of.disposable.income",
#' "personal.status.and.sex",
#' "property",
#' "age.in.years",
#' "other.installment.plans",
#' "housing"
#' )
#'
#' special_values=NULL
#' breaks_list=list(
#' status.of.existing.checking.account=c("... < 0 DM%,%0 <= ... < 200 DM",
#' "... >= 200 DM / salary assignments for at least 1 year", "no checking account"),
#' duration.in.month=c(8, 16, 34, 44),
#' credit.history=c(
#' "no credits taken/ all credits paid back duly%,%all credits at this bank paid back duly",
#' "existing credits paid back duly till now", "delay in paying off in the past",
#' "critical account/ other credits existing (not at this bank)"),
#' purpose=c("retraining%,%car (used)", "radio/television",
#' "furniture/equipment%,%domestic appliances%,%business%,%repairs",
#' "car (new)%,%others%,%education"),
#' credit.amount=c(1400, 1800, 4000, 9200),
#' savings.account.and.bonds=c("... < 100 DM", "100 <= ... < 500 DM",
#' "500 <= ... < 1000 DM%,%... >= 1000 DM%,%unknown/ no savings account"),
#' present.employment.since=c("unemployed%,%... < 1 year", "1 <= ... < 4 years",
#' "4 <= ... < 7 years", "... >= 7 years"),
#' installment.rate.in.percentage.of.disposable.income=c(2, 3),
#' personal.status.and.sex=c("male : divorced/separated", "female : divorced/separated/married",
#' "male : single", "male : married/widowed"),
#' property=c("real estate", "building society savings agreement/ life insurance",
#' "car or other, not in attribute Savings account/bonds", "unknown / no property"),
#' age.in.years=c(26, 28, 35, 37),
#' other.installment.plans=c("bank%,%stores", "none"),
#' housing=c("rent", "own", "for free")
#' )
#'
#' # Example I
#' # input dt is a data frame
#' # split input data frame into two
#' report(germancredit, y, x, breaks_list, special_values, seed=618, save_report='report1',
#' show_plot = c('ks', 'lift', 'gain', 'roc', 'lz', 'pr', 'f1', 'density'))
#'
#' # donot split input data
#' report(germancredit, y, x, breaks_list, special_values, seed=NULL, save_report='report2')
#'
#' # Example II
#' # input dt is a list
#' # only one dataset
#' report(list(dt=germancredit), y, x,
#' breaks_list, special_values, seed=NULL, save_report='report3')
#'
#' # multiple datasets
#' report(list(dt1=germancredit[sample(1000,500)],
#' dt2=germancredit[sample(1000,500)]), y, x,
#' breaks_list, special_values, seed=NULL, save_report='report4')
#'
#' # multiple datasets
#' report(list(dt1=germancredit[sample(1000,500)],
#' dt2=germancredit[sample(1000,500)],
#' dt3=germancredit[sample(1000,500)]), y, x,
#' breaks_list, special_values, seed=NULL, save_report='report5')
#'
#' }
#'
#' @import openxlsx
#' @importFrom stats as.formula glm predict.glm cor
#' @export
report = function(dt, y, x, breaks_list, x_name = NULL, special_values=NULL, seed=618, save_report='report', positive='bad|1', ...) {
# info_value = gvif = . = variable = bin = woe = points = NULL
. = bin = gvif = info_value = points = variable = woe = points = name = x1 = NULL
kwargs = list(...)
# x name
if (!is.null(x_name)) {
if (inherits(x_name, 'character')) {
x_name = setDT(list(variable=x, name = x_name))
} else if (inherits(x_name, 'data.frame')) {
x_name = setDT(x_name)[x, on='variable']
}
}
# data list
dat_lst = list()
if (is.data.frame(dt)) {
if (is.null(seed)) {
dat_lst[['dat']] = setDT(copy(dt))
} else {
dat_lst = split_df(dt, y, seed = seed)
}
} else if ((inherits(dt, 'list') & all(sapply(dt, is.data.frame)))) {
dat_lst = lapply(dt, setDT)
} else {
stop('The input dt should be a data frame, or a list of two data frames.')
}
dat_lst = lapply(dat_lst, function(x) check_y(x, y, positive))
# label list
label_list = lapply(dat_lst, function(x) x[,y,with=FALSE])
# binning
bins_lst = lapply(dat_lst, function(dat) {
suppressWarnings(woebin(dat, y = y, x = x, breaks_list = breaks_list, special_values = special_values, print_info=FALSE, ...))
})
dat_woe_lst = lapply(dat_lst, function(dat) {
woebin_ply(dat, bins_lst[[1]], print_info=FALSE, ...)
})
# fitting
m = glm(as.formula(paste0(y, " ~ .")), family = "binomial",
data = dat_woe_lst[[1]][,c(paste0(x,"_woe"),y),with=F])
pred_lst = lapply(dat_woe_lst, function(d) {
predict.glm(m, newdata = d, type='response')
})
binomial_metric = c("mse", "rmse", "logloss", "r2", "ks", "auc", "gini")
if ('binomial_metric' %in% names(kwargs)) binomial_metric = kwargs$binomial_metric
m_perf = perf_eva(pred = pred_lst, label = label_list, binomial_metric=binomial_metric, confusion_matrix = FALSE, show_plot = NULL)
# scaling
card <- do.call( scorecard, args = c(
list(bins=bins_lst[[1]], model=m),
kwargs[intersect(c('points0', 'odds0', 'pdo', 'basepoints_eq0'), names(kwargs))] ) )
score_lst = lapply(dat_lst, function(x) scorecard_ply(x, card, print_step=0L))
bin_num = ifelse('bin_num' %in% names(kwargs), kwargs$bin_num, 10)
bin_type = ifelse('bin_type' %in% names(kwargs), kwargs$bin_type, 'freq')
gains_tbl = gains_table(score = rbindlist(score_lst), label = rbindlist(label_list), bin_num = bin_num, bin_type=bin_type)
gains_table_cols = c('dataset', 'bin', 'count', 'cumulative count', 'negative', 'positive', 'cumulative negative', 'cumulative positive', 'count distribution', 'positive probability', 'cumulative positive probability', 'rejected rate', 'approval rate')
wb <- createWorkbook()
# dataset information ------
n = 1
cat_bullet(sprintf("sheet%s - dataset information", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="dataset information")
sample_info <- lapply(dat_lst, function(x) {
data.table(`sample size` = nrow(x),
`feature size` = ncol(x)-1,
`positive rate` = sum(x[[y]])/nrow(x))
})
dt_dtinfo = rbindlist(sample_info, idcol = 'dataset')
dt_xdea = describe(rbindlist(dat_lst))
writeData(wb, sheet, dt_dtinfo, startRow=1, startCol=1, colNames=T)
writeData(wb, sheet, dt_xdea, startRow=nrow(dt_dtinfo)+4, startCol=1, colNames=T)
# model performance ------
n = n+1
cat_bullet(sprintf("sheet%s - model performance", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="model performance")
eva_tbl = rbindlist(m_perf$binomial_metric, idcol = 'dataset')
writeData(wb, sheet, eva_tbl, startRow=1, startCol=1, colNames=T)
show_plot = c("ks","lift")
if ('show_plot' %in% names(kwargs)) show_plot = kwargs$show_plot
perf_eva(pred = pred_lst, label = label_list, confusion_matrix = FALSE, binomial_metric = NULL, show_plot = show_plot)$pic
Sys.sleep(2)
plot_ncol = ceiling(sqrt(length(show_plot)))
plot_nrow = ceiling(length(show_plot)/plot_ncol)
insertPlot(wb, sheet, width = 8*plot_ncol, height = 7*plot_nrow, xy = NULL, startRow = nrow(eva_tbl)+4, startCol = 1, fileType = "png", units = "cm")
# model coefficients ------
n = n+1
cat_bullet(sprintf("sheet%s - model coefficients", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="model coefficients")
dt_vif = vif(m, merge_coef = TRUE)[, gvif := round(gvif, 4)]
dt_iv = iv(dat_woe_lst[[1]][,c(paste0(x,"_woe"), y),with=FALSE], y, order = FALSE)[, info_value := round(info_value, 4)]
# dt_mr = data.table(variable=paste0(x,'_woe'), missing_rate=dat_lst[[1]][,x,with=FALSE][, sapply(.SD, function(x) sum(is.na(x))/.N)])
sum_tbl = Reduce(
function(x,y) merge(x,y, all.x=TRUE, by='variable', sort=FALSE), list(dt_vif, dt_iv, copy(dt_xdea)[, variable := paste0(variable, '_woe')])
)[, variable := sub('_woe$', '', variable)]
if (!is.null(x_name)) sum_tbl = merge(x_name, sum_tbl, by = 'variable', sort = FALSE, all = TRUE)[c(.N, seq_len(.N-1)),]
writeData(wb,sheet, sprintf('Model coefficients based on %s dataset', names(dat_lst)[1]), startRow=1, startCol=1, colNames=F)
writeData(wb,sheet, sum_tbl, startRow=2, startCol=1, colNames=T)
# correlation
dtcor1 = rbindlist(lapply(dat_woe_lst[1], function(d) cor2(d,paste0(x,'_woe'), uptri = FALSE)))[, x1 := sub('_woe$', '', x1)]
setnames(dtcor1, 'x1', 'variable')
if (!is.null(x_name)) dtcor1 = merge(x_name, dtcor1, by = 'variable', sort = FALSE, all = TRUE)[]
writeData(wb,sheet, 'Correlation of variables in woe values', startRow=5+nrow(sum_tbl), startCol=1, colNames=F)
writeData(wb,sheet, dtcor1, startRow=6+nrow(sum_tbl), startCol=1, colNames=T)
# dtcor2 = rbindlist(lapply(dat_lst[1], function(d) cor2(d,x, uptri = TRUE, diag = TRUE)))
# setnames(dtcor2, 'x1', 'variable')
# if (!is.null(x_name)) dtcor2 = merge(x_name, dtcor2, by = 'variable', sort = FALSE, all = TRUE)[]
# writeData(wb,sheet, 'Correlation of numeric variables', startRow=9+nrow(sum_tbl)+nrow(dtcor1), startCol=1, colNames=F)
# writeData(wb,sheet, dtcor2, startRow=10+nrow(sum_tbl)+nrow(dtcor1), startCol=1, colNames=T)
# variable binning ------
n = n+1
cat_bullet(sprintf("sheet%s - variable woe binning", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="variable woe binning")
names_dat = names(dat_lst)
for (i in seq_along(names_dat)) {
di = names_dat[i]
# title row
writeData(wb,sheet, sprintf('graphics of %s dataset', di), startRow=1, startCol=7*(i-1)+1, colNames=F)
writeData(wb, sheet, sprintf('binning of %s dataset', di), startRow=1, startCol=7*length(names_dat)+1+14*(i-1), colNames=F)
# binning
binning_df = rbindlist(bins_lst[[i]])
if (!is.null(x_name)) binning_df = merge(x_name, binning_df, by = 'variable', sort=FALSE, all = TRUE)
writeData(wb,sheet, binning_df,
startRow=2, startCol=7*length(names_dat)+1+14*(i-1), colNames=T)
}
# plots
for (i in seq_along(names_dat)) {
di = names_dat[i]
plist = woebin_plot(bins_lst[[di]], title = di, ...)
for (j in seq_along(x)) {
if (!is.null(x_name)) writeData(wb,sheet, x_name[variable == x[j], name], startRow = (j-1)*15+3, startCol = 7*(i-1)+1, rowNames = FALSE)
print(plist[[j]])
insertPlot(wb, sheet, width = 12, height = 7, xy = NULL,
startRow = (j-1)*15+4, startCol = 7*(i-1)+1,
fileType = "png", units = "cm")
}
}
# scorecard ------
n = n+1
cat_bullet(sprintf("sheet%s - scorecard", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="scorecard")
odds0 = ifelse('odds0' %in% names(kwargs), kwargs$odds0, 1/19)
points0 = ifelse('points0' %in% names(kwargs), kwargs$points0, 600)
pdo = ifelse('pdo' %in% names(kwargs), kwargs$pdo, 50)
# add scorecard scaling rule
writeData(wb,sheet, "scorecard scaling", startCol=1, startRow=1, colNames=F)
writeData(wb,sheet, data.table( c("Target Odds", "Target Points", "Points to Double the Odds"), c(odds0, points0, pdo) ), startCol=1, startRow=2, colNames=F)
# add scorecard datatable
writeData(wb,sheet, "scorecard", startCol=1, startRow=7, colNames=F)
card_df = rbindlist(card, fill = T)[,.(variable, bin, woe, points)]
if (!is.null(x_name)) card_df = merge(x_name, card_df, by = 'variable', sort=FALSE, all = TRUE)[c(.N, seq_len(.N-1)),]
writeData(wb,sheet, card_df, startCol=1, startRow=8, colNames=T)
# population stability ------
if (length(dat_lst) > 1) {
n = n+1
cat_bullet(sprintf("sheet%s - population stability", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="population stability")
m_psi = perf_psi(score = score_lst, label = label_list, return_distr_dat = TRUE)
# table in equal width
psi_tbl = m_psi$dat[[1]]
setnames(psi_tbl, gains_table_cols)
writeData(wb, sheet, psi_tbl, startCol=1, startRow=1, colNames=T)
# pic
for (i in seq_len(length(dat_lst)-1)) {
if (length(dat_lst)>2) {
print(m_psi$pic$score[[i]])
} else print(m_psi$pic$score)
Sys.sleep(2)
insertPlot(wb, sheet, width = 16, height = 7, xy = NULL, startRow=nrow(psi_tbl)+4+15*(i-1), startCol=1, fileType="png", units= "cm")
}
}
# gains table ------
n = n+1
cat_bullet(sprintf("sheet%s - gains table", n), bullet = "tick", bullet_col = "green", col = 'grey')
sheet <- addWorksheet(wb, sheetName="gains table")
setnames(gains_tbl, gains_table_cols)
writeData(wb, sheet, gains_tbl, startCol=1, startRow=1, colNames=T)
# saving workbook ------
report_name = sprintf('%s_%s.xlsx', save_report, format(Sys.time(),"%Y%m%d_%H%M%S"))
saveWorkbook(wb, report_name, overwrite=TRUE)
cli_inform(c(i = sprintf('The report is saved as %s', report_name)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.