# evaluation metric ------
# dt = data.table(label=label, pred=pred)[!(is.na(label) | is.na(pred))]
## mean squared error, MSE
mse = function(dt, ...) {
label = pred = NULL
setDT(dt)
return(dt[,mean((label-pred)^2)])
}
## root mean squared error, RMSE
rmse = function(dt, ...) {
setDT(dt)
return(sqrt(mse(dt)))
}
## mean absolute error, MAE
mae = function(dt, ...) {
label = pred = NULL
setDT(dt)
return(dt[,mean(abs(label-pred))])
}
## mean square logarithmic error, MSLE
msle = function(dt, ...) {
label = pred = NULL
setDT(dt)
return(dt[,mean((log1p(label)-log1p(pred))^2)]) # log1p = log(1+x)
}
## root mean square logarithmic error, RMSLE
rmsle = function(dt, ...) {
setDT(dt)
return(sqrt(msle(dt)))
}
## coefficient of determination, R2
r2 = function(dt, ...) {
label = pred = NULL
setDT(dt)
# total sum of squares, SST
sst = dt[, sum((label-mean(label))^2)]
## regression sum of squares, SSR
sse = dt[, sum((label-pred)^2)]
# ## error sum of squares, SSE
# ssr = dt[, sum((pred-mean(label))^2)]
return(1-sse/sst)
}
## log loss for binary classification
## http://wiki.fast.ai/index.php/Log_Loss
logloss = function(dt, ...) {
ll = label = pred = NULL
setDT(dt)
return(dt[,ll:=label*log(pred)+(1-label)*log(1-pred)][,mean(-ll)])
}
## aera under curve (ROC), AUC
auc = function(dt_ev, ...) {
. = FPR = TPR = NULL
rbind(dt_ev[,.(FPR,TPR)], data.table(FPR=0:1,TPR=0:1), fill=TRUE
)[order(FPR,TPR)
][, sum((TPR+shift(TPR, fill=0, type="lag"))/2*(FPR-shift(FPR, fill=0, type="lag")))]
}
## Gini
gini = function(dt_ev, ...) {
2*auc(dt_ev)-1
}
## KS
ks = function(dt_ev, ...) {
TN = totN = FN = totP = NULL
dt_ev[, max(abs(TN/totN - FN/totP))]
}
## Lift
lift = function(dt_ev, threshold = 0.5, ...) {
precision = totP = totN = cumpop = NULL
# posrate of rejected sample / of overalll
# threshold: approval rate
dt_ev[, lift := precision/(totP/(totP+totN))][][cumpop >= 1-threshold,][1,lift]
}
# optimal cutoff ------
# ks
cutoff_ks = function(dt_ev, pred_desc=FALSE) {
setDT(dt_ev)
TN = totN = FN = totP = . = cumpop = pred = NULL
rt = dt_ev[, ks := abs(TN/totN - FN/totP)
][ks == max(ks, na.rm = TRUE)
][,.(cumpop, pred, ks)]
if (pred_desc) rt[.N] else rt[1]
}
# roc
cutoff_roc = function(dt_ev, pred_desc=FALSE) {
setDT(dt_ev)
. = cumpop = pred = TPR = FPR = co = NULL
rt = dt_ev[, .(cumpop,pred,TPR,FPR)
][, co := (TPR-FPR)^2/2
][co == max(co, na.rm = TRUE)
][, .(cumpop, pred, TPR, FPR)]
if (pred_desc) rt[.N] else rt[1]
}
# fbeta
cutoff_fbeta = function(dt_ev, beta=1, pred_desc=FALSE, ...) {
setDT(dt_ev)
. = cumpop = pred = precision = recall = f = NULL
rt = dt_ev[, .(cumpop, pred,precision,recall)
][, f := 1/(1/(1+beta^2)*(1/precision+beta^2/recall))
][which.max(f)]
if (pred_desc) rt[.N] else rt[1]
setnames(rt, c('cumpop', 'pred', 'precision', 'recall', paste0('f',beta)))
}
## confusion matrix
confusionMatrix = function(dt, threshold=0.5, ...) {
pred_label = pred = . = label = error = pred_1 = pred_0 = NULL
setDT(dt)
# data of actual and predicted label
dt_alpl = dt[, pred_label := pred >= threshold][, .N, keyby = .(label, pred_label)][, pred_label := paste0('pred_', as.integer(pred_label))]
if (length(table(dt_alpl$pred_label)) == 1) return(NULL)
# confusion matrix
cm = dcast(
dt_alpl, label ~pred_label, value.var = 'N'
)[1, error := pred_1/sum(pred_1+pred_0)
][2, error := pred_0/sum(pred_1+pred_0)]
# total row
cm = rbind(cm, data.table(label = 'total', t(colSums(cm[,-1]))), fill=TRUE)
cm[3, error := (cm[1,3]+cm[2,2])/sum(cm[3,2]+cm[3,3])]
return(cm)
}
# renamed as perf_eva
#
# The function perf_plot has renamed as perf_eva.
#
# @param label Label values, such as 0s and 1s, 0 represent for negative and 1 for positive.
# @param pred Predicted probability values.
# @param title Title of plot, default "train".
# @param groupnum The group number when calculating positive probability, default NULL.
# @param type Types of performance plot, such as "ks", "lift", "roc", "pr". Default c("ks", "roc").
# @param show_plot Logical value, default TRUE. It means whether to show plot.
# @param seed An integer. The specify seed is used for random sorting data, default: 186.
perf_plot = function(label, pred, title=NULL, groupnum=NULL, type=c("ks", "roc"), show_plot=TRUE, seed=618) {
stop("This function has renamed as perf_eva.")
}
# plot ------
#Our transformation function
fmt_dcimals <- function(x) sprintf("%.1f", x)
number_ticks <- function(n) {function(limits) pretty(limits, n)}
#' @importFrom stats density
plot_density = function(dt_lst, title=NULL, positive, ...) {
.=datset=dens=label=label_str=max_dens=pred=NULL
dt_df = rbindlist(dt_lst, idcol = 'datset')[, label := factor(label)]
# max pred
if (dt_df[,mean(pred) < -1]) dt_df[, pred := abs(pred)]
max_pred = dt_df[,max(pred)]
# if (max_pred < 1) max_pred = 1
min_pred = dt_df[,min(pred)]
# if (max_pred == 1) min_pred = 0
# data frame of max density by datset and label
max_density_by_datset_label = dt_df[
, .(pred=density(pred)$x, dens=density(pred)$y), by=c('datset','label')
][, max_dens := max(dens), by=c('datset','label')
][ dens == max_dens
][, max_dens := NULL]
# max density
max_density = max_density_by_datset_label[, ceiling2(max(dens))]
# coord for label_string
coord_label = max_density_by_datset_label[
, .(pred=mean(pred), dens=mean(dens)), by=label
][, label_str := ifelse(grepl(positive, label), 'Pos', 'Neg')]
# plot
pdens = ggplot(data = dt_df) +
# geom_histogram(aes(x=pred)) +
# geom_density(aes(x=pred), linetype='dotted') +
geom_density(aes(x=pred, #y=..scaled..,
color=datset, linetype=label), fill='gray', alpha=0.1) +
geom_text(data = coord_label, aes(x=pred, y=dens, label=label_str)) +
# geom_vline(xintercept = threshold, linetype='dotted') +
# geom_text(aes(label='cut-off', x=threshold, y = 0), vjust=0) +
guides(linetype="none", color=guide_legend(title=NULL)) +
theme_bw() +
theme(legend.position=c(1,1),
legend.justification=c(1,1),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines'))
# axis, labs
pdens = pdens + ggtitle(paste0(title, 'Density')) +
labs(x = "Prediction", y = "Density") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(ratio = (max_pred-min_pred)/(max_density), xlim = c(min_pred,max_pred), ylim = c(0,max_density), expand = FALSE)
return(pdens)
}
plot_ks = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
. = datset = KS = metrics = pred_threshold = coord = maxks = oc = pred = cumpop = cumneg = cumpos = nN = nP = NULL
# data for ks plot
dt_ks = lapply(dat_eva_lst, function(x) {
TN = totN = FN = totP = NULL
x = x[, .(
cumpop, pred, cumneg = cumsum(nN)/totN, cumpos = cumsum(nP)/totP
)][, ks := abs(cumneg - cumpos)]
x = rbind(x, data.table(cumpop=0), fill=TRUE)
x[is.na(x)] <- 0
return(x[order(cumpop)])
})
dt_ks = merge(
rbindlist(dt_ks, fill = TRUE, idcol = 'datset'),
merge(rbindlist(pm, idcol = 'datset')[,.(datset,maxks=KS)],
rbindlist(co, idcol = 'datset')[metrics == 'ks',.(datset, pred_threshold,coord)], by = 'datset'),
by = 'datset', all.x = TRUE
)[, datset := sprintf('%s, KS=%.4f\np=%.2f, %s', format(datset), round(maxks,4), abs(pred_threshold), coord)][]
# max ks row
dfks = dt_ks[, .SD[ks == max(ks)][1], by = 'datset'][, oc := sprintf('@%.4f', round(pred,4))][]#[, oc := sprintf('%.4f\n(%.4f,%.4f)', round(pred,4), round(cumpop,4), round(ks,4))]
x_posi = 0.4
x_neg = 0.95
if (dt_ks[, mean(cumpos) < mean(cumneg)]) {
x_neg = 0.4
x_posi = 0.95
}
# plot
pks = ggplot(data = dt_ks, aes(x=cumpop)) +
geom_line(aes(y=cumneg, color=datset), linetype='dotted') +
geom_line(aes(y=cumpos, color=datset), linetype='dotted') +
geom_line(aes(y=ks, color=datset)) +
geom_segment(data = dfks, aes(x = cumpop, y = 0, xend = cumpop, yend = ks, color=datset), linetype = "dashed") +
geom_point(data = dfks, aes(x=cumpop, y=ks), color='red') +
# geom_text(data = dfks, aes(x=cumpop, y=ks, label=oc, color=datset), vjust=0) +
annotate("text", x=x_posi, y=0.7, vjust = -0.2, label="Pos", colour = "gray") +
annotate("text", x=x_neg, y=0.7, vjust = -0.2, label="Neg", colour = "gray") +
theme_bw() +
theme(legend.position=c(0,1),
legend.justification=c(0,1),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL))
# axis, labs, theme
pks = pks + ggtitle(paste0(title, 'K-S')) +
labs(x = "% of population", y = "% of total Neg/Pos") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
return(pks)
}
plot_lift = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
cumpop = datset = NULL
dt_lift = lapply(dat_eva_lst, function(x) {
. = precision = totP = totN = NULL
x = x[, .(cumpop, lift = precision/(totP/(totP+totN)))]
x = rbind(x, data.table(cumpop=0), fill=TRUE)
return(x[order(cumpop)])
})
dt_lift = rbindlist(dt_lift, fill = TRUE, idcol = 'datset')
max_lift = dt_lift[, ceiling(max(lift,na.rm = TRUE))]
legend_xposition = 0
if (dt_lift[cumpop<0.5,mean(lift, na.rm = TRUE)] > dt_lift[cumpop>0.5,mean(lift, na.rm = TRUE)]) legend_xposition = 1
# plotting
plift = ggplot(data = dt_lift, aes(x=cumpop, color = datset)) +
geom_line(aes(y = lift), na.rm = TRUE) +
theme_bw() +
theme(legend.position=c(legend_xposition,1),
legend.justification=c(legend_xposition,1),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL))
# axis, labs, theme
plift = plift +
ggtitle(paste0(title, 'Lift')) +
labs(x = "% of population", y = "Lift") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(ratio = 1/(max_lift-1),xlim = c(0,1), ylim = c(1,max_lift), expand = FALSE)
return(plift)
}
plot_gain = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
. = cumpop = datset = precision = NULL
dt_gain = lapply(dat_eva_lst, function(x) {
x = x[, .(cumpop, precision)]
x = rbind(x, data.table(cumpop=0), fill=TRUE)
return(x[order(cumpop)])
})
dt_gain = rbindlist(dt_gain, fill = TRUE, idcol = 'datset')
# max_ppv = dt_gain[, ceiling2(max(precision,na.rm = TRUE))]
legend_xposition = 0
if (dt_gain[cumpop<0.1,mean(precision, na.rm = TRUE)] > dt_gain[cumpop>0.9,mean(precision, na.rm = TRUE)]) legend_xposition = 1
# plotting
pgain = ggplot(data = dt_gain, aes(x=cumpop, color = datset)) +
geom_line(aes(y = precision), na.rm = TRUE) +
theme_bw() +
theme(legend.position=c(legend_xposition,1),
legend.justification=c(legend_xposition,1),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL))
# axis, labs, theme
pgain = pgain +
ggtitle(paste0(title, 'Gain')) +
labs(x = "% of population", y = "Precision / PPV") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
# coord_fixed(ratio = 1/(max_ppv),xlim = c(0,1), ylim = c(0,max_ppv), expand = FALSE)
return(pgain)
}
plot_roc = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
. = datset = AUC = metrics = pred_threshold = coord = pred = TPR = FPR = ocFPR = ocTPR = NULL
dt_roc = lapply(dat_eva_lst, function(x) {
x = x[, .(TPR, FPR)]
x = rbind(x, data.table(TPR=0:1, FPR=0:1), fill=TRUE)
return(x[order(TPR, FPR)])
})
# merge with auc
dt_roc = merge(
rbindlist(dt_roc, fill = TRUE, idcol = 'datset'),
merge(rbindlist(pm, idcol = 'datset')[,.(datset,auc=AUC)],
rbindlist(co, idcol = 'datset')[metrics == 'roc',.(datset, pred_threshold,coord)], by = 'datset'),
by = 'datset', all.x = TRUE
)[, datset := sprintf('%s, AUC=%.4f\np=%.2f, %s', format(datset), round(auc,4), abs(pred_threshold), coord)][]
# optimal cutoff
dt_cut = merge(
rbindlist(pm, idcol = 'datset')[,.(datset,auc=AUC)],
rbindlist(lapply(dat_eva_lst, cutoff_roc), idcol = 'datset'), by = 'datset'
)[,.(datset, pred, ocTPR=TPR, ocFPR=FPR)
]#[, oc := sprintf('@%.4f', round(pred,4))]#[, oc := sprintf('%.4f\n(%.4f,%.4f)', round(pred,4), round(ocFPR,4), round(ocTPR,4))]
# plot
proc = ggplot(dt_roc, aes(x=FPR)) +
geom_line(aes(y=TPR, color=datset)) +
geom_line(aes(y=FPR), linetype = "dashed", colour="gray") +
geom_ribbon(aes(ymin=0, ymax=TPR, fill=datset), alpha=0.1) +
geom_point(data = dt_cut, aes(x=ocFPR, y=ocTPR), color='red') +
# geom_text(data = dt_cut, aes(x=ocFPR, y=ocTPR, label=oc, color=datset), vjust=1) +
# geom_segment(aes(x=0, y=0, xend=1, yend=1), linetype = "dashed", colour="red") +
theme_bw() +
theme(legend.position=c(1,0),
legend.justification=c(1,0),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL), fill=FALSE)
# axis, labs, theme
proc = proc + ggtitle(paste0(title, 'ROC')) +
labs(x = "1-Specificity / FPR", y = "Sensitivity / TPR") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
return(proc)
}
plot_pr = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
. = recall = precision = datset = NULL
dt_pr = lapply(dat_eva_lst, function(x) x[, .(recall, precision)][order(precision, recall)])
dt_pr = rbindlist(dt_pr, idcol = 'datset')
# max_ppv = dt_pr[, ceiling2(max(precision,na.rm = TRUE))]
# plot
ppr = ggplot(dt_pr) +
geom_line(aes(x=recall, y=precision, color=datset), na.rm = TRUE) +
geom_line(aes(x=recall, y=recall), na.rm = TRUE, linetype = "dashed", colour="gray") +
# geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), colour = "red", linetype="dashed") +
theme_bw() +
theme(legend.position=c(0,0),
legend.justification=c(0,0),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL))
# axis, labs, theme
ppr = ppr + ggtitle(paste0(title, 'P-R')) +
labs(x = "Recall / TPR", y = "Precision / PPV") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
# coord_fixed(ratio = 1/(max_ppv),xlim = c(0,1), ylim = c(0,max_ppv), expand = FALSE)
return(ppr)
}
plot_lz = function(dat_eva_lst, pm=NULL, co=NULL, title=NULL, ...) {
FN = nP = totP = . = datset = Gini = cumpop = cumposrate = NULL
dt_lz = lapply(dat_eva_lst, function(x) {
x = x[, .(cumpop, cumposrate = cumsum(nP)/totP)]
x = rbind(x, data.table(cumpop=0, cumposrate=0), fill=TRUE)
return(x[order(cumpop)])
})
dt_lz = merge(
rbindlist(dt_lz, fill = TRUE, idcol = 'datset'),
rbindlist(pm, idcol = 'datset')[,.(datset,gini=Gini)], by = 'datset', all.x = TRUE
)[, datset := sprintf('%s, Gini=%.4f', format(datset), round(gini,4))][]
# plot
plz = ggplot(dt_lz, aes(x=cumpop)) +
geom_line(aes(y=cumposrate, color=datset)) +
geom_line(aes(y=cumpop), linetype = "dashed", colour="gray") +
# geom_segment(aes(x=0, y=0, xend=1, yend=1), linetype = "dashed", colour="red") +
geom_ribbon(aes(ymin=cumpop, ymax=cumposrate, fill=datset), alpha=0.1) +
theme_bw() +
theme(legend.position=c(0,1),
legend.justification=c(0,1),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL), fill=FALSE)
# axis, labs, theme
plz = plz + ggtitle(paste0(title, 'Lorenz')) +
labs(x = "% of population", y = "% of total positive") +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
return(plz)
}
plot_f1 = function(dat_eva_lst, pm=NULL, co=NULL, beta=1, title=NULL, ...) {
. = datset = pred_threshold = coord = f1 = cumpop = metrics = NULL
dt_f = lapply(dat_eva_lst, function(x) {
. = cumpop = precision = recall = NULL
x = x[, .(cumpop, f = 1/(1/(1+beta^2)*(1/precision+beta^2/recall)))]
setnames(x, c('cumpop', paste0('f',beta)))
return(x[order(cumpop)])
})
dt_f = merge(
rbindlist(dt_f, fill = TRUE, idcol = 'datset'),
rbindlist(co, idcol = 'datset')[metrics == paste0('max_f',beta),.(datset, pred_threshold,coord)],
by = 'datset', all.x = TRUE
)[, datset := sprintf('%s\np=%.2f, %s', format(datset), abs(pred_threshold), coord)][]
# optimal cutoff
dt_cut = dt_f[, .SD[f1 == max(f1, na.rm = TRUE)][1], by = 'datset']#[, oc := sprintf('%.4f\n(%.4f,%.4f)', round(pred,4), round(cumpop,4), round(ks,4))]
# max_fb = ceiling2(max(dt_f[[paste0('f',beta)]],na.rm = TRUE))
# plot
pf = ggplot(data = dt_f, aes(x=cumpop)) +
geom_line(aes_string(y=paste0('f',beta), color='datset'), na.rm = TRUE) +
geom_point(data = dt_cut, aes_string(x='cumpop', y=paste0('f',beta)), color='red') +
# geom_text(data = dt_cut, aes_string(x='cumpop', y=paste0('f',beta), label='oc', color='datset'), vjust=0) +
geom_segment(data = dt_cut, aes_string(x = 'cumpop', y = 0, xend = 'cumpop', yend = paste0('f',beta), color='datset'), linetype = "dashed") +
theme_bw() +
theme(legend.position=c(1,0),
legend.justification=c(1,0),
legend.background=element_blank(),
legend.key=element_blank(),
legend.key.size = unit(1.5, 'lines')) +
guides(color=guide_legend(title=NULL), fill=FALSE)
# axis, labs, theme
pf = pf + ggtitle(paste0(title, paste0("F",beta))) +
labs(x = "% of population", y = paste0("F",beta)) +
scale_y_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
scale_x_continuous(labels=fmt_dcimals, breaks=number_ticks(5)) +
coord_fixed(xlim = c(0,1), ylim = c(0,1), expand = FALSE)
# coord_fixed(ratio = 1/(max_fb),xlim = c(0,1), ylim = c(0,max_fb), expand = FALSE)
return(pf)
}
# eva ------
# dataset label pred
#' @importFrom stats complete.cases
func_dat_labelpred = function(pred, label, title, positive, seed, ...) {
# convert pred/label into list of list, such as:
# pred = list(datset = list(pred = ...))
lst_pl = list(pred=pred, label=label)
# map on lst_pl
lst_pl2 = Map(function(x, xname) {
if (is.null(x)) return(x)
# if not list, convert into list
if (!inherits(x,'list')) {
x = list(dat=x)
if (!is.null(title)) names(x) <- title
}
# convert the objects in list of pred/label, into lists
x = lapply(x, function(xi) {
xi_lst = list()
if (!is.data.frame(xi)) {
xi_lst[[xname]] = xi
} else {
xi_lst = as.list(xi)
if (xname == 'label') names(xi_lst) = xname
}
return(xi_lst)
})
# return pred/label
return(x)
}, x = lst_pl, xname = names(lst_pl))
# convert list into datatable using setDT()
dt_lst = list()
for (ds in names(lst_pl2$pred)) { # ds = dataset
lst_pred_ds = lst_pl2$pred[[ds]]
lst_label_ds = NULL
if (!is.null(label)) lst_label_ds = lst_pl2$label[[ds]]
if (is.null(lst_label_ds)) lst_label_ds[['label']] = rep_len(NA, unique(sapply(lst_pred_ds, length)))
dt_lst[[ds]] = setDT(c(lst_pred_ds, lst_label_ds))
}
# random sort datatable
dt_lst = lapply(dt_lst, function(x) {
set.seed(seed)
return(x[sample(.N, .N)])
})
# # if pred is score
# if (for_psi) {
# for (pn in setdiff(names(dt_lst[[1]]), 'label')) {
# if ( all(sapply(dt_lst, function(x) mean(x[[pn]],na.rm = TRUE)>1)) ) {
# warning("Since the average of pred is not in [0,1], it is treated as predicted score but not probability.")
# for (ds in names(dt_lst)) {
# dt_lst[[ds]][[pn]] = -dt_lst[[ds]][[pn]]
# }
# }
# }
# }
# check label & remove rows with missing values in pred
dt_lst = lapply(dt_lst, function(x) {
if (!is.null(label)) {
if (length(unique(x$label)) > 5) stop('Please double check the arguments. The position of "label" have exchanged with "pred" to the second argument since version 0.2.0, in order to consistent with perf_psi.')
x = check_y(x, 'label', positive)
}
if (anyNA(x)) {
warning("The NAs in dataset have been removed.")
x = x[complete.cases(copy(x)[,label:=NULL]), ]#x[!is.na(pred)]
}
return(x)
})
return(dt_lst)
}
# dataset evaluation
func_dat_eva = function(dt, groupnum=NULL, pred_desc=FALSE, ...) {
. = label = pred = nP = nN = pred2 = nP2 = nN2 = totP = FN = totN = TN = TP = FP = group = NULL
# pred # P | N
# actual #P # TP | FN
#N # FP | TN
# nP, number of Positive samples in each predicted prob
# nN, number of Negative samples in each predicted prob
# totP, total Positive
# totN, total Negative
# FN, False Negative; cumsum of positive, cumpos
# TN, True Negative; cumsum of negative, cumneg
# TP, True Positive; totP-FN
# FP, False Positive; totN-TN
setDT(dt)
total_num = dt[,.N]
dt_ev = dt[, .(nP = sum(label==1), nN = sum(label==0)), keyby = pred]
if (!(is.null(groupnum) || total_num <= groupnum)) {
dt_ev = dt_ev[
, pred2 := ceiling(cumsum(nP+nN)/(total_num/groupnum))
][,`:=`(
nP2 = sum(nP), nN2 = sum(nN)
), by = pred2
][, .SD[.N], by = pred2
][, .(pred, nP = nP2, nN = nN2)]
}
dt_ev = dt_ev[, `:=`(totP = sum(nP), totN = sum(nN))]
if (pred_desc) {
dt_ev = dt_ev[
order(-pred)
][, `:=`(TP = cumsum(nP), FP = cumsum(nN))
][, `:=`(
FN = totP-TP,
TN = totN-FP,
cumpop = (TP+FP)/(totP+totN)
)]
} else {
dt_ev = dt_ev[
order(pred)
][, `:=`(FN = cumsum(nP), TN = cumsum(nN))
][, `:=`(
TP = totP-FN,
FP = totN-TN,
cumpop = (FN+TN)/(totP+totN)
)]
}
dt_ev = dt_ev[, `:=`(
TPR = TP/totP, FPR = FP/totN,
precision = TP/(TP+FP), recall = TP/totP
)][]
return(dt_ev)
}
# performance metrics
pf_metrics = function(dt_lst, dt_ev_lst, all_metrics, sel_metrics) {
metrics_to_names = function(bm) {
bm = toupper(bm)
if (bm=='LOGLOSS') {
bm = 'LogLoss'
} else if (bm == 'GINI') {
bm = 'Gini'
}
return(bm)
}
# all metrics
all_pm = list()
for (n in names(dt_lst)) {
d1 = dt_lst[[n]]
d2 = dt_ev_lst[[n]]
allpm_list = list()
for (i in all_metrics) {
allpm_list[[metrics_to_names(i)]] = do.call(i, args = list(dt = d1, dt_ev=d2))
}
all_pm[[n]] = setDT(allpm_list)
}
# selected metrics
sel_pm = lapply(all_pm, function(x) x[,sapply(sel_metrics, metrics_to_names),with=FALSE])
return(list(all_pm=all_pm, sel_pm=sel_pm))
}
# optimal cutoffs
pf_cutoffs = function(dt_ev_lst, pred_desc = FALSE) {
. = pred = cumpop = f1 = f2 = FPR = TPR = NULL
co = list()
for (n in names(dt_ev_lst)) {
con = list(
max_f1 = cutoff_fbeta(dt_ev_lst[[n]],1, pred_desc=pred_desc)[,.(pred_threshold=pred, coord = sprintf('(%.2f,%.2f)',cumpop,f1))],
max_f2 = cutoff_fbeta(dt_ev_lst[[n]],2, pred_desc=pred_desc)[,.(pred_threshold=pred, coord = sprintf('(%.2f,%.2f)',cumpop,f2))],
ks = cutoff_ks(dt_ev_lst[[n]], pred_desc=pred_desc)[,.(pred_threshold=pred, coord = sprintf('(%.2f,%.2f)',cumpop,ks))],
roc = cutoff_roc(dt_ev_lst[[n]], pred_desc=pred_desc)[,.(pred_threshold=pred, coord = sprintf('(%.2f,%.2f)',FPR,TPR))] )
co[[n]] = rbindlist(con, idcol = 'metrics')
}
return(co)
}
#' Binomial Metrics
#'
#' \code{perf_eva} calculates metrics to evaluate the performance of binomial classification model. It can also creates confusion matrix and model performance graphics.
#'
#' @param pred A list or vector of predicted probability or score.
#' @param label A list or vector of label values.
#' @param title The title of plot. Defaults to NULL.
#' @param binomial_metric Defaults to c('mse', 'rmse', 'logloss', 'r2', 'ks', 'auc', 'gini'). If it is NULL, then no metric will calculated.
#' @param confusion_matrix Logical, whether to create a confusion matrix. Defaults to TRUE.
#' @param threshold Confusion matrix threshold. Defaults to the pred on maximum F1.
#' @param show_plot Defaults to c('ks', 'roc'). Accepted values including c('ks', 'lift', 'gain', 'roc', 'lz', 'pr', 'f1', 'density').
#' @param pred_desc whether to sort the argument of pred in descending order. Defaults to TRUE.
#' @param positive Value of positive class. Defaults to "bad|1".
#' @param ... Additional parameters.
#'
#' @return A list of binomial metric, confusion matrix and graphics
#' @seealso \code{\link{perf_psi}}
#'
#' @details
#' Accuracy = true positive and true negative/total cases
#'
#' Error rate = false positive and false negative/total cases
#'
#' TPR, True Positive Rate(Recall or Sensitivity) = true positive/total actual positive
#'
#' PPV, Positive Predicted Value(Precision) = true positive/total predicted positive
#'
#' TNR, True Negative Rate(Specificity) = true negative/total actual negative = 1-FPR
#'
#' NPV, Negative Predicted Value = true negative/total predicted negative
#'
#'
#'
#'
#'
# https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values
# ROC curve: Sensitivity ~ 1-Specificity with different threshold
# Lift chart: Lift(PV+/p1) ~ Depth with different threshold
# Gains chart: PV + ~ Depth with different threshold
#'
#'
#' @examples
#' \donttest{
#' # load germancredit data
#' data("germancredit")
#' # filter variable via missing rate, iv, identical value rate
#' dtvf = var_filter(germancredit, "creditability")
#'
#' # breaking dt into train and test
#' dt_list = split_df(dtvf, "creditability")
#' label_list = lapply(dt_list, function(x) x$creditability)
#'
#' # woe binning
#' bins = woebin(dt_list$train, "creditability")
#' # scorecard, prob
#' cardprob = scorecard2(bins, dt = dt_list, y = 'creditability', return_prob = TRUE)
#'
#' # credit score
#' score_list = lapply(dt_list, function(x) scorecard_ply(x, cardprob$card))
#'
#' ###### perf_eva examples ######
#' # Example I, one datset
#' ## predicted p1
#' perf_eva(pred = cardprob$prob$train, label=label_list$train,
#' title = 'train')
#' ## predicted score
#' # perf_eva(pred = score_list$train, label=label_list$train,
#' # title = 'train')
#'
#' # Example II, multiple datsets
#' ## predicted p1
#' perf_eva(pred = cardprob$prob, label = label_list,
#' show_plot = c('ks', 'lift', 'gain', 'roc', 'lz', 'pr', 'f1', 'density'))
#' ## predicted score
#' # perf_eva(score_list, label_list)
#'
#' }
#'
#' @import data.table ggplot2 gridExtra
#' @importFrom utils head tail
#' @export
#'
perf_eva = function(pred, label, title=NULL, binomial_metric=c('mse', 'rmse', 'logloss', 'r2', 'ks', 'auc', 'gini'), confusion_matrix=FALSE, threshold=NULL, show_plot=c('ks', 'lift'), pred_desc=TRUE, positive="bad|1", ...) {
. = f1 = NULL
kwargs = list(...)
# arguments
seed = kwargs[['seed']]
if (is.null(seed)) seed = 618
# list of data with label and pred
dt_lst = suppressMessages(
func_dat_labelpred(pred=pred, label=label, title=title, positive=positive, seed=seed))
dt_lst = lapply(dt_lst, function(x) x[,.(label, pred=x[[setdiff(names(x),'label')]])])
# if pred is score
pred_is_score = all(sapply(dt_lst, function(x) mean(x$pred, na.rm = TRUE)>1))
dt_lst = lapply(dt_lst, function(x) {
nP = nN = NULL
if (pred_is_score) {
# make sure the positive samples are locate at below when order by pred
x2 = x[, .(nP = sum(label==1), nN = sum(label==0)), keyby = pred
][,.(nP,nN)]
h_x2 = x2[,lapply(.SD, function(x) sum(head(x,.N*0.2))/sum(x))]
t_x2 = x2[,lapply(.SD, function(x) sum(tail(x,.N*0.2))/sum(x))]
if ( h_x2[, nP > nN] && t_x2[, nP < nN] ) {
x = x[, pred := -pred]
# warning("Since the average of pred is not locate in [0,1], it is treated as predicted score but not probability.")
}
}
return(x)
})
# datasets for evaluation
dt_ev_lst = lapply(dt_lst, function(x) func_dat_eva(x, groupnum = NULL, pred_desc = pred_desc))
# cutoff, Maximum Metrics
co = pf_cutoffs(dt_ev_lst, pred_desc = pred_desc)
# return list
rt_list = list()
###### performance metric
all_metrics = c('mse','rmse','logloss','r2','ks','auc','gini')
if (pred_is_score) all_metrics = c('ks', 'auc', 'gini')
sel_metrics = intersect(binomial_metric, all_metrics)
pm_lst = pf_metrics(dt_lst, dt_ev_lst, all_metrics, sel_metrics)
if (length(sel_metrics)>0) rt_list[['binomial_metric']] = pm_lst$sel_pm
# cat('Binomial Metrics:\n')
# print(pm)
###### confusion matrix
if (confusion_matrix) {
# if threshold is not provided, set it as max F1
if (is.null(threshold) || !is.numeric(threshold)) threshold = cutoff_fbeta(dt_ev_lst[[1]])[,pred]
threshold_abs = threshold
if (pred_is_score) threshold_abs = abs(threshold)
# confusion matrix
cli_inform(c(i=sprintf('The threshold of confusion matrix is %.4f.', threshold_abs)))
rt_list[['confusion_matrix']] = lapply(dt_lst, function(x) confusionMatrix(dt=x, threshold=threshold))
}
# cat(sprintf('Confusion Matrix with threshold=%s:\n', round(threshold,4)))
# print(cm)
###### plot
# title
if (!is.null(title)) title = paste0(title,': ')
# type
type = kwargs[["type"]]
if (isTRUE(show_plot) & !is.null(show_plot) & !is.null(type)) show_plot = type
# show_plot
show_plot = intersect(show_plot, c('ks', 'lift', 'gain', 'roc', 'lz', 'pr', 'f1', 'density'))
# pic
if (length(show_plot)>0) {
# datasets for visualization
groupnum = kwargs[["groupnum"]]
if (is.null(groupnum)) groupnum = 1000
dt_ev_lst_plot = lapply(dt_lst, function(x) func_dat_eva(x, groupnum = groupnum, pred_desc = pred_desc))
# plot
plist = lapply(paste0('plot_', show_plot), function(x) do.call(x, args = list(dat_eva_lst = dt_ev_lst_plot, dt_lst=dt_lst, pm=pm_lst$all_pm, co=co, title=title, positive=positive)))
# return ggpubr, cowplot and gridExtra
rt_list[['pic']] = do.call(grid.arrange, c(plist, list(ncol=ceiling(sqrt(length(show_plot))), padding = 0)))
}
return(rt_list)
}
# psi ------
# PSI function
psicsi_metric = function(dt_sn, names_datset, is_totalscore=TRUE) {
A=E=logAE=bin_psi=bin= AE = bin_PSI = NULL
# psi = sum((Actual% - Expected%)*ln(Actual%/Expected%))
# dat = copy(dat)[,y:=NULL][complete.cases(dat),]
# data frame of bin, actual, expected
dt_bae = dcast(
dt_sn[, .N, keyby = c('datset', 'bin')],
bin ~ datset, value.var="N", fill = 0.99
)
if (is_totalscore) {
# psi
psi_dt = dt_bae[
, (c("A","E")) := lapply(.SD, function(x) x/sum(x)), .SDcols = names_datset
][, `:=`(AE = A-E, logAE = log(A/E))
][, sum(AE*logAE)]
} else {
# csi
psi_dt = dt_bae[
, (c("A","E")) := lapply(.SD, function(x) x/sum(x)), .SDcols = names_datset
][, sum((A-E) * as.numeric(as.character(bin)))]
}
return(psi_dt)
}
# psi plot
psi_plot = function(dt_psi, psi_sn, title, sn, line_color = 'blue', bar_color = NULL, bin_close_right) {
. = label = N = pos = posprob = distr = bin = midbin = bin1 = bin2 = datset = posprob2 = NULL
# plot title
title = paste0(ifelse(is.null(title), sn, title), " PSI: ", round(psi_sn, 4))
distr_prob =
dt_psi[, .(.N, pos = sum(label==1)), keyby = c('datset','bin')
][, `:=`(distr = N/sum(N), posprob = pos/N), by = 'datset'
][, `:=`(posprob2 = posprob*max(distr)), by = "datset"
][, `:=`(
bin1 = as.numeric(sub(binpattern('leftright_brkp', bin_close_right), "\\1", bin)),
bin2 = as.numeric(sub(binpattern('leftright_brkp', bin_close_right), "\\2", bin))
)][, midbin := (bin1+bin2)/2 ][]
# plot
p_score_distr =
ggplot(distr_prob) +
geom_bar(aes(x=bin, y=distr, fill=datset), stat="identity", position="dodge", na.rm=TRUE) +
geom_line(aes(x=bin, y=posprob2, group=datset, linetype=datset), colour= line_color, na.rm=TRUE) +
geom_point(aes(x=bin, y=posprob2), colour= line_color, shape=21, fill="white", na.rm=TRUE) +
guides(fill=guide_legend(title="Distribution"), colour=guide_legend(title="Probability"), linetype=guide_legend(title="Probability")) +
scale_y_continuous(expand = c(0, 0), sec.axis = sec_axis(~./max(distr_prob$distr), name = "Positive probability")) +
ggtitle(title) +
labs(x=NULL, y="Score distribution") +
# geom_text(aes(label="@http://shichen.name/scorecard", x=Inf, y=Inf), vjust = -1, hjust = 1, color = "#F0F0F0") +
theme_bw() +
theme(
plot.title=element_text(vjust = -2.5),
legend.position=c(1,1),
legend.justification=c(1,1),
legend.background=element_blank(),
axis.title.y.right = element_text(colour = line_color),
axis.text.y.right = element_text(colour = line_color,angle=90, hjust = 0.5),
axis.text.y = element_text(angle=90, hjust = 0.5))
if (!is.null(bar_color)) p_score_distr = p_score_distr + scale_fill_manual(values= bar_color)
return(p_score_distr)
}
gains_table_format = function(dt_distr, ret_bin_avg=FALSE) {
. = neg = pos = bin = count = datset = bin_avg = NULL
dt_distr = dt_distr[, .(
bin,
count, cum_count = cumsum(count),
neg,
pos,
cum_neg = cumsum(neg),
cum_pos = cumsum(pos),
count_distr = round(count/sum(count),4),
posprob = round(pos/count,4),
cum_posprob = round(cumsum(pos)/cumsum(count),4),
rejected_rate = round(cumsum(count)/sum(count),4),
approval_rate = round(1-cumsum(count)/sum(count),4),
bin_avg
), by = datset]
if (!ret_bin_avg) dt_distr = dt_distr[, bin_avg := NULL]
return(dt_distr[])
}
#' Gains Table
#'
#' \code{gains_table} creates a data frame including distribution of total, negative, positive, positive rate and rejected rate by score bins. The gains table is used in conjunction with financial and operational considerations to make cutoff decisions.
#'
#' @param score A list of credit score for actual and expected data samples. For example, score = list(actual = scoreA, expect = scoreE).
#' @param label A list of label value for actual and expected data samples. For example, label = list(actual = labelA, expect = labelE).
#' @param bin_num Integer, the number of score bins. Defaults to 10. If it is 'max', then individual scores are used as bins.
#' @param method The score is binning by equal frequency or equal width. Accepted values are 'freq' and 'width'. Defaults to 'freq'.
#' @param width_by Number, increment of the score breaks when method is set as 'width'. If it is provided the above parameter bin_num will not be used. Defaults to NULL.
#' @param breaks_by The name of data set to create breakpoints. Defaults to the first data set. Or numeric values to set breakpoints manually.
#' @param positive Value of positive class, Defaults to "bad|1".
#' @param ... Additional parameters.
#'
#' @return A data frame
#' @seealso \code{\link{perf_eva}} \code{\link{perf_psi}}
#'
#' @examples
#' \donttest{
#' # load germancredit data
#' data("germancredit")
#' # filter variable via missing rate, iv, identical value rate
#' dtvf = var_filter(germancredit, "creditability")
#'
#' # breaking dt into train and test
#' dt_list = split_df(dtvf, "creditability")
#' label_list = lapply(dt_list, function(x) x$creditability)
#'
#' # binning
#' bins = woebin(dt_list$train, "creditability")
#' # scorecard
#' card = scorecard2(bins, dt = dt_list$train, y = 'creditability')
#'
#' # credit score
#' score_list = lapply(dt_list, function(x) scorecard_ply(x, card))
#'
#'
#' ###### gains_table examples ######
#' # Example I, input score and label can be a vector or a list
#' g1 = gains_table(score = unlist(score_list), label = unlist(label_list))
#' g2 = gains_table(score = score_list, label = label_list)
#'
#' # Example II, specify the bins number and type
#' g3 = gains_table(score = unlist(score_list), label = unlist(label_list), bin_num = 20)
#' g4 = gains_table(score = unlist(score_list), label = unlist(label_list), method = 'width')
#' }
#'
#' @export
gains_table = function(score, label, bin_num=10, method='freq', width_by=NULL, breaks_by=NULL, positive='bad|1', ...) {
. = V1 = V2 = pos = bin = count = datset = group = NULL
# arguments
kwargs = list(...)
# seed
seed = kwargs[['seed']]
if (is.null(seed)) seed = 618
# title
title = kwargs[['title']]
if (is.null(title)) title = NULL
# return_dt_psi
return_dt_psi = kwargs[['return_dt_psi']]
if (is.null(return_dt_psi)) return_dt_psi = FALSE
# ret_bin_avg
ret_bin_avg = kwargs[['ret_bin_avg']]
if (is.null(ret_bin_avg)) ret_bin_avg = FALSE
# bin_num
if (bin_num != 'max' & bin_num <= 1) bin_num = 10
# method
bin_type = kwargs[['seed']]
if (!is.null(bin_type)) method = bin_type
if (!(method %in% c('freq', 'width'))) method = 'freq'
# width_by
if (!is.numeric(width_by) || width_by <= 0) width_by = NULL
# data frame of score and label
dt_sl = kwargs[['dt_sl']]
if (is.null(dt_sl) & !is.null(score) & !is.null(label)) {
# dateset list of score and label
dt_sl = suppressWarnings( func_dat_labelpred(
pred=score, label=label, title=title, positive=positive, seed=seed) )
# rename dt_sl as c('label','score')
dt_sl = lapply(dt_sl, function(x) {
x[,.(label, score=x[[setdiff(names(x),'label')]])]
})
# rbind the list into a data frame, and set datset column as factor
names_datset = names(dt_sl)
dt_sl = rbindlist(dt_sl, idcol = 'datset')[, datset := factor(datset, levels = names_datset)]
}
is_score = dt_sl[,mean(score)>1]
# breaks
if ( bin_num=='max' || bin_num >= dt_sl[, length(unique(score))] ) {
# in each value
dt_psi = copy(dt_sl)[, bin := factor(score)]
} else {
# set breakpoints manually.
if (inherits(breaks_by, 'numeric')) {
breaks_by = breaks_by[between(breaks_by, dt_sl[, min(score)], dt_sl[, max(score)])]
if (length(breaks_by)==0) breaks_by = NULL
brkp = breaks_by
}
# set breakpoints basedon dataset specified
if (is.null(breaks_by) | inherits(breaks_by, 'character')) {
# the name of dataset to create breakpoints, defaults to the name of 1st dataset
if (is.null(breaks_by)) breaks_by = dt_sl[1,datset]
breaks_by = intersect(breaks_by, dt_sl[, unique(datset)])
if (length(breaks_by) == 0) breaks_by = dt_sl[1,datset]
# the dataset of score/label to create breakpoints
dt_sl_brkp = dt_sl[datset %in% breaks_by]
# create breakpoints by method freq/width
if (method == 'freq') {
# in equal frequency
brkp = copy(dt_sl_brkp)[order(score)
][, group := ceiling(.I/(.N/bin_num))
][, .(score = score[1]), by = group
][, score[-1]]
} else if (method == 'width') {
sminmax = dt_sl_brkp[, quantile(score, c(0.001, 0.999))]
# in equal width
if (is.null(width_by) || width_by > max(score)-min(score)) {
# in equal width
# minmax = dt_sl_brkp[, sapply(.SD, function(x) list(min(x), max(x))), by=datset, .SDcols=c('score')
# ][,.(mins = max(V1), maxs = min(V2))] # choose min of max value, and max of min value by dataset
brkp = seq(sminmax[1], sminmax[2], length.out = bin_num+1)
# brkp = dt_sl_brkp[score >= sminmax[1] & score <= sminmax[2]][,pretty(score,bin_num)]
} else {
minmax = round(sminmax/width_by) * width_by
brkp = seq(minmax[[1]]-width_by, minmax[[2]]+width_by, by = width_by)
}
if (is_score) brkp = round(brkp)
brkp = brkp[-c(1, length(brkp))]
}
}
brkp = unique(c(-Inf, brkp, Inf))
dt_psi = dt_sl[, bin := cut(score, brkp, right = FALSE, dig.lab = 10, ordered_result = F)]
}
if (return_dt_psi) return(dt_psi) # innter result usded in perf_psi function
# distribution table
dt_distr = dt_psi[, .(count=.N, neg = sum(label==0), pos = sum(label==1), bin_avg = mean(score)), keyby = .(datset,bin)
][order(datset, bin)]
if (!is_score) dt_distr = dt_distr[order(datset, -bin)] #is predicted probability
# gains table
dt_distr = gains_table_format(dt_distr, ret_bin_avg)
return(dt_distr)
}
# @param method Whether in equal frequency or width when preparing dataset to calculates psi. Defaults to 'width'.
# @param return_distr_dat Logical. Defaults to FALSE. Whether to return a list of data frames including distribution of total, negative, positive cases by score bins in both equal width and equal frequency. This table is also named gains table.
# @param bin_num Integer. Defaults to 10. The number of score bins in distribution tables.
#' PSI
#'
#' \code{perf_psi} calculates population stability index (PSI) for total credit score and Characteristic Stability Index (CSI) for variables. It can also creates graphics to display score distribution and positive rate trends.
#'
#' @param score A list of credit score for actual and expected data samples. For example, score = list(expect = scoreE, actual = scoreA).
#' @param label A list of label value for actual and expected data samples. For example, label = list(expect = labelE, actual = labelA). Defaults to NULL.
#' @param title Title of plot, Defaults to NULL.
#' @param show_plot Logical. Defaults to TRUE.
#' @param positive Value of positive class, Defaults to "bad|1".
#' @param threshold_variable Integer. Defaults to 20. If the number of unique values > threshold_variable, the provided score will be counted as total credit score, otherwise, it is variable score.
#' @param var_skip Name of variables that are not score, such as id column. It should be the same with the var_kp in scorecard_ply function. Defaults to NULL.
#' @param ... Additional parameters.
#'
#' @return A data frame of psi and graphics of credit score distribution
#' @seealso \code{\link{perf_eva}} \code{\link{gains_table}}
#'
#' @details The population stability index (PSI) formula is displayed below: \deqn{PSI = \sum((Actual\% - Expected\%)*(\ln(\frac{Actual\%}{Expected\%}))).} The rule of thumb for the PSI is as follows: Less than 0.1 inference insignificant change, no action required; 0.1 - 0.25 inference some minor change, check other scorecard monitoring metrics; Greater than 0.25 inference major shift in population, need to delve deeper.
#'
#' Characteristic Stability Index (CSI) formula is displayed below: \deqn{CSI = \sum((Actual\% - Expected\%)*score).}
#'
#'
#' @examples
#' \donttest{
#' # load germancredit data
#' data("germancredit")
#' # filter variable via missing rate, iv, identical value rate
#' dtvf = var_filter(germancredit, "creditability")
#'
#' # breaking dt into train and test
#' dt_list = split_df(dtvf, "creditability")
#' label_list = lapply(dt_list, function(x) x$creditability)
#'
#' # binning
#' bins = woebin(dt_list$train, "creditability")
#' # scorecard
#' card = scorecard2(bins, dt = dt_list$train, y = 'creditability')
#'
#' # credit score
#' score_list = lapply(dt_list, function(x) scorecard_ply(x, card))
#' # credit score, only_total_score = FALSE
#' score_list2 = lapply(dt_list, function(x) scorecard_ply(x, card,
#' only_total_score=FALSE))
#'
#'
#' ###### perf_psi examples ######
#' # Example I # only total psi
#' psi1 = perf_psi(score = score_list, label = label_list)
#' psi1$psi # psi data frame
#' psi1$pic # pic of score distribution
#' # modify colors
#' # perf_psi(score = score_list, label = label_list,
#' # line_color='#FC8D59', bar_color=c('#FFFFBF', '#99D594'))
#'
#' # Example II # both total and variable psi
#' psi2 = perf_psi(score = score_list2, label = label_list)
#' # psi2$psi # psi data frame
#' # psi2$pic # pic of score distribution
#'
#' }
#' @import data.table ggplot2 gridExtra
#' @export
#'
perf_psi = function(score, label=NULL, title=NULL, show_plot=TRUE, positive="bad|1", threshold_variable=20, var_skip=NULL, ...) {
# # global variables
. = datset = group = V1 = bin = NULL
# arguments
kwargs = list(...)
bin_type = kwargs[['bin_type']]
method = kwargs[['method']]
bin_close_right = kwargs[['bin_close_right']]
if (!is.null(bin_type)) method = bin_type
if (is.null(method) || !(method %in% c('freq', 'width'))) method='width'
if (is.null(bin_close_right)) bin_close_right = getarg('bin_close_right')
seed = kwargs[['seed']]
if (is.null(seed)) seed = 618
breaks_by = kwargs[['breaks_by']]
return_distr_dat = kwargs[['return_distr_dat']]
if (is.null(return_distr_dat)) return_distr_dat = FALSE
if (show_plot) {
line_color = kwargs[['line_color']]
if (is.null(line_color)) line_color = 'blue'
bar_color = kwargs[['bar_color']]
}
# dateset list of score and label
dt_sl = suppressWarnings( func_dat_labelpred(
pred=score, label=label, title=title, positive=positive, seed=seed) )
names_datset = names(dt_sl) # names of dataset
dt_sl = rbindlist(dt_sl, idcol = 'datset')[, datset := factor(datset, levels = names_datset)]
rt = list() # return list
for (sn in setdiff(names(dt_sl), c('datset', 'label', var_skip))) { # sn: score names
# dataset for sn
dt_sn = dt_sl[,.(datset, label, score=dt_sl[[sn]])]
sn_is_totalscore = dt_sn[,length(unique(score)) > threshold_variable]
bin_num <- ifelse(sn_is_totalscore, 10, 'max')
dt_psi = gains_table(score=NULL, label=NULL, bin_num=10, method=method, positive = positive, return_dt_psi=TRUE, dt_sl=dt_sn, breaks_by = breaks_by)
# return list
temp_psi = list()
for (i in names_datset[-1]) {
# population stability index
names_dts = c(names_datset[1], i)
psi_sn = psicsi_metric(dt_psi[datset %in% names_dts], names_dts, is_totalscore = sn_is_totalscore)
dt_psi_sn = data.table(psi = psi_sn)
if (!sn_is_totalscore) dt_psi_sn = data.table(csi = psi_sn)
temp_psi[[paste0(names_dts, collapse = '_')]] = dt_psi_sn
# pic
temp_pic = NULL
if (show_plot) {
temp_pic = psi_plot(
dt_psi[datset %in% names_dts], psi_sn, title, sn,
line_color = line_color, bar_color = bar_color, bin_close_right = bin_close_right)
if (length(names_datset) > 2) {
rt[['pic']][[sn]][[paste0(names_dts, collapse = '_')]] = temp_pic
} else {
rt[['pic']][[sn]] = temp_pic
}
}
}
rt[['psi']][[sn]] = rbindlist(temp_psi, idcol = 'dataset')
# equal freq / width data frame
if (return_distr_dat) rt[['dat']][[sn]] = gains_table(score=NULL, label=NULL, bin_num=10, method=method, positive = positive, return_dt_psi=FALSE, dt_sl=dt_sn)
}
rt$psi = rbindlist(rt$psi, idcol = "variable", fill = TRUE)
# rt$dat = rbindlist(rt$dat, idcol = "variable")
return(rt)
}
#' Cross Validation
#'
#' \code{perf_cv} provides cross validation on logistic regression and other binomial classification models.
#'
#' @param dt A data frame with both x (predictor/feature) and y (response/label) variables.
#' @param y Name of y variable.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y are counted as x variables.
#' @param breaks_list List of break points, defaults to NULL. If it is NULL, then using original values of the input data to fitting model, otherwise converting into woe values based on training data.
#' @param no_folds Number of folds for K-fold cross-validation. Defaults to 5.
#' @param seeds The seeds to create multiple random splits of the input dataset into training and validation data by using \code{split_df} function. Defaults to NULL.
#' @param binomial_metric Defaults to ks.
#' @param positive Value of positive class, defaults to "bad|1".
#' @param ... Additional parameters.
#'
#' @return A list of data frames of binomial metrics for each datasets.
#'
#'
#' @examples
#' \dontrun{
#' data("germancredit")
#'
#' dt = var_filter(germancredit, y = 'creditability')
#' bins = woebin(dt, y = 'creditability')
#' dt_woe = woebin_ply(dt, bins)
#'
#' perf1 = perf_cv(dt_woe, y = 'creditability', no_folds = 5)
#'
#' perf2 = perf_cv(dt_woe, y = 'creditability', no_folds = 5,
#' seeds = sample(1000, 10))
#'
#' perf3 = perf_cv(dt_woe, y = 'creditability', no_folds = 5,
#' binomial_metric = c('ks', 'auc'))
#'
#' }
#'
#' @export
#' @importFrom stats binomial
perf_cv = function(dt, y, x=NULL, no_folds = 5, seeds = NULL, binomial_metric = 'ks', positive="bad|1", breaks_list = NULL, ...) {
# set dt as data.table
dt = setDT(copy(dt))
# check y
dt = check_y(dt, y, positive)
# x variable names
x = x_variable(dt, y, x)
# dt
dt = dt[, c(y,x), with=FALSE]
kwargs = list(...)
# seed
seed = kwargs[['seed']]
if (is.null(seed)) seed = 618
# ratio
ratio = kwargs[['ratio']]
if (is.null(ratio)) ratio = c(1-1/no_folds, 1/no_folds)
# model
model = kwargs[['model']]
# split dt into no_folds
if (is.null(seeds)) {
dts = do.call('split_df', list(
dt = dt, y = y,
ratio = rep(1/no_folds, no_folds),
name_dfs = as.character(seq_len(no_folds)),
seed = seed
))
tt_lst = lapply(as.list(seq_len(no_folds)), function(x) {
return(list(train = rbindlist(dts[-x]),
validation = rbindlist(dts[x])))
})
names(tt_lst) = seq_len(no_folds)
} else {
tt_lst = lapply(as.list(seeds), function(x) {
do.call('split_df', list(
dt = dt, y = y, ratio = ratio, seed = x, name_dfs = c("train", "validation")
))
})
names(tt_lst) = seeds
}
# glm
if (is.null(breaks_list)) {
f_glm = function(tt, y) {
m1 = glm( as.formula(sprintf('%s ~ .', y)), family = binomial(), data = tt$train)
pp = lapply(tt, function(t) predict.glm(m1, newdata = t, type='response'))
return(pp)
}
} else {
f_glm = function(tt, y) {
bin_train = woebin(tt$train, y = y, breaks_list = breaks_list, print_info = FALSE)
tt_woe = lapply(tt, function(t) woebin_ply(t, bin_train, print_info = FALSE))
m1 = glm( as.formula(sprintf('%s ~ .', y)), family = binomial(), data = tt_woe$train)
pp = lapply(tt_woe, function(t) predict.glm(m1, newdata = t, type='response'))
return(pp)
}
}
if (is.null(model)) model = 'f_glm'
# performance
perf_list = lapply(tt_lst, function(tt) {
pp = do.call(model, list(tt = tt, y = y))
eva_tt = lapply(names(pp), function(x) {
perf_eva(pp[[x]], tt[[x]][[y]], show_plot = NULL, binomial_metric = binomial_metric, confusion_matrix = F)$binomial_metric$dat
})
names(eva_tt) = names(pp)
eva = rbindlist(eva_tt, idcol = 'tt')
})
perf = rbindlist(perf_list, idcol = 'datset')
setnames(perf, tolower(names(perf)))
perf_metric = lapply(tolower(binomial_metric), function(m) dcast(perf, datset~tt, value.var = m))
names(perf_metric) = tolower(binomial_metric)
return(perf_metric)
}
score_eva = function(dt, score = 'score', y = NULL) {
cumy = NULL
score_summary =
data.table(
unique_count = uniqueN(dt[[score]], na.rm = TRUE),
identical_rate = fun_identical_rate(dt[[score]])
)
if (!is.null(y)) {
setnames(dt, y, 'y', skip_absent=TRUE)
score_summary$p80bad_rejectrate = dt[order(score)][, cumy := cumsum(y)/sum(y) <= 0.8 ][, sum(cumy)/.N]
}
return(score_summary)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.