#' @include classing_class.R classing_adjust_method.R
NULL
#' @title Scorecard reference class generator
#' @name Scorecard-class
#' @description Scorecard class that wraps a data.frame and prepares it for
#' scorecard modeling.
#' @field seed saved random seed that is based on \code{\link{Sys.time}}.
#' @field models list of fitted models for this Scorecard
#' @field selected_model name of last selected that was loaded into the
#' transforms
#' @field inmodel character vector of variable names that are in the selected
#' model
#' @export Scorecard
#' @exportClass Scorecard
Scorecard <- setRefClass("Scorecard",
fields = c(
seed = "numeric",
models = "list",
selected_model = "character",
inmodel = "character",
steptwo = "numeric"),
contains = "Classing")
Scorecard$methods(initialize = function(..., seed=as.numeric(strftime(Sys.time(), format="%H%M%S"))) {
seed <<- seed
callSuper(...)
})
Scorecard$methods(has_model = function(model) {
"Assert whether the Scorecard contains the requested model"
if (!model %in% names(models)) {
stop("Requested model not found: ", model, call. = FALSE)
}
})
#' Select a fitted model and load the Transforms
#'
#' @name Scorecard_select
#' @description Select searches the model fitting history for the requested
#' model. It then loads the associated transforms for the model variables into
#' the current Scorevard object. Additionally it loads the dropped and inmodel
#' vectors at the time of the fit.
NULL
Scorecard$methods(select = function(model) {
has_model(model) ## check that model exists
mod <- models[[model]]
selected_model <<- model
dropped <<- mod@dropped
inmodel <<- mod@inmodel
steptwo <<- mod@steptwo
for (v in names(mod@transforms)) {
variables[[v]]$tf <<- mod@transforms[[v]]
}
})
Scorecard$methods(add_model = function(mod) {
"Register a new Model object with the scorecard"
models[[mod@name]] <<- mod
select(mod@name)
})
#' Subsequent call from the bin function passed to the Scorecard
#'
#' @name Scorecard_bin
#' @description This bin function should not be directly called by the user.
#' The Scorecard bin function is subsequently called from the
#' \code{\link{bin}} wrapper function.
NULL
Scorecard$methods(bin = function(...) {
callSuper(...)
scratch <- new("Model", name="scratch", description="", fit=NULL, ks=0,
dropped=dropped, transforms=get_transforms())
add_model(scratch)
})
#' Fit a model to the current set of variable transforms
#'
#' @name Scorecard_fit
#' @description fits a regularized regressoion model using the glmnet
#' package.
#' @details the fit function first calls predict and substitutes the
#' weight-of-evidence for all predictor variables. It then passes this matrix
#' on to \code{\link[glmnet]{cv.glmnet}}. The coefficients of a binner model fit
#' are restricted to [0,3]. This ensures there are no sign flips in the
#' model coefficients and that the relationships observed on margin are
#' retained in the final model.
#' \code{\link{bin}} wrapper function.
#' @param name brief model name as character
#' @param description character description describing the model
#' @param overwrite should model be overwriiten if it already exists?
#' @param newdata data.frame of independent variables. Default is to
#' use the binned data.
#' @param y target to fit the data to. Default is the y variable used
#' for discretization.
#' @param w optional weight variable
#' @param nfolds number of k-folds with which to select the optimal
#' lambda value
#' @param upper.limits maximum value of fitted coefficients
#' @param lower.limits minimimum value of fitted coefficients
#' @param alpha type of regularization. Default is alpha == 1 for LASSO
#' regression. Alpha of 0 is Ridge regression while anythin in between
#' is the elastic net mixture.
#' @param family response variable distribution. Default is "binomial".
#' @param ... additional arguments passed on to cv.glmnet
NULL
Scorecard$methods(fit = function(name, description="", overwrite=FALSE,
newdata=.self$get_variables(), y=performance$y, w=performance$w,
nfolds=5, upper.limits=3, lower.limits=0, alpha=1,
family="binomial", ...) {
## check for consistent dimensions
if (length(newdata[[1]]) != length(y)) {
stop("newdata and y must be the same length", call. = FALSE)
}
if (length(y) != length(w)) {
stop("y and w must be the same length", call. = FALSE)
}
if (!overwrite) {
if (name %in% names(models)) {
stop("Model name already exists and overwrite=FALSE",
call. = FALSE)
}
}
v <- setdiff(vnames, dropped)
x <- predict(newdata=newdata[v], type="woe")
set.seed(seed)
this_fit <- cv.glmnet(x = x, y = y, weights = w, nfolds = nfolds,
family=family, alpha=alpha, upper.limits=upper.limits,
lower.limits=lower.limits, keep=TRUE, ...)
## get the coeficients
coefs <- glmnet::coef.cv.glmnet(this_fit, s="lambda.min")[,1]
coefs <- coefs[which(coefs != 0)]
## set the inmodel vector
inmodel <<- names(coefs)[-1]
## set the steptwo vector
betas <- as.matrix(this_fit$glmnet.fit$beta)
step2 <- matrix(order(betas, decreasing = TRUE), nrow = nrow(betas))
step2 <- setdiff(v[unique(row(step2)[step2])], inmodel)
steptwo <<- setNames(seq_along(step2), step2)
## performance metrics
contr <- contributions_(x[,names(coefs)[-1],drop=F], coefs, y, w)
ks <- ks_(this_fit$fit.preval[,which.min(this_fit$cvm)], y, w) # kfold
## store the last transforms
m <- new("Model", name=name, description=description, dropped=dropped,
transforms=get_transforms(), coefs=coefs, inmodel=inmodel,
steptwo=steptwo, contribution=contr, ks=ks, fit=this_fit)
add_model(m)
})
#' Print the Scorecard representation to the console
#'
#' @name Scorecard_show
NULL
Scorecard$methods(show = function(...) {
## show the models / coefs?
cat(sprintf("%d models", length(models)), sep="\n")
i <- rep("", length(models))
i[names(models) == selected_model] <- "*"
cat(sprintf(" |-- %-2s %-20s | %04.1f ks | %s", i,
sapply(models, slot, "name"),
sapply(models, slot, "ks") * 100,
sapply(models, slot, "description")), sep="\n")
})
#' Return scorecard predictions or WoE substitution
#'
#' @name Scorecard_predict
#' @param newdata data.frame on which to calculate predictions
#' @param keep whether to keep dropped values
#' @param type "score" to return the model score. "woe" to return the WoE
#' substitution for the input dataset
#' @return Either a single column matrix of score predictions or a matrix
#' matching the input dimension of the dataset containing weight-of-evidence
#' substitutions.
NULL
Scorecard$methods(predict = function(newdata=NULL, keep=FALSE, type=c("score", "woe", "labels"), ...) {
type <- match.arg(type)
mod <- models[[selected_model]]
v <- names(mod@coefs[-1])
woe <- callSuper(newdata=newdata, keep=keep)
switch(type,
labels = data.frame(mapply(function(w, va) {
l <- unlist(sapply(va$tf@repr[1:3], row.names))
factor(names(w), levels = l, labels = l)
}, woe, variables[names(woe)], SIMPLIFY = FALSE)),
woe = `row.names<-`(do.call(cbind, woe), NULL),
score = `row.names<-`(
do.call(cbind, woe)[,v] %*% mod@coefs[v] + mod@coefs[1], NULL), NA)
})
#' Summarize the currently selected model
#'
#' @name Scorecard_summary
#' @param keep whether to summarize droppped variables as well
#' @return a matrix summarizing the independent variables using the performance
#' implementation summary function. Also displays the coefficients and model
#' contributions of the predictors.
NULL
Scorecard$methods(summary = function(keep=FALSE, inmodel.only=FALSE) {
mod <- models[[selected_model]]
cat(mod@name, "\nOut-of-Fold KS: ", mod@ks, "\n")
res <- callSuper(keep=keep)
vars <- row.names(res)
out <- cbind(res, `In Model` = 0, `Step Two` = 0, `Coefs` = mod@coefs[vars],
`Contribution` = mod@contribution[vars])
out[inmodel,"In Model"] <- 1
out[names(steptwo),"Step Two"] <- steptwo
if (inmodel.only) {
out[match(inmodel, row.names(out), 0), ]
} else {
out
}
})
#' Scorecard adjust method entry point
#'
#' @name Scorecard_adjust
#' @details calling adjust enters an interactive variable edit mode. Press "h"
#' for a list of commands.
NULL
Scorecard$methods(adjust = function(...) {
callSuper(...)
})
#' Sort variables of a scorecard
#'
#' @name Scorecard_sort
#' @details variables that are in the currently selected model are sorted to the
#' front while dropped variables are sorted to the end. Variables within each
#' group are sort by descending information value.
NULL
Scorecard$methods(sort = function() {
v <- setNames(sapply(variables, function(x) x$sort_value()), names(variables))
base <- setNames(rep(0, length(v)), names(v))
im <- base
dr <- base
s2 <- base
im[inmodel] <- 1
dr[dropped] <- 1
s2[names(steptwo)] <- steptwo
i <- order(im, -s2, -dr, v, decreasing = TRUE, na.last = TRUE)
variables <<- variables[i]
})
#' Deprecated method. Use Scorecard_bootstrap instead.
#'
#' @name Scorecard_pseudo_pvalues
NULL
Scorecard$methods(pseudo_pvalues = function(times=20, bag.fraction = 0.50,
replace=FALSE, nfolds=5, upper.limits=3, lower.limits=0, alpha=1, ...) {
warning("pseudo_pvalues will be deprecated in future versions of binnr. Use `bootstrap` instead.")
res <- bootstrap(times=20, bag.fraction = 0.50, replace=FALSE, nfolds=5,
upper.limits=3, lower.limits=0, alpha=1, ...)
class(res) <- "pseudo_values"
res
})
#' Run bootstrap model fits to assess coefficient distributions.
#'
#' @name Scorecard_bootstrap
#' @param times number of bootstrap samples to run
#' @param bag.fraction fraction of observations to sample for each bootstrap run
#' @param replace whether to sample with or without replacement
#' @param nfolds number of CV folds to run within each bootstrap fit
#' @param upper.limits maximum coffecient value for each fit
#' @param lower.limits minimum coffecient value for each fit
#' @param alpha mixing paramater between LASSO and Ridge regression. Default 1.
#' @param ... other parameters passed on to cv.glmnet
#' @details True boostratp samples should be run with \code{bag.fraction=1} and
#' \code{replace=TRUE}.
#' @return a list with two elements: pvals and coefficients. The former is a
#' vector indicating what proportion of bootstrap model fits each coefficient
#' returned a zero. The latter an nRuns x nVars matrix containing the
#' coefficients of each run.
NULL
Scorecard$methods(bootstrap = function(times=20, bag.fraction = 0.50,
replace=FALSE, nfolds=5, upper.limits=3, lower.limits=0, alpha=1, ...) {
x <- predict(newdata=get_variables(), type="woe")
coefs <- list()
for (i in seq.int(times)) {
progress_(i, times, "Fitting ")
s <- sample.int(nrow(x), nrow(x)*bag.fraction, replace = replace)
fit <- glmnet::cv.glmnet(x = x[s,], y = performance$y[s],
weights = performance$w[s], nfolds = 10, alpha = alpha,
upper.limits=upper.limits, lower.limits=lower.limits, keep=TRUE, ...)
coefs[[i]] <- coef(fit, s="lambda.min")
}
res <- as.matrix(do.call(cbind, coefs))
## what is the probability of the coefficient being zero?
pvals <- sapply(apply(res, 1, ecdf), function(x) x(0))
structure(
list(
pvalues = pvals,
coefs = res),
class = "bootstrap")
})
#' Compare multiple scorecards side-by-side
#'
#' @name Scorecard_compare
#' @details calling adjust enters an interactive variable edit mode. Press "h"
#' for a list of commands.
NULL
Scorecard$methods(compare = function(...) {
mods <- unlist(list(...))
## check that requested models are in the scorecard
stopifnot(all(mods %in% names(models)))
on.exit(select(selected_model))
## select each and get the summary
summaries <- lapply(mods, function(x) {
select(x)
res <- summary(inmodel.only = TRUE)
## only keep vars in model
res[, c("IV","Dropped","In Model","Coefs","Contribution")]
})
## merge them all
contribution <- lapply(summaries, function(x) x[,"Contribution"])
coefficients <- lapply(summaries, function(x) x[,"Coefs"])
# merge helper for use with Reduce
merge_ <- function(a, b) {
tmp <- merge(a,b, by=0, all=T)
row.names(tmp) <- tmp$Row.names
subset(tmp, select = -Row.names)
}
res <- merge(
Reduce(merge_, contribution),
Reduce(merge_, coefficients), by=0, all=T)
cols <- c("Contribution", "Coefficients")
colnames(res) <- c("Variable", paste(rep(cols, each=length(mods)), mods))
res[order(-res[2], na.last = TRUE),]
})
#' Generate SAS code for Scorecard object
#'
#' @name Scorecard_gen_code_sas
#' @description generate SAS code represenation of the Scorecard object. The SAS
#' code that is genereated calculates the score, adverse action code distances,
#' and provides a set of macro assignments for assigning adverse action codes
#' to particular bin levels.
#' @param pfx character prefix to prepend to variable names
#' by the Scorecard model object. Defaults to 1.
#' @param method method used for calculating the reference level for adverse
#' action codes. Three possible choices:
#' \itemize{
#' \item{"min" }{Calculate difference from minimum of perf values - default}
#' \item{"max" }{Calculate difference from maximum of perf values}
#' \item{"neutral" }{Calculate difference from zero}
#' }
#' @return a character vector of SAS code
NULL
Scorecard$methods(gen_code_sas = function(pfx="", method="min", ...) {
out <- character(0)
v <- inmodel
mod <- models[[selected_model]]
coefs <- mod@coefs[-1][v]
if (getOption("mkivtools_REGISTERED", default = FALSE)) {
out <- do.call(c, lapply(v, mkivtools::get_mkiv_code))
# out <- do.call(c, mkivtools::pkg.env$mkiv_map[tolower(v)])
}
## Print the reason code mappings
out <- c(out, "/** Adverse Action Code Mappings **/")
out <- c(out, lapply(seq_along(v), function(i) {
sprintf("%%let %s_AA_%02d = \"\"; /** %s **/", pfx, i, v[i])
}))
### Print the variables
out <- c(out, lapply(seq_along(v), function(i) {
variables[[v[i]]]$gen_code_sas(method=method, pfx=pfx, coef=coefs[i], i=i)
}))
out <- c(out,
sprintf("\n/*** Final Score Calculation ***/"),
sprintf("%s_lgt = %s", pfx, mod@coefs[1]),
sprintf(" + %s_V%02d_w", pfx, seq_along(v)),
";")
unname(unlist(out))
})
#' Return names of varibales flagged as dropped
#'
#' @name Scorecard_get_dropped
#' @return a character vector of dropped variable names
NULL
Scorecard$methods(get_dropped = function(invert=FALSE) {
if (invert) setdiff(vnames, dropped) else dropped
})
#' Return names of varibales flagged as inmodel
#'
#' @name Scorecard_get_inmodel
#' @return a character vector of inmodel variable names
NULL
Scorecard$methods(get_inmodel = function(invert=FALSE) {
if (invert) setdiff(vnames, inmodel) else inmodel
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.