#' @include performance_class.R
NULL
#' @title Classing reference class generator
#' @name Classing_Class
#' @description Classing class that wraps a data.frame and prepares it for
#' scorecard modeling.
#' @field variables list of binned variables
#' @field vnames character vector of binned variable names
#' @field performance Performance object with \code{y} and \code{w} values and
#' implementation of the Performance object interface.
#' @field dropped character vector of variable names that are flagged as
#' dropped
#' @export Classing
#' @exportClass Classing
Classing <- setRefClass("Classing",
fields = c(
variables = "list",
vnames = "character",
performance = "Performance",
dropped = "character"))
#' generic method for create_bin
#' @name create_bin
#' @param x the varibale used for the Bins \code{x} value
#' @param ... additional arguments passed on to the Bin constructors
setGeneric("create_bin", function(x, ...) callGeneric("create_bin"))
#' @describeIn create_bin wrap variable in Continuous object
#' @return a Continuous object
setMethod("create_bin", "numeric", function(x, ...) {
Continuous$new(x = x, ...)
})
#' @describeIn create_bin wrap variable in Discrete object
#' @return a Discrete object
setMethod("create_bin", "factor", function(x, ...) {
Discrete$new(x = x, ...)
})
#' @describeIn create_bin create_bin fallback warning user
setMethod("create_bin", "ANY", function(x, ...) {
warning(sprintf("Class: %s cannot be binned. Removed from classing.",
class(x)), call. = FALSE)
NULL
})
Classing$methods(initialize = function(data=NULL,
performance=Performance$new(), ...) {
.self$performance <<- performance
vnames <<- setNames(names(data), names(data))
variables <<- lapply(vnames, function(nm) {
create_bin(x = data[[nm]], perf = performance, name = nm, ...)
})
## drop variables that aren't numeric or factors
f <- !sapply(variables, is.null)
variables <<- variables[f]
vnames <<- vnames[f]
})
#' Subsequent call from the bin function passed to Classing object
#'
#' @name Classing_bin
#' @description This bin function should not be directly called by the user.
#' The Classing bin function is subsequently called from the
#' \link{\code{bin}} wrapper function.
NULL
Classing$methods(bin = function(...) {
on.exit(cat(sep = "\n"))
for (i in seq_along(variables)) {
progress_(i, length(variables), "Binning ", variables[[i]]$name)
variables[[i]]$bin(...)
}
## drop vars with zero information value
zeros <- sapply(variables, function(b) b$sort_value()) == 0
dropped <<- names(variables)[zeros]
})
# Classing$methods(show = function() {
# print("Classing object")
# })
Classing$methods(get_variables = function(..., keep=FALSE) {
if (!keep) {
lapply(variables[setdiff(vnames, dropped)], function(x) x$x)
} else {
lapply(variables, function(x) x$x)
}
})
Classing$methods(get_transforms = function(..., keep=FALSE) {
if (!keep) {
lapply(variables[setdiff(vnames, dropped)], function(x) x$tf)
} else {
lapply(variables, function(x) x$tf)
}
})
Classing$methods(predict = function(newdata=NULL, keep=keep) {
on.exit(cat(sep = "\n"))
if (is.null(newdata)) newdata <- get_variables(keep=keep)
vnm <- if (keep) vnames else setdiff(vnames, dropped)
## check that data has var names
stopifnot(!is.null(names(newdata)))
dnm <- names(newdata)
## check that all variables are found in newdata
if (!all(vnm %in% dnm)) {
msg <- paste0(vnm[!vnm %in% dnm], collapse = ", ")
stop(sprintf("Vars not found in data: %s", msg), call. = F)
}
## put the newdata in the same order as the variables
func <- function(i, b, v) {
progress_(i, length(vnm), "Predicting", b$name)
b$predict(newdata=v)
}
woe <- mapply(func, seq_along(vnm), variables[vnm], newdata[vnm])
colnames(woe) <- vnm
woe
})
#' Flag supplied variables as dropped
#'
#' @name Scorecard_drop
#' @param vars character vector of variables to drop
#' @param all logical indicating whether all variables should be dropped
NULL
Classing$methods(drop = function(vars=character(0), all=FALSE, ...) {
if (all) {
dropped <<- vnames
} else {
stopifnot(all(vars %in% vnames))
dropped <<- unique(c(dropped, vars))
}
})
#' Flag supplied variables as undropped
#'
#' @name Scorecard_undrop
#' @param vars character vector of variables to undrop
#' @param all logical indicating whether all variables should be undropped
NULL
Classing$methods(undrop = function(vars=character(0), all=FALSE, ...) {
if (all) {
dropped <<- character(0)
} else {
stopifnot(all(vars %in% vnames))
dropped <<- setdiff(dropped, vars)
}
})
#' Cluster variables by correlation
#'
#' @name Scorecard_cluster
#' @param keep logical indicating whether to include dropped variables in the
#' correlation cluster analysis.
#' @details the cluster function first performs weight-of-evidence substitution.
#' Cluster returns a classing_cluster object which is a list containing
#' two fields: correlation & cluster. The first is a correlation matrix for all
#' of the variables in the classing. The second is an hclust object which is
#' result of hierarchical clustering of the correlation matrix.
#' @return a classing_cluster object
NULL
Classing$methods(cluster = function(keep=FALSE, ...) {
woe <- predict(newdata=get_variables(keep = keep), type="woe", ...)
dups <- apply(woe, 2, function(x) all(duplicated(x)[-1L]))
corr <- cor(woe[,which(!dups)])
structure(
list(
correlations = corr,
cluster = hclust(as.dist(1 - abs(corr)))),
class="classing_cluster")
})
#' Prune clusters keeping only the most informative variables
#'
#' @name Scorecard_prune_clusters
#' @param cc classing_cluster object produced by \link{\code{Scorecard_cluster}}
#' method
#' @param corr minimum correlation coefficient threshold with which to group
#' variables
#' @param n number of variables to keep from each cluster exceeding the
#' correlation threshold. The n variables with the highest information value
#' are retained. The remaining variables are returned as a character vector.
#' @return a character vector of variables to drop
NULL
Classing$methods(prune_clusters = function(cc, corr=0.80, n=1) {
stopifnot(is(cc, "classing_cluster"))
## get information values
p <- sapply(variables[colnames(cc$correlations)], function(x) x$sort_value())
## cutree
grps <- cutree(cc$cluster, h=1-corr)
# split correlations into groups and return everyone after the first
splt <- split(data.frame(var=names(grps), val=p, stringsAsFactors = F), grps)
## order each group by descending perf value and drop all but the first
to_drop <- lapply(splt, function(x) x$var[order(-x$val)][-seq.int(n)])
unlist(to_drop)
})
#' Summarize the Classing object
#'
#' @name Classing_summary
#' @return a matrix summarizing the independent variables using the Performance
#' object summary function
NULL
Classing$methods(summary = function(...) {
s <- lapply(variables, function(v) v$summary())
res <- cbind(do.call(rbind, s), Dropped=0)
res[dropped, "Dropped"] <- 1
res
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.