### This script contains functions for binning by Information Value
# Weight of Evidence: WOE
# = log(%non-events / %events), where %events = #events in bin / #events in data
# So WOE == log(#nonevents in bin / #nonevents in data) - log(#events in bin - #events in data)
# WOE is calculated for each variable and each bin
# Information Value: sum(%nonevents - %events) x WOE, where the sum is taken
# over all bins
#' Calculate the Weights of Evidence for a variable, given predefined cutpoints
#'
#' This function will calculate the Weight of Evidence for each bin of a given
#' variable, according to a specified binary response. A vector of bins can be
#' supplied, or a pre-binned variable can be supplied.
#' @param dat The dataset containing the response and desired variable
#' @param var Character string containing the name of the variable
#' @param response Character string specifying the name of the response variable
#' @param bins Either a numeric vector specifying the cutpoints for the desired
#' variable, or 0. If 0, the function assumes
#' that the variable is already binned, and throws an error if it is
#' not of type "factor"
#' @param warn Logical- should the function print a warning if a given bin
#' contains only one level of the response (and hence has a WOE of +/-
#' Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#' using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#' should they be merged with an adjacent bin? If set to TRUE and warn is
#' set to FALSE, the function can modify your data without telling you-
#' be careful
#' @return A two column \code{tbl} containing the bin and the WOE value
#'
#' @family metrics
#' @seealso information_value
#' @details \code{woe_single} allows the user to easily calculate Weights of
#' Evidence for arbitrary bins for a given variable.
#' The function will throw an error if \code{bins} is 0 and the
#' supplied variable is not a factor. The idea is for the user to
#' supply their own bins obtained using, for example, one of the useful
#' functions in the \code{modellingTools} package, such as
#' \code{\link[modellingTools]{vector_bin}}. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @export
#' @import magrittr
woe_single <- function(dat,
var,
bins = 0,
response,
warn = TRUE,
auto_merge = FALSE) {
if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
stop(stringr::str_c(response," is not binary"))
}
if (length(bins) == 1 && !is.factor(modellingTools::column_vector(dat,var))) {
stop(stringr::str_c("No bins provided, but ",var," is not a factor"))
}
if (length(bins) == 1) {
if (bins > 0) stop(stringr::str_c("You provided a positive number of bins- ",
"this function does not do automatic ",
"binning. Have you tried ",
"modellingTools::vector_bin?"))
}
dat %<>%
dplyr::select_(var,response)
if (length(bins) > 1) {
dat %<>%
modellingTools::simple_bin(bins = bins,
include_vars = var)
}
total_bad <- sum(modellingTools::column_vector(dat,response))
total_good <- nrow(dat) - total_bad
woe_dat <- dat %>%
dplyr::group_by_(var) %>%
dplyr::summarize_("good_capture" = stringr::str_c("sum(",response," == 0) / ",total_good),
"bad_capture" = stringr::str_c("sum(",response," == 1) / ",total_bad)
)
# Deal with bins that have no goods/bads
to_merge <- c()
all_lv <- sort(levels(modellingTools::column_vector(dat,var)))
no_goods <- woe_dat %>%
dplyr::filter_("good_capture == 0") %>%
modellingTools::column_vector(var) %>%
# Remove "factor" class
as.character()
no_bads <- woe_dat %>%
dplyr::filter_("bad_capture == 0") %>%
modellingTools::column_vector(var) %>%
# Remove "factor" class
as.character()
to_merge <- c(no_bads,no_goods)
if (warn) {
if (length(no_goods) > 0) {
for (lv in no_goods) {
warning(stringr::str_c("Bin ",lv," has no obs = 0. WOE will be Inf. ",
"Consider merging this with another bin."))
}
}
if (length(no_bads) > 0) {
for (lv in no_bads) {
warning(stringr::str_c("Bin ",lv," has no obs = 1. WOE will be -Inf. ",
"Consider merging this with another bin."))
}
}
}
# Auto-merge if requested
if(auto_merge) {
if (length(to_merge) > 0) {
# Take first element of to_merge
# Find the bin adjacent to it
# Merge them
# Recalculate woe with the new bins
which_level <- which(all_lv == to_merge[1])
if (which_level > 1) {
# Merge with the lower bin
adjacent_bin <- all_lv[which_level - 1]
} else {
adjacent_bin <- all_lv[which_level + 1]
}
cat("Merging bins ",to_merge[1]," and ",adjacent_bin,"\n\n")
dat[[var]] <- modellingTools::merge_levels(var = modellingTools::column_vector(dat,var),
lv = c(to_merge[1],adjacent_bin),
merge_names = TRUE)
to_merge <- to_merge[-1]
woe_dat <- dat %>%
modellingTools::woe_single(dat = .,
var = var,
response = response,
warn = warn,
auto_merge = TRUE)
}
}
woe_dat %<>%
dplyr::mutate_("woe" = "log(bad_capture / good_capture)")
return(woe_dat)
}
#' Calculate the Information Value for a given variable and set of bins
#'
#' This function calculates the Weight of Evidence for each supplied bin using
#' \code{\link{woe_single}}, then applies the formula for Information Value to
#' the result. See the vignette for more details.
#'
#' @param dat The dataframe containing the input variable and response
#' @param var Character string indicating the name of the variable
#' @param response Character string indicating the name of the response
#' @param bins Numeric vector of cutpoints defining the bins for \code{var}, or
#' 0, indicating that \code{var} is already binned
#' @param warn Logical- should the function print a warning if a given bin
#' contains only one level of the response (and hence has a WOE of +/-
#' Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#' using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#' should they be merged with an adjacent bin? If set to TRUE and warn is
#' set to FALSE, the function can modify your data without telling you-
#' be careful
#' @param output_cutpoints Logical. If TRUE, a list with the IV and the final
#' cutpoints used to compute it is returned. Default FALSE, in which
#' case a single number representing the IV is returned
#' @return If output_cutpoints is \code{TRUE}, a list containing the IV and the final
#' cutpoints used (after auto_merging, if applicable). If output_cutpoints
#' is \code{FALSE}, A single number giving the Information Value.
#' @family metrics
#' @seealso woe_single
#' @details
#' This function is designed as a helper, to be used in analyzing IV for a given
#' binning structure or in applying an optimization routine to find optimal
#' bins for variables in a dataset. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @export
#' @import magrittr
#'
#'
information_value <- function(dat,
var,
bins = 0,
response,
warn = TRUE,
auto_merge = FALSE,
output_cutpoints = FALSE) {
woe <- dat %>% modellingTools::woe_single(var = var,
bins = bins,
response = response,
warn = warn,
auto_merge = auto_merge)
iv <- woe %>%
dplyr::filter_("!(woe %in% c(-Inf,Inf))") %>%
dplyr::mutate_("IV_part" = "(bad_capture - good_capture) * woe") %>%
dplyr::summarize_("IV" = "sum(IV_part)") %>%
modellingTools::column_vector("IV")
if (output_cutpoints) {
cutpoints <- modellingTools::column_vector(woe,var) %>% modellingTools::get_vector_cutpoints()
return(list(
cutpoints = cutpoints,
iv = iv
))
} else {
return(iv)
}
}
#' Compute and sort variables in a dataset by information value
#'
#' This function will compute the information value for each variable
#' in the supplied dataset. Variables must be pre-binned; any variable
#' other than the response not of class "factor" will cause an error.
#' Response should be numeric and binary
#'
#' @param dat Dataset, a \code{data.frame} or \code{dplyr::tbl_df} containing pre-binned variables and
#' binary response
#' @param response string giving the name of the binary (0/1) response
#' variable in the dataset
#' @param var_grouping optional table giving the grouping structure of the variables. If provided, variables
#' will be sorted by IV within the groups. Useful for selecting variables after performing
#' some clustering procedure. Format: a \code{tbl} with 2 columns: \code{var}, the names
#' of the variables and \code{group}, a number or string identifying groups
#' @return a \code{nrow(dat) x 2 tbl_df} with two columns: \code{var}, giving each
#' variable name, and \code{iv}, giving the Information Value
#' The function will auto-merge pure bins, and return the information value obtained using the final
#' merged bins.
#' @export
#' @import magrittr
#' @import foreach
iv_sort <- function(dat,
response,
var_grouping = NULL) {
# Check: single binary response, and only factored input variables?
if (!(response %in% colnames(dat))) {
stop("Response variable ",response," not found in dataset")
}
if (!is.numeric(modellingTools::column_vector(dat,response))) {
tp <- class(modellingTools::column_vector(dat,response))
stop(stringr::str_c("Response must be of numeric type. Response is of type ",tp))
}
if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
un <- unique(modellingTools::column_vector(dat,response))
stop(stringr::str_c("Response must be binary. Response has the following unique values: ",un))
}
nf <- c()
for (nm in colnames(dat)[colnames(dat) != response]) {
if (!is.factor(modellingTools::column_vector(dat,nm))) {
nf <- c(nf,nm)
}
}
if (length(nf) > 0) {
stop(stringr::str_c("Input variables are not all factors. The following variables are not factored: ",nf))
}
if (!is.null(var_grouping)) {
if (!all(class(var_grouping) == c("tbl_df","tbl","data.frame"))) {
warning("var_grouping was not provided as a tbl- grouping structure ignored")
var_grouping <- NULL
} else if (!all(sort(names(var_grouping)) == c("group","var"))) {
warning("var_grouping does not follow required format of being a tbl with exactly two columns named 'group' and 'var'. Argument ignored")
var_grouping = NULL
}
}
# Create a dataframe of the column names and their IV, sorted in descending order
iv_unsorted <- foreach::foreach(nm = colnames(dat)[colnames(dat) != response],
.inorder = FALSE,
.combine = dplyr::bind_rows,
.init = dplyr::data_frame(var = "",iv = -100),
.final = function(x) {
x %>%
dplyr::filter_("var != ''")
},
.export = c("response")
) %do% {
dplyr::data_frame(var = nm,
iv = modellingTools::information_value(dat,
nm,
bins = 0,
response = response,
warn = FALSE,
auto_merge = TRUE)
)
}
if (!is.null(var_grouping)) {
iv_sorted <- iv_unsorted %>%
dplyr::inner_join(var_grouping,by = "var") %>%
dplyr::arrange_("group","desc(iv)")
} else {
iv_sorted <- iv_unsorted %>%
dplyr::arrange_("desc(iv)")
}
return(iv_sorted)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.