### This script contains functions for binning by Information Value
# Weight of Evidence: WOE
# = log(%non-events / %events), where %events = #events in bin / #events in data
# So WOE == log(#nonevents in bin / #nonevents in data) - log(#events in bin - #events in data)
# WOE is calculated for each variable and each bin
# Information Value: sum(%nonevents - %events) x WOE, where the sum is taken
# over all bins
#' Calculate the Weights of Evidence for a variable, given predefined cutpoints
#'
#' This function will calculate the Weight of Evidence for each bin of a given
#' variable, according to a specified binary response. A vector of bins can be
#' supplied, or a pre-binned variable can be supplied.
#' @param dat The dataset containing the response and desired variable
#' @param var Character string containing the name of the variable
#' @param response Character string specifying the name of the response variable
#' @param bins Either a numeric vector specifying the cutpoints for the desired
#' variable, or 0. If 0, the function assumes
#' that the variable is already binned, and throws an error if it is
#' not of type "factor"
#' @param warn Logical- should the function print a warning if a given bin
#' contains only one level of the response (and hence has a WOE of +/-
#' Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#' using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#' should they be merged with an adjacent bin? If set to TRUE and warn is
#' set to FALSE, the function can modify your data without telling you-
#' be careful
#' @return A two column \code{tbl} containing the bin and the WOE value
#'
#' @family metrics
#' @seealso information_value
#' @details \code{woe_single} allows the user to easily calculate Weights of
#' Evidence for arbitrary bins for a given variable.
#' The function will throw an error if \code{bins} is 0 and the
#' supplied variable is not a factor. The idea is for the user to
#' supply their own bins obtained using, for example, one of the useful
#' functions in the \code{modellingTools} package, such as
#' \code{\link[modellingTools]{vector_bin}}. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @examples
#' @export
#' @import magrittr
woe_single <- function(dat,
var,
bins = 0,
response,
warn = TRUE,
auto_merge = FALSE) {
if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
stop(stringr::str_c(response," is not binary"))
}
if (length(bins) == 1 && !is.factor(modellingTools::column_vector(dat,var))) {
stop(stringr::str_c("No bins provided, but ",var," is not a factor"))
}
if (length(bins) == 1) {
if (bins > 0) stop(stringr::str_c("You provided a positive number of bins- ",
"this function does not do automatic ",
"binning. Have you tried ",
"modellingTools::vector_bin?"))
}
dat %<>%
dplyr::select_(var,response) %>%
dplyr::mutate_(var = var,response = response)
if (length(bins) > 1) {
dat %<>%
modellingTools::simple_bin(bins = bins,
type = "width",
include_vars = "var")
}
total_bad <- sum(modellingTools::column_vector(dat,response))
total_good <- nrow(dat) - total_bad
woe_dat <- dat %>%
dplyr::group_by(var) %>%
dplyr::summarize(good_capture = sum(response == 0) / total_good,
bad_capture = sum(response == 1) / total_bad)
# Deal with bins that have no goods/bads
to_merge <- c()
all_lv <- sort(levels(modellingTools::column_vector(dat,"var")))
no_goods <- woe_dat %>% filter(good_capture == 0)
no_bads <- woe_dat %>% filter(bad_capture == 0)
# Auto-merge if requested
if(auto_merge && length(to_merge) > 0) {
# Take first element of to_merge
# Find the bin adjacent to it
# Merge them
# Recalculate woe with the new bins
which_level <- which(all_lv == to_merge[1])
if (which_level > 1) {
# Merge with the lower bin
adjacent_bin <- all_lv[which_level - 1]
} else {
adjacent_bin <- all_lv[which_level + 1]
}
new_bins <- innR2binnR::merge_levels(var = modellingTools::column_vector(dat,"var"),
lv = c(to_merge[1],adjacent_bin),
merge_names = TRUE) %>%
modellingTools::get_vector_cutpoints()
woe_dat <- woe_single(dat = dat,
var = var,
response = "response",
bins = new_bins,
warn = warn,
auto_merge = TRUE)
}
else {
if (nrow(no_goods) > 0) {
for (lv in modellingTools::column_vector(no_goods,"var")) {
if(warn) {
warning(stringr::str_c("Bin ",lv," has no obs = 0. WOE will be Inf. ",
"Consider merging this with another bin."))
}
to_merge <- c(to_merge,lv)
}
}
if (nrow(no_bads) > 0) {
for (lv in modellingTools::column_vector(no_bads,"var")) {
if(warn) {
warning(stringr::str_c("Bin ",lv," has no obs = 1. WOE will be -Inf. ",
"Consider merging this with another bin."))
}
to_merge <- c(to_merge,lv)
}
}
}
woe_dat %<>% dplyr::mutate(woe = log(bad_capture / good_capture))
return(woe_dat)
}
#' Calculate the Information Value for a given variable and set of bins
#'
#' This function calculates the Weight of Evidence for each supplied bin using
#' \code{\link{woe_single}}, then applies the formula for Information Value to
#' the result. See the vignette for more details.
#'
#' @param dat The dataframe containing the input variable and response
#' @param var Character string indicating the name of the variable
#' @param response Character string indicating the name of the response
#' @param bins Numeric vector of cutpoints defining the bins for \code{var}, or
#' 0, indicating that \code{var} is already binned
#' @param warn Logical- should the function print a warning if a given bin
#' contains only one level of the response (and hence has a WOE of +/-
#' Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#' using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#' should they be merged with an adjacent bin? If set to TRUE and warn is
#' set to FALSE, the function can modify your data without telling you-
#' be careful
#' @return A single number giving the Information Value
#' @family metrics
#' @seealso woe_single
#' @details
#' This function is designed as a helper, to be used in analyzing IV for a given
#' binning structure or in applying an optimization routine to find optimal
#' bins for variables in a dataset. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @examples
#' @export
#' @import magrittr
information_value <- function(dat,
var,
bins = 0,
response,
warn = TRUE,
auto_merge = FALSE) {
iv <- dat %>% woe_single(var = var,
bins = bins,
response = response,
warn = warn,
auto_merge = auto_merge) %>%
filter(!(woe %in% c(-Inf,Inf))) %>%
mutate(IV_part = (bad_capture - good_capture) * woe) %>%
summarize(IV = sum(IV_part))
return(modellingTools::column_vector(iv,"IV"))
}
#' Compute and sort variables in a dataset by information value
#'
#' This function will compute the information value for each variable
#' in the supplied dataset. Variables can be pre-binned, else the
#' function will bin them using \code{modellingTools::simple_bin} with
#' a user-specified number of bins
#'
#' @param dat: Dataset, a \code{data.frame} or \code{dplyr::tbl_df} containing pre-binned variables and
#' binary response
#' @param response: string giving the name of the binary (0/1) response
#' variable in the dataset
#' @export
#' @import magrittr
#' @import foreach
iv_sort <- function(dat,
response) {
# Check: single binary response, and only factored input variables?
if (!(response %in% colnames(dat))) {
stop("Response variable ",response," not found in dataset")
}
if (!is.numeric(modellingTools::column_vector(dat,response))) {
tp <- class(modellingTools::column_vector(dat,response))
stop(stringr::str_c("Response must be of numeric type. Response is of type ",tp))
}
if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
un <- unique(modellingTools::column_vector(dat,response))
stop(stringr::str_c("Response must be binary. Response has the following unique values: ",un))
}
nf <- c()
for (nm in colnames(dat)[colnames(dat) != response]) {
if (!is.factor(modellingTools::column_vector(dat,nm))) {
nf <- c(nf,nm)
}
}
if (length(nf) > 0) {
stop(stringr::str_c("Input variables are not all factors. The following variables are not factored: ",nf))
}
# Create a dataframe of the column names and their IV, sorted in descending order
iv_sorted <- foreach::foreach(nm = colnames(dat)[colnames(dat) != response],
.inorder = FALSE,
.combine = dplyr::bind_rows,
.init = dplyr::data_frame(var = "",iv = -100),
.final = function(x) x %>% dplyr::filter(var != "") %>% dplyr::arrange(desc(iv)),
.export = c("response")
) %do% {
dplyr::data_frame(var = nm,
iv = innR2binnR::information_value(dat,
nm,
bins = 0,
response = response,
warn = FALSE,
auto_merge = TRUE)
)
}
return(iv_sorted)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.