R/information_value.R

### This script contains functions for binning by Information Value

# Weight of Evidence: WOE
# = log(%non-events / %events), where %events = #events in bin / #events in data
# So WOE == log(#nonevents in bin / #nonevents in data) - log(#events in bin - #events in data)
# WOE is calculated for each variable and each bin

# Information Value: sum(%nonevents - %events) x WOE, where the sum is taken
# over all bins

#' Calculate the Weights of Evidence for a variable, given predefined cutpoints
#'
#' This function will calculate the Weight of Evidence for each bin of a given
#' variable, according to a specified binary response. A vector of bins can be
#' supplied, or a pre-binned variable can be supplied.
#' @param dat The dataset containing the response and desired variable
#' @param var Character string containing the name of the variable
#' @param response Character string specifying the name of the response variable
#' @param bins Either a numeric vector specifying the cutpoints for the desired
#'             variable, or 0. If 0, the function assumes
#'             that the variable is already binned, and throws an error if it is
#'             not of type "factor"
#' @param warn Logical- should the function print a warning if a given bin
#'        contains only one level of the response (and hence has a WOE of +/-
#'        Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#'        using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#'        should they be merged with an adjacent bin? If set to TRUE and warn is
#'        set to FALSE, the function can modify your data without telling you-
#'        be careful
#' @return A two column \code{tbl} containing the bin and the WOE value
#'
#' @family metrics
#' @seealso information_value
#' @details \code{woe_single} allows the user to easily calculate Weights of
#'          Evidence for arbitrary bins for a given variable.
#'          The function will throw an error if \code{bins} is 0 and the
#'          supplied variable is not a factor. The idea is for the user to
#'          supply their own bins obtained using, for example, one of the useful
#'          functions in the \code{modellingTools} package, such as
#'          \code{\link[modellingTools]{vector_bin}}. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @examples
#' @export
#' @import magrittr

woe_single <- function(dat,
                       var,
                       bins = 0,
                       response,
                       warn = TRUE,
                       auto_merge = FALSE) {
  if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
    stop(stringr::str_c(response," is not binary"))
  }

  if (length(bins) == 1 && !is.factor(modellingTools::column_vector(dat,var))) {
    stop(stringr::str_c("No bins provided, but ",var," is not a factor"))
  }

  if (length(bins) == 1) {
    if (bins > 0) stop(stringr::str_c("You provided a positive number of bins- ",
                                      "this function does not do automatic ",
                                      "binning. Have you tried ",
                                      "modellingTools::vector_bin?"))
  }

  dat %<>%
    dplyr::select_(var,response) %>%
    dplyr::mutate_(var = var,response = response)

  if (length(bins) > 1) {
    dat %<>%
      modellingTools::simple_bin(bins = bins,
                                 type = "width",
                                 include_vars = "var")
  }

  total_bad <- sum(modellingTools::column_vector(dat,response))
  total_good <- nrow(dat) - total_bad

  woe_dat <- dat %>%
    dplyr::group_by(var) %>%
    dplyr::summarize(good_capture = sum(response == 0) / total_good,
                     bad_capture = sum(response == 1) / total_bad)

  # Deal with bins that have no goods/bads
  to_merge <- c()
  all_lv <- sort(levels(modellingTools::column_vector(dat,"var")))

  no_goods <- woe_dat %>% filter(good_capture == 0)
  no_bads <- woe_dat %>% filter(bad_capture == 0)

  # Auto-merge if requested
  if(auto_merge && length(to_merge) > 0) {
    # Take first element of to_merge
    # Find the bin adjacent to it
    # Merge them
    # Recalculate woe with the new bins
    which_level <- which(all_lv == to_merge[1])
    if (which_level > 1) {
      # Merge with the lower bin
      adjacent_bin <- all_lv[which_level - 1]
    } else {
      adjacent_bin <- all_lv[which_level + 1]
    }

    new_bins <- innR2binnR::merge_levels(var = modellingTools::column_vector(dat,"var"),
                                         lv = c(to_merge[1],adjacent_bin),
                                         merge_names = TRUE) %>%
                modellingTools::get_vector_cutpoints()

    woe_dat <- woe_single(dat = dat,
                          var = var,
                          response = "response",
                          bins = new_bins,
                          warn = warn,
                          auto_merge = TRUE)
  }
  else {
    if (nrow(no_goods) > 0) {
      for (lv in modellingTools::column_vector(no_goods,"var")) {
        if(warn) {
          warning(stringr::str_c("Bin ",lv," has no obs = 0. WOE will be Inf. ",
                                 "Consider merging this with another bin."))
        }
        to_merge <- c(to_merge,lv)
      }
    }

    if (nrow(no_bads) > 0) {
      for (lv in modellingTools::column_vector(no_bads,"var")) {
        if(warn) {
          warning(stringr::str_c("Bin ",lv," has no obs = 1. WOE will be -Inf. ",
                                 "Consider merging this with another bin."))
        }
        to_merge <- c(to_merge,lv)
      }
    }
  }

  woe_dat %<>% dplyr::mutate(woe = log(bad_capture / good_capture))

  return(woe_dat)

}

#' Calculate the Information Value for a given variable and set of bins
#'
#' This function calculates the Weight of Evidence for each supplied bin using
#' \code{\link{woe_single}}, then applies the formula for Information Value to
#' the result. See the vignette for more details.
#'
#' @param dat The dataframe containing the input variable and response
#' @param var Character string indicating the name of the variable
#' @param response Character string indicating the name of the response
#' @param bins Numeric vector of cutpoints defining the bins for \code{var}, or
#'        0, indicating that \code{var} is already binned
#' @param warn Logical- should the function print a warning if a given bin
#'        contains only one level of the response (and hence has a WOE of +/-
#'        Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#'        using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#'        should they be merged with an adjacent bin? If set to TRUE and warn is
#'        set to FALSE, the function can modify your data without telling you-
#'        be careful
#' @return A single number giving the Information Value
#' @family metrics
#' @seealso woe_single
#' @details
#' This function is designed as a helper, to be used in analyzing IV for a given
#' binning structure or in applying an optimization routine to find optimal
#' bins for variables in a dataset. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @examples
#' @export
#' @import magrittr

information_value <- function(dat,
                              var,
                              bins = 0,
                              response,
                              warn = TRUE,
                              auto_merge = FALSE) {
  iv <- dat %>% woe_single(var = var,
                      bins = bins,
                      response = response,
                      warn = warn,
                      auto_merge = auto_merge) %>%
    filter(!(woe %in% c(-Inf,Inf))) %>%
    mutate(IV_part = (bad_capture - good_capture) * woe) %>%
    summarize(IV = sum(IV_part))

  return(modellingTools::column_vector(iv,"IV"))
}

#' Compute and sort variables in a dataset by information value
#'
#' This function will compute the information value for each variable
#' in the supplied dataset. Variables can be pre-binned, else the
#' function will bin them using \code{modellingTools::simple_bin} with
#' a user-specified number of bins
#'
#' @param dat: Dataset, a \code{data.frame} or \code{dplyr::tbl_df} containing pre-binned variables and
#'             binary response
#' @param response: string giving the name of the binary (0/1) response
#'                  variable in the dataset
#' @export
#' @import magrittr
#' @import foreach

iv_sort <- function(dat,
                    response) {
  # Check: single binary response, and only factored input variables?
  if (!(response %in% colnames(dat))) {
    stop("Response variable ",response," not found in dataset")
  }

  if (!is.numeric(modellingTools::column_vector(dat,response))) {
    tp <- class(modellingTools::column_vector(dat,response))
    stop(stringr::str_c("Response must be of numeric type. Response is of type ",tp))
  }

  if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
    un <- unique(modellingTools::column_vector(dat,response))
    stop(stringr::str_c("Response must be binary. Response has the following unique values: ",un))
  }

  nf <- c()
  for (nm in colnames(dat)[colnames(dat) != response]) {
    if (!is.factor(modellingTools::column_vector(dat,nm))) {
      nf <- c(nf,nm)
    }
  }

  if (length(nf) > 0) {
    stop(stringr::str_c("Input variables are not all factors. The following variables are not factored: ",nf))
  }

  # Create a dataframe of the column names and their IV, sorted in descending order
  iv_sorted <- foreach::foreach(nm = colnames(dat)[colnames(dat) != response],
                   .inorder = FALSE,
                   .combine = dplyr::bind_rows,
                   .init = dplyr::data_frame(var = "",iv = -100),
                   .final = function(x) x %>% dplyr::filter(var != "") %>% dplyr::arrange(desc(iv)),
                   .export = c("response")
  ) %do% {
    dplyr::data_frame(var = nm,
                      iv = innR2binnR::information_value(dat,
                                                         nm,
                                                         bins = 0,
                                                         response = response,
                                                         warn = FALSE,
                                                         auto_merge = TRUE)
    )
  }


  return(iv_sorted)

}
awstringer/innR2binnR documentation built on May 11, 2019, 4:11 p.m.