R/information_value.R

### This script contains functions for binning by Information Value

# Weight of Evidence: WOE
# = log(%non-events / %events), where %events = #events in bin / #events in data
# So WOE == log(#nonevents in bin / #nonevents in data) - log(#events in bin - #events in data)
# WOE is calculated for each variable and each bin

# Information Value: sum(%nonevents - %events) x WOE, where the sum is taken
# over all bins

#' Calculate the Weights of Evidence for a variable, given predefined cutpoints
#'
#' This function will calculate the Weight of Evidence for each bin of a given
#' variable, according to a specified binary response. A vector of bins can be
#' supplied, or a pre-binned variable can be supplied.
#' @param dat The dataset containing the response and desired variable
#' @param var Character string containing the name of the variable
#' @param response Character string specifying the name of the response variable
#' @param bins Either a numeric vector specifying the cutpoints for the desired
#'             variable, or 0. If 0, the function assumes
#'             that the variable is already binned, and throws an error if it is
#'             not of type "factor"
#' @param warn Logical- should the function print a warning if a given bin
#'        contains only one level of the response (and hence has a WOE of +/-
#'        Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#'        using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#'        should they be merged with an adjacent bin? If set to TRUE and warn is
#'        set to FALSE, the function can modify your data without telling you-
#'        be careful
#' @return A two column \code{tbl} containing the bin and the WOE value
#'
#' @family metrics
#' @seealso information_value
#' @details \code{woe_single} allows the user to easily calculate Weights of
#'          Evidence for arbitrary bins for a given variable.
#'          The function will throw an error if \code{bins} is 0 and the
#'          supplied variable is not a factor. The idea is for the user to
#'          supply their own bins obtained using, for example, one of the useful
#'          functions in the \code{modellingTools} package, such as
#'          \code{\link[modellingTools]{vector_bin}}. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @export
#' @import magrittr

woe_single <- function(dat,
                       var,
                       bins = 0,
                       response,
                       warn = TRUE,
                       auto_merge = FALSE) {
  if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
    stop(stringr::str_c(response," is not binary"))
  }

  if (length(bins) == 1 && !is.factor(modellingTools::column_vector(dat,var))) {
    stop(stringr::str_c("No bins provided, but ",var," is not a factor"))
  }

  if (length(bins) == 1) {
    if (bins > 0) stop(stringr::str_c("You provided a positive number of bins- ",
                                      "this function does not do automatic ",
                                      "binning. Have you tried ",
                                      "modellingTools::vector_bin?"))
  }

  dat %<>%
    dplyr::select_(var,response)

  if (length(bins) > 1) {
    dat %<>%
      modellingTools::simple_bin(bins = bins,
                                 include_vars = var)
  }

  total_bad <- sum(modellingTools::column_vector(dat,response))
  total_good <- nrow(dat) - total_bad

  woe_dat <- dat %>%
    dplyr::group_by_(var) %>%
    dplyr::summarize_("good_capture" = stringr::str_c("sum(",response," == 0) / ",total_good),
                     "bad_capture" = stringr::str_c("sum(",response," == 1) / ",total_bad)
    )

  # Deal with bins that have no goods/bads
  to_merge <- c()
  all_lv <- sort(levels(modellingTools::column_vector(dat,var)))

  no_goods <- woe_dat %>%
                dplyr::filter_("good_capture == 0") %>%
                modellingTools::column_vector(var) %>%
                # Remove "factor" class
                as.character()
  no_bads <- woe_dat %>%
                dplyr::filter_("bad_capture == 0") %>%
                modellingTools::column_vector(var) %>%
                # Remove "factor" class
                as.character()

  to_merge <- c(no_bads,no_goods)

  if (warn) {
    if (length(no_goods) > 0) {
      for (lv in no_goods) {
        warning(stringr::str_c("Bin ",lv," has no obs = 0. WOE will be Inf. ",
                               "Consider merging this with another bin."))
      }
    }

    if (length(no_bads) > 0) {
      for (lv in no_bads) {
        warning(stringr::str_c("Bin ",lv," has no obs = 1. WOE will be -Inf. ",
                               "Consider merging this with another bin."))
      }
    }
  }

  # Auto-merge if requested
  if(auto_merge) {
    if (length(to_merge) > 0) {
      # Take first element of to_merge
      # Find the bin adjacent to it
      # Merge them
      # Recalculate woe with the new bins
      which_level <- which(all_lv == to_merge[1])
      if (which_level > 1) {
        # Merge with the lower bin
        adjacent_bin <- all_lv[which_level - 1]
      } else {
        adjacent_bin <- all_lv[which_level + 1]
      }

      cat("Merging bins ",to_merge[1]," and ",adjacent_bin,"\n\n")

      dat[[var]] <- modellingTools::merge_levels(var = modellingTools::column_vector(dat,var),
                                                 lv = c(to_merge[1],adjacent_bin),
                                                 merge_names = TRUE)
      to_merge <- to_merge[-1]
      woe_dat <- dat %>%
                  modellingTools::woe_single(dat = .,
                                             var = var,
                                             response = response,
                                             warn = warn,
                                             auto_merge = TRUE)
    }
  }

  woe_dat %<>%
    dplyr::mutate_("woe" = "log(bad_capture / good_capture)")

  return(woe_dat)

}

#' Calculate the Information Value for a given variable and set of bins
#'
#' This function calculates the Weight of Evidence for each supplied bin using
#' \code{\link{woe_single}}, then applies the formula for Information Value to
#' the result. See the vignette for more details.
#'
#' @param dat The dataframe containing the input variable and response
#' @param var Character string indicating the name of the variable
#' @param response Character string indicating the name of the response
#' @param bins Numeric vector of cutpoints defining the bins for \code{var}, or
#'        0, indicating that \code{var} is already binned
#' @param warn Logical- should the function print a warning if a given bin
#'        contains only one level of the response (and hence has a WOE of +/-
#'        Inf)? Recommended to keep this to TRUE; FALSE option is provided for
#'        using this function in programming
#' @param auto_merge Logical. If bins contain only one value of the response,
#'        should they be merged with an adjacent bin? If set to TRUE and warn is
#'        set to FALSE, the function can modify your data without telling you-
#'        be careful
#' @param output_cutpoints Logical. If TRUE, a list with the IV and the final
#'        cutpoints used to compute it is returned. Default FALSE, in which
#'        case a single number representing the IV is returned
#' @return If output_cutpoints is \code{TRUE}, a list containing the IV and the final
#'         cutpoints used (after auto_merging, if applicable). If output_cutpoints
#'         is \code{FALSE}, A single number giving the Information Value.
#' @family metrics
#' @seealso woe_single
#' @details
#' This function is designed as a helper, to be used in analyzing IV for a given
#' binning structure or in applying an optimization routine to find optimal
#' bins for variables in a dataset. When bins have only one level of the
#' response, the WOE will be +/-; by default the function will not prevent this
#' and issue a warning. You can turn off warnings and have the function auto-merge
#' bins using the supplied options, but be sure to inspect the results carefully.
#'
#' @export
#' @import magrittr
#'
#'

information_value <- function(dat,
                              var,
                              bins = 0,
                              response,
                              warn = TRUE,
                              auto_merge = FALSE,
                              output_cutpoints = FALSE) {
  woe <- dat %>% modellingTools::woe_single(var = var,
                      bins = bins,
                      response = response,
                      warn = warn,
                      auto_merge = auto_merge)
  iv <- woe %>%
    dplyr::filter_("!(woe %in% c(-Inf,Inf))") %>%
    dplyr::mutate_("IV_part" = "(bad_capture - good_capture) * woe") %>%
    dplyr::summarize_("IV" = "sum(IV_part)") %>%
    modellingTools::column_vector("IV")

  if (output_cutpoints) {
    cutpoints <- modellingTools::column_vector(woe,var) %>% modellingTools::get_vector_cutpoints()
    return(list(
      cutpoints = cutpoints,
      iv = iv
    ))
  } else {
    return(iv)
  }
}

#' Compute and sort variables in a dataset by information value
#'
#' This function will compute the information value for each variable
#' in the supplied dataset. Variables must be pre-binned; any variable
#' other than the response not of class "factor" will cause an error.
#' Response should be numeric and binary
#'
#' @param dat Dataset, a \code{data.frame} or \code{dplyr::tbl_df} containing pre-binned variables and
#'             binary response
#' @param response string giving the name of the binary (0/1) response
#'                  variable in the dataset
#' @param var_grouping optional table giving the grouping structure of the variables. If provided, variables
#'                     will be sorted by IV within the groups. Useful for selecting variables after performing
#'                     some clustering procedure. Format: a \code{tbl} with 2 columns: \code{var}, the names
#'                     of the variables and \code{group}, a number or string identifying groups
#' @return a \code{nrow(dat) x 2 tbl_df} with two columns: \code{var}, giving each
#'         variable name, and \code{iv}, giving the Information Value
#' The function will auto-merge pure bins, and return the information value obtained using the final
#' merged bins.
#' @export
#' @import magrittr
#' @import foreach

iv_sort <- function(dat,
                    response,
                    var_grouping = NULL) {
  # Check: single binary response, and only factored input variables?
  if (!(response %in% colnames(dat))) {
    stop("Response variable ",response," not found in dataset")
  }

  if (!is.numeric(modellingTools::column_vector(dat,response))) {
    tp <- class(modellingTools::column_vector(dat,response))
    stop(stringr::str_c("Response must be of numeric type. Response is of type ",tp))
  }

  if (length(unique(modellingTools::column_vector(dat,response))) > 2) {
    un <- unique(modellingTools::column_vector(dat,response))
    stop(stringr::str_c("Response must be binary. Response has the following unique values: ",un))
  }

  nf <- c()
  for (nm in colnames(dat)[colnames(dat) != response]) {
    if (!is.factor(modellingTools::column_vector(dat,nm))) {
      nf <- c(nf,nm)
    }
  }

  if (length(nf) > 0) {
    stop(stringr::str_c("Input variables are not all factors. The following variables are not factored: ",nf))
  }

  if (!is.null(var_grouping)) {
    if (!all(class(var_grouping) == c("tbl_df","tbl","data.frame"))) {
      warning("var_grouping was not provided as a tbl- grouping structure ignored")
      var_grouping <- NULL
    } else if (!all(sort(names(var_grouping)) == c("group","var"))) {
      warning("var_grouping does not follow required format of being a tbl with exactly two columns named 'group' and 'var'. Argument ignored")
      var_grouping = NULL
    }
  }

  # Create a dataframe of the column names and their IV, sorted in descending order
  iv_unsorted <- foreach::foreach(nm = colnames(dat)[colnames(dat) != response],
                   .inorder = FALSE,
                   .combine = dplyr::bind_rows,
                   .init = dplyr::data_frame(var = "",iv = -100),
                   .final = function(x) {
                              x %>%
                              dplyr::filter_("var != ''")
                     },
                   .export = c("response")
  ) %do% {
    dplyr::data_frame(var = nm,
                      iv = modellingTools::information_value(dat,
                                                         nm,
                                                         bins = 0,
                                                         response = response,
                                                         warn = FALSE,
                                                         auto_merge = TRUE)
    )
  }

  if (!is.null(var_grouping)) {
    iv_sorted <- iv_unsorted %>%
                  dplyr::inner_join(var_grouping,by = "var") %>%
                  dplyr::arrange_("group","desc(iv)")
  } else {
    iv_sorted <- iv_unsorted %>%
                 dplyr::arrange_("desc(iv)")
  }

  return(iv_sorted)

}
awstringer/modellingTools documentation built on May 11, 2019, 4:11 p.m.