R/variable_binning.R

# Functions to support variable binning

# Contents:
#   - function to bin a vector into equal height, equal width, or custom bins
#   - function to take the columns from a dataframe and replace them with their
#     binned values; as well can optionally bin an additional "test" set into
#     those same values
#   - function to take a dataframe of binned data and return a list containing
#     the unique cut points for each bin


#==============================================================================#

#' Get the cutpoints from a single factor vector.
#'
#' get_vector_cutpoints returns a numeric vector giving the unique
#' cutpoints of a variable that has been discretized using
#' vector_bin- more generally, using \code{\link[base]{cut}} and any
#' functions that depend on \code{\link[base]{cut}}
#'
#' @param v vector to get cutpoints from
#' @return a vector containing the unique cutpoints in v
#' @details
#' This function is provided for convienience, and is built to work with the
#' exact format for discretized variables that is used by the cut family. Hence
#' it will work for cut, cut_number/cut_interval, and any of the binning
#' functions from modellingTools, but it is not guaranteed to work for
#' arbitrary factors with numeric levels
#' @family discretization
#' @seealso \code{\link[base]{cut}}, \code{\link[ggplot2]{cut_number}},
#' \code{\link[ggplot2]{cut_interval}}, \code{\link{vector_bin}}
#' @examples
#' x <- cut(rnorm(100),c(-1,0,1))
#' get_vector_cutpoints(x) # -1, 0, 1
#' @export
#' @import magrittr

get_vector_cutpoints <- function(v) {
  if (is.factor(v)) {
    lv <- levels(v)
  } else {
    lv <- v
  }

  cut_points <- lv %>%
                stringr::str_split(",") %>%
                #unlist() %>%
                stringr::str_extract_all("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?") %>%
                unlist() %>%
                unique() %>%
                as.numeric() %>%
                sort()



  return(cut_points)
}


#' Get the unique cutpoints of each appropriate column in a tbl.
#'
#' This function takes a dataframe where any number of columns have been binned
#' into factors using cut/vector_bin and returns a named list containing the
#' cutpoints for each variable.
#' This is useful for, for example, binning a new dataset into the same bins
#' as an older dataset- such as when making predictions on test data
#'
#' @param d a tbl
#' @param parallel logical. If TRUE, parallel foreach used. Must register
#' parallel beforehand. Default false
#' @return a named list containing one vector for each factor type variable.
#' Each vector contains the unique cut points of that variable
#' @family discretization
#' @seealso \code{\link{get_vector_cutpoints}}, \code{\link{simple_bin}}
#' @details
#' This function essentially calls \code{\link{get_vector_cutpoints}} on all
#' factor type columns of \code{d}. It is meant to be used to provide an output
#' format that works with the \code{bins} argument of \code{\link{simple_bin}},
#' for the purpose of defining cutpoints based on one dataset and then applying
#' them to other datasets. The basic functionality of binning on training data
#' and applying those bins to test data is built right in to
#' \code{\link{simple_bin}}, but this function allows the user total
#' flexibility.
#' @examples
#' x <- dplyr::data_frame(v1 = cut(rnorm(100),c(-1,0,1)),
#'                        v2 = cut(rnorm(100),c(-.5,0,.5)))
#' binned_data_cutpoints(x)
#' binned_data_cutpoints(x,parallel = TRUE)
#' @export
#' @import foreach

binned_data_cutpoints <- function(d,parallel = FALSE) {

  # Which variables are factors?
  f_list <- c()
  for (i in 1:ncol(d)) {
    if (is.factor(column_vector(d,i))) {
      f_list <- c(f_list,colnames(d)[i])
    }
  }

  # Get the cutpoints
  if (parallel) {
    cutpoints <- foreach::foreach(cl = f_list,
                         .final = function(x) stats::setNames(x,f_list),
                         .export = c("get_vector_cutpoints",
                                     "column_vector",
                                     "cl"),
                         .packages = c("stringr")) %dopar% {
                            get_vector_cutpoints(column_vector(d,cl))
                         }

  } else {
    cutpoints <- foreach::foreach(cl = f_list,
                         .final = function(x) stats::setNames(x,f_list)) %do% {
                           get_vector_cutpoints(column_vector(d,cl))
                         }
  }

  return(cutpoints)
}

#' Bin a vector into equal height, equal width, or custom bins
#'
#' This function essentially calls \code{\link[base]{cut}}/
#' \code{\link[ggplot2]{cut_interval}}/\code{\link[ggplot2]{cut_number}},
#' depending on the value of bins and type. The one major difference is in the
#' treatment of missing values; those functions return NA, while vector_bin has
#' the default option of returning a bin for the missing values
#'
#' @param x vector of numeric data to bin
#' @param bins numeric vector. If length 1, then this is taken to be the number of
#'         desired bins, computed according to "type". If length > 1, this is taken
#'         to be the actual cutpoints desired
#' @param type character, equal to "height" or "width". Only used if length(bins) == 1.
#'         If "height", then bins are computed to have roughly equal numbers of
#'         observations; else, bins are computed to be of roughly equal width
#' @param na_include logical. If TRUE, then a bin labelled "missing" will be included
#'               in the output. Else NA values are dropped
#' @return the input vector x, with values replaced by the appropriate bins.
#'         Type also changed to factor
#' @family discretization
#' @seealso \code{\link[base]{cut}}, \code{\link[ggplot2]{cut_number}},
#' \code{\link[ggplot2]{cut_interval}},
#' @examples
#' x <- rnorm(100)
#' y <- x; y[sample(1:100,20)] <- NA
#'
#' cut(x,c(-1,0,1))
#' vector_bin(x,bins = c(-1,0,1))
#' cut(y,c(-1,0,1))
#' vector_bin(y,bins = c(-1,0,1))
#' vector_bin(y,bins = c(-1,0,1),na_include = FALSE)
#'
#' ggplot2::cut_number(x,5)
#' vector_bin(x,5)
#'
#' ggplot2::cut_interval(x,5)
#' vector_bin(x,5,type = "width")
#' @export
#' @import magrittr

vector_bin <- function(x,bins,type = "height",na_include = TRUE) {

  if (!is.numeric(x)) return(x)

  if (length(bins) > 1) {
    binned_x <- try(as.character(cut(x,
                                     bins,
                                     right = FALSE,
                                     include.lowest = TRUE)),silent = TRUE)
  }
  else if (type == "height") {
    if (bins >= length(unique(x))) return(x)
    binned_x <- try(as.character(ggplot2::cut_number(x,bins,right = FALSE)),silent = TRUE)
  }
  else {
    if (bins >= length(unique(x))) return(x)
    binned_x <- try(as.character(ggplot2::cut_interval(x,bins,right = FALSE)),silent = TRUE)
  }

  if (class(binned_x) == "try-error") {
    print("Could not bin x")
    return(x)
  }

  if (na_include) {
    binned_x[is.na(binned_x)] <- "Missing"
  }
  else {
    binned_x <- binned_x[!is.na(binned_x)]
  }

  # Replace all open brackets, replace with closed
  binned_x <- binned_x %>%
              stringr::str_replace_all("\\(","\\[") %>%
              stringr::str_replace_all("\\)","\\]")

  # Removed factor class to deal with NAs: re-add it now
  binned_x <- factor(binned_x)

  return(binned_x)

}

#' Discretize variables in your training and test datasets
#'
#' Function to apply simple equal-width or equal-height binning to columns of a
#' training dataset, and then optionally bin the columns of a test set into bins
#' with the appropriate cutpoints
#'
#' @param train training set
#' @param test test set
#' @param exclude_vars variables to exclude (e.g. the target, or the row ID)
#' @param include_vars if you only want certain variables binned, you may specify them
#'                 directly instead of excluding all other variables
#' @param bins single number specifying the number of bins to create on each variable,
#'         or a named list specifying cut-points for each variable
#' @param type if bins is given as a number, then this determines whether to create
#'         bins with equal number of observations ("height") or of equal width
#'         ("width")
#' @param na_include logical. Give missing values their own bin?
#' @return if test is not NULL, a list containing two tbl_df objects, with appropriate
#'     columns replaced by their binned values and all other columns unchanged
#'     if test is NULL, returns the training set portion of the list
#'
#' @family discretization
#' @seealso \code{\link{vector_bin}}, \code{\link{get_vector_cutpoints}}
#' @details
#' This function was built as a convenience, to automate the process of binning
#' continuous variables into disrete levels, and also to provide a simple,
#' interpretible, unambiguous method of dealing with missing values in data
#' science problems.
#' @export
#' @import foreach magrittr

simple_bin <- function(train,
                       test = NULL,
                       exclude_vars = NULL,
                       include_vars = NULL,
                       bins,
                       type = "height",
                       na_include = TRUE) {

  if (length(exclude_vars) > 0 && length(include_vars > 0)) {
    stop("Cannot specify both include and exclude lists")
  }

  index <- c()

  # If bins is a list, then bin only the variables included
  # else, bin everything not in exclude_list, OR that is in include_list

  if (is.list(bins)) {
    index <- names(bins)
  } else if (length(exclude_vars > 0)) {
    for (i in 1:ncol(train)) {
      if (!(colnames(train)[i] %in% exclude_vars)) {
        index <- c(index,colnames(train)[i])
      }
    }
  } else if (length(include_vars) > 0) {
    for (i in 1:ncol(train)) {
      if (colnames(train)[i] %in% include_vars) {
        index <- c(index,colnames(train)[i])
      }
    }
  } else {
    index <- colnames(train)
  }

  # Bin variables on training set
  k <- 1
  binned_train <- foreach::foreach(i = 1:ncol(train),
                         .combine = dplyr::bind_cols,
                         .multicombine = TRUE,
                         .final = function(x) {
                           x <- dplyr::tbl_df(as.data.frame(x))
                           x <- stats::setNames(x,colnames(train))
                           return(x)
                           },
                         .export = c("column_vector",
                                     "vector_bin"),
                         .packages = c("dplyr")) %do% {
    nm <- colnames(train)[i]
    if (nm %in% index) {
      if (is.list(bins)) {
        tmp_bins <- bins[[nm]]
      } else {
        tmp_bins <- bins
      }
      df <- dplyr::data_frame(var1 = vector_bin(column_vector(train,nm),
                                          tmp_bins,
                                          type,
                                          na_include))
    }
    else {
      df <- dplyr::data_frame(var1 = column_vector(train,nm))
    }
    stats::setNames(df,nm)
  }

  colnames(binned_train) <- colnames(train)

  # Now bin on the test set, if present
  if (!is.null(test)) {
    # Get the cutpoints from the binned training set
    train_cutpoints <- binned_data_cutpoints(binned_train,parallel = TRUE)

    # Recursively apply the parent function, with train = test and test ~ NULL
    binned_test <- simple_bin(train = test,test = NULL,
                              bins = train_cutpoints,
                              exclude_vars = exclude_vars)

    return(list(train = binned_train,
                test = binned_test))
  } else {
    # If no test set, returned the binned training set
    return(binned_train)
  }

}


#' Create a usable model matrix from a data frame containing a mix of
#' continuous and categorical variables
#'
#' This function takes your dataframe of input variables and returns a new
#' dataframe (or matrix) with the categorical variables replaced by dummy
#' variables, using \code{\link[stats]{model.matrix}}
#'
#' @param dat a tbl
#' @param id character, naming the variable in dat which serves as the unique
#'            row identifier. If blank, will be created
#' @param matrix_out logical. Should the result be a matrix (\code{TRUE}), suitable
#'            for input into many modelling functions, or should the result be
#'            a tbl (\code{FALSE}), suitible for inspection and further analysis?
#'            Default \code{TRUE}
#' @param parallel logical. If \code{TRUE}, parallel
#'            \code{\link[foreach]{foreach}} is used to compute on each
#'            variable. Must register a parallel backend first. Default
#'            \code{FALSE}.
#' @return a matrix or a tbl, consisting of dummy columns with 0/1 indicators
#'         of membership in each factor level for each factor variable, and all
#'         other input variables unchanged.
#'
#' @details
#' The function will only alter variables which are type \code{factor}. Contrary
#' to how it may sound, this actually offers the user greater flexibility, for
#' two reasons: it allows you to keep character type variables intact, and it
#' forces you to think about the levels of each factor variable rather than
#' picking them straight from the input data
#'
#' @examples
#' x <- simple_bin(iris,bins = 3)
#' create_model_matrix(x)
#' create_model_matrix(x,matrix_out = FALSE)
#'
#' @export
#' @import magrittr foreach
create_model_matrix <- function(dat,
                                id = c(),
                                matrix_out = TRUE,
                                parallel = FALSE) {

  # If no provided row id, create one
  if (length(id) == 0) {
    dat %<>% dplyr::mutate_("id" = stringr::str_c("1:",nrow(dat)))
    id <- c("id")
  }

  # Create a vector of names of variables to include
  include <- foreach::foreach(i = 1:ncol(dat),
                                  .combine = c,
                                  .export = c("i")) %do% {
                                    if (is.factor(column_vector(dat,i))) {
                                          colnames(dat)[i]
                                        }
                                      }

  # Create a list of data frames, each with one widened variable
  # plus a column for the id

  if (parallel) {
    wide <- foreach::foreach(nm = colnames(dat),
                             .combine = dplyr::bind_cols,
                             .final = function(x) {
                               stats::setNames(x,include)
                               return(x)
                             },
                             .multicombine = TRUE,
                             .packages = c("dplyr","modellingTools"),
                             .export = c("nm")) %dopar% {
                               ff <- stats::formula("~ + -1 + " + nm)
                               stats::model.matrix(ff,dat) %>%
                                 as.data.frame() %>%
                                 dplyr::tbl_df()
                             }
  } else {
    wide <- foreach::foreach(nm = include,
                             .combine = dplyr::bind_cols,
                             .final = function(x) {
                               stats::setNames(x,include)
                               return(x)
                             },
                             .multicombine = TRUE,
                             .export = c("nm")) %do% {
                               ff <- stats::formula("~ -1 + " + nm)
                               stats::model.matrix(ff,dat) %>%
                                 as.data.frame() %>%
                                 dplyr::tbl_df()
                             }
  }


  if (matrix_out) {
    # If using as input to a model, remove the id and target columns
    wide %<>% as.matrix()
    return(wide)
  } else {
    wide %<>% dplyr::bind_cols(dat[ ,id])
    return(wide)
  }

}


#' Optimally discretize variables in your training and test datasets
#'
#' This function will perform supervised discritization of all variables in the supplied
#' training, and optionally test, datasets. This utilizes the binning functionality of
#' \code{modellingTools::simple_bin} combined with the optimal
#' bin calculations performed in \code{smbinning::smbinning}. Note that
#' no filtering is done on the resulting binning structure; there may be pure bins,
#' non-monotonic Weights of Evidience, etc. This is left to the user- the package
#' provides a tool-set for dealing with any such concerns.
#'
#' @param train training set
#' @param response a string naming the response variable; must be 0/1 and coercible to \code{factor}
#' @param exclude_vars variables to exclude (e.g. the target, or the row ID)
#' @param include_vars if you only want certain variables binned, you may specify them
#'                 directly instead of excluding all other variables
#' @return a list containing the following elements:
#'            iv: a dataframe containing the variables and their information values,
#'                sorted in descending order
#'            train: a \code{tbl_df} containing the same variables
#'                   as \code{train}, with the appropriate ones binned (per \code{exclude_vars} or
#'                   \code{include_vars})
#'            test: if \code{test} is \code{NULL}, then \code{NULL}; else a \code{tbl_df} containing
#'                  the same variables as \code{test}, binned in the same manner as \code{train}.
#' @export
#' @import magrittr foreach

optimal_bin <- function(train,
                        response,
                        exclude_vars = NULL,
                        include_vars = NULL) {

  # Convert column names to lower case, change . to _ for smbinning
  colnames(train) %<>%
    stringr::str_replace_all("\\.","_") %>%
    tolower()

  if (!is.null(exclude_vars)) {
    exclude_vars %<>%
      stringr::str_replace_all("\\.","_") %>%
      tolower()
  }

  if (!is.null(include_vars)) {
    include_vars %<>%
      stringr::str_replace_all("\\.","_") %>%
      tolower()
  }



  cnames <- colnames(train)
  if (!is.null(exclude_vars)) {
        vars <- cnames[!(cnames %in% exclude_vars) & cnames != response]
     } else if(!(is.null(include_vars))) {
       vars <- cnames[cnames %in% include_vars & cnames != response]
     } else {
       vars <- cnames[cnames != response]
     }

  # Modify the training data
  train <- train[,c(vars,response)]
  train[[response]] <- as.numeric(train[[response]])

  # Helper function for extracting optimal bins from smbinning
  extract_cutpoints <- function(lst) {
    if (class(lst) == "list") {
      if (exists("bands",where = lst)) {
        return(lst$bands)
      }
    }
    return(c())
  }


  # Compute the optimal bins, for numeric variables
  vars_n <- vars[sapply(train,is.numeric)]
  vars_n <- vars_n[!is.na(vars_n)]

  opt_bins <- foreach::foreach(vr = vars_n,
                               .inorder = TRUE,
                               .final = function(x) stats::setNames(x,vars_n),
                               .export = c("train","bin_help",response)
              ) %do% {
                train %>%
                    as.data.frame() %>%
                    smbinning::smbinning(df = .,
                                         y = response,
                                         x = vr,
                                         p = 0.05) %>%
                    extract_cutpoints()

              }


  return(opt_bins)

}
awstringer/modellingTools documentation built on May 11, 2019, 4:11 p.m.