R/utility_functions.R

#' Merge specific levels of a factor
#'
#' This function allows the user to programatically merge levels of a factor
#' together to create aggregate levels. This is designed to work for data that
#' has been discretized by, e.g. \code{\link[modellingTools]{vector_bin}},
#' but will work for arbitrary factors. In the case of factors with numeric
#' levels, the user can specify to merge the names of the old levels when
#' naming the new level
#' @param var Vector for which \code{\link[base]{is.factor}} returns
#' \code{TRUE}
#' @param lv Character vector containing 2 or more levels of \code{var} to
#' merge.
#' @param merge_names Logical. Should names of numeric factors be merged?
#' Default is \code{FALSE}.
#' @return A new factor level corresponding to \code{var}, with values that were
#' in \code{lv} replaced with a new factor level. The name of thre level depends
#' on \code{merge_names}.
#' @details
#' When not merging names, if levels \code{A}, \code{B}, and \code{C} are to be
#' merged, the resulting level will be \code{AxBxC}. If the factor levels
#' contain numbers, specifying \code{merge_names = TRUE} will cause
#' \code{merge_levels} to call \code{\link[modellingTools]{get_vector_cutpoints}}
#' on the new level, and then use only the minimum and maximum cutpoints- this
#' will result in the merge of \code{[1,2]}, \code{[2,3]}, and \code{[3,4]}
#' being named \code{[1,4]} instead of \code{[1,2]x[2,3]x[3,4]}. While this
#' seems like obviously what one would want the default behaviour to be, the
#' user must specify this explicitly to avoid unexpected results.
#' @family utility functions
#' @seealso woe_single
#' @examples
#' x <- factor(1:10)
#' merge_levels(x,c(1,2,3))
#' merge_levels(x,c(1,2,3),merge_names = TRUE)
#'
#' y <- modellingTools::vector_bin(x,bins = 3)
#' @export

merge_levels <- function(var,lv,merge_names = FALSE) {
  # Check if var is a factor
  if (!is.factor(var)) {
    stop(stringr::str_c("Variable is not a factor"))
  }

  # Check if supplied levels actually exist
  levels_exist <- lv %in% levels(var)
  if (!any(levels_exist)) {
    stop("None of the supplied levels exist within supplied variable")
  } else if (!all(levels_exist)) {
    which_dont_exist <- lv[which(!levels_exist)]
    warning(stringr::str_c("The following level is not present in supplied variable: ",
                          which_dont_exist))
    lv <- lv[which(levels_exist)]
  }

  # Convert it to a character
  var <- as.character(var)

  # Make sure > 1 levels provided
  if (length(lv) < 2) stop("Please provide > 1 levels to combine")

  new_level <- stringr::str_c(lv[1],"x")
  if (length(lv) > 2) {
    for (l in 2:(length(lv) - 1)) {
      new_level <- stringr::str_c(new_level,lv[l],"x")
    }
  }
  new_level <- stringr::str_c(new_level,lv[length(lv)])

  if (merge_names) {
    new_bin_cutpoints <- modellingTools::get_vector_cutpoints(new_level)
    new_level <- stringr::str_c("[",
                                min(new_bin_cutpoints),
                                ",",
                                max(new_bin_cutpoints),
                                "]")
  }

  for (i in 1:length(var)) {
    if (var[i] %in% lv) var[i] <- new_level
  }



  return(factor(var))
}
awstringer/innR2binnR documentation built on May 11, 2019, 4:11 p.m.