R/plotting_numerical_buckets.R

Defines functions plotting_numerical_buckets

Documented in plotting_numerical_buckets

#' plotting_numerical_buckets
#'
#' @description
#' This function takes a numeric vector \code{var_to_band} and groups it into buckets which are useful for plotting.
#' These groups have approximately equal weight
#' Outliers are excluded/included using the \code{include_outliers} option.
#' This function is built on \code{\link{prep_num_bin}}
#'
#' @param var_to_band Vector[numeric] - vector of values to be binned
#' @param n_bins numeric - Number of bins to split exposure into
#' @param weight numeric - vector weight for observations
#' @param include_outliers logical - should outliers be trimmed
#'
#' @return data.frame with columns
#' bin [numeric]: numeric label of bin
#' labels [character]: string output from \code{\link{prep_num_bin}} if called
#' lower [numeric]: Min bound of the bin
#' upper [numeric]: Max bound of the bin
#' center [numeric] : Center of the bin - (upper + lower) / 2
#' width [numeric] : Width of the bin - (upper - lower)
#' @export
#'
#' @examples
#' plotting_numerical_buckets(var_to_band=runif(100), n_bins=10)
plotting_numerical_buckets <- function(var_to_band, n_bins=25, weight=NULL, include_outliers=FALSE){

  checkmate::assert_numeric(var_to_band)
  checkmate::assert_integerish(n_bins, len=1)
  checkmate::assert_logical(include_outliers, len=1)

  # Use no weighting if none given
  if (is.null(weight)){
    weight <- rep(1, length(var_to_band))
  }else{
    checkmate::assert_numeric(weight, len=length(var_to_band), lower=0)
  }

  if(var_to_band %>% unique() %>% length() < (n_bins*2)){ # low number of levels. Just return the number of levels

    binning_df <- data.frame(bin=1:(var_to_band %>% unique() %>% length()),
                             labels="",
                             center=var_to_band %>% unique() %>% sort(),
                             stringsAsFactors = FALSE) %>%
      dplyr::mutate(width=min(c(1, center - dplyr::lag(center)), na.rm=TRUE),
                    lower=center - width/2,
                    upper=center + width/2)

    return(binning_df)

  }else{ # Data needs splitting into buckets

    # Remove outliers using inter quantile range
    Q <- quantile(var_to_band, probs=c(.25, .5, .75), names = FALSE)
    Q1 <- Q[[1]]
    Q3 <- Q[[3]]
    IQR <- Q3 - Q1

    # Remove outliers (This is standard logic used in boxplots)
    lower.tail <- Q1 - (1.5 * IQR)
    upper.tail <- Q3 + (1.5 * IQR)

    lower.start <- max(min(var_to_band), lower.tail)
    upper.start <- min(max(var_to_band), upper.tail)

    data_inliers <- var_to_band[var_to_band>=lower.start & var_to_band<=upper.start]
    weight_inliers <- weight[var_to_band>=lower.start & var_to_band<=upper.start]

    data_outliers <- var_to_band[var_to_band<lower.start | var_to_band>upper.start]
    outlier_values <- data_outliers %>% sort() %>% unique()

    binning_out <- prep_num_bin(data_inliers, n_bins=n_bins, weight=weight_inliers)

    # labels look like [ x, y ) regex sets lower to x and upper to y
    binning_df <- data.frame(bin=1:length(binning_out$labels), labels=binning_out$labels, stringsAsFactors = FALSE) %>%
      dplyr::mutate(lower=gsub("(\\[|\\()(\\d*.?\\d*)( - )(\\d*.?\\d*)(\\)|\\])", "\\2", labels) %>% as.numeric(),
                    upper=gsub("(\\[|\\()(\\d*.?\\d*)( - )(\\d*.?\\d*)(\\)|\\])", "\\4", labels) %>% as.numeric(),
                    center=(upper+lower)/2,
                    width=upper-lower)

    if (include_outliers==TRUE & length(outlier_values)>0 & length(outlier_values)<n_bins){

      width_t = min(c(1, outlier_values-lag(outlier_values), binning_df$width), na.rm=TRUE)

      outliers_df <- data.frame(bin=-(1:length(outlier_values)), center=outlier_values, labels="") %>%
        dplyr::mutate(lower=center-(width_t/2), upper=center+(width_t/2), width=width_t)

      binning_df <- rbind(binning_df, outliers_df)
    }

    return(binning_df)
  }

}
gloverd2/admr documentation built on Dec. 2, 2020, 11:16 p.m.