#' prep_num_bin
#'
#' @description
#' Function which bins numeric vector into bins of equal weight (or a smooth function on weight).
#'
#' @param var_to_band Vector[numeric] - vector of values to be binned
#' @param n_bins numeric - Number of bins to split exposure into
#' @param weight numeric - vector weight for observations
#' @param use_labels logical - should the bins be numbered or human readable labelled
#' @param method string - One of \code{c("even_weight", "gaussian_weight")}.
#' If \code{"even_weight"} is used the weight (exposure) in each bucket is even
#' If \code{"gaussian_weight"} is used the weight look like a normal distribution centered on \code{mean} with width \code{sd}. The effect of this is to give thinner tails
#' @param mean numeric - Only used when \code{method}=\code{"gaussian_weight"} - value between 0 - 1 for if the bucket with maximum weight is at min or max prediction. Default is 0.5 (middle bucket)
#' @param sd numeric - Only used when \code{method}=\code{"gaussian_weight"} - value between 0.1 - 1 for how thin the distribution tails should be. Low numbers give thin tails
#'
#' @return a list with the following named entried:
#' bins: Either a numeric vector or a factor (depending on \code{use_labels}) which gives the bin. The vector is of length \code{length(var_to_band)} and has \code{n_bins} unique values
#' labels: Human readable labels of where binning has occored
#' vector with values 1 to \code{n_bins} indicating bin of var_to_band. (1 is low \code{var_to_band} and \code{n_bins} is high \code{var_to_band})
#' @export
#'
#' @examples
#'
#' prep_num_bin(var_to_band=1:20, n_bins=5)
#' prep_num_bin(var_to_band=1:20, n_bins=5, method="gaussian_weight")
#' prep_num_bin(var_to_band=1:20, n_bins=5, method="gaussian_weight", use_labels=TRUE)
prep_num_bin <- function(var_to_band, n_bins=10, weight=NULL, method="even_weight", use_labels=FALSE, mean=0.5, sd=0.3){
# Data prep and checking --------------------------------------------------
checkmate::assert_choice(method, c("even_weight", "gaussian_weight"))
checkmate::assert_integerish(n_bins, len=1, lower=2) # check n_bins is numeric of length 1
checkmate::assert_numeric(var_to_band, min.len = n_bins) # check var_to_band is a numeric vector
checkmate::assert_logical(use_labels, len = 1)
if (is.null(weight)){# If weight is null use identity
weight <- rep(1, length(var_to_band))
}else{# check weight is same length as var_to_band
checkmate::assert_numeric(weight, len=length(var_to_band), lower=0)
}
if (method == "even_weight"){
weight_cut <- seq(0,1,length.out = n_bins + 1)[2:n_bins] * sum(weight, na.rm = TRUE) # Find the exposure to go in each bin
}else if (method == "gaussian_weight"){
checkmate::assert_numeric(mean, lower=0, upper=1, len = 1)
checkmate::assert_numeric(sd, lower=0.1, upper=1, len = 1)
norm.shape <- dnorm(seq(0,1,length.out = n_bins), mean=mean, sd=sd)
cumnorm.shape <- (cumsum(norm.shape) / sum(norm.shape))[1:(n_bins-1)]
weight_cut <- cumnorm.shape * sum(weight, na.rm = TRUE)
}
# Use a dataframe so columns can be ordered by one another
data_binned <- data.frame(n=1:length(var_to_band), var_to_band=var_to_band, weight=weight) %>%
dplyr::arrange(var_to_band) %>% # sort by var_to_band value
dplyr::mutate(weight = ifelse(is.na(.data[["weight"]]),0,.data[["weight"]]), # remove NAs in weight
cumsum.weight=cumsum(weight), # cumulative exposure
bin=1 # place exposure into equal sized bins
)
# Don't place bin edge between identical values
for (ii in 2:nrow(data_binned)){
if(data_binned[ii,"var_to_band"]==data_binned[ii-1,"var_to_band"] & !is.na(data_binned[ii,"var_to_band"]) & !is.na(data_binned[ii-1,"var_to_band"])){
data_binned[ii,"cumsum.weight"] <- data_binned[ii-1,"cumsum.weight"]
}
}
# placed back into original order
data_binned <- data_binned %>%
dplyr::arrange(n)
for (ii in 1:length(weight_cut)){
data_binned$bin = data_binned$bin + ifelse(data_binned$cumsum.weight > weight_cut[ii], 1, 0)
}
#If value is unknown the bin is unknown
data_binned$bin = ifelse(is.na(data_binned$var_to_band), NA, data_binned$bin)
# Get bins
data_bins <- data_binned %>% dplyr::pull("bin")
# Get bins
data_labels <- data_binned %>%
dplyr::group_by(bin) %>%
dplyr::summarise(min=format(min(var_to_band),scipen=5), max=format(max(var_to_band),scipen=5), .groups="keep") %>%
dplyr::ungroup() %>%
dplyr::mutate(label=paste0("[", min, " - ", ifelse(!is.na(dplyr::lead(min)), dplyr::lead(min), max), ")")) %>%
dplyr::pull("label")
if (use_labels==TRUE){
data_bins <- factor(data_bins, levels=unique(data_bins) %>% sort(), labels = data_labels, ordered=TRUE)
}
return(list(bins=data_bins, labels=data_labels))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.