#' Cut a Numeric Variable into Intervals
#'
#' Enhanced cut that supports among other things factor inputs, optimal grouping,
#' and flexible formatting.
#'
#' @param x numeric vector to classify into intervals
#' @param i numeric, character, or list, main parameter depending on `what`
#' @param what character, choices can be abreviated
#' @param labels character of the same length as the resulting bins or function
#' (or formula) to apply on the relevant bin's values.
#' @param closed character, which side of the intervals should be closed
#' @param expand logical, if TRUE cuts are added if necessary to cover min and max values
#' @param crop logical, if TRUE intervals which go past the min or max values will be cropped
#' @param simplify logical, if TRUE categories containing only one distinct
#' value will be named by it
#' @param squeeze logical, if TRUE all bins are cropped so they are closed on
#' both sides on their min and max values, useful for sparse data and factors
#' @param open_end include in last interval on open side the values which fall on the last cutpoint
#' @param sep,brackets character, used to build the default labels
#' @param output character, class of output
#' @param format_fun formatting function
#' @param ... additional arguments passed to \code{format_fun}
#'
#' @section what:
#'
#' Depending on the value of `what`, i is:
#'
#' \describe{
#' \item{breaks}{the actual cut points}
#' \item{groups}{the number of desired groups, by default cuts are
#' calculated as quantiles, which might not always give i groups for some
#' distributions, see help on optim_fun below to handle these cases }
#' \item{n_by_group}{the number of desired items by group, with the
#' same caveat as above}
#' \item{n_intervals}{the number of desired intervals}
#' \item{width}{the interval width, which will be centered on 0 by default or
#' at a different value (see dedicated section)}
#' \item{cluster}{the number of clusters}
#' \item{bins}{the bin values, useful if using an external clustering function}
#' \item{rough}{cuts into i groups of equal size (if possible), 2 elements of
#' same value can be in different buckets, hence the "rough" adjective. Default
#' brackets might be misleading when making this choice}
#' }
#'
#' If `what` is `"group"` or `"width"` i can be a list in which the second
#' element is a' function, see following sections.
#'
#' @section optimize groups:
#'
#' If `what = "groups"` then `i` can be a list in which the second element is a
#' function that will be applied on all possible combinations.
#' It will be fed the size of bins as its first argument and the
#' cuts as its second. From the results the combination that gives the minimum
#' result will be kept.
#'
#' Alternatively the parameter can be any of the following strings:
#'
#' \describe{
#' \item{"balanced"}{Returns a combination with the minimal group size variance}
#' \item{"biggest_small_bin"}{Returns a combination that has the biggest smallest
#' bin, to avoid narrow intervals}
#' \item{"smallest_big_bin"}{Returns a combination that has the biggest smallest
#' bin, to avoid wide intervals}
#' }
#'
#' In practice the results should be quite similar and balanced should be enough
#' most of the time, for continuous data of a decent size without singular points,
#' optimization of groups is not necessary and will be ressource expensive.
#'
#' @section custom left boundary for `what = "width"`:
#'
#' If `what = "width"` then `i` can be either single numeric value setting the
#' width of the interval or a list in which the second element is a
#' function that will be applied on x (as a first parameter) and the cut points
#' (as a second parameter). The output of this function will determine where the
#' leftmost interval starts. Formula notation is supported.
#'
#' Alternatively the parameter can be a numeric value or any of the following
#' strings:
#'
#' \describe{
#' \item{"left"}{First interval starts at the data point}
#' \item{"right"}{Last interval stops at the last data point}
#' \item{"centered"}{Margins are balanced on both sides}
#' \item{"centered0"}{Interval containing zero is centered on zero}
#' }
#'
#' @section cluster:
#'
#' Uses function `stats::kmean` to
#' cluster `x` into `i` groups
#'
#' @section format_fun:
#'
#' The original base::cut uses formatC in its code to format the labels while
#' the commonly used Hmisc::cut2 uses format. smart_cut allows one to choose and to
#' pass additional parameters to ... .
#'
#' Any formating function can be used as long as it takes as a first argument a
#' vector of characters and returns one.
#'
#' The function format_metric including in cutr permits additional formatting
#' especially well suited for smart_cut
#'
#' @seealso
#' `?cut`
#' `?Hmisc::cut2`
#' `?format`
#' `?formatC`
#' `?format_metric`
#' `?kmeans
#'
#' @return a factor variable with levels of the form \code{"[a,b]"} or formatted means (character strings) unless \code{onlycuts} is \code{TRUE} in which case a numeric vector is returned
#' @export
#'
#' @examples
#' x <- c(rep(1,3),rep(2,2),3:6,17:20)
#'
#' # fixed breaks
#' table(smart_cut(x,cuts,"breaks"))
#'
#' # groups defined by quantiles
#' table(smart_cut(x,2,"groups"))
#'
#' # optimized groups of equal size
#' table(smart_cut(x,list(2,"balanced"),"groups"))
#'
#' # try to get 3 items by group using quantiles
#' table(smart_cut(x,3,"n_by_group"))
#'
#' # try to get 3 items by group using optimization
#' table(smart_cut(x,list(3,"balanced"),"n_by_group"))
#'
#' # intervals of equal width
#' table(smart_cut(x,3,"n_intervals"))
#'
#' # interval of equal defined width,
#' table(smart_cut(x,7,"width")) # start on 1st value
#' table(smart_cut(x,list(7,"right"),"width")) # end on last value
#' table(smart_cut(x,list(6,"centered"),"width")) # centered
#' table(smart_cut(x,list(6,"centered0"),"width")) # centered on 0
#' table(smart_cut(x,list(7,0),"width")) # starting on 0
#'
#' # create groups by running a kmeans clustering
#' table(smart_cut(x,3,"cluster"))
#' simplify
#' table(smart_cut(x, 5, "width"))
#' table(smart_cut(x, 5, "width", simplify = FALSE))
#'
#' # expand
#' table(smart_cut(x,c(4,10,18)))
#' table(smart_cut(x,c(4,10,18),expand = FALSE))
#'
#' # crop
#' table(smart_cut(x,c(0,10,30)))
#' table(smart_cut(x,c(0,10,30),crop = TRUE))
#'
#' # squeeze
#' table(smart_cut(x,c(0,10,30)))
#' table(smart_cut(x,c(0,10,30),squeeze = TRUE))
#'
#' # brackets
#' table(smart_cut(x,c(0,10,30), brackets = c("]","[","[","]")))
#' table(smart_cut(x,c(0,10,30), brackets = NULL, sep = "~", squeeze= TRUE))
#'
#' # labels
#' table(smart_cut(x,c(4,10)))
#' table(smart_cut(x,c(4,10),labels = ~mean(.x))) # mean of values by interval
#' table(smart_cut(x,c(4,10),labels = ~mean(.y))) # center of interval
#' table(smart_cut(x,c(4,10),labels = ~median(.x))) # median
#' table(smart_cut(x,c(4,10),labels = ~paste(
#' sep="~",.y[1],round(mean(.x),2),.y[2]))) # a more sophisticated label
#'
#' # format_fun
#' table(smart_cut(x^6 + x/100,5,"g"))
#' table(smart_cut(x^6 + x/100,5,"g",format_fun = format, digits = 3))
#' table(smart_cut(x^6,5,"g",format_fun = signif))
#' table(smart_cut(x^6,5,"g",format_fun = smart_signif))
#' table(smart_cut(x^6,5,"g",format_fun = format_metric))
smart_cut <- function(
x,
i,
what = c("breaks", "groups", "n_by_group", "n_intervals", "width", "cluster", "bins", "rough"),
labels = NULL,
closed = c("left", "right"),
expand = TRUE,
crop = FALSE,
simplify = TRUE,
squeeze = FALSE,
open_end = FALSE,
brackets = c("(", "[", ")", "]"),
sep = ",",
output = c("ordered", "factor", "character","numeric","breaks","labels"),
format_fun = formatC, ...){
# checks
what <- match.arg(what)
closed <- match.arg(closed)
output <- match.arg(output)
if (is.null(brackets)) brackets <- rep("",4)
if (what == "rough") {
i <- as.integer(floor(i * (rank(x, ties.method = "first", na.last = "keep") - 1)/sum(!is.na(x)) + 1))
what <- "bins"
}
if (what == "bins") {
if (length(i) != length(x))
stop("i and x have different lengths, invalid for `what = 'bins'`")
if (anyNA(i))
stop("bins containing NA values are not supported")
if (is.unsorted(i[order(x)]))
stop("bins must be sorted along x")
}
# extract relevant functions from i arg
if (what %in% c("groups", "n_by_group") && length(i) > 1) {
optim_fun <- i[[2]]
i <- i[[1]]
} else optim_fun <- NULL
# convert "n_by_group" case into a "groups" case
if (what == "n_by_group") {
i <- max(1, floor(sum(!is.na(x))/i))
what <- "groups"}
if (what == "width" && length(i) > 1) {
width_fun <- i[[2]]
i <- i[[1]]
} else width_fun <- NULL
i >= 1 || what == "breaks" || stop("i must be positive")
# set mappers (handle formula notation if relevant)
set_mappers(labels, optim_fun, format_fun, width_fun, only_formulas = TRUE)
# handle factors
if (is.factor(x) && what == "breaks" && (is.character(i) || is.factor(i)))
i <- match(as.character(i),levels(x))
# get breaks
cuts <- get_cuts(x = as.numeric(x), i = i, what = what, expand = expand,
crop = crop, closed = closed, open_end = open_end,
optim_fun = optim_fun, width_fun = width_fun)
if (output == "breaks") return(cuts)
# after the cropping is done, ends are closed by definition
if (crop || expand) open_end <- FALSE
if (length(cuts) == 1 && !expand)
stop("Can't cut data if only one break is provided and `expand` is FALSE")
# get numeric bins
if (what == "bins") {
bins <- i
} else {
bins <- .bincode(
as.numeric(x), breaks = cuts, right = (closed == "right"),
include.lowest = !open_end)
}
if (output == "numeric") return(bins)
# get raw output
bins <- cut_explicit(bins = bins, x = x, cuts = cuts , labels = labels, simplify = simplify,
closed = closed, squeeze = squeeze, open_end = open_end,
brackets = brackets, sep = sep,
format_fun = format_fun, output = output, ...)
bins
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.