Nothing
#' Add a column of ntiles to a data table
#' @param DT A \code{data.table}.
#' @param col The column name (quoted or unquoted) for which quantiles are desired.
#' @param n A positive integer, the number of groups to split \code{col}.
#' @param weights If \code{NULL}, the default, use unweighted quantiles.
#' Otherwise, a string designating the column that is passed to
#' \code{\link{weighted_ntile}}.
#' @param by,keyby Produce a grouped quantile column, as in \code{\link[data.table]{data.table}}.
#' \code{keyby} will set a key on the result (\emph{i.e.} order by \code{keyby}).
#' @param new.col If not \code{NULL}, the name of the column to be added.
#' If \code{NULL} (the default) a name will be inferred from \code{n}.
#' (For example, \code{n = 100} will be \code{<col>Percentile}).
#' @param character.only (logical, default: \code{FALSE}) Do not contemplate
#' \code{col} to be an unquoted column name.
#' @param overwrite (logical, default: \code{TRUE}) If \code{TRUE} and
#' \code{new.col} already exists in \code{DT}, the column will be overwritten.
#' If \code{FALSE}, attempting to overwrite an existing column is an error.
#' @param check.na (logical, default: \code{FALSE}) If \code{TRUE}, \code{NA}s
#' in \code{DT[[col]]} will throw an error. If \code{NA}'s are present, the
#' corresponding n-tile may take any value.
#'
#' @examples
#' library(data.table)
#' DT <- data.table(x = 1:20, y = 2:1)
#' mutate_ntile(DT, "x", n = 10)
#' mutate_ntile(DT, "x", n = 5)
#' mutate_ntile(DT, "x", n = 10, by = "y")
#' mutate_ntile(DT, "x", n = 10, keyby = "y")
#'
#' y <- "x"
#' DT <- data.table(x = 1:20, y = 2:1)
#' mutate_ntile(DT, y, n = 5) # Use DT$y
#' mutate_ntile(DT, y, n = 5, character.only = TRUE) # Use DT$x
#'
#' @return \code{DT} with a new integer column \code{new.col} containing the
#' quantiles. If \code{DT} is not a \code{data.table} its class may be preserved
#' unless \code{keyby} is used, where it will always be a \code{data.table}.
#'
#' @export mutate_ntile
mutate_ntile <- function(DT,
col,
n,
weights = NULL,
by = NULL,
keyby = NULL,
new.col = NULL,
character.only = FALSE,
overwrite = TRUE,
check.na = FALSE) {
if (length(n) != 1L) {
stop("`length(n) = ", length(n), "`.",
"`n` must be a single positive whole number.")
}
if (!is.integer(n)) {
if (!is.double(n)) {
stop("`n` was type ", typeof(n), ".",
"`n` must be a single positive whole number.")
}
.n <- as.integer(n)
if (.n != n) {
stop("`n` was type double but `n != as.integer(n)`.",
"`n` must be a single positive whole number.")
}
n <- .n
}
if (missing(col)) {
stop("Argument `col` is missing, with no default.")
}
if (character.only) {
if (length(col) != 1L) {
stop("`col` had length ", length(col), ". ",
"Ensure `col` has length one.")
}
if (!is.character(col)) {
stop("`col = ", col, "` was a ", class(col)[1L],
", but must be a length-one character.")
}
if (col %notchin% names(DT)) {
stop("`col = ", col, "` but this is not a column in ",
"`DT`. Ensure `col` is a string matching a name of `DT`.",
if (length(suggestion <- agrep(col, names(DT), value = TRUE)) == 1) {
paste0("\n\n(Did you mean `col = ", suggestion, "`?)")
})
}
.col <- col
} else {
# First see whether the unquoted name refers to an extant object
# e.g.
# DT <- data.table(x = 1:5)
#
# x <- "foo"
# mutate_ntile(DT, x) -> MSG: just use the x column, even though
# 'foo' might have been intended
#
# y <- "foo"
# mutate_ntile(DT, y) -> ERR: neither 'y' not 'foo' a column
#
# y <- "x"
# mutate_ntile(DT, y) -> WARN: y may refer to 'y' or 'x'
if (exists(deparse(substitute(col)),
envir = parent.frame(),
mode = "character")) {
.col <- as.character(substitute(col))
if (.col %chin% names(DT)) {
if (col %in% names(DT)) {
warning("Interpreting `col = ", deparse(substitute(col)), "` as ",
"column `DT[['", .col, "']]`, not ",
"column `DT[['", col, "']]`, despite an extant object ",
.col, ".")
} else {
message("Interpreting `col = ", deparse(substitute(col)), "` as ",
"column `DT[['", .col, "']]`",
", despite an extant object `y`.")
}
} else {
if (col %in% names(DT)) { # WARN
.col <- col
warning("Interpreting `col = ", deparse(substitute(col)), "` as ",
"`col = ", col, "`.")
} else { # Not a name of the table
stop("`col = ", deparse(substitute(col)),
"` but this was not a column of `DT`. ",
"`col` must be a column name of `DT`.")
}
}
} else {
.col <- as.character(substitute(col))
if (.col %notchin% names(DT)) {
stop("`col = ", deparse(substitute(col)),
"` but this was not a column of `DT`. ",
"`col` must be a column name of `DT`.")
}
}
}
if (is.null(new.col)) {
suffix <-
switch(as.character(n),
"3" = "Terciles",
"4" = "Quartile",
"5" = "Quintile",
"6" = "Sextile",
"7" = "Septile",
"8" = "Octile",
"10" = "Decile",
"12" = "DuoDecile",
"16" = "Hexadecile",
"20" = "Vigintile",
"100" = "Percentile",
"1000" = "Permilles",
stop("`n = ", n, "` and new.col is NULL, ",
"but no default column suffix is supported.",
"Supply a column name using `new.col`."))
new.col <- paste0(.col, suffix)
}
if (!overwrite && new.col %chin% names(DT)) {
stop("`DT` already has a column '", new.col, "' ",
"and `overwrite = TRUE`, so stopping, as requested.",
"Either supply a different name to `new.col` or set `overwrite = TRUE` ",
"to allow the new column.")
}
# Needs to be after new.col construction to provide early return
if (n <= 1L) {
if (n == 1L) {
message("`n = 1L` so adding a column of 1s. (This is an unlikely value for ntiles.)")
if (is.data.table(DT)) {
return(DT[, (new.col) := 1L][])
} else {
DT[[new.col]] <- 1L
return(DT)
}
} else {
stop("`n = ", n, "`.",
"`n` must be a single positive whole number.")
}
}
if (not_DT <- !is.data.table(DT)) {
input_class <- class(DT)
if (!is.data.frame(DT)) {
stop("`DT` was a ", class(DT)[1], " but must be a data.frame.")
}
DT <- as.data.table(DT)
}
keyddt <- key(DT)
if (is.null(weights) && # .ntile can't use weights
definitely_sorted(DT, .col, check.na, keyby = keyby)) {
if (is.null(by) && is.null(keyby)) {
DT[, (new.col) := .ntile(.SD[[1L]], n, check.na = check.na),
.SDcols = c(.col)]
}
if (!is.null(by) && !is.null(keyby)) {
stop("`by` is NULL, yet `keyby` is NULL too. ",
"Only one of `by` and `keyby` may be provided.")
}
if (!is.null(by)) {
DT[, (new.col) := .ntile(.SD[[.col]], n, check.na = check.na),
.SDcols = c(.col),
by = c(by)]
}
if (!is.null(keyby)) {
DT[, (new.col) := .ntile(.SD[[.col]], n, check.na = check.na),
.SDcols = c(.col),
keyby = c(keyby)]
return(setkeyv(DT, keyby))
}
} else {
if (!is.null(by) && !is.null(keyby)) {
stop("`by` is NULL, yet `keyby` is NULL too. ",
"Only one of `by` and `keyby` may be provided.")
}
if (is.null(by) && is.null(keyby)) {
DT[, (new.col) := weighted_ntile(.SD[[.col]],
weights = if (!is.null(weights)){
.SD[[weights]]
},
n = n),
.SDcols = c(.col, weights)]
} else if (!is.null(by)) {
DT[, (new.col) := weighted_ntile(.SD[[.col]],
weights = if (!is.null(weights)) {
.SD[[weights]]
},
n = n),
.SDcols = c(.col, weights),
by = c(by)]
} else if (!is.null(keyby)) {
DT[, (new.col) := weighted_ntile(.SD[[.col]],
weights = if (!is.null(weights)) .SD[[weights]],
n = n),
.SDcols = c(.col, weights),
keyby = c(keyby)]
return(setkeyv(DT, keyby)[])
}
}
if (not_DT) {
setDF(DT)
if (!identical(input_class, "data.frame")) {
class(DT) <- input_class
}
}
DT[]
}
.ntile <- function(x, n, check.sorted = FALSE, check.na = FALSE) {
if (check.sorted && is.unsorted(x)) {
stop("`x` must be already sorted.")
}
if (check.na && anyNA(x)) {
stop("`anyNA(x)` is TRUE.")
}
# n * seq_along(x) overflows unless:
if (n < .Machine$integer.max / length(x)) {
n <- as.integer(n)
as.integer(n * (seq_along(x) - 1L) %/% length(x) + 1L)
}
lengthx0 <- length(x) - 1L # 0:lengthx0 faster than seq_along(x) - 1
o <- {as.double(n) * {0:lengthx0}} / length(x) + 1
if (n < .Machine$integer.max) {
return(as.integer(o))
} else {
return(o)
}
}
definitely_sorted <- function(ddt, nom, check_na, keyddt = key(ddt), keyby = NULL) {
# TRUE if and only if column 'nom' in ddt is definitely sorted (ascending)
# FALSE does not imply unsortedness: designed to support short circuits
if (haskey(ddt)) {
if (length(keyddt) == 1L) {
return(keyddt == nom)
}
if (keyddt[1L] == nom) {
return(TRUE)
}
if (length(keyddt) == length(keyby) + 1L) {
return(tail(keyddt, 1L) == nom)
}
}
if (check_na && anyNA(.subset2(ddt, nom))) {
stop("`check.na = TRUE` yet `DT[['", nom, "']]` ",
"so stopping, as requested.")
}
FALSE
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.