#' Apply a function (or functions) across multiple columns
#'
#' @description
#' `across()` makes it easy to apply the same transformation to multiple
#' columns, allowing you to use [select()] semantics inside in "data-masking"
#' functions like [summarise()] and [mutate()]. See `vignette("colwise")` for
#' more details.
#'
#' `if_any()` and `if_all()` apply the same
#' predicate function to a selection of columns and combine the
#' results into a single logical vector: `if_any()` is `TRUE` when
#' the predicate is `TRUE` for *any* of the selected columns, `if_all()`
#' is `TRUE` when the predicate is `TRUE` for *all* selected columns.
#'
#' `across()` supersedes the family of "scoped variants" like
#' `summarise_at()`, `summarise_if()`, and `summarise_all()`.
#'
#' @param cols,.cols <[`tidy-select`][dplyr_tidy_select]> Columns to transform.
#' Because `across()` is used within functions like `summarise()` and
#' `mutate()`, you can't select or compute upon grouping variables.
#' @param .fns Functions to apply to each of the selected columns.
#' Possible values are:
#'
#' - `NULL`, to returns the columns untransformed.
#' - A function, e.g. `mean`.
#' - A purrr-style lambda, e.g. `~ mean(.x, na.rm = TRUE)`
#' - A list of functions/lambdas, e.g.
#' `list(mean = mean, n_miss = ~ sum(is.na(.x))`
#'
#' Within these functions you can use [cur_column()] and [cur_group()]
#' to access the current column and grouping keys respectively.
#' @param ... Additional arguments for the function calls in `.fns`.
#' @param .names A glue specification that describes how to name the output
#' columns. This can use `{.col}` to stand for the selected column name, and
#' `{.fn}` to stand for the name of the function being applied. The default
#' (`NULL`) is equivalent to `"{.col}"` for the single function case and
#' `"{.col}_{.fn}"` for the case where a list is used for `.fns`.
#'
#' @returns
#' `across()` returns a tibble with one column for each column in `.cols` and each function in `.fns`.
#'
#' `if_any()` and `if_all()` return a logical vector.
#'
#' @section Timing of evaluation:
#' R code in dplyr verbs is generally evaluated once per group.
#' Inside `across()` however, code is evaluated once for each
#' combination of columns and groups. If the evaluation timing is
#' important, for example if you're generating random variables, think
#' about when it should happen and place your code in consequence.
#'
#' ```{r}
#' gdf <-
#' tibble(g = c(1, 1, 2, 3), v1 = 10:13, v2 = 20:23) %>%
#' group_by(g)
#'
#' set.seed(1)
#'
#' # Outside: 1 normal variate
#' n <- rnorm(1)
#' gdf %>% mutate(across(v1:v2, ~ .x + n))
#'
#' # Inside a verb: 3 normal variates (ngroup)
#' gdf %>% mutate(n = rnorm(1), across(v1:v2, ~ .x + n))
#'
#' # Inside `across()`: 6 normal variates (ncol * ngroup)
#' gdf %>% mutate(across(v1:v2, ~ .x + rnorm(1)))
#' ````
#'
#' @examples
#' # across() -----------------------------------------------------------------
#' # Different ways to select the same set of columns
#' # See <https://tidyselect.r-lib.org/articles/syntax.html> for details
#' iris %>%
#' as_tibble() %>%
#' mutate(across(c(Sepal.Length, Sepal.Width), round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(c(1, 2), round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(1:Sepal.Width, round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round))
#'
#' # A purrr-style formula
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE)))
#'
#' # A named list of functions
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd)))
#'
#' # Use the .names argument to control the output names
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), mean, .names = "mean_{.col}"))
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}.{.fn}"))
#'
#' # When the list is not named, .fn is replaced by the function's position
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{.col}.fn{.fn}"))
#'
#' # if_any() and if_all() ----------------------------------------------------
#' iris %>%
#' filter(if_any(ends_with("Width"), ~ . > 4))
#' iris %>%
#' filter(if_all(ends_with("Width"), ~ . > 2))
#'
#' @export
#' @seealso [c_across()] for a function that returns a vector
across <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
setup <- across_setup(
{{ .cols }},
fns = .fns,
names = .names,
.caller_env = caller_env(),
inline = FALSE
)
vars <- setup$vars
if (length(vars) == 0L) {
return(new_tibble(list(), nrow = 1L))
}
fns <- setup$fns
names <- setup$names
mask <- peek_mask()
data <- mask$current_cols(vars)
if (is.null(fns)) {
nrow <- length(mask$current_rows())
data <- new_data_frame(data, n = nrow, class = c("tbl_df", "tbl"))
if (is.null(names)) {
return(data)
} else {
return(set_names(data, names))
}
}
n_cols <- length(data)
n_fns <- length(fns)
seq_n_cols <- seq_len(n_cols)
seq_fns <- seq_len(n_fns)
k <- 1L
out <- vector("list", n_cols * n_fns)
# Reset `cur_column()` info on exit
old_var <- context_peek_bare("column")
on.exit(context_poke("column", old_var), add = TRUE)
# Loop in such an order that all functions are applied
# to a single column before moving on to the next column
for (i in seq_n_cols) {
var <- vars[[i]]
col <- data[[i]]
context_poke("column", var)
for (j in seq_fns) {
fn <- fns[[j]]
out[[k]] <- fn(col, ...)
k <- k + 1L
}
}
size <- vec_size_common(!!!out)
out <- vec_recycle_common(!!!out, .size = size)
names(out) <- names
new_data_frame(out, n = size, class = c("tbl_df", "tbl"))
}
#' @rdname across
#' @export
if_any <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
context_local("across_if_fn", "if_any")
if_across(`|`, across({{ .cols }}, .fns, ..., .names = .names))
}
#' @rdname across
#' @export
if_all <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
context_local("across_if_fn", "if_all")
if_across(`&`, across({{ .cols }}, .fns, ..., .names = .names))
}
if_across <- function(op, df) {
n <- nrow(df)
if (!length(df)) {
return(TRUE)
}
combine <- function(x, y) {
if (is_null(x)) {
y
} else {
op(x, y)
}
}
reduce(df, combine, .init = NULL)
}
#' Combine values from multiple columns
#'
#' @description
#' `c_across()` is designed to work with [rowwise()] to make it easy to
#' perform row-wise aggregations. It has two differences from `c()`:
#'
#' * It uses tidy select semantics so you can easily select multiple variables.
#' See `vignette("rowwise")` for more details.
#'
#' * It uses [vctrs::vec_c()] in order to give safer outputs.
#'
#' @inheritParams across
#' @seealso [across()] for a function that returns a tibble.
#' @export
#' @examples
#' df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
#' df %>%
#' rowwise() %>%
#' mutate(
#' sum = sum(c_across(w:z)),
#' sd = sd(c_across(w:z))
#' )
c_across <- function(cols = everything()) {
cols <- enquo(cols)
vars <- c_across_setup(!!cols)
mask <- peek_mask("c_across()")
cols <- mask$current_cols(vars)
vec_c(!!!cols, .name_spec = zap())
}
across_glue_mask <- function(.col, .fn, .caller_env) {
glue_mask <- env(.caller_env, .col = .col, .fn = .fn)
# TODO: we can make these bindings louder later
env_bind_active(
glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn
)
glue_mask
}
across_setup <- function(cols,
fns,
names,
.caller_env,
mask = peek_mask("across()"),
inline = FALSE) {
cols <- enquo(cols)
# `across()` is evaluated in a data mask so we need to remove the
# mask layer from the quosure environment (#5460)
cols <- quo_set_env(cols, data_mask_top(quo_get_env(cols), recursive = FALSE, inherit = FALSE))
# TODO: call eval_select with a calling handler to intercept
# classed error, after https://github.com/r-lib/tidyselect/issues/233
if (is.null(fns) && quo_is_call(cols, "~")) {
if_fn <- context_peek_bare("across_if_fn") %||% "across"
abort(c(
"Predicate used in lieu of column selection.",
i = glue("You most likely meant: `{if_fn}(everything(), {as_label(cols)})`."),
i = "The first argument `.cols` selects a set of columns.",
i = "The second argument `.fns` operates on each selected columns."
))
}
across_cols <- mask$across_cols()
vars <- tidyselect::eval_select(cols, data = across_cols)
names_vars <- names(vars)
vars <- names(across_cols)[vars]
if (is.null(fns)) {
if (!is.null(names)) {
glue_mask <- across_glue_mask(.caller_env, .col = names_vars, .fn = "1")
names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique")
} else {
names <- names_vars
}
value <- list(vars = vars, fns = fns, names = names)
return(value)
}
# apply `.names` smart default
if (is.function(fns) || is_formula(fns)) {
names <- names %||% "{.col}"
fns <- list("1" = fns)
} else {
names <- names %||% "{.col}_{.fn}"
}
if (!is.list(fns)) {
abort(c("Problem with `across()` input `.fns`.",
i = "`.fns` must be NULL, a function, a formula, or a list of functions/formulas."
))
}
# make sure fns has names, use number to replace unnamed
if (is.null(names(fns))) {
names_fns <- seq_along(fns)
} else {
names_fns <- names(fns)
empties <- which(names_fns == "")
if (length(empties)) {
names_fns[empties] <- empties
}
}
glue_mask <- glue_mask <- across_glue_mask(.caller_env,
.col = rep(names_vars, each = length(fns)),
.fn = rep(names_fns , length(vars))
)
names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique")
if (!inline) {
fns <- map(fns, as_function)
}
list(vars = vars, fns = fns, names = names)
}
# FIXME: This pattern should be encapsulated by rlang
data_mask_top <- function(env, recursive = FALSE, inherit = FALSE) {
while (env_has(env, ".__tidyeval_data_mask__.", inherit = inherit)) {
env <- env_parent(env_get(env, ".top_env", inherit = inherit))
if (!recursive) {
return(env)
}
}
env
}
c_across_setup <- function(cols) {
mask <- peek_mask("c_across()")
cols <- enquo(cols)
across_cols <- mask$across_cols()
vars <- tidyselect::eval_select(expr(!!cols), across_cols)
value <- names(vars)
value
}
new_dplyr_quosure <- function(quo, ...) {
attr(quo, "dplyr:::data") <- list2(...)
quo
}
dplyr_quosures <- function(...) {
quosures <- enquos(..., .ignore_empty = "all")
names_given <- names2(quosures)
names_auto <- names(enquos(..., .named = TRUE, .ignore_empty = "all"))
for (i in seq_along(quosures)) {
quosures[[i]] <- new_dplyr_quosure(quosures[[i]],
name_given = names_given[i],
name_auto = names_auto[i],
is_named = names_given[i] != "",
index = i
)
}
quosures
}
# When mutate() or summarise() have an unnamed call to across() at the top level, e.g.
# summarise(across(<...>)) or mutate(across(<...>))
#
# a call to top_across(<...>) is evaluated instead.
# top_across() returns a flattened list of expressions along with some
# information about the "current column" for each expression
# in the "columns" attribute:
#
# For example with: summarise(across(c(x, y), mean, .names = "mean_{.col}")) top_across() will return
# something like:
#
# structure(
# list(mean_x = expr(mean(x)), mean_y = expr(mean(y)))
# columns = c("x", "y")
# )
# Technically this always returns a single quosure but we wrap it in a
# list to follow the pattern in `expand_across()`
expand_if_across <- function(quo) {
quo_data <- attr(quo, "dplyr:::data")
if (!quo_is_call(quo, c("if_any", "if_all"), ns = c("", "dplyr"))) {
return(list(quo))
}
call <- match.call(
definition = if_any,
call = quo_get_expr(quo),
expand.dots = FALSE,
envir = quo_get_env(quo)
)
if (!is_null(call$...)) {
return(list(quo))
}
if (is_call(call, "if_any")) {
op <- "|"
if_fn <- "if_any"
} else {
op <- "&"
if_fn <- "if_all"
}
context_local("across_if_fn", if_fn)
call[[1]] <- quote(across)
quos <- expand_across(quo_set_expr(quo, call))
# Select all rows if there are no inputs
if (!length(quos)) {
return(list(quo(TRUE)))
}
combine <- function(x, y) {
if (is_null(x)) {
y
} else {
call(op, x, y)
}
}
expr <- reduce(quos, combine, .init = NULL)
# Use `as_quosure()` instead of `new_quosure()` to avoid rewrapping
# quosure in case of single input
list(as_quosure(expr, env = baseenv()))
}
expand_across <- function(quo) {
quo_data <- attr(quo, "dplyr:::data")
if (!quo_is_call(quo, "across", ns = c("", "dplyr")) || quo_data$is_named) {
return(list(quo))
}
# Expand dots in lexical env
env <- quo_get_env(quo)
expr <- match.call(
definition = across,
call = quo_get_expr(quo),
expand.dots = FALSE,
envir = env
)
# Abort expansion if there are any expression supplied because dots
# must be evaluated once per group in the data mask. Expanding the
# `across()` call would lead to either `n_group * n_col` evaluations
# if dots are delayed or only 1 evaluation if they are eagerly
# evaluated.
if (!is_null(expr$...)) {
return(list(quo))
}
dplyr_mask <- peek_mask()
mask <- dplyr_mask$get_rlang_mask()
# Differentiate between missing and null (`match.call()` doesn't
# expand default argument)
if (".cols" %in% names(expr)) {
cols <- expr$.cols
} else {
cols <- quote(everything())
}
cols <- as_quosure(cols, env)
setup <- across_setup(
!!cols,
fns = eval_tidy(expr$.fns, mask, env = env),
names = eval_tidy(expr$.names, mask, env = env),
.caller_env = dplyr_mask$get_caller_env(),
inline = TRUE
)
vars <- setup$vars
# Empty expansion
if (length(vars) == 0L) {
return(list())
}
fns <- setup$fns
names <- setup$names %||% vars
# No functions, so just return a list of symbols
if (is.null(fns)) {
expressions <- pmap(list(vars, names, seq_along(vars)), function(var, name, k) {
quo <- new_quosure(sym(var), empty_env())
quo <- new_dplyr_quosure(
quo,
name_given = name,
name_auto = name,
is_named = TRUE,
index = c(quo_data$index, k),
column = var
)
})
names(expressions) <- names
return(expressions)
}
n_vars <- length(vars)
n_fns <- length(fns)
seq_vars <- seq_len(n_vars)
seq_fns <- seq_len(n_fns)
expressions <- vector(mode = "list", n_vars * n_fns)
columns <- character(n_vars * n_fns)
k <- 1L
for (i in seq_vars) {
var <- vars[[i]]
for (j in seq_fns) {
fn_call <- as_across_fn_call(fns[[j]], var, env, mask)
name <- names[[k]]
expressions[[k]] <- new_dplyr_quosure(
fn_call,
name_given = name,
name_auto = name,
is_named = TRUE,
index = c(quo_data$index, k),
column = var
)
k <- k + 1L
}
}
names(expressions) <- names
expressions
}
# TODO: Take unevaluated `.fns` and inline calls to `function`. This
# will enable support for R 4.1 lambdas. Note that unlike formulas,
# only unevaluated `function` calls can be inlined. This will have
# performance implications for lists of lambdas where formulas will
# have better performance. It is possible that we will be able to
# inline evaluated functions with strictness annotations.
as_across_fn_call <- function(fn, var, env, mask) {
if (is_inlinable_formula(fn, mask)) {
# Don't need to worry about arguments passed through `...`
# because we cancel expansion in that case
expr <- f_rhs(fn)
expr <- expr_substitute(expr, quote(.), sym(var))
expr <- expr_substitute(expr, quote(.x), sym(var))
# If the formula environment is the data mask it means the formula
# was unevaluated, and in that case we can use the original
# quosure environment. Otherwise, use the formula environment
# which might include local data that is not reachable from the
# data mask.
f_env <- f_env(fn)
if (identical(f_env, mask)) {
f_env <- env
}
new_quosure(expr, f_env)
} else {
fn_call <- call2(as_function(fn), sym(var))
new_quosure(fn_call, env)
}
}
# Don't inline formulas that don't inherit directly from the mask
# because of a tidyeval bug/limitation that causes an infinite loop.
# If the formula env is the data mask, we replace it with the original
# quosure environment (which is maskable) later on to work around that
# bug.
is_inlinable_formula <- function(x, mask) {
if (is_formula(x, lhs = FALSE, scoped = TRUE)) {
identical(x, mask) || !env_inherits(f_env(x), mask)
} else {
FALSE
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.