#' Nest rows into a list-column of data frames
#'
#' @description
#' Nesting creates a list-column of data frames; unnesting flattens it back out
#' into regular columns. Nesting is implicitly a summarising operation: you
#' get one row for each group defined by the non-nested columns. This is useful
#' in conjunction with other summaries that work with whole datasets, most
#' notably models.
#'
#' Learn more in `vignette("nest")`.
#'
#' @details
#' If neither `...` nor `.by` are supplied, `nest()` will nest all variables,
#' and will use the column name supplied through `.key`.
#'
#' @section New syntax:
#' tidyr 1.0.0 introduced a new syntax for `nest()` and `unnest()` that's
#' designed to be more similar to other functions. Converting to the new syntax
#' should be straightforward (guided by the message you'll receive) but if
#' you just need to run an old analysis, you can easily revert to the previous
#' behaviour using [nest_legacy()] and [unnest_legacy()] as follows:
#'
#' ```
#' library(tidyr)
#' nest <- nest_legacy
#' unnest <- unnest_legacy
#' ```
#'
#' @section Grouped data frames:
#' `df %>% nest(data = c(x, y))` specifies the columns to be nested; i.e. the
#' columns that will appear in the inner data frame. `df %>% nest(.by = c(x,
#' y))` specifies the columns to nest _by_; i.e. the columns that will remain in
#' the outer data frame. An alternative way to achieve the latter is to `nest()`
#' a grouped data frame created by [dplyr::group_by()]. The grouping variables
#' remain in the outer data frame and the others are nested. The result
#' preserves the grouping of the input.
#'
#' Variables supplied to `nest()` will override grouping variables so that
#' `df %>% group_by(x, y) %>% nest(data = !z)` will be equivalent to
#' `df %>% nest(data = !z)`.
#'
#' You can't supply `.by` with a grouped data frame, as the groups already
#' represent what you are nesting by.
#'
#' @param .data A data frame.
#' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest; these will
#' appear in the inner data frames.
#'
#' Specified using name-variable pairs of the form
#' `new_col = c(col1, col2, col3)`. The right hand side can be any valid
#' tidyselect expression.
#'
#' If not supplied, then `...` is derived as all columns _not_ selected by
#' `.by`, and will use the column name from `.key`.
#'
#' `r lifecycle::badge("deprecated")`:
#' previously you could write `df %>% nest(x, y, z)`.
#' Convert to `df %>% nest(data = c(x, y, z))`.
#' @param .by <[`tidy-select`][tidyr_tidy_select]> Columns to nest _by_; these
#' will remain in the outer data frame.
#'
#' `.by` can be used in place of or in conjunction with columns supplied
#' through `...`.
#'
#' If not supplied, then `.by` is derived as all columns _not_ selected by
#' `...`.
#' @param .key The name of the resulting nested column. Only applicable when
#' `...` isn't specified, i.e. in the case of `df %>% nest(.by = x)`.
#'
#' If `NULL`, then `"data"` will be used by default.
#' @param .names_sep If `NULL`, the default, the inner names will come from
#' the former outer names. If a string, the new inner names will use the
#' outer names with `names_sep` automatically stripped. This makes
#' `names_sep` roughly symmetric between nesting and unnesting.
#' @export
#' @examples
#' df <- tibble(x = c(1, 1, 1, 2, 2, 3), y = 1:6, z = 6:1)
#'
#' # Specify variables to nest using name-variable pairs.
#' # Note that we get one row of output for each unique combination of
#' # non-nested variables.
#' df %>% nest(data = c(y, z))
#'
#' # Specify variables to nest by (rather than variables to nest) using `.by`
#' df %>% nest(.by = x)
#'
#' # In this case, since `...` isn't used you can specify the resulting column
#' # name with `.key`
#' df %>% nest(.by = x, .key = "cols")
#'
#' # Use tidyselect syntax and helpers, just like in `dplyr::select()`
#' df %>% nest(data = any_of(c("y", "z")))
#'
#' # `...` and `.by` can be used together to drop columns you no longer need,
#' # or to include the columns you are nesting by in the inner data frame too.
#' # This drops `z`:
#' df %>% nest(data = y, .by = x)
#' # This includes `x` in the inner data frame:
#' df %>% nest(data = everything(), .by = x)
#'
#' # Multiple nesting structures can be specified at once
#' iris %>%
#' nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
#' iris %>%
#' nest(width = contains("Width"), length = contains("Length"))
#'
#' # Nesting a grouped data frame nests all variables apart from the group vars
#' fish_encounters %>%
#' dplyr::group_by(fish) %>%
#' nest()
#'
#' # That is similar to `nest(.by = )`, except here the result isn't grouped
#' fish_encounters %>%
#' nest(.by = fish)
#'
#' # Nesting is often useful for creating per group models
#' mtcars %>%
#' nest(.by = cyl) %>%
#' dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df)))
nest <- function(.data,
...,
.by = NULL,
.key = NULL,
.names_sep = NULL) {
cols <- enquos(...)
empty <- names2(cols) == ""
if (any(empty)) {
cols_good <- cols[!empty]
cols_bad <- cols[empty]
.key <- check_key(.key)
if (length(cols_bad) == 1L) {
cols_bad <- cols_bad[[1]]
cols_fixed_expr <- expr(!!cols_bad)
} else {
cols_fixed_expr <- expr(c(!!!cols_bad))
}
cols_fixed_label <- as_label(cols_fixed_expr)
cols_fixed <- quos(!!.key := !!cols_fixed_expr)
cols <- c(cols_good, cols_fixed)
lifecycle::deprecate_warn(
when = "1.0.0",
what = I("Supplying `...` without names"),
details = c(
i = "Please specify a name for each selection.",
i = cli::format_inline("Did you want `{(.key)} = {cols_fixed_label}`?")
),
always = TRUE
)
return(nest(.data, !!!cols, .by = {{ .by }}))
}
UseMethod("nest")
}
#' @export
nest.data.frame <- function(.data,
...,
.by = NULL,
.key = NULL,
.names_sep = NULL) {
# The data frame print handles nested data frames poorly, so we want to
# convert data frames (but not subclasses) to tibbles
if (identical(class(.data), "data.frame")) {
.data <- as_tibble(.data)
}
nest.tbl_df(
.data,
...,
.by = {{ .by }},
.key = .key,
.names_sep = .names_sep
)
}
#' @export
nest.tbl_df <- function(.data,
...,
.by = NULL,
.key = NULL,
.names_sep = NULL) {
error_call <- current_env()
info <- nest_info(.data, ..., .by = {{ .by }}, .key = .key)
cols <- info$cols
inner <- info$inner
outer <- info$outer
inner <- .data[inner]
inner <- pack(inner, !!!cols, .names_sep = .names_sep, .error_call = error_call)
out <- .data[outer]
out <- vec_cbind(out, inner, .name_repair = "check_unique", .error_call = error_call)
out <- reconstruct_tibble(.data, out)
out <- chop(out, cols = all_of(names(cols)), error_call = error_call)
# `nest()` currently doesn't return list-of columns
for (name in names(cols)) {
out[[name]] <- tidyr_new_list(out[[name]])
}
out
}
#' @export
nest.grouped_df <- function(.data,
...,
.by = NULL,
.key = NULL,
.names_sep = NULL) {
by <- enquo(.by)
if (!quo_is_null(by)) {
cli::cli_abort("Can't supply {.arg .by} when {.arg .data} is a grouped data frame.")
}
if (missing(...)) {
.key <- check_key(.key)
cols <- setdiff(names(.data), dplyr::group_vars(.data))
nest.tbl_df(.data, !!.key := all_of(cols), .names_sep = .names_sep)
} else {
nest.tbl_df(.data, ..., .key = .key, .names_sep = .names_sep)
}
}
nest_info <- function(.data,
...,
.by = NULL,
.key = NULL,
.error_call = caller_env()) {
by <- enquo(.by)
cols <- enquos(...)
n_cols <- length(cols)
key <- check_key(.key, error_call = .error_call)
if (n_cols != 0L && !is_default_key(.key)) {
warn_unused_key(error_call = .error_call)
}
cols <- with_indexed_errors(
map(cols, function(col) {
names(tidyselect::eval_select(
expr = col,
data = .data,
allow_rename = FALSE,
error_call = NULL
))
}),
message = function(cnd) {
cli::format_inline("In expression named {.arg {cnd$name}}:")
},
.error_call = .error_call
)
names <- names(.data)
outer <- names(tidyselect::eval_select(
expr = by,
data = .data,
allow_rename = FALSE,
error_call = .error_call
))
inner <- list_unchop(cols, ptype = character(), name_spec = zap())
inner <- vec_unique(inner)
if (n_cols == 0L) {
# Derive `inner` names from `.by`
inner <- setdiff(names, outer)
cols <- list2(!!key := inner)
}
if (quo_is_null(by)) {
# Derive `outer` names from `...`
outer <- setdiff(names, inner)
}
# Regenerate quosures for `pack()`
cols <- map(cols, function(col) {
quo(all_of(!!col))
})
cols <- new_quosures(cols)
list(
cols = cols,
inner = inner,
outer = outer
)
}
warn_unused_key <- function(error_call = caller_env()) {
message <- c(
"Can't supply both {.arg .key} and {.arg ...}.",
i = "{.arg .key} will be ignored."
)
cli::cli_warn(message, call = error_call)
}
check_key <- function(key, error_call = caller_env()) {
if (is_default_key(key)) {
"data"
} else {
check_string(key, allow_empty = FALSE, arg = ".key", call = error_call)
key
}
}
is_default_key <- function(key) {
if (identical(maybe_missing(key), deprecated())) {
# Temporary support for S3 method authors that set `.key = deprecated()`.
# Remove this entire helper all methods have been updated.
key <- NULL
}
is.null(key)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.