#' Pivot data from long to wide
#'
#' @description
#' `pivot_wider()` "widens" data, increasing the number of columns and
#' decreasing the number of rows. The inverse transformation is
#' `pivot_longer()`.
#' Learn more in `vignette("pivot", "tidyr")`.
#'
#' Note that `pivot_wider()` is not and cannot be lazy because we need to look
#' at the data to figure out what the new column names will be.
#' If you have a long running query you have two options:
#'
#' * (temporarily) store the result of the query via `compute()`.
#' * Create a spec before and use `dbplyr_pivot_wider_spec()` - dbplyr's version
#' of `tidyr::pivot_wider_spec()`. Note that this function is only a temporary
#' solution until `pivot_wider_spec()` becomes a generic. It will then be
#' removed soon afterwards.
#'
#' @details
#' The big difference to `pivot_wider()` for local data frames is that
#' `values_fn` must not be `NULL`. By default it is `max()` which yields
#' the same results as for local data frames if the combination of `id_cols`
#' and `value` column uniquely identify an observation.
#' Mind that you also do not get a warning if an observation is not uniquely
#' identified.
#'
#' The translation to SQL code basically works as follows:
#'
#' 1. Get unique keys in `names_from` column.
#' 2. For each key value generate an expression of the form:
#' ```sql
#' value_fn(
#' CASE WHEN (`names from column` == `key value`)
#' THEN (`value column`)
#' END
#' ) AS `output column`
#' ```
#' 3. Group data by id columns.
#' 4. Summarise the grouped data with the expressions from step 2.
#'
#' @param data A lazy data frame backed by a database query.
#' @param id_cols A set of columns that uniquely identifies each observation.
#' @param id_expand Unused; included for compatibility with the generic.
#' @param names_from,values_from A pair of
#' arguments describing which column (or columns) to get the name of the
#' output column (`names_from`), and which column (or columns) to get the
#' cell values from (`values_from`).
#'
#' If `values_from` contains multiple values, the value will be added to the
#' front of the output column.
#' @param names_prefix String added to the start of every variable name.
#' @param names_sep If `names_from` or `values_from` contains multiple
#' variables, this will be used to join their values together into a single
#' string to use as a column name.
#' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply
#' a glue specification that uses the `names_from` columns (and special
#' `.value`) to create custom column names.
#' @param names_sort Should the column names be sorted? If `FALSE`, the default,
#' column names are ordered by first appearance.
#' @param names_vary When `names_from` identifies a column (or columns) with
#' multiple unique values, and multiple `values_from` columns are provided,
#' in what order should the resulting column names be combined?
#'
#' - `"fastest"` varies `names_from` values fastest, resulting in a column
#' naming scheme of the form: `value1_name1, value1_name2, value2_name1,
#' value2_name2`. This is the default.
#'
#' - `"slowest"` varies `names_from` values slowest, resulting in a column
#' naming scheme of the form: `value1_name1, value2_name1, value1_name2,
#' value2_name2`.
#' @param names_repair What happens if the output has invalid column names?
#' @param names_expand Should the values in the `names_from` columns be expanded
#' by [expand()] before pivoting? This results in more columns, the output
#' will contain column names corresponding to a complete expansion of all
#' possible values in `names_from`. Additionally, the column names will be
#' sorted, identical to what `names_sort` would produce.
#' @param values_fill Optionally, a (scalar) value that specifies what each
#' `value` should be filled in with when missing.
#' @param values_fn A function, the default is `max()`, applied to the `value`
#' in each cell in the output. In contrast to local data frames it must not be
#' `NULL`.
#' @param unused_fn Optionally, a function applied to summarize the values from
#' the unused columns (i.e. columns not identified by `id_cols`,
#' `names_from`, or `values_from`).
#'
#' The default drops all unused columns from the result.
#'
#' This can be a named list if you want to apply different aggregations
#' to different unused columns.
#'
#' `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
#' all unspecified columns will be considered `id_cols`.
#'
#' This is similar to grouping by the `id_cols` then summarizing the
#' unused columns using `unused_fn`.
#' @param ... Unused; included for compatibility with generic.
#'
#' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
#' memdb_frame(
#' id = 1,
#' key = c("x", "y"),
#' value = 1:2
#' ) %>%
#' tidyr::pivot_wider(
#' id_cols = id,
#' names_from = key,
#' values_from = value
#' )
#' @exportS3Method tidyr::pivot_wider
pivot_wider.tbl_lazy <- function(data,
...,
id_cols = NULL,
id_expand = FALSE,
names_from = name,
names_prefix = "",
names_sep = "_",
names_glue = NULL,
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE,
names_repair = "check_unique",
values_from = value,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL) {
rlang::check_dots_empty()
check_unsupported_arg(id_expand, FALSE)
names_from <- enquo(names_from)
values_from <- enquo(values_from)
spec <- dbplyr_build_wider_spec(data,
names_from = !!names_from,
values_from = !!values_from,
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand,
error_call = current_env()
)
id_cols <- build_wider_id_cols_expr(
data = data,
id_cols = {{id_cols}},
names_from = !!names_from,
values_from = !!values_from
)
dbplyr_pivot_wider_spec(
data = data,
spec = spec,
...,
names_repair = names_repair,
id_cols = !!id_cols,
id_expand = FALSE,
values_fill = values_fill,
values_fn = values_fn,
unused_fn = unused_fn,
error_call = current_env()
)
}
dbplyr_build_wider_spec <- function(data,
names_from = name,
values_from = value,
names_prefix = "",
names_sep = "_",
names_glue = NULL,
names_sort = FALSE,
names_vary = "fastest",
names_expand = FALSE,
error_call = current_env()) {
if (!inherits(data, "tbl_sql")) {
cli_abort(c(
"{.fun dbplyr_build_wider_spec} doesn't work with local lazy tibbles.",
i = "Use {.fun memdb_frame} together with {.fun show_query} to see the SQL code."
))
}
# prepare a minimal local tibble we can pass to `tidyr::build_wider_spec`
# 1. create a tibble with unique values in the `names_from` column
# row_ids <- vec_unique(data[names_from])
names_from <- tidyselect::eval_select(enquo(names_from), data) %>% names()
if (is_empty(names_from)) {
cli_abort("{.arg names_from} must select at least one column.")
}
distinct_data <- collect(distinct(data, !!!syms(names_from)))
# 2. add `values_from` column
values_from <- tidyselect::eval_select(enquo(values_from), data) %>% names()
if (is_empty(values_from)) {
cli_abort("{.arg values_from} must select at least one column.")
}
dummy_data <- vctrs::vec_cbind(
distinct_data,
!!!rlang::rep_named(values_from, list(TRUE)),
.name_repair = "check_unique"
)
tidyr::build_wider_spec(dummy_data,
names_from = !!enquo(names_from),
values_from = !!enquo(values_from),
names_prefix = names_prefix,
names_sep = names_sep,
names_glue = names_glue,
names_sort = names_sort,
names_vary = names_vary,
names_expand = names_expand,
error_call = error_call
)
}
#' @inheritParams rlang::args_error_context
#' @inheritParams tidyr::pivot_wider_spec
#' @export
#' @rdname pivot_wider.tbl_lazy
dbplyr_pivot_wider_spec <- function(data,
spec,
...,
names_repair = "check_unique",
id_cols = NULL,
id_expand = FALSE,
values_fill = NULL,
values_fn = ~ max(.x, na.rm = TRUE),
unused_fn = NULL,
error_call = current_env()) {
check_unsupported_arg(id_expand, FALSE)
spec <- tidyr::check_pivot_spec(spec)
names_from_cols <- names(spec)[-(1:2)]
values_from_cols <- vctrs::vec_unique(spec$.value)
id_cols <- select_wider_id_cols(
data = data,
id_cols = {{ id_cols }},
names_from_cols = names_from_cols,
values_from_cols = values_from_cols,
error_call = error_call
)
values_fn <- check_list_of_functions(values_fn, values_from_cols, call = error_call)
unused_cols <- setdiff(colnames(data), c(id_cols, names_from_cols, values_from_cols))
unused_fn <- check_list_of_functions(unused_fn, unused_cols, call = error_call)
unused_cols <- names(unused_fn)
if (is.null(values_fill)) {
values_fill <- list()
} else if (is_scalar(values_fill)) {
values_fill <- rep_named(values_from_cols, list(values_fill))
} else if (!vctrs::vec_is_list(values_fill)) {
cli::cli_abort(
"{.arg values_fill} must be {.code NULL}, a scalar, or a named list, not {.obj_type_friendly {values_fill}}.",
call = error_call
)
}
values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]
missing_values <- setdiff(values_from_cols, names(values_fn))
if (!is_empty(missing_values)) {
cli_abort("{.arg values_fn} must specify a function for each col in {.arg values_from}")
}
n_unused_fn <- length(unused_fn)
unused_col_expr <- vector("list", length = n_unused_fn)
names(unused_col_expr) <- unused_cols
for (i in seq_len(n_unused_fn)) {
unused_col <- unused_cols[[i]]
unused_fn_i <- unused_fn[[i]]
unused_col_expr[[i]] <- resolve_fun(unused_fn_i, sym(unused_col), call = error_call)
}
spec_idx <- set_names(vctrs::vec_seq_along(spec), spec$.name)
pivot_exprs <- with_indexed_errors(
purrr::map(
spec_idx,
~ build_pivot_wider_exprs(.x, spec, values_fill, values_fn, call = NULL)
),
message = function(cnd) {
col <- spec[[".value"]][cnd$location]
cli::format_inline("Can't pivot column {.field {col}}:")
},
.error_call = error_call
)
non_id_cols <- c(names_from_cols, values_from_cols)
key_vars <- setdiff(id_cols, non_id_cols)
data_grouped <- group_by(data, !!!syms(key_vars), .add = TRUE)
group_names <- group_vars(data_grouped)
out_nms <- c(group_names, names(pivot_exprs))
out_nms_repaired <- vctrs::vec_as_names(out_nms, repair = names_repair)
if (!is_empty(group_names)) {
out_nms_repaired <- out_nms_repaired[-(1:length(group_names))]
}
pivot_exprs <- set_names(pivot_exprs, out_nms_repaired)
data_grouped %>%
summarise(
!!!pivot_exprs,
!!!unused_col_expr,
.groups = "drop"
) %>%
group_by(!!!syms(group_vars(data)))
}
utils::globalVariables(c("name", "value"))
build_wider_id_cols_expr <- function(data,
id_cols = NULL,
names_from = name,
values_from = value,
call = caller_env()) {
# COPIED FROM tidyr
names_from <- tidyselect::eval_select(
enquo(names_from),
data,
allow_rename = FALSE,
error_call = error_call
)
values_from <- tidyselect::eval_select(
enquo(values_from),
data,
allow_rename = FALSE,
error_call = error_call
)
out <- select_wider_id_cols(
data = data,
id_cols = {{ id_cols }},
names_from_cols = names(names_from),
values_from_cols = names(values_from),
error_call = error_call
)
expr(c(!!!out))
}
select_wider_id_cols <- function(data,
id_cols = NULL,
names_from_cols = character(),
values_from_cols = character(),
error_call = caller_env()) {
# COPIED FROM tidyr
id_cols <- enquo(id_cols)
sim_data <- tidyselect_data_proxy(data)
# Remove known non-id-cols so they are never selected
sim_data <- sim_data[setdiff(names(sim_data), c(names_from_cols, values_from_cols))]
if (quo_is_null(id_cols)) {
# Default selects everything in `sim_data` after non-id-cols have been removed
return(names(sim_data))
}
withCallingHandlers(
id_cols <- tidyselect::eval_select(
enquo(id_cols),
sim_data,
allow_rename = FALSE,
error_call = error_call
),
vctrs_error_subscript_oob = function(cnd) {
rethrow_id_cols_oob(cnd, names_from_cols, values_from_cols, error_call)
}
)
names(id_cols)
}
rethrow_id_cols_oob <- function(cnd, names_from_cols, values_from_cols, call) {
i <- cnd[["i"]]
check_string(i, .internal = TRUE)
if (i %in% names_from_cols) {
stop_id_cols_oob(i, "names_from", call = call)
} else if (i %in% values_from_cols) {
stop_id_cols_oob(i, "values_from", call = call)
} else {
# Zap this special handler, throw the normal condition
zap()
}
}
stop_id_cols_oob <- function(i, arg, call) {
cli::cli_abort(
c(
"`id_cols` can't select a column already selected by `{arg}`.",
i = "Column `{i}` has already been selected."
),
parent = NA,
call = call
)
}
build_pivot_wider_exprs <- function(row_id, spec, values_fill, values_fn, call) {
values_col <- spec[[".value"]][row_id]
fill_value <- values_fill[[values_col]]
keys <- vctrs::vec_slice(spec[, -(1:2)], row_id)
keys_cond <- purrr::imap(
keys,
function(value, name) {
if (is.na(value)) {
expr(is.na(!!sym(name)))
} else {
expr(!!sym(name) == !!value)
}
}
)
keys_cond <- purrr::reduce(keys_cond, ~ expr(!!.x & !!.y))
case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))
agg_fn <- values_fn[[values_col]]
resolve_fun(agg_fn, case_expr, arg = paste0("values_fn$", values_col), call = call)
}
is_scalar <- function(x) {
if (is.null(x)) {
return(FALSE)
}
if (is.list(x)) {
(length(x) == 1) && !have_name(x)
} else {
length(x) == 1
}
}
resolve_fun <- function(x, var, arg = caller_arg(x), call = caller_env()) {
if (is_formula(x)) {
.fn_expr <- across_fun(x, env = empty_env(), dots = NULL, fn = "across")
exec(.fn_expr, var, NULL)
} else {
fn_name <- find_fun(x)
if (is_null(fn_name)) {
cli_abort("Can't convert {.arg {arg}}, {.code {as_label(x)}}, to a function.", call = call)
}
call2(fn_name, var)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.