Nothing
#' Fast versions of `tidyr::expand()` and `tidyr::complete()`.
#'
#' @details
#' For un-grouped data `fexpand()` is similar in speed to `tidyr::expand()`.
#' When the data contain many groups, `fexpand()` is much much faster (see examples).
#'
#' The 2 main differences between `fexpand()` and `tidyr::expand()` are that:
#'
#' * tidyr style helpers like `nesting()` and `crossing()` are ignored.
#' The type of expansion used is controlled through `expand_type` and applies to
#' all supplied variables.
#' * Expressions are first calculated on the entire ungrouped dataset before being
#' expanded but within-group expansions will work on variables that already exist
#' in the dataset.
#' For example, `iris %>% group_by(Species) %>% fexpand(Sepal.Length, Sepal.Width)`
#' will perform a grouped expansion but
#' `iris %>% group_by(Species) %>% fexpand(range(Sepal.Length))`
#' will not.
#'
#' For efficiency, when supplying groups, expansion is done on a by-group basis only if
#' there are 2 or more variables that aren't part of the grouping.
#' The reason is that a by-group calculation does not need to be done with 1 expansion variable
#' as all combinations across groups already exist against that 1 variable.
#' When `expand_type = "nesting"` groups are ignored for speed purposes as the result is the same.
#'
#' An advantage of `fexpand()` is that it returns a data frame with the same class
#' as the input. It also uses `data.table` for memory efficiency and `collapse` for speed.
#'
#' A future development for `fcomplete()` would be to only fill values of variables that
#' correspond only to both additional completed rows and rows that match the expanded rows, are
#' filled in. For example,
#' `iris %>% mutate(test = NA_real_) %>% complete(Sepal.Length = 0:100, fill = list(test = 0))`
#' fills in all `NA` values of test, whereas
#' `iris %>% mutate(test = NA_real_) %>% fcomplete(Sepal.Length = 0:100, fill = list(test = 0))`
#' should only fill in values of test that correspond to Sepal.Length values of `0:100`.
#'
#' An additional note to add when `expand_type = "nesting"` is that if one of the
#' supplied variables in `...` does not exist in the data, but can be recycled
#' to the length of the data, then it is added and treated as a data variable.
#'
#' @param data A data frame
#' @param ... Variables to expand
#' @param expand_type Type of expansion to use where "nesting"
#' finds combinations already present in the data
#' (exactly the same as using `distinct()` but `fexpand()`
#' allows new variables to be created on the fly
#' and columns are sorted in the order given.
#' "crossing" finds all combinations of values in the group variables.
#' @param fill A named list containing value-name pairs
#' to fill the named implicit missing values.
#' @param sort Logical. If `TRUE` expanded/completed variables are sorted.
#' The default is `FALSE`.
#' @param .by (Optional). A selection of columns to group by for this operation.
#' Columns are specified using tidy-select.
#'
#' @returns
#' A `data.frame` of expanded groups.
#'
#' @examples
#' library(timeplyr)
#' library(dplyr)
#' library(lubridate)
#' library(nycflights13)
#' \dontshow{
#' .n_dt_threads <- data.table::getDTthreads()
#' .n_collapse_threads <- collapse::get_collapse()$nthreads
#' data.table::setDTthreads(threads = 2L)
#' collapse::set_collapse(nthreads = 1L)
#' }
#' flights %>%
#' fexpand(origin, dest)
#' flights %>%
#' fexpand(origin, dest, sort = FALSE)
#'
#' # Grouped expansions example
#' # 1 extra group (carrier) this is very quick
#' flights %>%
#' group_by(origin, dest, tailnum) %>%
#' fexpand(carrier)
#' \dontshow{
#' data.table::setDTthreads(threads = .n_dt_threads)
#' collapse::set_collapse(nthreads = .n_collapse_threads)
#'}
#' @rdname fexpand
#' @export
fexpand <- function(data, ..., expand_type = c("crossing", "nesting"),
sort = FALSE,
.by = NULL){
expand_type <- rlang::arg_match(expand_type)
group_vars <- get_groups(data, {{ .by }})
summarise_vars <- summarise_list(data, ...)
grps_missed <- setdiff(group_vars, names(summarise_vars))
# Add group vars to summary list
if (length(grps_missed) > 0){
summarise_vars <- c(add_names(
lapply(
grps_missed, function(x) fpluck(
dplyr_summarise(
safe_ungroup(data), across(all_of(x))
), 1)
), grps_missed
),
summarise_vars)
}
# Re-order list so that groups are first
summarise_vars <- summarise_vars[c(group_vars,
setdiff(names(summarise_vars),
group_vars))]
summarise_var_nms <- names(summarise_vars)
out_nms <- c(group_vars, setdiff(summarise_var_nms, group_vars))
# All variables minus grouped ones
leftover_grp_nms <- setdiff(summarise_var_nms, group_vars)
if (expand_type == "nesting" ||
# Special case when data is grouped but only 1 data variable is specified to expand
# There is no need from a speed perspective to do grouped calculation in this case
(length(group_vars) > 0L &&
length(leftover_grp_nms) <= 1L &&
expand_type == "crossing")){
out <- nested_join(summarise_vars, N = df_nrow(data),
sort = sort)
} else {
# Method for grouped data which performs a separate cross-join of
# non-grouped variables for each group
if (length(group_vars) > 0L && length(leftover_grp_nms) >= 2L){
out1 <- nested_join(summarise_vars, N = df_nrow(data),
sort = FALSE)
# Add group ID
grp_nm <- new_var_nm(out1, ".group.id")
out1[, (grp_nm) := group_id(.SD, order = FALSE, .cols = names(.SD)),
.SDcols = group_vars]
setorderv2(out1, cols = grp_nm)
# Add group IDs for each non-group variable
# This will allow us to calculate final expanded size
for (i in seq_along(leftover_grp_nms)){
assign(paste0("grp_nm_", i),
new_var_nm(out1, ".group.id"))
out1[, (get(paste0("grp_nm_", i))) := group_id(.SD, order = FALSE, .cols = names(.SD)),
.SDcols = leftover_grp_nms[[i]]]
}
group_id_nms <- unlist(mget(paste0("grp_nm_",
seq_len(length(leftover_grp_nms)))),
recursive = FALSE, use.names = FALSE)
# Figure out final size before expansion, to do this we can
# Calculate the vector product of unique expanded elements across groups.
out_temp <- collapse::fndistinct(fselect(out1, .cols = group_id_nms),
g = out1[[grp_nm]],
use.g.names = FALSE, na.rm = FALSE)
sizes <- rowProds(out_temp)
expanded_nrow <- sum(sizes)
data.table::setkeyv(out1, cols = grp_nm)
out2 <- out1[, lapply(.SD, function(x) list(collapse::funique(x))),
keyby = grp_nm,
.SDcols = group_id_nms]
# # This is fastest but can't get it to work in the package
# out <- out2[, .Call(Ccj, unlist(.SD, recursive = FALSE, use.names = FALSE)),
# keyby = grp_nm,
# .SDcols = group_id_nms]
out <- out2[, CJ2(unlist(.SD, recursive = FALSE, use.names = FALSE)),
keyby = grp_nm,
.SDcols = group_id_nms]
out <- frename(out, .cols = add_names(names(out), c(grp_nm, leftover_grp_nms)))
for (i in seq_along(group_id_nms)){
grp_to_modify <- leftover_grp_nms[[i]]
grp_to_match_on <- group_id_nms[[i]]
data.table::set(out, j = grp_to_modify,
value = out1[[grp_to_modify]][
collapse::fmatch(out[[grp_to_modify]],
out1[[grp_to_match_on]],
overid = 2L)
])
}
for (i in seq_along(group_vars)){
data.table::set(out, j = group_vars[[i]],
value = out1[[group_vars[[i]]]][
collapse::fmatch(out[[grp_nm]],
out1[[grp_nm]],
overid = 2L)
])
}
set_rm_cols(out, grp_nm)
if (sort){
setorderv2(out, cols = c(group_vars, leftover_grp_nms))
}
}
# If no groups then cross-join everything
else {
out <- crossed_join(summarise_vars, sort = sort,
unique = TRUE)
}
}
out <- fselect(out, .cols = out_nms)
df_reconstruct(out, data)
}
# Nested join, recycling newly created variables with data variables
nested_join <- function(X, sort = FALSE, N){
X_nms <- names(X)
if (length(X_nms) == 0L){
X_nms <- rep_len("Var", length(X))
X_nms <- paste0(X_nms, seq_len(length(X)))
}
X_lens <- cheapr::lengths_(X)
# If N is not supplied, then calculate N iff all list lengths are equal
if (missing(N)){
N <- unique(X_lens)
}
check_length(N, 1L)
# Data variables
data_nms <- X_nms[(X_lens %% N) == 0]
# Newly created variables
other_nms <- X_nms[!X_nms %in% data_nms]
df <- data.table::as.data.table(X[X_nms %in% data_nms])
n_data <- df_nrow(df)
if (n_data > 0L){
df <- collapse::funique(df)
n_data <- df_nrow(df)
}
X_other <- X[X_nms %in% other_nms]
X_other <- lapply(X_other, function(x) collapse::funique(x, sort = FALSE))
n_data <- max(n_data, 1L)
n_other <- prod(cheapr::lengths_(X_other))
n_other <- max(n_other, 1, na.rm = TRUE)
expanded_n <- prod(c(n_data, n_other), na.rm = TRUE)
# Nested cross-join
grp_seq <- seq_len(n_data)
if (df_nrow(df) == 0L){
out <- crossed_join(X_other, unique = FALSE)
} else {
out <- df_row_slice(df, rep(grp_seq, each = n_other))
if (length(X_other) > 0L){
rep_times <- df_nrow(out) / cheapr::lengths_(X_other)
for (i in seq_along(X_other)){
data.table::set(out, j = other_nms[i],
value = rep(X_other[[i]], rep_times[i]))
}
}
}
if (sort){
setorderv2(out, cols = names(out))
}
out
}
#' @rdname fexpand
#' @export
fcomplete <- function(data, ..., expand_type = c("crossing", "nesting"),
sort = FALSE, .by = NULL,
fill = NA){
expand_type <- rlang::arg_match(expand_type)
group_vars <- get_groups(data, {{ .by }})
expanded_df <- fexpand(data,
...,
sort = FALSE, .by = {{ .by }},
expand_type = expand_type)
fill_na <- any(!is.na(fill))
out <- data
# Full-join
if (df_nrow(expanded_df) > 0 && df_ncol(expanded_df) > 0){
extra <- cheapr::setdiff_(
expanded_df,
cheapr::sset(out, j = names(expanded_df))
)
if (df_nrow(extra) > 0){
extra <- df_cbind(
extra,
df_init(cheapr::sset(out, j = setdiff(names(out), names(expanded_df))),
df_nrow(extra))
)
out <- vctrs::vec_rbind(out, extra)
}
# out <- dplyr::full_join(out, expanded_df, by = names(expanded_df))
# out <- collapse_join(out, expanded_df,
# on = names(expanded_df),
# how = "full",
# sort = FALSE)
if (sort){
out <- farrange(out, .cols = c(group_vars,
setdiff(names(expanded_df), group_vars)))
}
}
# Replace NA with fill
if (fill_na){
fill <- fill[!is.na(fill)]
fill_nms <- names(fill)
for (i in seq_along(fill)){
if (length(fill[[i]]) != 1){
stop("fill values must be of length 1")
}
out[[fill_nms[[i]]]][cheapr::which_na(out[[fill_nms[[i]]]])] <-
fill[[i]]
}
}
out_order <- c(names(data), setdiff(names(out), names(data)))
out <- fselect(out, .cols = out_order)
df_reconstruct(out, data)
}
expand_check <- function(N, log_limit){
if (log10(N) >= log_limit || N > .Machine$integer.max){
stop("Requested expansion results in >= ",
N,
" rows, aborting.")
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.