tidyr: Tidy Messy Data

Documented in complete

#' Complete a data frame with missing combinations of data
#'
#' Turns implicit missing values into explicit missing values. This is a wrapper
#' around [expand()], [dplyr::full_join()] and [replace_na()] that's useful for
#' completing missing combinations of data.
#'
#' @section Grouped data frames:
#' With grouped data frames created by [dplyr::group_by()], `complete()`
#' operates _within_ each group. Because of this, you cannot complete a grouping
#' column.
#'
#' @inheritParams expand
#' @param fill A named list that for each variable supplies a single value to
#'   use instead of `NA` for missing combinations.
#' @param explicit Should both implicit (newly created) and explicit
#'   (pre-existing) missing values be filled by `fill`? By default, this is
#'   `TRUE`, but if set to `FALSE` this will limit the fill to only implicit
#'   missing values.
#' @export
#' @examples
#' df <- tibble(
#'   group = c(1:2, 1, 2),
#'   item_id = c(1:2, 2, 3),
#'   item_name = c("a", "a", "b", "b"),
#'   value1 = c(1, NA, 3, 4),
#'   value2 = 4:7
#' )
#' df
#'
#' # Combinations --------------------------------------------------------------
#' # Generate all possible combinations of `group`, `item_id`, and `item_name`
#' # (whether or not they appear in the data)
#' df %>% complete(group, item_id, item_name)
#'
#' # Cross all possible `group` values with the unique pairs of
#' # `(item_id, item_name)` that already exist in the data
#' df %>% complete(group, nesting(item_id, item_name))
#'
#' # Within each `group`, generate all possible combinations of
#' # `item_id` and `item_name` that occur in that group
#' df %>%
#'   dplyr::group_by(group) %>%
#'   complete(item_id, item_name)
#'
#' # Supplying values for new rows ---------------------------------------------
#' # Use `fill` to replace NAs with some value. By default, affects both new
#' # (implicit) and pre-existing (explicit) missing values.
#' df %>%
#'   complete(
#'     group,
#'     nesting(item_id, item_name),
#'     fill = list(value1 = 0, value2 = 99)
#'   )
#'
#' # Limit the fill to only the newly created (i.e. previously implicit)
#' # missing values with `explicit = FALSE`
#' df %>%
#'   complete(
#'     group,
#'     nesting(item_id, item_name),
#'     fill = list(value1 = 0, value2 = 99),
#'     explicit = FALSE
#'   )
complete <- function(data, ..., fill = list(), explicit = TRUE) {
  UseMethod("complete")
}

#' @export
complete.data.frame <- function(data, ..., fill = list(), explicit = TRUE) {
  check_bool(explicit)

  out <- expand(data, ...)
  names <- names(out)

  if (length(names) > 0L) {
    out <- dplyr::full_join(out, data, by = names, multiple = "all")
  } else {
    # Avoid joining the 1x0 result from `expand()` with `data`.
    # That causes issues when `data` has zero rows.
    out <- data
  }

  if (explicit) {
    out <- replace_na(out, replace = fill)
  } else {
    new <- !vec_in(out[names], data[names])
    slice <- vec_slice(out, new)
    slice <- replace_na(slice, replace = fill)
    out <- vec_assign(out, new, slice)
  }

  reconstruct_tibble(data, out)
}

#' @export
complete.grouped_df <- function(data, ..., fill = list(), explicit = TRUE) {
  out <- dplyr::reframe(
    data,
    complete(
      data = dplyr::pick(everything()),
      ...,
      fill = fill,
      explicit = explicit
    )
  )

  drop <- dplyr::group_by_drop_default(data)

  dplyr::group_by(
    out,
    !!!dplyr::groups(data),
    .drop = drop
  )
}