dplyr: A Grammar of Data Manipulation

Documented in group_by group_by_drop_default group_by_prepare ungroup

#' Group by one or more variables
#'
#' @description
#' Most data operations are done on groups defined by variables.
#' `group_by()` takes an existing tbl and converts it into a grouped tbl
#' where operations are performed "by group". `ungroup()` removes grouping.
#'
#' @family grouping functions
#' @inheritParams arrange
#' @param ... <[`data-masking`][rlang::args_data_masking]> In `group_by()`,
#'   variables or computations to group by. Computations are always done on the
#'   ungrouped data frame. To perform computations on the grouped data, you need
#'   to use a separate `mutate()` step before the `group_by()`.
#'   Computations are not allowed in `nest_by()`.
#'   In `ungroup()`, variables to remove from the grouping.
#' @param .add When `FALSE`, the default, `group_by()` will
#'   override existing groups. To add to the existing groups, use
#'   `.add = TRUE`.
#'
#'   This argument was previously called `add`, but that prevented
#'   creating a new grouping variable called `add`, and conflicts with
#'   our naming conventions.
#' @param .drop Drop groups formed by factor levels that don't appear in the
#'   data? The default is `TRUE` except when `.data` has been previously
#'   grouped with `.drop = FALSE`. See [group_by_drop_default()] for details.
#' @return A grouped data frame with class [`grouped_df`][grouped_df],
#'   unless the combination of `...` and `add` yields a empty set of
#'   grouping columns, in which case a tibble will be returned.
#' @section Methods:
#' These function are **generic**s, which means that packages can provide
#' implementations (methods) for other classes. See the documentation of
#' individual methods for extra arguments and differences in behaviour.
#'
#' Methods available in currently loaded packages:
#'
#' * `group_by()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}.
#' * `ungroup()`: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}.
#'
#' @section Ordering:
#' Currently, `group_by()` internally orders the groups in ascending order. This
#' results in ordered output from functions that aggregate groups, such as
#' [summarise()].
#'
#' When used as grouping columns, character vectors are ordered in the C locale
#' for performance and reproducibility across R sessions. If the resulting
#' ordering of your grouped operation matters and is dependent on the locale,
#' you should follow up the grouped operation with an explicit call to
#' [arrange()] and set the `.locale` argument. For example:
#'
#' ```
#' data %>%
#'   group_by(chr) %>%
#'   summarise(avg = mean(x)) %>%
#'   arrange(chr, .locale = "en")
#' ```
#'
#' This is often useful as a preliminary step before generating content intended
#' for humans, such as an HTML table.
#'
#' ## Legacy behavior
#'
#' Prior to dplyr 1.1.0, character vector grouping columns were ordered in the
#' system locale. If you need to temporarily revert to this behavior, you can
#' set the global option `dplyr.legacy_locale` to `TRUE`, but this should be
#' used sparingly and you should expect this option to be removed in a future
#' version of dplyr. It is better to update existing code to explicitly call
#' `arrange(.locale = )` instead. Note that setting `dplyr.legacy_locale` will
#' also force calls to [arrange()] to use the system locale.
#'
#' @export
#' @examples
#' by_cyl <- mtcars %>% group_by(cyl)
#'
#' # grouping doesn't change how the data looks (apart from listing
#' # how it's grouped):
#' by_cyl
#'
#' # It changes how it acts with the other dplyr verbs:
#' by_cyl %>% summarise(
#'   disp = mean(disp),
#'   hp = mean(hp)
#' )
#' by_cyl %>% filter(disp == max(disp))
#'
#' # Each call to summarise() removes a layer of grouping
#' by_vs_am <- mtcars %>% group_by(vs, am)
#' by_vs <- by_vs_am %>% summarise(n = n())
#' by_vs
#' by_vs %>% summarise(n = sum(n))
#'
#' # To removing grouping, use ungroup
#' by_vs %>%
#'   ungroup() %>%
#'   summarise(n = sum(n))
#'
#' # By default, group_by() overrides existing grouping
#' by_cyl %>%
#'   group_by(vs, am) %>%
#'   group_vars()
#'
#' # Use add = TRUE to instead append
#' by_cyl %>%
#'   group_by(vs, am, .add = TRUE) %>%
#'   group_vars()
#'
#' # You can group by expressions: this is a short-hand
#' # for a mutate() followed by a group_by()
#' mtcars %>%
#'   group_by(vsam = vs + am)
#'
#' # The implicit mutate() step is always performed on the
#' # ungrouped data. Here we get 3 groups:
#' mtcars %>%
#'   group_by(vs) %>%
#'   group_by(hp_cut = cut(hp, 3))
#'
#' # If you want it to be performed by groups,
#' # you have to use an explicit mutate() call.
#' # Here we get 3 groups per value of vs
#' mtcars %>%
#'   group_by(vs) %>%
#'   mutate(hp_cut = cut(hp, 3)) %>%
#'   group_by(hp_cut)
#'
#' # when factors are involved and .drop = FALSE, groups can be empty
#' tbl <- tibble(
#'   x = 1:10,
#'   y = factor(rep(c("a", "c"), each  = 5), levels = c("a", "b", "c"))
#' )
#' tbl %>%
#'   group_by(y, .drop = FALSE) %>%
#'   group_rows()
#'
group_by <- function(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) {
  UseMethod("group_by")
}

#' @export
group_by.data.frame <- function(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) {
  groups <- group_by_prepare(
    .data,
    ...,
    .add = .add,
    error_call = current_env()
  )
  grouped_df(groups$data, groups$group_names, .drop)
}

#' @rdname group_by
#' @export
#' @param x A [tbl()]
ungroup <- function(x, ...) {
  UseMethod("ungroup")
}

#' @export
ungroup.grouped_df <- function(x, ...) {
  if (missing(...)) {
    as_tibble(x)
  } else {
    old_groups <- group_vars(x)
    to_remove <- tidyselect::eval_select(
      expr = expr(c(...)),
      data = x,
      allow_rename = FALSE
    )
    to_remove <- names(to_remove)

    new_groups <- setdiff(old_groups, to_remove)
    group_by(x, !!!syms(new_groups))
  }
}

#' @export
ungroup.rowwise_df <- function(x, ...) {
  check_dots_empty()
  as_tibble(x)
}

#' @export
ungroup.data.frame <- function(x, ...) {
  check_dots_empty()
  x
}

#' Prepare for grouping and other operations
#'
#' `*_prepare()` performs standard manipulation that is needed prior
#' to actual data processing. They are only be needed by packages
#' that implement dplyr backends.
#'
#' @return A list
#'   \item{data}{Modified tbl}
#'   \item{groups}{Modified groups}
#' @export
#' @keywords internal
group_by_prepare <- function(.data,
                             ...,
                             .add = FALSE,
                             .dots = deprecated(),
                             add = deprecated(),
                             error_call = caller_env()) {
  error_call <- dplyr_error_call(error_call)

  if (!missing(add)) {
    lifecycle::deprecate_warn("1.0.0", "group_by(add = )", "group_by(.add = )", always = TRUE)
    .add <- add
  }

  new_groups <- enquos(..., .ignore_empty = "all")
  if (!missing(.dots)) {
    # Used by dbplyr 1.4.2 so can't aggressively deprecate
    lifecycle::deprecate_warn("1.0.0", "group_by(.dots = )", always = TRUE)
    new_groups <- c(new_groups, compat_lazy_dots(.dots, env = caller_env(2)))
  }

  # If any calls, use mutate to add new columns, then group by those
  computed_columns <- add_computed_columns(.data, new_groups, error_call = error_call)

  out <- computed_columns$data
  group_names <- computed_columns$added_names

  if (.add) {
    group_names <- union(group_vars(.data), group_names)
  }

  unknown <- setdiff(group_names, tbl_vars(out))
  if (length(unknown) > 0) {
    bullets <- c(
      "Must group by variables found in `.data`.",
      x = glue("Column `{unknown}` is not found.")
    )
    abort(bullets, call = error_call)
  }

  list(
    data = out,
    groups = syms(group_names),
    group_names = group_names
  )
}

add_computed_columns <- function(.data,
                                 vars,
                                 error_call = caller_env()) {
  is_symbol <- map_lgl(vars, quo_is_variable_reference)
  needs_mutate <- have_name(vars) | !is_symbol

  if (any(needs_mutate)) {
    # TODO: use less of a hack
    if (inherits(.data, "data.frame")) {
      bare_data <- ungroup(.data)
      by <- compute_by(by = NULL, data = bare_data)

      cols <- mutate_cols(
        bare_data,
        dplyr_quosures(!!!vars),
        by = by,
        error_call = error_call
      )

      out <- dplyr_col_modify(.data, cols)
      col_names <- names(cols)
    } else {
      out <- mutate(.data, !!!vars)
      col_names <- names(exprs_auto_name(vars))
    }
  } else {
    out <- .data
    col_names <- names(exprs_auto_name(vars))
  }

  list(data = out, added_names = col_names)
}

quo_is_variable_reference <- function(quo) {
  if (quo_is_symbol(quo)) {
    return(TRUE)
  }

  if (quo_is_call(quo, n = 2)) {
    expr <- quo_get_expr(quo)

    if (is_call(expr, c("$", "[["))) {
      if (!identical(expr[[2]], sym(".data"))) {
        return(FALSE)
      }

      param <- expr[[3]]
      if (is_symbol(param) || is_string(param)) {
        return(TRUE)
      }
    }
  }

  FALSE
}

#' Default value for .drop argument of group_by
#'
#' @param .tbl A data frame
#'
#' @return `TRUE` unless `.tbl` is a grouped data frame that was previously
#'   obtained by `group_by(.drop = FALSE)`
#'
#' @examples
#' group_by_drop_default(iris)
#'
#' iris %>%
#'   group_by(Species) %>%
#'   group_by_drop_default()
#'
#' iris %>%
#'   group_by(Species, .drop = FALSE) %>%
#'   group_by_drop_default()
#'
#' @keywords internal
#' @export
group_by_drop_default <- function(.tbl) {
  UseMethod("group_by_drop_default")
}

#' @export
group_by_drop_default.default <- function(.tbl) {
  TRUE
}

#' @export
group_by_drop_default.grouped_df <- function(.tbl) {
  tryCatch({
    !identical(attr(group_data(.tbl), ".drop"), FALSE)
  }, error = function(e){
    TRUE
  })
}