# R/fcount.R

#' A fast replacement for dplyr::count()
#'
#' @description
#' Near-identical alternative to `dplyr::count()`.
#'
#' @param data A data frame.
#' @param ... Variables to group by.
#' @param wt Frequency weights.
#'   Can be `NULL` or a variable:
#'
#'   * If `NULL` (the default), counts the number of rows in each group.
#'   * If a variable, computes `sum(wt)` for each group.
#' @param sort If `TRUE`, will show the largest groups at the top.
#' @param order Should the groups be calculated as ordered groups?
#' If `FALSE`, this will return the groups in order of first appearance,
#' and in many cases is faster.
#' If `TRUE` (the default), the groups are returned in sorted order,
#' exactly the same way as `dplyr::count`.
#' @param name The name of the new column in the output.
#'  If there's already a column called `n`,
#'  it will use `nn`.
#'  If there's a column called `n` and `nn`,
#'  it'll use `nnn`, and so on, adding `n`s until it gets a new name.
#' @param .by (Optional). A selection of columns to group by for this operation.
#' Columns are specified using tidy-select.
#' @param .cols (Optional) alternative to `...` that accepts
#' a named character vector or numeric vector.
#' When performance is important, it is recommended to use this.
#'
#' @details
#' This is a fast and near-identical alternative to `dplyr::count()`, built on the `collapse` package.
#' Unlike `collapse::fcount()`, this works very similarly to `dplyr::count()`.
#' The main difference is that anything supplied to `wt`
#' is recycled and added as a data variable.
#' Other than that, everything works exactly like the dplyr equivalent.
#'
#' `fcount()` and `fadd_count()` can be more than 100x faster than the dplyr equivalents.
#'
#' @returns
#' A `data.frame` of frequency counts by group.
#'
#' @examples
#' library(timeplyr)
#' library(dplyr)
#' \dontshow{
#' .n_dt_threads <- data.table::getDTthreads()
#' .n_collapse_threads <- collapse::get_collapse()$nthreads
#' data.table::setDTthreads(threads = 2L)
#' collapse::set_collapse(nthreads = 1L)
#' }
#' iris %>%
#'   fcount()
#' iris %>%
#'   fadd_count(name = "count") %>%
#'   fslice_head(n = 10)
#' iris %>%
#'   group_by(Species) %>%
#'   fcount()
#' iris %>%
#'   fcount(Species)
#' iris %>%
#'   fcount(across(where(is.numeric), mean))
#'
#' ### Sorting behaviour
#'
#' # Sorted by group
#' starwars %>%
#'   fcount(hair_color)
#' # Sorted by frequency
#' starwars %>%
#'   fcount(hair_color, sort = TRUE)
#' # Groups sorted by order of first appearance (faster)
#' starwars %>%
#'   fcount(hair_color, order = FALSE)
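#'
#' ### Frequency weights and naming
#'
#' # With `wt`, counts are the sum of the weights within each group
#' starwars %>%
#'   fcount(hair_color, wt = height)
#' # Use `name` to choose the name of the count column
#' starwars %>%
#'   fcount(hair_color, name = "num_people")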
#' \dontshow{
#' data.table::setDTthreads(threads = .n_dt_threads)
#' collapse::set_collapse(nthreads = .n_collapse_threads)
#' }
#' @export
fcount <- function(data, ..., wt = NULL, sort = FALSE,
                   order = df_group_by_order_default(data),
                   name = NULL, .by = NULL, .cols = NULL){
  if (dots_length(...) == 0 && rlang::quo_is_null(rlang::enquo(.by)) && is.null(.cols)){
    return(
      count_simple(data, ..., wt = !!enquo(wt), sort = sort, order = order,
                   name = name, .by = {{ .by }}, .cols = .cols)
    )
  }
  group_vars <- group_vars(data)
  group_info <- tidy_group_info(data, ..., .by = {{ .by }},
                                .cols = .cols,
                                ungroup = TRUE,
                                rename = TRUE)
  out <- group_info[["data"]]
  all_vars <- group_info[["all_groups"]]
  N <- df_nrow(out)
  # Weights
  if (!rlang::quo_is_null(enquo(wt))){
    out_info <- mutate_summary_grouped(out, !!enquo(wt))
    wt_var <- out_info[["cols"]]
    out <- out_info[["data"]]
  } else {
    wt_var <- character()
  }
  if (length(wt_var) > 0L){
    wtv <- out[[wt_var]]
  }
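  # Reuse the existing grouped_df grouping structure when the requested
  # groups match the data's group variables (avoids recomputing groups)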
  use_only_grouped_df_groups <- !group_info[["groups_changed"]] && (
    length(all_vars) == 0L ||
      (order && length(group_vars) > 0L && length(group_vars) == length(all_vars))
  )
  if (use_only_grouped_df_groups){
    g <- df_to_GRP(data, return.order = FALSE, order = order, return.groups = TRUE)
  } else {
    g <- df_to_GRP(out, .cols = all_vars, return.order = FALSE,
                   order = order, return.groups = TRUE)
  }
  group_data <- GRP_groups(g)
  if (is.null(group_data)){
    out <- fselect(out, .cols = all_vars)
    gstarts <- GRP_starts(g)
    out <- df_row_slice(out, gstarts, reconstruct = FALSE)
  } else {
    out <- group_data
  }
  group_sizes <- GRP_group_sizes(g)
  if (length(all_vars) == 0L){
    g <- NULL
  }
  N <- df_nrow(out)
  if (is.null(name)){
    name <- new_n_var_nm(out)
  }
  # Edge-case, not sure how to fix this
  if (N == 0L && length(all_vars) == 0L){
    out <- df_init(out, 1L)
  }
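  # Counts are the group sizes, or the per-group sum of the weights if `wt` was supplied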
  if (length(wt_var) == 0){
    nobs <- group_sizes
  } else {
    nobs <- collapse::fsum(as.double(wtv),
                           g = g,
                           na.rm = TRUE,
                           use.g.names = FALSE,
                           fill = FALSE)
    if (isTRUE(all_integerable(nobs))){
      nobs <- as.integer(nobs)
    }
    # Replace NA with 0
    nobs[cheapr::which_na(nobs)] <- 0L
  }
  out[[name]] <- nobs
  if (sort){
    row_order <- radix_order(-nobs)
    out <- df_row_slice(out, row_order)
  }
  df_reconstruct(out, data)
}
# A basic and very fast count() method.
# The main method above is faster when there are many groups
# because building the list of group locations through
# group_by() is unnecessarily expensive.
# If the data is already grouped and no variables are supplied
# through ..., this method is very fast.
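# For example (illustrative):
#   iris %>% dplyr::group_by(Species) %>% fcount()
# supplies no variables through `...`, `.by` or `.cols`, so fcount()
# dispatches here and simply counts the existing groups.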
count_simple <- function(data, ..., wt = NULL, sort = FALSE,
                         order = df_group_by_order_default(data),
                         name = NULL, .by = NULL, .cols = NULL){
  out <- fgroup_by(data, ..., .add = TRUE,
                   order = order, .cols = .cols,
                   .by = {{ .by }})
  if (!rlang::quo_is_null(enquo(wt))){
    out_info <- mutate_summary_ungrouped(out, !!enquo(wt))
    wt_var <- out_info[["cols"]]
    weights <- out_info[["data"]][[wt_var]]
  } else {
    weights <- NULL
  }
  if (is.null(name)){
    name <- new_n_var_nm(group_vars(out))
  }
  out <- df_count(out, weights = weights, name = name)
  if (sort){
    out <- farrange(out, desc(.data[[name]]))
  }
  df_reconstruct(out, data)
}
add_count_simple <- function(data, ..., wt = NULL, sort = FALSE,
                             order = df_group_by_order_default(data),
                             name = NULL, .by = NULL, .cols = NULL){
  out <- fgroup_by(data, ..., .add = TRUE,
                   order = order, .cols = .cols,
                   .by = {{ .by }})
  if (!rlang::quo_is_null(enquo(wt))){
    out_info <- mutate_summary_ungrouped(out, !!enquo(wt))
    wt_var <- out_info[["cols"]]
    weights <- out_info[["data"]][[wt_var]]
  } else {
    weights <- NULL
  }
  if (is.null(name)){
    name <- new_n_var_nm(out)
  }
  out <- df_add_count(out, weights = weights, name = name)
  if (sort){
    out <- farrange(out, desc(.data[[name]]))
  }
  df_reconstruct(out, data)
}

#' @rdname fcount
#' @export
fadd_count <- function(data, ..., wt = NULL, sort = FALSE,
                       order = df_group_by_order_default(data),
                       name = NULL, .by = NULL, .cols = NULL){
  if (dots_length(...) == 0 && rlang::quo_is_null(rlang::enquo(.by)) && is.null(.cols)){
    return(
      add_count_simple(data, ..., wt = !!enquo(wt), sort = sort, order = order,
                       name = name, .by = {{ .by }}, .cols = .cols)
    )
  }
  group_vars <- group_vars(data)
  group_info <- tidy_group_info(data, ..., .by = {{ .by }},
                                .cols = .cols,
                                ungroup = TRUE,
                                rename = TRUE)
  out <- group_info[["data"]]
  all_vars <- group_info[["all_groups"]]
  if (rlang::quo_is_null(enquo(wt))){
    wt_var <- character()
  } else {
    ncol1 <- df_ncol(out)
    out_info <- mutate_summary_grouped(out, !!enquo(wt))
    out <- out_info[["data"]]
    ncol2 <- df_ncol(out)
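    # If evaluating `wt` created a new column (rather than referencing an
    # existing one), it is dropped again below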
    has_wt <- (ncol2 == ncol1)
    wt_var <- out_info[["cols"]]
    if (length(wt_var) > 0L){
      wtv <- out[[wt_var]]
      if (!has_wt){
        out <- df_rm_cols(out, wt_var)
      }
    }
  }
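  # As in fcount(), reuse the grouped_df grouping structure when the
  # requested groups are unchanged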
  use_only_grouped_df_groups <- !group_info[["groups_changed"]] && (
    length(all_vars) == 0L ||
      (order && length(group_vars) > 0L && length(group_vars) == length(all_vars))
  )
  if (use_only_grouped_df_groups){
    g <- df_to_GRP(data, return.order = FALSE, order = order)
  } else {
    g <- df_to_GRP(out, .cols = all_vars, return.order = FALSE, order = order)
  }
  if (is.null(name)){
    name <- new_n_var_nm(out)
  }
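  # Per-row counts: the sum of the weights within each row's group,
  # or the size of each row's group when no weights are supplied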
  if (length(wt_var) > 0L){
    if (length(all_vars) == 0L){
      g <- NULL
    }
    nobs <- gsum(as.double(wtv),
                   g = g,
                   na.rm = TRUE)
    # Replace NA with 0
    nobs[cheapr::which_na(nobs)] <- 0
    if (isTRUE(all_integerable(nobs))){
      nobs <- as.integer(nobs)
    }
  } else {
    nobs <- GRP_expanded_group_sizes(g)
  }
  out <- dplyr::dplyr_col_modify(out, cols = add_names(list(nobs),
                                                      name))
  if (sort){
    row_order <- radix_order(-out[[name]])
    out <- df_row_slice(out, row_order)
  }
  df_reconstruct(out, data)
}
