R/mean_table.R
In meantables: Make Quick Descriptive Tables for Continuous Variables

Documented in mean_table

#' @title Estimate Mean and 95 Percent Confidence Intervals in dplyr Pipelines
#'
#' @description The mean_table function produces overall and grouped
#'   tables of means with related statistics. In addition to means, the
#'   mean_table missing/non-missing frequencies, the standard error of the
#'   mean (sem), the 95% confidence intervals for the mean(s), the minimum
#'   value, and the maximum value. For grouped tibbles, mean_table displays
#'   these statistics for each category of the group_by variable.
#'
#' @param .data A tibble or grouped tibble.
#'
#' @param .x The continuous response variable for which the statistics are
#'   desired.
#'
#' @param t_prob (1 - alpha / 2). Default value is 0.975, which corresponds to
#'   an alpha of 0.05. Used to calculate a critical value from Student's t
#'   distribution with n - 1 degrees of freedom.
#'
#' @param output Options for this parameter are "default" and "all".
#'
#'   Default output includes the n, mean, sem, and 95% confidence interval for
#'   the mean. Using output = "all" also returns the the number of missing
#'   values for .x and the critical t-value.
#'
#' @param digits Round mean, lcl, and ucl to digits. Default is 2.
#'
#' @param ... Other parameters to be passed on.
#'
#' @return A tibble of class "mean_table" or "mean_table_grouped"
#' @export
#' @importFrom dplyr %>%
#'
#' @references
#'   SAS documentation: http://support.sas.com/documentation/cdl/en/proc/65145/HTML/default/viewer.htm#p0klmrp4k89pz0n1p72t0clpavyx.htm
#'
#' @examples
#' \dontrun{
#' library(dplyr)
#' library(meantables)
#'
#' data(mtcars)
#'
#' # Overall mean table with defaults
#'
#' mtcars %>%
#'   mean_table(mpg)
#'
#' # A tibble: 1 x 9
#'   response_var     n  mean    sd   sem   lcl   ucl   min   max
#'   <chr>        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#' 1 mpg             32  20.1  6.03  1.07  17.9  22.3  10.4  33.9
#'
#' # Grouped means table with defaults
#'
#' mtcars %>%
#'   group_by(cyl) %>%
#'   mean_table(mpg)
#'
#' # A tibble: 3 x 11
#'   response_var group_var group_cat     n  mean    sd   sem   lcl   ucl   min   max
#'   <chr>        <chr>         <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#' 1 mpg          cyl               4    11  26.7  4.51 1.36   23.6  29.7  21.4  33.9
#' 2 mpg          cyl               6     7  19.7  1.45 0.549  18.4  21.1  17.8  21.4
#' 3 mpg          cyl               8    14  15.1  2.56 0.684  13.6  16.6  10.4  19.2
#' }

mean_table <- function(.data, .x, t_prob = 0.975, output = default, digits = 2, ...) {

  # ------------------------------------------------------------------
  # Prevents R CMD check: "no visible binding for global variable ‘.’"
  # ------------------------------------------------------------------
  n = sd = t_crit = sem = lcl = ucl = var = n_groups = group_1 = NULL
  group_2 = output_arg = default = response_var = `.` = NULL
  group_var = group_cat = group_1_cat = group_2_cat = n_miss = NULL

  # ===========================================================================
  # Quick data checks
  # Input data frame is class data.frame
  # The ".x" argument is a numeric vector
  # There are zero, one, or two group_by variables
  # ===========================================================================
  if (!("data.frame" %in% class(.data))) {
    message("Expecting the class of .data to include data.frame. Instead, the ",
            "class was ", class(.data))
  }

  if (missing(.x)) {
    stop("No argument was passed to the '.x' parameter. Expecting '.x' to be a ",
         "numeric column.")
  }

  # ===========================================================================
  # Enquo arguments
  # enquo the .x argument so that it can be used in the dplyr pipeline below.
  # The .x argument is the variable you want the mean of.
  # enquo/quo_name/UQ the output argument so that I don't have to use
  # quotation marks around the argument being passed.
  # ===========================================================================
  .x          <- rlang::enquo(.x)
  output_arg <- rlang::enquo(output) %>% rlang::quo_name()

  # ===========================================================================
  # Grouping variables
  # Count the number of them - accept zero, one, or two
  # Grab their names - later returned in summary table
  # ===========================================================================
  if ("grouped_df" %in% class(.data)) {
    n_groups <- attributes(.data)$groups %>% length() - 1
  } else {
    n_groups <- 0L
  }

  if (n_groups == 1) {
    group_1 <- attributes(.data)$groups[1] %>% names()

  } else if (n_groups == 2) {
    group_1 <- attributes(.data)$groups[1] %>% names()
    group_2 <- attributes(.data)$groups[2] %>% names()

  } else if (n_groups > 2) {
    stop(".data can be grouped by up to two variables. It is currently grouped ",
         n_groups, " variables")
  }

  # ===========================================================================
  # Create a general summary table of means and related stats. Then, add
  # group variable names to summary table where applicable
  # 1. No group_by variables
  # 2. One group_by variable
  # 3. Two group_by variables
  # ===========================================================================
  out <- .data %>%
    dplyr::summarise(
      # Grab variable (.x) name
      response_var = rlang::quo_name(.x),
      n_miss       = sum(is.na(.data[[rlang::quo_name(.x)]])),
      n            = sum(!is.na(.data[[rlang::quo_name(.x)]])),
      mean         = mean(!! .x, na.rm = TRUE),
      sd           = stats::sd(!! .x, na.rm = TRUE),
      t_crit       = stats::qt(t_prob, n - 1),
      sem          = sd / sqrt(n),
      lcl          = mean - t_crit * sem,
      ucl          = mean + t_crit * sem,
      mean         = round(mean, digits),   # Round mean
      sd           = round(sd, digits),     # Round sd
      lcl          = round(lcl, digits),    # Round lcl
      ucl          = round(ucl, digits),    # Round ucl
      min          = min(!! .x, na.rm = TRUE),
      max          = max(!! .x, na.rm = TRUE)
    ) %>%
    tibble::as_tibble()

  # ===========================================================================
  # Add group variable names to summary table - if applicable
  # Then move to the front of the summary table.

  # Also add classes to "out"
  # If the input data frame (.data) was a grouped data frame, then the output
  # will be a bivariate analysis of means ("mean_table_grouped"). Pass that
  # information on to "out." It can be used later in mean_format.
  # Otherwise the output will be a univariate analysis of means ("mean_table")
  # That class will also be used later in mean_format.
  # ===========================================================================
  if (n_groups == 0) {

    out <- out %>%
      dplyr::select(response_var, dplyr::everything())
    class(out) <- c("mean_table", class(out))

  }
  else if (n_groups == 1) {

    out <- out %>%
      dplyr::mutate(group_var = group_1) %>%
      dplyr::rename(group_cat = !! names(.)[1]) %>%
      dplyr::select(response_var, group_var, group_cat, dplyr::everything())
    class(out) <- c("mean_table_grouped", class(out))

  } else if (n_groups == 2) {

    out <- out %>%
      dplyr::mutate(
        group_1 = group_1,
        group_2 = group_2
      ) %>%
      dplyr::rename(
        group_1_cat = !! names(.)[1],
        group_2_cat = !! names(.)[2]
      ) %>%
      dplyr::select(response_var, group_1, group_1_cat, group_2, group_2_cat,
                    dplyr::everything()) %>%
      dplyr::ungroup()
    class(out) <- c("mean_table_grouped", class(out))
  }

  # ===========================================================================
  # Control output:
  # Typically, I only want the frequency, mean, 95% CI, sem, min, and max.
  # Make that the default.
  # ===========================================================================
  if (output_arg == "default") {
    out <- out %>%
      dplyr::select(-c(n_miss, t_crit))
  }

  # Return summary table
  out
}