#' @title Summary of mean values by group
#' @name means_by_group
#'
#' @description Computes summary table of means by groups.
#'
#' @param x A vector or a data frame.
#' @param by If `x` is a numeric vector, `by` should be a factor that
#' indicates the group-classifying categories. If `x` is a data frame, `by`
#' should be a character string, naming the variable in `x` that is used for
#' grouping. Numeric vectors are coerced to factors. Not that `by` should
#' only refer to a single variable.
#' @param ci Level of confidence interval for mean estimates. Default is `0.95`.
#' Use `ci = NA` to suppress confidence intervals.
#' @param weights If `x` is a numeric vector, `weights` should be a vector of
#' weights that will be applied to weight all observations. If `x` is a data
#' frame, `weights` can also be a character string indicating the name of the
#' variable in `x` that should be used for weighting. Default is `NULL`, so no
#' weights are used.
#' @param digits Optional scalar, indicating the amount of digits after decimal
#' point when rounding estimates and values.
#' @param ... Currently not used
#' @inheritParams find_columns
#'
#' @return A data frame with information on mean and further summary statistics
#' for each sub-group.
#'
#' @details This function is comparable to `aggregate(x, by, mean)`, but provides
#' some further information, including summary statistics from a One-Way-ANOVA
#' using `x` as dependent and `by` as independent variable. [`emmeans::contrast()`]
#' is used to get p-values for each sub-group. P-values indicate whether each
#' group-mean is significantly different from the total mean.
#'
#' @examples
#' data(efc)
#' means_by_group(efc, "c12hour", "e42dep")
#'
#' data(iris)
#' means_by_group(iris, "Sepal.Width", "Species")
#'
#' # weighting
#' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5))
#' means_by_group(efc, "c12hour", "e42dep", weights = "weight")
#' @export
means_by_group <- function(x, ...) {
UseMethod("means_by_group")
}
#' @export
means_by_group.default <- function(x, ...) {
insight::format_error("`means_by_group()` does not work for objects of class `", class(x)[1], "`.")
}
#' @rdname means_by_group
#' @export
means_by_group.numeric <- function(x,
by = NULL,
ci = 0.95,
weights = NULL,
digits = NULL,
...) {
# validation check for arguments
# "by" must be provided
if (is.null(by)) {
insight::format_error("Argument `by` is missing.")
}
# by must be of same length as x
if (length(by) != length(x)) {
insight::format_error("Argument `by` must be of same length as `x`.")
}
# if weights are provided, must be of same length as x
if (!is.null(weights) && length(weights) != length(x)) {
insight::format_error("Argument `weights` must be of same length as `x`.")
}
# if weights are NULL, set weights to 1
if (is.null(weights)) weights <- rep(1, length(x))
# retrieve labels
var_mean_label <- attr(x, "label", exact = TRUE)
var_grp_label <- attr(by, "label", exact = TRUE)
# if no labels present, use variable names directly
if (is.null(var_mean_label)) {
var_mean_label <- deparse(substitute(x))
}
if (is.null(var_grp_label)) {
var_grp_label <- deparse(substitute(by))
}
# coerce group to factor if numeric, or convert labels to levels, if factor
if (is.factor(by)) {
by <- tryCatch(labels_to_levels(by, verbose = FALSE), error = function(e) by)
} else {
by <- to_factor(by)
}
my_data <- stats::na.omit(data.frame(
x = x,
group = by,
weights = weights,
stringsAsFactors = FALSE
))
# get grouped means table
out <- .means_by_group(my_data, ci = ci)
# attributes
attr(out, "var_mean_label") <- var_mean_label
attr(out, "var_grp_label") <- var_grp_label
attr(out, "digits") <- digits
class(out) <- c("dw_groupmeans", "data.frame")
out
}
#' @rdname means_by_group
#' @export
means_by_group.data.frame <- function(x,
select = NULL,
by = NULL,
ci = 0.95,
weights = NULL,
digits = NULL,
exclude = NULL,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE,
...) {
# evaluate select/exclude, may be select-helpers
select <- .select_nse(select,
x,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)
if (is.null(weights)) {
w <- NULL
} else if (is.character(weights)) {
w <- x[[weights]]
} else {
w <- weights
}
out <- lapply(select, function(i) {
# if no labels present, use variable names directy
if (is.null(attr(x[[i]], "label", exact = TRUE))) {
attr(x[[i]], "label") <- i
}
if (is.null(attr(x[[by]], "label", exact = TRUE))) {
attr(x[[by]], "label") <- by
}
# compute means table
means_by_group(x[[i]], by = x[[by]], ci = ci, weights = w, digits = digits, ...)
})
class(out) <- c("dw_groupmeans_list", "list")
out
}
#' @keywords internal
.means_by_group <- function(data, ci = 0.95) {
# compute anova statistics for mean table
if (is.null(data$weights) || all(data$weights == 1)) {
fit <- stats::lm(x ~ group, data = data)
} else {
fit <- stats::lm(x ~ group, weights = data$weights, data = data)
}
# summary table data
groups <- split(data$x, data$group)
group_weights <- split(data$weights, data$group)
out <- do.call(rbind, Map(function(x, w) {
data.frame(
Mean = weighted_mean(x, weights = w),
SD = weighted_sd(x, weights = w),
N = round(sum(w)),
stringsAsFactors = FALSE
)
}, groups, group_weights))
# add group names
out$Category <- levels(data$group)
out$p <- out$CI_high <- out$CI_low <- NA
# p-values of contrast-means
if (insight::check_if_installed("emmeans", quietly = TRUE)) {
# create summary table of contrasts, for p-values and confidence intervals
predicted <- emmeans::emmeans(fit, specs = "group", level = ci)
emm_contrasts <- emmeans::contrast(predicted, method = "eff")
# add p-values and confidence intervals to "out"
if (!is.null(ci) && !is.na(ci)) {
summary_table <- as.data.frame(predicted)
out$CI_low <- summary_table$lower.CL
out$CI_high <- summary_table$upper.CL
}
summary_table <- as.data.frame(emm_contrasts)
out$p <- summary_table$p.value
}
# reorder columns
out <- out[c("Category", "Mean", "N", "SD", "CI_low", "CI_high", "p")]
# finally, add total-row
out <- rbind(
out,
data.frame(
Category = "Total",
Mean = weighted_mean(data$x, weights = data$weights),
N = nrow(data),
SD = weighted_sd(data$x, weights = data$weights),
CI_low = NA,
CI_high = NA,
p = NA,
stringsAsFactors = FALSE
)
)
# get anova statistics for mean table
sum.fit <- summary(fit)
# r-squared values
r2 <- sum.fit$r.squared
r2.adj <- sum.fit$adj.r.squared
# F-statistics
fstat <- sum.fit$fstatistic
pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
# copy as attributes
attr(out, "r2") <- r2
attr(out, "ci") <- ci
attr(out, "adj.r2") <- r2.adj
attr(out, "fstat") <- fstat[1]
attr(out, "p.value") <- pval
out
}
# methods -----------------
#' @export
format.dw_groupmeans <- function(x, digits = NULL, ...) {
if (is.null(digits)) {
digits <- attr(x, "digits", exact = TRUE)
}
if (is.null(digits)) {
digits <- 2
}
x$N <- insight::format_value(x$N, digits = 0)
insight::format_table(remove_empty_columns(x), digits = digits, ...)
}
#' @export
print.dw_groupmeans <- function(x, digits = NULL, ...) {
out <- format(x, digits = digits, ...)
# caption
l1 <- attributes(x)$var_mean_label
l2 <- attributes(x)$var_grp_label
if (!is.null(l1) && !is.null(l2)) {
caption <- c(paste0("# Mean of ", l1, " by ", l2), "blue")
} else {
caption <- NULL
}
# footer
footer <- paste0(
"\nAnova: R2=", insight::format_value(attributes(x)$r2, digits = 3),
"; adj.R2=", insight::format_value(attributes(x)$adj.r2, digits = 3),
"; F=", insight::format_value(attributes(x)$fstat, digits = 3),
"; ", insight::format_p(attributes(x)$p.value, whitespace = FALSE),
"\n"
)
cat(insight::export_table(out, caption = caption, footer = footer, ...))
}
#' @export
print.dw_groupmeans_list <- function(x, digits = NULL, ...) {
for (i in seq_along(x)) {
if (i > 1) cat("\n")
print(x[[i]], digits = digits, ...)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.