#' @title Summary of mean values by group
#' @name means_by_group
#'
#' @description Computes mean, sd and se for each sub-group (indicated by \code{grp})
#' of \code{dv}.
#'
#' @param x A (grouped) data frame.
#' @param dv Name of the dependent variable, for which the mean value, grouped
#' by \code{grp}, is computed.
#' @param grp Factor with the cross-classifying variable, where \code{dv} is
#' grouped into the categories represented by \code{grp}. Numeric vectors
#' are coerced to factors.
#' @param weights Name of variable in \code{x} that indicated the vector of
#' weights that will be applied to weight all observations. Default is
#' \code{NULL}, so no weights are used.
#' @param digits Numeric, amount of digits after decimal point when rounding
#' estimates and values.
#' @param file Destination file, if the output should be saved as file.
#' Only used when \code{out} is not \code{"txt"}.
#' @param encoding Character vector, indicating the charset encoding used
#' for variable and value labels. Default is \code{"UTF-8"}. Only used
#' when \code{out} is not \code{"txt"}.
#' @param out Character vector, indicating whether the results should be printed
#' to console (\code{out = "txt"}) or as HTML-table in the viewer-pane
#' (\code{out = "viewer"}) or browser (\code{out = "browser"}), of if the
#' results should be plotted (\code{out = "plot"}, only applies to certain
#' functions). May be abbreviated.
#'
#' @return For non-grouped data frames, \code{means_by_group()} returns a data frame with
#' following columns: \code{term}, \code{mean}, \code{N}, \code{std.dev},
#' \code{std.error} and \code{p.value}. For grouped data frames, returns
#' a list of such data frames.
#'
#' @details This function performs a One-Way-Anova with \code{dv} as dependent
#' and \code{grp} as independent variable, by calling
#' \code{lm(count ~ as.factor(grp))}. Then \code{\link[emmeans]{contrast}}
#' is called to get p-values for each sub-group. P-values indicate whether
#' each group-mean is significantly different from the total mean.
#'
#' @examples
#' data(efc)
#' means_by_group(efc, c12hour, e42dep)
#'
#' data(iris)
#' means_by_group(iris, Sepal.Width, Species)
#'
#' # also works for grouped data frames
#' if (require("dplyr")) {
#' efc %>%
#' group_by(c172code) %>%
#' means_by_group(c12hour, e42dep)
#' }
#'
#' # weighting
#' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5))
#' means_by_group(efc, c12hour, e42dep, weights = weight)
#' @importFrom sjlabelled get_label drop_labels get_labels
#' @importFrom stats lm na.omit sd weighted.mean
#' @importFrom purrr map_chr map_df
#' @importFrom sjmisc to_value is_empty
#' @importFrom rlang enquo .data quo_name
#' @export
means_by_group <- function(x,
dv,
grp,
weights = NULL,
digits = 2,
out = c("txt", "viewer", "browser"),
encoding = "UTF-8",
file = NULL) {
out <- match.arg(out)
if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) {
message("Package `sjPlot` needs to be loaded to print HTML tables.")
out <- "txt"
}
# create quosures
grp.name <- rlang::quo_name(rlang::enquo(grp))
dv.name <- rlang::quo_name(rlang::enquo(dv))
# weights need extra checking, might be NULL
if (!missing(weights)) {
.weights <- try(rlang::quo_name(rlang::enquo(weights)), silent = TRUE)
if (inherits(.weights, "try-error")) .weights <- NULL
w.string <- try(eval(weights), silent = TRUE)
if (!inherits(w.string, "try-error") && !is.null(w.string) && is.character(w.string)) .weights <- w.string
if (sjmisc::is_empty(.weights) || .weights == "NULL") .weights <- NULL
} else
.weights <- NULL
# create string with variable names
vars <- c(grp.name, dv.name, .weights)
# get data
x <- suppressMessages(dplyr::select(x, !! vars))
# set value and row labels
varGrpLabel <- sjlabelled::get_label(x[[grp.name]], def.value = grp.name)
varCountLabel <- sjlabelled::get_label(x[[dv.name]], def.value = dv.name)
# first, drop unused labels
x[[grp.name]] <- sjlabelled::drop_labels(x[[grp.name]], drop.na = TRUE)
# now get valid value labels
value.labels <- sjlabelled::get_labels(
x[[grp.name]], attr.only = F, values = "n", non.labelled = TRUE
)
# return values
dataframes <- list()
# do we have a grouped data frame?
if (inherits(x, "grouped_df")) {
# get grouped data
grps <- get_grouped_data(x)
# now plot everything
for (i in seq_len(nrow(grps))) {
# copy back labels to grouped data frame
tmp <- sjlabelled::copy_labels(grps$data[[i]], x)
# get grouped means table
dummy <- means_by_group_helper(
x = tmp,
dv = dv.name,
grp = grp.name,
weight.by = .weights,
value.labels = value.labels,
varCountLabel = varCountLabel,
varGrpLabel = varGrpLabel
)
attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = "\n")
# save data frame for return value
dataframes[[length(dataframes) + 1]] <- dummy
}
# add class-attr for print-method()
if (out == "txt")
class(dataframes) <- c("sj_grpmeans", "list")
else
class(dataframes) <- c("sjt_grpmeans", "list")
} else {
dataframes <- means_by_group_helper(
x = x,
dv = dv.name,
grp = grp.name,
weight.by = .weights,
value.labels = value.labels,
varCountLabel = varCountLabel,
varGrpLabel = varGrpLabel
)
# add class-attr for print-method()
if (out == "txt")
class(dataframes) <- c("sj_grpmean", class(dataframes))
else
class(dataframes) <- c("sjt_grpmean", class(dataframes))
}
# save how to print output
attr(dataframes, "print") <- out
attr(dataframes, "encoding") <- encoding
attr(dataframes, "file") <- file
attr(dataframes, "digits") <- digits
dataframes
}
#' @importFrom stats pf lm weighted.mean na.omit sd
#' @importFrom sjmisc to_value add_variables
#' @importFrom emmeans emmeans contrast
#' @importFrom dplyr pull select n_distinct
#' @importFrom purrr map_chr
#' @importFrom rlang .data
means_by_group_helper <- function(x, dv, grp, weight.by, value.labels, varCountLabel, varGrpLabel) {
# copy vectors from data frame
dv <- x[[dv]]
grp <- x[[grp]]
if (!is.null(weight.by))
weight.by <- x[[weight.by]]
else
weight.by <- 1
# convert values to numeric
dv <- sjmisc::to_value(dv)
# create data frame, for emmeans
mydf <- stats::na.omit(data.frame(
dv = dv,
grp = as.factor(grp),
weight.by = weight.by
))
# compute anova statistics for mean table
fit <- stats::lm(dv ~ grp, weights = weight.by, data = mydf)
# p-values of contrast-means
means.p <- fit %>%
emmeans::emmeans(specs = "grp") %>%
emmeans::contrast(method = "eff") %>%
summary() %>%
dplyr::pull("p.value")
## TODO
# efc %>%
# group_by(c172code, c161sex) %>%
# means_by_group(c12hour, e42dep)
# check if value labels length matches group count
if (dplyr::n_distinct(mydf$grp) != length(value.labels)) {
# get unique factor levels and check if these are numeric.
# if so, we match the values from value labels and the remaining
# factor levels, so we get the correct value labels for printing
nl <- unique(mydf$grp)
if (sjmisc::is_num_fac(nl))
value.labels <- value.labels[names(value.labels) %in% levels(nl)]
else
value.labels <- nl
}
# create summary
dat <- mydf %>%
dplyr::group_by(.data$grp) %>%
summarise(
mean = stats::weighted.mean(.data$dv, w = .data$weight.by, na.rm = TRUE),
N = round(sum(.data$weight.by)),
std.dev = weighted_sd(.data$dv, .data$weight.by),
std.error = weighted_se(.data$dv, .data$weight.by)
) %>%
mutate(p.value = means.p) %>%
dplyr::select(-.data$grp)
# finally, add total-row
dat <- dplyr::bind_rows(
dat,
data_frame(
mean = stats::weighted.mean(mydf$dv, w = mydf$weight.by, na.rm = TRUE),
N = nrow(mydf),
std.dev = weighted_sd(mydf$dv, mydf$weight.by),
std.error = weighted_se(mydf$dv, mydf$weight.by),
p.value = NA
)
)
# add row labels
dat <- sjmisc::add_variables(
dat,
term = c(unname(value.labels), "Total"),
.after = -1
)
# get anova statistics for mean table
sum.fit <- summary(fit)
# r-squared values
r2 <- sum.fit$r.squared
r2.adj <- sum.fit$adj.r.squared
# F-statistics
fstat <- sum.fit$fstatistic
pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = F)
# copy as attributes
attr(dat, "r2") <- r2
attr(dat, "adj.r2") <- r2.adj
attr(dat, "fstat") <- fstat[1]
attr(dat, "p.value") <- pval
attr(dat, "dv.label") <- varCountLabel
attr(dat, "grp.label") <- varGrpLabel
dat
}
get_grouped_title <- function(x, grps, i, sep = "\n") {
# create title for first grouping level
tp <- get_title_part(x, grps, 1, i)
title <- sprintf("%s: %s", tp[1], tp[2])
# do we have another groupng variable?
if (length(dplyr::group_vars(x)) > 1) {
tp <- get_title_part(x, grps, 2, i)
title <- sprintf("%s%s%s: %s", title, sep, tp[1], tp[2])
}
# return title
title
}
get_title_part <- function(x, grps, level, i) {
# prepare title for group
var.name <- colnames(grps)[level]
# get values from value labels
vals <- sjlabelled::get_values(x[[var.name]])
# if we have no value labels, get values directly
if (is.null(vals)) {
vals <- unique(x[[var.name]])
lab.pos <- i
} else {
# find position of value labels for current group
lab.pos <- which(vals == grps[[var.name]][i])
}
# get variable and value labels
t1 <- sjlabelled::get_label(x[[var.name]], def.value = var.name)
t2 <- sjlabelled::get_labels(x[[var.name]])[lab.pos]
# if we have no value label, use value instead
if (is.null(t2)) t2 <- vals[lab.pos]
# generate title
c(t1, t2)
}
#' @rdname means_by_group
#' @export
grpmean <- means_by_group
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.