Nothing
#' Compute t-tests
#'
#' Computes t-tests for one group variable and specified test variables.
#' If no variables are specified, all numeric (integer or double) variables are
#' used. A Levene's test will automatically determine whether the pooled variance is used to
#' estimate the variance. Otherwise the Welch (or Satterthwaite) approximation
#' to the degrees of freedom is used.
#'
#' @param data a [tibble][tibble::tibble-package] or a [tdcmm] model
#' @param group_var group variable (column name) to specify where to split two
#' samples (two-sample t-test) or which variable to compare a one-sample
#' t-test on
#' @param ... test variables (column names). Leave empty to compute t-tests for
#' all numeric variables in data. Also leave empty for one-sample t-tests.
#' @param var.equal this parameter is deprecated (previously: a logical variable indicating whether to treat the two
#' variances as being equal. If `TRUE` then the pooled variance is used to
#' estimate the variance otherwise the Welch (or Satterthwaite) approximation
#' to the degrees of freedom is used. Defaults to `TRUE`).
#' @param paired a logical indicating whether you want a paired t-test. Defaults
#' to `FALSE`.
#' @param pooled_sd a logical indicating whether to use the pooled standard
#' deviation in the calculation of Cohen's d. Defaults to `TRUE`.
#' @param levels optional: a vector of length two specifying the two levels of
#' the group variable.
#' @param case_var optional: case-identifying variable (column name). If you
#' set `paired = TRUE`, specifying a case variable will ensure that data
#' are properly sorted for a dependent t-test.
#' @param mu optional: a number indicating the *true* value of the mean in the
#' general population (\eqn{\mu}). If set, a one-sample t-test (i.e., a
#' location test) is being calculated. Leave to `NULL` to calculate
#' two-sample t-test(s).
#'
#' @return a [tdcmm] model
#'
#' @family t-test
#'
#' @examples
#' WoJ %>% t_test(temp_contract, autonomy_selection, autonomy_emphasis)
#' WoJ %>% t_test(temp_contract)
#' WoJ %>% t_test(employment, autonomy_selection, autonomy_emphasis,
#' levels = c("Full-time", "Freelancer"))
#' WoJ %>% t_test(autonomy_selection, mu = 3.62)
#'
#' @export
t_test <- function(data, group_var, ...,
var.equal = TRUE, paired = FALSE, pooled_sd = TRUE,
levels = NULL, case_var = NULL, mu = NULL) {
# Add warning for the var.equal deprecation
if (!missing(var.equal)) {
warning("The 'var.equal' parameter is deprecated and will be removed in future versions. A Levene's test will automatically evaluate whether variances should be treated as equal.",
immediate. = TRUE)
}
# Check if group_var is provided
if (missing(group_var)) {
stop("Please provide at least one variable.")
}
# Get group var name
group_var_str <- as_label(quo({{ group_var }}))
# Drop unused levels (if data is filtered)
data <- droplevels(data)
# run one-sample t-test if requested
if (!is.null(mu)) {
# Get vars
test_vars <- grab_vars(data, quos(...), alternative = "none")
if (length(test_vars) > 0) {
test_vars_string <- purrr::map_chr(test_vars, as_label)
stop(glue("If using the mu argument, a one-sample t-test is being ",
"calculated. This cannot interfere with a two-sample t-test. ",
"Please omit any test variables (i.e., ",
paste(test_vars_string, collapse = ", "),
")."),
call. = FALSE)
} else {
return(one_sample_t_test(data, {{ group_var }}, group_var_str, mu))
}
}
# Get vars
test_vars <- grab_vars(data, quos(...))
test_vars_string <- purrr::map_chr(test_vars, as_label)
# Filter group var if necessary
if (group_var_str %in% test_vars_string) {
test_vars <- syms(test_vars_string[test_vars_string != group_var_str])
}
# if not one-sample, it might be a two-sample t-test without case_var
if (missing(case_var)) {
return(two_sample_t_test(data,
{{ group_var }}, group_var_str,
{{ test_vars }}, test_vars_string,
var.equal, paired, pooled_sd, levels, case_var))
}
# ...or with a provided case_var
return(two_sample_t_test(data,
{{ group_var }}, group_var_str,
{{ test_vars }}, test_vars_string,
var.equal, paired, pooled_sd, levels, ensym(case_var)))
}
#' @rdname visualize
#' @export
visualize.tdcmm_ttst <- function(x, ..., .design = design_lmu()) {
if (attr(x, "func") == "t_test") {
return(visualize_t_test(x, .design))
}
return(warn_about_missing_visualization(x))
}
### Internal functions ###
## Run one-sample t-test
##
## Run actual one-sample or location t-test as specified
##
## @inheritParams t_test
## @param group_var_str Stringified version of group variable
##
## @return a [tdcmm] model
##
## @family t-test
##
## @keywords internal
one_sample_t_test <- function(data, group_var, group_var_str, mu) {
# Prepare data
data_prepared <- data %>%
dplyr::pull({{ group_var }})
if (!is.numeric(data_prepared)) {
stop(glue("Within a one-sample t-test, {group_var_str} must be numeric."),
call. = FALSE)
}
# Compute and Create output
tt <- t.test(data_prepared, mu = mu)
out <- tibble::tibble(
Variable = group_var_str,
M = mean(data_prepared, na.rm = TRUE),
SD = sd(data_prepared, na.rm = TRUE),
CI_95_LL = tt$conf.int[[1]],
CI_95_UL = tt$conf.int[[2]],
Mu = mu,
t = tt$statistic,
df = tt$parameter,
p = tt$p.value
)
# Output
return(new_tdcmm_ttst(
new_tdcmm(out,
func = "t_test",
data = data,
params = list(group_var = group_var_str,
mu = mu),
model = list(tt)))
)
}
## Run two-sample t-test
##
## Run actual two-sample t-test(s) as specified
##
## @inheritParams t_test
## @param group_var_str Stringified version of group variable
## @param test_vars Test variables
##
## @return a [tdcmm] model
##
## @family t-test
##
## @keywords internal
two_sample_t_test <- function(data, group_var, group_var_str, test_vars,
test_vars_str,
var.equal, paired, pooled_sd, levels, case_var) {
if (is.null(levels)) {
# Get levels
levels <- data %>%
dplyr::pull({{ group_var }}) %>%
na.omit() %>%
unique() %>%
as.character()
# Check
if (length(levels) < 2) {
stop("Grouping variable must have more than one level", call. = FALSE)
} else if (length(levels) > 2) {
warning(glue("{group_var_str} has more than 2 levels, defaulting to first two ",
"({levels[1]} and {levels[2]}). ",
"Consider filtering your data ",
"or setting levels with the levels argument"),
call. = FALSE)
data <- data %>%
dplyr::filter({{ group_var }} %in% levels[1:2]) %>%
droplevels()
}
} else if (length(levels) != 2) {
stop("If using the levels argument, please provide exactly two levels",
call. = FALSE)
} else if (!all(levels %in% unique(data[[group_var_str]]))) {
stop("At least one level specified in the levels argument not found in data",
call. = FALSE)
}
# Prepare data
levels <- levels[1:2]
if (!is.null(case_var)) {
case_var_sym <- rlang::sym(case_var)
data <- data %>%
dplyr::arrange({{ group_var }}, !!case_var_sym)
}
data <- dplyr::select(data, {{ group_var }}, !!!test_vars)
# Main function
model_list_t <- list()
model_list_levene <- list()
out_t <- NULL
out_levene <- NULL
for (test_var in test_vars) {
# Split data
x <- data %>%
dplyr::filter({{ group_var }} == levels[1]) %>%
dplyr::pull({{ test_var }})
y <- data %>%
dplyr::filter({{ group_var }} == levels[2]) %>%
dplyr::pull({{ test_var }})
# Compute Levene test
test_var_string <- as_label(enquo(test_var))
levene_test <- suppressWarnings(
car::leveneTest(as.formula(as.formula(
paste0("`", test_var_string, "` ~ `", group_var_str, "`"))),
data = data)
)
levene_row <- data.frame(
Variable = as.character(test_var),
Levene_p = round(levene_test$`Pr(>F)`[1], digits = 3)
)
# Compute output based on Levene's test
if (levene_row$Levene_p < 0.05) {
# Unequal variances, use Welch's ttest
equal_var_assumption <- FALSE
tt <- t.test(x, y, var.equal = FALSE, paired = paired)
} else {
# Equal variances, use regular ttest
equal_var_assumption <- TRUE
tt <- t.test(x, y, var.equal = TRUE, paired = paired)
}
# Create output
tt_row <- format_t_test(!!test_var, tt, x, y, levels, pooled_sd)
# Collect t_test
model_list_t[[length(model_list_t) + 1]] <- tt
out_t <- out_t %>%
dplyr::bind_rows(tt_row)
# Collect levene test
if (equal_var_assumption == FALSE)
levene_row <- levene_row %>%
dplyr::mutate(var_equal = "FALSE")
else {
levene_row <- levene_row %>%
dplyr::mutate(var_equal = "TRUE")
}
model_list_levene[[length(model_list_levene) + 1]] <- levene_row
out_levene <- out_levene %>%
dplyr::bind_rows(levene_row)
}
if (levene_row$Levene_p < 0.05) {
# Unequal variances, Welch's ttest used
message(glue("The significant result from Levene's test suggests unequal variances among the groups, violating standard t-test assumptions. This necessitates the use of Welch approximation to the degrees of freedom, which is robust against heteroscedasticity."))
}
out <- dplyr::full_join(out_t, out_levene, by = "Variable")
# Output
return(new_tdcmm_ttst(
new_tdcmm(out,
func = "t_test",
data = data,
params = list(group_var = group_var_str,
vars = test_vars_str,
var.equal = var.equal,
paired = paired,
pooled_sd = pooled_sd,
levels = levels,
case_var = case_var),
model = model_list_t))
)
}
## Format computed t-test
##
## Outputs a t-test for one test variable
##
## @inheritParams t_test
## @param test_var Test variable
## @param tt [htest] t.test object as returned from [compute_t_test]
## @param x splitted x part of the data
## @param y splitted y part of the data
##
## @return a [tibble][tibble::tibble-package]
##
## @family t-test
##
## @keywords internal
format_t_test <- function(test_var, tt, x, y, levels, pooled_sd) {
# Get names
level_names <- levels %>%
stringr::str_replace_all(" ", "_") %>%
stringr::str_sub(1, 10)
M_str <- paste("M", level_names, sep = "_")
SD_str <- paste("SD", level_names, sep = "_")
test_var_str <- as_label(quo({{ test_var }}))
tibble::tibble(
Variable = test_var_str,
!!M_str[1] := pillar::num(mean(x, na.rm = TRUE), digits = 3),
!!SD_str[1] := pillar::num(sd(x, na.rm = TRUE), digits = 3),
!!M_str[2] := pillar::num(mean(y, na.rm = TRUE), digits = 3),
!!SD_str[2] := pillar::num(sd(y, na.rm = TRUE), digits = 3),
Delta_M = pillar::num(mean(x, na.rm = TRUE) - mean(y, na.rm = TRUE), digits = 3),
t = pillar::num(tt$statistic, digits = 3),
df = round(tt$parameter, digits = 0),
p = pillar::num(tt$p.value, digits = 3),
d = pillar::num(cohens_d(x, y, pooled_sd, na.rm = TRUE), digits = 3)
)
}
## Compute Cohen's d
##
## Computes the effect size estimate Cohen's d for two sets of numerical values
##
## @param x a (non-empty) numeric vector of data values.
## @param y a (non-empty) numeric vector of data values.
## @param pooled_sd a logical indicating whether to use the pooled standard
## deviation in the calculation of Cohen's d. Defaults to `TRUE`.
## @param na.rm a logical value indicating whether NA values should be stripped
## before the computation proceeds. Defaults to `TRUE`.
##
## @return a `dbl`
##
## @family t-test
##
## @keywords internal
cohens_d <- function(x, y, pooled_sd = TRUE, na.rm = TRUE) {
nx <- length(!is.na(x))
ny <- length(!is.na(y))
mx <- mean(x, na.rm = na.rm)
my <- mean(y, na.rm = na.rm)
varx <- var(x, na.rm = na.rm)
vary <- var(y, na.rm = na.rm)
if (pooled_sd) {
s <- sqrt(((nx - 1) * varx + (ny - 1) * vary) / (nx + ny - 2))
} else {
s <- sqrt((varx + vary) / 2)
}
(mx - my) / s
}
## Visualize `t_test()` as points with 95% CI ranges
##
## @param x a [tdcmm] model
##
## @return a [ggplot2] object
##
## @family tdcmm visualize
#
## @keywords internal
visualize_t_test <- function(x, design = design_lmu()) {
if ("mu" %in% names(attr(x, "params"))) {
return(warn_about_missing_visualization(x))
}
# get variables
group_var_str <- attr(x, "params")$group_var
group_var <- sym(group_var_str)
test_vars_str <- attr(x, "params")$vars
test_vars_str <- test_vars_str[test_vars_str != group_var_str]
test_vars <- syms(test_vars_str)
data <- attr(x, "data")
levels <- attr(x, "params")$levels
level_names <- levels %>%
stringr::str_replace_all(" ", "_") %>%
stringr::str_sub(1, 10)
n_str <- paste("N", level_names, sep = "_")
# collect n
n <- NULL
for (test_var_str in test_vars_str) {
n <- n %>%
rbind(tibble::tibble(Variable = test_var_str,
!!n_str[1] := (
data %>%
dplyr::filter({{ group_var }} == levels[1]) %>%
nrow()
),
!!n_str[2] := (
data %>%
dplyr::filter({{ group_var }} == levels[2]) %>%
nrow()
)))
}
# merge
data <- x %>%
dplyr::select(-c("Delta_M", "t", "df", "p", "d")) %>%
dplyr::left_join(n, by = "Variable") %>%
tidyr::pivot_longer(tidyselect::ends_with(level_names),
names_to = "level") %>%
dplyr::mutate(var = stringr::str_split_i(.data$level, "_", 1),
level = stringr::str_split_i(.data$level, "_", 2)) %>%
tidyr::pivot_wider(names_from = "var",
values_from = "value") %>%
dplyr::mutate(ci_95_ll = calculate_ci_ll(.data$M, .data$SD, .data$N),
ci_95_ul = calculate_ci_ul(.data$M, .data$SD, .data$N))
# build graph
data %>%
ggplot2::ggplot(ggplot2::aes(xmin = .data$ci_95_ll,
x = .data$M,
xmax = .data$ci_95_ul,
y = .data$Variable,
color = .data$level)) +
ggplot2::geom_pointrange(stat = "identity",
position = ggplot2::position_dodge2(width = 0.9),
linewidth = design$main_size) +
ggplot2::scale_x_continuous(NULL,
n.breaks = 8) +
ggplot2::scale_y_discrete(NULL) +
ggplot2::scale_color_manual(NULL,
values = design$main_colors,
guide = ggplot2::guide_legend(reverse = TRUE)) +
design$theme() +
ggplot2::theme(legend.position = "bottom")
}
# Constructors ----
new_tdcmm_ttst <- function(x) {
stopifnot(is_tdcmm(x))
structure(
x,
class = c("tdcmm_ttst", class(x))
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.