Nothing
#' Output effect sizes and test statistics for count data
#'
#' @description
#' The type of effect size depends on the number of selected columns:
#' - One categorical column: see \link{effect_counts_one}
#' - Multiple categorical columns: see \link{effect_counts_items}
#'
#' Cross tabulations:
#'
#' - One categorical column and one grouping column: see \link{effect_counts_one_grouped}
#' - Multiple categorical columns and one grouping column: see \link{effect_counts_items_grouped} (not yet implemented)
#' - Multiple categorical columns and multiple grouping columns: \link{effect_counts_items_grouped_items} (not yet implemented)
#'
#' By default, if you provide two column selections, the second column is treated as categorical.
#' Setting the metric-parameter to TRUE will call the appropriate functions for correlation analysis:
#'
#' - One categorical column and one metric column: see \link{effect_counts_one_cor} (not yet implemented)
#' - Multiple categorical columns and one metric column: see \link{effect_counts_items_cor} (not yet implemented)
#' - Multiple categorical columns and multiple metric columns:\link{effect_counts_items_cor_items} (not yet implemented)
#'
#' `r lifecycle::badge("experimental")`
#'
#' @param data A data frame.
#' @param cols A tidy column selection,
#' e.g. a single column (without quotes)
#' or multiple columns selected by methods such as starts_with().
#' @param cross Optional, a grouping column. The column name without quotes.
#' @param metric When crossing variables, the cross column parameter can contain categorical or metric values.
#' By default, the cross column selection is treated as categorical data.
#' Set metric to TRUE, to treat it as metric and calculate correlations.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Other parameters passed to the appropriate effect function.
#' @return A volker tibble.
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_counts(data, sd_gender, adopter)
#'
#' @export
effect_counts <- function(data, cols, cross = NULL, metric = FALSE, clean = TRUE, ...) {
# Check
check_is_dataframe(data)
# 2. Clean
if (clean) {
data <- data_clean(data, clean)
}
# Find columns
cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
cross_eval <- tidyselect::eval_select(expr = enquo(cross), data = data)
is_items <- length(cols_eval) > 1
is_grouped <- length(cross_eval) == 1
is_multi <- length(cross_eval) > 1
is_metric <- metric != FALSE
# Single variables
if (!is_items && !is_grouped && !is_multi) {
effect_counts_one(data, {{ cols }}, ...)
}
else if (!is_items && is_grouped && !is_metric) {
effect_counts_one_grouped(data, {{ cols }}, {{ cross }}, ...)
}
else if (!is_items && is_grouped && is_metric) {
effect_counts_one_cor(data, {{ cols }}, {{ cross }}, ...)
}
# Items
else if (is_items && !is_grouped && !is_multi) {
effect_counts_items(data, {{ cols }} , ...)
}
else if (is_items && is_grouped && !is_metric) {
effect_counts_items_grouped(data, {{ cols }}, {{ cross }}, ...)
}
else if (is_items && is_grouped && is_metric) {
effect_counts_items_cor(data, {{ cols }}, {{ cross }}, ...)
}
else if (is_items && !is_grouped && is_multi && !is_metric) {
effect_counts_items_grouped_items(data, {{ cols }}, {{ cross }}, ...)
}
# Not found
else {
stop("Check your parameters: the column selection is not yet supported by volker functions.")
}
}
#' Output effect sizes and test statistics for metric data
#'
#' @description
#' The calculations depend on the number of selected columns:
#'
#' - One metric column: see \link{effect_metrics_one}
#' - Multiple metric columns: see \link{effect_metrics_items}
#'
#' Group comparisons:
#'
#' - One metric column and one grouping column: see \link{effect_metrics_one_grouped}
#' - Multiple metric columns and one grouping column: see \link{effect_metrics_items_grouped}
#' - Multiple metric columns and multiple grouping columns: not yet implemented
#'
#' By default, if you provide two column selections, the second column is treated as categorical.
#' Setting the metric-parameter to TRUE will call the appropriate functions for correlation analysis:
#'
#' - Two metric columns: see \link{effect_metrics_one_cor}
#' - Multiple metric columns and one metric column: see \link{effect_metrics_items_cor}
#' - Two metric column selections: see \link{effect_metrics_items_cor_items}
#'
#' `r lifecycle::badge("experimental")`
#'
#' @param data A data frame.
#' @param cols A tidy column selection,
#' e.g. a single column (without quotes)
#' or multiple columns selected by methods such as starts_with().
#' @param cross Optional, a grouping column (without quotes).
#' @param metric When crossing variables, the cross column parameter can contain categorical or metric values.
#' By default, the cross column selection is treated as categorical data.
#' Set metric to TRUE, to treat it as metric and calculate correlations.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Other parameters passed to the appropriate effect function.
#' @return A volker tibble.
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics(data, sd_age, sd_gender)
#'
#' @export
effect_metrics <- function(data, cols, cross = NULL, metric = FALSE, clean = TRUE, ...) {
# Check
check_is_dataframe(data)
# 2. Clean
if (clean) {
data <- data_clean(data, clean)
}
# Find columns
cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
cross_eval <- tidyselect::eval_select(expr = enquo(cross), data = data)
is_items <- length(cols_eval) > 1
is_grouped <- length(cross_eval)== 1
is_multi <- length(cross_eval) > 1
is_metric <- metric != FALSE
# Single variables
if (!is_items && !is_grouped && !is_multi) {
effect_metrics_one(data, {{ cols }}, ...)
}
else if (!is_items && is_grouped && !is_metric) {
effect_metrics_one_grouped(data, {{ cols }}, {{ cross }}, ...)
}
else if (!is_items && is_grouped && is_metric) {
effect_metrics_one_cor(data, {{ cols }}, {{ cross }}, ...)
}
# Items
else if (is_items && !is_grouped && !is_multi) {
effect_metrics_items(data, {{ cols }} , ...)
}
else if (is_items && is_grouped && !is_metric) {
effect_metrics_items_grouped(data, {{ cols }}, {{ cross }}, ...)
}
else if (is_items && is_grouped && is_metric) {
effect_metrics_items_cor(data, {{ cols }}, {{ cross }}, ...)
}
else if (is_items && !is_grouped && is_multi && is_metric) {
effect_metrics_items_cor_items(data, {{ cols }}, {{ cross }}, ...)
}
# Not found
else {
stop("Check your parameters: the column selection is not yet supported by volker functions.")
}
}
#' Test homogeneity of category shares
#'
#' Performs a goodness-of-fit test and calculates the Gini coefficient.
#' The goodness-of-fit-test is calculated using \code{stats::\link[stats:chisq.test]{chisq.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding factor values.
#' @param clean Prepare data by \link{data_clean}
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble with the following statistical measures:
#' - **Gini coefficient**: Gini coefficient, measuring inequality.
#' - **n**: Number of cases the calculation is based on.
#' - **Chi-squared**: Chi-Squared test statistic.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' data |>
#' filter(sd_gender != "diverse") |>
#' effect_counts_one(sd_gender)
#'
#' @export
#' @importFrom rlang .data
effect_counts_one <- function(data, col, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ col }}, cols.categorical = {{ col }}, clean = clean)
# 2. Chi-squared test
counts <- data %>%
dplyr::count({{ col }}) %>%
dplyr::pull(.data$n)
fit <- stats::chisq.test(counts)
# 3. Result
result <- list(
"Gini coefficient" = sprintf("%.2f", get_gini(counts)),
"n" = as.character(sum(counts)),
"Chi-squared" = sprintf("%.2f", round(fit$statistic, 2)),
"p" = sprintf("%.3f", round(fit$p.value, 3)),
"stars" = get_stars(fit$p.value)
) %>%
tibble::enframe(
name = "Statistic",
value = "Value"
)
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, caption = fit$method)
}
#' Output test statistics and effect size for contingency tables
#'
#' Chi squared is calculated using \code{stats::\link[stats:chisq.test]{chisq.test}}.
#' If any cell contains less than 5 observations, the exact-parameter is set.
#'
#' Phi is derived from the Chi squared value by \code{sqrt(fit$statistic / n)}.
#' Cramer's V is derived by \code{sqrt(phi / (min(dim(contingency)[1], dim(contingency)[2]) - 1))}.
#' Cramer's V is set to 1.0 for diagonal contingency matrices, indicating perfect association.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding factor values.
#' @param cross The column holding groups to compare.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker list with two volker tibbles.
#' The first tibble contains npmi values for each combinations:
#' - n Number the combination occurs.
#' - p_x Marginal share of the first category.
#' - p_y Marginal share of the second category.
#' - p_xy Share of the combination based on all combinations.
#' - ratio The ratio of p_xy to (p_x * p_y).
#' - pmi Pointwise Mutual information, derived from the ratio.
#' - npmi Normalized Pointwise Mutual Information, derived from the pmi.
#'
#' The second tibble contains effect sizes based on the cross table:
#'
#' - Cramer's V: Effect size measuring the association between the two variables.
#' - n: Number of cases the calculation is based on.
#' - Chi-squared: Chi-Squared test statistic.
#' If expected values are below 5 in at least one cell, an exact Fisher test is conducted.
#' - df: Degrees of freedo of the chi-squared test. Empty for the exact Fisher test.
#' - p: p-value of the chi-squared test.
#' - stars: Significance stars based on p-value (*, **, ***).
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_counts_one_grouped(data, adopter, sd_gender)
#'
#' @importFrom rlang .data
#' @export
effect_counts_one_grouped <- function(data, col, cross, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ col }}, {{ cross }}, cols.categorical = c({{ col }}, {{ cross }}), clean = clean)
# 2. NPMI
result_items <- get_npmi(data, {{ col }}, {{ cross }}, ...)
result_items$p_y_if_x <- NULL
result_items$p_x_if_y <- NULL
# 3. Cramer's V
result_stats <- get_correlation(
dplyr::pull(data, {{ col }}),
dplyr::pull(data, {{ cross }}),
method = "cramer"
)
result_stats$p = sprintf("%.3f", round(result_stats$p, 3))
result_stats$stars = get_stars(result_stats$p)
result_stats <- tibble::enframe(
result_stats,
name = "Statistic",
value = "Value"
)
# 4. Assemble results
result <- c(
list(.to_vlkr_tab(result_items, digits=2)),
list(.to_vlkr_tab(result_stats, digits=2))
)
result <- .attr_transfer(result, data, "missings")
.to_vlkr_list(result)
}
#' Output test statistics and effect size from a logistic regression of one metric predictor
#'
#' \strong{Not yet implemented. The future will come.}
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding factor values.
#' @param cross The column holding metric values.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @importFrom rlang .data
effect_counts_one_cor <- function(data, col, cross, clean = TRUE, labels = TRUE, ...) {
warning("Not implemented yet. The future will come.", noBreaks. = TRUE)
}
#' Test homogeneity of category shares for multiple items
#'
#' Performs a goodness-of-fit test and calculates the Gini coefficient for each item.
#' The goodness-of-fit-test is calculated using \code{stats::\link[stats:chisq.test]{chisq.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble with the following statistical measures:
#'
#' - **Gini coefficient**: Gini coefficient, measuring inequality.
#' - **n**: Number of cases the calculation is based on.
#' - **Chi-squared**: Chi-Squared test statistic.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_counts_items(data, starts_with("cg_adoption_adv"))
#'
#' @export
#' @importFrom rlang .data
effect_counts_items <- function(data, cols, adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, cols.categorical = {{ cols }}, clean = clean)
# 2. Count
counts <- data %>%
labs_clear({{ cols }}) %>%
tidyr::pivot_longer(
{{ cols }},
names_to = "item",
values_to = ".value",
values_drop_na = TRUE
) %>%
dplyr::count(.data$item, .data$.value) %>%
dplyr::group_by(.data$item) %>%
dplyr::reframe(n = list(.data$n)) %>%
tibble::deframe()
# 3. Chi-square goodness-of-fit test for each item
result <- purrr::imap(
counts,
\(.x, .y) {
chi <- stats::chisq.test(.x)
list(
"item" = .y,
"Gini coefficient" = get_gini(.x),
"n" = sum(.x),
"Chi-squared" = chi$statistic,
"p" = chi$p.value
)
}) %>%
dplyr::bind_rows()
# Adjust
result <- adjust_p(result, "p", method = adjust)
# Round
result <- .attr_setcolumn(
result, tidyselect::all_of(c("Gini coefficient", "Chi-squared")),
"round", 2
)
# 3. Get variable caption from the attributes
if (labels) {
result <- labs_replace(result, "item", codebook(data, {{ cols }}), col_from="item_name", col_to="item_label")
prefix <- get_prefix(result$item)
result <- dplyr::mutate(result, item = trim_prefix(.data$item, prefix))
}
# 4. Rename first column
if (prefix != "") {
colnames(result)[1] <- prefix
} else {
result <- dplyr::rename(result, Item = tidyselect::all_of("item"))
}
# 5. Result
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result)
}
#' Effect size and test for comparing multiple variables by a grouping variable
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures and grouping variable.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The column holding groups to compare.
#' @param method The output metrics, currently only `cramer` is supported.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_counts_items_grouped(
#' data, starts_with("cg_adoption_adv"), sd_gender
#' )
#'
#' @export
#' @importFrom rlang .data
effect_counts_items_grouped <- function(data, cols, cross, method = "cramer", adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, {{ cross }}, cols.categorical = c({{ cols }}, {{ cross }}), clean = clean)
check_is_param(method, c("cramer", "npmi"))
# 2. Calculate correlations
result <- .effect_correlations(data, {{ cols }}, {{ cross }}, method = method, adjust = adjust, labels = labels)
# 3. Labels
prefix1 <- get_prefix(result$item1)
if (labels) {
prefix2 <- get_title(data, {{ cross }})
} else {
prefix2 <- rlang::as_string(rlang::ensym(cross))
}
result <- result %>%
dplyr::mutate(item1 = trim_prefix(.data$item1, prefix1)) %>%
dplyr::select(-tidyselect::all_of("item2"))
# Rename first column
if (prefix1 != "") {
colnames(result)[1] <- paste0(prefix1, ": Correlation with ", prefix2)
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, digits = 2)
}
#' Effect size and test for comparing multiple variables by multiple grouping variables
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures and grouping variable.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The columns holding groups to compare.
#' @param method The output metrics: cramer = Cramer's V, pmi = Pointwise Mutual Information, npmi = Normalized PMI.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_counts(
#' data,
#' starts_with("cg_adoption_adv"),
#' starts_with("use_")
#')
#'
#' @export
#' @importFrom rlang .data
effect_counts_items_grouped_items <- function(data, cols, cross, method = "cramer", adjust = "fdr", category = NULL, labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, {{ cross }}, cols.categorical = c({{ cols }}, {{ cross }}), clean = clean)
check_is_param(method, c("cramer", "npmi"))
# 2. Calculate correlations
result <- .effect_correlations(data, {{ cols }}, {{ cross }}, method = method, adjust = adjust, category = category, labels = labels)
# 3. Labels
prefix1 <- get_prefix(result$item1)
prefix2 <- get_prefix(result$item2)
result <- result %>%
dplyr::mutate(item1 = trim_prefix(.data$item1, prefix1)) |>
dplyr::mutate(item2 = trim_prefix(.data$item2, prefix2))
if ((prefix1 == prefix2) && (prefix1 != "")) {
prefix1 <- paste0("Item 1: ", trim_label(prefix1))
prefix2 <- paste0("Item 2: ", trim_label(prefix2))
}
# Rename first column
if (prefix1 != "") {
colnames(result)[1] <- prefix1
}
# Rename second column
if (prefix2 != "") {
colnames(result)[2] <- prefix2
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, digits = 2)
}
#' Correlate the values in multiple items with one metric column and output effect sizes and tests
#'
#' \strong{Not yet implemented. The future will come.}
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The metric column.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @importFrom rlang .data
effect_counts_items_cor <- function(data, cols, cross, clean = TRUE, ...) {
warning("Not implemented yet. The future will come.", noBreaks. = TRUE)
}
#' Correlate the values in multiple items with multiple metric columns and output effect sizes and tests
#'
#' \strong{Not yet implemented. The future will come.}
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The metric target columns.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @importFrom rlang .data
effect_counts_items_cor_items <- function(data, cols, cross, clean = TRUE, ...) {
warning("Not implemented yet. The future will come.", noBreaks. = TRUE)
}
#' Test whether a distribution is normal
#'
#' The test is calculated using \code{stats::\link[stats:shapiro.test]{shapiro.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding metric values.
#' @param clean Prepare data by \link{data_clean}.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker list object with the following statistical measures:
#'
#' - **skewness**: Measure of asymmetry in the distribution. A value of 0 indicates perfect symmetry.
#' - **kurtosis**: Measure of the "tailedness" of the distribution.
#' - **W**: W-statistic from the Shapiro-Wilk normality test.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#' - **normality**: Interpretation of normality based on Shapiro-Wilk test.
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_one(data, sd_age)
#'
#' @export
#' @importFrom rlang .data
effect_metrics_one <- function(data, col, labels = TRUE, clean = TRUE, ... ) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ col }}, cols.numeric = {{ col }}, clean = clean)
# 2. Normality test
stats <- dplyr::select(data, av = {{ col }})
stats_shapiro <- stats::shapiro.test(stats$av)
stats_shapiro <- list(
"W" = sprintf("%.2f", round(stats_shapiro$statistic, 2)),
"p" = sprintf("%.3f", round(stats_shapiro$p.value, 3)),
"stars" = get_stars(stats_shapiro$p.value),
"normality" = ifelse(stats_shapiro$p.value > 0.05, "normal", "not normal")
) |>
tibble::enframe(
name = "Shapiro-Wilk normality test",
value = "Value"
)
# 3. Skewness and kurtosis
stats_skew <- psych::describe(stats$av)
stats <- list(
"skewness" = sprintf("%.2f", round(stats_skew$skew, 2)),
"kurtosis" = sprintf("%.2f", round(stats_skew$kurtosis, 2))
) |>
tibble::enframe(
name = "Metric",
value = "Value"
)
# 4. Get item label from the attributes
if (labels) {
label <- get_title(data, {{ col }})
stats <- dplyr::rename(stats, {{ label }} := "Metric")
}
# 5. Results
result <- c(
list(.to_vlkr_tab(stats, digits=2)),
list(.to_vlkr_tab(stats_shapiro, digits=2))
)
result <- .attr_transfer(result, data, "missings")
.to_vlkr_list(result)
}
#' Output a regression table with estimates and macro statistics
#'
#' The regression output comes from \code{stats::\link[stats:lm]{lm}}.
#' T-test is performed using \code{stats::\link[stats:t.test]{t.test}}.
#' Normality check is performed using
#' \code{stats::\link[stats:shapiro.test]{shapiro.test}}.
#' Equality of variances across groups is assessed using \code{car::\link[car:leveneTest]{leveneTest}}.
#' Cohen's d is calculated using \code{effectsize::\link[effectsize:cohens_d]{cohens_d}}.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding metric values.
#' @param cross The column holding groups to compare.
#' @param method A character vector of methods, e.g. c("t.test","lm").
#' Supported methods are t.test (only valid if the cross column contains two levels)
#' and lm (regression results).
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker list object containing volker tables with the requested statistics.
#'
#' Regression table:
#' - **estimate**: Regression coefficient (unstandardized).
#' - **ci low / ci high**: lower and upper bound of the 95% confidence interval.
#' - **se**: Standard error of the estimate.
#' - **t**: t-statistic.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#'
#' Macro statistics:
#' - **Adjusted R-squared**: Adjusted coefficient of determination.
#' - **F**: F-statistic for the overall significance of the model.
#' - **df**: Degrees of freedom for the model.
#' - **residual df**: Residual degrees of freedom.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#'
#' If `method = t.test`:
#' ### Shapiro-Wilk test (normality check):
#' - **W**: W-statistic from the Shapiro-Wilk normality test.
#' - **p**: p-value for the test.
#' - **normality**: Interpretation of the Shapiro-Wilk test.
#'
#' ### Levene test (equality of variances):
#' - **F**: F-statistic from the Levene test for equality of variances between groups.
#' - **p**: p-value for Levene's test.
#' - **variances**: Interpretation of the Levene test.
#'
#' ### Cohen's d (effect size):
#' - **d**: Standardized mean difference between the two groups.
#' - **ci low / ci high**: Lower and upper bounds of the 95% confidence interval.
#'
#' ### t-test
#' - **method**: Type of t-test performed (e.g., "Two Sample t-test").
#' - **difference**: Observed difference between group means.
#' - **ci low / ci high**: Lower and upper bounds of the 95% confidence interval.
#' - **se**: Estimated standard error of the difference.
#' - **df**: Degrees of freedom used in the t-test.
#' - **t**: t-statistic.
#' - **p**: p-value for the t-test.
#' - **stars**: Significance stars based on p-value (`*`, `**`, `***`).
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_one_grouped(data, sd_age, sd_gender)
#'
#' @export
#' @importFrom rlang .data
effect_metrics_one_grouped <- function(data, col, cross, method = "lm", adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ col }}, {{ cross }}, cols.categorical = {{ cross }}, cols.numeric = {{ col }}, clean = clean)
check_is_param(method, c("lm", "t.test"))
# 2. Calculate
result <- list()
lm_data <- dplyr::select(data, av = {{ col }}, uv = {{ cross }})
# t.test
if ("t.test" %in% method) {
if (length(unique(lm_data$uv)) != 2) {
stop("Check your parameters: the t.test method is only allowed for comparing two groups.")
}
stats_shapiro <- stats::shapiro.test(lm_data$av)
stats_levene <- car::leveneTest(lm_data$av, group = lm_data$uv)
stats_varequal = stats_levene[["Pr(>F)"]][1] > 0.05
stats_cohen <- effectsize::cohens_d(lm_data$av, lm_data$uv, pooled_sd = stats_varequal, paired=FALSE)
stats_t <- stats::t.test(lm_data$av ~ lm_data$uv, var.equal = stats_varequal)
stats_t <- list(
"Shapiro-Wilk normality test" = list(
"W" = sprintf("%.2f", stats_shapiro$statistic),
"p" = sprintf("%.3f", stats_shapiro$p.value),
"stars" = get_stars(stats_shapiro$p.value),
"normality" = ifelse(stats_shapiro$p.value > 0.05, "normal", "not normal")
),
"Levene test" = list(
"F" = sprintf("%.2f", stats_levene[["F value"]][1]),
"p" = sprintf("%.3f", stats_levene[["Pr(>F)"]][1]),
"stars" = get_stars(stats_levene[["Pr(>F)"]][1]),
"variances" = ifelse(stats_varequal, "equal", "not equal")
),
"Cohen's d" = list(
"d" = sprintf("%.1f",round(stats_cohen$Cohens_d, 1)),
"ci low" = sprintf("%.1f",round(stats_cohen$CI_low, 1)),
"ci high" = sprintf("%.1f",round(stats_cohen$CI_high, 1))
),
"t-Test" = list(
"method" = stats_t$method,
"difference" = sprintf("%.2f", round(stats_t$estimate[1] - stats_t$estimate[2], 2)),
"ci low" = sprintf("%.2f", round(stats_t$conf.int[1], 2)),
"ci high" = sprintf("%.2f",round(stats_t$conf.int[2], 2)),
"se" = sprintf("%.2f",round(stats_t$stderr,2)),
"df" = round(stats_t$parameter,2),
"t" = sprintf("%.2f",round(stats_t$statistic,2)),
"p" = sprintf("%.3f",round(stats_t$p.value,3)),
"stars" = get_stars(stats_t$p.value)
)
) %>%
tibble::enframe(
name = "Test",
value = "Results"
)
stats_t <- stats_t |>
tidyr::unnest_longer(
tidyselect::all_of("Results"),
indices_to="Statistic",
values_to="Value",
transform=as.character
) |>
dplyr::select("Test","Statistic","Value")
result <- c(result, list(.to_vlkr_tab(stats_t)))
}
# Regression model
# TODO: should we omit the intercept from adjusting p values?
else if ("lm" %in% method) {
fit <- stats::lm(av ~ uv, data = lm_data)
# Regression parameters
lm_params <- tidy_lm_levels(fit)
lm_params <- lm_params |>
dplyr::mutate(
Term = .data$term,
estimate = .data$estimate,
"ci low" = .data$conf.low,
"ci high" = .data$conf.high,
"se" = .data$std.error,
t = .data$statistic,
p = .data$p.value
)
# Adjust and round
lm_params <- lm_params %>%
adjust_p("p", method = adjust) %>%
.attr_setcolumn(
tidyselect::all_of(c("estimate", "ci low", "ci high", "se", "t")),
"round", 2
)
lm_params <- lm_params %>%
dplyr::select(tidyselect::all_of(c(
"Term","estimate","ci low","ci high","se","t","p","stars"
)))
# Regression model statistics
lm_model <- broom::glance(fit) |>
dplyr::mutate(stars = get_stars(.data$p.value)) |>
data_round("p.value", 3) %>%
dplyr::mutate(dplyr::across(tidyselect::any_of(c("n", "df", "df.residual")), function(x) as.character(x))) |>
data_round(tidyselect::where(is.numeric), 2) %>%
tidyr::pivot_longer(
tidyselect::everything(),
names_to="Statistic",
values_to="Value"
) |>
labs_replace("Statistic", tibble::tibble(
value_name=c(
"adj.r.squared", "statistic", "df", "df.residual",
"p.value", "stars"
),
value_label=c(
"Adjusted R-squared","F", "df", "residual df",
"p", "stars"
)
), na.missing = TRUE) |>
stats::na.omit() |>
dplyr::arrange(.data$Statistic)
result <- c(
result,
list(.to_vlkr_tab(lm_params)),
list(.to_vlkr_tab(lm_model))
)
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_list(result)
}
#' Test whether the correlation is different from zero
#'
#' The correlation is calculated using \code{stats::\link[stats:cor.test]{cor.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding metric values.
#' @param cross The column holding metric values to correlate.
#' @param method The output metrics, TRUE or pearson = Pearson's R, spearman = Spearman's rho.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker table containing the requested statistics.
#'
#' If `method = "pearson"`:
#' - **R-squared**: Coefficient of determination.
#' - **n**: Number of cases the calculation is based on.
#' - **Pearson's r**: Correlation coefficient.
#' - **ci low / ci high**: Lower and upper bounds of the 95% confidence interval.
#' - **df**: Degrees of freedom.
#' - **t**: t-statistic.
#' - **p**: p-value for the statistical test, indicating whether the correlation differs from zero.
#' - **stars**: Significance stars based on the p-value (*, **, ***).
#'
#' If `method = "spearman"`:
#' - **Spearman's rho** is displayed instead of Pearson's r.
#' - **S-statistic** is used instead of the t-statistic.
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_one_cor(data, sd_age, use_private, metric = TRUE)
#'
#' @export
#' @importFrom rlang .data
effect_metrics_one_cor <- function(data, col, cross, method = "pearson", adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ col }}, {{ cross }}, cols.numeric = c({{ col }}, {{ cross }}), clean = clean)
check_is_param(method, c("pearson", "spearman"))
# 2. Calculate correlation
result <- .effect_correlations(data, {{ col }}, {{ cross}}, adjust = adjust, method = method, labels = labels)
# 3. Labeling
# Remove common prefix
prefix <- get_prefix(c(result$item1, result$item2))
result <- dplyr::mutate(result, item1 = trim_prefix(.data$item1, prefix))
result <- dplyr::mutate(result, item2 = trim_prefix(.data$item2, prefix))
if (prefix == "") {
prefix <- "Item"
}
result <- result %>%
dplyr::rename("Item 1" = tidyselect::all_of("item1")) |>
dplyr::rename("Item 2" = tidyselect::all_of("item2"))
if (prefix == "") {
title <- NULL
}
result <- result |>
data_round("p", 3) %>%
dplyr::mutate(dplyr::across(tidyselect::any_of(c("n", "df", "df.residual")), function(x) as.character(x))) |>
data_round(tidyselect::where(is.numeric), 2) %>%
#dplyr::mutate(dplyr::across(tidyselect::everything(), \(x) as.character(x))) |>
tidyr::pivot_longer(
cols = -tidyselect::all_of(c("Item 1", "Item 2")),
names_to ="Statistic"
) |>
dplyr::select(-tidyselect::all_of(c("Item 1", "Item 2")))
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, digits= 2, caption=title)
}
#' Test whether a distribution is normal for each item
#'
#' The test is calculated using \code{stats::\link[stats:shapiro.test]{shapiro.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols The column holding metric values.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker table containing itemwise statistics:
#'
#' - **skewness**: Measure of asymmetry in the distribution. A value of 0 indicates perfect symmetry.
#' - **kurtosis**: Measure of the "tailedness" of the distribution.
#' - **W**: W-statistic from the Shapiro-Wilk normality test.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#' - **normality**: Interpretation of normality based on Shapiro-Wilk test.
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_items(data, starts_with("cg_adoption"))
#'
#'
#' @importFrom rlang .data
#' @export
effect_metrics_items <- function(data, cols, adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, cols.numeric = {{ cols }}, clean = clean)
# 2. Calculate
data <- dplyr::select(data, {{ cols }})
result <- purrr::imap(
data,
\(.x, .y) {
shapiro <- stats::shapiro.test(.x)
stats <- psych::describe(.x)
list(
"Item" = .y,
"skewness" = stats$skew,
"kurtosis" = stats$kurt,
"W" = shapiro$statistic,
"p" = shapiro$p.value
)
}
) %>%
dplyr::bind_rows()
# Adjust and round
result <- result %>%
adjust_p("p", method = adjust) %>%
.attr_setcolumn(
tidyselect::all_of(c("skewness", "kurtosis", "W")),
"round", 2
)
result$normality = ifelse(result$p > 0.05, "normal", "not normal")
# 3. Labels
if (labels) {
result <- labs_replace(
result, "Item",
codebook(data, {{ cols }}),
"item_name", "item_label"
)
}
prefix <- get_prefix(result$Item)
result <- dplyr::mutate(
result, Item = trim_prefix(.data$Item, prefix)
)
# Rename first column
if (prefix != "") {
colnames(result)[1] <- prefix
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result)
}
#' Compare groups for each item by calculating F-statistics and effect sizes
#'
#'
#' The models are fitted using \code{stats::\link[stats:lm]{lm}}.
#' ANOVA of type II is computed for each fitted model using \code{car::\link[car:Anova]{Anova}}.
#' Eta Squared is calculated for each ANOVA result
#' using \code{effectsize::\link[effectsize:eta_squared]{eta_squared}}.
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The column holding groups to compare.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker tibble with the following statistical measures:
#'
#' - **Eta-squared**: Effect size indicating the proportion of variance in the dependent variable explained by the predictor.
#' - **Eta**: Root of Eta-squared, a standardized effect size.
#' - **n**: Number of cases the calculation is based on.
#' - **F**: F-statistic from the linear model.
#' - **p**: p-value for the statistical test.
#' - **stars**: Significance stars based on p-value (*, **, ***).
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics(data, starts_with("cg_adoption_"), adopter)
#'
#' @export
#' @importFrom rlang .data
effect_metrics_items_grouped <- function(data, cols, cross, adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, {{ cross }}, cols.categorical = {{ cross }}, cols.numeric = {{ cols }}, clean = clean)
# 2. Pivot longer
lm_data <- data %>%
dplyr::rename(uv = {{ cross }}) %>%
tidyr::pivot_longer(
cols = {{ cols }},
names_to = "item",
values_to = "value"
) %>%
dplyr::group_by(.data$item) %>%
tidyr::nest() %>%
tibble::deframe()
# 3. Linear model per item
result <- purrr::imap(
lm_data,
\(.x, .y) {
model <- stats::lm(value ~ uv, data = .x)
summ <- summary(model)
eta_sq <- get_etasq(model)
list(
"item" = .y,
"Eta-squared" = eta_sq$Eta2,
"Eta" = sqrt(eta_sq$Eta2),
"n" = nrow(.x),
"F" = summ$fstatistic[1],
"p" = summ$coefficients[2, 4]
)
}) %>%
dplyr::bind_rows()
# Adjust and round
result <- result %>%
adjust_p("p", method = adjust) %>%
.attr_setcolumn(
tidyselect::all_of(c("Eta-squared", "Eta", "F")),
"round", 2
)
# 5. Labels
if (labels) {
result <- labs_replace(
result, "item",
codebook(data, {{ cols }}),
"item_name", "item_label"
)
}
prefix <- get_prefix(result$item)
result <- dplyr::mutate(result, item = trim_prefix(.data$item, prefix))
# Rename first column
if (prefix != "") {
colnames(result)[1] <- prefix
} else {
result <- dplyr::rename(result, Item = tidyselect::all_of("item"))
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result)
}
#' Compare groups for each item with multiple target items by calculating F-statistics and effect sizes
#'
#' \strong{Not yet implemented. The future will come.}
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The grouping items.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_counts}.
#' @return A volker tibble.
#' @importFrom rlang .data
effect_metrics_items_grouped_items <- function(data, cols, cross, clean = TRUE, ...) {
warning("Not implemented yet. The future will come.", noBreaks. = TRUE)
}
#' Output correlation coefficients for items and one metric variable
#'
#' The correlation is calculated using \code{stats::\link[stats:cor.test]{cor.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross The column holding metric values to correlate.
#' @param method The output metrics, pearson = Pearson's R, spearman = Spearman's rho.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker table containing itemwise correlations:
#'
#' If `method = "pearson"`:
#' - **R-squared**: Coefficient of determination.
#' - **n**: Number of cases the calculation is based on.
#' - **Pearson's r**: Correlation coefficient.
#' - **ci low / ci high**: Lower and upper bounds of the 95% confidence interval.
#' - **df**: Degrees of freedom.
#' - **t**: t-statistic.
#' - **p**: p-value for the statistical test, indicating whether the correlation differs from zero.
#' - **stars**: Significance stars based on the p-value (*, **, ***).
#'
#' If `method = "spearman"`:
#' - **Spearman's rho** is displayed instead of Pearson's r.
#' - **S-statistic** is used instead of the t-statistic.
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_items_cor(
#' data, starts_with("cg_adoption_adv"), sd_age
#' )
#'
#' @export
#' @importFrom rlang .data
effect_metrics_items_cor <- function(data, cols, cross, method = "pearson", adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, {{ cross }}, cols.numeric = c({{ cols }}, {{ cross }}), clean = clean)
check_is_param(method, c("pearson", "spearman"))
# 2. Calculate correlations
result <- .effect_correlations(data, {{ cols }}, {{ cross }}, method = method, adjust = adjust, labels = labels)
# 3. Labels
prefix1 <- get_prefix(result$item1)
if (labels) {
prefix2 <- get_title(data, {{ cross }})
} else {
prefix2 <- rlang::as_string(rlang::ensym(cross))
}
result <- result %>%
dplyr::mutate(item1 = trim_prefix(.data$item1, prefix1)) %>%
dplyr::select(-tidyselect::all_of("item2"))
# Rename first column
if (prefix1 != "") {
colnames(result)[1] <- paste0(prefix1, ": Correlation with ", prefix2)
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, digits = 2)
}
#' Output correlation coefficients for multiple items
#'
#' The correlation is calculated using \code{stats::\link[stats:cor.test]{cor.test}}.
#'
#' @keywords internal
#'
#' @param data A tibble containing item measures.
#' @param cols Tidyselect item variables (e.g. starts_with...).
#' @param cross Tidyselect item variables (e.g. starts_with...).
#' @param method The output metrics, pearson = Pearson's R, spearman = Spearman's rho.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @param clean Prepare data by \link{data_clean}.
#' @param ... Placeholder to allow calling the method with unused parameters from \link{effect_metrics}.
#' @return A volker table containing correlations.
#'
#' If `method = "pearson"`:
#' - **R-squared**: Coefficient of determination.
#' - **n**: Number of cases the calculation is based on.
#' - **Pearson's r**: Correlation coefficient.
#' - **ci low / ci high**: Lower and upper bounds of the 95% confidence interval.
#' - **df**: Degrees of freedom.
#' - **t**: t-statistic.
#' - **p**: p-value for the statistical test, indicating whether the correlation differs from zero.
#' - **stars**: Significance stars based on the p-value (*, **, ***).
#'
#' If `method = "spearman"`:
#' - **Spearman's rho** is displayed instead of Pearson's r.
#' - **S-statistic** is used instead of the t-statistic.
#'
#' @examples
#' library(volker)
#' data <- volker::chatgpt
#'
#' effect_metrics_items_cor_items(
#' data,
#' starts_with("cg_adoption_adv"),
#' starts_with("use"),
#' metric = TRUE
#' )
#'
#' @export
#' @importFrom rlang .data
effect_metrics_items_cor_items <- function(data, cols, cross, method = "pearson", adjust = "fdr", labels = TRUE, clean = TRUE, ...) {
# 1. Checks, clean, remove missings
data <- data_prepare(data, {{ cols }}, {{ cross }}, cols.numeric = c({{ cols }}, {{ cross }}), clean = clean)
check_is_param(method, c("pearson", "spearman"))
# 2. Calculate correlations
result <- .effect_correlations(data, {{ cols }}, {{ cross }}, method = method, adjust = adjust, labels = labels)
# 3. Labels
prefix1 <- get_prefix(result$item1)
prefix2 <- get_prefix(result$item2)
result <- result %>%
dplyr::mutate(item1 = trim_prefix(.data$item1, prefix1)) |>
dplyr::mutate(item2 = trim_prefix(.data$item2, prefix2))
if ((prefix1 == prefix2) && (prefix1 != "")) {
prefix1 <- paste0("Item 1: ", prefix1)
prefix2 <- paste0("Item 2: ", prefix2)
}
# Rename first column
if (prefix1 != "") {
colnames(result)[1] <- prefix1
}
# Rename second column
if (prefix2 != "") {
colnames(result)[2] <- prefix2
}
result <- .attr_transfer(result, data, "missings")
.to_vlkr_tab(result, digits = 2)
}
#' Calculate correlation and cooccurrence coefficients and test whether they are different from zero
#'
#' This function is used to calculate coefficients for all pairwise items
#' by calling `get_correlation()` on each combination of the items in the `cols`- by `cross`-parameter.
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param cols The columns holding metric values.
#' @param cross The columns holding metric values to correlate.
#' @param method The output metrics, pearson = Pearson's R, spearman = Spearman's rho,
#' cramer = Cramer's V, npmi = Normalized Pointwise Mutual Information.
#' The reported R square value is simply the square of Spearman's rho respective Pearson's r.
#' @param category Calculating NPMI for multiple items requires a focus category.
#' By default, for logical column types, only TRUE values are counted.
#' For other column types, the first category is counted.
#' Accepts both character and numeric values to override default counting behavior.
#' @param test Boolean, whether to perform significance tests (default = TRUE).
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param labels If TRUE (default) extracts labels from the attributes, see \link{codebook}.
#' @return A tibble with correlation results.
#' @importFrom rlang .data
.effect_correlations <- function(data, cols, cross, method = "pearson", category = NULL, test = TRUE, adjust = "fdr", labels = TRUE) {
cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
cross_eval <- tidyselect::eval_select(expr = enquo(cross), data = data)
# Check method
check_is_param(method, c("spearman", "pearson", "cramer", "npmi"))
# Category
if (is.null(category) & (method == "npmi")) {
if ("TRUE" %in% as.character(data[[cols_eval[1]]])) {
category <- "TRUE"
}
else if (is.factor(data[[cols_eval[1]]])) {
category <- levels(data[[cols_eval[1]]])[1]
}
else {
category <- unique(as.character(data[[cols_eval[1]]]))[1]
}
# TODO: Recode as in plot_counts_items_grouped() to reduce number of tests
# result <- result %>%
# mutate(.category = (.data$.value_name %in% base_category) | (.data$.value_label %in% base_category))
}
# Calculate
result <- expand.grid(
x = cols_eval, y = cross_eval,
stringsAsFactors = FALSE
) %>%
dplyr::mutate(
.test = purrr::pmap(
list(.data$x, .data$y),
\(x, y) get_correlation(data[[x]], data[[y]], method = method, category = category, test = test)
)
)
result <- result %>%
dplyr::mutate(
x_name = names(.data$x),
y_name = names(.data$y)
) %>%
tidyr::unnest_wider(".test") |>
dplyr::select(-tidyselect::any_of(c("x", "y")))
result <- dplyr::select(result, item1 = "x_name", item2 = "y_name", tidyselect::everything())
result <- dplyr::arrange(result, .data$item1, .data$item2)
# Adjust and round
# TODO: omit adjusting for test for self-correlation (item matrices)?
if (method == "npmi") {
if (test) {
result <- adjust_p(result, "fisher_p",stars = "fisher_stars", method = adjust)
}
result <- dplyr::mutate(result, npmi = round(.data$npmi, 2))
attr(result, "focus") <- category
}
else if (test) {
result <- adjust_p(result, "p", method = adjust)
}
# Get variable caption from the attributes
if (labels) {
result <- labs_replace(result, "item1", codebook(data, {{ cols }}), col_from="item_name", col_to="item_label")
result <- labs_replace(result, "item2", codebook(data, {{ cross }}), col_from="item_name", col_to="item_label" )
}
# Baseline
attr(result, "cases") <- nrow(data)
result
}
#' Tidy lm results, replace categorical parameter names by their levels and add the reference level
#'
#' @keywords internal
#'
#' @param fit Result of a \code{\link[stats:lm]{lm}} call.
#' @author Created with the help of ChatGPT.
#' @return A tibble with regression parameters.
tidy_lm_levels <- function(fit) {
lm_tidy <- broom::tidy(fit, conf.int = TRUE)
lm_tidy <- dplyr::left_join(lm_tidy, get_betas(fit), by = "term")
lm_data <- fit$model
# Work through each factor in the model frame
for (var in names(lm_data)) {
if (is.character(lm_data[[var]])) {
lm_data[[var]] <- as.factor(lm_data[[var]])
}
if (is.factor(lm_data[[var]])) {
levels <- levels(lm_data[[var]])
ref_level <- levels[1]
ref_first <- paste0(var, levels[2])
ref_index <- which(lm_tidy$term == ref_first)
# Rename the coefficients in tidy_data
for (level in levels[-1]) {
old_name <- paste0(var, level)
new_name <- paste0(level)
lm_tidy$term <- sub(paste0("^\\Q", var, level,"\\E"), new_name, lm_tidy$term)
}
# Create reference level row, assuming the first level is the reference
ref_row <- data.frame(term = paste0(ref_level, " (Reference)"))
# Insert the ref_row at ref_index in lm_tidy
lm_tidy <- dplyr::bind_rows(lm_tidy[1:ref_index-1,,drop = FALSE], ref_row, lm_tidy[ref_index:nrow(lm_tidy),,drop = FALSE])
}
}
lm_tidy
}
#' Adjust p-values from multiple tests and optionally annotate significance stars
#'
#' @keywords internal
#'
#' @param df A data frame or tibble containing the column to adjust.
#' @param col A tidyselect expression specifying the p-value column to adjust.
#' @param method Character string specifying the p-value adjustment method.
#' See \code{\link[stats]{p.adjust}} for available methods. Disable adjustment with FALSE.
#' @param digits Integer; number of decimal places for rounding.
#' @param stars Logical or character; if \code{TRUE}, add a "stars" column
#' with significance symbols (e.g., `"***"`, `"**"`, `"*"`)
#' based on the adjusted p-values.
#' If set to a character value it determines the new column name.
#'
#' @return A modified data frame with:
#' \itemize{
#' \item Adjusted p-values in the selected column.
#' \item (Optionally) a new column \code{stars} containing significance symbols.
#' \item An attribute \code{"adjust"} on the data frame storing the method.
#' \item A \code{"round"} attribute on the p-value column.
#' }
adjust_p <- function(df, col, method = "fdr", digits = 3, stars = TRUE) {
col_eval <- tidyselect::eval_select(expr = enquo(col), data = df)
col_name <- names(col_eval)
if (length(col_name) != 1) {
stop("`col` must select exactly one column.")
}
# Adjust p-values
if (nrow(df) > 1) {
method <- ifelse(method == "none", FALSE, method)
if (method != FALSE) {
df[[col_name]] <- stats::p.adjust(df[[col_name]], method = method)
attr(df, "adjust") <- method
}
}
# Set 'round' attribute on p-value column
df <- .attr_setcolumn(df, !!sym(col_name), "round", digits)
# Optionally add significance stars
if (!isFALSE(stars)) {
if (!is.character(stars)) {
stars <- "stars"
}
df[[stars]] <- get_stars(df[[col_name]])
}
return(df)
}
#' Calculate correlation between two vectors
#'
#' @keywords internal
#'
#' @param x First vector
#' @param y Second vector
#' @param method One of "spearman" or "pearson" for metric vectors.
#' For catecorical vectors, use "cramer" or "npmi".
#' @param category A vector of values to focus. Necessary for the npmi method only.
#' @param test Boolean; whether to perform significance tests.
#' @return The result of cor.test() for metric vectors, chisq.test for Cramer's V.
get_correlation <- function(x, y, method, category = NULL, test = TRUE) {
result <- list()
# TODO: round in print function, not here
if (method == "npmi") {
if (is.null(category)) {
stop("The npmi method needs a focus category, check your parameters.")
}
df <- data.frame(x = x, y = y)
result <- get_npmi(df, !!sym("x"), !!sym("y"), category = category, adjust = FALSE, test = test)
result$p_y_if_x <- NULL
result$p_x_if_y <- NULL
result <- as.list(result[1, -c(1,2) , drop = TRUE])
}
else if (method == "cramer") {
contingency <- table(as.character(x), as.character(y))
n <- sum(contingency)
sr <- rowSums(contingency)
sc <- colSums(contingency)
E <- outer(sr, sc) / sum(contingency)
exact <- any(E < 5)
isdiag = sum(diag(contingency)) == n
if ((nrow(contingency) < 2) | (ncol(contingency) < 2)) {
fit <- list(statistic = NA, parameter = NA, p.value = NA)
} else {
fit <- stats::chisq.test(contingency, simulate.p.value = exact)
}
cells <- min(dim(contingency)[1], dim(contingency)[2]) - 1
cramer_v <- round(sqrt( (fit$statistic / n) / cells), 2)
cramer_v <- ifelse(isdiag, 1.0, cramer_v)
result <- list(
"Cramer's V" = round(cramer_v, 2),
"Chi-squared" = sprintf("%.2f", round(fit$statistic, 2)),
"n" = as.character(n),
"df" = as.character(fit$parameter),
"p"= fit$p.value
)
}
else if (method == "spearman") {
fit <- stats::cor.test(x, y, method = method, exact = FALSE)
result = list(
"Spearman's rho" = round(as.numeric(fit$estimate),2),
"R-squared" = round(as.numeric(fit$estimate^2),2),
n = length(x),
s = sprintf("%.2f", round(fit$statistic,2)),
p = fit$p.value
)
}
else {
fit <- stats::cor.test(x, y, method = method, exact = TRUE)
result = list(
"Pearson's r" = round(as.numeric(fit$estimate),2),
"R-squared" = round(as.numeric(fit$estimate^2),2),
n = sum(!is.na(x) & !is.na(y)),
"ci low" = round(as.numeric(fit$conf.int[1]),2),
"ci high" = round(as.numeric(fit$conf.int[2]),2),
df = as.numeric(fit$parameter),
t = ifelse(all(x == y), "Inf", sprintf("%.2f", round(as.numeric(fit$statistic),2))),
p = fit$p.value
)
}
return (result)
}
#' Calculate nmpi
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @param col The column holding factor values.
#' @param cross The column to correlate.
#' @param category A vector of values to focus. If not null, all other values will be removed from the result.
#' @param smoothing Add pseudocount. Calculate the pseudocount based on the number of trials
#' to apply Laplace's rule of succession.
#' @param adjust Performing multiple significance tests inflates the alpha error.
#' Thus, p values need to be adjusted according to the number of tests.
#' Set a method supported by \code{stats::\link[stats:p.adjust]{p.adjust}},
#' e.g. "fdr" (the default) or "bonferroni". Disable adjustment with FALSE.
#' @param test Boolean; whether to perform significance tests (default TRUE).
#' @param ... Placeholder to allow calling the method with unused parameters from \link{tab_counts}.
#' @return A volker tibble.
#' @importFrom rlang .data
get_npmi <- function(data, col, cross, category = NULL, smoothing = 0, adjust = "fdr", test = TRUE, ...) {
cols_eval <- tidyselect::eval_select(expr = enquo(col), data = data)
cross_eval <- tidyselect::eval_select(expr = enquo(cross), data = data)
col_name <- names(cols_eval)[1]
cross_name <- names(cross_eval)[1]
# Calculate marginal probabilities
result <- data %>%
dplyr::count(.data[[col_name]], .data[[cross_name]]) %>%
#tidyr::complete({{ col }}, {{ cross }}, fill=list(n=0)) |>
dplyr::group_by(!!sym(col_name)) %>%
dplyr::mutate(.total_x = sum(.data$n)) %>%
dplyr::ungroup() %>%
dplyr::group_by(!!sym(cross_name)) %>%
dplyr::mutate(.total_y = sum(.data$n)) %>%
dplyr::ungroup()
# Calculate joint probablities
result <- result %>%
dplyr::mutate(
.total = sum(.data$n),
p_x = (.data$.total_x + smoothing) / (.data$.total + (dplyr::n_distinct(!!sym(col_name)) * smoothing)),
p_y = (.data$.total_y + smoothing) / (.data$.total + (dplyr::n_distinct(!!sym(cross_name)) * smoothing)),
p_xy = (.data$n + smoothing) / (.data$.total + dplyr::n_distinct(!!sym(col_name)) * smoothing + dplyr::n_distinct(!!sym(cross_name)) * smoothing),
p_x_if_y = .data$n / .data$.total_y,
p_y_if_x = .data$n / .data$.total_x,
ratio = .data$p_xy / (.data$p_x * .data$p_y),
pmi = dplyr::case_when(
.data$p_xy == 0 ~ -Inf,
TRUE ~ log2(.data$ratio)
),
npmi = dplyr::case_when(
.data$p_xy == 0 ~ -1,
TRUE ~ .data$pmi / -log2(.data$p_xy)
)
)
# Filter out categories
if (!is.null(category)) {
result <- result[as.character(result[[1]]) %in% category,, drop = FALSE]
result <- result[as.character(result[[2]]) %in% category,, drop = FALSE]
}
# Add exact fisher test
if (test & (nrow(result) > 0)) {
result <- result %>%
dplyr::rowwise() %>%
dplyr::mutate(
fisher_p = {
a <- .data$n
b <- .data$.total_x - .data$n
c <- .data$.total_y - .data$n
d <- .data$.total - a - b - c
tbl <- matrix(c(a, b, c, d), nrow = 2)
suppressWarnings(stats::fisher.test(tbl)$p.value)
}) %>%
dplyr::ungroup()
# Adjust and round
result <- result %>%
adjust_p("fisher_p", method = adjust, stars = "fisher_stars")
}
else if (test & (nrow(result) == 0)) {
result$fisher_p <- NA
result$fisher_stars <- NA
}
result <- dplyr::select(result, -tidyselect::all_of(c(".total",".total_x", ".total_y")))
result
}
#' Calculate Eta squared
#'
#' @keywords internal
#'
#' @param fit A model
#' @return A data frame with at least the column Eta2
get_etasq <- function(fit) {
if(round(sum(fit$residuals),20) == 0) {
result <- data.frame("Eta2"=0)
} else {
result <- effectsize::eta_squared(car::Anova(fit, type = 2), verbose = FALSE)
}
result
}
#' Calculate the Gini coefficient
#'
#' @keywords internal
#'
#' @param x A vector of counts or other values
#' @return The gini coefficient
get_gini <- function(x) {
x <- sort(x)
n <- length(x)
gini <- sum(x * c(1:n))
gini <- 2 * gini/sum(x) - (n + 1)
gini <- gini/n
return(gini)
}
#' Calculate ci values to be used for error bars on a plot
#'
#' @keywords internal
#'
#' @param x A numeric vector.
#' @param conf The confidence level.
#' @return A named list with values for y, ymin, and ymax.
get_ci <- function(x, conf = 0.95) {
n <- length(x)
m <- mean(x)
se <- stats::sd(x) / sqrt(n)
error_margin <- stats::qt(conf + (1 - conf) / 2, df = n - 1) * se
return(c(y = m, ymin = m - error_margin, ymax = m + error_margin))
}
#' Get significance stars from p values
#'
#' @keywords internal
#'
#' @param x A vector of p values.
#' @return A character vector with significance stars.
get_stars <- function(x) {
sapply(x, function(p) {
if (is.na(p)) {
return(NA)
} else if (p < 0.001) {
return("***")
} else if (p < 0.01) {
return("**")
} else if (p < 0.05) {
return("*")
} else if (p < 0.1) {
return(".")
} else {
return("")
}
})
}
#' Calculate standardized betas
#'
#' @keywords internal
#'
#' @param fit A model fitted with lm()
#' @return A data frame with a row for each term
get_betas <- function(fit) {
# Raw betas (exclude intercept)
betas <- stats::coef(fit)[-1]
# Response SD
model_data <- stats::model.frame(fit)
y_sd <- stats::sd(model_data[[1]], na.rm = TRUE)
# Predictor SDs
mm <- stats::model.matrix(fit)[, -1, drop = FALSE]
x_sds <- apply(mm, 2, stats::sd, na.rm = TRUE)
# Compute standardized betas
beta_std <- betas * (x_sds / y_sd)
tibble::tibble(
term = names(beta_std),
beta_std = beta_std
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.