#' Create balance table with group statistics; conduct t-tests and F-test
#'
#' Create an export-ready data frame containing group means and group standard deviations (or standard errors), output from difference-in-means tests; as well as output from a joint significance test for models of group membership
#' @param data The data frame to tabulate
#' @param balvar The variables to tabulate
#' @param grpvar If TRUE then drops NA values; if FALSE then includes NA values
#' @param refgrp Reference category for group comparisons
#' @param fevar Name of fixed effects or block variables to condition on
#' @param vcov Standard error type (e.g. "hc1", "iid", "hetero") or a name of a variable to cluster standard errors on
#' @param report_sd If TRUE then reports standard deviation of each variable; if FALSE then reports standard errors
#' @param report_ttest_pval If TRUE then reports p-values in difference in means test; if FALSE then reports mean difference
#' @return A data frame containing the balance table information
#' @import dplyr
#' @import tidyr
#' @import stringr
#' @import purrr
#' @import fixest
#' @import modelsummary
#' @import marginaleffects
kc_baltab <- function(data, balvar, grpvar, refgrp,
fevar = NULL, vcov = "hc1",
report_sd = FALSE, report_ttest_pval = TRUE) {
# Preliminaries
if (!is.data.frame(data))
stop("Data must be a data frame.")
if (!all(balvar %in% (names(data))))
stop("Covariates must be in data.")
if (!grpvar %in% (names(data)))
stop("Group variable must be in data.")
if (!refgrp %in% (data |> dplyr::pull({{grpvar}}) |> unique()))
stop("Reference group must be in group variable.")
if (!is.null(fevar))
if(!fevar %in% (names(data)))
stop("Fixed effects must be in data.")
# SE can be either fixest types below or a variable name to cluster on
if (vcov %in% c("iid", "normal", "standard", "hetero", "white", "hc1")) {
vcov_type = vcov
} else if (! vcov %in% c("iid", "normal", "standard", "hetero", "white", "hc1") &
vcov %in% names(data)) {
vcov_type = vcov_cluster(vcov)
} else if (! vcov %in% c("iid", "normal", "standard", "hetero", "white", "hc1") &
! vcov %in% names(data)) {
stop("SE type must be a variable in data or a recognized type.")
}
# Drop obs with NA group/treatment category
main_data <- data |>
tidyr::drop_na(grpvar)
# grpvar as factor
grpvar_fct <- main_data |>
dplyr::pull(grpvar) |>
unique() |>
sort() |>
factor()
# Define non-reference categories
nonref <- setdiff(main_data |> dplyr::pull({{grpvar}}) |> unique(),
refgrp)
# Setup
gm <- list(
list("raw" = "nobs",
"clean" = "nobs",
"fmt" = function(x) formatC(x, digits = 3, big.mark = ",", format = "f"))
)
# Compute mean / sd OR mean / se
if (report_sd == TRUE) {
mean_sd <- main_data |>
dplyr::group_by(dplyr::across({{grpvar}})) |>
dplyr::summarize(dplyr::across(balvar,
list(mean = \(x) mean(x, na.rm = TRUE),
sdev = \(x) sd(x, na.rm = TRUE),
nobs = \(x) sum(!is.na(x))))) |>
tidyr::pivot_longer(cols = ends_with("mean") | ends_with("sdev") | ends_with("nobs"),
values_to = "vals",
names_to = "vars") |>
dplyr::mutate(stat = stringr::str_sub(vars, -4, -1)) |>
dplyr::mutate(vars = stringr::str_remove(vars, "_[^_]*$")) |> # everything after underscore
tidyr::pivot_wider(names_from = grpvar, values_from = "vals") |>
dplyr::mutate(vars_fct = factor(vars, levels = balvar)) |>
dplyr::mutate(stat_fct = factor(stat, levels = c("mean", "sdev", "nobs"))) |>
dplyr::arrange(vars_fct, stat_fct) |>
dplyr::select(vars_fct, stat_fct, all_of(grpvar_fct)) |>
dplyr::mutate(dplyr::across(all_of(grpvar_fct),
~ dplyr::case_when(stat_fct == "mean" ~ formatC(., digits = 3, big.mark = ",", format = "f")), #sprintf("%.3f", round(., digits = 3)),
stat_fct == "sdev" ~ paste0("(", formatC(., digits = 3, big.mark = ",", format = "f"), ")"), # paste0("(", sprintf("%.3f", round(., digits = 3)), ")"),
stat_fct == "nobs" ~ paste0("[", format(as.integer(.), big.mark = ","), "]"))) |>
dplyr::select(-stat_fct) |>
# Replace duplicates with NA
dplyr::mutate(vars_fct = replace(vars_fct, duplicated(vars_fct), NA))
} else if (report_sd == FALSE) {
# Compute mean, SE, nobs
mean_se <- main_data |>
dplyr::group_by(dplyr::across({{grpvar}})) |>
tidyr::nest() |>
# Specify variables to get mean and SE
dplyr::mutate(covar = purrr::map2(data, .data[[grpvar]],
\(d, g) purrr::map(balvar, \(y) y))) |>
# Run regressions to get mean and SE
dplyr::mutate(reg_mean_se = purrr::map2(data, covar,
\(d, y) fixest::feols(.[y] ~ 1,
fixef = NULL,
data = d,
vcov = vcov_type))) |>
# Get data frame with mean, se, and nobs using modelsummary
dplyr::mutate(mean_se_nobs = purrr::map(reg_mean_se,
\(res) modelsummary::msummary(res,
output = "data.frame",
gof_map = gm))) |>
# Rename objects
dplyr::mutate(mean_se_nobs = purrr::map(mean_se_nobs,
\(d) d |>
dplyr::mutate(reg_stats = dplyr::case_when(part == "estimates" & statistic == "estimate" ~ "mean",
part == "estimates" & statistic == "std.error" ~ "serr",
part == "gof" & statistic == "" ~ "nobs")) |>
dplyr::select(reg_stats, starts_with("lhs")))) |>
# Keep only objects
dplyr::select({{grpvar}}, mean_se_nobs) |>
tidyr::unnest(c(mean_se_nobs)) |>
# Pivot so that grpvar is in column header
tidyr::pivot_wider(names_from = reg_stats, values_from = starts_with("lhs")) |>
tidyr::pivot_longer(cols = -1) |> # all but the first column
tidyr::pivot_wider(names_from = grpvar, values_from= "value") |>
# Arrange Mutate num obs format
tidyr::separate(name, into = c("vars", "stat"), sep="_(?=[^_]+$)") |>
dplyr::mutate(vars = stringr::str_remove(vars, "lhs: ")) |>
dplyr::mutate(vars_fct = factor(vars, levels = balvar)) |>
dplyr::mutate(stat_fct = factor(stat, levels = c("mean", "serr", "nobs"))) |>
dplyr::arrange(vars_fct, stat_fct) |>
dplyr::select(vars_fct, stat_fct, all_of(grpvar_fct)) |>
dplyr::mutate(dplyr::across(all_of(grpvar_fct), ~ dplyr::case_when(stat_fct == "nobs" ~ paste0("[", ., "]"),
stat_fct == "serr" ~ .,
stat_fct == "mean" ~ .))) |>
dplyr::select(-stat_fct) |>
# Replace duplicates with NA
dplyr::mutate(vars_fct = replace(vars_fct, duplicated(vars_fct), NA))
}
# Difference in mean test as regressions
ttest <- main_data |>
dplyr::filter(.data[[grpvar]] == refgrp) |>
tidyr::nest() |>
dplyr::rename(control_df = data) |>
dplyr::mutate(nonref_df = purrr::map(control_df,
\(x) main_data |>
dplyr::filter(.data[[grpvar]] %in% nonref) |>
tidyr::nest(.by = all_of(grpvar)) |>
dplyr::rename(treatment_df = data))) |>
tidyr::unnest(c(nonref_df)) |>
dplyr::mutate(treatment_df = purrr::map2(treatment_df, .data[[grpvar]],
\(d,g) d |> dplyr::mutate(!!rlang::sym(grpvar) := g))) |>
dplyr::mutate(contrast_df = purrr::map2(control_df, treatment_df,
\(x, y) dplyr::bind_rows(x, y))) |>
dplyr::mutate(covar = purrr::map(contrast_df,
\(d) purrr::map(balvar, \(y) y))) |>
# Run regressions to get difference in means (reg y D)
dplyr::mutate(reg_diff_mean = purrr::map2(contrast_df, covar,
\(d, y) fixest::feols(.[y] ~ i(.[grpvar], ref = refgrp),
fixef = fevar,
data = d,
vcov = vcov_type))) |>
# Get data frame with mean, se, and nobs using modelsummary
dplyr::mutate(diff_mean_list = purrr::map(reg_diff_mean,
\(res) modelsummary::msummary(res,
output = "data.frame",
stars = c('*' = .1, '**' = .05, '***' = 0.01),
coef_map = paste0(grpvar, "::", nonref),
shape = term + model ~ statistic,
estimate = "{estimate}{stars}",
statistic = "{p.value}{stars}"
))) |>
# Clean up data frame
dplyr::mutate(diff_mean_list = purrr::pmap(list(reg_diff_mean, diff_mean_list, .data[[grpvar]]),
\(res, d, g) d |>
{\(df) if (class(res) == "fixest")
dplyr::mutate(df, model = balvar)
else if (class(res) == "fixest_multi")
dplyr::mutate(df, model = stringr::str_remove(model, "lhs: "))}() |>
dplyr::select(-part, -term,
vars = model, diff_mean = "Est.",
diff_mean_pval = p) |>
dplyr::mutate(vars_fct = factor(vars, levels = balvar)) |>
dplyr::arrange(vars_fct)|>
{\(df) if (report_ttest_pval == TRUE) dplyr::select(df, -diff_mean, -vars) |>
dplyr::rename_with(~ paste0(g, " vs. ", refgrp), diff_mean_pval)
else dplyr::select(df, -diff_mean_pval, -vars) |>
dplyr::rename_with(~ paste0(g, " vs. ", refgrp), diff_mean)}() |>
dplyr::select(vars_fct, everything())
))
ttest_df <- purrr::reduce(ttest$diff_mean_list, dplyr::left_join, by = "vars_fct")
# F-test of joint significance
ftest <- main_data |>
dplyr::filter(.data[[grpvar]] == refgrp) |>
tidyr::nest() |>
dplyr::rename(control_df = data) |>
dplyr::mutate(nonref_df = purrr::map(control_df,
\(x) main_data |>
dplyr::filter(.data[[grpvar]] %in% nonref) |>
tidyr::nest(.by = all_of(grpvar)) |>
dplyr::rename(treatment_df = data))) |>
tidyr::unnest(c(nonref_df)) |>
dplyr::mutate(treatment_df = purrr::map2(treatment_df, .data[[grpvar]],
\(d,g) d |> dplyr::mutate(!!rlang::sym(grpvar) := g))) |>
dplyr::mutate(contrast_df = purrr::map2(control_df, treatment_df,
\(x, y) dplyr::bind_rows(x, y) |>
dplyr::mutate(dvar_grpvar = dplyr::if_else(.data[[grpvar]] == refgrp, 0, 1)))) |>
# Regression of grpvar on covariates
dplyr::mutate(reg_ftest = purrr::map(contrast_df,
\(d, y) fixest::feols(dvar_grpvar ~ .[balvar],
fixef = fevar,
data = d,
vcov = vcov_type))) |>
# F-test for grpvar status
dplyr::mutate(ftest_stat = purrr::map2(reg_ftest, .data[[grpvar]],
# \(res, g) fixest::fitstat(res, type = c("f.stat", "f.p"),
# simplify = TRUE) |>
\(res, g) marginaleffects::hypotheses(res, joint = balvar) |>
dplyr::as_tibble() |>
dplyr::select(f.stat = statistic, f.p = p.value) |>
dplyr::mutate(dplyr::across(everything(),
~sprintf("%.3f", round(., digits = 3)))) |>
tidyr::pivot_longer(cols = everything(),
names_to = "vars_fct",
values_to = paste0(g, " vs. ", refgrp))))
ftest_df <- purrr::reduce(ftest$ftest_stat, dplyr::left_join, by = "vars_fct")
if (report_sd == TRUE) {
report_df <- dplyr::full_join(mean_sd, ttest_df, by = "vars_fct") |>
dplyr::bind_rows(ftest_df)
} else if (report_sd == FALSE) {
report_df <- dplyr::full_join(mean_se, ttest_df, by = "vars_fct") |>
dplyr::bind_rows(ftest_df)
}
return(report_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.