# Nothing
#' Enhanced alternative to lm()
#'
#' Runs a linear regression with better defaults (robust SE), and richer & better
#' formatted output than \code{lm}. For robust and clustered errors it relies on \code{\link[estimatr]{lm_robust}}.
#' The output reports classical and robust errors, number of missing observations per
#' variable, an effect size column (standardized regression coefficient), and a red.flag column per variable
#' flagging the need to conduct specific diagnostics. It relies by default on HC3 for standard errors;
#' \code{lm_robust} relies on HC2 (and Stata's 'reg y x, robust' on HC1), which can have
#' inflated false-positive rates in smaller samples (Long & Ervin, 2000).
#' @param formula,data same arguments as in \code{\link[stats]{lm}} (the remaining
#' arguments below are specific to \code{lm2})
#' @param se_type The type of standard error to use. Default is \code{"HC3"}.
#' Without clusters: \code{"HC0"}, \code{"HC1"}, \code{"HC2"}, or \code{"HC3"}.
#' When \code{clusters} is specified, \code{se_type} is automatically set to \code{"CR2"}.
#' @param notes Logical. If TRUE (default), print explanatory notes below the table
#' when the result is printed.
#' @param clusters An optional variable indicating clusters for cluster-robust standard
#' errors. When specified, \code{se_type} is automatically set to \code{"CR2"}
#' (bias-reduced cluster-robust estimator). Passed to \code{\link[estimatr]{lm_robust}}.
#' @param fixed_effects An optional right-sided formula containing the fixed effects
#' to be projected out (absorbed) before estimation. Useful for models with many
#' fixed effect groups (e.g., \code{~ firm_id} or \code{~ firm_id + year}).
#' Passed to \code{\link[estimatr]{lm_robust}}.
#' @param ... Additional arguments passed to \code{\link[estimatr]{lm_robust}}.
#'
#' @details
#' Robust standard errors and clustered standard errors are computed using
#' \code{\link[estimatr]{lm_robust}}; see the documentation of that function for details
#' (clustered errors use CR2 by default). The output reports both robust and classical
#' standard errors, and when errors are clustered it reports all three.
#' The red.flag column is based on the difference between robust and classical standard errors.
#'
#' The \code{red.flag} column provides diagnostic warnings:
#' \itemize{
#' \item \code{!}, \code{!!}, \code{!!!}: Robust and classical standard errors differ by
#' more than 25\%, 50\%, or 100\%, respectively. Large differences may suggest model
#' misspecification or outliers (but they may also be benign). When encountering a red flag,
#' authors should plot the distributions to look for outliers or skewed data, and use \code{\link{scatter.gam}}
#' to look for possible nonlinearities in the relevant variables.
#' King & Roberts (2015) propose a higher cutoff, at 100\%, and a bootstrapped significance test;
#' \code{statuser} does not follow either recommendation. The former seems too liberal, the
#' latter too time consuming to include in every regression, plus the focus here is on individual variables rather than joint tests.
#' \item \code{X}: For interaction terms, the component variables are correlated (|r| > 0.3 or p < .05),
#' which means the interaction term is likely to be biased. See Simonsohn (2024) "Interacting with curves"
#' \doi{10.1177/25152459231207787}.
#' }
#'
#' @references
#' King, G., & Roberts, M. E. (2015). How robust standard errors expose methodological
#' problems they do not fix, and what to do about it. \emph{Political Analysis}, 23(2), 159-179.
#'
#' Long, J. S., & Ervin, L. H. (2000). Using heteroscedasticity consistent standard errors
#' in the linear regression model. \emph{The American Statistician}, 54(3), 217-224.
#'
#' Simonsohn, U. (2024). Interacting with curves: How to validly test and probe
#' interactions in the real (nonlinear) world. \emph{Advances in Methods and Practices in
#' Psychological Science}, 7(1), 1-22.
#'
#' @examples
#' # Basic usage with data argument
#' lm2(mpg ~ wt + hp, data = mtcars)
#'
#' # Without data argument (variables from environment)
#' y <- mtcars$mpg
#' x1 <- mtcars$wt
#' x2 <- mtcars$hp
#' lm2(y ~ x1 + x2)
#'
#' # RED FLAG EXAMPLES
#'
#' # Example 1: red flag catches a nonlinearity
#' # True model is quadratic: y = x^2
#' set.seed(123)
#' x <- runif(200, -3, 3)
#' y <- x^2 + rnorm(200, sd = 2)
#'
#' # lm2() shows red flag due to misspecification
#' lm2(y ~ x)
#'
#' # Follow up with scatter.gam() to diagnose it
#' scatter.gam(x, y)
#'
#' # Example 2: red flag catches an outlier in y
#' # True model is y = x, but one observation has a very large y value
#' set.seed(123)
#' x <- sort(rnorm(200))
#' y <- round(x + rnorm(200, sd = 2), 1)
#' y[200] <- 100 # Outlier
#'
#' # lm2() flags x
#' lm2(y ~ x)
#'
#' # Look at distribution of y to spot the outlier
#' plot_freq(y)
#'
#' # Example 3: red flag catches an outlier in one predictor
#' # True model is y = x1 + x2, but x2 has an extreme value
#' set.seed(123)
#' x1 <- round(rnorm(200), 1)
#' x2 <- round(rnorm(200), 1)
#' y <- x1 + x2 + rnorm(200, sd = 0.5)
#' x2[200] <- 50 # Outlier in x2
#'
#' # lm2() flags x2 (but not x1)
#' lm2(y ~ x1 + x2)
#'
#' # Look at distribution of x2 to spot the outlier
#' plot_freq(x2)
#'
#' # CLUSTERED STANDARD ERRORS
#' # When observations are grouped (e.g., students within schools),
#' # use clusters to account for within-group correlation
#' set.seed(123)
#' n_clusters <- 20
#' n_per_cluster <- 15
#' cluster_id <- rep(1:n_clusters, each = n_per_cluster)
#' cluster_effect <- rnorm(n_clusters, sd = 2)[cluster_id]
#' x <- rnorm(n_clusters * n_per_cluster)
#' y <- 1 + 0.5 * x + cluster_effect + rnorm(n_clusters * n_per_cluster)
#' mydata <- data.frame(y = y, x = x, cluster_id = cluster_id)
#'
#' # Clustered SE (CR2) - note the SE.cluster column in output
#' lm2(y ~ x, data = mydata, clusters = cluster_id)
#'
#' # FIXED EFFECTS
#' # Use fixed_effects to absorb group-level variation (e.g., firm or year effects)
#' # This is useful for panel data or when you have many fixed effect levels
#' set.seed(456)
#' n_firms <- 30
#' n_years <- 5
#' firm_id <- rep(1:n_firms, each = n_years)
#' year <- rep(2018:2022, times = n_firms)
#' firm_effect <- rnorm(n_firms, sd = 3)[firm_id]
#' x <- rnorm(n_firms * n_years)
#' y <- 2 + 0.8 * x + firm_effect + rnorm(n_firms * n_years)
#' panel <- data.frame(y = y, x = x, firm_id = factor(firm_id), year = factor(year))
#'
#' # Absorb firm fixed effects (coefficient on x is estimated, firm dummies are not shown)
#' lm2(y ~ x, data = panel, fixed_effects = ~ firm_id)
#'
#' # Two-way fixed effects (firm and year)
#' lm2(y ~ x, data = panel, fixed_effects = ~ firm_id + year)
#'
#' @return An object of class \code{c("lm2", "lm_robust", "lm")}. This inherits
#' from \code{\link[estimatr]{lm_robust}} and can be used with packages like
#' \code{marginaleffects}. The object contains all components of an \code{lm_robust}
#' object plus additional attributes:
#' \describe{
#' \item{statuser_table}{A data frame with columns: \code{term}, \code{estimate},
#' \code{SE.robust}, \code{SE.classical}, \code{t}, \code{df}, \code{p.value},
#' \code{B} (standardized coefficient), and optionally \code{SE.cluster} when
#' clustered standard errors are used.}
#' \item{classical_fit}{The underlying \code{lm} object with classical standard errors.}
#' \item{na_counts}{Integer vector of missing value counts per variable.}
#' \item{n_missing}{Total number of observations excluded due to missing values.}
#' \item{has_clusters}{Logical indicating whether clustered standard errors were used.}
#' }
#' When printed, displays a formatted regression table with robust and classical
#' standard errors, effect sizes, and diagnostic red flags.
#'
#' @seealso \code{\link[estimatr]{lm_robust}}, \code{\link{scatter.gam}}
#' @usage NULL
#' @export lm2
lm2 <- function(formula, data = NULL, se_type = "HC3", notes = TRUE,
                clusters = NULL, fixed_effects = NULL, ...) {
  # Capture the call so downstream code (e.g. the print method) can re-locate
  # the original `data` argument if the fitted object does not carry it
  cl <- match.call()
  # 'estimatr' is required for all three lm_robust fits performed below
  if (!requireNamespace("estimatr", quietly = TRUE)) {
    stop("Package 'estimatr' is required for lm2(). Please install it with: install.packages('estimatr')")
  }
  # Capture the unevaluated `clusters` expression so it can be resolved inside
  # `data` first (non-standard evaluation, mirroring lm_robust's own interface)
  clusters_expr <- substitute(clusters)
  has_clusters <- !missing(clusters) && !is.null(clusters_expr) && !identical(clusters_expr, quote(NULL))
  # Resolve the clusters variable: look it up inside `data`, falling back to
  # the caller's environment when it is not a column of `data`
  clusters_vec <- NULL
  if (has_clusters && !is.null(data)) {
    clusters_vec <- tryCatch(
      eval(clusters_expr, envir = data, enclos = parent.frame()),
      error = function(e) {
        # Try evaluating in parent frame if not in data.
        # NOTE(review): parent.frame(2) inside a tryCatch handler is fragile --
        # confirm it actually reaches the lm2() caller's frame in all call paths
        eval(clusters_expr, envir = parent.frame(2))
      }
    )
  } else if (has_clusters) {
    clusters_vec <- eval(clusters_expr, envir = parent.frame())
  }
  # Validate formula early (helper defined elsewhere in the package)
  validate_formula(formula, data, func_name = "lm2", calling_env = parent.frame())
  # Validate inputs and construct a data frame if needed.
  # NOTE(review): validate_lm2() is defined elsewhere; presumably it builds
  # `data` from the caller's environment when data = NULL -- verify
  validated <- validate_lm2(
    formula = formula,
    data = data,
    se_type = se_type,
    se_type_missing = missing(se_type),
    dots = list(..., clusters = clusters_vec),
    calling_env = parent.frame()
  )
  data <- validated$data
  se_type <- validated$se_type
  # Build arguments for lm_robust (only include non-NULL optional args,
  # because passing clusters = NULL / fixed_effects = NULL explicitly could
  # change lm_robust's behavior)
  lm_robust_args <- list(formula = formula, data = data, se_type = se_type, ...)
  if (has_clusters) lm_robust_args$clusters <- clusters_vec
  if (!is.null(fixed_effects)) lm_robust_args$fixed_effects <- fixed_effects
  # Primary fit: robust (or cluster-robust) standard errors
  robust_fit <- do.call(estimatr::lm_robust, lm_robust_args)
  # The returned object inherits from lm_robust so that packages like
  # marginaleffects can work with it seamlessly
  term_names <- names(robust_fit$coefficients)
  estimates <- as.numeric(robust_fit$coefficients)
  # When clusters are used we need both the clustered SE (CR2, from robust_fit)
  # and a separate HC3 run without clusters: the HC3 SEs feed the red-flag
  # comparison so heteroskedasticity/outlier signals are not confounded with
  # clustering effects
  if (has_clusters) {
    cluster_se <- robust_fit$std.error
    # t-values, p-values, and df based on the clustered SEs
    t_values <- robust_fit$statistic
    p_values <- robust_fit$p.value
    df_values <- robust_fit$df
    # Run HC3 without clusters to get heteroskedasticity-robust SE (red flag only)
    hc3_args <- list(formula = formula, data = data, se_type = "HC3", ...)
    if (!is.null(fixed_effects)) hc3_args$fixed_effects <- fixed_effects
    hc3_fit <- do.call(estimatr::lm_robust, hc3_args)
    robust_se <- hc3_fit$std.error
  } else {
    cluster_se <- NULL
    robust_se <- robust_fit$std.error
    t_values <- robust_fit$statistic
    p_values <- robust_fit$p.value
    df_values <- robust_fit$df
  }
  # Third fit: classical SEs with the same model specification (including
  # fixed effects), used for the robust-vs-classical red-flag comparison
  classical_args <- list(formula = formula, data = data, se_type = "classical", ...)
  if (!is.null(fixed_effects)) classical_args$fixed_effects <- fixed_effects
  # Defensive: ensure the classical fit is never clustered
  if ("clusters" %in% names(classical_args)) classical_args$clusters <- NULL
  classical_fit <- do.call(estimatr::lm_robust, classical_args)
  classical_se <- classical_fit$std.error
  # Standardized coefficients (beta weights): Beta = b * (SD_x / SD_y).
  # The intercept gets NA
  model_data <- stats::model.frame(formula, data = data)
  y <- model_data[[1]]
  sd_y <- stats::sd(y, na.rm = TRUE)
  # Total observations excluded due to NAs (original rows minus rows used)
  n_original <- nrow(data)
  n_used <- robust_fit$nobs
  n_missing <- n_original - n_used
  # Missing-DV count, taken from the original (pre-listwise-deletion) data
  y_name <- all.vars(formula)[1]
  n_missing_y <- if (!is.null(y_name) && y_name %in% names(data)) {
    sum(is.na(data[[y_name]]))
  } else {
    NA_integer_
  }
  # Fixed-effects summary info (number of levels and NAs per FE term),
  # recorded for the print method
  fe_terms <- character(0)
  fe_k <- integer(0)
  fe_missing <- integer(0)
  if (!is.null(fixed_effects)) {
    fe_terms <- all.vars(fixed_effects)
    fe_k <- sapply(fe_terms, function(term) {
      if (!term %in% names(data)) return(NA_integer_)
      v <- data[[term]]
      length(unique(v[!is.na(v)]))
    })
    fe_missing <- sapply(fe_terms, function(term) {
      if (!term %in% names(data)) return(NA_integer_)
      sum(is.na(data[[term]]))
    })
    names(fe_k) <- fe_terms
    names(fe_missing) <- fe_terms
  }
  # Per-term vectors: standardized coefficient, mean (or % for dummies),
  # whether the mean is a percentage, and NA counts in the original data
  standardized_coefs <- numeric(length(term_names))
  mean_or_pct <- numeric(length(term_names))
  mean_is_pct <- logical(length(term_names)) # TRUE when mean_or_pct is 0-100 % (factor/factor*factor)
  na_counts <- integer(length(term_names))
  names(standardized_coefs) <- term_names
  names(mean_or_pct) <- term_names
  names(mean_is_pct) <- term_names
  names(na_counts) <- term_names
  # Model matrix is needed to compute dummy-variable percentages.
  # Try classical_fit first, then robust_fit; NULL if both fail
  model_matrix <- tryCatch(
    stats::model.matrix(classical_fit),
    error = function(e) {
      tryCatch(
        stats::model.matrix(robust_fit),
        error = function(e2) NULL
      )
    }
  )
  # Fill the per-term vectors. NA counts come from the raw `data` columns
  # (before model.frame removes incomplete rows)
  for (i in seq_along(term_names)) {
    term <- term_names[i]
    if (term == "(Intercept)") {
      standardized_coefs[i] <- NA_real_
      mean_or_pct[i] <- NA_real_
      na_counts[i] <- NA_integer_ # NA count not meaningful for intercept
    } else {
      # Get the coefficient
      b <- estimates[i]
      # Interaction terms are identified by ":" in the term name
      if (grepl(":", term)) {
        # Interaction term: B = b * sd(x1) * sd(x2) * ... / sd(y),
        # only when every component is numeric
        components <- strsplit(term, ":")[[1]]
        sd_product <- 1
        all_numeric <- TRUE
        for (comp in components) {
          if (comp %in% names(model_data)) {
            x_comp <- model_data[[comp]]
            if (is.numeric(x_comp)) {
              sd_product <- sd_product * stats::sd(x_comp, na.rm = TRUE)
            } else {
              # Factor component - can't compute SD
              all_numeric <- FALSE
              break
            }
          } else {
            # Component not found (likely a factor level like "groupB")
            all_numeric <- FALSE
            break
          }
        }
        if (all_numeric) {
          standardized_coefs[i] <- b * sd_product / sd_y
        } else {
          standardized_coefs[i] <- NA_real_
        }
        # Mean for two-way interactions:
        #   numeric * factor-level -> mean of the numeric within that level
        #   numeric * numeric      -> mean of the product
        #   factor  * factor       -> % of rows where both conditions hold
        if (length(components) == 2) {
          # Try to identify numeric vs factor components
          comp1 <- components[1]
          comp2 <- components[2]
          # Helper: classify a component as a numeric variable, a factor
          # variable, or a factor-level term (e.g. "groupTreatment")
          get_var_info <- function(comp_name) {
            # Direct variable in the model frame
            if (comp_name %in% names(model_data)) {
              v <- model_data[[comp_name]]
              if (is.numeric(v)) return(list(type = "numeric", var = v, filter = NULL))
              if (is.factor(v)) return(list(type = "factor", var = v, filter = NULL))
            }
            # Factor-level term: variable name + level name concatenated.
            # NOTE(review): startsWith matching can mis-assign when one
            # variable name is a prefix of another -- confirm acceptable
            for (var_name in names(model_data)) {
              if (is.factor(model_data[[var_name]]) || is.character(model_data[[var_name]])) {
                if (startsWith(comp_name, var_name)) {
                  level_name <- substring(comp_name, nchar(var_name) + 1)
                  factor_var <- model_data[[var_name]]
                  if (level_name %in% levels(factor(factor_var))) {
                    return(list(type = "factor_level", var = factor_var, filter = level_name))
                  }
                }
              }
            }
            return(NULL)
          }
          info1 <- get_var_info(comp1)
          info2 <- get_var_info(comp2)
          if (!is.null(info1) && !is.null(info2)) {
            # Case 1: numeric * factor_level -> mean of numeric where factor == level
            if (info1$type == "numeric" && info2$type == "factor_level") {
              filter_idx <- info2$var == info2$filter
              mean_or_pct[i] <- mean(info1$var[filter_idx], na.rm = TRUE)
            } else if (info1$type == "factor_level" && info2$type == "numeric") {
              filter_idx <- info1$var == info1$filter
              mean_or_pct[i] <- mean(info2$var[filter_idx], na.rm = TRUE)
            # Case 2: numeric * numeric -> mean of product
            } else if (info1$type == "numeric" && info2$type == "numeric") {
              mean_or_pct[i] <- mean(info1$var * info2$var, na.rm = TRUE)
            # Case 3: factor_level * factor_level -> percentage where both conditions met
            } else if (info1$type == "factor_level" && info2$type == "factor_level") {
              filter_idx <- (info1$var == info1$filter) & (info2$var == info2$filter)
              mean_or_pct[i] <- mean(filter_idx, na.rm = TRUE) * 100
              mean_is_pct[i] <- TRUE
            } else {
              mean_or_pct[i] <- NA_real_
            }
          } else {
            mean_or_pct[i] <- NA_real_
          }
        } else {
          # 3+ way interactions: skip for now
          mean_or_pct[i] <- NA_real_
        }
        na_counts[i] <- NA_integer_ # NA count not meaningful for interactions
      } else if (term %in% names(model_data)) {
        # Simple term present in the model frame: B = b * sd(x) / sd(y)
        x <- model_data[[term]]
        if (is.numeric(x)) {
          sd_x <- stats::sd(x, na.rm = TRUE)
          standardized_coefs[i] <- b * (sd_x / sd_y)
          mean_or_pct[i] <- mean(x, na.rm = TRUE)
        } else {
          standardized_coefs[i] <- NA_real_
          mean_or_pct[i] <- NA_real_
        }
        # Count NAs in the original (pre-deletion) data column
        if (term %in% names(data)) {
          na_counts[i] <- sum(is.na(data[[term]]))
        } else {
          na_counts[i] <- NA_integer_
        }
      } else {
        # Factor-level terms (e.g. "groupB"): get standardized coef and %
        # from the model matrix, falling back to the model frame.
        # Preferred path: the dummy column in the model matrix
        if (!is.null(model_matrix) && term %in% colnames(model_matrix)) {
          dummy_col <- model_matrix[, term]
          # Standardized coefficient: b * sd(dummy) / sd(y)
          sd_dummy <- stats::sd(dummy_col, na.rm = TRUE)
          standardized_coefs[i] <- b * sd_dummy / sd_y
          # Ordered-factor polynomial contrasts (.L, .Q, .C, .^n) take
          # non-0/1 values, so a percentage is not meaningful for them
          is_polynomial_contrast <- grepl("\\.(L|Q|C|\\^[0-9]+)$", term)
          if (is_polynomial_contrast) {
            # Polynomial contrasts: mean/pct not meaningful
            mean_or_pct[i] <- NA_real_
          } else {
            # Treatment contrasts (dummy 0/1): calculate percentage of 1s
            mean_or_pct[i] <- mean(dummy_col, na.rm = TRUE) * 100
            mean_is_pct[i] <- TRUE
          }
          # NA count: locate the originating factor/character column in
          # `data` by name-prefix matching on the term
          found_na_count <- FALSE
          if (!is.null(data)) {
            for (var_name in names(data)) {
              if (is.factor(data[[var_name]]) || is.character(data[[var_name]])) {
                if (startsWith(term, var_name)) {
                  na_counts[i] <- sum(is.na(data[[var_name]]))
                  found_na_count <- TRUE
                  break
                }
              }
            }
          }
          if (!found_na_count) {
            na_counts[i] <- 0L
          }
        } else {
          # Fallback: infer the factor level from the term name (e.g.
          # "groupB" -> variable "group", level "B") and compute the % from
          # the model frame (used when model.matrix fails or colnames differ)
          found_pct <- FALSE
          for (var_name in names(model_data)) {
            if (var_name == term) next
            v <- model_data[[var_name]]
            if (is.factor(v) || is.character(v)) {
              levs <- levels(factor(v))
              for (lev in levs[-1L]) { # skip baseline (first level)
                if (term == paste0(var_name, lev)) {
                  mean_or_pct[i] <- mean(v == lev, na.rm = TRUE) * 100
                  mean_is_pct[i] <- TRUE
                  found_pct <- TRUE
                  # Standardized coef from the reconstructed 0/1 dummy
                  dummy_col <- as.numeric(v == lev)
                  sd_dummy <- stats::sd(dummy_col, na.rm = TRUE)
                  standardized_coefs[i] <- b * sd_dummy / sd_y
                  break
                }
              }
              if (found_pct) break
            }
          }
          if (!found_pct) {
            standardized_coefs[i] <- NA_real_
            mean_or_pct[i] <- NA_real_
            mean_is_pct[i] <- FALSE
          }
          # NA count for the originating factor variable (prefix match)
          if (!is.null(data)) {
            found_na_count <- FALSE
            for (var_name in names(data)) {
              if (is.factor(data[[var_name]]) || is.character(data[[var_name]])) {
                if (startsWith(term, var_name)) {
                  na_counts[i] <- sum(is.na(data[[var_name]]))
                  found_na_count <- TRUE
                  break
                }
              }
            }
            if (!found_na_count) na_counts[i] <- NA_integer_
          } else {
            na_counts[i] <- NA_integer_
          }
        }
      }
    }
  }
  # Display table consumed by print.lm2()
  statuser_table <- data.frame(
    term = term_names,
    estimate = estimates,
    SE.robust = robust_se,
    SE.classical = classical_se[term_names],
    t = t_values,
    df = df_values,
    p.value = p_values,
    B = standardized_coefs,
    mean_or_pct = mean_or_pct,
    mean_is_pct = mean_is_pct,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
  # Extra SE.cluster column only when clustered errors were requested
  if (!is.null(cluster_se)) {
    statuser_table$SE.cluster <- cluster_se
  }
  # Start from robust_fit so the result inherits from lm_robust
  # (keeps compatibility with marginaleffects and similar packages)
  result <- robust_fit
  # Attach statuser-specific metadata as attributes (not list elements,
  # so the lm_robust structure is left untouched)
  attr(result, "statuser_table") <- statuser_table
  attr(result, "lm2_call") <- cl
  attr(result, "classical_fit") <- classical_fit
  attr(result, "na_counts") <- na_counts
  attr(result, "n_missing") <- n_missing
  attr(result, "n_missing_y") <- n_missing_y
  attr(result, "y_name") <- y_name
  attr(result, "notes") <- notes
  attr(result, "has_clusters") <- has_clusters
  attr(result, "fe_terms") <- fe_terms
  attr(result, "fe_k") <- fe_k
  attr(result, "fe_missing") <- fe_missing
  # Prepend "lm2" class so our print method is used, but keep lm_robust inheritance
  class(result) <- c("lm2", class(result))
  return(result)
}
#' Print method for lm2 objects
#'
#' @param x An object of class \code{lm2}
#' @param notes Logical. If TRUE (default), print explanatory notes below the table.
#' If not specified, uses the value set when \code{lm2()} was called.
#' @param ... Additional arguments (ignored)
#'
#' @return Invisibly returns the original object
#' @export
print.lm2 <- function(x, notes = NULL, ...) {
# Use notes attribute from lm2() call if not explicitly specified
if (is.null(notes)) {
notes <- attr(x, "notes")
if (is.null(notes)) notes <- TRUE
}
# Get the statuser table from attribute
tbl <- attr(x, "statuser_table")
# Helper: smart rounding based on magnitude
# >=100: 1 decimal, >=10: 2 decimals, >=0.01: 3 decimals
# <0.01: show 2 significant non-zero digits (e.g., .000042)
smart_round <- function(val) {
if (is.na(val)) return(NA_character_)
abs_val <- abs(val)
if (abs_val >= 100) {
decimals <- 1
return(format(round(val, decimals), nsmall = decimals))
} else if (abs_val >= 10) {
decimals <- 2
return(format(round(val, decimals), nsmall = decimals))
} else if (abs_val >= 0.01 || abs_val == 0) {
decimals <- 3
return(format(round(val, decimals), nsmall = decimals))
} else {
# For values < 0.01, show 2 significant digits
# e.g., 0.000042323 -> .000042
sign_char <- if (val < 0) "-" else ""
# Find how many decimal places needed for 2 sig figs
log_val <- floor(log10(abs_val))
decimals <- -log_val + 1 # +1 to get 2 sig figs
rounded <- round(abs_val, decimals)
formatted <- format(rounded, nsmall = decimals, scientific = FALSE)
# Remove leading zero
formatted <- sub("^0\\.", ".", formatted)
return(paste0(sign_char, formatted))
}
}
# Helper: format p-value (no leading zero, <.0001 if tiny)
format_p <- function(p) {
if (is.na(p)) return(NA_character_)
if (p < 0.0001) return("<.0001")
# Round to 4 decimals, remove leading zero, add space to align with "<.0001"
p_rounded <- round(p, 4)
p_str <- format(p_rounded, nsmall = 4, scientific = FALSE)
p_str <- sub("^0\\.", " .", p_str) # Replace leading 0 with space for alignment
p_str
}
# Get NA counts from attribute before modifying display_df
na_counts <- attr(x, "na_counts")
# -------------------------------------------------------------------------
# Helper: Parse factor terms and identify baseline levels
# Returns a list with:
# - formatted_terms: vector of formatted term names (e.g., "group (B)")
# - baseline_info: list of baseline rows to insert
# - is_factor_level: logical vector indicating which terms are factor levels
# -------------------------------------------------------------------------
parse_factor_terms <- function(tbl, xlevels) {
n_terms <- nrow(tbl)
formatted_terms <- tbl$term
is_factor_level <- logical(n_terms)
baseline_info <- list() # Will store info about baseline rows to add
if (is.null(xlevels) || length(xlevels) == 0) {
return(list(
formatted_terms = formatted_terms,
baseline_info = baseline_info,
is_factor_level = is_factor_level
))
}
# Helper: Replace factor level in a term component
# e.g., "group2B" with var_name="group2", lev="B" -> "group2 (B)"
replace_factor_level <- function(component, var_name, lev) {
expected <- paste0(var_name, lev)
if (component == expected) {
return(paste0(var_name, " (", lev, ")"))
}
return(NULL)
}
# Helper: Check if term is an ordered factor polynomial contrast
# e.g., "group3_ord.L" -> "group3_ord (linear)"
# Returns NULL if not a match, or the formatted name if it is
check_ordered_contrast <- function(term, var_name) {
# Ordered factors use .L, .Q, .C, .^4, .^5, etc. for polynomial contrasts
contrast_map <- c(
".L" = "linear",
".Q" = "quadratic",
".C" = "cubic"
)
for (suffix in names(contrast_map)) {
expected <- paste0(var_name, suffix)
if (term == expected) {
return(paste0(var_name, " (", contrast_map[suffix], ")"))
}
}
# Check for higher order polynomials: .^4, .^5, etc.
pattern <- paste0("^", gsub("([.|()\\^{}+*?$\\[\\]\\\\])", "\\\\\\1", var_name), "\\.\\^(\\d+)$")
if (grepl(pattern, term)) {
degree <- sub(pattern, "\\1", term)
return(paste0(var_name, " (^", degree, ")"))
}
return(NULL)
}
# Track which factors we've seen (to add baseline only once)
factors_seen <- character(0)
for (i in seq_len(n_terms)) {
term <- tbl$term[i]
# Skip intercept
if (term == "(Intercept)") next
# Check if this is an interaction term
if (grepl(":", term)) {
# Split into components and try to format each
components <- strsplit(term, ":")[[1]]
new_components <- components
for (ci in seq_along(components)) {
comp <- components[ci]
# Try to match against factor levels
for (var_name in names(xlevels)) {
levels_vec <- xlevels[[var_name]]
for (lev in levels_vec[-1]) { # Skip baseline level (won't appear in interactions)
replacement <- replace_factor_level(comp, var_name, lev)
if (!is.null(replacement)) {
new_components[ci] <- replacement
break
}
}
# Also check for ordered factor contrasts in interactions
ordered_replacement <- check_ordered_contrast(comp, var_name)
if (!is.null(ordered_replacement)) {
new_components[ci] <- ordered_replacement
}
}
}
formatted_terms[i] <- paste(new_components, collapse = ":")
} else {
# Non-interaction term: check each factor variable
matched <- FALSE
for (var_name in names(xlevels)) {
levels_vec <- xlevels[[var_name]]
# First check for ordered factor polynomial contrasts (e.g., varname.L, varname.Q)
ordered_replacement <- check_ordered_contrast(term, var_name)
if (!is.null(ordered_replacement)) {
is_factor_level[i] <- TRUE
formatted_terms[i] <- ordered_replacement
matched <- TRUE
# Note: ordered factors don't have a baseline level to show
break
}
# Check if this term matches varname + level (treatment contrasts)
for (lev in levels_vec) {
expected_name <- paste0(var_name, lev)
if (term == expected_name) {
# This is a factor level term
is_factor_level[i] <- TRUE
formatted_terms[i] <- paste0(var_name, " (", lev, ")")
# If this is the first time we see this factor, record the baseline
if (!(var_name %in% factors_seen)) {
factors_seen <- c(factors_seen, var_name)
baseline_level <- levels_vec[1] # First level is baseline
baseline_info[[length(baseline_info) + 1]] <- list(
var_name = var_name,
baseline_level = baseline_level,
insert_before = i # Insert baseline row before this row
)
}
matched <- TRUE
break
}
}
if (matched) break
}
}
}
return(list(
formatted_terms = formatted_terms,
baseline_info = baseline_info,
is_factor_level = is_factor_level
))
}
# Get factor levels from the model
xlevels <- x$xlevels
if (is.null(xlevels)) {
# Fallback to classical fit
classical_fit <- attr(x, "classical_fit")
if (!is.null(classical_fit)) {
xlevels <- classical_fit$xlevels
}
}
# Parse factor terms
factor_info <- parse_factor_terms(tbl, xlevels)
# Get model data for correlation checks and baseline % (lm_robust may not have $model)
model_data <- x$model
if (is.null(model_data)) {
# Fallback: reconstruct from formula and data
model_data <- tryCatch(
stats::model.frame(x$terms, data = attr(x, "classical_fit")$model),
error = function(e) NULL
)
}
if (is.null(model_data)) {
# Second fallback: use data from the call (e.g. lm2(y ~ x + group, data = d))
cl <- attr(x, "lm2_call")
if (!is.null(cl) && !is.null(cl$data)) {
model_data <- tryCatch(
stats::model.frame(x$terms, data = eval(cl$data, envir = parent.frame())),
error = function(e) NULL
)
}
}
# Build model matrix for factor level handling (if available)
model_matrix <- tryCatch(
stats::model.matrix(x),
error = function(e) {
classical_fit <- attr(x, "classical_fit")
if (!is.null(classical_fit)) {
tryCatch(stats::model.matrix(classical_fit), error = function(e2) NULL)
} else {
NULL
}
}
)
# Calculate SE flag: ! if robust and classical differ by >25%, !! if >50%, !!! if >100%
# Also check for interaction term correlations and store r(x,z) values
se_flag <- character(nrow(tbl))
cor_rxz <- character(nrow(tbl)) # Store formatted correlation for r(x,z) column
has_interactions <- FALSE
has_se_flags <- FALSE # Track if any SE-related flags (!, !!, !!!)
has_cor_flags <- FALSE # Track if any correlation-related flags (X)
for (i in seq_len(nrow(tbl))) {
flags <- character(0)
cor_rxz[i] <- "--" # Default for non-interaction terms
# Check SE difference
se_classical <- tbl$SE.classical[i]
se_robust <- tbl$SE.robust[i]
if (!is.na(se_classical) && !is.na(se_robust) && se_classical != 0) {
ratio_diff <- abs(se_classical - se_robust) / se_classical
if (ratio_diff > 1) {
flags <- c(flags, "!!!")
has_se_flags <- TRUE
} else if (ratio_diff > 0.5) {
flags <- c(flags, "!!")
has_se_flags <- TRUE
} else if (ratio_diff > 0.25) {
flags <- c(flags, "!")
has_se_flags <- TRUE
}
}
# Check if this is an interaction term (contains ":")
term <- tbl$term[i]
if (grepl(":", term)) {
has_interactions <- TRUE
# Extract the component variable names
components <- strsplit(term, ":")[[1]]
# Check correlation between components if we have 2 components
if (length(components) == 2 && !is.null(model_data)) {
var1_name <- components[1]
var2_name <- components[2]
# Helper function to get numeric variable for correlation
# Handles both direct numeric variables and factor level terms
get_numeric_var <- function(comp_name, model_data, model_matrix) {
# First check if it's directly in model_data as numeric
if (comp_name %in% names(model_data)) {
v <- model_data[[comp_name]]
if (is.numeric(v)) return(v)
# If it's a factor with 2 levels, convert to 0/1
if (is.factor(v) && length(levels(v)) == 2) {
return(as.numeric(v) - 1)
}
}
# Check if it's a factor level term (e.g., "groupTreatment")
# Try to find the original factor variable
for (var_name in names(model_data)) {
if (is.factor(model_data[[var_name]]) || is.character(model_data[[var_name]])) {
if (startsWith(comp_name, var_name)) {
# Extract the level name
level_name <- substring(comp_name, nchar(var_name) + 1)
factor_var <- model_data[[var_name]]
if (level_name %in% levels(factor(factor_var))) {
# Convert to 0/1 for this specific level
return(as.numeric(factor_var == level_name))
}
}
}
}
# Try model matrix as fallback
if (!is.null(model_matrix) && comp_name %in% colnames(model_matrix)) {
return(model_matrix[, comp_name])
}
return(NULL)
}
var1 <- get_numeric_var(var1_name, model_data, model_matrix)
var2 <- get_numeric_var(var2_name, model_data, model_matrix)
# Calculate correlation if both are numeric
if (!is.null(var1) && !is.null(var2) && is.numeric(var1) && is.numeric(var2)) {
cor_test <- tryCatch(
stats::cor.test(var1, var2),
error = function(e) NULL
)
if (!is.null(cor_test)) {
cor_val <- cor_test$estimate # Keep sign for display
cor_p <- cor_test$p.value
# Format correlation: 2 decimals, †for p<.1, * for p<.05, ** for p<.01
cor_formatted <- format(round(cor_val, 2), nsmall = 2)
if (!is.na(cor_p) && cor_p < 0.01) {
cor_formatted <- paste0(cor_formatted, "**")
} else if (!is.na(cor_p) && cor_p < 0.05) {
cor_formatted <- paste0(cor_formatted, "*")
} else if (!is.na(cor_p) && cor_p < 0.1) {
cor_formatted <- paste0(cor_formatted, "\u2020") # †character
}
cor_rxz[i] <- cor_formatted
# X if correlation |r| > .3 OR p < .05
if ((!is.na(cor_val) && abs(cor_val) > 0.3) || (!is.na(cor_p) && cor_p < 0.05)) {
flags <- c(flags, "X")
has_cor_flags <- TRUE
}
}
}
}
}
se_flag[i] <- paste(flags, collapse = " ")
}
# Format B column, using " --" for intercept or NA (with extra padding for alignment)
# (B is the standardized coefficient; the intercept has no meaningful beta)
B_formatted <- sapply(seq_along(tbl$B), function(i) {
if (tbl$term[i] == "(Intercept)") return(" --")
val <- smart_round(tbl$B[i])
if (is.na(val)) return(" --")
val
})
# Format mean column: mean for numeric vars, % only when mean_is_pct is TRUE (factor levels / factor*factor)
mean_is_pct <- if (!is.null(tbl$mean_is_pct)) tbl$mean_is_pct else rep(FALSE, nrow(tbl))
mean_formatted <- sapply(seq_along(tbl$mean_or_pct), function(i) {
if (tbl$term[i] == "(Intercept)") return("--")
if (is.na(tbl$mean_or_pct[i])) return("--")
val <- tbl$mean_or_pct[i]
if (mean_is_pct[i]) {
# Percentage of observations at this factor level, one decimal
paste0(format(round(val, 1), nsmall = 1), "%")
} else {
smart_round(val)
}
})
# Format missing column, using "--" for intercept instead of NA
# Only built when na_counts lines up one-to-one with the coefficient table
missing_formatted <- if (!is.null(na_counts) && length(na_counts) == nrow(tbl)) {
sapply(seq_along(na_counts), function(i) {
if (tbl$term[i] == "(Intercept)") return("--")
if (is.na(na_counts[i])) return("--")
as.character(na_counts[i])
})
} else {
NULL
}
# Build display dataframe from scratch with desired column order
# Add leading padding spaces to values for better column separation
pad <- function(x, n) {
  # Prefix every value in x with n spaces.
  prefix <- strrep(" ", n)
  paste0(prefix, x)
}
# Right-align values within column, then add trailing space
right_align <- function(x, trail = 1) {
  # Pad each value on the left to the width of the widest entry, then
  # append `trail` spaces so neighbouring columns do not touch.
  widest <- max(nchar(x))
  aligned <- format(x, width = widest, justify = "right")
  paste0(aligned, strrep(" ", trail))
}
# Check if we have clustered SE
# (set upstream by lm2() as an attribute; default to FALSE when absent)
has_clusters <- attr(x, "has_clusters")
if (is.null(has_clusters)) has_clusters <- FALSE
# Format columns
# Add significance stars to estimates: †p<.1, * p<.05, ** p<.01
# Pad stars to consistent width (2 chars) for alignment
estimate_vals <- sapply(seq_along(tbl$estimate), function(i) {
est <- smart_round(tbl$estimate[i])
p <- tbl$p.value[i]
# NA p-value (e.g. dropped coefficient): no marker, padding only
if (is.na(p)) return(paste0(est, " "))
if (p < 0.01) {
paste0(est, "**")
} else if (p < 0.05) {
paste0(est, "* ")
} else if (p < 0.1) {
paste0(est, "\u2020 ") # †character + space
} else {
paste0(est, " ")
}
})
t_vals <- sapply(tbl$t, smart_round)
# Check if df varies across coefficients
# (when it does, df is shown as its own column instead of once in the footer)
df_vals <- tbl$df
df_varies <- length(unique(round(df_vals, 2))) > 1
# Format term names: use parsed factor terms (e.g., "group (B)"), with 2 trailing spaces
# Only lowercase "(Intercept)" -> "intercept", preserve original case for all other terms
term_names_formatted <- sapply(seq_along(factor_info$formatted_terms), function(i) {
term <- factor_info$formatted_terms[i]
term <- gsub("^\\(Intercept\\)$", "intercept", term)
paste0(term, " ")
})
# Assemble the display data frame column-by-column so the two optional
# columns are inserted in the right position without repeating the whole
# data.frame() call four times (the original duplicated it across the
# has_clusters x df_varies combinations):
#   - SE.cluster appears first among the SE columns when clustering,
#   - df appears between t.value and p.value when it varies by coefficient.
cols <- list(
  term = term_names_formatted,
  estimate = right_align(estimate_vals, 1)
)
if (has_clusters) {
  # With clusters: SE.cluster, SE.robust (HC3), SE.classical
  cols$SE.cluster <- pad(sapply(tbl$SE.cluster, smart_round), 2)
}
cols$SE.robust <- pad(sapply(tbl$SE.robust, smart_round), 2)
cols$SE.classical <- pad(sapply(tbl$SE.classical, smart_round), 3)
cols$t.value <- right_align(t_vals, 1)
if (df_varies) {
  # df varies: include as column between t.value and p.value
  cols$df <- pad(sapply(df_vals, function(d) format(round(d, 1), nsmall = 1)), 1)
}
cols$p.value <- pad(sapply(tbl$p.value, format_p), 1)
cols$std.estimate <- pad(right_align(B_formatted, 0), 3)
cols[[" mean"]] <- right_align(mean_formatted, 1)
# check.names = FALSE keeps the non-syntactic column names
# (" mean", and later "r(x,z)") exactly as written.
display_df <- do.call(
  data.frame,
  c(cols, list(stringsAsFactors = FALSE, check.names = FALSE))
)
# Add missing column if available
if (!is.null(missing_formatted)) {
display_df$missing <- pad(missing_formatted, 2)
}
# Add r(x,z) column for interaction correlations (only if there are interactions)
if (has_interactions) {
display_df$`r(x,z)` <- pad(right_align(cor_rxz, 0), 2)
}
# Add red flag column at the end (use "--" when no flag)
# (se_flag holds the "!"/"X" diagnostics flags collected per coefficient above)
display_df$red.flag <- pad(ifelse(se_flag == "", "--", se_flag), 4)
# Insert baseline rows for factor variables (in reverse order to maintain correct positions)
if (length(factor_info$baseline_info) > 0) {
# Process in reverse order so row indices remain valid
for (j in rev(seq_along(factor_info$baseline_info))) {
bl <- factor_info$baseline_info[[j]]
baseline_term <- paste0(bl$var_name, " (", bl$baseline_level, ") ")
# Create a baseline row with "[baseline]" placeholder
baseline_row <- display_df[1, , drop = FALSE] # Copy structure
baseline_row$term <- baseline_term
# Set all numeric columns to "[baseline]" indicator
for (col in names(baseline_row)) {
if (col == "term") next
baseline_row[[col]] <- ""
}
baseline_row$estimate <- "[baseline]"
# Calculate percentage for the baseline level (show in mean column like other factor levels)
factor_col <- NULL
if (!is.null(model_data) && bl$var_name %in% names(model_data)) {
factor_col <- model_data[[bl$var_name]]
}
if (is.null(factor_col)) {
# Fallback: get data from call (lm_robust often has no $model)
cl <- attr(x, "lm2_call")
if (!is.null(cl) && !is.null(cl$data)) {
# NOTE(review): parent.frame() here is the frame of this loop, not the
# user's calling frame — confirm the data is still resolvable in practice
data_from_call <- tryCatch(eval(cl$data, envir = parent.frame()), error = function(e) NULL)
if (!is.null(data_from_call) && bl$var_name %in% names(data_from_call)) {
factor_col <- data_from_call[[bl$var_name]]
# Align to model: use same rows as used in fit (model frame may subset)
mf <- tryCatch(stats::model.frame(x$terms, data = data_from_call), error = function(e) NULL)
if (!is.null(mf) && bl$var_name %in% names(mf)) factor_col <- mf[[bl$var_name]]
}
}
}
if (!is.null(factor_col)) {
# Share of observations at the omitted (baseline) level, shown as a %
baseline_pct <- mean(factor_col == bl$baseline_level, na.rm = TRUE) * 100
baseline_pct_str <- paste0(format(round(baseline_pct, 1), nsmall = 1), "%")
existing_width <- max(nchar(display_df$` mean`))
baseline_row$` mean` <- paste0(format(baseline_pct_str, width = existing_width - 1, justify = "right"), " ")
}
# Count missing observations for this factor variable
# Get the NA count from an existing level of this factor in tbl
baseline_na_count <- 0L
for (k in seq_len(nrow(tbl))) {
# Look for a term that starts with this factor's variable name
if (startsWith(tbl$term[k], bl$var_name)) {
# Get the na_count for this term from the na_counts attribute
if (!is.null(na_counts) && !is.na(na_counts[k])) {
baseline_na_count <- na_counts[k]
break
}
}
}
baseline_row$missing <- paste0(" ", baseline_na_count)
# Insert before the first level of this factor
insert_pos <- bl$insert_before
if (insert_pos == 1) {
display_df <- rbind(baseline_row, display_df)
} else if (insert_pos > nrow(display_df)) {
display_df <- rbind(display_df, baseline_row)
} else {
display_df <- rbind(
display_df[1:(insert_pos - 1), , drop = FALSE],
baseline_row,
display_df[insert_pos:nrow(display_df), , drop = FALSE]
)
}
}
}
# Add fixed-effects rows (after coefficients) and adjust intercept label
fe_terms <- attr(x, "fe_terms")
fe_k <- attr(x, "fe_k")
fe_missing <- attr(x, "fe_missing")
has_fe <- !is.null(fe_terms) && length(fe_terms) > 0
if (has_fe) {
# Ensure intercept row is present when fixed effects are absorbed
intercept_idx <- which(trimws(display_df$term) == "intercept")
if (length(intercept_idx) == 0) {
# Build an all-placeholder intercept row; each column gets the same
# padding used when that column was created, so alignment is preserved
intercept_row <- setNames(as.list(rep("", length(names(display_df)))), names(display_df))
intercept_row[["term"]] <- "intercept "
intercept_row[["estimate"]] <- "[dropped]"
for (col in setdiff(names(display_df), c("term", "estimate"))) {
if (col == "missing") {
intercept_row[[col]] <- pad("--", 2)
} else if (col == "SE.classical") {
intercept_row[[col]] <- pad("--", 3)
} else if (col %in% c("SE.robust", "SE.cluster")) {
intercept_row[[col]] <- pad("--", 2)
} else if (col == "t.value") {
intercept_row[[col]] <- right_align("--", 1)
} else if (col == "df") {
intercept_row[[col]] <- pad("--", 1)
} else if (col == "p.value") {
intercept_row[[col]] <- pad("--", 1)
} else if (col == "std.estimate") {
intercept_row[[col]] <- pad(right_align("--", 0), 3)
} else if (col == " mean") {
intercept_row[[col]] <- right_align("--", 1)
} else if (col == "r(x,z)") {
intercept_row[[col]] <- pad(right_align("--", 0), 2)
} else if (col == "red.flag") {
intercept_row[[col]] <- pad("--", 4)
} else {
intercept_row[[col]] <- "--"
}
}
display_df <- rbind(as.data.frame(intercept_row, stringsAsFactors = FALSE, check.names = FALSE), display_df)
intercept_idx <- 1
}
# Update intercept estimate to indicate absorption
if (length(intercept_idx) > 0) {
display_df$estimate[intercept_idx] <- "[dropped]"
}
# Build one display row per absorbed fixed-effect term: the estimate cell
# shows "df=<k>" (number of unique values absorbed), all others show "--"
make_fe_row <- function(term, k, missing) {
row <- setNames(as.list(rep("", length(names(display_df)))), names(display_df))
row[["term"]] <- paste0(term, " ")
estimate_val <- if (is.na(k)) "df=NA" else paste0("df=", k)
row[["estimate"]] <- right_align(estimate_val, 1)
for (col in setdiff(names(display_df), c("term", "estimate"))) {
if (col == "missing") {
missing_val <- if (is.na(missing)) "--" else as.character(missing)
row[[col]] <- pad(missing_val, 2)
} else if (col == "SE.classical") {
row[[col]] <- pad("--", 3)
} else if (col %in% c("SE.robust", "SE.cluster")) {
row[[col]] <- pad("--", 2)
} else if (col == "t.value") {
row[[col]] <- right_align("--", 1)
} else if (col == "df") {
row[[col]] <- pad("--", 1)
} else if (col == "p.value") {
row[[col]] <- pad("--", 1)
} else if (col == "std.estimate") {
row[[col]] <- pad(right_align("--", 0), 3)
} else if (col == " mean") {
row[[col]] <- right_align("--", 1)
} else if (col == "r(x,z)") {
row[[col]] <- pad(right_align("--", 0), 2)
} else if (col == "red.flag") {
row[[col]] <- pad("--", 4)
} else {
row[[col]] <- "--"
}
}
as.data.frame(row, stringsAsFactors = FALSE, check.names = FALSE)
}
# One row per fixed-effect term; NA when the per-term attribute is absent
fe_rows <- lapply(fe_terms, function(term) {
k <- if (!is.null(fe_k) && term %in% names(fe_k)) fe_k[[term]] else NA_integer_
miss <- if (!is.null(fe_missing) && term %in% names(fe_missing)) fe_missing[[term]] else NA_integer_
make_fe_row(term, k, miss)
})
fe_df <- do.call(rbind, fe_rows)
display_df <- rbind(display_df, fe_df)
}
# Use term values as row names, then remove the term column
# (assumes term labels are unique; rownames<- would fail on duplicates)
rownames(display_df) <- display_df$term
display_df$term <- NULL
# Print the original call (use lm2_call attribute which stores the lm2() call)
cat("Call: ")
print(attr(x, "lm2_call"))
cat("\n")
# Print the table
print(display_df, right = FALSE)
# Print model summary info
cat("\n")
cat("N =", x$nobs, " | ")
cat("missing =", attr(x, "n_missing"), " | ")
if (!df_varies) {
# df identical for all coefficients: report it once here instead of a column
cat("df =", round(tbl$df[1], 0), " | ")
}
cat("R\u00b2 =", format(round(x$r.squared, 3), nsmall = 3), " | ")
if (has_clusters) {
cat("SE type: CR2 (cluster)\n")
} else {
cat("SE type:", x$se_type, "\n")
}
if (notes) {
cat("\nNotes:\n")
# Note about observations dropped because the dependent variable is missing
n_missing_y <- attr(x, "n_missing_y")
y_name <- attr(x, "y_name")
if (!is.null(n_missing_y) && !is.na(n_missing_y) && n_missing_y > 0) {
y_label <- if (!is.null(y_name) && !is.na(y_name)) y_name else "DV"
cat(" - ", n_missing_y, " observations have missing \"", y_label, "\" values\n", sep = "")
}
# Only explain the significance markers that actually appear in the table
show_dagger <- any(!is.na(tbl$p.value) & tbl$p.value >= 0.05 & tbl$p.value < 0.1)
show_star <- any(!is.na(tbl$p.value) & tbl$p.value >= 0.01 & tbl$p.value < 0.05)
show_double_star <- any(!is.na(tbl$p.value) & tbl$p.value < 0.01)
sig_parts <- character(0)
if (show_dagger) sig_parts <- c(sig_parts, "\u2020 p<.1")
if (show_star) sig_parts <- c(sig_parts, "* p<.05")
if (show_double_star) sig_parts <- c(sig_parts, "** p<.01")
sig_basis <- if (has_clusters) "SE.cluster" else "SE.robust"
if (length(sig_parts) > 0) {
cat(" - ", paste(sig_parts, collapse = ", "), " (based on ", sig_basis, ")\n", sep = "")
} else {
cat(" - All estimates are p>.1 (based on ", sig_basis, ")\n", sep = "")
}
if (has_clusters) {
cat(" - SE.robust (HC3) used only to contrast with SE.classical to flag observations\n")
}
cat(" - std.estimate is the standardized coefficient: beta = b * sd(x) / sd(y)\n")
if (has_interactions) {
cat(" - mean: for main effects mean of x; for x*z interaction where z is factor, mean of x when z==1\n")
} else {
cat(" - mean: for numeric variables, mean of x; for factors, % of observations\n")
}
cat(" - missing: number of observations excluded due to missing values\n")
if (has_interactions) {
cat(" - r(x,z): correlation between interacted variables\n")
}
# Build red.flag note based on which flags are present
if (has_se_flags || has_cor_flags) {
cat(" - red.flag:\n")
# SE flags explanation (only if present)
if (has_se_flags) {
present_se_flags <- intersect(
c("!", "!!", "!!!"),
unique(unlist(strsplit(se_flag, " ", fixed = TRUE)))
)
if (length(present_se_flags) > 0) {
# Map each flag to the robust-vs-classical SE divergence threshold it marks
thresholds <- c("!" = "25%", "!!" = "50%", "!!!" = "100%")
thresh_text <- paste(thresholds[present_se_flags], collapse = ", ")
flags_text <- paste(present_se_flags, collapse = ", ")
cat(" ", flags_text, ": robust & classical SE differ by more than ", thresh_text, "\n", sep = "")
}
# Get actual variable names for suggestions
dv_name <- attr(x, "y_name")
dv_label <- if (!is.null(dv_name) && !is.na(dv_name)) dv_name else "DV"
# Find which variables have flags (flagged variables)
# (interaction terms and the intercept are excluded from the suggestions)
flagged_vars <- character(0)
for (fi in seq_along(se_flag)) {
flag_val <- trimws(se_flag[fi])
if (flag_val != "" && flag_val != "--") {
term_name <- tbl$term[fi]
if (term_name != "(Intercept)" && !grepl(":", term_name)) {
flagged_vars <- c(flagged_vars, term_name)
}
}
}
flagged_label <- if (length(flagged_vars) > 0) paste(flagged_vars, collapse = ", ") else "flagged variable(s)"
cat(" - Suggestion 1: to evaluate possible extreme skew or outliers:\n")
cat(" plot_density() or plot_freq() for ", dv_label, " and for ", flagged_label, "\n", sep = "")
cat(" - Suggestion 2: to evaluate possible nonlinearity, do plot_gam(", dv_label, " ~ ", flagged_label, ")\n", sep = "")
}
# Correlation flags explanation (only if present)
if (has_cor_flags) {
cat(" X: interacted variables are correlated, interaction term is likely to be biased\n")
cat(" See Simonsohn (2024) \"Interacting with curves\" https://doi.org/10.1177/25152459231207787\n")
}
} else {
if (has_interactions) {
cat(" - red.flag: none (robust and classical SE are similar and interacted terms are not correlated)\n")
} else {
cat(" - red.flag: none (robust and classical SE are similar)\n")
}
}
# Fixed effects note if absorbed (after red.flag)
fe_terms <- attr(x, "fe_terms")
if (!is.null(fe_terms) && length(fe_terms) > 0) {
fe_vars <- paste(fe_terms, collapse = ", ")
fe_note <- "- <vars> were entered as absorbed fixed effects; df is the number of unique values."
fe_note <- gsub("<vars>", fe_vars, fe_note, fixed = TRUE)
cat(" ", fe_note, "\n", sep = "")
}
cat(" - To avoid these notes, lm2(..., notes=FALSE)\n")
}
# Return the object invisibly, as print methods conventionally do
invisible(x)
}
#' Summary method for lm2 objects
#'
#' For \code{lm2} objects, \code{summary()} is an alias for \code{print()}:
#' both display the same formatted regression table.
#'
#' @param object An object of class \code{lm2}
#' @param ... Additional arguments passed to \code{\link{print.lm2}}
#'
#' @return Invisibly returns the original object
#' @export
summary.lm2 <- function(object, ...) {
  # All formatting lives in print.lm2; delegate, then remind the user
  # (via a blue console note) that the two entry points are equivalent.
  print(object, ...)
  message2("print() and summary() show the same information for lm2()", col = "blue")
  invisible(object)
}
#' Predict method for lm2 objects
#'
#' @param object An object of class \code{lm2}
#' @param newdata An optional data frame in which to look for variables with
#'   which to predict. If omitted, the original model data is used.
#' @param ... Additional arguments passed to \code{\link[estimatr]{predict.lm_robust}},
#'   including \code{se.fit} and \code{interval}.
#'
#' @return A vector of predicted values (or a list with \code{fit} and \code{se.fit}
#'   if \code{se.fit = TRUE}, or a matrix with \code{fit}, \code{lwr}, \code{upr}
#'   if \code{interval} is specified)
#' @export
predict.lm2 <- function(object, newdata, ...) {
  # Default to the data the model was fit on when no newdata is supplied.
  if (missing(newdata)) newdata <- stats::model.frame(object)
  # Strip the "lm2" class so predict() dispatches to the underlying
  # method (predict.lm_robust for models fit via lm2).
  stripped <- object
  class(stripped) <- setdiff(class(stripped), "lm2")
  predict(stripped, newdata = newdata, ...)
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.