#' Replicate STATA Regressions
#'
#' \code{reg} replicates STATA regressions including regressions with robust standard
#' errors and clustered standard errors.
#'
#' @param formula A formula providing the specifications for your model.
#' @param data A data frame on which to run your regression.
#' @param robust A logical value to indicate if the model should use robust
#' standard errors.
#' @param cluster A character vector to specify up to two variables by which
#' to cluster standard errors.
#'
#' @return A data frame with one row for each independent variable and columns
#' containing the estimated coefficients and the associated standard errors,
#' t-statistics, and p-values.
#'
#' @examples
#' reg(y ~ x, my_data, robust = TRUE)
#' reg(y ~ x, my_data, cluster = "variable")
#' reg(y ~ x, my_data, cluster = c("variable_1", "variable_2"))
reg <- function(formula, data, robust = FALSE, cluster = NULL) {
# Create the foundational linear model
model <- lm(formula, data)
model_obs <- complete.cases(data[all.vars(formula)])
if (length(cluster) >= 3) {
# Stop if the user inputs more than two variables by which to cluster
stop("This function only clusters standard errors for up to two variables.")
} else if (robust == TRUE & length(cluster) >= 1) {
# Stop if the user uses robust and cluster
stop("Please choose either robust standard errors or clustered standard errors.")
} else if (robust == FALSE & is.null(cluster)) {
# If neither robust nor cluster use ordinary OLS
reg_vcov <- NULL
} else if (robust == TRUE & is.null(cluster)) {
# Calculate the HC1 vcov matrix which matches STATA's robust command
reg_vcov <- sandwich::vcovHC(model, "HC1")
} else if (robust == FALSE & length(cluster) == 1) {
# Calculate the vcov matrix adjusting for clustering by one variable
M <- nrow(unique(data[cluster]))
N <- nrow(data)
K <- model[["rank"]]
dfc <- (M / (M - 1)) * ((N - 1) / (N - K))
cluster_1 <- as.factor(data[[cluster]][model_obs])
u <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_1, sum))
reg_vcov <- dfc * sandwich::sandwich(model, meat = crossprod(u) / N)
} else if (robust == FALSE & length(cluster) == 2) {
# Calculate the vcov matrix adjusting for clustering by two variables
M1 <- nrow(unique(data[cluster[1]]))
M2 <- nrow(unique(data[cluster[2]]))
M12 <- nrow(unique(data[cluster]))
N <- nrow(data)
K <- model[["rank"]]
dfc1 <- (M1 / (M1 - 1)) * ((N - 1) / (N - K))
dfc2 <- (M2 / (M2 - 1)) * ((N - 1) / (N - K))
dfc12 <- (M12 / (M12 - 1)) * ((N - 1) / (N - K))
cluster_1 <- as.factor(data[[cluster[1]]][model_obs])
cluster_2 <- as.factor(data[[cluster[2]]][model_obs])
cluster_12 <- paste(as.numeric(cluster_1), as.numeric(cluster_2), sep = "_")
u1 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_1, sum))
u2 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_2, sum))
u12 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_12, sum))
vc1 <- dfc1 * sandwich::sandwich(model, meat = crossprod(u1) / N)
vc2 <- dfc2 * sandwich::sandwich(model, meat = crossprod(u2) / N)
vc12 <- dfc12 * sandwich::sandwich(model, meat = crossprod(u12) / N)
reg_vcov <- vc1 + vc2 - vc12
}
# Create a data frame to export
model_coeftest <- lmtest::coeftest(model, reg_vcov)
model_df <- data.frame(model_coeftest[, 1:4, drop = FALSE])
model_df <- data.frame(variable = rownames(model_df), model_df)
model_df[["variable"]] <- as.character(model_df[["variable"]])
rownames(model_df) <- NULL
colnames(model_df) <- c("variable", "estimate", "std_error", "t-value", "p-value")
# Return the data frame
model_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.