regr: Replicate STATA Regressions

#' Replicate STATA Regressions
#'
#' \code{reg} replicates STATA regressions including regressions with robust standard
#' errors and clustered standard errors.
#'
#' @param formula A formula providing the specifications for your model.
#' @param data A data frame on which to run your regression.
#' @param robust A logical value to indicate if the model should use robust
#'   standard errors.
#' @param cluster A character vector to specify up to two variables by which
#'   to cluster standard errors.
#'
#' @return A data frame with one row for each independent variable and columns
#'   containing the estimated coefficients and the associated standard errors,
#'   t-statistics, and p-values.
#'
#' @examples
#' reg(y ~ x, my_data, robust = TRUE)
#' reg(y ~ x, my_data, cluster = "variable")
#' reg(y ~ x, my_data, cluster = c("variable_1", "variable_2"))

reg <- function(formula, data, robust = FALSE, cluster = NULL) {
  # Create the foundational linear model
  model <- lm(formula, data)
  model_obs <- complete.cases(data[all.vars(formula)])

  if (length(cluster) >= 3) {
    # Stop if the user inputs more than two variables by which to cluster
    stop("This function only clusters standard errors for up to two variables.")

  } else if (robust == TRUE & length(cluster) >= 1) {
    # Stop if the user uses robust and cluster
    stop("Please choose either robust standard errors or clustered standard errors.")

  } else if (robust == FALSE & is.null(cluster)) {
    # If neither robust nor cluster use ordinary OLS
    reg_vcov <- NULL

  } else if (robust == TRUE & is.null(cluster)) {
    # Calculate the HC1 vcov matrix which matches STATA's robust command
    reg_vcov <- sandwich::vcovHC(model, "HC1")

  } else if (robust == FALSE & length(cluster) == 1) {
    # Calculate the vcov matrix adjusting for clustering by one variable
    M <- nrow(unique(data[cluster]))
    N <- nrow(data)
    K <- model[["rank"]]
    dfc <- (M / (M - 1)) * ((N - 1) / (N - K))
    cluster_1 <- as.factor(data[[cluster]][model_obs])
    u <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_1, sum))
    reg_vcov <- dfc * sandwich::sandwich(model, meat = crossprod(u) / N)

  } else if (robust == FALSE & length(cluster) == 2) {
    # Calculate the vcov matrix adjusting for clustering by two variables
    M1 <- nrow(unique(data[cluster[1]]))
    M2 <- nrow(unique(data[cluster[2]]))
    M12 <- nrow(unique(data[cluster]))
    N <- nrow(data)
    K <- model[["rank"]]
    dfc1 <- (M1 / (M1 - 1)) * ((N - 1) / (N - K))
    dfc2 <- (M2 / (M2 - 1)) * ((N - 1) / (N - K))
    dfc12 <- (M12 / (M12 - 1)) * ((N - 1) / (N - K))
    cluster_1 <- as.factor(data[[cluster[1]]][model_obs])
    cluster_2 <- as.factor(data[[cluster[2]]][model_obs])
    cluster_12 <- paste(as.numeric(cluster_1), as.numeric(cluster_2), sep = "_")
    u1 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_1, sum))
    u2 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_2, sum))
    u12 <- apply(sandwich::estfun(model), 2, function(x) tapply(x, cluster_12, sum))
    vc1 <- dfc1 * sandwich::sandwich(model, meat = crossprod(u1) / N)
    vc2 <- dfc2 * sandwich::sandwich(model, meat = crossprod(u2) / N)
    vc12 <- dfc12 * sandwich::sandwich(model, meat = crossprod(u12) / N)
    reg_vcov <- vc1 + vc2 - vc12
  }

  # Create a data frame to export
  model_coeftest <- lmtest::coeftest(model, reg_vcov)
  model_df <- data.frame(model_coeftest[, 1:4, drop = FALSE])
  model_df <- data.frame(variable = rownames(model_df), model_df)
  model_df[["variable"]] <- as.character(model_df[["variable"]])
  rownames(model_df) <- NULL
  colnames(model_df) <- c("variable", "estimate", "std_error", "t-value", "p-value")

  # Return the data frame
  model_df
}