R/prepapre_regression_table.R
In ExPanDaR: Explore Your Data Interactively

Documented in prepare_regression_table

escape_for_latex <- function(s) {
  if (!is.character(s)) s_out <- as.character(s)
  else s_out <- s
  s_out <- gsub("\\", "\\textbackslash", s_out, fixed = TRUE)
  s_out <- gsub("&", "\\&", s_out, fixed = TRUE)
  s_out <- gsub("%", "\\%", s_out, fixed = TRUE)
  s_out <- gsub("#", "\\#", s_out, fixed = TRUE)
  s_out <- gsub("_", "\\_", s_out, fixed = TRUE)
  s_out <- gsub("{", "\\{", s_out, fixed = TRUE)
  s_out <- gsub("}", "\\}", s_out, fixed = TRUE)
  s_out <- gsub("~", "\\textasciitilde ", s_out, fixed = TRUE)
  s_out <- gsub("^", "\\textasciicircum ", s_out, fixed = TRUE)
  return(s_out)
}

estimate_model <- function(df, dl) {
  dv <- dl$dvs
  idvs <- dl$idvs
  feffects <- dl$feffects
  clusters <- dl$clusters
  type_str <- dl$models
  fe_str <- paste(feffects, collapse = ", ")
  cl_str <- paste(clusters, collapse = ", ")
  se <- NULL
  p <- NULL
  omit_vars <- NULL

  if (length(feffects) > 2)
    stop("Only up to two-dimensional fixed effect structures are supported")

  if (clusters[1] != "" & any(! clusters %in% feffects))
    stop("Only fixed effect variables can be used as clusters.")

  if (type_str == "auto") {
    if(!(is.factor(df[,dv]) | is.logical(df[,dv]))) type_str <- "ols"
    else type_str <- "logit"
  }
  if (feffects[1] != "") {
    df[,feffects] <- lapply(as.data.frame(df[,feffects]), function (x) factor(x, ordered = FALSE))
  }
  if(type_str == "ols") {
    f <- stats::as.formula(paste(dv, "~", paste(idvs, collapse = " + ")))
  } else if (nlevels(as.factor(df[,dv])) > 2) {
    stop("multinomial logit is not implemented. Sorry.")
  } else {
    df[,dv] <- as.factor(df[,dv])
    if (feffects[1] != "") {
      f <- stats::as.formula(paste(dv, "~", paste(idvs, collapse = " + "), " + ", paste(feffects, collapse = " + ")))
    } else {
      f <- stats::as.formula(paste(dv, "~", paste(idvs, collapse = " + ")))
    }
  }
  if (type_str == "logit") {
    model <- stats::glm(f, family = "binomial", df)
    # Stargazer might identify glm models based on call (wout package)
    model$call[1] <- parse(text = "glm()")[[1]]
    model$pseudo_r2 <- 1 - model$deviance / model$null.deviance
    model$success <- levels(as.factor(df[, dv]))[2]
    if (clusters[1] != "") {
      vcov <- multiwayvcov::cluster.vcov(model, cluster = df[, clusters])
      test <- lmtest::coeftest(model, vcov)
      se <- test[,2]
      p <- test[,4]
    }
    if (feffects[1] != "") {
        omit_vars <- names(model$coefficients)[! names(model$coefficients) %in% idvs]
    }
  } else {
    if (feffects[1] == "") {
      model <- stats::lm(f, data=df)
      # Stargazer identifies lm models based on call (wout package) ... sigh
      model$call[1] <- parse(text = "lm()")[[1]]
    } else {
      # plm() is much faster when the dimension with more units is the first
      # index member. Why I have no idea.
      if (length(feffects) > 1) {
        if (
          length(unique(df[, feffects[2]])) > length(unique(df[ ,feffects[1]]))
        ) {
          feffects <- c(feffects[2], feffects[1])
        }
      }
      df <- plm::pdata.frame(df, feffects)
      if (length(feffects) == 1) {
        model <- plm::plm(f, df, model = "within", effect = "individual")
        if (clusters[1] != "") {
          vcov <- plm::vcovHC(model, type="sss", cluster="group")
          test <- lmtest::coeftest(model, vcov)
          se <- test[,2]
          p <- test[,4]
        }
      } else {
        model <- plm::plm(f, df, model = "within", effect = "twoways")
        if (clusters[1] != "") {
          if (length(clusters) == 2) {
            vcov <- plm::vcovDC(model, type="sss")
          } else if (clusters[1] == feffects[1]) {
            vcov <- plm::vcovHC(model, type="sss", cluster="group")
          } else vcov <- plm::vcovHC(model, type="sss", cluster="time")
          test <- lmtest::coeftest(model, vcov)
          se <- test[,2]
          p <- test[,4]
        }
      }
      # Stargazer identifies plm models based on call (wout package) ... sigh
      model$call[1] <- parse(text = "plm()")[[1]]
    }
  }

  list(
    model = model, type_str = type_str, fe_str = fe_str, cl_str = cl_str,
    p = p, se = se, omit_vars = omit_vars
  )
}


#' @title Prepares a Regression Table
#'
#' @description
#' Builds a regression table based on a set of user-specified models or a single model and a partitioning variable.
#'
#' @param df Data frame containing the data to estimate the models on.
#' @param dvs A character vector containing the variable names for the dependent variable(s).
#' @param idvs A character vector or a a list of character vectors containing the variable names of the independent variables.
#' @param feffects A character vector or a a list of character vectors containing the variable names of the fixed effects.
#' @param clusters A character vector or a a list of character vectors containing the variable names of the cluster variables.
#' @param models A character vector indicating the model types to be estimated ('ols', 'logit', or 'auto')
#' @param byvar A factorial variable to estimate the model on (only possible if only one model is being estimated).
#' @param format A character scalar that is passed on \code{\link[stargazer]{stargazer}} as \code{type} to determine the presentation
#'   format ("html", "text", or "latex").
#'
#' @return A list containing two items
#' \describe{
#'  \item{"models"}{A list containing the model results and by values if appropriate}
#'  \item{"table"}{The output of \code{\link[stargazer]{stargazer}} containing the table}
#' }
#'
#' @details
#' This is a wrapper function calling the stargazer package. For numeric
#'   dependent variables the models are estimated using \code{\link[stats]{lm}}
#'   for models without and \code{\link[plm]{plm}} for models with fixed
#'   effects. Binary dependent variable models are estimated using
#'   \code{\link[stats]{glm}} (with \code{family = binomial(link="logit")}).
#'   You can override this behavior by specifying the model with the
#'   \code{models} parameter. Multinomial logit models are not supported.
#'   Clustered standard errors are estimated using \code{plm}'s robust
#'   covariance matrix estimators for OLS and
#'   \code{\link[multiwayvcov]{cluster.vcov}} for logit models.
#'   Only up to two dimensions are supported for fixed effects and standard
#'   error clusters need to be also present as fixed effects.
#'   If run with \code{byvar}, only levels that have more observations than
#'   coefficients are estimated.
#'
#'
#' @examples
#' df <- data.frame(year = as.factor(floor(stats::time(datasets::EuStockMarkets))),
#'                  datasets::EuStockMarkets)
#' dvs = c("DAX", "SMI", "CAC", "FTSE")
#' idvs = list(c("SMI", "CAC", "FTSE"),
#'             c("DAX", "CAC", "FTSE"),
#'             c("SMI", "DAX", "FTSE"),
#'             c("SMI", "CAC", "DAX"))
#' feffects = list("year", "year", "year", "year")
#' clusters = list("year", "year", "year", "year")
#' t <- prepare_regression_table(df, dvs, idvs, feffects, clusters, format = "text")
#' t$table
#' t <- prepare_regression_table(df, "DAX", c("SMI", "CAC", "FTSE"), byvar="year", format = "text")
#' print(t$table)
#' @export

prepare_regression_table <- function(df, dvs, idvs, feffects = rep("", length(dvs)),
                                     clusters = rep("", length(dvs)),
                                     models = rep("auto", length(dvs)),
                                     byvar = "", format = "html") {
  if(!is.data.frame(df)) stop("df needs to be a dataframe")
  df <- as.data.frame(df)
  if (byvar != "") if(!is.factor(df[,byvar])) stop("'byvar' needs to be a factor.")
  if ((length(dvs) > 1) & byvar != "") stop("you cannot subset multiple models in one table")
  if ((length(dvs) > 1) &&
      ((length(dvs) != length(idvs)) | (length(dvs) != length(feffects)) | (length(dvs) != length(clusters) | (length(dvs) != length(models)))))
    stop("'dvs', 'idvs', 'feffects', 'clusters' and 'models' need to be of equal lenghth.")

  datalist <- list()
  if (byvar != "") {
    datalist <- list(dvs = dvs,
                     idvs = idvs,
                     feffects = feffects,
                     clusters = clusters,
                     models = models)
    vars <- c(byvar, dvs, idvs, feffects, clusters)
    vars <- vars[which(!vars %in% "")]
    df <- df[stats::complete.cases(df[, vars]), vars]
    bylevels <- unique(as.character(df[,byvar]))[order(unique(df[,byvar]))]
    n_by_level <- unlist(lapply(bylevels, function(x) nrow(df[df[,byvar] == x,])))
    bylevels <- bylevels[n_by_level > length(idvs) + 1]
    if (length(bylevels) < 1) stop("no by levels with sufficient degrees of freedom to estimate model")
    mby <- lapply(bylevels, function(x) estimate_model(df[df[,byvar] == x,], datalist))
    models <- list()
    models[[1]] <- estimate_model(df, datalist)
    models[[1]]$byvalue <- "Full Sample"
    for (i in 2:(length(mby) + 1)) {
      models[[i]] <- mby[[i-1]]
      models[[i]]$byvalue <- bylevels[i-1]
    }
  } else {
    if (length(dvs) > 1) {
      for (i in 1:length(dvs))
        datalist[[i]] <- list(dvs = dvs[[i]],
                              idvs = idvs[[i]],
                              feffects = feffects[[i]],
                              clusters = clusters[[i]],
                              models = models[[i]])
      models <- lapply(datalist, function (x) estimate_model(df, x))
    } else {
      datalist <- list(dvs = dvs,
                       idvs = idvs,
                       feffects = feffects,
                       clusters = clusters,
                       models = models)
      models <- list(estimate_model(df, datalist))
    }
  }

  mt_str <- "Estimator"
  success_str <- "Probability estimated"
  fe_str <- "Fixed effects"
  cl_str <- "Std. errors clustered"
  n_str <- "Observations"
  r2_str <- "$R^{2}$"
  adjr2_str <- "Adjusted $R^{2}$"
  pr2_str <- "Pseudo $R^{2}$"

  m <- vector("list", length(models))
  mtype <- vector("list", length(models))
  ret <- vector("list", length(models))
  p <- vector("list", length(models))
  se <- vector("list", length(models))
  ols_present <- FALSE
  logit_present <- FALSE
  omit_vars <- NULL

  for (i in 1:length(models)) {
    m[[i]] <- models[[i]]$model
    if (i == 1) mtype <- models[[i]]$type_str
    else mtype <- c(mtype, models[[i]]$type_str)
    mt_str <- c(mt_str, mtype[[i]])

    if (models[[i]]$fe_str != "")  fe_str <- c(fe_str, models[[i]]$fe_str)
    else fe_str <- c(fe_str, "None")
    if (models[[i]]$cl_str != "")  cl_str <- c(cl_str, models[[i]]$cl_str)
    else cl_str <- c(cl_str, "No")

    n_str <- c(n_str, format(stats::nobs(m[[i]]), big.mark = ","))

    if (models[[i]]$type_str == "ols") {
      ols_present <- TRUE
      if (models[[i]]$fe_str == "") {
        r2_str <- c(r2_str, sprintf("%.3f", summary(m[[i]])$r.squared))
        adjr2_str <- c(adjr2_str, sprintf("%.3f", summary(m[[i]])$adj.r.squared))
      } else {
        r2_str <- c(r2_str, sprintf("%.3f", summary(m[[i]])$r.squared)[1])
        adjr2_str <- c(adjr2_str, sprintf("%.3f", summary(m[[i]])$r.squared)[2])
      }

      success_str <- c(success_str, "")
      pr2_str <- c(pr2_str, "")
    }

    if (models[[i]]$type_str == "logit") {
      logit_present <- TRUE
      pr2_str <- c(pr2_str, sprintf("%.3f", m[[i]]$pseudo_r2))
      success_str <- c(success_str, m[[i]]$success)
      r2_str <- c(r2_str, "")
      adjr2_str <- c(adjr2_str, "")
    }

    ret[[i]] <- models[[i]]
    if (!is.null(models[[i]]$se)) se[[i]] <- models[[i]]$se
    if (!is.null(models[[i]]$p)) p[[i]] <- models[[i]]$p
    if (byvar != "") {
      if (i == 1) labels <- models[[i]]$byvalue
      else labels <- c(labels, models[[i]]$byvalue)
    }
    if (!is.null(models[[i]]$omit_vars)) {
      if (is.null(omit_vars)) omit_vars <- models[[i]]$omit_vars
      else omit_vars <- c(omit_vars, models[[i]]$omit_vars)
    }
  }
  dvs <- escape_for_latex(dvs)
  fe_str <- escape_for_latex(fe_str)
  cl_str <- escape_for_latex(cl_str)

  # The following is not being used for the time being as I am unable to figure
  # out the logic that stargazer uses when multicolumn ist set to TRUE
  # dvs <- dvs[c(TRUE, (dvs[-length(dvs)] != dvs[-1]) | (mtype[-length(mtype)] != mtype[-1]))]

  if (ols_present & logit_present)
    add.lines <- list(mt_str, success_str, fe_str, cl_str, r2_str, adjr2_str, pr2_str)
  else if (logit_present)
    add.lines <- list(mt_str, success_str, fe_str, cl_str, pr2_str)
  else add.lines <- list(mt_str, fe_str, cl_str, n_str, r2_str, adjr2_str)

  if (!is.null(omit_vars)) {
    omit_vars <- unique(c("^Constant$", paste0("^", omit_vars, "$")))
  }
  if (byvar != "") {
    # Stargazer 5.2.2 seems to have a bug on how column.labels are treated in text/html
    # Escaping labels does not help and even sometimes introduces an error.
    labels <- gsub("&", "+", labels, fixed = TRUE)
    labels <- gsub("_", "\\_", labels, fixed = TRUE)
    labels <- gsub("$", "USD", labels, fixed = TRUE)
    htmlout <- utils::capture.output(stargazer::stargazer(m,
                                                   type=format,
                                                   column.labels = labels,
                                                   dep.var.labels = dvs,
                                                   model.names = FALSE,
                                                   omit.stat = "all",
                                                   omit = omit_vars,
                                                   se = se,
                                                   p = p,
                                                   add.lines = add.lines))
  } else htmlout <- utils::capture.output(stargazer::stargazer(m,
                                         type=format,
                                         dep.var.labels = dvs,
                                         model.names = FALSE,
                                         multicolumn = FALSE,
                                         omit.stat = "all",
                                         omit = omit_vars,
                                         se = se,
                                         p = p,
                                         add.lines = add.lines))

  list(models = ret, table = htmlout)
}