maar: Tidy Inference Under Misspecified Statistical Models in R

Documented in comp_boot_emp comp_boot_emp_samples fit_reg

#' Draw bootstrap sample from the data
#'
#' \code{comp_boot_emp_sample} draws one bootstrap samples
#' (i.e. sampling observations with replacement) from the input
#' \code{data}. It returns a list containing the indices of the
#' \code{m} observations that have been sampled (\code{indices}) and
#' the bootstrapped dataset (\code{data}).
#'
#' @param data A tibble or data frame containing the dataset to be
#'   sampled from.
#' @param B Bootstrap repetitions or number of bootstrap samples to
#'   be drawn.
#' @param m Number of observations to be sampled with replacement from
#'   the original dataset for each bootstrap repetition.
#' @param replace TODO: ADD
#'
#' @return A tibble containing the number of bootstrap iteration, the
#'   bootstrap samples (\code{data}),
#'   and the indices of the observations (linked to the original sample) in the
#'   sample (\code{indices}).
#'
#' @keywords internal
#'
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' # Draw B=100 bootstrap samples of size m=5
#' set.seed(162632)
#' n <- 100
#' boot <- comp_boot_emp_samples(
#'   data.frame(
#'     y = stats::rnorm(n, 0, 1),
#'     x = stats::rnorm(n, 0, 1)
#'   ),
#'   B = 100,
#'   m = 5
#' )
#'
#' # Display the output
#' print(boot)
#' }
comp_boot_emp_samples <- function(data,
                                  B = 1,
                                  m = NULL,
                                  replace = TRUE) {
  n <- nrow(data)
  if (is.null(m)) {
    m <- n
  }
  # this is a simplified version of rsample because rsample currently
  # does not allow for sampling m!=n observations from the data
  indices <- purrr::map(rep(n, B), sample, replace = replace, size = m)

  out <- tibble::tibble(
    b = as.integer(paste0(1:length(indices))),
    data = purrr::map(indices, ~ dplyr::slice(data, .x)),
    indices = indices
  )

  return(out)
}


#' Fit OLS or GLM on the data
#'
#' \code{fit_reg} fits an OLS or GLM on the input dataset.
#'
#' @details The model specification obtained from \code{mod_fit}, of
#'   class \code{\link[stats]{lm}} or \code{\link[stats]{glm}}, is
#'   fitted on \code{data}. The user can choose to fit a weighted
#'   regression using the argument \code{weights}.
#'
#' @param mod_fit An object of class \code{\link[stats]{lm}} or
#'   \code{\link[stats]{glm}} to fit on the data. This object should contain
#'   the formula, the data, and, in case of \code{\link[stats]{glm}}, the
#'   glm family.
#' @param data A tibble or data frame containing the dataset on which the
#'   model will be trained.
#' @param weights A character corresponding to the name of the weights feature
#'   name in the data.
#'
#' @return A tibble containing the estimated coefficients (\code{term}) of
#'   the regressors (\code{term}).
#'
#' @keywords internal
#'
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' set.seed(457468)
#' # Estimate ols from a bootstraped dataset
#' n <- 1e3
#' X <- stats::rnorm(n, 0, 1)
#' y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
#' lm_fit <- stats::lm(y ~ X)
#' boot <- comp_boot_emp_samples(model.frame(lm_fit))$data[[1]]
#' mod <- fit_reg(lm_fit, boot)
#'
#' # Display the output
#' print(mod)
#'
#' #' # Estimate OLS from a dataset with weights
#' n <- 1e3
#' X <- stats::rnorm(n, 0, 1)
#' y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
#' mod_fit <- lm(y ~ X)
#'
#' # fit weighted regression
#' reg_df <- tibble::tibble(y = y, X = X, weights = 1:length(X))
#' mod <- fit_reg(mod_fit, reg_df, "weights")
#' print(mod)
#'
#' # fit unweighted regression
#' mod <- fit_reg(mod_fit, reg_df)
#' print(mod)
#' # compare this output with the output from lm
#' coef(lm(y ~ X, reg_df))
#' }
fit_reg <- function(mod_fit, data, weights = NULL) {
  if (all("lm" == class(mod_fit))) {
    if (is.null(weights)) {
      out <- stats::lm(
        formula = stats::formula(mod_fit),
        data = data
      )
    } else {
      out <- stats::lm(
        formula = stats::formula(mod_fit),
        data = data,
        weights = weights
      )
    }
  } else {
    if (is.null(weights)) {
      out <- stats::glm(
        formula = stats::formula(mod_fit),
        data = data,
        family = stats::family(mod_fit)
      )
    } else {
      out <- stats::glm(
        formula = stats::formula(mod_fit),
        data = data,
        family = stats::family(mod_fit),
        weights = weights
      )
    }
  }

  out <- tibble::tibble(
    term = names(stats::coef(out)),
    estimate = stats::coef(out)
  )
  return(out)
}


#' A wrapper for the empirical bootstrap of a fitted OLS or GLM regression model
#'
#' \code{comp_boot_emp} is a wrapper for the empirical bootstrap of
#' a fitted \code{\link[stats]{lm}} or \code{\link[stats]{glm}} model.
#'
#' @details The empirical bootstrap consists of fitting the chosen statistical
#'   model (\code{mod_fit}) onto \code{B} bootstrap versions of size \code{m}
#'   of the dataset.
#'
#' @param mod_fit An object of class \code{\link[stats]{lm}} or
#'   \code{\link[stats]{glm}} to fit on the data. This object should contain
#'   the formula, the data, and, in case of \code{\link[stats]{glm}},
#'   the family.
#' @param B Bootstrap repetitions or number of bootstrap samples to be drawn.
#' @param m Number of observations to be sampled with replacement from the
#'   dataset for each bootstrap repetition.
#' @param replace TODO: ADD
#'
#' @return A list containing the following elements.
#'   \code{var_type}: The type of estimator for the variance of the coefficients
#'   estimates. An abbreviated string representing the
#'   type of the estimator of the variance  (\code{var_type_abb}).
#'   \code{var_summary}: A tibble containing the summary statistics for the model:
#'   terms (\code{term}), standard errors (\code{std.error}),
#'   statistics (\code{statistic}), p-values (\code{p.values}). The format
#'   of the tibble is exactly identical to the one generated by
#'   \code{\link[broom]{tidy}}, but the standard errors and p-values are computed
#'   via the bootstrap.
#'   \code{var_assumptions}: The assumptions under which the estimator of the
#'   variance is consistent.
#'   \code{cov_mat}: The covariance matrix of the coefficients estimates.
#'   \code{boot_out}: A tibble of the model's coefficients estimated (\code{term} and
#'   \code{estimate}) on the bootstrapped datasets,
#'   the size of the original dataset (\code{n}), and the number of the
#'   bootstrap repetition (\code{b}). In case of empirical bootstrap, it will
#'   also contain the size of each bootstrapped dataset (\code{m}).
#'
#' @keywords internal
#'
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' # Obtain estimates of the coefficients on bootstrapped versions of the dataset
#' set.seed(35542)
#' n <- 1e3
#' X <- stats::rnorm(n, 0, 1)
#' y <- 2 + X * 1 + stats::rnorm(n, 0, 1)
#' lm_fit <- stats::lm(y ~ X)
#' out <- comp_boot_emp(lm_fit, B = 100, m = 1000)
#'
#' print(out)
#' }
comp_boot_emp <- function(mod_fit, B = 100, m = NULL, replace = TRUE) {
  assertthat::assert_that(all("lm" == class(mod_fit)) | any("glm" == class(mod_fit)),
                          msg = glue::glue("mod_fit must only be of class lm or glm")
  )
  check_fn_args(B = B, m = m)

  data <- stats::model.frame(mod_fit)
  n <- nrow(data)
  if (is.null(m)) {
    m <- n
  }

  assertthat::assert_that(replace || (!replace & m <= n),
                            msg = glue::glue("m must be less or equal to n for subsampling")
  )

  boot_type <- ifelse(replace, 'emp', 'sub')

  boot_out <- lapply(
    1:B,
    function(x) {
      fit_reg(
        mod_fit = mod_fit,
        data = comp_boot_emp_samples(data, B = 1, m = m, replace = replace)$data[[1]]
      )
    }
  )

  cov_mat <- boot_out %>%
    purrr::map(~ .x %>% dplyr::pull(estimate)) %>%
    dplyr::bind_rows(data = ., .id = NULL) %>%
    stats::cov(x = .) * m / n # check in case of subsampling

  boot_out <- boot_out %>%
    dplyr::bind_rows(.id = "b") %>%
    tidyr::nest(.data = .,
                boot_out = c(.data$estimate, .data$term)) %>%
    tibble::add_column(m = m, n = n)

  summary_boot <- get_boot_summary(
    mod_fit = mod_fit,
    boot_out = boot_out,
    boot_type = boot_type # check if subsampling needs any modification
  )

  out <- get_mms_comp_var_ind(var_type_abb = boot_type,
                              summary_tbl = summary_boot,
                              cov_mat = cov_mat,
                              B = B,
                              m = m,
                              n = n,
                              weights_type = NULL,
                              boot_out = boot_out)

  return(out)
}

#' Confidence intervals for on bootstrapped datasets via percentile bootstrap
#'
#' \code{comp_ci_boot} produces confidence intervals for regression
#' models estimates on bootstrapped datasets, via percentile bootstrap.
#'
#' @details `comp_ci_boot` takes in a set of bootstrap estimates
#'   (\code{boot_out}), a series of probabilities for the quantiles
#'   (\code{probs}), and optionally some "grouping" terms (\code{group_vars}).
#'   It returns the corresponding quantiles.
#'
#' @param boot_out A tibble of the model's coefficients estimated (\code{term}
#'   and \code{estimate}) on the bootstrapped datasets, the size of each
#'   bootstrapped dataset (\code{m}), the size of the original dataset
#'   (\code{n}), and the number of the bootstrap repetition (\code{b}).
#' @param probs A numeric vector containing the probabilities of the quantiles
#'   to be computed.
#' @param group_vars A vector of characters including the variables used
#'   to form the groups.
#'
#' @return A tibble containing the quantiles (\code{x}) and the
#'   probabilities (\code{q}) for each group as specified by \code{group_vars}.
#'
#' @keywords internal
#'
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' # Get confidence interval for OLS estimates via bootstrap
#' n <- 1e3
#' X1 <- stats::rnorm(n, 0, 1)
#' X2 <- stats::rnorm(n, 0, 3)
#' y <- 2 + X1 + X2 * 0.3 + stats::rnorm(n, 0, 1)
#' reg_df <- tibble::tibble(y = y, X1 = X1, X2 = X2, n_obs = 1:length(X1))
#' mod_fit <- stats::lm(y ~ X1 + X2, reg_df)
#' boot_out <- comp_boot_emp(mod_fit)
#' conf_int <- comp_ci_boot(boot_out, c(0.05, 0.95)) %>%
#'   tidyr::pivot_wider(names_from = q, values_from = x)
#'
#' # Display the output
#' print(conf_int)
#' }
comp_ci_boot <- function(boot_out, probs = c(0.025, 0.975),
                         group_vars = "term") {
  out <- boot_out %>%
    tidyr::unnest(boot_out) %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) %>%
    dplyr::summarize(.data = .,
                     x = stats::quantile(sqrt(.data$m / .data$n) * (.data$estimate - mean(.data$estimate)) + mean(.data$estimate),
                                         probs = probs
                     ),
                     q = probs,
                     .groups = "keep"
    )
  return(out)
}