R/standard_errors.R

Defines functions standard_errors get_strata columns_match_vector all_match subsample bootstrap weighted_bootstrap resample_qs getWeights subsampleRows bootstrapRows leaveRows getRows

Documented in all_match bootstrap bootstrapRows columns_match_vector getRows get_strata getWeights leaveRows resample_qs standard_errors subsample subsampleRows weighted_bootstrap

#' Function which gets resampled rows
#' @param rows rows to resample
#' @param sampling_method method for resampling
#' @param ... other arguments passed on to the method
getRows <- function(rows, sampling_method, ...) {
  do.call(sampling_method, args = list(rows = rows, ...))
}


#' Function that doesn't reorder rows (for weighted bootstrap)
#' @param rows rows to potentially resample
#' @param ... other arguments, ignored
leaveRows <- function(rows, ...) {
  rows
}

#' Function that fully bootstraps rows
#' @param rows rows to potentially resample
#' @param ... other arguments, ignored
bootstrapRows <- function(rows, ...) {
  sample(rows, replace = T)
}

#' Function that fully bootstraps rows
#' @param rows rows to potentially resample
#' @param subsample_percent percentage of data to use in subsampling
#' @param cluster_index number of weights to draw
#' @param ... other arguments, ignored
subsampleRows <- function(rows, subsample_percent, cluster_index, ...) {
  uci = unique(cluster_index)
  idx = sample(uci,
               floor(subsample_percent*length(uci)),
               replace = F)
  which(cluster_index %in% idx)

}

#' Function that returns the correct weights for weighted bootstrap
#' @param weights vector of existing regression weights
#' @param draw_weights whether draw random exponential weights
#' @param cluster_index number of weights to draw
getWeights <- function(weights, draw_weights, cluster_index) {
  uci = unique(cluster_index)
  n = length(uci)
  if(draw_weights) {
    new_weights <- pmax(stats::rexp(n),1e-3)
    wframe <- merge(data.frame(cluster_index = cluster_index),
                    data.frame(cluster_index = uci, w = new_weights))
    wframe <- merge(wframe, as.data.frame(table(cluster_index)))
    new_weights = wframe$w / wframe$Freq
    if(is.null(weights)) {
      w = new_weights
    } else {
      w = new_weights * weights
    }
    w
  } else {
    weights
  }
}

#' @rdname standard_errors
#' @param subsample_percent A number between 0 and one, specifying the percent of the data to subsample for standard error calculations
#' @param draw_weights Whether to use random exponential weights for bootstrap, either TRUE or FALSE
#' @param sampling_method One of "leaveRows", "subsampleRows", or "bootstrapRows".
#' @param cluster_index index for clusters to sample
#' @param ... Other arguments, ignored for now
resample_qs <- function(X,
                        y,
                        weights,
                        sampling_method,
                        alpha,
                        jstar,
                        control,
                        algorithm,
                        draw_weights,
                        var_names,
                        subsample_percent,
                        cluster_index,
                        ...) {


  if(!grepl("Rows$", sampling_method)) {
    sampling_method <- paste0(sampling_method, "Rows")
  }

  rows <- getRows(1:nrow(X), sampling_method = sampling_method,
                  subsample_percent = subsample_percent,
                  cluster_index = cluster_index)
  weights <- getWeights(weights, draw_weights,
                        cluster_index = cluster_index[rows])
  if(!is.null(weights)) {
    weights <- weights / sum(weights)
  }

  tryCatch({

  quantreg_spacing(
    y = y[rows],
    X = X[rows,],
    var_names = var_names,
    alpha = alpha,
    jstar = jstar,
    control = control,
    weights = weights,
    algorithm = algorithm,
    ...)

  }, error = function(e) {
    warning(e$message)
    return(NA)
    })
}

#' @rdname standard_errors
#' @export
weighted_bootstrap <- function(X,
                               y,
                               weights,
                               sampling_method,
                               alpha,
                               jstar,
                               control,
                               algorithm,
                               draw_weights,
                               var_names,
                               subsample_percent,
                               cluster_index,
                               ...) {
  resample_qs(X = X, y = y, weights = weights,
           sampling_method = "leaveRows", draw_weights = T,
           alpha = alpha, jstar = jstar, control = control,
           algorithm = algorithm, subsample_percent = 1,
           var_names = var_names,
           cluster_index = cluster_index, ...)
}

#' @rdname standard_errors
#' @export
bootstrap <- function(X,
                      y,
                      weights,
                      sampling_method,
                      alpha,
                      jstar,
                      control,
                      algorithm,
                      draw_weights,
                      var_names,
                      subsample_percent,
                      cluster_index,
                      ...) {
  resample_qs(X = X, y = y, weights = weights,
              sampling_method = "bootstrapRows", draw_weights = F,
              alpha = alpha, jstar = jstar, control = control,
              algorithm = algorithm, subsample_percent = 1,
              var_names = var_names,
              cluster_index = cluster_index, ...)
}

#' @rdname standard_errors
subsample <- function(X,
                      y,
                      weights,
                      sampling_method,
                      alpha,
                      jstar,
                      control,
                      algorithm,
                      draw_weights,
                      var_names,
                      subsample_percent,
                      cluster_index,
                      ...) {
  resample_qs(X = X, y = y, weights = weights,
              sampling_method = "subsampleRows", draw_weights = F,
              alpha = alpha, jstar = jstar, control = control,
              algorithm = algorithm, subsample_percent = subsample_percent,
              var_names = var_names,
              cluster_index = cluster_index, ...)
}

#' check if all values of two vectors match
#' @param x first vector
#' @param y second vector
#' @importFrom assertthat assert_that
all_match <- function(x, y) {
  all(x == y)
}

#' Check which rows of a matrix match a whole vector
#' @param X first matrix
#' @param Y vector to match
#' @importFrom assertthat assert_that
columns_match_vector <- function(X, Y) {
  assertthat::assert_that(ncol(X) == length(Y))

  matched_cols <- vector()

  for(i in 1:nrow(X)) {
    matched_cols[i] <- all_match(X[i, ], Y)
  }

  matched_cols

}

#' Get clusters for subsampling given a formula
#' @param cluster_matrix matrix of cluster column values
get_strata <- function(cluster_matrix) {

  if(is.vector(cluster_matrix)) {
    cluster_matrix <- as.matrix(cluster_matrix, ncol = 1)
  }

  if(is.data.frame(cluster_matrix)) {
    cluster_matrix <- as.matrix(cluster_matrix)
  }

  levels <- unique(cluster_matrix)
  level_vec <- rep(NA, length(levels))

  for(i in 1:nrow(levels)) {
    level_vec[which(columns_match_vector(cluster_matrix, levels[i,]))] <- i
  }

  level_vec
}

#' Computes standard errors for the quantile regression spacing method using
#' subsampling.
#' @param X Regression specification matrix.
#' @param y Column of response variable.
#' @param var_names RHS regression variable names.
#' @param alpha Quantiles to be estimated.
#' @param jstar First quantile to be estimated (usually the center one)
#' @param parallel whether to run in parallel or not
#' @param control control parameters to pass to the control arguments of [`quantreg_spacing`],
#' the lower-level function called by [`qs`]. This is a named list, with possible elements including:
#' * `trunc`: whether to truncate residual values below the argument "small"
#' * `small`: level of "small" values to guarentee numerical stability. If not specified, set dynamically based on the standard deviation of the outcome variable.
#' * `lambda`: For penalized regression, you can specify a level of lambda which will weight the penalty. If not set, will be determined based on 10-fold cross-validation.
#' * `output_quantiles`: whether to save fitted quantiles as part of the function output
#' * `calc_avg_me`: whether to return average marginal effects as part of the fitted object
#' * Any other arguments passed to specific downstream quantile regression algorithms (e.g. rq.fit).
#' @param cluster_matrix Matrix of cluster variables, as returned by a model formula
#' @param std_err_control control parameters to pass to the control arguments of [`quantreg_spacing`],
#' the lower-level function called by [`standard_errors`]. Possible arguments include:
#' * `se_method`: Method to use for standard errors, either "weighted_bootstrap",
#' "subsample", "bootstrap" or "custom" along with a specified subsampling method and
#' subsample percent. If specifying "custom", must also specify `subsampling_percent` and
#' `draw_weights`. If you specify "subsample", subsampling percent defaults to 0.2, but can be
#' changed. See details for details.
#' * `num_bs`: Number of bootstrap iterations to use, defaults to 100.
#' * `subsample_percent`: A number between 0 and one, specifying the percent of the data to subsample for standard error calculations
#' * `draw_weights`: Whether to use random exponential weights for bootstrap, either TRUE or FALSE
#' * `sampling_method` One of "leaveRows", "subsampleRows", or "bootstrapRows".
#' leaveRows doesn't resample rows at all. subsampleRows samples without replacement
#' given some percentage of the data (specified via subsample_percent), and boostrapRows
#' samples with replacement.`
#' @param weights vector of same length and order as dependent column, to be used as weights for estimation
#'              (note, if draw weights is set to TRUE, this variable will be the element-wise product
#'              of itself and a random vector of weights)
#' @param algorithm function which is actually used to fit each quantile regression
#' @param seed Seed to be used when generating RNG
#' @param ... other arguments passed to quantile fitting function
#' @importFrom methods is
#' @importFrom stats rexp
#' @importFrom stats cov
#' @importFrom purrr pmap
#' @importFrom stats dnorm
#' @importFrom future plan
#' @importFrom future sequential
#' @importFrom future.apply future_lapply
#' @importFrom purrr quietly
#' @export
standard_errors <- function(y,
                            X,
                            cluster_matrix,
                            algorithm,
                            control = qs_control(),
                            std_err_control = se_control(),
                            var_names,
                            alpha,
                            jstar,
                            parallel = F,
                            weights = NULL,
                            seed = NULL,
                            ...) {

  # if they set parallel to false, we still
  # want to restore the old plan afterwards
  if(!parallel) {
    future::plan(future::sequential)
  }

  if(is.null(seed)) {
    seed = T
  }

  if(parallel) {
    ncores <- getCores()
    setCores(ncores)
  }

  if(is.null(cluster_matrix)) {
    # if there are no clusters, resample and weight rows
    cluster_index <- 1:nrow(X)
  } else {
    # otherwise sample and weight clusters (like a block bootstrap)
    cluster_index <- get_strata(cluster_matrix)
  }

  if(std_err_control$se_method %in% c("bootstrap", "weighted_bootstrap")) {
    subsample_percent = 1
  } else {
    subsample_percent = std_err_control$subsample_percent
  }

  num_clusters <- length(unique(cluster_index))
  num_bs <- std_err_control$num_bs

  fits <- future.apply::future_lapply(1:num_bs,
                                      future.seed = seed,
                            purrr::quietly(
                            function(x, X_mat,
                                     y,
                                     cluster_index,
                                     num_clusters,
                                     algorithm,
                                     se_method,
                                     var_names,
                                     alpha,
                                     jstar,
                                     control,
                                     weights,
                                     subsample_percent,
                                     draw_weights,
                                     sampling_method = sampling_method,
                                     ...) {

                                    do.call(se_method,
                                          args = list(X = X_mat,
                                                      y = y,
                                                      weights = weights,
                                                      alpha = alpha, jstar = jstar, control = control,
                                                      algorithm = algorithm,
                                                      subsample_percent = subsample_percent,
                                                      var_names = var_names,
                                                      draw_weights = draw_weights,
                                                      sampling_method = sampling_method,
                                                      cluster_index = cluster_index,
                                                      ...))
  }), X_mat = X, se_method = std_err_control$se_method, cluster_index = cluster_index,
  y = y, weights = weights, alpha = alpha,
  jstar = jstar, var_names = var_names, control = control,
  algorithm = algorithm,
  subsample_percent = subsample_percent, sampling_method = std_err_control$sampling_method,
  num_clusters = num_clusters, draw_weights = std_err_control$draw_weights,  ...)

  warns <- Filter(function(x) length(x) > 0, lapply(fits, function(x) x$warnings))

  if(length(warns) > 0) {
    if(length(unique(unlist(warns))) > 10) {
      warning("\n",length(warns), " out of ", num_bs, " samples produced warnings. Showing first 10:\n\t",
              paste0(head(unique(unlist(warns)), 10), collapse = "\n\t"))
    } else {
      warning("\n",length(warns), " out of ", num_bs, " samples produced warnings:\n\n",
              paste0(unique(unlist(warns)), collapse = "\n"))
    }


  }

  fit <- purrr::pmap(lapply(fits, function(x) x$result), rbind)
  quant_cov_mat <- stats::cov(fit$coef, use = "pairwise.complete.obs")
  quant_cov_mat <- quant_cov_mat * subsample_percent

  return(list('quant_cov' = quant_cov_mat,
              'warnings' = fit$warnings,
              'iter' = fit$iter,
              'counts' = fit$counts,
              'coef_boot' = fit$coef,
              'subsample_percent' = subsample_percent))
}
be-green/quantspace documentation built on March 20, 2024, 5:30 p.m.