ri2: Randomization Inference for Randomized Experiments

Documented in conduct_ri

#' Conduct Randomization Inference
#'
#' This function makes it easy to conduct three kinds of randomization inference.
#'
#' 1. Conduct hypothesis tests under the sharp null when the test statistic is the difference-in-means or covariate-adjusted average treatment effect estimate.
#' 2. Conduct "ANOVA" style hypothesis tests, where the f-statistic from two nested models is the test statistic. This procedure is especially helpful when testing interaction terms under null of constant effects.
#' 3. Arbitrary (scalar) test statistics
#'
#' @param formula an object of class formula, as in \code{\link{lm}}. Use formula when conducting significance tests of an Average Treatment Effect estimate under a sharp null hypothesis. For the difference-in-means estimate, do not include covariates. For the OLS covariate-adjusted estimate, include covariates.
#' @param model_1 an object of class formula, as in \code{\link{lm}}. Models 1 and 2 must be "nested." model_1 should be the "restricted" model and model_2 should be the "unrestricted" model.
#' @param model_2 an object of class formula, as in \code{\link{lm}}. Models 1 and 2 must be "nested." model_1 should be the "restricted" model and model_2 should be the "unrestricted" model.
#' @param test_function A function that takes data and returns a scalar test statistic.
#' @param assignment a character string that indicates which variable is randomly assigned. Defaults to "Z".
#' @param outcome a character string that indicates which variable is the outcome variable. Defaults to NULL.
#' @param declaration A random assignment declaration, created by \code{\link{declare_ra}}.
#' @param sharp_hypothesis either a numeric scalar or a numeric vector of length k - 1, where k is the number of treatment conditions. In a two-arm trial, this number is the *hypothesized* difference between the treated and untreated potential potential outcomes for each unit.. In a multi-arm trial, each number in the vector is the hypothesized difference in potential outcomes between the baseline condition and each successive treatment condition.
#' @param studentize logical, defaults to FALSE. Should the test statistic be the t-ratio rather than the estimated ATE? T-ratios will be calculated using HC2 robust standard errors or their clustered equivalent. CLUSTERING NOT YET IMPLEMENTED.
#' @param IPW logical, defaults to TRUE. Should inverse probability weights be calculated?
#' @param IPW_weights a character string that indicates which variable is the existing inverse probability weights vector. Usually unnecessary, as IPW weights will be incorporated automatically if IPW = TRUE. Defaults to NULL.
#' @param sampling_weights a character string that indicates which variable is the sampling weights vector. Optional, defaults to NULL. NOT YET IMPLEMENTED
#' @param permutation_matrix An optional matrix of random assignments, typically created by \code{\link{obtain_permutation_matrix}}.
#' @param data A data.frame.
#' @param sims the number of simulations. Defaults to 1000.
#' @param progress_bar logical, defaults to FALSE.  Should a progress bar be displayed in the console?
#' @param p Should "two-tailed", "upper", or "lower" p-values be reported? Defaults to "two-tailed".  For two-tailed p-values, whether or not a simulated value is as large or larger than the observed value is determined with respect to the distance to the sharp null.
#'
#' @export
#'
#' @importFrom randomizr declare_ra conduct_ra obtain_condition_probabilities
#' @importFrom estimatr lm_robust_fit tidy
#'
#' @examples
#'
#' # Data from Gerber and Green Table 2.2
#'
#'
#' # Randomization Inference for the Average Treatment Effect
#'
#' table_2.2 <-
#'     data.frame(d = c(1, 0, 0, 0, 0, 0, 1),
#'                y = c(15, 15, 20, 20, 10, 15, 30))
#'
#' ## Declare randomization procedure
#' declaration <- declare_ra(N = 7, m = 2)
#'
#' ## Conduct Randomization Inference
#' out <- conduct_ri(y ~ d,
#'                   declaration = declaration,
#'                   assignment = "d",
#'                   sharp_hypothesis = 0,
#'                   data = table_2.2)
#'
#' summary(out)
#' plot(out)
#' tidy(out)
#'
#' # Using a custom permutation matrix
#'
#' permutation_matrix <-
#'  matrix(c(0, 0, 0, 0, 0, 0, 1,
#'           0, 0, 0, 0, 0, 1, 0,
#'           0, 0, 0, 0, 1, 0, 0,
#'           0, 0, 0, 1, 0, 0, 0,
#'           0, 0, 1, 0, 0, 0, 0,
#'           0, 1, 0, 0, 0, 0, 0,
#'           1, 0, 0, 0, 0, 0, 0),
#'         ncol = 7)
#'
#' conduct_ri(y ~d, assignment = "d", data = table_2.2,
#'                    permutation_matrix = permutation_matrix)
#'
#'
#'
#'
#' # Randomization Inference for an Interaction
#'
#'
#' N <- 100
#' declaration <- randomizr::declare_ra(N = N, m = 50)
#'
#' Z <- randomizr::conduct_ra(declaration)
#' X <- rnorm(N)
#' Y <- .9 * X + .2 * Z + 1 * X * Z + rnorm(N)
#' dat <- data.frame(Y, X, Z)
#'
#' ate_obs <- coef(lm(Y ~ Z, data = dat))[2]
#'
#' out <-
#'   conduct_ri(
#'     model_1 = Y ~ Z + X,
#'     model_2 = Y ~ Z + X + Z * X,
#'     declaration = declaration,
#'     assignment = "Z",
#'     sharp_hypothesis = ate_obs,
#'     data = dat, sims = 100
#'   )
#'
#' plot(out)
#' summary(out)
#'
#' summary(out, p = "two-tailed")
#' summary(out, p = "upper")
#' summary(out, p = "lower")
#'
#' tidy(out)
#'
#' # Randomization Inference for arbitrary test statistics
#'
#' ## In this example we're conducting a randomization check (in this case, a balance test).
#'
#' N <- 100
#' declaration <- randomizr::declare_ra(N = N, m = 50)
#'
#' Z <- randomizr::conduct_ra(declaration)
#' X <- rnorm(N)
#' Y <- .9 * X + .2 * Z + rnorm(N)
#' dat <- data.frame(Y, X, Z)
#'
#' balance_fun <- function(data) {
#'     f_stat <- summary(lm(Z ~ X, data = data))$f[1]
#'     names(f_stat) <- NULL
#'     return(f_stat)
#' }
#'
#' ## confirm function works as expected
#' balance_fun(dat)
#'
#' ## conduct randomization inference
#'
#' out <-
#'   conduct_ri(
#'     test_function = balance_fun,
#'     declaration = declaration,
#'     assignment = "Z",
#'     sharp_hypothesis = 0,
#'     data = dat, sims = 100
#'   )
#'
#' plot(out)
#' summary(out)
#' tidy(out)
#'
conduct_ri <- function(formula = NULL,
                       model_1 = NULL,
                       model_2 = NULL,
                       test_function = NULL,
                       assignment = "Z",
                       outcome = NULL,
                       declaration = NULL,
                       sharp_hypothesis = 0,
                       studentize = FALSE,
                       IPW = TRUE,
                       IPW_weights = NULL,
                       sampling_weights = NULL,
                       permutation_matrix = NULL,
                       data,
                       sims = 1000,
                       progress_bar = FALSE,
                       p = "two-tailed") {
  # some error checking -----------------------------------------------------

  if (is.null(declaration) &
    is.null(permutation_matrix)) {
    stop("Please supply either a random assignment declaration or a permutation matrix")
  }
  if (is.null(declaration) & !is.null(permutation_matrix)) {
    declaration <- randomizr::declare_ra(permutation_matrix = permutation_matrix)
    permutation_matrix <- NULL
  }
  if(!assignment %in% names(data)) {
    stop(paste0("Assignment variable ", assignment, " not found in data."))
  }

  # Case 1: ATE -------------------------------------------------------------

  if (!is.null(formula)) {
    ri_out <- conduct_ri_ATE(
      formula = formula,
      assignment = assignment,
      declaration = declaration,
      sharp_hypothesis = sharp_hypothesis,
      studentize = studentize,
      IPW = IPW,
      IPW_weights = IPW_weights,
      sampling_weights = sampling_weights,
      permutation_matrix = permutation_matrix,
      data = data,
      sims = sims,
      progress_bar = progress_bar
    )
  }

  # Case 2: F-test ----------------------------------------------------------

  if (!is.null(model_1) & !is.null(model_2)) {
    ri_out <- conduct_ri_f(
      model_1 = model_1,
      model_2 = model_2,
      assignment = assignment,
      declaration = declaration,
      sharp_hypothesis = sharp_hypothesis,
      IPW = IPW,
      IPW_weights = IPW_weights,
      sampling_weights = sampling_weights,
      permutation_matrix = permutation_matrix,
      data = data,
      sims = sims,
      progress_bar = progress_bar
    )
  }

  # Case 3: Arbitrary Function ----------------------------------------------

  if (!is.null(test_function)) {
    ri_out <- conduct_ri_test_function(
      test_function = test_function,
      assignment = assignment,
      outcome = outcome,
      declaration = declaration,
      sharp_hypothesis = sharp_hypothesis,
      IPW_weights = IPW_weights,
      sampling_weights = sampling_weights,
      permutation_matrix = permutation_matrix,
      data = data,
      sims = sims,
      progress_bar = progress_bar
    )
  }

  if (is.null(formula) &
    is.null(model_1) & is.null(model_2) & is.null(test_function)) {
    stop("You must specify either a formula, models 1 and 2, or a test function.")
  }

  # deal with numerical instability
  ri_out$sims_df <-
    within(ri_out$sims_df, {
      est_sim <- round(est_sim, 10)
      est_obs <- round(est_obs, 10)
    })


  ri_out$p <- p
  ri_out$sharp_hypothesis <- sharp_hypothesis

  return(ri_out)
}

#' @export
#' @import ggplot2
#'
#'
plot.ri <- function(x, p = NULL, ...) {
  if(is.null(p)){p <- x$p}
  if (p == "two-tailed") {
    x$sims_df <-
      within(
        x$sims_df,
        extreme <- abs(est_sim - x$sharp_hypothesis) >= abs(est_obs - x$sharp_hypothesis)
      )
  } else if (p == "lower") {
    x$sims_df <-
      within(
        x$sims_df,
        extreme <- est_sim <= est_obs
      )
  } else if (p == "upper") {
    x$sims_df <-
      within(
        x$sims_df,
        extreme <- est_sim >= est_obs
      )
  } else {
    stop('p must be either "two-tailed" (the default), "lower", or "upper".')
  }

  summary_fun <-
    function(dat) {
      with(
        dat,
        data.frame(
          est_obs = unique(est_obs),
          Estimate = "Observed Value"
        )
      )
    }


  summary_df <- split(x$sims_df, x$sims_df$term)
  summary_df <- lapply(summary_df[lapply(summary_df, nrow) != 0], FUN = summary_fun)
  summary_df <- do.call(rbind, summary_df)

  summary_df$term <- rownames(summary_df)

  ggplot(x$sims_df, aes(x = est_sim, alpha = extreme)) +
    geom_histogram(bins = max(30, nrow(x$sims_df) / 20)) +
    geom_vline(
      data = summary_df,
      aes(
        xintercept = est_obs,
        linetype = Estimate,
        colour = Estimate
      ),
      show.legend = TRUE
    ) +
    scale_alpha_manual(values = c(0.5, 1), guide = "none") +
    xlab("Simulated Estimates") +
    ggtitle("Randomization Inference") +
    facet_wrap(~ term) +
    theme_bw() +
    theme(
      legend.position = "bottom",
      axis.title.y = element_blank()
    )
}

#' @export
print.ri <- function(x, p = NULL, ...) {
  print(summary(x, p))
  invisible(summary(x, p))
}

#' @export
#' @importFrom stats quantile
summary.ri <- function(object, p = NULL, ...) {
  if(is.null(p)){p <- object$p}
  if (p == "two-tailed") {
    object$sims_df <-
      within(
        object$sims_df,
        extreme <- abs(est_sim - object$sharp_hypothesis) >= abs(est_obs - object$sharp_hypothesis)
      )
  } else if (p == "lower") {
    object$sims_df <-
      within(
        object$sims_df,
        extreme <- est_sim <= est_obs
      )
  } else if (p == "upper") {
    object$sims_df <-
      within(
        object$sims_df,
        extreme <- est_sim >= est_obs
      )
  } else {
    stop('p must be either "two-tailed" (the default), "lower", or "upper".')
  }


  summary_fun <-
    function(dat) {
      with(
        dat,
        data.frame(
          estimate = unique(est_obs),
          p_value = mean(extreme)
        )
      )
    }

  return_df <- split(object$sims_df, object$sims_df$term)
  return_df <- lapply(return_df[lapply(return_df, nrow) != 0], FUN = summary_fun)
  return_df <- do.call(rbind, return_df)
  return_df$term <- rownames(return_df)
  rownames(return_df) <- NULL
  return_df <-
    return_df[, c(
      "term",
      "estimate",
      "p_value"
    )]


  if (p == "two-tailed") {
    colnames(return_df)[3] <- "two_tailed_p_value"
  } else if (p == "lower") {
    colnames(return_df)[3] <- "lower_p_value"
  } else if (p == "upper") {
    colnames(return_df)[3] <- "upper_p_value"
  }
  return(return_df)
}

#' @export
tidy.ri <- function(x, ...) {
  ret <- summary(x)
  colnames(ret)[3] <- "p.value"
  ret
}