Nothing
#' Compute confidence interval
#'
#' @description
#'
#' Compute a confidence interval around a summary statistic. Both
#' simulation-based and theoretical methods are supported, though only
#' `type = "se"` is supported for theoretical methods.
#'
#' Learn more in `vignette("infer")`.
#'
#' @param x A distribution. For simulation-based inference, a data frame
#' containing a distribution of [calculate()]d statistics
#' or [`fit()`][fit.infer()]ted coefficient estimates. This object should
#' have been passed to [generate()] before being supplied or
#' [calculate()] to [`fit()`][fit.infer()]. For theory-based inference,
#' output of [assume()]. Distributions for confidence intervals do not
#' require a null hypothesis via [hypothesize()].
#' @param level A numerical value between 0 and 1 giving the confidence level.
#' Default value is 0.95.
#' @param type A string giving which method should be used for creating the
#' confidence interval. The default is `"percentile"` with `"se"`
#' corresponding to (multiplier * standard error) and `"bias-corrected"` for
#' bias-corrected interval as other options.
#' @param point_estimate A data frame containing the observed statistic (in a
#' [calculate()]-based workflow) or observed fit (in a
#' [`fit()`][fit.infer()]-based workflow). This object is likely the output
#' of [calculate()] or [`fit()`][fit.infer()] and need not
#' to have been passed to [generate()]. Set to `NULL` by
#' default. Must be provided if `type` is `"se"` or `"bias-corrected"`.
#'
#' @return A [tibble][tibble::tibble] containing the following columns:
#'
#' \itemize{
#' \item `term`: The explanatory variable (or intercept) in question. Only
#' supplied if the input had been previously passed to [`fit()`][fit.infer()].
#' \item `lower_ci`, `upper_ci`: The lower and upper bounds of the confidence
#' interval, respectively.
#' }
#'
#' @details
#' A null hypothesis is not required to compute a confidence interval. However,
#' including [hypothesize()] in a pipeline leading to `get_confidence_interval()`
#' will not break anything. This can be useful when computing a confidence
#' interval using the same distribution used to compute a p-value.
#'
#' Theoretical confidence intervals (i.e. calculated by supplying the output
#' of [assume()] to the `x` argument) require that the point estimate lies on
#' the scale of the data. The distribution defined in [assume()] will be
#' recentered and rescaled to align with the point estimate, as can be shown
#' in the output of [visualize()] when paired with [shade_confidence_interval()].
#' Confidence intervals are implemented for the following distributions and
#' point estimates:
#'
#' \itemize{
#' \item `distribution = "t"`: `point_estimate` should be the output of
#' [calculate()] with `stat = "mean"` or `stat = "diff in means"`
#' \item `distribution = "z"`: `point_estimate` should be the output of
#' [calculate()] with `stat = "prop"` or `stat = "diff in props"`
#' }
#'
#' @section Aliases:
#' `get_ci()` is an alias of `get_confidence_interval()`.
#' `conf_int()` is a deprecated alias of `get_confidence_interval()`.
#'
#' @examples
#'
#' boot_dist <- gss %>%
#' # We're interested in the number of hours worked per week
#' specify(response = hours) %>%
#' # Generate bootstrap samples
#' generate(reps = 1000, type = "bootstrap") %>%
#' # Calculate mean of each bootstrap sample
#' calculate(stat = "mean")
#'
#' boot_dist %>%
#' # Calculate the confidence interval around the point estimate
#' get_confidence_interval(
#' # At the 95% confidence level; percentile method
#' level = 0.95
#' )
#'
#' # for type = "se" or type = "bias-corrected" we need a point estimate
#' sample_mean <- gss %>%
#' specify(response = hours) %>%
#' calculate(stat = "mean")
#'
#' boot_dist %>%
#' get_confidence_interval(
#' point_estimate = sample_mean,
#' # At the 95% confidence level
#' level = 0.95,
#' # Using the standard error method
#' type = "se"
#' )
#'
#' # using a theoretical distribution -----------------------------------
#'
#' # define a sampling distribution
#' sampling_dist <- gss %>%
#' specify(response = hours) %>%
#' assume("t")
#'
#' # get the confidence interval---note that the
#' # point estimate is required here
#' get_confidence_interval(
#' sampling_dist,
#' level = .95,
#' point_estimate = sample_mean
#' )
#'
#' # using a model fitting workflow -----------------------
#'
#' # fit a linear model predicting number of hours worked per
#' # week using respondent age and degree status.
#' observed_fit <- gss %>%
#' specify(hours ~ age + college) %>%
#' fit()
#'
#' observed_fit
#'
#' # fit 100 models to resamples of the gss dataset, where the response
#' # `hours` is permuted in each. note that this code is the same as
#' # the above except for the addition of the `generate` step.
#' null_fits <- gss %>%
#' specify(hours ~ age + college) %>%
#' hypothesize(null = "independence") %>%
#' generate(reps = 100, type = "permute") %>%
#' fit()
#'
#' null_fits
#'
#' get_confidence_interval(
#' null_fits,
#' point_estimate = observed_fit,
#' level = .95
#' )
#'
#' # more in-depth explanation of how to use the infer package
#' \dontrun{
#' vignette("infer")
#' }
#'
#' @name get_confidence_interval
#' @family auxillary functions
#' @export
get_confidence_interval <- function(x, level = 0.95, type = NULL,
point_estimate = NULL) {
# Inform if no `level` was explicitly supplied
if (!("level" %in% rlang::call_args_names(match.call()))) {
cli_inform("Using `level = {level}` to compute confidence interval.")
}
if (is.null(type)) {
if (inherits(x, "infer_dist")) {
type <- "se"
} else {
type <- "percentile"
}
}
if (is_fitted(x)) {
# check that x and point estimate reference the same variables
check_mlr_x_and_obs_stat(
x,
point_estimate,
"get_confidence_interval",
"point_estimate"
)
# split up x and point estimate by term
term_data <- x %>%
dplyr::ungroup() %>%
dplyr::group_by(term) %>%
dplyr::group_split() %>%
purrr::map(copy_attrs, x)
term_estimates <- point_estimate %>%
dplyr::ungroup() %>%
dplyr::group_by(term) %>%
dplyr::group_split()
# check arguments for each term
purrr::map2_dfr(
term_data,
purrr::map(term_estimates, purrr::pluck, "estimate"),
check_ci_args,
level = level,
type = type
)
# map over switch_ci and then add the term column back in
purrr::map2_dfr(
term_data,
purrr::map(term_estimates, purrr::pluck, "estimate"),
switch_ci,
level = level,
type = type
) %>%
dplyr::mutate(
term = purrr::map_chr(term_estimates, purrr::pluck, "term"),
.before = dplyr::everything()
) %>%
copy_attrs(x)
} else {
check_ci_args(x, level, type, point_estimate)
switch_ci(type, x, level, point_estimate)
}
}
#' @rdname get_confidence_interval
#' @export
get_ci <- function(x, level = 0.95, type = NULL,
point_estimate = NULL) {
get_confidence_interval(
x,
level = level, type = type, point_estimate = point_estimate
)
}
switch_ci <- function(type, x, level, point_estimate) {
switch(
type,
percentile = ci_percentile(x, level),
se = ci_se(x, level, point_estimate),
`bias-corrected` = ci_bias_corrected(x, level, point_estimate)
)
}
remove_missing_estimates <- function(estimates) {
na_estimates <- is.na(estimates)
na_estimates_n <- sum(na_estimates)
if (na_estimates_n > 0) {
cli_warn("{na_estimates_n} estimates were missing and were removed when \\
calculating the confidence interval.")
}
estimates[!na_estimates]
}
ci_percentile <- function(x, level) {
# x[[ncol(x)]] pulls out the stat or estimate column
estimates <- remove_missing_estimates(x[[ncol(x)]])
ci_vec <- stats::quantile(estimates, probs = (1 + c(-level, level)) / 2)
make_ci_df(ci_vec)
}
ci_se <- function(x, level, point_estimate) {
point_estimate_ <- check_obs_stat(point_estimate)
args <- list()
if (inherits(x, "infer_dist")) {
se <- attr(point_estimate, "se")
qfn <- paste0("q", attr(x, "distribution"))
if (attr(x, "distribution") == "t") {
args <- list(df = attr(x, "df"))
}
} else {
# x[[ncol(x)]] pulls out the stat or estimate column
estimates <- remove_missing_estimates(x[[ncol(x)]])
se <- stats::sd(estimates)
qfn <- "qnorm"
}
args <- c(args, list(p = (1 + level) / 2))
multiplier <- do.call(qfn, args)
ci_vec <- point_estimate_ + c(-multiplier, multiplier) * se
res <- make_ci_df(ci_vec)
attr(res, "se") <- attr(point_estimate, "se")
attr(res, "point_estimate") <- point_estimate_
res
}
ci_bias_corrected <- function(x, level, point_estimate) {
point_estimate <- check_obs_stat(point_estimate)
# x[[ncol(x)]] pulls out the stat or estimate column
estimates <- remove_missing_estimates(x[[ncol(x)]])
p <- mean(estimates <= point_estimate)
z0 <- stats::qnorm(p)
# z_alpha_2 is z_(alpha/2)
z_alpha_2 <- stats::qnorm((1 + c(-level, level)) / 2)
new_probs <- stats::pnorm(2 * z0 + z_alpha_2)
ci_vec <- stats::quantile(estimates, probs = new_probs)
make_ci_df(ci_vec)
}
check_ci_args <- function(x, level, type, point_estimate, call = caller_env()) {
if (!is.null(point_estimate)) {
if (!is.data.frame(point_estimate)) {
check_type(point_estimate, is.numeric, call = call)
} else {
check_type(point_estimate, is.data.frame, call = call)
check_type(point_estimate[[1]][[1]], is.numeric, call = call)
}
}
check_is_distribution(x, "get_confidence_interval")
check_type(level, is.numeric, call = call)
if ((level <= 0) || (level >= 1)) {
cli_abort(
"The value of {.arg level} must be between 0 and 1, non-inclusive.",
call = call
)
}
if (inherits(x, "infer_dist") && !is.null(type) && type != "se") {
cli_abort(
'The only {.arg type} option for theory-based confidence intervals \\
is `type = "se"`.',
call = call
)
}
if (!(type %in% c("percentile", "se", "bias-corrected"))) {
cli_abort(
'The options for `type` are "percentile", "se", or "bias-corrected".',
call = call
)
}
if ((type %in% c("se", "bias-corrected")) && is.null(point_estimate)) {
cli_abort(
'A numeric value needs to be given for {.arg point_estimate} \\
for `type` "se" or "bias-corrected".',
call = call
)
}
if (inherits(x, "infer_dist")) {
# theoretical CIs require the full point estimate infer object as they
# contain the necessary standard error
if (!inherits(point_estimate, "infer")) {
cli_abort(
'For theoretical confidence intervals, the `point_estimate` argument \\
must be an `infer` object. Have you made sure to supply the output of \\
{.fun calculate} as the `point_estimate` argument?', call = call)
}
if (!attr(point_estimate, "stat") %in%
c("mean", "prop", "diff in means", "diff in props")) {
cli_abort(
'The only allowable statistics for theoretical confidence intervals \\
are "mean", "prop", "diff in means", and "diff in props". See \\
the "Details" section of \\
{.help [{.fun get_confidence_interval}](infer::get_confidence_interval)} \\
for more details.',
call = call
)
}
if ((attr(x, "distribution") == "t" &&
!attr(point_estimate, "stat") %in% c("mean", "diff in means")) ||
(attr(x, "distribution") == "norm" &&
!attr(point_estimate, "stat") %in% c("prop", "diff in props"))) {
cli_abort(
'Confidence intervals using a `{attr(x, "dist_")}` distribution for \\
`stat = {attr(point_estimate, "stat")}` are not implemented.',
call = call
)
}
}
}
make_ci_df <- function(ci_vec) {
tibble::tibble(lower_ci = ci_vec[[1]], upper_ci = ci_vec[[2]])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.