Nothing
#' Define a theoretical distribution
#'
#' @description
#'
#' This function allows the user to define a null distribution based on
#' theoretical methods. In many infer pipelines, `assume()` can be
#' used in place of [generate()] and [calculate()] to create a null
#' distribution. Rather than outputting a data frame containing a
#' distribution of test statistics calculated from resamples of the observed
#' data, `assume()` outputs a more abstract type of object just containing
#' the distributional details supplied in the `distribution` and `df` arguments.
#' However, `assume()` output can be passed to [visualize()], [get_p_value()],
#' and [get_confidence_interval()] in the same way that simulation-based
#' distributions can.
#'
#' To define a theoretical null distribution (for use in hypothesis testing),
#' be sure to provide a null hypothesis via [hypothesize()]. To define a
#' theoretical sampling distribution (for use in confidence intervals),
#' provide the output of [specify()]. Sampling distributions (only
#' implemented for `t` and `z`) lie on the scale of the data, and will be
#' recentered and rescaled to match the corresponding `stat` given in
#' [calculate()] to calculate the observed statistic.
#'
#' @param x The output of [specify()] or [hypothesize()], giving the
#' observed data, variable(s) of interest, and (optionally) null hypothesis.
#' @param distribution The distribution in question, as a string. One of
#' `"F"`, `"Chisq"`, `"t"`, or `"z"`.
#' @param df Optional. The degrees of freedom parameter(s) for the `distribution`
#' supplied, as a numeric vector. For `distribution = "F"`, this should have
#' length two (e.g. `c(10, 3)`). For `distribution = "Chisq"` or
#' `distribution = "t"`, this should have length one. For
#' `distribution = "z"`, this argument is not required. The package
#' will supply a message if the supplied `df` argument is different from
#' recognized values. See the Details section below for more information.
#' @param ... Currently ignored.
#'
#' @return An infer theoretical distribution that can be passed to helpers
#' like [visualize()], [get_p_value()], and [get_confidence_interval()].
#'
#' @details
#'
#' Note that the assumption being expressed here, for use in theory-based
#' inference, only extends to _distributional_ assumptions: the null
#' distribution in question and its parameters. Statistical inference with
#' infer, whether carried out via simulation (i.e. based on pipelines
#' using [generate()] and [calculate()]) or theory (i.e. with `assume()`),
#' always involves the condition that observations are independent of
#' each other.
#'
#' `infer` only supports theoretical tests on one or two means via the
#' `t` distribution and one or two proportions via the `z`.
#'
#' For tests comparing two means, if `n1` is the group size for one level of
#' the explanatory variable, and `n2` is that for the other level, `infer`
#' will recognize the following degrees of freedom (`df`) arguments:
#'
#' * `min(n1 - 1, n2 - 1)`
#' * `n1 + n2 - 2`
#' * The `"parameter"` entry of the analogous `stats::t.test()` call
#' * The `"parameter"` entry of the analogous `stats::t.test()` call with `var.equal = TRUE`
#'
#' By default, the package will use the `"parameter"` entry of the analogous
#' `stats::t.test()` call with `var.equal = FALSE` (the default).
#'
#' @examples
#' # construct theoretical distributions ---------------------------------
#'
#' # F distribution
#' # with the `partyid` explanatory variable
#' gss %>%
#' specify(age ~ partyid) %>%
#' assume(distribution = "F")
#'
#' # Chi-squared goodness of fit distribution
#' # on the `finrela` variable
#' gss %>%
#' specify(response = finrela) %>%
#' hypothesize(null = "point",
#' p = c("far below average" = 1/6,
#' "below average" = 1/6,
#' "average" = 1/6,
#' "above average" = 1/6,
#' "far above average" = 1/6,
#' "DK" = 1/6)) %>%
#' assume("Chisq")
#'
#' # Chi-squared test of independence
#' # on the `finrela` and `sex` variables
#' gss %>%
#' specify(formula = finrela ~ sex) %>%
#' assume(distribution = "Chisq")
#'
#' # T distribution
#' gss %>%
#' specify(age ~ college) %>%
#' assume("t")
#'
#' # Z distribution
#' gss %>%
#' specify(response = sex, success = "female") %>%
#' assume("z")
#'
#' \dontrun{
#' # each of these distributions can be passed to infer helper
#' # functions alongside observed statistics!
#'
#' # for example, a 1-sample t-test -------------------------------------
#'
#' # calculate the observed statistic
#' obs_stat <- gss %>%
#' specify(response = hours) %>%
#' hypothesize(null = "point", mu = 40) %>%
#' calculate(stat = "t")
#'
#' # construct a null distribution
#' null_dist <- gss %>%
#' specify(response = hours) %>%
#' assume("t")
#'
#' # juxtapose them visually
#' visualize(null_dist) +
#' shade_p_value(obs_stat, direction = "both")
#'
#' # calculate a p-value
#' get_p_value(null_dist, obs_stat, direction = "both")
#'
#' # or, an F test ------------------------------------------------------
#'
#' # calculate the observed statistic
#' obs_stat <- gss %>%
#' specify(age ~ partyid) %>%
#' hypothesize(null = "independence") %>%
#' calculate(stat = "F")
#'
#' # construct a null distribution
#' null_dist <- gss %>%
#' specify(age ~ partyid) %>%
#' assume(distribution = "F")
#'
#' # juxtapose them visually
#' visualize(null_dist) +
#' shade_p_value(obs_stat, direction = "both")
#'
#' # calculate a p-value
#' get_p_value(null_dist, obs_stat, direction = "both")
#' }
#'
#' @export
assume <- function(x, distribution, df = NULL, ...) {
if (!inherits(x, "infer")) {
abort(paste0(
"The `x` argument must be the output of a core infer function, ",
"likely `specify()` or `hypothesize()`."
))
}
# check that `distribution` aligns with what is expected from
# `x` and that `distribution` and `df` are consistent with each other
df <- check_distribution(x, distribution, df, ...)
structure(
glue(
"{distribution_desc(distribution)} distribution{df_desc(df)}.",
.null = "NULL"
),
# store distribution as the suffix to p* in dist function
distribution = dist_fn(distribution),
dist_ = distribution,
# store df for easier passing to p* functions
df = df,
# store df in `specify`-esque format for use in `visualize`
distr_param = if (length(df) > 0) {df[1]} else {NULL},
distr_param2 = if (length(df) == 2) {df[2]} else {NULL},
# bring along x attributes
theory_type = attr(x, "theory_type"),
params = attr(x, "params"),
hypothesized = attr(x, "hypothesized"),
# bring along dots
dots = list(...),
# append class
class = "infer_dist"
)
}
# check that the distribution is well-specified
check_distribution <- function(x, distribution, df, ..., call = caller_env()) {
dist <- tolower(distribution)
if (!dist %in% c("f", "chisq", "t", "z")) {
abort(
'The distribution argument must be one of "Chisq", "F", "t", or "z".',
call = call
)
}
if ((dist == "f" && attr(x, "theory_type") != "ANOVA") ||
(dist == "chisq" && !attr(x, "theory_type") %in% c("Chi-square test of indep",
"Chi-square Goodness of Fit")) ||
(dist == "t" && !attr(x, "theory_type") %in% c("One sample t",
"Two sample t")) ||
(dist == "z" && !attr(x, "theory_type") %in% c("One sample prop z",
"Two sample props z"))) {
if (has_explanatory(x)) {
msg_tail <- glue(
"a {get_stat_type_desc(attr(x, 'type_desc_explanatory'))} ",
"explanatory variable ({explanatory_name(x)}).",
.null = "NULL"
)
} else {
msg_tail <- "no explanatory variable."
}
abort(glue(
'The supplied distribution "{distribution}" is not well-defined for a ',
"{get_stat_type_desc(attr(x, 'type_desc_response'))} response ",
"variable ({response_name(x)}) and ", msg_tail
), call = call)
}
if (!is.numeric(df) && !is.null(df)) {
abort(glue(
"`assume()` expects the `df` argument to be a numeric vector, ",
"but you supplied a {list(class(df))} object."
), call = call)
}
if (length(list(...)) != 0) {
plural <- length(list(...)) != 1
dots <- list(...)
abort(glue(
"`assume()` ignores the dots `...` argument, though the ",
"argument{if (plural) 's' else ''} `{list(dots)}` ",
"{if (plural) 'were' else 'was'} supplied. Did you forget to ",
"concatenate the `df` argument with `c()`?"
), call = call)
}
if (dist_df_length(distribution) != length(df) && !is.null(df)) {
plural <- length(df) != 1
abort(glue(
'{distribution_desc(distribution)} distribution requires ',
'{dist_df_length(distribution)} degrees of freedom argument',
'{if (!plural) "s" else ""}, but {length(df)} ',
'{if (plural) "were" else "was"} supplied.'
), call = call)
}
df <- determine_df(x, dist, df)
return(df)
}
# convert the distribution argument to its r distribution function suffix
dist_fn <- function(distribution) {
switch(
tolower(distribution),
`f` = "f",
`chisq` = "chisq",
`t` = "t",
`z` = "norm"
)
}
# return expected degrees of freedom length
dist_df_length <- function(distribution) {
switch(
tolower(distribution),
`f` = 2,
`chisq` = , `t` = 1,
`z` = 0
)
}
# describe the distribution
distribution_desc <- function(distribution) {
switch(
tolower(distribution),
`f` = "An F",
`chisq` = "A Chi-squared",
`t` = "A T",
`z` = "A Z"
)
}
# describe the degrees of freedom
df_desc <- function(df) {
if (is.null(df)) {
""
} else {
plural <- length(df) != 1
paste0(
' with ',
if (plural) {paste0(round(df), collapse = " and ")} else {round(df)},
' degree',
if (!plural && df == 1) {''} else {'s'},
' of freedom')
}
}
# process df for passing to p* functions
process_df <- function(df) {
switch(as.character(length(df)),
"0" = list(),
"1" = list(df = df),
"2" = list(df1 = df[1], df2 = df[2])
)
}
# generate an automatic "df" value using logic from
# hypothesize and, if it doesn't match the
# supplied one, raise a message
determine_df <- function(x, dist, df) {
if (!is.null(df) && !all(round(df) %in% round(acceptable_dfs(x)))) {
inform(glue(
"Message: The supplied `df` argument does not match its ",
"expected value. If this is unexpected, ensure that your calculation ",
"for `df` is correct (see `?assume` for recognized values) or ",
"supply `df = NULL` to `assume()`."
))
return(df)
}
if (is.null(df)) {
df <- acceptable_dfs(x)
}
if (attr(x, "theory_type") == "Two sample t") {
df <- df[1]
}
df
}
# return a vector of dfs recognized by `assume`
acceptable_dfs <- function(x) {
if (attr(x, "theory_type") == "Two sample t") {
c(
# t.test param with var.equal = FALSE
unname(
unlist(
attr(x, "distr_param") <-
stats::t.test(response_variable(x) ~
explanatory_variable(x))[["parameter"]]
)
),
# t.test param with var.equal = TRUE
unname(
unlist(
attr(x, "distr_param") <-
stats::t.test(response_variable(x) ~
explanatory_variable(x),
var.equal = TRUE)[["parameter"]]
)
),
# min(n1 - 1, n2 - 1)
x %>%
dplyr::count(!!explanatory_expr(x)) %>%
dplyr::pull(n) %>%
min() %>%
`-`(1),
# n1 + n2 - 2
x %>%
dplyr::count(!!explanatory_expr(x)) %>%
dplyr::pull(n) %>%
sum() %>%
`-`(2)
)
} else {
c(
unname(unlist(attr(x, "distr_param"))),
unname(unlist(attr(x, "distr_param2")))
)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.