Nothing
#' Perform repeated sampling
#'
#' @description
#'
#' These functions extend the functionality of [dplyr::sample_n()] and
#' [dplyr::slice_sample()] by allowing for repeated sampling of data.
#' This operation is especially helpful while creating sampling
#' distributions—see the examples below!
#'
#' @param tbl,.data Data frame of population from which to sample.
#' @param size,n,prop `size` and `n` refer to the sample size of each sample.
#' The `size` argument to `rep_sample_n()` is required, while in
#' `rep_slice_sample()` sample size defaults to 1 if not specified. `prop`, an
#' argument to `rep_slice_sample()`, refers to the proportion of rows to sample
#' in each sample, and is rounded down in the case that `prop * nrow(.data)` is
#' not an integer. When using `rep_slice_sample()`, please only supply one of
#' `n` or `prop`.
#' @param replace Should samples be taken with replacement?
#' @param reps Number of samples to take.
#' @param prob,weight_by A vector of sampling weights for each of the rows in
#' `.data`—must have length equal to `nrow(.data)`. For `weight_by`, this
#' may also be an unquoted column name in `.data`.
#'
#' @details
#'
#' `rep_sample_n()` and `rep_slice_sample()` are designed to behave similar to
#' their dplyr counterparts. As such, they have at least the following
#' differences:
#' - In case `replace = FALSE` having `size` bigger than number of data rows in
#' `rep_sample_n()` will give an error. In `rep_slice_sample()` having such `n`
#' or `prop > 1` will give warning and output sample size will be set to number
#' of rows in data.
#'
#' Note that the [dplyr::sample_n()] function has been superseded by
#' [dplyr::slice_sample()].
#'
#' @return A tibble of size `reps * n` rows corresponding to `reps`
#' samples of size `n` from `.data`, grouped by `replicate`.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#' library(tibble)
#'
#' # take 1000 samples of size n = 50, without replacement
#' slices <- gss %>%
#' rep_slice_sample(n = 50, reps = 1000)
#'
#' slices
#'
#' # compute the proportion of respondents with a college
#' # degree in each replicate
#' p_hats <- slices %>%
#' group_by(replicate) %>%
#' summarize(prop_college = mean(college == "degree"))
#'
#' # plot sampling distribution
#' ggplot(p_hats, aes(x = prop_college)) +
#' geom_density() +
#' labs(
#' x = "p_hat", y = "Number of samples",
#' title = "Sampling distribution of p_hat"
#' )
#'
#' # sampling with probability weights. Note probabilities are automatically
#' # renormalized to sum to 1
#' df <- tibble(
#' id = 1:5,
#' letter = factor(c("a", "b", "c", "d", "e"))
#' )
#'
#' rep_slice_sample(df, n = 2, reps = 5, weight_by = c(.5, .4, .3, .2, .1))
#'
#' # alternatively, pass an unquoted column name in `.data` as `weight_by`
#' df <- df %>% mutate(wts = c(.5, .4, .3, .2, .1))
#'
#' rep_slice_sample(df, n = 2, reps = 5, weight_by = wts)
#' @export
rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) {
check_type(tbl, is.data.frame)
check_type(size, is_single_number, "single non-negative number", min_val = 0)
check_type(replace, is_truefalse, "TRUE or FALSE")
check_type(
reps,
is_single_number,
"single number not less than 1",
min_val = 1
)
check_type(
prob,
~ is.numeric(.) && (length(.) == nrow(tbl)),
glue::glue("numeric vector with length `nrow(tbl)` = {nrow(tbl)}"),
allow_null = TRUE
)
# In `dplyr::sample_n()` `size` can't be more than number of rows in data
notify_extra_size(size, tbl, replace, notify_type = "sample_n")
make_replicate_tbl(
tbl = tbl,
size = size,
replace = replace,
prob = prob,
reps = reps
)
}
#' @rdname rep_sample_n
#' @export
rep_slice_sample <- function(.data, n = NULL, prop = NULL, replace = FALSE,
weight_by = NULL, reps = 1) {
check_type(.data, is.data.frame)
check_type(
n,
is_single_number,
"single non-negative number",
allow_null = TRUE,
min_val = 0
)
check_type(
prop,
is_single_number,
"single non-negative number",
allow_null = TRUE,
min_val = 0
)
check_type(replace, is_truefalse, "TRUE or FALSE")
eval_weight_by <- try(rlang::eval_tidy(weight_by), silent = TRUE)
if (inherits(eval_weight_by, "try-error")) {
weight_by <- rlang::enquo(weight_by)
check_cols(.data, weight_by, "permute", FALSE, "weight_by")
weight_by <- .data[[rlang::as_name(weight_by)]]
}
check_type(
weight_by,
~ is.numeric(.) && (length(.) == nrow(.data)),
glue::glue("a numeric vector with length `nrow(.data)` = {nrow(.data)} \\
or an unquoted column name"),
allow_null = TRUE
)
check_type(
reps,
is_single_number,
"single number not less than 1",
min_val = 1
)
# Compute sample size based on `n` and `prop`
size <- make_slice_size(n = n, prop = prop, n_total = nrow(.data))
# In `dplyr::slice_sample()` asked sample size is allowed to be bigger than
# number of rows in data. In that case (at least currently) sample size is
# silently replaced to be number of rows. Here we give a warning.
notify_extra_size(size, .data, replace, notify_type = "slice_sample")
make_replicate_tbl(
tbl = .data,
size = size,
replace = replace,
prob = weight_by,
reps = reps
)
}
make_replicate_tbl <- function(tbl, size, replace, prob, reps) {
# Generate row indexes for every future replicate (this way it respects
# possibility of `replace = FALSE`)
n <- nrow(tbl)
if (!replace) {
idx_list <- replicate(
reps,
sample_int(n, size, replace = FALSE, prob = prob),
simplify = FALSE
)
} else {
idx_list <- sample_int(n, size * reps, replace = TRUE, prob = prob)
idx_list <- vctrs::vec_chop(idx_list, sizes = rep(size, reps))
}
# Get actual sample size which can differ from `size` (currently if it is
# bigger than number of rows in `tbl` inside `rep_slice_sample()`)
sample_size <- length(idx_list[[1]])
i <- unlist(idx_list)
res <- vctrs::vec_slice(tbl, i)
res <-
dplyr::bind_cols(
tibble::new_tibble(list(replicate = rep(seq_len(reps), each = sample_size))),
res
)
res <- group_by_replicate(res, reps = reps, n = sample_size)
copy_attrs(res, tbl)
}
notify_extra_size <- function(size, tbl, replace, notify_type, call = caller_env()) {
if (!replace && (size > nrow(tbl))) {
msg <- glue::glue(
"Asked sample size ({size}) is bigger than ",
"number of rows in data ({nrow(tbl)}) while `replace` is FALSE"
)
switch(
notify_type,
sample_n = cli_abort("{msg}. Use `replace = TRUE`.", call = call),
slice_sample = cli_warn("{msg}. Using number of rows as sample size.")
)
}
TRUE
}
# Modified code from https://github.com/tidyverse/dplyr/blob/master/R/slice.R
# (at commit 0f29aa4)
sample_int <- function(n, size, replace = FALSE, prob = NULL) {
if (!replace) {
# If `replace` is `FALSE`, allow `size` to be bigger than `n` by silently
# replacing it with `n`
size <- min(size, n)
}
if (size == 0L) {
integer(0)
} else {
sample.int(n, size, prob = prob, replace = replace)
}
}
make_slice_size <- function(n, prop, n_total, call = caller_env()) {
if (is.null(n)) {
if (is.null(prop)) {
# By default return size 1
1L
} else {
as.integer(n_total * prop)
}
} else {
if (is.null(prop)) {
n
} else {
cli_abort(
"Please supply exactly one of the `n` or `prop` arguments.",
call = call
)
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.