#' Permutation sampling
#'
#' @description
#' A permutation sample is the same size as the original data set and is made
#' by permuting/shuffling one or more columns. This results in analysis
#' samples where some columns are in their original order and some columns
#' are permuted to a random order. Unlike other sampling functions in
#' rsample, there is no assessment set and calling [assessment()] on a
#' permutation split will throw an error.
#'
#' @param data A data frame.
#' @param permute One or more columns to shuffle. This argument supports
#' tidyselect selectors. Multiple expressions can be combined with `c()`.
#' Variable names can be used as if they were positions in the data frame, so
#' expressions like `x:y` can be used to select a range of variables.
#' See \code{\link[tidyselect]{language}} for more details.
#' @param times The number of permutation samples.
#' @param apparent A logical. Should an extra resample be added where the
#' analysis is the standard data set.
#' @inheritParams rlang::args_dots_empty
#'
#' @details The argument `apparent` enables the option of an additional
#' "resample" where the analysis data set is the same as the original data
#' set. Permutation-based resampling can be especially helpful for computing
#' a statistic under the null hypothesis (e.g. t-statistic). This forms the
#' basis of a permutation test, which computes a test statistic under all
#' possible permutations of the data.
#'
#' @return A `tibble` with classes `permutations`, `rset`, `tbl_df`, `tbl`, and
#' `data.frame`. The results include a column for the data split objects and a
#' column called `id` that has a character string with the resample
#' identifier.
#'
#' @examples
#' permutations(mtcars, mpg, times = 2)
#' permutations(mtcars, mpg, times = 2, apparent = TRUE)
#'
#' library(purrr)
#' resample1 <- permutations(mtcars, starts_with("c"), times = 1)
#' resample1$splits[[1]] %>% analysis()
#'
#' resample2 <- permutations(mtcars, hp, times = 10, apparent = TRUE)
#' map_dbl(resample2$splits, function(x) {
#' t.test(hp ~ vs, data = analysis(x))$statistic
#' })
#' @export
permutations <- function(data,
permute = NULL,
times = 25,
apparent = FALSE,
...) {
check_dots_empty()
permute <- rlang::enquo(permute)
if (is.null(permute)) {
cli_abort("You must specify at least one column to permute.")
}
col_id <- tidyselect::eval_select(permute, data)
if (identical(length(col_id), 0L)) {
cli_abort("You must specify at least one column to permute.")
} else if (identical(length(col_id), ncol(data))) {
cli_abort(c(
"You have selected all columns to permute.",
"i" = "This effectively reorders the rows in the original data without
changing the data structure.",
">" = "Please select fewer columns to permute."
))
}
split_objs <- perm_splits(data, times)
if (apparent) {
split_objs <- dplyr::bind_rows(split_objs, apparent(data))
}
split_objs$splits <-
purrr::map(split_objs$splits, function(x) {
x$col_id <- col_id
x
})
perm_att <- list(
times = times,
apparent = apparent,
col_id = col_id,
permute = names(col_id)
)
new_rset(
splits = split_objs$splits,
ids = split_objs$id,
attrib = perm_att,
subclass = c("permutations", "rset")
)
}
perm_complement <- function(ind, n) {
list(analysis = ind, assessment = NA)
}
perm_splits <- function(data, times = 25) {
n <- nrow(data)
indices <- purrr::map(rep(n, times), sample, replace = FALSE)
indices <- lapply(indices, perm_complement, n = n)
split_objs <-
purrr::map(indices, make_splits, data = data, class = "perm_split")
list(
splits = split_objs,
id = names0(length(split_objs), "Permutations")
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.