#' Generate non-random cross-validated test-training pairs
#'
#' @param data A data frame
#' @param n Number of test-training pairs to generate (an integer).
#' Either single number of cluster groups, or vector with number of columns,
#' rows of sampling grid.
#' @param cluster Name of the clustering variable.
#' @param response Name of the response variable.
#' @param id Name of variable that gives each model a unique integer id.
#'
#' @return A data frame with n / prod(n) rows and columns test and train.
#' test and train are list-columns containing \code{\link{resample}} objects.
#' @export
crossv_cluster <- function(data, n = 5, cluster, id = ".id"){
if (!is.numeric(n) || length(n) != 1) {
stop("`n` must be a single integer.", call. = FALSE)
}
# cluster-levels in random order
levels <- data %>% dplyr::pull(cluster) %>% unique() %>% sample()
idx <- seq_len(nrow(data))
# split into evenly-sized chunks
groups <- split(levels, factor(sort(rank(levels)%%n)))
fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})
fold <- function(test) {
list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
}
cols <- purrr::transpose(purrr::map(fold_idx, fold))
cols[[id]] <- as.character(seq_len(n))
tibble::as_tibble(cols)
}
#' @rdname crossv_cluster
crossv_loo <- function(data, cluster, response = "occ", id = "holdout", buffer = NULL){
# cluster-levels in sorted order
levels <- data %>% dplyr::pull(cluster) %>% unique() %>% sort()
idx <- seq_len(nrow(data))
# split
groups <- as.list(levels)
fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})
if(is.null(buffer)) {
fold <- function(test) {
list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
}
} else {
fold <- function(test) {
level <- data %>% dplyr::slice(test) %>% dplyr::pull(cluster) %>% unique()
filter_levels_pos <- c(match(level, levels) -seq_len(buffer), seq_len(buffer) + match(level, levels))
filter_levels_pos <- filter_levels_pos[filter_levels_pos > 0]
filter_levels <- levels[filter_levels_pos]
filter_pos <- which(data %>% dplyr::pull(cluster) %in% filter_levels)
list(train = modelr::resample(data, setdiff(idx, c(test, filter_pos))), test = modelr::resample(data, test))
}
}
cols <- purrr::transpose(purrr::map(fold_idx, fold))
cols[[id]] <- as.character(levels)
out <- tibble::as_tibble(cols)
}
#' @rdname crossv_cluster
crossv_grid <- function(data, n = c(3,3), response = "occ", id = ".id"){
grid <- sf::st_make_grid(data, n=n)
fold_idx <- sf::st_intersects(grid, data)
# remove grid cells that don't contain all response levels
contains_both <- purrr::map_lgl(fold_idx, function(x){
data %>% dplyr::slice(x) %>% dplyr::pull(response) %>% unique() %>% length() ==
data %>% dplyr::slice(x) %>% dplyr::pull(response) %>% nlevels()
})
idx <- seq_len(nrow(data))
if(length(which(contains_both)) < prod(n)) {
message("Removing some of the grid cells because they do not contain all response levels.")
fold_idx <- fold_idx[contains_both]
}
data <- st_set_geometry(data, NULL)
fold <- function(test) {
list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
}
cols <- purrr::transpose(purrr::map(fold_idx, fold))
cols[[id]] <- as.character(seq_along(fold_idx))
tibble::as_tibble(cols)
}
#' @rdname crossv_cluster
crossv_predstrat <- function(data, n = 5, id = ".id", response = "occ", id_col = "id"){
if (!is.numeric(n) || length(n) != 1) {
stop("`n` must be a single integer.", call. = FALSE)
}
factor_cols <- data %>%
dplyr::select_if(is.factor) %>%
dplyr::select(-one_of(response, id_col)) %>%
as.list()
factor_names <- names(factor_cols)
if(purrr::is_empty(factor_cols)){
return(modelr::crossv_kfold(data, k=n, id = id))
}
# remove rows with levels fewer than n observations
rare_levels <- purrr::map(factor_cols, function(x) {
table(x)[table(x)< n] %>% names()
})
remove_rows <- purrr::map2(factor_cols, rare_levels, function(x,y){
which(x %in% y)
}) %>% unique() %>% unlist
if (!purrr::is_empty(remove_rows)) {
data <- data %>%
dplyr::slice(-remove_rows) %>%
dplyr::mutate_at(vars(one_of(factor_names)), droplevels)
}
# set aside 1 obs of every factor level for each resample
save <- purrr::map(factor_names, function(x){
s <- data %>%
mutate(rownumber = 1:nrow(.)) %>%
group_by_at(.vars = x) %>%
sample_n(n) %>%
mutate(resample = 1:n) %>%
ungroup()
})
aside <- do.call(rbind, save) %>%
dplyr::distinct() %>%
split(.$resample) %>%
purrr::map(pull, rownumber)
# create folds for rest of data
rest <- data %>%
dplyr::slice(-unlist(aside))
n_rest <- nrow(rest)
folds <- sample(rep(1:n, length.out = n_rest))
idx <- seq_len(n_rest)
fold_idx <- split(idx, folds)
# combine
fold_idx <- map2(aside, fold_idx, c)
fold <- function(test) {
list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
}
cols <- purrr::transpose(purrr::map(fold_idx, fold))
cols[[id]] <- as.character(seq_len(n))
tibble::as_tibble(cols)
}
#' @rdname crossv_cluster
crossv_temporal_blocking <- function(data, cluster, blocksize, time = "year", id = ".id") {
mean_time <- data %>%
group_split(!!!rlang::syms(cluster)) %>%
map(function(x) {tibble(id = x$id[[1]], mean_time = mean(x %>% pull(time)))})
mean_time <- do.call(bind_rows, mean_time) %>% arrange(mean_time)
sorted <- data %>%
right_join(mean_time, by = "id")
levels <- sorted %>% pull(cluster) %>% unique()
groups <- split(levels, ceiling(seq_along(levels)/blocksize))
fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})
combinations <- combn(fold_idx, 2, simplify = FALSE)
train_idx <- purrr::map(combinations, 1)
test_idx <- purrr::map(combinations, 2)
fold <- function(train, test) {
list(train = modelr::resample(data, train), test = modelr::resample(data, test))
}
cols <- purrr::transpose(purrr::map2(train_idx, test_idx, fold))
cols[[id]] <- as.character(seq_along(combinations))
tibble::as_tibble(cols)
}
#' @rdname crossv_cluster
crossv_year_window<- function(data, windowsize) {
years <- data %>% pull(year) %>% unique()
if(windowsize > 0 ){
years_used <- years %>% head(-windowsize) %>% tail(-windowsize)
} else {
years_used <- years
}
map_dfr(years_used, function(middle_year){
year_sequence <- seq(middle_year-windowsize, middle_year+windowsize)
test_years <- setdiff(years, year_sequence)
train_idx <- which(data$year %in% year_sequence)
map_dfr(test_years, function(x){
test_idx <- which(data$year == x)
tibble(train = list(modelr::resample(data, train_idx)),
test = list(modelr::resample(data, test_idx)),
middle_year = middle_year,
test_year = x,
windowsize = windowsize,
training_size = length(train_idx),
testing_size = length(test_idx)) %>%
mutate(tdiff = min(abs(year_sequence-x)))
})
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.