# Utility functions for cross-validation methods
#' @import dplyr
#' @import purrr
set_lm_par <- function(object, data){
x <- object[["x"]]; y <- object[["y"]]; data <- object[["model"]];
contrasts.arg <- object$contrasts; family_name <- object$family$family
if(is.null(x) && !is.null(data)){
x <- model.matrix(object$terms, data, contrasts.arg = contrasts.arg)
}
if(is.null(y) && !is.null(data)){
y <- purrr::as_vector(data[all.vars(object$terms)][1])
}
if(is.null(x) || is.null(y)){
stop(paste(
"Model data not found in model object.",
"Use `data` argument to supply data frame to which model was fit"))
}
names(y) <- rownames(x)
w <- object$weights
if(is.null(w)) w <- rep(1, nrow(x))
offset <- object$offset
if(is.null(offset)) offset <- rep(0, nrow(x))
list(x = x, y = y, w = w, offset = offset)
}
set_glm_par <- function(object, data){
x <- object[["x"]]; y <- object[["y"]]; data <- object[["model"]];
contrasts.arg <- object$contrasts; family_name <- object$family$family
fitter <- "glm.fit"
if(grepl("Negative Binomial", family_name)){
family_name <- "negbin"
fitter <- "glm_nb"
}
if(is.null(x) && !is.null(data)){
x <- model.matrix(object$terms, data, contrasts.arg = contrasts.arg)
}
if(is.null(y) && !is.null(data)){
y <- purrr::as_vector(data[all.vars(object$terms)][1])
}
if(is.null(x) || is.null(y)){
stop(paste(
"Model data not found in model object.",
"Use `data` argument to supply data frame to which model was fit"))
}
if(family_name == "binomial" && !is.factor(y)) y <- factor(y)
names(y) <- rownames(x)
weights <- object$prior.weights
if(is.null(weights)) weights <- rep(1, nrow(x))
offset <- object$offset
if(is.null(offset)) offset <- rep(0, nrow(x))
other_args <- list(family = object$family,
control = object$control,
intercept = hasName(object$coefficients, "(Intercept)"))
list(x = x, y = y, weights = weights, offset = offset,
other_args = other_args, family_name = family_name, fitter = fitter)
}
set_zi_par <- function(object){
x <- model.matrix(object, "count")
z <- model.matrix(object, "zero")
data <- object$model
y <- if(!is.null(object$y)){
object$y
} else if(!is.null(data)){
model.response(data)
} else {
stop(
paste(
"Model data not found in model object.",
"Refit `zeroinfl` and set either `model` or `y` argument to `TRUE`.")
)
}
names(y) <- rownames(x)
weights <- object$weights
offset <- object$offset
control <- object$control
control$start <- object$start
list(x = x, y = y, z = z, weights = weights, offset = offset,
dist = object$dist, link = object$link, control = control)
}
set_cv_par <- function(n_obs, n_folds, n_reps, silent = FALSE,
nest_cv = FALSE){
if(n_folds > n_obs) n_folds <- n_obs
if(n_folds < n_obs){
fold_size <- n_obs / n_folds
if(nest_cv) fold_size <- fold_size * (n_folds - 1) / n_folds
if(fold_size < 2){
if(!silent){
warning(
paste(
"Your sample size is too small to perform ", n_folds,
"-fold cross-validation.\n",
" Performing leave-one-out cross-validation instead.", sep = ""
), immediate. = TRUE
)
}
n_folds <- n_obs
}
}
if(n_folds == n_obs && n_reps > 1){
if(!silent){
message(
paste(
"\nNOTE: Repetitions of leave-one-out cross-validation are pointless",
" and will not be performed.", sep = ""
)
)
}
n_reps <- 1
}
list(n_folds = n_folds, n_reps = n_reps)
}
get_cv_stats <- function(y, y_hat, family, n_folds, n_reps, theta = NULL){
fold_stats <- map(
y_hat, ~ predict_metrics_(
y = if(is.list(.x)) y[names(.x$mu)] else y[rownames(.x)],
y_hat = if(is.list(.x)) .x$y_hat else .x,
family = family,
mu = if(is.list(.x)) .x$mu else NULL,
phi = if(is.list(.x)) .x$phi else NULL,
theta = theta
)
) %>% transpose %>% simplify_all
btwn_fold_error <- map(
fold_stats, function(x){
na_adjustment <- sum(is.na(x)) / n_reps
sd(x, na.rm = TRUE) / sqrt(n_folds - na_adjustment)
}
)
repeats <- paste("Rep", 1:n_reps, "$", sep = "")
hold_out_pred <- map(repeats, ~ y_hat[grepl(.x, names(y_hat))])
if(is.list(y_hat[[1]])){
hold_out_mu <- hold_out_pred %>% map(~ map(.x, "mu")) %>%
map(~ reduce(. , c)) %>% map(~.x[names(y)])
hold_out_phi <- hold_out_pred %>% map(~ map(.x, "phi")) %>%
map(~ reduce(. , c)) %>% map(~.x[names(y)])
hold_out_yhat <- hold_out_pred %>% map(~ map(.x, "y_hat")) %>%
map(~ reduce(. , c)) %>% map(~.x[names(y)])
names(hold_out_mu) <- names(hold_out_phi) <- names(hold_out_yhat) <-
paste("Rep", 1:n_reps, sep = "")
hold_out_mu <- as_tibble(hold_out_mu)
hold_out_phi <- as_tibble(hold_out_phi)
hold_out_yhat <- as_tibble(hold_out_yhat)
hold_out_pred <- list(
mu = hold_out_mu,
phi = hold_out_phi,
yhat = hold_out_yhat
)
rep_stats <- map(
1:n_reps,
~ predict_metrics_(
y = y, y_hat = hold_out_yhat[[.x]], family = family,
mu = hold_out_mu[[.x]], phi = hold_out_phi[[.x]], theta = theta
)
) %>% transpose %>% simplify_all %>% as_tibble
} else {
hold_out_pred <- map(hold_out_pred, ~ reduce(. , rbind)) %>%
map(~.x[names(y),])
names(hold_out_pred) <- paste("Rep", 1:n_reps, sep = "")
hold_out_pred <- as_tibble(hold_out_pred)
rep_stats <- map(
hold_out_pred, ~ predict_metrics_(
y = y, y_hat = .x, family = family, theta = theta
)
) %>% transpose %>% simplify_all %>% as_tibble
}
btwn_rep_range <- if(n_reps > 1){
map(rep_stats, ~ range(.x, na.rm = TRUE))
} else map(rep_stats, ~c(NA, NA))
cv_means <- map(rep_stats, ~mean(.x, na.rm = TRUE))
cv_stats <- list(mean = cv_means,
btwn_fold_se = btwn_fold_error,
btwn_rep_range = btwn_rep_range) %>% transpose()
list(stats = cv_stats, predictions = hold_out_pred)
}
get_fold_ids <- function(fold_ids, n_reps){
repeats <- paste("Rep", 1:n_reps, "$", sep = "")
holdout_ids <- map(repeats, ~ fold_ids[grepl(.x, names(fold_ids))])
holdout_fold <- map(
holdout_ids, ~ reduce(
map2(.x, seq_along(.x), ~rep(.y, length(.x))), c)) %>%
map2(holdout_ids, ~ .x[order(reduce(.y, c))])
names(holdout_fold) <- paste("Rep", 1:n_reps, sep = "")
as_tibble(holdout_fold)
}
create_folds <- function(y, n_folds, n_reps, seed = 42){
if(!is.factor(y) && length(unique(y)) <= 5) y <- factor(y)
set.seed(seed)
folds <- purrr::map(1:n_reps, ~ stratify_folds(y, n_folds))
fold_ids <- expand.grid(Fold = 1:n_folds, Rep = 1:n_reps)
fold_names <- paste("Fold", fold_ids$Fold, ".Rep", fold_ids$Rep, sep = "")
fold_ids <- if(length(y) == n_folds){
as.list(1:length(y))
} else {
purrr::map2(fold_ids$Fold, fold_ids$Rep, ~ which(folds[[.y]] == .x))
}
names(fold_ids) <- fold_names
fold_ids
}
assign_folds <- function(n, n_folds = 10){
fold_ids_1 <- rep(1:n_folds, n %/% n_folds)
fold_ids_1 <- sample(fold_ids_1)
modulus <- n - length(fold_ids_1)
if(modulus > 0){
fold_ids_2 <- sample.int(n_folds, size = modulus)
c(fold_ids_1, fold_ids_2)
} else fold_ids_1
}
stratify_folds <- function(y, n_folds = 10){
folds <- vector("integer", length(y))
if(is.numeric(y)){
if(min(y) == 0 && sum(y == 0) >= n_folds){
# make separate fold assignments for zero vs. non-zero values to insure
# both 0 and count process is represented in every fold
folds[y == 0] <- assign_folds(sum(y == 0), n_folds)
folds[y != 0] <- stratify_folds(y[y != 0], n_folds)
return(folds)
} else {
cuts <- length(y) %/% n_folds
if(cuts < 2) cuts <- 2
if(cuts > 5) cuts <- 5
breaks <- unique(quantile(y, probs = seq(0, 1, length = cuts)))
y <- as.integer(cut(y, breaks, include.lowest = TRUE))
}
}
# Verify that each strata can appear in every fold
if(all(table(y) >= n_folds)){
purrr::walk(unique(y), function(x){
folds[y == x] <<- assign_folds(sum(y == x), n_folds)
})
# Otherwise abort stratification and use simple randomization
} else {
folds <- assign_folds(length(y), n_folds)
}
folds
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.