# --- Global helpers and shared data
# Generate a random regression data set: a data.frame whose first column is the
# response `y` and whose remaining p columns are uniform predictors, all drawn
# from U(-scale, scale). Predictors are drawn first, then the response, so the
# RNG stream is consumed in a fixed, reproducible order.
gen.data <- function(n, p, scale = 10) {
  draws <- runif(n * p, -scale, scale)
  X <- matrix(draws, nrow = n, ncol = p)
  response <- runif(n, -scale, scale)
  out <- data.frame(y = response)
  cbind(out, X)
}
# Evaluate `expr`, silencing only the warnings the CV routines are expected to
# emit (fold-count adjustments from boot::cv.glm / cvLM and rank-deficiency
# notes from predict). Any other warning propagates unchanged, and the value
# of `expr` is returned as-is.
muffle <- function(expr) {
  # Expected warning patterns from boot::cv.glm and cvLM
  expected <- "K has been changed|'K' has been set to|prediction from rank-deficient fit"
  handler <- function(w) {
    if (grepl(expected, conditionMessage(w))) {
      invokeRestart("muffleWarning")
    }
  }
  withCallingHandlers(expr, warning = handler)
}
# Seed once so every fixture below is reproducible across test runs
set.seed(1)
# Typical narrow regression data (using a prime n to ensure the CV handles uneven folds)
df.narrow <- gen.data(101, 11)
# Ill-conditioned data: rescale two predictor columns by 1e-3 / 1e3 to inflate the
# condition number of X'X (column 1 is the response y, so columns 2-3 are predictors)
df.ill <- gen.data(83, 4)
df.ill[, 2] <- df.ill[, 2] * 1e-3
df.ill[, 3] <- df.ill[, 3] * 1e3
# Rank-deficient (but still overdetermined): append a predictor that is an exact
# linear combination of two existing predictor columns
df.rd <- gen.data(113, 4)
df.rd <- cbind(df.rd, `5` = df.rd[, 2] + df.rd[, 3])
# Wide data - here the results should differ for OLS w/ boot::cv.glm
df.wide <- gen.data(63, 117)
# Test parameters shared across all test_that blocks below
seed <- 73568569
K.vals <- c(2, 3, 4, 5, 7, 8, 10, NA) # NA corresponds to LOOCV (K is replaced by n)
lambdas <- c(0.6148868, 48.08172, 7230.901) # small / medium / large ridge penalties
test_that("cvLM matches boot::cv.glm for OLS", {
  # Note: cvLM will not match boot::cv.glm for matrices that are wide, R's lm gives a least-squares solution
  # on overdetermined systems by placing 0 coefficients on redundant covariates, cvLM instead estimates using
  # the unique minimum norm solution; however, this will only affect out-of-sample predictions on wide data
  # sets
  skip_if_not_installed("boot")
  for (data.set in list(df.narrow, df.ill, df.rd, df.wide)) {
    fit.glm <- glm(y ~ ., data = data.set)
    # "Wide" counts the response column too; only df.wide (63 x 118) qualifies here
    is.wide <- nrow(data.set) < ncol(data.set)
    for (K in K.vals) {
      # LOOCV: NA in K.vals means use K = n
      if (is.na(K)) {
        K <- nrow(data.set)
      }
      # Skip computing boot for wide LOOCV because diag(H) = 1 and we get zero division
      if (is.wide && K == nrow(data.set)) {
        res <- cvLM(fit.glm, data = data.set, K.vals = K, center = FALSE)$CV
        expect_true(is.nan(res))
        next
      }
      # Re-seed the global RNG before boot so its random fold assignment lines up
      # with the folds cvLM draws from its `seed` argument (presumed to match --
      # the equality assertions below verify this)
      set.seed(seed)
      boot.res <- muffle(boot::cv.glm(
        data = data.set,
        glmfit = fit.glm,
        K = K
      ))$delta[1]
      cvLM.res <- muffle(cvLM(
        fit.glm,
        data = data.set,
        K.vals = K,
        seed = seed,
        n.threads = 1,
        tol = min(1e-07, fit.glm$control$epsilon / 1000), # match glm.fit
        center = FALSE
      ))$CV
      if (is.wide) {
        # On wide data the two solution conventions must produce visibly different CV
        expect_true(abs(boot.res - cvLM.res) > testthat_tolerance())
      } else {
        expect_equal(boot.res, cvLM.res)
      }
    }
  }
})
test_that("OLS CV is invariant to centering on narrow data", {
  # Centering the design must not change OLS (lambda = 0) CV error on
  # overdetermined data. Only narrow data sets are used: results could differ
  # on underdetermined systems since minimum norm solutions are used there.
  for (data.set in list(df.narrow, df.ill, df.rd)) {
    for (K in K.vals) {
      if (is.na(K)) {
        K <- nrow(data.set) # NA encodes LOOCV
      }
      # Run one CV with the given centering flag; everything else fixed
      run.cv <- function(center) {
        muffle(cvLM(
          y ~ .,
          data = data.set,
          K.vals = K,
          lambda = 0,
          seed = seed,
          center = center
        ))$CV
      }
      expect_equal(run.cv(TRUE), run.cv(FALSE))
    }
  }
})
test_that("cvLM matches manual matrix algebra for Ridge (K-fold and GCV)", {
  # Reference K-fold ridge CV written with explicit matrix algebra.
  #   y      response vector
  #   X      predictor matrix (no intercept column)
  #   K      requested number of folds (adjusted below as boot::cv.glm does)
  #   lambda ridge penalty
  #   seed   RNG seed used for the random fold assignment
  #   center if TRUE, center X and y by training-fold means instead of fitting
  #          an explicit (unpenalized) intercept column
  # Returns the fold-size-weighted mean squared prediction error.
  ridge.cv.ref <- function(y, X, K, lambda, seed, center) {
    set.seed(seed)
    n <- nrow(X)
    # Mirror boot's logic of changing the number of folds
    if ((K > n) || (K <= 1)) {
      stop("'K' outside allowable range")
    }
    K <- round(K)
    # Only fold counts that divide n "roundly" are allowed; snap K to the nearest
    kvals <- unique(round(n / (1L:floor(n / 2))))
    temp <- abs(kvals - K)
    if (!any(temp == 0)) {
      K <- kvals[temp == min(temp)][1L]
    }
    # Random fold labels, drawn exactly as boot::cv.glm draws them
    s <- sample(rep(1L:K, ceiling(n / K)), n)
    cv.errors <- numeric(K)
    weights <- table(s) / n
    for (i in seq_len(max(s))) {
      j.in <- which(s != i)
      j.out <- which(s == i)
      if (center) {
        # Center with training-fold statistics only, then add the training
        # response mean back onto the held-out predictions
        X.in.colMeans <- colMeans(X[j.in, , drop = FALSE])
        y.in.mean <- mean(y[j.in])
        X.in <- scale(
          X[j.in, , drop = FALSE],
          center = X.in.colMeans,
          scale = FALSE
        )
        y.in <- y[j.in] - y.in.mean
        X.out <- scale(
          X[j.out, , drop = FALSE],
          center = X.in.colMeans,
          scale = FALSE
        )
        # Ridge normal equations: (X'X + lambda I) beta = X'y
        beta <- solve(
          crossprod(X.in) + lambda * diag(ncol(X.in)),
          crossprod(X.in, y.in)
        )
        preds <- (X.out %*% beta) + y.in.mean
      } else {
        # Uncentered variant: the intercept is an explicit (penalized) column of 1s
        X.in <- cbind(1, X[j.in, , drop = FALSE])
        y.in <- y[j.in]
        X.out <- cbind(1, X[j.out, , drop = FALSE])
        beta <- solve(
          crossprod(X.in) + lambda * diag(ncol(X.in)),
          crossprod(X.in, y.in)
        )
        preds <- X.out %*% beta
      }
      cv.errors[i] <- mean((y[j.out] - preds)^2)
    }
    # Weight each fold's MSE by its share of observations
    sum(as.numeric(weights) * cv.errors)
  }
  # Reference generalized cross-validation (GCV) for ridge:
  #   GCV = mean( ((y - yhat) / (1 - tr(H)/n))^2 )
  # where H is the hat matrix of the (possibly centered) ridge fit.
  ridge.gcv.ref <- function(y, X, lambda, center) {
    n <- nrow(X)
    if (center) {
      y.mean <- mean(y)
      X.mat <- scale(X, center = colMeans(X), scale = FALSE)
      y.target <- y - y.mean
      XtX.lambda.inv <- solve(crossprod(X.mat) + (lambda * diag(ncol(X.mat))))
      beta <- XtX.lambda.inv %*% crossprod(X.mat, y.target)
      # matrix(1/n, n, n) adds the hat-matrix contribution of the mean term
      H <- X.mat %*% XtX.lambda.inv %*% t(X.mat) + matrix(1 / n, n, n)
      preds <- (X.mat %*% beta) + y.mean
    } else {
      X.mat <- cbind(1, X)
      XtX.lambda.inv <- solve(crossprod(X.mat) + (lambda * diag(ncol(X.mat))))
      beta <- XtX.lambda.inv %*% crossprod(X.mat, y)
      H <- X.mat %*% XtX.lambda.inv %*% t(X.mat)
      preds <- X.mat %*% beta
    }
    trH <- sum(diag(H))
    mean(((y - preds) / (1 - trH / n))^2)
  }
  # Compare cvLM against both references over data sets, penalties,
  # centering modes, and fold counts
  for (data.set in list(df.narrow, df.wide, df.rd)) {
    X.mat <- as.matrix(data.set[, -1])
    y.vec <- data.set[, 1]
    for (lambda in lambdas) {
      for (center in c(FALSE, TRUE)) {
        # GCV test
        expect_equal(
          cvLM(
            y ~ .,
            data = data.set,
            lambda = lambda,
            generalized = TRUE,
            center = center
          )$CV,
          ridge.gcv.ref(y.vec, X.mat, lambda, center)
        )
        # K-fold and LOOCV tests
        for (K in K.vals) {
          if (is.na(K)) {
            K <- nrow(data.set)
          }
          expect_equal(
            muffle(cvLM(
              y ~ .,
              data = data.set,
              K.vals = K,
              lambda = lambda,
              seed = seed,
              center = center
            ))$CV,
            ridge.cv.ref(y.vec, X.mat, K, lambda, seed, center)
          )
        }
      }
    }
  }
})
test_that("cvLM S3 methods return identical results", {
  # The formula, lm, and glm entry points of cvLM must dispatch to the same
  # underlying computation and produce byte-identical results.
  for (data.set in list(df.narrow, df.wide, df.rd, df.ill)) {
    fit.lm <- lm(y ~ ., data = data.set)
    fit.glm <- glm(y ~ ., data = data.set)
    for (K in K.vals) {
      if (is.na(K)) {
        K <- nrow(data.set) # NA encodes LOOCV
      }
      for (lambda in lambdas) {
        # Run cvLM on one of the three dispatch objects, all else fixed
        run <- function(obj) {
          muffle(cvLM(
            obj,
            data = data.set,
            K.vals = K,
            lambda = lambda,
            seed = seed
          ))
        }
        res.formula <- run(y ~ .)
        expect_identical(res.formula, run(fit.lm))
        expect_identical(res.formula, run(fit.glm))
      }
    }
  }
})
test_that("cvLM results are agnostic to the number of threads", {
  # Skip multithreaded tests on CRAN
  skip_on_cran()
  # Use the machine's worker count, but at least two so threading is exercised
  multi.threads <- max(RcppParallel::defaultNumThreads(), 2L)
  for (data.set in list(df.narrow, df.wide, df.rd, df.ill)) {
    for (K in K.vals) {
      if (is.na(K)) {
        next # LOOCV isn't multithreaded
      }
      for (lambda in lambdas) {
        # Run one CV with the given thread count, all else fixed
        run <- function(threads) {
          muffle(cvLM(
            y ~ .,
            data = data.set,
            K.vals = K,
            lambda = lambda,
            seed = seed,
            n.threads = threads
          ))
        }
        # expect_equal, not expect_identical: floating point addition is not
        # associative, so multithreaded reductions may differ in the last bits
        expect_equal(run(1L), run(multi.threads))
      }
    }
  }
})