Nothing
## Internal helpers for krls(..., approx = "nystrom").
##
## Nystrom replaces the full n x n kernel by a low-rank approximation
## anchored at m << n landmarks. The ridge problem then lives in
## m-dimensional space:
##
## Phi = C %*% U %*% diag(D_reg^{-1/2}) (n x m feature map)
## beta_hat = (Phi' Phi + lambda I)^{-1} Phi' y
## f_hat(x) = K(x, Z) %*% alpha, alpha = U %*% (D_reg^{-1/2} * beta)
##
## where C = K(X, Z), W = K(Z, Z), and (U, D) is the eigendecomposition
## of W with relative-ridge stabilization D_reg = pmax(D, eps * max(D)).
##
## See plans/krls-nystrom-1.4-0.md for the full design.
.select_landmarks_random <- function(n, m) {
sort.int(sample.int(n, m))
}
## Run `expr` with the RNG temporarily seeded at `seed`, restoring the
## caller's .Random.seed (or its absence) on exit. Used so that
## passing `landmark_seed` to krls() does NOT perturb the user's
## downstream RNG state -- e.g. in simulations where the caller has
## set their own seed and any silent re-seeding would shift subsequent
## draws.
.with_seed <- function(seed, expr) {
if (is.null(seed)) return(expr)
had_seed <- exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
old_seed <- if (had_seed) get(".Random.seed", envir = .GlobalEnv,
inherits = FALSE) else NULL
on.exit({
if (had_seed) {
assign(".Random.seed", old_seed, envir = .GlobalEnv)
} else if (exists(".Random.seed", envir = .GlobalEnv,
inherits = FALSE)) {
rm(".Random.seed", envir = .GlobalEnv)
}
})
set.seed(seed)
expr
}
.validate_nystrom_m <- function(nystrom_m, n) {
if (!is.numeric(nystrom_m) || length(nystrom_m) != 1L ||
is.na(nystrom_m) || !is.finite(nystrom_m) ||
nystrom_m != as.integer(nystrom_m)) {
stop("nystrom_m must be a finite integer")
}
nystrom_m <- as.integer(nystrom_m)
if (nystrom_m < 1L || nystrom_m > n) {
stop("nystrom_m must satisfy 1 <= nystrom_m <= nrow(X)")
}
nystrom_m
}
.validate_nystrom_eps <- function(nystrom_eps) {
if (!is.numeric(nystrom_eps) || length(nystrom_eps) != 1L ||
is.na(nystrom_eps) || !is.finite(nystrom_eps) ||
nystrom_eps <= 0) {
stop("nystrom_eps must be a finite positive scalar")
}
nystrom_eps
}
## X_std is the already-standardized training matrix; X_centers / X_scales
## are the column means / SDs of the original-scale training X. Index- and
## NULL-form landmarks resolve via X_std rows (already standardized).
## Matrix-form landmarks are taken as user-supplied original-scale
## coordinates and standardized with the same centers/scales so they live
## in the same kernel space the model uses internally.
.resolve_landmarks <- function(landmarks, landmark_method, nystrom_m,
X_std, X_centers, X_scales,
landmark_seed = NULL) {
n <- nrow(X_std)
d <- ncol(X_std)
if (is.null(landmarks)) {
if (is.null(nystrom_m)) {
# Default scales up to 500 landmarks; cheap to compute even
# at large n and gives a much better approximation than the
# historical sqrt(n) heuristic at moderate n.
nystrom_m <- min(500L, n)
}
nystrom_m <- .validate_nystrom_m(nystrom_m, n)
# Stochastic landmark selection (kmeans or random) is wrapped in
# .with_seed() so that a non-NULL landmark_seed seeds locally
# without disturbing the caller's global RNG state.
return(.with_seed(landmark_seed, {
if (landmark_method == "kmeans") {
# k-means on standardized X. Centers come back in the same
# standardized space, so they slot in alongside index-form
# landmarks without further transformation. Not bit-stable
# across R versions even under set.seed() because of the
# Hartigan-Wong algorithm's initialization; documented.
#
# stats::kmeans() requires centers < n; when m == n we
# short-circuit to every row as its own cluster (the
# degenerate "kmeans" answer).
if (nystrom_m == n) {
list(indices = seq_len(n),
matrix = X_std,
method_used = "kmeans")
} else {
km <- stats::kmeans(X_std, centers = nystrom_m,
nstart = 10L, iter.max = 50L)
Z <- unname(km$centers)
attr(Z, "kmeans_iter") <- km$iter
attr(Z, "kmeans_tot") <- km$tot.withinss
list(indices = NULL, matrix = Z, method_used = "kmeans")
}
} else {
idx <- .select_landmarks_random(n, nystrom_m)
list(indices = idx,
matrix = X_std[idx, , drop = FALSE],
method_used = "random")
}
}))
}
if (is.numeric(landmarks) && is.null(dim(landmarks))) {
if (anyNA(landmarks) || any(!is.finite(landmarks)) ||
any(landmarks != as.integer(landmarks))) {
stop("landmarks (as indices) must be integer-valued")
}
idx <- as.integer(landmarks)
if (anyNA(idx) || any(idx < 1L) || any(idx > n) || anyDuplicated(idx)) {
stop("landmarks (as indices) must be unique integers in 1:nrow(X)")
}
return(list(indices = idx,
matrix = X_std[idx, , drop = FALSE],
method_used = "user_indices"))
}
if (is.matrix(landmarks) || is.data.frame(landmarks)) {
Z <- as.matrix(landmarks)
if (!is.numeric(Z)) stop("landmarks matrix must be numeric")
if (ncol(Z) != d) stop("ncol(landmarks) must equal ncol(X)")
if (anyNA(Z) || any(!is.finite(Z)))
stop("landmarks matrix must contain only finite, non-NA values")
if (anyDuplicated(Z))
stop("landmarks matrix must not contain duplicate rows ",
"(W would be rank-deficient)")
# User-supplied landmarks are in original X-scale; standardize using
# the training X's centers/scales to land in the same standardized
# kernel space the model uses internally.
Z_std <- scale(Z, center = X_centers, scale = X_scales)
attr(Z_std, "scaled:center") <- NULL
attr(Z_std, "scaled:scale") <- NULL
return(list(indices = NULL, matrix = Z_std, method_used = "user_matrix"))
}
stop("`landmarks` must be NULL, an integer vector of row indices, ",
"or an m x d numeric matrix (in original X-scale)")
}
## Cross-kernel K(A, B) for a Gaussian kernel with bandwidth sigma.
## Uses the ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a'b decomposition.
.gauss_cross_kernel <- function(A, B, sigma) {
aa <- rowSums(A * A)
bb <- rowSums(B * B)
D2 <- outer(aa, bb, `+`) - 2 * tcrossprod(A, B)
D2[D2 < 0] <- 0 # guard against tiny negatives from rounding
exp(-D2 / sigma)
}
## Bound-finding for the Nystrom lambda search. The exact-path heuristic
## in `lambdasearch()` walks U down from n until EDF >= 1; that
## assumes the spectrum can supply at least one full effective degree
## of freedom. For Nystrom with small m (e.g. m = 1, where max EDF is
## also 1) the heuristic collapses to U = 0 and the search runs over a
## degenerate near-zero interval.
##
## Here we instead anchor U at the dominant landmark eigenvalue (which
## corresponds to "heavily regularized" relative to the data scale) and
## grow it if needed to drive total EDF below 1. L starts at machine
## epsilon and grows until effective dimensionality approaches the
## quantile-defined target -- matching the spirit of `lambdasearch()`
## but never decrementing below eps.
.nystrom_lambda_bounds <- function(Sigma2, n, L, U) {
Smax <- max(Sigma2)
if (is.null(U)) {
U <- Smax
# Grow U until EDF drops below 1 (well-regularized). Bounded so the
# loop terminates even on pathological spectra.
iter <- 0L
while (sum(Sigma2 / (Sigma2 + U)) >= 1 && iter < 200L) {
U <- U * 2
iter <- iter + 1L
}
} else {
stopifnot(is.numeric(U), length(U) == 1L, U > 0)
}
if (is.null(L)) {
q <- which.min(abs(Sigma2 - (Smax / 1000)))
L <- .Machine$double.eps
while (sum(Sigma2 / (Sigma2 + L)) > q && L < Smax) L <- L + 0.05
} else {
stopifnot(is.numeric(L), length(L) == 1L, L >= 0)
}
if (!(L < U)) {
stop("Nystrom lambda-search bounds collapsed to L >= U (L = ",
signif(L, 3), ", U = ", signif(U, 3),
"); try supplying L and U explicitly")
}
list(L = L, U = U)
}
## Core fit. Operates on already-standardized X and y, returns
## standardized-scale outputs plus the Nystrom bookkeeping that krls()
## will splice into the final fit object.
.fit_krls_nystrom <- function(X, y, sigma, landmarks_resolved,
lambda, nystrom_eps,
L, U, tol, noisy,
compute_vcov = TRUE,
lambda_method = c("loo", "gcv")) {
lambda_method <- match.arg(lambda_method)
n <- nrow(X)
Z <- landmarks_resolved$matrix
m <- nrow(Z)
y_vec <- as.vector(y)
nystrom_eps <- .validate_nystrom_eps(nystrom_eps)
C <- .gauss_cross_kernel(X, Z, sigma) # n x m
W <- .gauss_cross_kernel(Z, Z, sigma) # m x m
We <- eigen(W, symmetric = TRUE)
Dvals <- We$values
Dvals <- pmax(Dvals, 0) # numerical guard
Dmax <- max(Dvals)
if (Dmax <= 0) stop("anchor kernel W is numerically zero; try a larger sigma")
D_reg <- pmax(Dvals, nystrom_eps * Dmax)
Dinvsqrt <- 1 / sqrt(D_reg)
# Diagnostics: how many landmark-kernel eigenvalues hit the relative
# ridge floor, and the pre-floor spectrum range. Useful for spotting
# near-collinear landmarks / oversized m.
floored_count <- sum(Dvals < nystrom_eps * Dmax)
D_min_raw <- min(Dvals)
# Phi = C %*% U %*% diag(Dinvsqrt) (n x m)
Phi <- C %*% sweep(We$vectors, 2, Dinvsqrt, `*`)
# Cache SVD for the lambda loop.
svd_phi <- svd(Phi)
Sigma2 <- svd_phi$d^2
# Guard against a numerically zero feature spectrum. This happens
# when sigma is so small (or the landmarks so far from the data)
# that every cross-kernel entry underflows; the lambda-bound search
# then evaluates 0/0 and aborts with a cryptic NaN comparison.
if (max(Sigma2) <= 0) {
stop("Nystrom feature spectrum is numerically zero -- the cross ",
"kernel K(X, landmarks) underflowed. This usually means ",
"sigma is too small for the data scale, or the supplied ",
"landmarks are very far from the observations. Try a larger ",
"sigma, use landmark_method = 'random' to sample landmarks ",
"from X, or rescale the predictors.")
}
## LOO and GCV objectives at a given lambda. Both run in O(n m) from
## the cached SVD; both are guarded against near-singular denominators
## (which can arise at extremely small lambda).
loo_loss <- function(lambda_val) {
w <- Sigma2 / (Sigma2 + lambda_val)
yfit <- as.numeric(svd_phi$u %*% (w * crossprod(svd_phi$u, y_vec)))
diagS <- as.numeric((svd_phi$u^2) %*% w)
denom <- 1 - diagS
if (any(denom <= .Machine$double.eps)) return(.Machine$double.xmax)
sum(((y_vec - yfit) / denom)^2)
}
gcv_loss <- function(lambda_val) {
w <- Sigma2 / (Sigma2 + lambda_val)
yfit <- as.numeric(svd_phi$u %*% (w * crossprod(svd_phi$u, y_vec)))
RSS <- sum((y_vec - yfit)^2)
tr_S <- sum(w)
denom <- 1 - tr_S / n
if (denom <= .Machine$double.eps) return(.Machine$double.xmax)
RSS / (denom^2)
}
obj_fn <- if (lambda_method == "loo") loo_loss else gcv_loss
if (is.null(lambda)) {
if (is.null(tol)) tol <- 1e-3 * n
bounds <- .nystrom_lambda_bounds(Sigma2, n, L, U)
L <- bounds$L; U <- bounds$U
# Golden-section search, structurally identical to lambdasearch().
gr <- 0.381966
X1 <- L + gr * (U - L); X2 <- U - gr * (U - L)
S1 <- obj_fn(X1); S2 <- obj_fn(X2)
if (noisy) cat("L:", L, "X1:", X1, "X2:", X2, "U:", U,
"S1:", S1, "S2:", S2, "\n")
while (abs(S1 - S2) > tol) {
if (S1 < S2) {
U <- X2; X2 <- X1; X1 <- L + gr * (U - L)
S2 <- S1; S1 <- obj_fn(X1)
} else {
L <- X1; X1 <- X2; X2 <- U - gr * (U - L)
S1 <- S2; S2 <- obj_fn(X2)
}
if (noisy) cat("L:", L, "X1:", X1, "X2:", X2, "U:", U,
"S1:", S1, "S2:", S2, "\n")
}
lambda <- if (S1 < S2) X1 else X2
if (noisy) cat("Lambda:", lambda, "\n")
} else {
stopifnot(is.numeric(lambda), length(lambda) == 1L, lambda > 0)
}
# Solution at chosen lambda.
w <- Sigma2 / (Sigma2 + lambda)
yfit_std <- as.numeric(svd_phi$u %*% (w * crossprod(svd_phi$u, y_vec)))
# beta = V (Sigma / (Sigma^2 + lambda)) U' y (length m)
beta <- as.numeric(
svd_phi$v %*% ((svd_phi$d * crossprod(svd_phi$u, y_vec)) / (Sigma2 + lambda))
)
# alpha = U diag(Dinvsqrt) beta (length m; gives f(x) = K(x,Z) %*% alpha)
alpha <- as.numeric(We$vectors %*% (Dinvsqrt * beta))
diagS <- as.numeric((svd_phi$u^2) %*% w)
Looe_std <- sum(((y_vec - yfit_std) / (1 - diagS))^2)
vcov_alpha_std <- NULL
if (isTRUE(compute_vcov)) {
sigmasq_std <- as.numeric((1 / n) * crossprod(y_vec - yfit_std))
beta_weights <- sigmasq_std * Sigma2 / (Sigma2 + lambda)^2
vcov_beta <- tcrossprod(sweep(svd_phi$v, 2, beta_weights, `*`),
svd_phi$v)
alpha_map <- sweep(We$vectors, 2, Dinvsqrt, `*`)
vcov_alpha_std <- tcrossprod(alpha_map %*% vcov_beta, alpha_map)
}
list(
coeffs = matrix(alpha, ncol = 1), # length-m
fitted_std = yfit_std, # length-n
lambda = lambda,
Looe_std = Looe_std,
landmarks = Z, # m x d, standardized
landmark_indices = landmarks_resolved$indices,
landmark_method = landmarks_resolved$method_used,
W_eigen = list(values = Dvals, vectors = We$vectors),
Dinvsqrt = Dinvsqrt,
Sigma2 = Sigma2, # m-length, for eff_df / GCV
vcov_alpha_std = vcov_alpha_std,
C = C, # n x m, for derivative loop
nystrom_m = m,
nystrom_eps = nystrom_eps,
floored_count = floored_count,
D_min_raw = D_min_raw,
D_max_raw = Dmax
)
}
## Pointwise derivatives under Nystrom:
## d/dx_d f_hat(x_i) = sum_j alpha_j (-2/sigma) (X[i,d] - Z[j,d]) C[i,j]
.nystrom_derivatives <- function(X, Z, C, alpha, sigma) {
n <- nrow(X); d <- ncol(X); m <- nrow(Z)
derivmat <- matrix(NA_real_, n, d)
scale_factor <- -2 / sigma
for (k in seq_len(d)) {
# distk[i, j] = X[i, k] - Z[j, k]
distk <- outer(X[, k], Z[, k], `-`) # n x m
Lmat <- distk * C # n x m
derivmat[, k] <- scale_factor * (Lmat %*% alpha)
}
derivmat
}
.nystrom_ame_variances <- function(X, Z, C, vcov_alpha, sigma) {
if (is.null(vcov_alpha)) return(NULL)
d <- ncol(X)
out <- matrix(NA_real_, 1, d)
scale_factor <- -2 / sigma
for (k in seq_len(d)) {
distk <- outer(X[, k], Z[, k], `-`)
Lmat <- distk * C
g <- as.numeric(scale_factor * colMeans(Lmat))
out[1, k] <- as.numeric(crossprod(g, vcov_alpha %*% g))
}
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.