# R/rda-high-dim.r in sparsediscrim: Sparse and Regularized Discriminant Analysis


#' High-Dimensional Regularized Discriminant Analysis (HDRDA)
#'
#' Given a set of training data, this function builds the HDRDA classifier from
#' Ramey, Stein, and Young (2017). Specially designed for small-sample,
#' high-dimensional data, the HDRDA classifier incorporates dimension reduction
#' and covariance-matrix shrinkage to enable a computationally efficient
#' classifier.
#'
#' The HDRDA classifier utilizes a covariance-matrix estimator that is a convex
#' combination of the covariance-matrix estimators used in the Linear
#' Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA)
#' classifiers. For each of the K classes given in y,
#' \eqn{(k = 1, \ldots, K)}, we first define this convex combination as
#' \deqn{\hat{\Sigma}_k(\lambda) = (1 - \lambda) \hat{\Sigma}_k
#' + \lambda \hat{\Sigma},}
#' where \eqn{\lambda \in [0, 1]} is the _pooling_ parameter. We then
#' calculate the covariance-matrix estimator
#' \deqn{\tilde{\Sigma}_k = \alpha_k \hat{\Sigma}_k(\lambda) + \gamma I_p,}
#' where \eqn{I_p} is the \eqn{p \times p} identity matrix. The matrix
#' \eqn{\tilde{\Sigma}_k} is substituted into the HDRDA classifier. See Ramey et
#' al. (2017) for more details.
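#'
#' As a worked sketch of the two formulas above (illustrative only; internally
#' the classifier works with an eigenvalue decomposition rather than the full
#' covariance matrices, and the divide-by-n MLE convention is assumed here):
#'
#' ```r
#' set.seed(42)
#' x1 <- scale(matrix(rnorm(50), 10, 5), scale = FALSE) # centered class 1
#' x2 <- scale(matrix(rnorm(50), 10, 5), scale = FALSE) # centered class 2
#' sigma_1 <- crossprod(x1) / nrow(x1)                  # class-1 estimator
#' sigma_pool <- crossprod(rbind(x1, x2)) / 20          # pooled estimator
#' lambda <- 0.5; gamma <- 0.1; alpha_1 <- 1            # ridge-type shrinkage
#' sigma_tilde_1 <- alpha_1 * ((1 - lambda) * sigma_1 + lambda * sigma_pool) +
#'   gamma * diag(5)
#' ```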
#'
#' The matrix of training observations is given in x. The rows of
#' x contain the sample observations, and the columns contain the features
#' for each training observation. The vector of class labels given in y
#' is coerced to a factor. The length of y should match the number
#' of rows in x.
#'
#' The vector prior contains the _a priori_ class membership probabilities
#' for each class. If prior is NULL (default), the class membership
#' probabilities are estimated as the sample proportions of observations
#' belonging to each class. Otherwise, prior should be a vector with the
#' same length as the number of classes in y. The prior
#' probabilities should be nonnegative and sum to one. The order of the prior
#' probabilities is assumed to match the levels of factor(y).
#'
#' @export
#' @references Ramey, J. A., Stein, C. K., and Young, D. M. (2017),
#' "High-Dimensional Regularized Discriminant Analysis."
#' \url{https://arxiv.org/abs/1602.01182}.
#' @references Friedman, J. H. (1989), "Regularized Discriminant Analysis,"
#' Journal of the American Statistical Association, 84(405), 165-175.
#' \url{http://www.jstor.org/stable/2289860} (Requires full-text access).
#' @inheritParams lda_diag
#' @param y vector of class labels for each training observation
#' @param lambda the HDRDA pooling parameter. Must be between 0 and 1,
#' inclusive.
#' @param gamma a nonnegative numeric value used as the shrinkage parameter.
#' @param shrinkage_type the type of covariance-matrix shrinkage to apply. By
#' default, a ridge-like shrinkage is applied. If convex is given, then
#' shrinkage similar to Friedman (1989) is applied. See Ramey et al. (2017) for
#' details.
#' @param prior vector with prior probabilities for each class. If NULL
#' (default), then the sample proportion of observations belonging to each
#' class is used. See details.
#' @param tol a threshold for determining nonzero eigenvalues.
#' @return rda_high_dim object that contains the trained HDRDA classifier
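#' @examples
#' # A minimal usage sketch (illustrative tuning values; uses the built-in
#' # iris data):
#' fit <- rda_high_dim(x = iris[, -5], y = iris$Species,
#'                     lambda = 0.5, gamma = 0.1)
#' pred <- predict(fit, iris[, -5], type = "class")
#' mean(pred == iris$Species) # training accuracy
#'
#' # The same model via the formula interface:
#' fit_formula <- rda_high_dim(Species ~ ., data = iris,
#'                             lambda = 0.5, gamma = 0.1)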
rda_high_dim <- function(x, ...) {
UseMethod("rda_high_dim")
}

#' @rdname rda_high_dim
#' @export
rda_high_dim.default <- function(x, y, lambda = 1, gamma = 0,
shrinkage_type = c("ridge", "convex"), prior = NULL,
tol = 1e-6, ...) {
x <- pred_to_matrix(x)
y <- outcome_to_factor(y)
complete <- complete.cases(x) & complete.cases(y)
x <- x[complete,,drop = FALSE]
y <- y[complete]
lambda <- as.numeric(lambda)
gamma <- as.numeric(gamma)
shrinkage_type <- match.arg(shrinkage_type)

if (lambda < 0 || lambda > 1) {
rlang::abort("The value for 'lambda' must be between 0 and 1, inclusively.")
}

if (gamma < 0) {
rlang::abort("The value for 'gamma' must be nonnegative.")
}

obj <- regdiscrim_estimates(x = x, y = y, cov = FALSE, prior = prior, ...)
x_centered <- center_data(x = x, y = y)
K <- obj$num_groups

# Computes the eigenvalue decomposition of the pooled sample covariance matrix
# using the Fast SVD approach.
cov_pool_eigen <- cov_eigen(x = x, y = y, pool = TRUE, fast = TRUE, tol = tol)
obj$D_q <- cov_pool_eigen$values
obj$U1 <- cov_pool_eigen$vectors
obj$q <- length(obj$D_q)
obj$shrinkage_type <- shrinkage_type
obj$lambda <- lambda
obj$gamma <- gamma

if (shrinkage_type == "ridge") {
alpha <- 1
} else {
# shrinkage_type == "convex"
alpha <- 1 - gamma
}
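
# With these weights, the shrunken eigenvalues computed below become
# Gamma = lambda * D_q + gamma under "ridge" (alpha = 1) and
# Gamma = (1 - gamma) * lambda * D_q + gamma under "convex" (alpha = 1 - gamma).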

# Transforms the centered data
XU <- x_centered %*% obj$U1

# For each class, we calculate the following quantities necessary to train the
# HDRDA classifier:
# 1. \Gamma_k
# 2. Q_k
# 3. W_k^{-1}
for (k in seq_len(K)) {
X_k <- x_centered[y == levels(y)[k], , drop=FALSE]
n_k <- nrow(X_k)

# Extracts the transformed, centered data.
# No need to calculate it for the classes individually.
XU_k <- XU[y == levels(y)[k], , drop=FALSE]

# Transforms the sample mean to the lower dimension.
xbar_U1 <- crossprod(obj$U1, obj$est[[k]]$xbar)

# In the special case that (lambda, gamma) = (0, 0), the improvements to
# HDRDA's speed via the Sherman-Woodbury formula are not applicable because
# Gamma = 0. In this case, we calculate W_inv directly. If the matrix is
# singular, a slight amount of shrinkage is applied.
if (lambda == 0 && gamma == 0) {
W_k <- cov_mle(XU_k)
W_inv <- try(solve_chol(W_k), silent=TRUE)

if (inherits(W_inv, "try-error")) {
W_k <- W_k + diag(0.001, nrow=nrow(W_k), ncol=ncol(W_k))
W_inv <- solve_chol(W_k)
}

Gamma <- matrix(0, nrow=obj$q, ncol=obj$q)
Q <- diag(n_k)
} else {
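# Here W_k = Gamma + c * t(XU_k) %*% XU_k with c = alpha * (1 - lambda) / n_k,
# so the Sherman-Woodbury identity gives its inverse as
#   W_k^{-1} = Gamma^{-1} - c * Gamma^{-1} t(XU_k) Q^{-1} XU_k Gamma^{-1},
# where Q = I + c * XU_k Gamma^{-1} t(XU_k), computed below.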
# Although 'Gamma_k' is a diagonal matrix, we store only its diagonal
# elements.
Gamma <- alpha * lambda * obj$D_q + gamma
Gamma_inv <- diag(Gamma^(-1))

# X_k %*% U_1 %*% Gamma^{-1} is computed repeatedly in the equations.
# We compute the matrix once and use it where necessary to avoid
# unnecessary computations.
XU_Gamma_inv <- XU_k %*% Gamma_inv
Q <- diag(n_k) + alpha * (1 - lambda) / n_k * tcrossprod(XU_Gamma_inv, XU_k)
W_inv <- alpha * (1 - lambda) / n_k *
crossprod(XU_Gamma_inv, solve(Q, XU_Gamma_inv))
W_inv <- Gamma_inv - W_inv
}

obj$est[[k]]$n_k <- n_k
obj$est[[k]]$alpha <- alpha
obj$est[[k]]$XU <- XU_k
obj$est[[k]]$xbar_U1 <- xbar_U1
obj$est[[k]]$Gamma <- Gamma
obj$est[[k]]$Q <- Q
obj$est[[k]]$W_inv <- W_inv
}

# Creates an object of type 'rda_high_dim'.
obj$col_names <- colnames(x)
obj <- new_discrim_object(obj, "rda_high_dim")

obj
}

#' @inheritParams lda_diag
#' @rdname rda_high_dim
#' @importFrom stats model.frame model.matrix model.response
#' @export
rda_high_dim.formula <- function(formula, data, ...) {
# The formula interface includes an intercept. If the user includes the
# intercept in the model, it should be removed. Otherwise, errors and doom
# happen.
# To remove the intercept, we update the formula, like so:
# (NOTE: The terms must be collected in case the dot (.) notation is used)
formula <- no_intercept(formula, data)

mf <- model.frame(formula = formula, data = data)
.terms <- attr(mf, "terms")
x <- model.matrix(.terms, data = mf)
y <- model.response(mf)

est <- rda_high_dim.default(x = x, y = y, ...)
est$.terms <- .terms
est <- new_discrim_object(est, class(est))
est
}

#' Outputs the summary for a HDRDA classifier object.
#'
#' Summarizes the trained rda_high_dim classifier in a nice manner.
#'
#' @inheritParams print.lda_diag
#' @keywords internal
#' @export
print.rda_high_dim <- function(x, ...) {
cat("High-Dimensional Regularized Discriminant Analysis\n\n")
print_basics(x, ...)
cat("\nReduced dimension:", x$q, "\n")
cat("Shrinkage type:", x$shrinkage_type, "\n") cat("Lambda:", signif(x$lambda, 3), "Gamma:", signif(x$gamma, 3), "\n") invisible(x) } #' Predicts the class membership of a matrix of unlabeled observations with a #' trained HDRDA classifier. #' #' For a given rda_high_dim object, we predict the class of each observation #' (row) of the the matrix given in newdata. #' #' @rdname rda_high_dim #' @inheritParams predict.lda_diag #' @export #' @param object Object of type rda_high_dim that contains the trained HDRDA #' classifier #' @param projected logical indicating whether newdata have already been #' projected to a q-dimensional subspace. This argument can yield large gains in #' speed when the linear transformation has already been performed. #' @return list with predicted class and discriminant scores for each of the K #' classes predict.rda_high_dim <- function(object, newdata, projected = FALSE, type = c("class", "prob", "score"), ...) { type <- rlang::arg_match0(type, c("class", "prob", "score"), arg_nm = "type") newdata <- process_newdata(object, newdata) scores <- sapply(object$est, function(class_est) {
if (object$lambda == 0 && object$gamma == 0) {
# Want: log(det(W_k)) = -log(det(W_k_inv))
log_det <- -log_determinant(class_est$W_inv)
} else {
log_det <- log_determinant(class_est$Q)
}

if (projected) {
# The newdata have already been projected. Yay for speed!
# Center the 'newdata' by the projected class sample mean
U1_x <- scale(newdata, center = class_est$xbar_U1, scale = FALSE)
quad_forms <- diag(drop(tcrossprod(U1_x %*% class_est$W_inv, U1_x)))
} else {
# Center the 'newdata' by the class sample mean
x_centered <- scale(newdata, center = class_est$xbar, scale = FALSE)
U1_x <- crossprod(object$U1, t(x_centered))

quad_forms <- apply(U1_x, 2, function(z) {
quadform(class_est$W_inv, z)
})
}
quad_forms + log_det - 2 * log(class_est$prior)
})

if (is.vector(scores)) {
# When sapply above returns a vector, the naming is thrown off.
# Hence, we rename it to the groups.
names(scores) <- object$groups
min_scores <- min_index(scores)
posterior <- exp(-(scores - min(scores)))
posterior <- posterior / sum(posterior)
} else {
min_scores <- apply(scores, 1, min_index)
# Grabbed code to calculate 'posterior' from MASS:::predict.qda, which
# handles numerical overflow unlike the more direct:
# exp(scores) / (1 + exp(sum(scores)))
posterior <- exp(-(scores - apply(scores, 1, min)))
posterior <- posterior / rowSums(posterior)
}

if (type == "prob") {
if (is.vector(posterior)) {
posterior <- matrix(posterior, nrow = 1)
colnames(posterior) <- object$groups
}
res <- as.data.frame(posterior)
} else if (type == "class") {
res <- with(object, factor(groups[min_scores], levels = groups))
} else {
if (is.vector(scores)) {
scores <- matrix(scores, nrow = 1)
colnames(scores) <- object$groups
}
res <- as.data.frame(scores)
}

res
}

#' Helper function to optimize the HDRDA classifier via cross-validation
#'
#' For a given data set, we apply cross-validation (cv) to select the optimal
#' HDRDA tuning parameters.
#'
#' The number of cross-validation folds is given in num_folds.
#'
#' @export
#' @importFrom dplyr arrange
#' @inheritParams lda_diag
#' @param y vector of class labels for each training observation
#' @param num_folds the number of cross-validation folds.
#' @param num_lambda the number of values of lambda to consider
#' @param num_gamma the number of values of gamma to consider
#' @param shrinkage_type the type of covariance-matrix shrinkage to apply. By
#' default, a ridge-like shrinkage is applied. If convex is given, then
#' shrinkage similar to Friedman (1989) is applied. See Ramey et al. (2017) for
#' details.
#' @param verbose if set to TRUE, summary information is printed as the
#' optimal model is being determined.
#' @param ... Options passed to [rda_high_dim()].
#' @return list containing the HDRDA model that minimizes the cross-validation
#' error as well as a data.frame that summarizes the cross-validation results.
rda_high_dim_cv <- function(x, y, num_folds = 10, num_lambda = 21,
num_gamma = 8, shrinkage_type = c("ridge", "convex"),
verbose = FALSE, ...) {
x <- pred_to_matrix(x)
y <- outcome_to_factor(y)
complete <- complete.cases(x) & complete.cases(y)
x <- x[complete,,drop = FALSE]
y <- y[complete]
shrinkage_type <- match.arg(shrinkage_type)

cv_folds <- cv_partition(y = y, num_folds = num_folds)

# The grid of tuning parameters.
# The grid is sorted by model preference: ties in the cross-validation error
# are broken in favor of models appearing earlier in the grid, reflecting a
# preference based on model parsimony.
seq_lambda <- seq(0, 1, length = num_lambda)
if (shrinkage_type == "ridge") {
seq_gamma <- c(0, 10^seq.int(-2, num_gamma - 4))
} else {
# shrinkage_type == "convex"
seq_gamma <- seq(0, 1, length = num_gamma)
}
tuning_grid <- expand.grid(lambda = seq_lambda, gamma = seq_gamma)
tuning_grid <- dplyr::arrange(tuning_grid, lambda, gamma)

cv_errors <- sapply(seq_along(cv_folds), function(fold_i) {
if (verbose) {
cat("CV Fold: ", fold_i, " of ", num_folds, "\n")
}
fold <- cv_folds[[fold_i]]
train_x <- x[fold$training, ]
train_y <- y[fold$training]
test_x <- x[fold$test, ]
test_y <- y[fold$test]

hdrda_out <- rda_high_dim(x = train_x, y = train_y, lambda = 1, gamma = 0, ...)

# Projects the test data to the q-dimensional subspace.
# No need to do this for each lambda/gamma pair.
# NOTE: It's not centered yet.
test_x <- test_x %*% hdrda_out$U1

# For each value of lambda and gamma, we train the HDRDA classifier,
# classify the test observations, and record the number of classification
# errors.
fold_errors <- mapply(function(lambda, gamma) {
errors <- try({
# Updates Gamma, Q, and W_inv for each class in hdrda_out
# If an error is thrown, we return 'NA'.
hdrda_updated <- update_rda_high_dim(hdrda_out, lambda, gamma)
# Call to predict.rda_high_dim is explicit to pass R CMD CHECK.
sum(predict.rda_high_dim(hdrda_updated, test_x, projected = TRUE) != test_y)
}, silent = TRUE)
errors
}, tuning_grid$lambda, tuning_grid$gamma)
fold_errors
})

cv_errors <- rowSums(cv_errors)
error_rate <- cv_errors / nrow(x)
cv_summary <- cbind(tuning_grid, cv_errors, error_rate)

optimal <- which.min(cv_errors)
lambda <- tuning_grid$lambda[optimal]
gamma <- tuning_grid$gamma[optimal]

# Trains a classifier based on the optimal model.
hdrda_out <- rda_high_dim(x = x, y = y, lambda = lambda, gamma = gamma,
shrinkage_type = shrinkage_type)

# Adds the optimal parameters and cv_summary to the classifier object.
hdrda_out$lambda <- lambda
hdrda_out$gamma <- gamma
hdrda_out$cv_summary <- cv_summary
class(hdrda_out) <- c("rda_high_dim_cv", "rda_high_dim")

hdrda_out
}
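
# A minimal usage sketch (illustrative; uses the iris data), kept in comments
# so that sourcing this file has no side effects:
#
#   cv_out <- rda_high_dim_cv(x = iris[, -5], y = iris$Species, num_folds = 5)
#   cv_out$lambda  # selected pooling parameter
#   cv_out$gamma   # selected shrinkage parameter
#   plot(cv_out)   # heatmap of the cross-validation error grid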

#' Plots a heatmap of the cross-validation error grid for a HDRDA classifier
#' object.
#'
#' Uses [ggplot2::ggplot()] to plot a heatmap of the cross-validation error
#' grid.
#'
#' @param x object to plot
#' @param ... unused
#' @return A ggplot object.
#' @export
#' @importFrom ggplot2 ggplot aes scale_fill_gradient labs scale_x_discrete
#' @importFrom ggplot2 scale_y_discrete theme labs geom_tile element_blank
plot.rda_high_dim_cv <- function(x, ...) {
cv_summary <- x$cv_summary
cv_summary <- within(cv_summary, {
lambda <- round(lambda, 3)
gamma <- round(gamma, 3)
})

# Fixes NOTE from R CMD CHECK:
# "no visible binding for global variable 'error_rate'"
error_rate <- 1

p <- ggplot(cv_summary, aes(factor(gamma), factor(lambda)))
p <- p + geom_tile(aes(fill = error_rate), colour = "white")
p <- p + scale_fill_gradient(low = "white", high = "steelblue",
name = "CV Error Rate")
p <- p + labs(x = expression(gamma), y = expression(lambda))
p <- p + scale_x_discrete(expand = c(0, 0))
p <- p + scale_y_discrete(expand = c(0, 0))
p <- p + theme(axis.ticks = element_blank())
p
}

#' Helper function to update tuning parameters for the HDRDA classifier
#'
#' This function updates some of the quantities in the HDRDA classifier based
#' on updated values of lambda and gamma. The update can greatly expedite
#' cross-validation when examining a large grid of values for lambda and
#' gamma.
#'
#' @param obj a rda_high_dim object
#' @param lambda a numeric value between 0 and 1, inclusive
#' @param gamma a nonnegative numeric value
#' @return a rda_high_dim object with updated estimates
update_rda_high_dim <- function(obj, lambda = 1, gamma = 0) {
# In the special case that (lambda, gamma) = (0, 0), the improvements to
# HDRDA's speed via the Sherman-Woodbury formula are not applicable because
# Gamma = 0. In this case, we calculate W_inv directly.
if (lambda == 0 && gamma == 0) {
Gamma <- matrix(0, nrow = obj$q, ncol = obj$q)
} else {
# NOTE: alpha_k is constant across all classes, so that alpha_k = alpha_1
# for all k. As a result, Gamma and Gamma_inv are constant across all k. We
# compute both before looping through all K classes.
Gamma <- obj$est[[1]]$alpha * lambda * obj$D_q + gamma
Gamma_inv <- diag(Gamma^(-1))
}

for (k in seq_len(obj$num_groups)) {
n_k <- obj$est[[k]]$n_k

if (lambda == 0 && gamma == 0) {
Q <- diag(n_k)
W_k <- cov_mle(obj$est[[k]]$XU)
W_inv <- try(solve_chol(W_k), silent=TRUE)

if (inherits(W_inv, "try-error")) {
W_k <- W_k + diag(0.001, nrow=nrow(W_k), ncol=ncol(W_k))
W_inv <- solve_chol(W_k)
}
} else {
# X_k %*% U_1 %*% Gamma^{-1} is computed repeatedly in the equations.
# We compute the matrix once and use it where necessary to avoid
# unnecessary computations.
XU_Gamma_inv <- obj$est[[k]]$XU %*% Gamma_inv
Q <- diag(n_k) + obj$est[[k]]$alpha * (1 - lambda) / n_k *
tcrossprod(XU_Gamma_inv, obj$est[[k]]$XU)
W_inv <- obj$est[[k]]$alpha * (1 - lambda) / n_k *
crossprod(XU_Gamma_inv, solve(Q, XU_Gamma_inv))
W_inv <- Gamma_inv - W_inv
}
obj$est[[k]]$Gamma <- Gamma
obj$est[[k]]$Q <- Q
obj$est[[k]]$W_inv <- W_inv
}
obj$lambda <- lambda
obj$gamma <- gamma
obj
}
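
# A minimal usage sketch (illustrative), commented out to keep this file free
# of side effects: refit with new tuning parameters without recomputing the
# eigenvalue decomposition.
#
#   fit <- rda_high_dim(x = iris[, -5], y = iris$Species, lambda = 1, gamma = 0)
#   fit_new <- update_rda_high_dim(fit, lambda = 0.25, gamma = 0.01)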

