randomiz.test <- function(residualsNew, residualsReference, nperm) {
d <- residualsNew^2 - residualsReference^2
md <- mean(d)
N <- length(d)
signs <- round(matrix(runif(N * nperm), N, nperm)) * 2 - 1
dsigns <- d * signs
mdsigns <- colMeans(dsigns)
count <- sum(mdsigns >= md) ## equality will never occur, right?
(count + .5) / (nperm + 1)
}
## Starting from the minimum in the CV results do formal randomization
## tests for the models with fewer components or apply the one-sigma
## criterion. This is done on the validation data. At this moment only
## for univariate Y, and for the "CV" estimate.
#' @title Suggestions for the optimal number of components in PCR and PLSR models
#'
#' @description Choosing the best number of components in PCR and PLSR models is difficult
#' and usually done on the basis of visual inspection of the validation plots.
#' In cases where large numbers of models are built this choice needs to be
#' automated. This function implements two proposals, one based on
#' randomization (permutation) testing, and an approach based on the standard
#' error of the cross-validation residuals.
#'
#' @details In both approaches the results of cross-validation are used, so the model
#' should have been calculated with some form of cross-validation. First, the
#' absolute minimum in the CV curve is determined (considering only the first
#' ncomp components), leading to the reference model. The randomization test
#' approach (Van der Voet, 1994) checks whether the squared prediction errors
#' of models with fewer components are significantly larger than in the
#' reference model. This leads for each model considered to a \eqn{p} value;
#' the smallest model not significantly worse than the reference model is
#' returned as the selected one.
#'
#' The approach \code{"onesigma"} simply returns the first model where the
#' optimal CV is within one standard error of the absolute optimum (Hastie,
#' Tibshirani and Friedman, 2009). Note that here we simply use the standard
#' deviation of the cross-validation residuals, in line with the procedure used
#' to calculate the error measure itself. Some other packages implementing
#' similar procedures (such as \code{glmnet}) calculate an error measure for
#' each validation segment separately and use the average as the final
#' estimate. In such cases the standard error across segments is the relevant
#' measure of spread. For LOO, the two procedures are identical. In other forms
#' of validation, small differences will occur.
#'
#' @param object an \code{mvr} object. The fitted model. It should contain a
#' \code{validation} element.
#' @param method character string, indicating the heuristic to use.
#' @param nperm number of permutations in the \code{"randomization"} approach -
#' not used in the \code{"onesigma"} approach.
#' @param alpha cutoff for p values in the \code{"randomization"} approach -
#' not used in the \code{"onesigma"} approach.
#' @param ncomp maximum number of components to consider when determining the
#' global minimum in the cross-validation curve.
#' @param plot whether or not to show a cross-validation plot. The plot for the
#' \code{"randomization"} approach shows models that do not differ
#' significantly from the global RMSEP minimum with open circles; the
#' \code{"onesigma"} approach shows the one-sigma bands around the RMSEP
#' values. In both cases, the selection is indicated with a blue dashed line.
#' @param \dots Further plotting arguments, e.g., to add a title to the plot,
#' or to limit the plotting range.
#' @return A number indicating the suggested number of components in the model.
#' @author Ron Wehrens, Hilko van der Voet and Gerie van der Heijden
#' @seealso \code{\link{mvr}}, \code{\link{summary.mvr}}
#' @references Van der Voet, H. (1994) Comparing the predictive accuracy of
#' models using a simple randomization test. Chemom. Intell. Lab. Syst. 25 (2),
#' 313-323
#'
#' Hastie, T., Friedman, J. and Tibshirani, R. The Elements of Statistical
#' Learning: data mining, inference, and prediction, Springer (2013), 10th
#' printing with corrections, paragraph 7.10.
#' @keywords regression multivariate
#' @examples
#'
#' data(yarn)
#' yarn.pls <- plsr(density ~ NIR, data = yarn, scale = TRUE,
#' ncomp = 20, validation = "LOO")
#' selectNcomp(yarn.pls, "onesigma", plot = TRUE, ylim = c(0, 3))
#' selectNcomp(yarn.pls, "randomization", plot = TRUE)
#' selectNcomp(yarn.pls, "randomization", plot = TRUE,
#' ncomp = 10, ylim = c(0, 3))
#'
#' @export
selectNcomp <- function(object,
method = c("randomization", "onesigma"),
nperm = 999, alpha = 0.01, ncomp = object$ncomp,
plot = FALSE, ...) {
if (!isTRUE(object$validation$method == "CV"))
stop("No cross-validation data available in model")
## check that Y is univariate
if (dim(residuals(object))[2] > 1)
stop("Only univariate response supported")
rmseps <- c(RMSEP(object, "CV")$val) ## includes zero
maxIdx <- ncomp + 1
absBest <- which.min(rmseps[seq_len(maxIdx)])
if (absBest > 0) {
method <- match.arg(method)
if (is.null(origResponse <- object$y))
origResponse <-
c(predict(object, ncomp = 1) + residuals(object)[,1,1])
## include LOO prediction with zero components (i.e., the
## mean). For the mean we should also use the LOO estimate...
allresids <- cbind(origResponse -
(sum(origResponse) - origResponse) /
(length(origResponse) - 1),
object$validation$pred[,1,] - origResponse)
if (method == "randomization") {
pvals <- sapply(seq_len(absBest - 1),
function(ii) randomiz.test(allresids[,ii],
allresids[,absBest],
nperm = nperm))
idx <- which(pvals > alpha)
selection <- min(c(idx, absBest)) - 1
} else {
residsds <- apply(allresids, 2, sd) / sqrt(nrow(allresids))
uls <- rmseps - residsds
selection <- min(which(uls < rmseps[absBest])) - 1
}
if (isTRUE(plot)) {
xvals <- seq_along(rmseps) - 1
plot(xvals, rmseps, ylab = "RMSEP",
xlab = "Number of components", type = "b", ...)
if (method == "onesigma") {
arrows(xvals, uls, xvals, rmseps + residsds,
code = 3, col = "gray", angle = 90, length = .1)
} else {
points(xvals[idx], rmseps[idx], cex = 2, col = 4)
}
abline(h = rmseps[absBest], col = "gray", lty = 3)
abline(v = absBest - 1, col = "gray", lty = 3)
abline(v = selection, col = "blue", lty = 2)
legend("topright", legend = c("Abs. minimum", "Selection"),
lty = 3:2, col = c("gray", "blue"), bg = "white")
}
selection
} else {
warning("Lowest CV error found at 0 components, no testing performed")
0
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.