#' Supervised (and unsupervised) principal components
#'
#' Computes dimension reduction based on the supervised principal components
#' algorithm. In essense, algorithm performs a screening step based on univariate
#' scores for the features, and then computes standard PCA on using only the retained
#' features.
#'
#' @param x The original feature matrix, columns denoting the features and rows the instances.
#' @param y A vector with the observed target values we try to predict using \code{x}.
#' Can be factor for classification problems. If missing, then this function computes
#' standard unsupervised principal components.
#' @param nctot Total number of latent features to extract.
#' @param ncsup Maximum number of latent features to extract that use supervision.
#' If \code{nctot > ncsup}, then the remaining \code{nctot-ncsup} features are computed in
#' unsupervised manner (or ignored if \code{sup.only=TRUE}).
#' @param window Maximum number of features that will survive the screening and from which
#' the supervised components are computed. Affects also how the \code{screenthresh}-argument
#' is interpreted.
#' @param exclude Columns (variables) in \code{x} to ignore when extrating the new features.
#' @param verbose Whether to print some messages along the way.
#' @param normalize Whether to scale the extracted features so that they all have standard deviation
#' of one.
#' @param preprocess Whether to center and scale the features before extracting the new features.
#' @param alpha Significance level for the p-values of the univariate scores used to determine
#' which features survive the screening and are used to compute the supervised components.
#' @param perms Number of permutations to estimate the p-values for univariate scores.
#' @param screenthresh Value between 0 and 1 (or \code{NULL}). If not \code{NULL}, then no permutation tests are
#' run, and the supervised components
#' are computed among those features that have their univariate score equal or larger than this.
#' Value 1 means that only the feature with the highest score survives the screening, whereas value 0 means that
#' the top \code{min(window, ncol(x))} survive the screening. Overwrites also \code{nfeat}-argument.
#' @param nfeat Number of features to retain in the screening step. If this option is used, then
#' the algorithm does not perform the permutation tests for the p-values, but instead computes the
#' supervised components from those features that have their univariate score among the \code{nfeat}
#' highest scores (in this case \code{perms} and \code{alpha} are ignored).
#' @param sup.only If \code{TRUE}, then no unsupervised components are ever computed even if the number of
#' supervised components that could be extracted was less than \code{nctot}.
#' @param ... Currently ignored.
#'
#'
#' @return spca-object that is similar to the object returned by \code{\link[stats]{prcomp}}.
#' The object will have the following elements:
#' \describe{
#' \item{\code{w}}{The projection (or rotation) matrix W, that transforms the original data
#' \eqn{X} into the new features \eqn{Z = X W} .}
#' \item{\code{z}}{The extracted latent features corresponding to the training inputs \eqn{X}.}
#' \item{\code{v}}{Matrix \eqn{V} that is used to compute \eqn{W} when combining supervised and
#' unsupervised components (see the Piironen and Vehtari (2018) for more information).}
#' \item{\code{sdev}}{Standard deviations of the new features.}
#' \item{\code{centers}}{Mean values for the original variables.}
#' \item{\code{scales}}{Scales of the original variables.}
#' \item{\code{exclude}}{Excluded variables.}
#' }
#'
#' @section Details:
#'
#' In the original paper, the authors proposed estimating the screening threshold
#' using cross-validation for the model obtained when the extracted features are used
#' for regression or classification. This implementation performs the screening
#' based on the estimated p-values for the univariate scores (these are estimated using
#' a permutation test) and the screening step retains only those features with p-value
#' less than the specified level \code{alpha}.
#'
#' @section References:
#'
#' Bair, E., Hastie, T., Paul, D., and Tibshirani, R. (2006).
#' Prediction by supervised principal components. \emph{Journal
#' of the American Statistical Association}, 101(473):119-137.
#'
#' Piironen, J. and Vehtari, A. (2018). Iterative supervised principal components.
#' In \emph{Proceedings of the 21st International Conference on Artificial
#' Intelligence and Statistics (AISTATS) PMLR 84: 106-114}.
#'
#' @examples
#' \donttest{
#' ###
#'
#' # load data
#' data("ovarian", package = "dimreduce")
#' x <- ovarian$x
#' y <- ovarian$y
#'
#' # dimension reduction
#' dr <- spca(x, y, nctot = 2)
#' z <- predict(dr, x) # the latent features
#' }
#'
#' @export
spca <- function(x, y = NULL, nctot = NULL, ncsup = NULL, window = 500,
exclude = NULL, verbose = TRUE, normalize = FALSE,
preprocess = TRUE, alpha = NULL, perms = 1000,
screenthresh = NULL, nfeat = NULL, sup.only = FALSE, ...) {
n <- NROW(x)
d <- NCOL(x)
if (!is.null(screenthresh)) {
# a relative screening threshold given, so find out what is the number of features this
# threshold corresponds to
scores <- featscore(x, y, exclude = exclude)
scores_sorted <- sort(scores, decreasing = T)
smax <- scores_sorted[1]
smin <- scores_sorted[min(length(scores), window)]
nfeat <- sum(scores >= smin + screenthresh * (smax - smin))
}
if (is.null(alpha)) {
alpha <- 0.001
}
if (is.null(nctot)) {
if (!is.null(nfeat)) {
nctot <- min(nfeat, min(n - 1, d))
} else {
nctot <- min(n - 1, d)
}
}
else if (nctot > min(n - 1, d)) {
stop("nctot cannot exceed min(n-1,d)")
}
if (is.null(ncsup)) {
ncsup <- nctot
} # ceiling(nctot/2)
if (is.null(y)) {
ncsup <- 0
}
if (preprocess) {
# center and scale x for computation
centers <- colMeans(x)
scales <- apply(x, 2, "sd")
x <- scale(x) # this may produce NaNs in x if some columns have zero variance
} else {
centers <- rep(0, d)
scales <- rep(1, d)
}
# check if there are NA/NaNs in some columns, exclude those
exclude <- c(exclude, which(is.na(colSums(x))))
ok <- setdiff(1:d, exclude)
v_all <- matrix(0, nrow = d, ncol = nctot)
b_all <- matrix(0, nrow = d, ncol = nctot)
latent <- matrix(nrow = n, ncol = nctot)
rotation <- matrix(nrow = d, ncol = nctot)
pval <- NULL
# Procedure:
# 1) compute the marginal scores
# 2) threshold alpha for the permutation tests => compute ncsup SPCs
# 3) subtract the variation explained by the SPCs from x
# 4) compute nctot - ncsup unsupervised PCs
# 5) compute the rotation matrices
if (ncsup > 0) {
if (verbose) {
print("Performing permutation tests for the marginal scores..")
}
if (!is.null(nfeat)) {
# screen out all except those among nfeat most relevant (based on the univariate scores)
scores <- featscore(x, y, exclude = exclude)
indsup <- order(scores, decreasing = T)[1:nfeat]
indsup <- setdiff(indsup, exclude)
} else {
# permutation test for the marginal scores, find out subset for the supervised PCs
pval <- featscore.test(x, y, exclude = exclude, perms = perms)
indsup <- which(pval < alpha)
if (length(indsup) > window) {
# more than window significant features, so choose only the window most relevant ones
scores <- featscore(x, y, exclude = exclude)
indsup <- order(scores, decreasing = T)[1:window]
}
}
if (length(indsup) > 0) {
if (verbose) {
print("Computing the supervised principal components..")
}
# supervised PCs
spcs <- stats::prcomp(x[, indsup], center = F, scale. = F)
if (NCOL(spcs$rotation) < ncsup) {
ncsup <- NCOL(spcs$rotation)
}
v_all[indsup, 1:ncsup] <- spcs$rotation[, 1:ncsup]
latent[, 1:ncsup] <- spcs$x[, 1:ncsup]
# subtract the variation explained by the SPCs from x
for (k in 1:ncsup) {
z <- latent[, k]
b <- rep(0, d)
b[ok] <- colSums(x[, ok, drop = F] * z) / sum(z^2)
x <- x - t(t(matrix(z, nrow = n, ncol = d)) * b)
b_all[, k] <- b
}
} else {
# none of the variables passed the permutation test
ncsup <- 0
}
}
rotation <- v_all
if (!sup.only) {
if (ncsup < nctot) {
if (verbose) {
print("Computing the unsupervised principal components..")
}
# compute standard pc with the rest of the data variation
if (length(ok) < nctot - ncsup) {
nctot <- length(ok) - ncsup
} # how many unsupervised PCs we can compute
pcs <- stats::prcomp(x[, ok], center = F, scale. = F)
v_all[ok, (ncsup + 1):nctot] <- pcs$rotation[, 1:(nctot - ncsup)]
latent[, (ncsup + 1):nctot] <- pcs$x[, 1:(nctot - ncsup)]
# compute the rotation matrix from vs
# v <- v_all # store the original vs
rotation <- v_all
if (ncsup > 0 && nctot > 1) {
for (j in seq(nctot, ncsup + 1)) {
for (h in seq(ncsup, 1)) {
rotation[, j] <- rotation[, j] - sum(b_all[, h] * rotation[, j]) * v_all[, h]
}
}
}
}
} else {
# remove the NA-columns from the matrices that will be returned
if (nctot > ncsup) {
v_all <- v_all[, 1:ncsup, drop = F]
b_all <- b_all[, 1:ncsup, drop = F]
latent <- latent[, 1:ncsup, drop = F]
rotation <- rotation[, 1:ncsup, drop = F]
}
}
if (normalize) {
sz <- apply(latent, 2, "sd")
ok <- sz > 1e-6
latent <- t(t(latent[, ok]) / sz[ok])
rotation <- t(t(rotation[, ok]) / sz[ok])
}
if (verbose) {
print("Done.")
}
res <- list(
w = rotation, z = latent, v = v_all, sdev = apply(latent, 2, "sd"),
pval = pval, centers = centers, scales = scales, exclude = exclude
)
class(res) <- c("spca", "dimred")
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.