##------------------------------------------------------------##
## Author Giovanni Merola please acknowledge my work
## email giovanni.merola@xjtlu.edu.cn
##------------------------------------------------------------##
#' @title Computes LS SPCA components using different variable selection algorithms
#'
#'
#' @description For each component, the variables are selected so as to explain
#' a percentage \emph{alpha} of the variance explained by the corresponding principal component.
#'
#' @usage lsspca(X, alpha = 0.95, maxcard = 0, ncomps = 4,
#' spcaMethod = "u", scalex = FALSE,
#' variableSelection = c("exhaustive", "seqrep", "backward", "forward", "lasso"),
#' really.big = FALSE, force.in = NULL, force.out = NULL, selectfromthese = NULL,
#' lsspca_forLasso = TRUE, lasso_penalty = 0.5)
#'
#' @param X The data matrix.
#' @param alpha Real in [0,1]. percentage of variance of the PCs explained by the sparse component.
#' @param ncomps number of components to compute
#' @param spcaMethod character vector how LS SPCA components are computed:
#' "u" for uncorrelated, "c" for correlated and "p" for projection.
#' If only one value, the same method is used for all components.
#' @param variableSelection how the variables for each components are selected
#' 'seqrep' stepwise, 'exhaustive' all subsets 'backward', 'forward', 'lasso'
#' @param really.big logical, set to true if the matrix is large for faster variable selection
#' no exhaustive search, of course
#' @param scalex = FALSE, whether to scale the variables to unit variance. Variables are
#' scaled to zero mean (if needed) even if scaleX = FALSE
#' @param maxcard a vector or an integer. Missing values filled with last value.
#' @param force.in NULL or list of indeces that must be in component. not for lasso. [NULL]
#' @param force.out NULL or list of indeces cannot be in component. [NULL]
#' @param selectfromthese NULL or list of indeces from which model chosen. [NULL]
#' @param lsspca_forLasso use lsspca with indeces selected with lasso or just the lasso regression
#' @param lasso_penalty real between 0 and 1. 0-> ridge regression, 1 -> lasso
#'
#'
#' @details for USPCA, \code{maxcard} cannot be smaller than the order of the components
#' computed, so \code{maxcard = c(1, 1, 1)} will be automatically changed to
#' \code{maxcard = c(1, 2, 3)}. Exhaustive search can be slow for matrices with
#' 30 or more variables. See the documentation for \code{leaps::regsubset}
#' and {glmnet::glmnet} for the options.
#' @return a list
## #' \loadmathjax
#' \describe{
## #' \item{loadings}{Matrix with the loadings scaled to unit \mjeqn{L_2}{ASCII representation} norm.}
## #' \item{contributions}{Matrix of loadings scaled to unit \eqn{L_1}{ASCII representation} norm.}
#' \item{loadings}{Matrix with the loadings scaled to unit \eqn{L_2} norm.}
#' \item{contributions}{Matrix of loadings scaled to unit \eqn{L_1} norm.}
#' \item{ncomps}{integer number of components computed. Default is 4.}
#' \item{cardinality}{Vector with the cardinalities of each loadings.}
#' \item{ind}{List with the indices of the non-zero loadings for each component.}
#' \item{loadingslist}{A list with only the nonzero ladings for each component.}
#' \item{vexp}{Vector with the \% variance explained by each component.}
#' \item{vexpPC}{Vector with the \% variance explained by each principal component.}
#' \item{cvexp}{Vector with the \% cumulative variance explained by each component.}
#' \item{rcvexp}{Vector with the \% proportion of cumulative variance explained by each component to that explained by the PCs.}
#' \item{scores}{the SPCs scores.}
#' \item{PCloadings}{Matrix with the PCs loadings scaled to unit \eqn{L_2} norm. }
#' \item{PCscores}{the PCs scores.}
#' \item{spcaMethod}{method used to compute the sparse loadings}
#' \item{corComp}{Matrix of correlations among the sparse components.
#' Only if spcaMethod != "u" and ncomps > 1.}
#' \item{Call}{The called with its arguments.}
#' }
#' @references Giovanni M. Merola. 2014. \emph{Least Squares Sparse Principal
#' Component Analysis: a Backward Elimination approach to attain large
#' loadings.} Austr.&NZ Jou. Stats. 57, pp 391-429\cr\cr
#' Giovanni M. Merola and Gemai Chen. 2019. \emph{Sparse Principal Component Analysis: an
#' efficient Least Squares approach.} Jou. Multiv. Analysis 173, pp 366--382
#' \url{http://arxiv.org/abs/1406.1381}
#'
#' @examples
#' \dontrun{
#' library(LSSPCA)
#' data(hitters)
#'
#' dim(hitters)
#' ## USPCA 95
#' hit_uspca95 = lsspca(X = hitters, alpha = 0.95, ncomps = 4,
#' spcaMethod = "u", subsectSelection = "e")
#' #> Warning message:
#' #> In log(vr) : NaNs produced
#' ## the warnings come from the variable selection, don't worry
#'
#' ## print contributions (only.nonzero)
#' print_spca(hit_uspca95)
#'
#' ## summaries
#' summary_spca(hit_uspca95, contributions = TRUE, digits = 1)
#'
#' ## print loadings individually
#' lapply(hit_uspca95$loadingslist, function(x) round(x, 2))
#' ## print contributions individually
#' lapply(hit_uspca95$loadingslist, function(x) round(x/sum(abs(x)), 2))
#'
#' ## plot PC and USPC loadings
#' par(mfrow = c(1, 2))
#' barplot(-hit_uspca95$PCloadings[, 1], main = "PCA")
#' barplot(-hit_uspca95$loadings[, 1], main = "USPCA")
#' par(mfrow = c(1,1))
#'
#' ## Holzinger data
#' data(holzinger)
#' dim(holzinger)
#'
#' ## CSPCA
#' hol_cspca95 = lsspca(X = holzinger, alpha = 0.95, ncomps = 4,
#' spcaMethod = "c", subsectSelection = "e")
#'
#' ## summaries
#' t(data.frame(card = hol_cspca95$cardinality,
#' cvexp = round(hol_cspca95$cvexp, 2),
#' rcvexp = round(hol_cspca95$rcvexp, 2)))
#'
#' ## print loadings
#' lapply(hol_cspca95$loadingslist, function(x) round(x, 2))
#' ## print contributions
#' lapply(hol_cspca95$loadingslist, function(x) round(x/sum(abs(x)), 2))
#'
#' ## correlation between SPCs
#' round(hol_cspca95$corComp, 2)
#'
#' ## plot contributions
#' barplot(-hol_cspca95$contributions[, 1])
#'
#' ## SPCs scores against PC scores
#' plot(hol_cspca95$scores[, 1], hol_cspca95$PCscores[, 1], pch = 16)
#' regline = lm(hol_cspca95$PCscores[, 1] ~ hol_cspca95$scores[, 1]- 1)$coef
#' abline(a = 0, b = regline, col = 2)
#'
#'
#' ## SPCA on each ability separately
#' h_groups = lapply(seq(1, 10, 3), function(x) x:(x + 2))
#'
#' ## projection SPCA
#' hol_block_spca95 = lsspca(X = holzinger, alpha = 0.95, ncomps = 4,
#' spcaMethod = "p", subsectSelection = "e",
#' selectfromthese = h_groups)
#'
#' ## summaries
#' t(data.frame(card = hol_block_spca95$cardinality,
#' cvexp = round(hol_block_spca95$cvexp, 2),
#' rcvexp = round(hol_block_spca95$rcvexp, 2)))
#'
#' ## print loadings
#' lapply(hol_block_spca95$loadingslist, function(x) round(x, 2))
#'
#' ## print contributions
#' lapply(hol_block_spca95$loadingslist, function(x) round(x/sum(abs(x)), 2))
#'
#' ## correlation between SPCs
#' round(hol_block_spca95$corComp, 2)
#'
#' ## plot the contributions for each SPC
#' par(mfrow = c(2, 2))
#' for(k in 1:4){
#' barplot(-hol_block_spca95$contributions[, k])
#' }
#' par(mfrow = c(1, 1))
#' }
#'
#' @author Giovanni Merola
#'
#' @export
lsspca <- function(X, alpha = 0.95, maxcard = 0, ncomps = 4,
spcaMethod = "u", scalex = FALSE,
variableSelection = c("exhaustive", "seqrep", "backward", "forward", "lasso"),
really.big = FALSE, force.in = NULL, force.out = NULL,
selectfromthese = NULL, lsspca_forLasso = TRUE, lasso_penalty = 0.5) {
##----------------------------------------##
## Creating the environment and error checking
##----------------------------------------##
if (is.data.frame(X))
X = as.matrix(X)
p <- ncol(X)
n <- nrow(X)
if (is.null(colnames(X)))
colnames(X) <- paste0("V", 1:p)
## convert selectfromthese to force out
if (is.list(selectfromthese)) {
fx <- function(x, m) (1:m)[-x]
force.out <- lapply(selectfromthese, fx, m = p)
}
if (is.list(force.in) | is.list(force.out)) {
ncomps <- max(length(force.in), length(force.out))
if (length(force.in) < ncomps)
force.in <- c(force.in, vector("list", ncomps - length(force.in)))
if (length(force.out) < ncomps)
force.out <- c(force.out, vector("list", ncomps - length(force.out)))
}
else if (ncomps == 0) {
ncomps <- p
}
me = unique(spcaMethod)
lme = length(me)
if (lme == 1){
if (!any(spcaMethod == c("u", "c", "p")))
stop("only one of these options allowed for spcaMethod: 'u', 'c', 'p'")
}
else
for(m in 1:lme){
if (!any(me[m] == c("u", "c", "p")))
stop("only one of these options allowed for spcaMethod: 'u', 'c', 'p'")
}
lm = length(spcaMethod)
if (lm < ncomps){
spcaMethod = c(spcaMethod, rep(spcaMethod[lm], ncomps - lm))
}
if (length(maxcard) > 1)
ncomps <- length(maxcard)
if (is.vector(maxcard)) {
maxcard[maxcard == 0] <- p
if (length(maxcard < ncomps)) {
lm <- length(maxcard)
maxcard <- c(maxcard, rep(maxcard[lm], ncomps - lm))
}
} else {
stop("must pass a vector or an integer as maxcard")
}
variableSelection <- switch(stringr::str_sub(variableSelection[1], 1, 1), e = "exhaustive", b = "backward", f = "forward",
s = "seqrep", l = "lasso")
if (is.null(variableSelection))
stop("please pass a valid variable selection option")
if((ncol(X)> 30) & (stringr::str_sub(variableSelection[1], 1, 1) == "e"))
if(really.big == FALSE){
warning("exhaustive search with so many variables requires really.big = TRUE")
really.big == TRUE
}
if (!requireNamespace("geigen", quietly = TRUE)) {
stop("Package \"geigen\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("stringr", quietly = TRUE)) {
stop("Package \"stringr\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (variableSelection == "lasso")
if (!requireNamespace("glmnet", quietly = TRUE)) {
stop("Package \"glmnet\" needed for lasso selection to work. Please install it.",
call. = FALSE)
}
else
if (!requireNamespace("leaps", quietly = TRUE)) {
stop("Package \"leaps\" needed for variable selection to work. Please install it.",
call. = FALSE)
}
##-------------------------------------------------------------------##
## create objects for output
##-------------------------------------------------------------------##
card <- rep(0, ncomps)
ind <- as.list(card)
vexp <- rep(0, ncomps)
cvexp <- rep(0, ncomps)
A <- matrix(0, p, ifelse(ncomps == 0, p, ncomps))
contributions <- A
loadingslist <- as.list(1:p)
scores <- matrix(0, n, ncomps)
if (is.null(colnames(X)))
namx <- paste("Var", 1:p) else namx <- colnames(X)
##-------------------------------------------------------------------##
## center and scale the variables
##-------------------------------------------------------------------##
if (scalex == TRUE)
X <- scale(X)
else
if (any(abs(colMeans(X)) > 10^-6)) {
message("Centering the columns of X to zero mean")
X <- scale(X, scale = FALSE)
}
##-------------------------------------------------------------------##
## compute components
##-------------------------------------------------------------------##
K <- X
indall <- 1:p
nc <- 0
j <- 1
stopComp <- FALSE
while (stopComp == FALSE) {
D <- crossprod(K)
r_ee <- eigen(D)
pc <- K %*% r_ee$vec[, 1]
lambda <- r_ee$val[1]
if (j == 1) {
vexpPC <- r_ee$val
totvar <- sum(r_ee$val)
PCloadings = r_ee$vec[, 1:ncomps, drop = FALSE]
## first element of PCloadings always > 0
notzero = sign(PCloadings[1, ])
if (any(notzero < 0))
PCloadings = t(t(PCloadings) * notzero)
PCscores = X %*% PCloadings
}
## variable selection ==========================
if (stringr::str_sub(variableSelection[1], 1, 1) == "l") {
if ((length(force.out) > 0) | (length(force.in) > 0))
stop("lasso not implemented with force.out or force.in")
else g_fit <- glmnet::glmnet(X, pc, intercept = FALSE, alpha = lasso_penalty)
mod_ind <- ifelse(any(g_fit$dev.ratio > alpha), min(which(g_fit$dev.ratio > alpha)), length(g_fit$dev.ratio))
coe <- as.numeric(stats::coef(g_fit, s = g_fit$lambda[mod_ind]))[-1]
ind[[j]] <- which(coe != 0)
card[j] <- length(ind[[j]])
a <- coe[ind[[j]]]
}
else {
ssr <- leaps::regsubsets(x = X, y = pc, method = variableSelection[1],
nbest = 1, force.in = force.in[[j]],
force.out = force.out[[j]], nvmax = maxcard[j],
intercept = FALSE, really.big = really.big)
aa <- summary(ssr)
aa$which <- aa$which[, order(match(colnames(aa$which), colnames(X)))]
mrsq <- any(aa$rsq >= alpha)
if (mrsq == TRUE)
if (spcaMethod[j] == "u")
iord <- which((aa$rsq >= alpha) & ((1:length(aa$rsq)) >= j)) ## rsquared
else
iord <- which(aa$rsq >= alpha) ## rsquared
else {
iord <- length(aa$rsq)
if (maxcard[j] > min(iord))
message(paste("warning: lsspca component ", j, "could not reach Rsquared", alpha))
}
if (is.vector(aa$which))
ind[[j]] <- indall[aa$which]
else
ind[[j]] <- indall[aa$which[iord[1], ]]
card[j] <- length(ind[[j]])
}
## compute LSSPCs =====================
if ((variableSelection != "l") | ((variableSelection == "l") & (lsspca_forLasso == TRUE))) {
if (length(ind[[j]]) > 1) {
Xd <- X[, ind[[j]]]
Sd <- crossprod(Xd)
if (spcaMethod[j] == "u")
{
if (j == 1) {
M <- tcrossprod(crossprod(Xd, X))
ga <- geigen::geigen(M, Sd, symmetric = TRUE)
a <- ga$vec[, card[j]]
}
else {
xxd <- crossprod(X, Xd)
H <- crossprod(A[, 1:(j - 1), drop = FALSE], xxd)
Sm <- solve(Sd)
E <- H %*% Sm
G <- diag(card[j]) - crossprod(H, solve(tcrossprod(E, H))) %*% E
M <- crossprod(tcrossprod(xxd, G))
ga <- geigen::geigen(M, Sd)
a <- ga$vec[, card[j]]
}
}
else {
if (spcaMethod[j] == "c") {
M <- crossprod(crossprod(K, Xd))
ga <- geigen::geigen(M, Sd, symmetric = TRUE)
a <- ga$vec[, card[j]]
}
if (spcaMethod[j] == "p") {
alm <- lm(pc ~ Xd - 1)
a <- alm$coefficients
}
}
## save results for j-th SPC ==========================
scores[, j] <- Xd %*% a
bb <- stats::cor(PCscores[, j], scores[, j])
if (bb < 0) {
a <- -a
scores[, j] <- -scores[, j]
}
a <- a/sqrt(sum(a^2))
names(a) <- namx[ind[[j]]]
A[ind[[j]], j] <- a
contributions[ind[[j]], j] <- a/sum(abs(a))
loadingslist[[j]] <- a
}
else {
a <- 1
scores[, j] <- X[, ind[[j]]]
bb <- cor(pc, scores[, j])
if (bb < 0) {
a <- -a
scores[, j] <- -scores[, j]
}
names(a) <- namx[ind[[j]]]
A[ind[[j]], j] <- a
contributions[ind[[j]], j] <- a
loadingslist[[j]] <- a
}
}
## deflate X
if (j <= ncomps)
K <- K - tcrossprod(scores[, j]) %*% K/sum(scores[, j]^2) #spcaTutoPack:::deflXCforR(c(scores[, j]), K)
cvexp[j] <- totvar - sum(K^2)
vexp[j] <- ifelse(j > 1, cvexp[j] - cvexp[j - 1], cvexp[j])
nc <- nc + 1
if (j < ncomps)
j <- j + 1 else stopComp <- TRUE
}
##-------------------------------------------------------------------##
## create output
##-------------------------------------------------------------------##
ncomps <- nc
rownames(contributions) <- namx
rownames(A) <- namx
dimnames(PCloadings) = dimnames(A)
dimnames(PCscores) = dimnames(scores)
out <- list(loadings = A[, 1:ncomps], contributions = contributions, vexp = vexp[1:ncomps]/totvar, vexpPC = vexpPC[1:ncomps]/totvar,
cvexp = cvexp[1:ncomps]/totvar, rcvexp = cvexp[1:ncomps]/cumsum(vexpPC[1:ncomps]),
scores = scores[, 1:ncomps], ncomps = ncomps, indices = ind, cardinality = card[1:ncomps], loadingslist = loadingslist[1:ncomps],
PCloadings = PCloadings, PCscores = PCscores, spcaMethod = spcaMethod)
if (ncomps > 1) {
if (any(spcaMethod != "u")) {
out$corComp <- cor(scores)
}
}
out$Call <- match.call()
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.