Nothing
#' Adaptive best subset selection for principal component analysis
#'
#' @inheritParams abess.default
#' @param x A matrix object. It can be either a predictor matrix
#' where each row is an observation and each column is a predictor or
#' a sample covariance/correlation matrix.
#' If \code{x} is a predictor matrix, it can be in sparse matrix format
#' (inherit from class \code{"dgCMatrix"} in package \code{Matrix}).
#' @param type If \code{type = "predictor"}, \code{x} is considered as the predictor matrix.
#' If \code{type = "gram"}, \code{x} is considered as a sample covariance or correlation matrix.
#' @param kpc.num A integer decide the number of principal components to be sequentially considered.
#' @param c.max an integer splicing size. The default of \code{c.max} is the maximum of 2 and \code{max(support.size) / 2}.
#' @param sparse.type If \code{sparse.type = "fpc"}, then best subset selection performs on the first principal component;
#' If \code{sparse.type = "kpc"}, then best subset selection would be sequentially performed on the first \code{kpc.num} number of principal components.
#' If \code{kpc.num} is supplied, the default is \code{sparse.type = "kpc"}; otherwise, is \code{sparse.type = "fpc"}.
#' @param tune.type The type of criterion for choosing the support size.
#' Available options are \code{"gic"}, \code{"ebic"}, \code{"bic"}, \code{"aic"} and \code{"cv"}.
#' Default is \code{"gic"}.
#' \code{tune.type = "cv"} is available only when \code{type = "predictor"}.
#' @param cor A logical value. If \code{cor = TRUE}, perform PCA on the correlation matrix;
#' otherwise, the covariance matrix.
#' This option is available only if \code{type = "predictor"}.
#' Default: \code{cor = FALSE}.
#' @param support.size It is a flexible input. If it is an integer vector.
#' It represents the support sizes to be considered for each principal component.
#' If it is a \code{list} object containing \code{kpc.num} number of integer vectors,
#' the i-th principal component consider the support size specified in the i-th element in the \code{list}.
#' Only used for \code{tune.path = "sequence"}.
#' The default is \code{support.size = NULL}, and some rules in details section are used to specify \code{support.size}.
#' @param splicing.type Optional type for splicing.
#' If \code{splicing.type = 1}, the number of variables to be spliced is
#' \code{c.max}, ..., \code{1}; if \code{splicing.type = 2},
#' the number of variables to be spliced is \code{c.max}, \code{c.max/2}, ..., \code{1}.
#' Default: \code{splicing.type = 1}.
#'
#' @details Adaptive best subset selection for principal component analysis (abessPCA) aim
#' to solve the non-convex optimization problem:
#' \deqn{-\arg\min_{v} v^\top \Sigma v, s.t.\quad v^\top v=1, \|v\|_0 \leq s, }
#' where \eqn{s} is support size.
#' Here, \eqn{\Sigma} is covariance matrix, i.e.,
#' \deqn{\Sigma = \frac{1}{n} X^{\top} X.}
#' A generic splicing technique is implemented to
#' solve this problem.
#' By exploiting the warm-start initialization, the non-convex optimization
#' problem at different support size (specified by \code{support.size})
#' can be efficiently solved.
#'
#' The abessPCA can be conduct sequentially for each component.
#' Please see the multiple principal components Section on the [website](https://abess-team.github.io/abess/articles/v08-sPCA.html)
#' for more details about this function.
#' For \code{abesspca} function, the arguments \code{kpc.num} control the number of components to be consider.
#'
#' When \code{sparse.type = "fpc"} but \code{support.size} is not supplied,
#' it is set as \code{support.size = 1:min(ncol(x), 100)} if \code{group.index = NULL};
#' otherwise, \code{support.size = 1:min(length(unique(group.index)), 100)}.
#' When \code{sparse.type = "kpc"} but \code{support.size} is not supplied,
#' then for 20\% principal components,
#' it is set as \code{min(ncol(x), 100)} if \code{group.index = NULL};
#' otherwise, \code{min(length(unique(group.index)), 100)}.
#'
#' @return A S3 \code{abesspca} class object, which is a \code{list} with the following components:
#' \item{coef}{A \eqn{p}-by-\code{length(support.size)} loading matrix of sparse principal components (PC),
#' where each row is a variable and each column is a support size;}
#' \item{nvars}{The number of variables.}
#' \item{sparse.type}{The same as input.}
#' \item{support.size}{The actual support.size values used. Note that it is not necessary the same as the input if the later have non-integer values or duplicated values.}
#' \item{ev}{A vector with size \code{length(support.size)}. It records the cumulative sums of explained variance at each support size.}
#' \item{tune.value}{A value of tuning criterion of length \code{length(support.size)}.}
#' \item{kpc.num}{The number of principal component being considered.}
#' \item{var.pc}{The variance of principal components obtained by performing standard PCA.}
#' \item{cum.var.pc}{Cumulative sums of \code{var.pc}.}
#' \item{var.all}{If \code{sparse.type = "fpc"},
#' it is the total standard deviations of all principal components.}
#' \item{pev}{A vector with the same length as \code{ev}. It records the percent of explained variance (compared to \code{var.all}) at each support size.}
#' \item{pev.pc}{It records the percent of explained variance (compared to \code{var.pc}) at each support size.}
#' \item{tune.type}{The criterion type for tuning parameters.}
#' \item{tune.path}{The strategy for tuning parameters.}
#' \item{call}{The original call to \code{abess}.}
#' It is worthy to note that, if \code{sparse.type == "kpc"}, the \code{coef}, \code{support.size}, \code{ev}, \code{tune.value}, \code{pev} and \code{pev.pc} in list are \code{list} objects.
#'
#' @author Jin Zhu, Junxian Zhu, Ruihuang Liu, Junhao Huang, Xueqin Wang
#'
#' @note Some parameters not described in the Details Section is explained in the document for \code{\link{abess}}
#' because the meaning of these parameters are very similar.
#'
#' @references A polynomial algorithm for best-subset selection problem. Junxian Zhu, Canhong Wen, Jin Zhu, Heping Zhang, Xueqin Wang. Proceedings of the National Academy of Sciences Dec 2020, 117 (52) 33117-33123; \doi{10.1073/pnas.2014241117}
#' @references Sparse principal component analysis. Hui Zou, Hastie Trevor, and Tibshirani Robert. Journal of computational and graphical statistics 15.2 (2006): 265-286. \doi{10.1198/106186006X113430}
#'
#' @export
#'
#' @md
#'
#' @seealso \code{\link{print.abesspca}},
#' \code{\link{coef.abesspca}},
#' \code{\link{plot.abesspca}}.
#'
#' @examples
#' \donttest{
#' library(abess)
#' Sys.setenv("OMP_THREAD_LIMIT" = 2)
#'
#' ## predictor matrix input:
#' head(USArrests)
#' pca_fit <- abesspca(USArrests)
#' pca_fit
#' plot(pca_fit)
#'
#' ## covariance matrix input:
#' cov_mat <- stats::cov(USArrests) * (nrow(USArrests) - 1) / nrow(USArrests)
#' pca_fit <- abesspca(cov_mat, type = "gram")
#' pca_fit
#'
#' ## robust covariance matrix input:
#' rob_cov <- MASS::cov.rob(USArrests)[["cov"]]
#' rob_cov <- (rob_cov + t(rob_cov)) / 2
#' pca_fit <- abesspca(rob_cov, type = "gram")
#' pca_fit
#'
#' ## K-component principal component analysis
#' pca_fit <- abesspca(USArrests,
#' sparse.type = "kpc",
#' support.size = 1:4
#' )
#' coef(pca_fit)
#' plot(pca_fit)
#' plot(pca_fit, "coef")
#'
#' ## select support size via cross-validation ##
#' n <- 500
#' p <- 50
#' support_size <- 3
#' dataset <- generate.spc.matrix(n, p, support_size, snr = 20)
#' spca_fit <- abesspca(dataset[["x"]], tune.type = "cv", nfolds = 5)
#' plot(spca_fit, type = "tune")
#' }
abesspca <- function(x,
type = c("predictor", "gram"),
sparse.type = c("fpc", "kpc"),
cor = FALSE,
kpc.num = NULL,
support.size = NULL,
gs.range = NULL,
tune.path = c("sequence", "gsection"),
tune.type = c("gic", "aic", "bic", "ebic", "cv"),
nfolds = 5,
foldid = NULL,
ic.scale = 1.0,
c.max = NULL,
always.include = NULL,
group.index = NULL,
screening.num = NULL,
splicing.type = 1,
max.splicing.iter = 20,
warm.start = TRUE,
num.threads = 0,
...) {
sparse.type <- match.arg(sparse.type)
tune.type <- match.arg(tune.type)
tune.path <- match.arg(tune.path)
type <- match.arg(type)
data <- list(x=x)
para <- Initialization_PCA(
type = type,
sparse.type = sparse.type,
cor = cor,
support.size = support.size,
kpc.num = kpc.num,
tune.path = tune.path,
gs.range = gs.range,
tune.type = tune.type,
nfolds = nfolds,
foldid = foldid,
ic.scale = ic.scale,
c.max = c.max,
always.include = always.include,
group.index = group.index,
splicing.type = splicing.type,
max.splicing.iter = max.splicing.iter,
screening.num = screening.num,
warm.start = warm.start,
num.threads = num.threads,
support.num = NULL,
important.search = NULL
)
model <- initializate(para, data)
para <- model$para
data <- model$data
x <- data$x
tune.path <- para$tune.path
gs.range <- para$gs.range
kpc.num <- para$kpc.num
warm.start <- para$warm.start
num_threads <- para$num_threads
splicing_type <- para$splicing_type
max_splicing_iter <- para$max_splicing_iter
nobs <- para$nobs
nvars <- para$nvars
vn <- para$vn
g_index <- para$g_index
s_list <- para$s_list
c_max <- para$c_max
ic_scale <- para$ic_scale
important_search <- para$important_search
always_include <- para$always_include
s_max <- para$s_max
s_min <- para$s_min
s_list_bool <- para$s_list_bool
tune_type <- para$tune_type
ic_type <- para$ic_type
cv_fold_id <- para$cv_fold_id
nfolds <- para$nfolds
sparse.type <- para$sparse.type
sparse_matrix <- para$sparse_matrix
gram_x <- para$gram_x
pc_variance <- para$pc_variance
total_variance <- para$total_variance
screening_num <- para$screening_num
screening <- para$screening
path_type <- para$path_type
## Cpp interface:
result <- abessPCA_API(
x = x,
n = nobs,
p = nvars,
normalize_type = as.integer(0),
weight = c(1.0),
sigma = gram_x,
# algorithm_type = 6,
max_iter = max_splicing_iter,
exchange_num = c_max,
path_type = path_type,
is_warm_start = warm.start,
ic_type = ic_type,
ic_coef = ic_scale,
Kfold = nfolds,
sequence = s_list_bool,
s_min = s_min,
s_max = s_max,
screening_size = ifelse(screening_num >= nvars, -1, screening_num),
g_index = g_index,
always_select = always_include,
early_stop = FALSE,
thread = num_threads,
sparse_matrix = sparse_matrix,
splicing_type = splicing_type,
sub_search = important_search,
cv_fold_id = cv_fold_id,
pca_num = kpc.num,
A_init = as.integer(c())
)
if(tune.path == "gsection"){
if(sparse.type == "fpc"){
# sort and delete duplicate things in result
gs_unique_index <- match(sort(unique(result[["sequence"]])), result[["sequence"]])
s_list <- result[["sequence"]][gs_unique_index]
result[["beta_all"]] <- result[["beta_all"]][gs_unique_index]
result[["train_loss_all"]] <- result[["train_loss_all"]][gs_unique_index]
result[["ic_all"]] <- result[["ic_all"]][gs_unique_index]
result[["test_loss_all"]] <- result[["test_loss_all"]][gs_unique_index]
} else{
s_list <- list()
for(i in seq_len(kpc.num)){
# sort and delete duplicate things in result
gs_unique_index <- match(sort(unique(result[[i]][["sequence"]])), result[[i]][["sequence"]])
s_list[[i]] <- result[[i]][["sequence"]][gs_unique_index]
result[[i]][["beta_all"]] <- result[[i]][["beta_all"]][gs_unique_index]
result[[i]][["train_loss_all"]] <- result[[i]][["train_loss_all"]][gs_unique_index]
result[[i]][["ic_all"]] <- result[[i]][["ic_all"]][gs_unique_index]
result[[i]][["test_loss_all"]] <- result[[i]][["test_loss_all"]][gs_unique_index]
}
}
result[["sequence"]] <- NULL
}
# result[["beta"]] <- NULL
# result[["coef0"]] <- NULL
# result[["train_loss"]] <- NULL
# result[["ic"]] <- NULL
# result[["coef0_all"]] <- NULL
# result[["ic_all"]] <- NULL
# result[["test_loss_all"]] <- NULL
if (sparse.type == "fpc") {
names(result)[which(names(result) == "beta_all")] <- "coef"
# result[["coef"]] <- result[["beta_all"]]
result[["ev"]] <- -result[["train_loss_all"]]
# names(result)[which(names(result) == "train_loss_all")] <- "ev"
# result[["ev"]] <- - result[["ev"]][, 1]
result[["coef"]] <- do.call("cbind", result[["coef"]])
result[["coef"]] <- Matrix::Matrix(result[["coef"]],
sparse = TRUE,
dimnames = list(vn, as.character(s_list))
)
result[["beta"]] <- NULL
result[["coef0"]] <- NULL
result[["train_loss"]] <- NULL
result[["ic"]] <- NULL
result[["lambda"]] <- NULL
result[["coef0_all"]] <- NULL
result[["train_loss_all"]] <- NULL
if (tune_type == "cv") {
names(result)[which(names(result) == "ic_all")] <- "tune.value"
result[["test_loss_all"]] <- NULL
} else {
names(result)[which(names(result) == "test_loss_all")] <- "tune.value"
result[["ic_all"]] <- NULL
}
result[["tune.value"]] <- as.vector(result[["tune.value"]])
result[["effective_number_all"]] <- NULL
} else {
coef_list <- lapply(result, function(x) {
x[["beta_all"]]
})
for (i in 1:kpc.num) {
coef_list[[i]] <- Matrix::Matrix(do.call("cbind", coef_list[[i]]),
sparse = TRUE,
dimnames = list(vn, as.character(s_list[[i]]))
)
}
ev_list <- list(-result[[1]][["train_loss_all"]])
for (i in 2:kpc.num) {
ev_vec <- c()
tmp <- coef_list[[1]]
tmp <- tmp[, ncol(tmp), drop = FALSE]
j <- 2
while (j < i) {
tmp2 <- coef_list[[j]]
tmp <- cbind(tmp, tmp2[, ncol(tmp2), drop = FALSE])
j <- j + 1
}
tmp2 <- coef_list[[j]]
for (k in seq_len(ncol(tmp2))) {
ev_vec <- c(ev_vec, sum(adjusted_variance_explained(gram_x, cbind(tmp, tmp2[, k]))))
}
ev_list[[i]] <- ev_vec
}
if (tune_type == "cv") {
tune_value <- lapply(result, function(x) {
x[["test_loss_all"]]
})
} else {
tune_value <- lapply(result, function(x) {
x[["ic_all"]]
})
}
result <- NULL
result[["coef"]] <- coef_list
result[["ev"]] <- ev_list
result[["tune.value"]] <- tune_value
}
result[["kpc.num"]] <- kpc.num
result[["var.pc"]] <- pc_variance
result[["cum.var.pc"]] <- cumsum(pc_variance)
result[["var.all"]] <- total_variance
if (sparse.type == "fpc") {
result[["pev"]] <- result[["ev"]] / total_variance
result[["pev.pc"]] <- result[["ev"]] / pc_variance[1]
} else {
result[["pev"]] <- lapply(result[["ev"]], function(x) {
x / total_variance
})
result[["pev.pc"]] <- lapply(1:kpc.num, function(i) {
result[["ev"]][[i]] / result[["cum.var.pc"]][i]
})
}
result[["nvars"]] <- nvars
result[["sparse.type"]] <- sparse.type
result[["support.size"]] <- s_list
result[["tune.type"]] <- tune_type
result[["tune.path"]] <- tune.path
if(tune.path == "gsection"){
result[["gs.range"]] <- c(s_min,s_max)
}
result[["call"]] <- match.call()
class(result) <- "abesspca"
result
}
adjusted_variance_explained <- function(covmat, loading) {
loading <- as.matrix(loading)
normloading <- sqrt(apply(loading^2, 2, sum))
pc <- covmat %*% t(t(loading) / normloading)
# since it is covariance matrix, it is no need to square the diagonal values.
ev <- abs(diag(qr.R(qr(pc))))
# ev <- (diag(qr.R(qr(pc))))^2
ev
}
project_cov <- function(cov_mat, direction) {
term2 <- (t(direction) %*% (cov_mat %*% as.matrix(direction)))[1, 1] * as.matrix(direction) %*% t(direction)
term3 <- as.matrix(direction) %*% (t(direction) %*% cov_mat)
term4 <- (cov_mat %*% as.matrix(direction)) %*% t(direction)
cov_mat + term2 - term3 - term4
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.