#' Partial Least Squares 'a la carte'
#'
#' Runs a Partial Least Squares (PLS) model in regression mode using the
#' algorithm implemented in \code{\link[mixOmics]{pls}}. This function allows
#' for the construction of components based on different sets of predictor
#' and/or outcome variables. This function does not use stability selection.
#'
#' @inheritParams SelectionAlgo
#' @param selectedX binary matrix with \code{ncol(xdata)} rows and \code{ncomp}
#' columns. The binary entries indicate which predictors (in rows) contribute
#' to the definition of each component (in columns). If \code{selectedX=NULL},
#' all predictors are selected for all components.
#' @param selectedY binary matrix with \code{ncol(ydata)} rows and \code{ncomp}
#' columns. The binary entries indicate which outcomes (in rows) contribute to
#' the definition of each component (in columns). If \code{selectedY=NULL}, all
#' outcomes are selected for all components.
#' @param family type of PLS model. Only \code{family="gaussian"} is supported.
#' This corresponds to a PLS model as defined in \code{\link[mixOmics]{pls}}
#' (for continuous outcomes).
#' @param ncomp number of components.
#' @param scale logical indicating if the data should be scaled (i.e.
#' transformed so that all variables have a standard deviation of one).
#'
#' @details All matrices are defined as in (Wold et al. 2001). The weight matrix
#' \code{Wmat} is the equivalent of \code{loadings$X} in
#' \code{\link[mixOmics]{pls}}. The loadings matrix \code{Pmat} is the
#' equivalent of \code{mat.c} in \code{\link[mixOmics]{pls}}. The score
#' matrices \code{Tmat} and \code{Qmat} are the equivalent of
#' \code{variates$X} and \code{variates$Y} in \code{\link[mixOmics]{pls}}.
#'
#' @return A list with: \item{Wmat}{matrix of X-weights.} \item{Wstar}{matrix of
#' transformed X-weights.} \item{Pmat}{matrix of X-loadings.}
#' \item{Cmat}{matrix of Y-weights.} \item{Tmat}{matrix of X-scores.}
#' \item{Umat}{matrix of Y-scores.} \item{Qmat}{cross-product of the (scaled)
#' predictors with the X-scores, needed for predictions.}
#' \item{Rmat}{cross-product of the (scaled) outcomes with the X-scores, needed
#' for predictions.}
#' \item{meansX}{vector used for centering of predictors, needed for
#' predictions.} \item{sigmaX}{vector used for scaling of predictors, needed
#' for predictions.} \item{meansY}{vector used for centering of outcomes,
#' needed for predictions.} \item{sigmaY}{vector used for scaling of outcomes,
#' needed for predictions.} \item{methods}{a list with \code{family} and
#' \code{scale} values used for the run.} \item{params}{a list with
#' \code{selectedX} and \code{selectedY} values used for the run.}
#'
#' @seealso \code{\link{VariableSelection}}, \code{\link{BiSelection}}
#'
#' @references \insertRef{PLS}{sharp}
#'
#' @examples
#' \donttest{
#' if (requireNamespace("mixOmics", quietly = TRUE)) {
#' oldpar <- par(no.readonly = TRUE)
#'
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 200, pk = 15, q = 3, family = "gaussian")
#' x <- simul$xdata
#' y <- simul$ydata
#'
#' # PLS
#' mypls <- PLS(xdata = x, ydata = y, ncomp = 3)
#'
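#' # Hedged sanity check (see Details): with all predictors and outcomes
#' # contributing to all components, the X-weights are expected to coincide,
#' # possibly up to sign, with the loadings from mixOmics::pls
#' ref <- mixOmics::pls(X = x, Y = y, ncomp = 3, mode = "regression")
#' max(abs(abs(mypls$Wmat) - abs(ref$loadings$X)))
#'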
#' if (requireNamespace("sgPLS", quietly = TRUE)) {
#' # Sparse PLS to identify relevant variables
#' stab <- BiSelection(
#' xdata = x, ydata = y,
#' family = "gaussian", ncomp = 3,
#' LambdaX = seq_len(ncol(x) - 1),
#' LambdaY = seq_len(ncol(y) - 1),
#' implementation = SparsePLS,
#' n_cat = 2
#' )
#' plot(stab)
#'
#' # Refitting of PLS model
#' mypls <- PLS(
#' xdata = x, ydata = y,
#' selectedX = stab$selectedX,
#' selectedY = stab$selectedY
#' )
#'
#' # Nonzero entries in weights are the same as in selectedX
#' par(mfrow = c(2, 2))
#' Heatmap(stab$selectedX,
#' legend = FALSE
#' )
#' title("Selected in X")
#' Heatmap(ifelse(mypls$Wmat != 0, yes = 1, no = 0),
#' legend = FALSE
#' )
#' title("Nonzero entries in Wmat")
#' Heatmap(stab$selectedY,
#' legend = FALSE
#' )
#' title("Selected in Y")
#' Heatmap(ifelse(mypls$Cmat != 0, yes = 1, no = 0),
#' legend = FALSE
#' )
#' title("Nonzero entries in Cmat")
#' }
#'
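#' # A minimal sketch of the 'a la carte' construction with hand-specified
#' # selection matrices (assuming the 15 simulated predictors and 3 outcomes)
#' selX <- matrix(0, nrow = ncol(x), ncol = 2)
#' selX[1:5, 1] <- 1 # component 1 is built from the first five predictors
#' selX[6:15, 2] <- 1 # component 2 is built from the remaining ten
#' selY <- matrix(1, nrow = ncol(y), ncol = 2) # all outcomes used in both components
#' mypls_alacarte <- PLS(xdata = x, ydata = y, selectedX = selX, selectedY = selY)
#'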
#' # Multilevel PLS
#' # Generating random design
#' z <- rep(seq_len(50), each = 4)
#'
#' # Extracting the within-variability
#' x_within <- mixOmics::withinVariation(X = x, design = cbind(z))
#'
#' # Running PLS on within-variability
#' mypls <- PLS(xdata = x_within, ydata = y, ncomp = 3)
#'
#' par(oldpar)
#' }
#' }
#' @export
PLS <- function(xdata, ydata,
selectedX = NULL, selectedY = NULL,
family = "gaussian", ncomp = NULL, scale = TRUE) {
# Checking mixOmics package is installed
CheckPackageInstalled("mixOmics")
# Checking arguments
if (!family %in% c("gaussian")) {
stop("Invalid input for argument 'family'. Only 'gaussian' family is supported.")
}
# Re-formatting ydata
if (is.vector(ydata)) {
ydata <- cbind(ydata)
}
if (is.null(colnames(ydata))) {
colnames(ydata) <- paste0("outcome", seq_len(ncol(ydata)))
}
# Checking consistency of arguments
if (is.null(selectedX) & is.null(selectedY)) {
if (is.null(ncomp)) {
ncomp <- 1
}
} else {
if (!is.null(selectedX)) {
if (is.null(ncomp)) {
ncomp <- ncol(selectedX)
} else {
if (!is.null(selectedY)) {
if (ncol(selectedX) != ncol(selectedY)) {
stop("Arguments 'selectedX' and 'selectedY' are not consistent. These matrices must have the same number of columns.")
}
}
if (ncomp != ncol(selectedX)) {
message(paste0(
"Arguments 'ncomp' and 'selectedX' are not consistent. The algorithm is run with ncomp=",
ncol(selectedX), "."
))
}
}
} else {
ncomp <- ncol(selectedY)
}
}
# Defining the selection status for predictors
if (is.null(selectedX)) {
selectedX <- matrix(1, nrow = ncol(xdata), ncol = ncomp)
}
rownames(selectedX) <- colnames(xdata)
colnames(selectedX) <- paste0("comp", seq_len(ncomp))
# Defining the selection status for outcomes
if (is.null(selectedY)) {
selectedY <- matrix(1, nrow = ncol(ydata), ncol = ncomp)
}
rownames(selectedY) <- colnames(ydata)
colnames(selectedY) <- paste0("comp", seq_len(ncomp))
# Initialisation
Emat <- scale(xdata, center = TRUE, scale = scale)
Fmat <- scale(ydata, center = TRUE, scale = scale)
meansX <- attr(Emat, "scaled:center")
meansY <- attr(Fmat, "scaled:center")
if (scale) {
sigmaX <- attr(Emat, "scaled:scale")
sigmaY <- attr(Fmat, "scaled:scale")
} else {
sigmaX <- rep(1, ncol(xdata))
sigmaY <- rep(1, ncol(ydata))
}
# Initialisation of empty objects
Wmat <- Pmat <- matrix(0, nrow = ncol(xdata), ncol = ncomp)
rownames(Wmat) <- rownames(Pmat) <- colnames(xdata)
Cmat <- matrix(0, nrow = ncol(ydata), ncol = ncomp)
rownames(Cmat) <- colnames(ydata)
Tmat <- matrix(NA, nrow = nrow(xdata), ncol = ncomp)
Umat <- matrix(NA, nrow = nrow(ydata), ncol = ncomp)
rownames(Tmat) <- rownames(Umat) <- rownames(xdata)
colnames(Wmat) <- colnames(Pmat) <- colnames(Cmat) <- colnames(Tmat) <- colnames(Umat) <- paste0("comp", seq_len(ncomp))
# Loop over the components
for (comp in seq_len(ncomp)) {
# Extracting the stably selected predictors
idsX <- rownames(selectedX)[which(selectedX[, comp] == 1)]
if (length(idsX) > 0) {
Emat_selected <- Emat[, idsX, drop = FALSE]
} else {
warning(paste0("No selected predictors for component ", comp, ". All predictors were included."))
Emat_selected <- Emat
}
# Extracting the stably selected outcomes
idsY <- rownames(selectedY)[which(selectedY[, comp] == 1)]
if (length(idsY) > 0) {
Fmat_selected <- Fmat[, idsY, drop = FALSE]
} else {
warning(paste0("No selected outcomes for component ", comp, ". All outcomes were included."))
Fmat_selected <- Fmat
}
# Fitting PLS model (scaling done in initialisation and not required for further components)
pls_h <- mixOmics::pls(
X = Emat_selected,
Y = Fmat_selected,
mode = "regression",
all.outputs = TRUE,
multilevel = NULL,
ncomp = 1,
scale = FALSE
)
# Extracting the scores
t_h <- pls_h$variates$X
Tmat[, comp] <- t_h
Umat[, comp] <- pls_h$variates$Y
# Extracting the X-weights
w_h <- pls_h$loadings$X
Wmat[idsX, comp] <- w_h
# Extracting the Y-weights
Cmat[idsY, comp] <- pls_h$loadings$Y
# Extracting the X-loadings
Pmat[idsX, comp] <- pls_h$mat.c[, 1, drop = FALSE]
# Deflation step
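# Regression-mode deflation (following Wold et al. 2001): the X-matrix is
# deflated by t_h %*% t(p_h) and the Y-matrix by t_h %*% t(d_h),
# with d_h = t(F) %*% t_h / (t(t_h) %*% t_h)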
Emat <- Emat - t_h %*% t(Pmat[, comp, drop = FALSE])
d_h <- t(Fmat) %*% t_h * 1 / as.numeric(t(t_h) %*% t_h)
Fmat <- Fmat - t_h %*% t(d_h)
}
# Transforming the weights
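# W* = W (P'W)^{-1} (Wold et al. 2001); zapsmall() removes numerical noise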
Wstar <- base::zapsmall(Wmat %*% solve(t(Pmat) %*% Wmat))
# Calculating matrices needed for predictions
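# Qmat = X'T and Rmat = Y'T (with X and Y centred/scaled), used by PredictPLS()
# to reconstruct the regression coefficients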
Qmat <- crossprod(scale(xdata, center = TRUE, scale = scale), Tmat)
Rmat <- crossprod(scale(ydata, center = TRUE, scale = scale), Tmat)
# Preparing outputs
out <- list(
Wmat = Wmat,
Wstar = Wstar,
Pmat = Pmat,
Cmat = Cmat,
Tmat = Tmat,
Umat = Umat,
Qmat = Qmat,
Rmat = Rmat,
meansX = meansX,
sigmaX = sigmaX,
meansY = meansY,
sigmaY = sigmaY,
methods = list(family = family, scale = scale),
params = list(selectedX = selectedX, selectedY = selectedY)
)
return(out)
}
#' Partial Least Squares predictions
#'
#' Computes predicted values from a Partial Least Squares (PLS) model in
#' regression mode applied to \code{xdata}. It uses the algorithm implemented
#' in \code{\link[mixOmics]{predict.pls}}.
#'
#' @inheritParams SelectionAlgo
#' @param model output of \code{\link{PLS}}.
#'
#' @return An array of predicted values.
#'
#' @seealso \code{\link{PLS}}
#'
#' @examples
#' if (requireNamespace("mixOmics", quietly = TRUE)) {
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = c(5, 5, 5), family = "gaussian")
#' x <- simul$xdata
#' y <- simul$ydata
#'
#' # PLS
#' mypls <- PLS(xdata = x, ydata = y, ncomp = 3)
#'
#' # Predicted values
#' predicted <- PredictPLS(xdata = x, model = mypls)
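#'
#' # The output is an array with one slice per number of components used; a
#' # hedged sketch comparing predictions from all three components with the
#' # observed outcome
#' cor(predicted[, , 3], y)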
#' }
#' @export
PredictPLS <- function(xdata, model) {
# Extracting arguments
family <- model$methods$family
ncomp <- ncol(model$Wmat)
# Checking arguments
if (!family %in% c("gaussian")) {
stop("Invalid input for argument 'family'. Only 'gaussian' family is supported.")
}
# Extracting relevant variables
xdata <- xdata[, rownames(model$Wmat), drop = FALSE]
# Re-scaling data
if (ncol(xdata) == 1) {
newdata <- matrix(sapply(xdata, FUN = function(x) {
(x - model$meansX) / model$sigmaX
}), ncol = 1)
} else {
newdata <- t(apply(xdata, 1, FUN = function(x) {
(x - model$meansX) / model$sigmaX
}))
}
# Initialising empty object
out <- array(NA,
dim = c(nrow(newdata), nrow(model$Rmat), ncomp),
dimnames = list(rownames(newdata), rownames(model$Rmat), colnames(model$Rmat))
)
# Loop over the components
for (comp in seq_len(ncomp)) {
# Computing predicted values
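# Regression coefficients on the centred/scaled data are W (Q'W)^{-1} R',
# restricted to the first 'comp' components returned by PLS()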
Ypred <- newdata %*% model$Wmat[, seq_len(comp), drop = FALSE] %*% solve(t(model$Qmat[, seq_len(comp), drop = FALSE]) %*% model$Wmat[, seq_len(comp), drop = FALSE]) %*% t(model$Rmat)[seq_len(comp), , drop = FALSE]
# Re-scaling
Ypred <- t(apply(Ypred, 1, FUN = function(y) {
y * model$sigmaY + model$meansY
}))
# Storing in the output
out[, , comp] <- Ypred
}
return(out)
}
#' Sparse Partial Least Squares
#'
#' Runs a sparse Partial Least Squares model using the implementation from
#' \code{\link[sgPLS]{sgPLS-package}}. This function does not use stability
#' selection.
#'
#' @inheritParams PLS
#' @param Lambda matrix of parameters controlling the number of selected
#' predictors for the current component, as defined by \code{ncomp}.
#' @param family type of PLS model. If \code{family="gaussian"}, a sparse PLS
#' model as defined in \code{\link[sgPLS]{sPLS}} is run (for continuous
#' outcomes). If \code{family="binomial"}, a PLS-DA model as defined in
#' \code{\link[sgPLS]{sPLSda}} is run (for categorical outcomes).
#' @param keepX_previous number of selected predictors in previous components.
#' Only used if \code{ncomp > 1}. The argument \code{keepX} in
#' \code{\link[sgPLS]{sPLS}} is obtained by concatenating
#' \code{keepX_previous} and \code{Lambda}.
#' @param keepY number of selected outcome variables. This argument is defined
#' as in \code{\link[sgPLS]{sPLS}}. Only used if \code{family="gaussian"}.
#' @param scale logical indicating if the data should be scaled (i.e.
#' transformed so that all variables have a standard deviation of one). Only
#' used if \code{family="gaussian"}.
#' @param ... additional arguments to be passed to \code{\link[sgPLS]{sPLS}} or
#' \code{\link[sgPLS]{sPLSda}}.
#'
#' @return A list with: \item{selected}{matrix of binary selection status. Rows
#' correspond to different model parameters. Columns correspond to
#' predictors.} \item{beta_full}{array of model coefficients. Rows correspond
#' to different model parameters. Columns correspond to predictors (starting
#' with "X") or outcomes (starting with "Y") variables for different
#' components (denoted by "PC").}
#'
#' @family penalised dimensionality reduction functions
#' @seealso \code{\link{VariableSelection}}, \code{\link{BiSelection}}
#'
#' @references \insertRef{sparsePLS}{sharp}
#'
#' @examples
#' if (requireNamespace("sgPLS", quietly = TRUE)) {
#' ## Sparse PLS
#'
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 20, q = 3, family = "gaussian")
#' x <- simul$xdata
#' y <- simul$ydata
#'
#' # Running sPLS with 2 X-variables and 1 Y-variable
#' mypls <- SparsePLS(xdata = x, ydata = y, Lambda = 2, family = "gaussian", keepY = 1)
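#'
#' # The 'selected' output has one row per penalty value; a hedged check of the
#' # two X-variables expected to be retained with Lambda = 2
#' which(mypls$selected[1, ] == 1)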
#'
#'
#' ## Sparse PLS-DA
#'
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 20, family = "binomial")
#'
#' # Running sPLS-DA with 2 X-variables
#' mypls <- SparsePLS(xdata = simul$xdata, ydata = simul$ydata, Lambda = 2, family = "binomial")
#' }
#' @export
SparsePLS <- function(xdata, ydata, Lambda, family = "gaussian",
ncomp = 1, scale = TRUE,
keepX_previous = NULL, keepY = NULL, ...) {
# Checking sgPLS package is installed
CheckPackageInstalled("sgPLS")
# Storing extra arguments
extra_args <- list(...)
# Checking arguments
if (!family %in% c("binomial", "gaussian")) {
stop("Invalid input for argument 'family'. For PLS models, argument 'family' must be 'gaussian' or 'binomial'.")
}
# Re-formatting y
if (is.vector(ydata)) {
ydata <- cbind(ydata)
}
# All Y variables are selected by default
if (is.null(keepY)) {
keepY <- rep(ncol(ydata), ncomp)
}
# All X variables are kept in previous components by default
if (is.null(keepX_previous)) {
keepX_previous <- rep(ncol(xdata), ncomp - 1)
}
# Initialising the current set of loadings coefficients
beta <- matrix(NA, nrow = length(Lambda), ncol = ncol(xdata))
rownames(beta) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta) <- colnames(xdata)
# Initialising the full set of loadings coefficients
if (family == "gaussian") {
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncol(ydata)) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", colnames(ydata), "_PC"), rep(seq_len(ncomp), each = ncol(ydata)))
)
} else {
ncat <- length(unique(ydata))
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncat) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", sort(unique(ydata)), "_PC"), rep(seq_len(ncomp), each = ncat))
)
}
# Loop over all parameters (number of selected variables in X in current component)
for (k in seq_len(length(Lambda))) {
# Number of selected variables per component in X
nvarx <- c(keepX_previous, Lambda[k])
if (family == "gaussian") {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::sPLS)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "scale", "keepX", "keepY")]
# Running PLS model
mymodel <- do.call(sgPLS::sPLS, args = c(
list(
X = xdata, Y = ydata,
ncomp = ncomp, scale = scale,
keepX = nvarx, keepY = keepY
),
tmp_extra_args
))
} else {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::sPLSda)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "keepX")]
# Running PLS model
mymodel <- do.call(sgPLS::sPLSda, args = c(
list(
X = xdata, Y = as.vector(ydata),
ncomp = ncomp,
keepX = nvarx
),
tmp_extra_args
))
}
# Extracting X and Y loadings
Xloadings <- mymodel$loadings$X
Yloadings <- mymodel$loadings$Y
# Making sure that the loadings of the univariate Y are positive
if (ncol(ydata) == 1) {
for (j in seq_len(ncol(mymodel$loadings$Y))) {
if (sign(mymodel$loadings$Y[1, j]) == -1) {
Yloadings[1, j] <- -Yloadings[1, j]
Xloadings[, j] <- -Xloadings[, j]
}
}
}
# Exporting the loadings used for selection in the current component
beta[k, ] <- Xloadings[, ncomp, drop = FALSE]
# Exporting all loadings coefficients
beta_full[k, ] <- c(as.vector(Xloadings), as.vector(Yloadings))
}
beta <- ifelse(beta != 0, yes = 1, no = 0)
return(list(selected = beta, beta_full = beta_full))
}
#' Sparse group Partial Least Squares
#'
#' Runs a sparse group Partial Least Squares model using the implementation
#' from \code{\link[sgPLS]{sgPLS-package}}. This function does not use
#' stability selection.
#'
#' @inheritParams SparsePLS
#' @param family type of PLS model. If \code{family="gaussian"}, a sparse group
#' PLS model as defined in \code{\link[sgPLS]{sgPLS}} is run (for continuous
#' outcomes). If \code{family="binomial"}, a PLS-DA model as defined in
#' \code{\link[sgPLS]{sgPLSda}} is run (for categorical outcomes).
#' @param group_x vector encoding the grouping structure among predictors. This
#' argument indicates the number of variables in each group.
#' @param group_y optional vector encoding the grouping structure among
#' outcomes. This argument indicates the number of variables in each group.
#' @param alpha.x vector of parameters controlling the level of sparsity within
#' groups of predictors.
#' @param alpha.y optional vector of parameters controlling the level of
#' sparsity within groups of outcomes. Only used if \code{family="gaussian"}.
#' @param Lambda matrix of parameters controlling the number of selected groups
#' for the current component, as defined by \code{ncomp}.
#' @param keepX_previous number of selected groups in previous components. Only
#' used if \code{ncomp > 1}. The argument \code{keepX} in
#' \code{\link[sgPLS]{sgPLS}} is obtained by concatenating
#' \code{keepX_previous} and \code{Lambda}.
#' @param keepY number of selected groups of outcome variables. This argument is
#' defined as in \code{\link[sgPLS]{sgPLS}}. Only used if
#' \code{family="gaussian"}.
#' @param ... additional arguments to be passed to \code{\link[sgPLS]{sgPLS}} or
#' \code{\link[sgPLS]{sgPLSda}}.
#'
#' @return A list with: \item{selected}{matrix of binary selection status. Rows
#' correspond to different model parameters. Columns correspond to
#' predictors.} \item{beta_full}{array of model coefficients. Rows correspond
#' to different model parameters. Columns correspond to predictors (starting
#' with "X") or outcomes (starting with "Y") variables for different
#' components (denoted by "PC").}
#'
#' @family penalised dimensionality reduction functions
#' @seealso \code{\link{VariableSelection}}, \code{\link{BiSelection}}
#'
#' @references \insertRef{sparsegroupPLS}{sharp}
#'
#' @examples
#' if (requireNamespace("sgPLS", quietly = TRUE)) {
#' ## Sparse group PLS
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 30, q = 3, family = "gaussian")
#' x <- simul$xdata
#' y <- simul$ydata
#'
#' # Running sgPLS with 1 group and sparsity of 0.5
#' mypls <- SparseGroupPLS(
#' xdata = x, ydata = y, Lambda = 1, family = "gaussian",
#' group_x = c(10, 15, 5), alpha.x = 0.5
#' )
#'
#' # Running sgPLS with groups on outcomes
#' mypls <- SparseGroupPLS(
#' xdata = x, ydata = y, Lambda = 1, family = "gaussian",
#' group_x = c(10, 15, 5), alpha.x = 0.5,
#' group_y = c(2, 1), keepY = 1, alpha.y = 0.9
#' )
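#'
#' # Hedged sketch: counting selected predictors per group, using the grouping
#' # structure assumed in the call above
#' sapply(split(mypls$selected[1, ], rep(1:3, times = c(10, 15, 5))), sum)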
#'
#' ## Sparse group PLS-DA
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 50, family = "binomial")
#'
#' # Running sgPLS-DA with 1 group and sparsity of 0.9
#' mypls <- SparseGroupPLS(
#' xdata = simul$xdata, ydata = simul$ydata, Lambda = 1, family = "binomial",
#' group_x = c(10, 15, 25), alpha.x = 0.9
#' )
#' }
#' @export
SparseGroupPLS <- function(xdata, ydata, family = "gaussian", group_x, group_y = NULL,
Lambda, alpha.x, alpha.y = NULL,
keepX_previous = NULL, keepY = NULL,
ncomp = 1, scale = TRUE, ...) {
# Checking sgPLS package is installed
if (!requireNamespace("sgPLS")) {
stop("This function requires the 'sgPLS' package.")
}
# Storing extra arguments
extra_args <- list(...)
# Checking arguments
if (!family %in% c("binomial", "gaussian")) {
stop("Invalid input for argument 'family'. For PLS models, argument 'family' must be 'gaussian' or 'binomial'.")
}
# Checking groups
if (sum(group_x) != ncol(xdata)) {
stop("Invalid input for argument 'group_x'. The sum of the number of variables per group must be equal to the total number of variables in 'xdata'.")
}
if (!is.null(group_y)) {
if (sum(group_y) != ncol(ydata)) {
stop("Invalid input for argument 'group_y'. The sum of the number of variables per group must be equal to the total number of variables in 'ydata'.")
}
}
# Re-formatting y
if (is.vector(ydata)) {
ydata <- cbind(ydata)
}
# All Y variables are selected by default
if (is.null(keepY)) {
keepY <- rep(ncol(ydata), ncomp)
}
# All X variables are kept in previous components by default
if (is.null(keepX_previous)) {
keepX_previous <- rep(ncol(xdata), ncomp - 1)
}
# Re-formatting the grouping
ind.block.x <- cumsum(group_x)[-length(group_x)]
if (!is.null(group_y)) {
ind.block.y <- cumsum(group_y)[-length(group_y)]
} else {
ind.block.y <- NULL
}
# Initialising the current set of loadings coefficients
beta <- matrix(NA, nrow = length(Lambda), ncol = ncol(xdata))
rownames(beta) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta) <- colnames(xdata)
# Initialising the full set of loadings coefficients
if (family == "gaussian") {
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncol(ydata)) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", colnames(ydata), "_PC"), rep(seq_len(ncomp), each = ncol(ydata)))
)
} else {
ncat <- length(unique(ydata))
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncat) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", sort(unique(ydata)), "_PC"), rep(seq_len(ncomp), each = ncat))
)
}
# Loop over all parameters (number of selected variables in X in current component)
for (k in seq_len(length(Lambda))) {
# Number of selected variables per component in X
nvarx <- c(keepX_previous, Lambda[k])
if (family == "binomial") {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::sgPLSda)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "ind.block.x", "keepX", "alpha.x")]
# Running PLS model
mymodel <- do.call(sgPLS::sgPLSda, args = c(
list(
X = xdata, Y = as.vector(ydata),
ncomp = ncomp,
ind.block.x = ind.block.x,
keepX = nvarx, alpha.x = alpha.x
),
tmp_extra_args
)) # no sparsity in Y
} else {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::sgPLS)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "scale", "ind.block.x", "ind.block.y", "keepX", "keepY", "alpha.x", "alpha.y")]
# Running PLS model
mymodel <- do.call(sgPLS::sgPLS, args = c(
list(
X = xdata, Y = ydata,
ncomp = ncomp, scale = scale,
ind.block.x = ind.block.x, ind.block.y = ind.block.y,
keepX = nvarx, keepY = keepY,
alpha.x = alpha.x, alpha.y = alpha.y
),
tmp_extra_args
))
}
# Extracting X and Y loadings
Xloadings <- mymodel$loadings$X
Yloadings <- mymodel$loadings$Y
# Making sure that the loadings of the univariate Y are positive
if (ncol(ydata) == 1) {
for (j in seq_len(ncol(mymodel$loadings$Y))) {
if (sign(mymodel$loadings$Y[1, j]) == -1) {
Yloadings[1, j] <- -Yloadings[1, j]
Xloadings[, j] <- -Xloadings[, j]
}
}
}
# Exporting the loadings used for selection in the current component
beta[k, ] <- Xloadings[, ncomp, drop = FALSE]
# Exporting all loadings coefficients
beta_full[k, ] <- c(as.vector(Xloadings), as.vector(Yloadings))
}
beta <- ifelse(beta != 0, yes = 1, no = 0)
return(list(selected = beta, beta_full = beta_full))
}
#' Group Partial Least Squares
#'
#' Runs a group Partial Least Squares model using the implementation from
#' \code{\link[sgPLS]{sgPLS-package}}. This function does not use stability
#' selection.
#'
#' @inheritParams SparsePLS
#' @param family type of PLS model. If \code{family="gaussian"}, a group PLS
#' model as defined in \code{\link[sgPLS]{gPLS}} is run (for continuous
#' outcomes). If \code{family="binomial"}, a PLS-DA model as defined in
#' \code{\link[sgPLS]{gPLSda}} is run (for categorical outcomes).
#' @param group_x vector encoding the grouping structure among predictors. This
#' argument indicates the number of variables in each group.
#' @param group_y optional vector encoding the grouping structure among
#' outcomes. This argument indicates the number of variables in each group.
#' @param Lambda matrix of parameters controlling the number of selected groups
#' for the current component, as defined by \code{ncomp}.
#' @param keepX_previous number of selected groups in previous components. Only
#' used if \code{ncomp > 1}. The argument \code{keepX} in
#' \code{\link[sgPLS]{gPLS}} is obtained by concatenating
#' \code{keepX_previous} and \code{Lambda}.
#' @param keepY number of selected groups of outcome variables. This argument is
#' defined as in \code{\link[sgPLS]{gPLS}}. Only used if
#' \code{family="gaussian"}.
#' @param ... additional arguments to be passed to \code{\link[sgPLS]{gPLS}} or
#' \code{\link[sgPLS]{gPLSda}}.
#'
#' @return A list with: \item{selected}{matrix of binary selection status. Rows
#' correspond to different model parameters. Columns correspond to
#' predictors.} \item{beta_full}{array of model coefficients. Rows correspond
#' to different model parameters. Columns correspond to predictors (starting
#' with "X") or outcomes (starting with "Y") variables for different
#' components (denoted by "PC").}
#'
#' @family penalised dimensionality reduction functions
#' @seealso \code{\link{VariableSelection}}, \code{\link{BiSelection}}
#'
#' @references \insertRef{sparsegroupPLS}{sharp}
#'
#' @examples
#' if (requireNamespace("sgPLS", quietly = TRUE)) {
#' ## Group PLS
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 50, q = 3, family = "gaussian")
#' x <- simul$xdata
#' y <- simul$ydata
#'
#' # Running gPLS with 1 group
#' mypls <- GroupPLS(
#' xdata = x, ydata = y, Lambda = 1, family = "gaussian",
#' group_x = c(10, 15, 25)
#' )
#'
#' # Running gPLS with groups on outcomes
#' mypls <- GroupPLS(
#' xdata = x, ydata = y, Lambda = 1, family = "gaussian",
#' group_x = c(10, 15, 25),
#' group_y = c(2, 1), keepY = 1
#' )
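#'
#' # Hedged sketch: with group penalisation, predictors are expected to be kept
#' # or dropped by whole groups (grouping as in the call above)
#' sapply(split(mypls$selected[1, ], rep(1:3, times = c(10, 15, 25))), sum)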
#' }
#' @export
GroupPLS <- function(xdata, ydata, family = "gaussian", group_x, group_y = NULL,
Lambda, keepX_previous = NULL, keepY = NULL,
ncomp = 1, scale = TRUE, ...) {
# Checking sgPLS package is installed
if (!requireNamespace("sgPLS")) {
stop("This function requires the 'sgPLS' package.")
}
# Storing extra arguments
extra_args <- list(...)
# Checking arguments
if (!family %in% c("binomial", "gaussian")) {
stop("Invalid input for argument 'family'. For PLS models, argument 'family' must be 'gaussian' or 'binomial'.")
}
# Checking groups
if (sum(group_x) != ncol(xdata)) {
stop("Invalid input for argument 'group_x'. The sum of the number of variables per group must be equal to the total number of variables in 'xdata'.")
}
if (!is.null(group_y)) {
if (sum(group_y) != ncol(ydata)) {
stop("Invalid input for argument 'group_y'. The sum of the number of variables per group must be equal to the total number of variables in 'ydata'.")
}
}
# Re-formatting y
if (is.vector(ydata)) {
ydata <- cbind(ydata)
}
# All Y variables are selected by default
if (is.null(keepY)) {
keepY <- rep(ncol(ydata), ncomp)
}
# All X variables are kept in previous components by default
if (is.null(keepX_previous)) {
keepX_previous <- rep(ncol(xdata), ncomp - 1)
}
# Re-formatting the grouping
ind.block.x <- cumsum(group_x)[-length(group_x)]
if (!is.null(group_y)) {
ind.block.y <- cumsum(group_y)[-length(group_y)]
} else {
ind.block.y <- NULL
}
# Initialising the current set of loadings coefficients
beta <- matrix(NA, nrow = length(Lambda), ncol = ncol(xdata))
rownames(beta) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta) <- colnames(xdata)
# Initialising the full set of loadings coefficients
if (family == "gaussian") {
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncol(ydata)) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", colnames(ydata), "_PC"), rep(seq_len(ncomp), each = ncol(ydata)))
)
} else {
ncat <- length(unique(ydata))
beta_full <- matrix(NA, nrow = length(Lambda), ncol = (ncol(xdata) + ncat) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(
paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))),
paste0(paste0("Y_", sort(unique(ydata)), "_PC"), rep(seq_len(ncomp), each = ncat))
)
}
# Loop over all parameters (number of selected variables in X in current component)
for (k in seq_len(length(Lambda))) {
# Number of selected variables per component in X
nvarx <- c(keepX_previous, Lambda[k])
if (family == "binomial") {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::gPLSda)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "ind.block.x", "keepX")]
# Running PLS model
mymodel <- do.call(sgPLS::gPLSda, args = c(
list(
X = xdata, Y = as.vector(ydata),
ncomp = ncomp,
ind.block.x = ind.block.x,
keepX = nvarx
),
tmp_extra_args
)) # no sparsity in Y
} else {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = sgPLS::gPLS)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "Y", "ncomp", "scale", "ind.block.x", "ind.block.y", "keepX", "keepY")]
# Running PLS model
mymodel <- do.call(sgPLS::gPLS, args = c(
list(
X = xdata, Y = ydata,
ncomp = ncomp, scale = scale,
ind.block.x = ind.block.x, ind.block.y = ind.block.y,
keepX = nvarx, keepY = keepY
),
tmp_extra_args
))
}
# Extracting X and Y loadings
Xloadings <- mymodel$loadings$X
Yloadings <- mymodel$loadings$Y
# Making sure that the loadings of the univariate Y are positive
if (ncol(ydata) == 1) {
for (j in seq_len(ncol(mymodel$loadings$Y))) {
if (sign(mymodel$loadings$Y[1, j]) == -1) {
Yloadings[1, j] <- -Yloadings[1, j]
Xloadings[, j] <- -Xloadings[, j]
}
}
}
# Exporting the loadings used for selection in the current component
beta[k, ] <- Xloadings[, ncomp, drop = FALSE]
# Exporting all loadings coefficients
beta_full[k, ] <- c(as.vector(Xloadings), as.vector(Yloadings))
}
beta <- ifelse(beta != 0, yes = 1, no = 0)
return(list(selected = beta, beta_full = beta_full))
}
#' Sparse Principal Component Analysis
#'
#' Runs a sparse Principal Component Analysis model using the implementation
#' from \code{\link[elasticnet]{spca}} (if \code{algorithm="sPCA"}) or
#' \code{\link[mixOmics]{spca}} (if \code{algorithm="rSVD"}). This function
#' does not use stability selection.
#'
#' @inheritParams PLS
#' @param xdata data matrix with observations as rows and variables as columns.
#' @param Lambda matrix of parameters controlling the number of selected
#' variables for the current component, as defined by \code{ncomp}.
#' @param keepX_previous number of selected predictors in previous components.
#' Only used if \code{ncomp > 1}.
#' @param algorithm character string indicating the name of the algorithm to use for
#' sparse PCA. Possible values are: "sPCA" (for the algorithm proposed by Zou,
#' Hastie and Tibshirani and implemented in \code{\link[elasticnet]{spca}}) or
#' "rSVD" (for the regularised SVD approach proposed by Shen and Huang and
#' implemented in \code{\link[mixOmics]{spca}}).
#' @param ... additional arguments to be passed to
#' \code{\link[elasticnet]{spca}} (if \code{algorithm="sPCA"}) or
#' \code{\link[mixOmics]{spca}} (if \code{algorithm="rSVD"}).
#'
#' @return A list with: \item{selected}{matrix of binary selection status. Rows
#' correspond to different model parameters. Columns correspond to
#' predictors.} \item{beta_full}{array of model coefficients. Rows correspond
#' to different model parameters. Columns correspond to predictors (starting
#' with "X") or outcomes (starting with "Y") variables for different
#' components (denoted by "PC").}
#'
#' @family penalised dimensionality reduction functions
#' @seealso \code{\link{VariableSelection}}, \code{\link{BiSelection}}
#'
#' @references \insertRef{sparsePCA}{sharp}
#'
#' \insertRef{sparsePCASVD}{sharp}
#'
#' @examples
#' # Data simulation
#' set.seed(1)
#' simul <- SimulateRegression(n = 100, pk = 50, family = "gaussian")
#' x <- simul$xdata
#'
#' # Sparse PCA (by Zou, Hastie, Tibshirani)
#' if (requireNamespace("elasticnet", quietly = TRUE)) {
#' mypca <- SparsePCA(
#' xdata = x, ncomp = 2,
#' Lambda = c(1, 2), keepX_previous = 10, algorithm = "sPCA"
#' )
#' }
#'
#' # Sparse PCA (by Shen and Huang)
#' if (requireNamespace("mixOmics", quietly = TRUE)) {
#' mypca <- SparsePCA(
#' xdata = x, ncomp = 2,
#' Lambda = c(1, 2), keepX_previous = 10, algorithm = "rSVD"
#' )
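#'
#' # Hedged sketch: indices of the variables selected on the second component
#' # for Lambda = 2
#' which(mypca$selected[2, ] == 1)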
#' }
#' @export
SparsePCA <- function(xdata, Lambda,
ncomp = 1, scale = TRUE,
keepX_previous = NULL,
algorithm = "sPCA", ...) {
# Checking input value for algorithm
if (!algorithm %in% c("sPCA", "rSVD")) {
stop("Invalid input for argument 'algorithm'. Possible values are: 'sPCA' or 'rSVD'.")
}
# Checking mixOmics package is installed
if (algorithm == "rSVD") {
if (!requireNamespace("mixOmics")) {
stop("This function requires the 'mixOmics' package.")
}
}
# Checking elasticnet package is installed
if (algorithm == "sPCA") {
if (!requireNamespace("elasticnet")) {
stop("This function requires the 'elasticnet' package.")
}
}
# Storing extra arguments
extra_args <- list(...)
# All X variables are kept in previous components by default
if (is.null(keepX_previous)) {
keepX_previous <- rep(ncol(xdata), ncomp - 1)
}
# Initialising the current set of loadings coefficients
beta <- matrix(NA, nrow = length(Lambda), ncol = ncol(xdata))
rownames(beta) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta) <- colnames(xdata)
# Initialising the full set of loadings coefficients
beta_full <- matrix(NA, nrow = length(Lambda), ncol = ncol(xdata) * ncomp)
rownames(beta_full) <- paste0("s", 0:(length(Lambda) - 1))
colnames(beta_full) <- c(paste0(paste0("X_", colnames(xdata), "_PC"), rep(seq_len(ncomp), each = ncol(xdata))))
# Loop over all parameters (number of selected variables in X in current component)
for (k in seq_len(length(Lambda))) {
# Number of selected variables per component in X
nvarx <- c(keepX_previous, Lambda[k])
if (algorithm == "rSVD") {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = mixOmics::spca)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("X", "ncomp", "keepX", "scale")]
# Running sparse PCA model
mymodel <- do.call(mixOmics::spca, args = c(
list(
X = xdata,
ncomp = ncomp,
scale = scale,
keepX = nvarx
),
tmp_extra_args
))
# Extracting X loadings
Xloadings <- mymodel$loadings$X
}
if (algorithm == "sPCA") {
# Extracting relevant extra arguments
tmp_extra_args <- MatchingArguments(extra_args = extra_args, FUN = elasticnet::spca)
tmp_extra_args <- tmp_extra_args[!names(tmp_extra_args) %in% c("x", "type", "K", "para", "sparse")]
# Running sparse PCA model
if (scale) {
mymodel <- do.call(elasticnet::spca, args = c(list(
x = stats::cor(xdata), type = "Gram",
K = ncomp, para = nvarx, sparse = "varnum"
), tmp_extra_args))
} else {
mymodel <- do.call(elasticnet::spca, args = c(list(
x = stats::cov(xdata), type = "Gram",
K = ncomp, para = nvarx, sparse = "varnum"
), tmp_extra_args))
}
# Extracting X loadings
Xloadings <- mymodel$loadings
}
# Exporting the loadings used for selection in the current component
beta[k, ] <- Xloadings[, ncomp, drop = FALSE]
# Exporting all loadings coefficients
beta_full[k, ] <- as.vector(Xloadings)
}
beta <- ifelse(beta != 0, yes = 1, no = 0)
return(list(selected = beta, beta_full = beta_full))
}