#' Calculate percentage of variance explained (PVE)
#'
#' Function to evaluate the PEV and recommend a number of components to select.
#'
#' @param object Object of class inheriting from \code{"sgspls"}.
#' The function will retrieve some key parameters stored in that object.
#'
#' @export
#' @return \code{PVE} returns a vector of percentage of variance explained by the scores in the PLS method.
#' See references for details.
#'
#' @seealso Tuning functions \code{\link[sgspls]{calc_pve}},
#' \code{\link[sgspls]{tune_sgspls}}, \code{\link[sgspls]{tune_groups}}.
#' Model performance and estimation \code{\link[sgspls]{predict.sgspls}}, \code{\link[sgspls]{perf.sgspls}}, \code{\link[sgspls]{coef.sgspls}}
#' @references Liquet Benoit, Lafaye de Micheaux, Boris Hejblum, Rodolphe
#' Thiebaut. A group and Sparse Group Partial Least Square approach applied in
#' Genomics context. \emph{Submitted}.
#'
#' @examples
#'
#' set.seed(1)
#' n = 50; p = 500;
#'
#' X = matrix(rnorm(n * p), ncol = p, nrow = n)
#'
#' beta <- rep(0,p)
#' beta[1:20] <- 1:20
#'
#' y = X %*% beta + 0.1*rnorm(n)
#'
#' model <- sgspls(X, y, ncomp = 3, mode = "regression")
#' calc_pve(model)
#'
calc_pve <- function(object){
#--- Take the predictions from the PLS object --#
Y <- object$parameters$Y
X <- object$parameters$X
Ypred <- predict(object, X)
PVE <- matrix(
1 - apply(Ypred, 3, function(Xi) sum(apply(Y - Xi,2,var))/sum(apply(Y,2,var))),
ncol = object$parameters$ncomp
)
#--- Get the pve ---#
if (max(PVE) > 0.98){
ncomp <- which(PVE >0.95)[1]
} else{
ncomp <- which.min(PVE)[1]
}
cat("Recommend to use", ncomp, "components.\n\n")
colnames(PVE) <- paste0("Comp ", 1:length(PVE))
rownames(PVE) <- "PVE"
return(PVE)
}
generate_sparsities <- function(){
#--- Original code to generate the sparsities ---#
# # Choose the sparsities for individual level
# sparsities = seq(0.01,0.99,0.01)
# grp = c(seq(0.05,0.95,0.1))
#
# # third column to ensure individual sparsity
# sparsities.s <- do.call(expand.grid,list(grp,sparsities,sparsities))
# sparsities <- NULL
# ind <- which(sparsities.s[,1] %in% grp )
# sparsities.s <- sparsities.s[ind,]
# for(sp in grp){
#
# #-- take only the valid sparsities --#
# ind <- which(apply(sparsities.s,1,sum) == 1 & sparsities.s[,1] == sp )
#
# #-- take the last, first and middle ones --#
# ind <- c(tail(ind,1),ind[floor(length(ind)/2)],head(ind,1))
#
# sparsities <- rbind(sparsities,sparsities.s[ind,])
# }
# sparsities <- as.matrix(unique(sparsities))
# rownames(sparsities) <- NULL
# colnames(sparsities) <- c("Group", "Subgroup","Individual")
return(
structure(c(0.05, 0.05, 0.05, 0.15, 0.15, 0.15, 0.25, 0.25, 0.25,
0.35, 0.35, 0.35, 0.45, 0.45, 0.45, 0.55, 0.55, 0.55, 0.75, 0.75,
0.75, 0.85, 0.85, 0.85, 0.95, 0.95, 0.95, 0.01, 0.48, 0.94, 0.01,
0.43, 0.84, 0.01, 0.38, 0.74, 0.01, 0.33, 0.64, 0.01, 0.28, 0.54,
0.01, 0.23, 0.44, 0.03, 0.13, 0.22, 0.01, 0.08, 0.14, 0.01, 0.03,
0.04, 0.94, 0.47, 0.01, 0.84, 0.42, 0.01, 0.74, 0.37, 0.01, 0.64,
0.32, 0.01, 0.54, 0.27, 0.01, 0.44, 0.22, 0.01, 0.22, 0.12, 0.03,
0.14, 0.07, 0.01, 0.04, 0.02, 0.01), .Dim = c(27L, 3L), .Dimnames = list(
NULL, c("Group", "Subgroup", "Individual")))
)
}
#' Compute cross-validated mean squared prediction error for sgspls regression
#'
#' Tuning function for finding the number of groups, and sparsities to select for an sgspls object.
#' Offers a sequential way to find optimal sparsities, and number of groups for either block.
#'
#' @param pls_obj List of parameters or object of class \code{cv.sgspls} used to perform cross validation (see examples below).
#' @param sparsities Matrix of sparsities, with columns corresponding to group, subgroup and individual
#' sparsity levels to tune over. If it is NULL then a preselected set of sparsity levels is used.
#' @param group_seq a vector containing the number of groups to tune over.
#' @param block A string either "X" or "Y" to indicate which block to tune parameters over.
#' @param folds The number of folds to use in cross validation.
#' @param progressBar Logical, indicating if a progress bar is shown.
#' @param setseed False, or integer for replicating tuning parameters.
#' @param scale_resp Logical, the MSEP is standardised across responses (see perf function for details).
#'
#' @export
#' @return \code{tune_sgspls} returns a list with class \code{cv.sgspls} with measures:
#' \item{result_tuning}{A matrix containing the tuning parameters and MSEP values.}
#' \item{best}{A vector containing the optimal tuning parameters. }
#' \item{parameters}{A list of the parameters for a sgspls object.}
#' \item{tuning_sparsities}{A matrix of group, subgroup and individual sparsities tuned over.}
#' \item{folds}{Number of folds used in cross validation.}
#' \item{min_cv}{Minimum MSEP score.}
#' \item{group_seq}{Groups tuned over in cross validation.}
#'
#' @seealso \code{\link[sgspls]{sgspls}} Tuning functions \code{\link[sgspls]{calc_pve}}, \code{\link[sgspls]{tune_groups}}.
#' Model performance and estimation \code{\link[sgspls]{predict.sgspls}}, \code{\link[sgspls]{perf.sgspls}}, \code{\link[sgspls]{coef.sgspls}}
#'
#' @references Liquet Benoit, Lafaye de Micheaux, Boris Hejblum, Rodolphe
#' Thiebaut. A group and Sparse Group Partial Least Square approach applied in
#' Genomics context. \emph{Submitted}.
#'
#' @examples
#'
#' set.seed(1)
#' n = 50; p = 510;
#' size.groups = 30; size.subgroups = 5
#' groupX <- ceiling(1:p / size.groups)
#' subgroupX <- ceiling(1:p / size.subgroups)
#'
#' X = matrix(rnorm(n * p), ncol = p, nrow = n)
#'
#' beta <- rep(0,p)
#' bSG <- -2:2; b0 <- rep(0,length(bSG))
#' betaG <- c(bSG, b0, bSG, b0, bSG, b0)
#' beta[1:size.groups] <- betaG
#'
#' y = X %*% beta + rnorm(n)
#'
#' #--------------------------------------#
#' #-- Set up a basic model to tune --#
#'
#' cv_pls <- list(X=X, Y=y, groupX=groupX, subgroupX=subgroupX)
#'
#' #---------------------------------------------#
#' #-- Tune over 1 to 2 groups and multiple --#
#' #-- sparsity levels for the first component --#
#'
#' cv_pls_comp1 <- tune_sgspls(pls_obj = cv_pls, group_seq = 1:2, scale_resp = FALSE)
#'
#' #-- MSEP is on the original scale for the response --#
#' cv_pls_comp1$results_tuning
#' cv_pls_comp1$best
#'
#' \dontrun{
#' # Use the optimal fit for the first component and tune the second component
#' cv_pls_comp2 <- tune_sgspls(pls_obj = cv_pls_comp1, group_seq = 1:2, scale_resp = FALSE)
#' cv_pls_comp2$results_tuning
#' cv_pls_comp2$best
#'
#' # Use the optimal fit for the second component and tune the third component
#' cv_pls_comp3 <- tune_sgspls(pls_obj = cv_pls_comp2, group_seq = 1:2, scale_resp = FALSE)
#' cv_pls_comp3$best
#'
#' model <- do.call(sgspls, args = cv_pls_comp3$parameters)
#'
#' model
#'
#' model_coef <- coef(model, type = "coefficients")
#'
#' cbind(beta, model_coef$B[,,2])
#' }
tune_sgspls <-
function(pls_obj, sparsities=NULL, group_seq=NULL, block = "X",
folds= 10, progressBar=TRUE, setseed = 1, scale_resp = TRUE) {
#----------------------#
#--- Get sparsities ---#
if(is.null(sparsities)){
sparsities <- generate_sparsities()
}
#--- Progress Bar ---#
if (progressBar == TRUE)
pb <- txtProgressBar(style = 3)
#--- Init ---#
ngroups <- length(group_seq)
nsparsities <- dim(sparsities)[1]
results_tuning <- NULL
if(class(pls_obj) %in% "cv.sgspls"){
parameters <- pls_obj$parameters
ncomp <- parameters$ncomp + 1
} else {
parameters <- pls_obj
ncomp <- 1:max(parameters$ncomp+1, 1)
}
parameters$ncomp <- ncomp
#-------------------------------------------------#
#--- Loop through sparsities combinations on j ---#
for( j in 1:nsparsities){
#-- Update the sparsity ---#
if(block == "X" || block == "x")
{
parameters$indiv_sparsity_x[ncomp] <- sparsities[j,3]
parameters$subgroup_sparsity_x[ncomp] <- sparsities[j,2]
}
if(block == "Y" || block == "y")
{
parameters$indiv_sparsity_y[ncomp] <- sparsities[j,3]
parameters$subgroup_sparsity_y[ncomp] <- sparsities[j,2]
}
#--- Loop through groups ---#
object_groups <- tune_groups(parameters, group_seq=group_seq, setseed = setseed,
block = block, scale_resp = scale_resp)
results <- cbind( group_seq, sparsities[j,3], sparsities[j,2], t(object_groups$MSEP[ncomp, ,drop=F]) )
results_tuning <- rbind(results_tuning, results)
if (progressBar == TRUE) {
setTxtProgressBar(pb, j/(nsparsities))
}
}
#------------------------------------#
#-- create table of tuning results --#
colnames(results_tuning) <- c("Groups", "Individual", "Subgroup", paste0("MSEP comp",ncomp))
MSEPopt <- min(results_tuning[,-c(1:3)])
best <- results_tuning[which(results_tuning == MSEPopt, arr.ind = T)[1],]
keep <- best[1];
indiv_sparsity <- best[2];
subgroup_sparsity <- best[3]
#-- Return parameters with optimal fit --#
parameters <- object_groups$parameters
parameters$ncomp <- max(ncomp)
if(block == "X" || block == "x"){
parameters$keepX[ncomp] <- keep
parameters$indiv_sparsity_x[ncomp] <- indiv_sparsity
parameters$subgroup_sparsity_x[ncomp] <- subgroup_sparsity
}
if(block == "Y" || block == "y"){
parameters$keepY[ncomp] <- keep
parameters$indiv_sparsity_y[ncomp] <- indiv_sparsity
parameters$subgroup_sparsity_y[ncomp] <- subgroup_sparsity
}
#-- alphas to match the paper --#
rownames(results_tuning) <- paste("alpha",rep(1:length(sparsities[,1]), each = length(group_seq)),sep = "_")
result <- list(results_tuning = results_tuning,
best = best,
parameters = parameters,
tuning_sparsities = sparsities,
folds = folds,
min_cv = min(results_tuning[,-c(1:3)]),
group_seq = group_seq,
folds = object_groups$folds)
class(result) = c("cv.sgspls")
return(result)
}
#' Compute cross-validated mean squared prediction error for sgspls regression
#'
#' Tuning function for finding the number of groups to select for an sgspls object.
#' Function is consistent in the sparsity parameters but may be tuned over the number of
#' groups.
#'
#' @param parameters List of parameters to use in the current PLS object (see examples below).
#' @param group_seq a vector containing the number of groups to tune over.
#' @param block A string either "X" or "Y" to indicate which block to tune parameters over.
#' @param folds The number of folds to use in cross validation.
#' @param setseed False, or integer for replicating tuning parameters.
#' @param scale_resp Logical, the MSEP is standardised across responses (see perf function for details).
#'
#' @export
#' @return \code{tune_groups} returns a list with class \code{cv.sgspls} with measures:
#' \item{result_tuning}{A matrix containing the tuning parameters and MSEP values.}
#' \item{best}{A vector containing the optimal tuning parameters. }
#' \item{parameters}{A list of the parameters for a sgspls object.}
#' \item{tuning_sparsities}{A matrix of group, subgroup and individual sparsities tuned over.}
#' \item{folds}{Number of folds used in cross validation.}
#' \item{min_cv}{Minimum MSEP score.}
#' \item{group_seq}{Groups tuned over in cross validation.}
#'
#' @seealso \code{\link[sgspls]{sgspls}} Tuning functions \code{\link[sgspls]{calc_pve}}, \code{\link[sgspls]{tune_sgspls}}.
#' Model performance and estimation \code{\link[sgspls]{predict.sgspls}}, \code{\link[sgspls]{perf.sgspls}}, \code{\link[sgspls]{coef.sgspls}}
#' @references Liquet Benoit, Lafaye de Micheaux, Boris Hejblum, Rodolphe
#' Thiebaut. A group and Sparse Group Partial Least Square approach applied in
#' Genomics context. \emph{Submitted}.
#'
#' @examples
#'
#' set.seed(1)
#' n = 50; p = 510;
#' size.groups = 30; size.subgroups = 5
#' groupX <- ceiling(1:p / size.groups)
#' subgroupX <- ceiling(1:p / size.subgroups)
#'
#' X = matrix(rnorm(n * p), ncol = p, nrow = n)
#'
#' beta <- rep(0,p)
#' bSG <- -2:2; b0 <- rep(0,length(bSG))
#' betaG <- c(bSG, b0, bSG, b0, bSG, b0)
#' beta[1:size.groups] <- betaG
#'
#' y = X %*% beta + rnorm(n)
#'
#' #--------------------------------------#
#' #-- Set up a basic model to tune --#
#'
#' cv_pls <- list(X=X, Y=y, groupX=groupX, subgroupX=subgroupX, ncomp = 1:3)
#'
#' cv_pls_comp1 <- tune_groups(parameters = cv_pls, group_seq = 1:2, scale_resp = FALSE)
#'
#' #-- MSEP is on the original scale for the response --#
#' cv_pls_comp1$MSEP
#' cv_pls_comp1$keep_opt
#' cv_pls_comp1$ncomp_opt
#'
tune_groups <-
function(parameters, group_seq = NULL, block = "X", folds = 10, scale_resp = TRUE, setseed = 1) {
#-- Get the length of the groups to tune over ---#
ngroups = length(group_seq);
if(is.null(group_seq))
stop("Enter a sequence of groups to tune over")
ncomp <- parameters$ncomp
parameters$ncomp <- max(ncomp)
MSEP <- matrix(0, ncol = ngroups, nrow = max(ncomp))
#-- Tune over the group sequence --#
for(i in 1:ngroups){
#-- Update the group number ---#
if(block == "X" || block == "x")
parameters$keepX[ncomp] = group_seq[i]
if(block == "Y" || block == "y")
parameters$keepY[ncomp] = group_seq[i]
object <- invisible(do.call(sgspls, args = parameters))
#--- Estimate the performance ---#
object_perf <- perf(object, folds = folds, progressBar = FALSE,
scale_resp = scale_resp, setseed = setseed)
MSEP[ , i ] <- apply(object_perf$MSEP, 1, sum)
}
#-- Find optimal tuning parameters --#
mMSEP <- min(MSEP)
colnames(MSEP) <- paste0(group_seq)
rownames(MSEP) <- paste0("ncomp ", 1:max(ncomp))
mMSEPcol <- apply(MSEP, 2, min)
mMSEProw <- apply(MSEP, 1, min)
ncomp_opt <- min( (1:max(ncomp))[mMSEProw == mMSEP])
keep_opt <- min( group_seq[mMSEPcol == mMSEP])
#-- Return parameters with optimal fit --#
parameters <- object$parameters
if(block == "X" || block == "x")
parameters$keepX[ncomp] <- keep_opt
if(block == "Y" || block == "y")
parameters$keepY[ncomp] <- keep_opt
res <- list(
MSEP = MSEP, ncomp_opt = ncomp_opt, keep_opt = keep_opt, parameters = parameters, folds = object_perf$folds
)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.