#' Sparse Group Subgroup PLS
#'
#' Fit a PLS model to two blocks of data via the sparse group subgroup Partial
#' Least Squares (sgspls) algorithm. The sgspls algorithm enables selection of
#' variables at the group, subgroup and single feature levels.
#'
#' @param X A matrix of regressors (n x p). By default the matrix will be
#' centered to have mean zero.
#' @param Y A matrix of continuous responses (n x q). By default the matrix will
#' be centered to have mean zero.
#' @param ncomp The number of components to include in the model.
#' @param mode A character string. What type of PLS algorithm to use, matching
#' one of "regression", "canonical". See Details.
#' @param tol A positive real tolerance for the PLS algorithm.
#' @param groupX A vector describing the group details of the X variable. (see
#' example in Details).
#' @param groupY A vector describing the group details of the Y variable. (see
#' example in Details).
#' @param subgroupX A vector describing the subgroup details of the X block (see
#' example in Details).
#' @param subgroupY A vector describing the subgroup details of the Y block (see
#' example in Details).
#' @param keepX Numeric vector of length \code{ncomp}, the number of groups to
#' select in \eqn{X}-loadings. Default selects all groups.
#' @param keepY Numeric vector of length \code{ncomp}, the number of groups to
#' select in \eqn{Y}-loadings. Default selects all groups.
#' @param indiv_sparsity_x Individual sparisty parameter (value between 0 and 1)
#' related to the sparisty within subgroups for the \eqn{X} block.
#' @param indiv_sparsity_y Individual sparisty parameter (value between 0 and 1)
#' related to the sparisty within subgroups for the \eqn{Y} block.
#' @param subgroup_sparsity_x Sub-group sparisty parameter (value between 0 and
#' 1) related to the number of subgroups selected for the PLS \eqn{X} weights.
#' @param subgroup_sparsity_y Sub-group sparisty parameter (value between 0 and
#' 1) related to the number of subgroups selected for the PLS \eqn{Y} weights.
#' @param scale.x Scale predictors by their standard deviation.
#' @param scale.y Scale responses by their standard deviation.
#' @param ... additional arguments for low level functionality.
#' @param max.iter How many iterations should be performed? Default is 500.
#'
#' @export
#' @return \code{sgspls} returns an object of class \code{"sgspls"}, a list that
#' contains the following components:
#' \item{weights}{a list containing the X and Y pls weights.}
#' \item{scores}{a list containing the X and Y pls scores.}
#' \item{names}{a list containing the X and Y names.}
#' \item{parameters}{a list containing the parameters of the model that was fitted.}
#'
#' @author Matthew Sutton \email{m5.sutton@hdr.qut.edu.au}
#' @references Liquet Benoit, Lafaye de Micheaux, Boris Hejblum, Rodolphe
#' Thiebaut. A group and Sparse Group Partial Least Square approach applied in
#' Genomics context. \emph{Submitted}.
#'
#' L\^e Cao, K.-A., Martin, P.G.P., Robert-Grani\'e, C. and Besse, P. (2009).
#' Sparse canonical methods for biological data integration: application to a
#' cross-platform study. \emph{BMC Bioinformatics} \bold{10}:34.
#'
#' Shen, H. and Huang, J. Z. (2008). Sparse principal component analysis via
#' regularized low rank matrix approximation. \emph{Journal of Multivariate
#' Analysis} \bold{99}, 1015-1034.
#'
#' Tenenhaus, M. (1998). \emph{La r\'egression PLS: th\'eorie et pratique}.
#' Paris: Editions Technic.
#'
#' @seealso Tuning functions \code{\link[sgspls]{calc_pve}},
#' \code{\link[sgspls]{tune_sgspls}}, \code{\link[sgspls]{tune_groups}}.
#' Model performance and estimation \code{\link[sgspls]{predict.sgspls}}, \code{\link[sgspls]{perf.sgspls}}, \code{\link[sgspls]{coef.sgspls}}
#'
#' @examples
#'
#' set.seed(1)
#' n = 50; p = 510;
#'
#' size.groups = 30; size.subgroups = 5
#' groupX <- ceiling(1:p / size.groups)
#' subgroupX <- ceiling(1:p / size.subgroups)
#'
#' X = matrix(rnorm(n * p), ncol = p, nrow = n)
#'
#' beta <- rep(0,p)
#' bSG <- -2:2; b0 <- rep(0,length(bSG))
#' betaG <- c(bSG, b0, bSG, b0, bSG, b0)
#' beta[1:size.groups] <- betaG
#'
#' y = X %*% beta + 0.1*rnorm(n)
#'
#' model <- sgspls(X, y, ncomp = 3, mode = "regression", keepX = 1,
#' groupX = groupX, subgroupX = subgroupX,
#' indiv_sparsity_x = 0.8, subgroup_sparsity_x = 0.15)
#'
#' reg_coef <- coef(model, type = "coefficients")
#'
#' # Check model fit
#' cbind(reg_coef$B[ , , 3], beta)
#'
#' \dontrun{
#' cbind(model.sgsplsR$B.hat[,,3], beta)[1:30,]
#' }
sgspls <-
function(X, Y, ncomp = 2, mode = "regression",
keepX = NA, keepY = NA, max.iter = 500,
tol = 1e-12, scale.x = T, scale.y = F,
groupX = rep(1,ncol(X)), groupY = rep(1,ncol(Y)),
subgroupX = rep(1,ncol(X)), subgroupY = rep(1,ncol(Y)),
indiv_sparsity_x = rep(0,ncomp), subgroup_sparsity_x = rep(0,ncomp),
indiv_sparsity_y = rep(0,ncomp), subgroup_sparsity_y = rep(0,ncomp),
...)
{
# Check the data #
X = as.matrix(X)
Y = as.matrix(Y)
p = ncol(X); q = ncol(Y); n = nrow(X);
if (any(!is.finite(Y))) stop("\ninfinite, NA or NaN values in X or Y")
if (is.null(colnames(X))) colnames(X) = paste(rep("X",p), 1:p, sep="")
if (is.null(rownames(X))) rownames(X) = rep(1:n)
if (is.null(colnames(Y))) colnames(Y) = paste(rep("Y",q), 1:q, sep="")
if (is.null(rownames(Y))) rownames(Y) = rep(1:n)
# Add group information
ngX <- length(unique(groupX))
ngY <- length(unique(groupY))
# Fix number of selected groups
if(is.na(keepY[1])) keepY <- ngY
if(is.na(keepX[1])) keepX <- ngX
# Repeat arguments to match ncomp length
keepX <- rep_param(keepX, ncomp)
keepY <- rep_param(keepY, ncomp)
indiv_sparsity_x <- rep_param(indiv_sparsity_x,ncomp)
subgroup_sparsity_x <- rep_param(subgroup_sparsity_x,ncomp)
indiv_sparsity_y <- rep_param(indiv_sparsity_y,ncomp)
subgroup_sparsity_y <- rep_param(subgroup_sparsity_y,ncomp)
# Set the alphas for the sparsity values
# There must be positive penalty on the group so that the keepX and keepY parameters work
alpha1.x <- 1 - rowSums(cbind(matrix(indiv_sparsity_x, ncol = 1), matrix(subgroup_sparsity_x, ncol = 1)))
if(any(alpha1.x == 0)){
ind <- which(alpha1.x == 0)
alpha1.x[ind] <- 1e-06
for (i in ind){
if(indiv_sparsity_x[ind] < subgroup_sparsity_x[ind]) {
subgroup_sparsity_x[ind] = subgroup_sparsity_x[ind] - 1e-06
} else{
indiv_sparsity_x[ind] = indiv_sparsity_x[ind] - 1e-06
}
}
}
alpha2.x <- subgroup_sparsity_x
alpha3.x <- indiv_sparsity_x
alpha1.y <- 1 - rowSums(cbind(indiv_sparsity_y, subgroup_sparsity_y))
alpha1.y <- pmax(alpha1.y, 1e-6)
if(any(alpha1.y == 0)){
ind <- which(alpha1.x == 0)
alpha1.x[ind] <- 1e-06
for (i in ind){
if(indiv_sparsity_x[ind] < subgroup_sparsity_x[ind]) {
subgroup_sparsity_x[ind] = subgroup_sparsity_x[ind] - 1e-06
} else{
indiv_sparsity_x[ind] = indiv_sparsity_x[ind] - 1e-06
}
}
}
alpha2.y <- subgroup_sparsity_y
alpha3.y <- indiv_sparsity_y
if(any(round(rowSums(cbind(alpha1.x, alpha2.x, alpha3.x)),6) != 1)) stop("Sparsities don't sum to 1 on X")
if(any(round(rowSums(cbind(alpha1.y, alpha2.y, alpha3.y)),6) != 1)) stop("Sparsities don't sum to 1 on Y")
## Specialist method to preform sgspls with lambda and alpha's specified on X
# additional Args #
lambda.x = rep(0,ncomp)
lambda.y = rep(0,ncomp)
singular_vals <- rep(0, ncomp)
add.args = list(...)
if(!is.null(add.args$lambda.x)){
lambda.x <- add.args$lambda.x
alpha1.x <- add.args$alpha1.x
alpha2.x <- add.args$alpha2.x
alpha3.x <- add.args$alpha3.x
}
# Scaleing similar to the pls package #
Y_h <- scale(Y, center = T, scale = scale.y)
X_h <- scale(X, center = T, scale = scale.x)
x_scores <- matrix(nrow=n, ncol = ncomp) # X variate
y_scores <- matrix(nrow=n, ncol = ncomp) # Y variate
x_weights <- matrix(nrow=p, ncol = ncomp)
y_weights <- matrix(nrow=q, ncol = ncomp)
# Algorithm #
for(h in 1:ncomp){
M = t(X_h)%*%Y_h
svd.M <- try(svd(M,nu = 1,nv = 1))
if(class(svd.M) == "try-error"){
M <- M / (n-1)
svd.M <- try(svd(M,nu = 1,nv = 1))
}
if (svd.M$d[1] < .Machine$double.eps) {
cat("Too many PLS components, stoped at ",h-1," components",svd.M$d[1])
ncomp = h-1;
scores = scores[,1:(h-1)]; weights = weights[,1:(h-1)]; loadings = loadings[,1:(h-1)]
path.coeff = path.coeff[1:(h-1)]; comps = comps[1:(h-1)]
break
}
# Calaulate the weights
weights <- cal_weights(M = M,svd.M = svd.M,keepX = keepX[h],keepY = keepY[h],groupX = groupX,
groupY = groupY,subgroupX = subgroupX,subgroupY = subgroupY,alpha1.x = alpha1.x[h],
alpha2.x = alpha2.x[h],alpha1.y = alpha1.y[h],alpha2.y = alpha2.y[h],tol, max.iter,
lambda.x = lambda.x[h], lambda.y = lambda.y[h])
x_weights[,h] <- weights$x
y_weights[,h] <- weights$y
# calculate loadings, scores and deflate X & Y
x_scores[,h] <- x_score <- X_h %*% weights$x
y_scores[,h] <- y_score <- Y_h %*% weights$y
# Singular value type value
singular_vals[h] <- weights$d
### Deflation step
proj_x <- diag(1,n,n) - tcrossprod(x_score)/drop(crossprod(x_score))
X_h <- proj_x %*% X_h
if (mode=="regression") {
Y_h <- proj_x %*% Y_h
}
else
{
proj_y <- diag(1,n,n) - tcrossprod(y_score)/drop(crossprod(y_score))
Y_h <- proj_y %*% Y_h
}
}
cl = match.call()
# Save parameters for quick code in perf and tunning functions
parameters <- list(X=X,
Y=Y,
ncomp=ncomp,
max.iter = max.iter,
mode=mode,
keepX=keepX,
keepY=keepY,
groupX = groupX, groupY = groupY,
subgroupX = subgroupX, subgroupY = subgroupY,
indiv_sparsity_x = indiv_sparsity_x, subgroup_sparsity_x = subgroup_sparsity_x,
indiv_sparsity_y = indiv_sparsity_y, subgroup_sparsity_y = subgroup_sparsity_y,
tol = tol, scale.x = scale.x, scale.y = scale.y,
X.residuals = X_h, Y.residuals = Y_h)
result <- list(call = cl, weights = list(X = x_weights, Y = y_weights), scores = list(X = x_scores, Y = y_scores),
names = list(X = colnames(X),Y = colnames(Y), indiv = rownames(X)), singular_vals = singular_vals, parameters=parameters)
class(result) = c("sgspls")
return(invisible(result))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.