Nothing
#' Ordered Homogeneity Pursuit Lasso
#'
#' Fits the ordered homogeneity pursuit lasso (OHPL) model.
#'
#' @param x Predictor matrix.
#' @param y Response matrix with one column.
#' @param maxcomp Maximum number of components for PLS.
#' @param gamma A number between (0, 1) for generating the gamma sequence.
#' An usual choice for gamma could be `n * 0.05`, where `n` is a number
#' in 2, 3, ..., 19.
#' @param cv.folds Number of cross-validation folds.
#' @param G Maximum number of variable groups.
#' @param type Find the maximum absolute correlation (`"max"`)
#' or find the median of absolute correlation (`"median"`).
#' Default is `"max"`.
#' @param scale Should the predictor matrix be scaled? Default is `TRUE`.
#' @param pls.method Method for fitting the PLS model. Default is `"simpls"`.
#' See the details section in [pls::plsr()] for all possible options.
#'
#' @return A list of fitted OHPL model object with performance metrics.
#'
#' @importFrom glmnet cv.glmnet
#'
#' @export OHPL
#'
#' @references
#' You-Wu Lin, Nan Xiao, Li-Li Wang, Chuan-Quan Li, and Qing-Song Xu (2017).
#' Ordered homogeneity pursuit lasso for group variable selection with
#' applications to spectroscopic data.
#' _Chemometrics and Intelligent Laboratory Systems_ 168, 62--71.
#'
#' @examples
#' # Generate simulation data
#' dat <- OHPL.sim(
#' n = 100, p = 100, rho = 0.8,
#' coef = rep(1, 10), snr = 3, p.train = 0.5,
#' seed = 1010
#' )
#'
#' # Split training and test set
#' x <- dat$x.tr
#' y <- dat$y.tr
#' x.test <- dat$x.te
#' y.test <- dat$y.te
#'
#' # Fit the OHPL model
#' fit <- OHPL(x, y, maxcomp = 3, gamma = 0.5, G = 10, type = "max")
#'
#' # Selected variables
#' fit$Vsel
#'
#' # Make predictions
#' y.pred <- predict(fit, x.test)
#'
#' # Compute evaluation metric RMSEP, Q2 and MAE for the test set
#' perf <- OHPL.RMSEP(fit, x.test, y.test)
#' perf$RMSEP
#' perf$Q2
#' perf$MAE
OHPL <- function(
x, y, maxcomp, gamma, cv.folds = 5L,
G = 30L, type = c("max", "median"),
scale = TRUE, pls.method = "simpls") {
type <- match.arg(type)
X <- x
y <- y
n.cal <- length(y)
# Compute beta
beta <- single.beta(X = x, y = y, maxcomp = maxcomp)$"beta"
# Cluster variables with Fisher optimal partitions algorithm
C <- dlc(beta, maxk = G)$"C"
groups <- FOP(beta, G, C)
# Extract the prototypes from each group
prototype <- proto(x, y, groups, type = type)
# X here should be the original X, not the normalized X
X.pro <- x[, prototype]
# Lasso with glmnet (with default lambdas)
lasso.cv <- suppressWarnings(cv.glmnet(X.pro, y, nfolds = cv.folds))
# optnum: select the model with minimal cv error
lasso.optnum <- which.min(lasso.cv$cvm)[1]
lasso.cv.min <- sqrt(min(lasso.cv$cvm)) # cv.min
lasso.optlambda <- lasso.cv$lambda.min # optlambda
lasso.optbeta <- lasso.cv$glmnet.fit$beta[, lasso.optnum]
max.beta <- max(abs(lasso.optbeta))
# Edge case: when there are less groups, lasso tends to
# generate many near-zero coefficients (false positives)
# filter out variables here with hard thresholding.
index.beta <- (abs(lasso.optbeta) / max.beta) < gamma
lasso.optbeta[index.beta] <- 0
# Build PLS model with the selected prototype
# and the variable grouping information
index.nozero <- which(lasso.optbeta != 0)
Vsel <- NULL
for (i in 1L:length(index.nozero)) {
Vseli <- which(groups == index.nozero[i])
Vsel <- c(Vsel, Vseli)
}
# Vsel.idx[[j]] = Vsel
# use the selected variables to build the PLS model
# note that the X and y here are the original ones (not centered)
Xsel <- x[, Vsel]
if (dim(Xsel)[2L] == 0L) {
Vsel <- c(Vsel, prototype)
Xsel <- X.pro
print("The length of selected variables is zero")
}
temp.ncomp <- min((dim(Xsel)[1] - 1), dim(Xsel)[2])
# Xsel.idx[[j]] = Xsel.temp
# create a new data frame based on the selected
# varaibles (Xsel) and the original response (y)
plsdf <- as.data.frame(cbind(Xsel, y = y))
Vsel.maxcomp <- min(length(Vsel), maxcomp, temp.ncomp)
plsr.cvfit <- plsr(
y ~ .,
data = plsdf, ncomp = Vsel.maxcomp, scale = scale,
method = pls.method, validation = "LOO"
)
# Select the best component number using adjusted CV
opt.K <- which.min(pls::RMSEP(plsr.cvfit)[["val"]][2L, 1L, -1L])
# Store the minimal RMSE from CV
RMSECV <- min(pls::RMSEP(plsr.cvfit)[["val"]][2L, 1L, -1L])
# Store Q2.CV
Q2.cv <- R2(plsr.cvfit, estimate = "CV")[["val"]][, , opt.K + 1]
nVar <- length(Vsel)
variables <- matrix(0, nrow = ncol(X), ncol = 1)
variables[Vsel, ] <- 1
# Build an optimal PLS model
plsdf <- as.data.frame(cbind(Xsel, y = y))
plsr.fit <- plsr(
y ~ Xsel,
data = plsdf, ncomp = Vsel.maxcomp, scale = scale,
method = pls.method, validation = "none"
)
RMSEC <- min(pls::RMSEP(plsr.fit, estimate = "train")[["val"]][, , opt.K + 1])
Q2.c <- pls::R2(plsr.fit, estimate = "train")[["val"]][, , opt.K + 1]
res <- list(
"model" = plsr.fit, "opt.K" = opt.K,
"RMSECV" = RMSECV, "Q2.cv" = Q2.cv,
"RMSEC" = RMSEC, "Q2.c" = Q2.c,
"nVar" = nVar, "variables" = variables, "Vsel" = Vsel,
"groups" = groups
)
class(res) <- c("OHPL")
res
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.