#source("./TSLA.fit.R")
#source("./tools.R")
#' Cross validation for TSLA
#'
#' Conduct cross validation to select the optimal tuning parameters in TSLA.
#'
#'
#' @param y Response in matrix form, continuous for \code{family = "ls"} and binary (0/1)
#' for \code{family = "logit"}.
#' @param X_1 Design matrix for unpenalized features (excluding the intercept).
#' Must be in matrix form.
#' @param X_2 Expanded design matrix for \code{penalty = "CL2"}; original design
#' matrix for \code{penalty = "RFS-Sum"}. Must be in matrix form.
#' @param treemat Expanded tree structure in matrix form for
#' \code{penalty = "CL2"}; original structure for \code{penalty = "RFS-Sum"}.
#' @param family Two options: "ls" for least squares problems and "logit"
#' for logistic regression problems.
#' @param penalty Two options for group penalty on \eqn{\gamma}, "CL2" or "RFS-Sum".
#' @param pred.loss Model performance metric. If \code{family = "ls"}, the default
#' is "MSE" (mean squared error). If \code{family = "logit"}, the default is "AUC";
#' "deviance" is also available for the logistic model.
#' @param gamma.init Initial value for the optimization. Default is a zero vector.
#' The length should equal 1 + \code{ncol(X_1)} + \code{ncol(A)}.
#' See the details of A in \code{get_tree_obj()}.
#' @param weight A vector of length two, used for logistic regression
#' only. The first element is the weight of y = 1 and the
#' second element is the weight of y = 0.
#' @param nfolds Number of cross validation folds. Default is 5.
#' @param group.weight User-defined weights for the group penalty. Must be a vector
#' whose length equals the number of groups.
#' @param feature.weight User-defined weights for each predictor after expansion.
#' @param control A list of parameters controlling algorithm convergence. Default values:
#' \code{tol = 1e-5}, convergence tolerance; \code{maxit = 10000},
#' maximum number of iterations; \code{mu = 1e-3}, smoothness parameter in SPG.
#' @param modstr A list of parameters controlling the tuning parameters. Default values:
#' \code{lambda = NULL}; if lambda is not provided, a default sequence is generated;
#' \code{lambda.min.ratio = 1e-04}, smallest value of lambda as
#' a fraction of lambda.max (used when lambda is NULL);
#' \code{nlambda = 50},
#' number of lambda values (equally spaced on the log scale) used when lambda is NULL;
#' \code{alpha = seq(0, 1, length.out = 10)}, sequence of alpha, where alpha is the
#' tuning parameter for the generalized lasso penalty and 1 - alpha is the tuning
#' parameter for the group lasso penalty.
#'
#' @return A list of cross validation results.
#' \item{lambda.min}{\eqn{\lambda} value with best prediction performance.}
#' \item{alpha.min}{\eqn{\alpha} value with best prediction performance.}
#' \item{cvm}{A (number-of-lambda * number-of-alpha) matrix saving the means of cross validation loss across folds.}
#' \item{cvsd}{A (number-of-lambda * number-of-alpha) matrix saving standard deviations of
#' cross validation loss across folds.}
#' \item{TSLA.fit}{Outputs from \code{TSLA.fit()}.}
#' \item{Intercept.min}{Intercept corresponding to \code{(lambda.min,alpha.min)}.}
#' \item{cov.min}{Coefficients of unpenalized features
#' corresponding to \code{(lambda.min,alpha.min)}.}
#' \item{beta.min}{Coefficients of binary features corresponding
#' to \code{(lambda.min,alpha.min)}.}
#' \item{gamma.min}{Node coefficients corresponding to \code{(lambda.min,alpha.min)}.}
#' \item{groupnorm.min}{Group norms of node coefficients corresponding to \code{(lambda.min,alpha.min)}.}
#' \item{lambda.min.index}{Index of the best \eqn{\lambda} in the sequence.}
#' \item{alpha.min.index}{Index of the best \eqn{\alpha} in the sequence.}
#'
#' @export
#'
#'
#' @examples
#' # Load the synthetic data
#' data(ClassificationExample)
#'
#' tree.org <- ClassificationExample$tree.org # original tree structure
#' x2.org <- ClassificationExample$x.org # original design matrix
#' x1 <- ClassificationExample$x1
#' y <- ClassificationExample$y # response
#'
#' # Do the tree-guided expansion
#' expand.data <- getetmat(tree.org, x2.org)
#' x2 <- expand.data$x.expand # expanded design matrix
#' tree.expand <- expand.data$tree.expand # expanded tree structure
#'
#' # Do train-test split
#' idtrain <- 1:200
#' x1.train <- as.matrix(x1[idtrain, ])
#' x2.train <- x2[idtrain, ]
#' y.train <- y[idtrain, ]
#' x1.test <- as.matrix(x1[-idtrain, ])
#' x2.test <- x2[-idtrain, ]
#' y.test <- y[-idtrain, ]
#'
#' # specify some model parameters
#' set.seed(100)
#' control <- list(maxit = 100, mu = 1e-3, tol = 1e-5, verbose = FALSE)
#' modstr <- list(nlambda = 5, alpha = seq(0, 1, length.out = 5))
#' simu.cv <- cv.TSLA(y = y.train, X_1 = x1.train,
#'                    X_2 = x2.train,
#'                    treemat = tree.expand, family = 'logit',
#'                    penalty = 'CL2', pred.loss = 'AUC',
#'                    gamma.init = NULL, weight = c(1, 1), nfolds = 5,
#'                    group.weight = NULL, feature.weight = NULL,
#'                    control = control, modstr = modstr)
#' # Predict on the test set with the selected tuning parameters
#' # and report the AUC on the test set.
#' rmid <- simu.cv$TSLA.fit$rmid  # all-zero columns removed in training
#' if(length(rmid) > 0){
#'   x2.test <- x2.test[, -rmid]
#' }
#' y.new <- predict_cvTSLA(simu.cv, x1.test, x2.test)
#' library(pROC)
#' auc(as.vector(y.test), as.vector(y.new))
#'
#'
cv.TSLA <- function(y, X_1 = NULL, X_2, treemat, family = c('ls', 'logit'),
                    penalty = c('CL2', 'RFS-Sum'),
                    pred.loss = c('MSE', 'AUC', 'deviance'),
                    gamma.init = NULL, weight = NULL, nfolds = 5,
                    group.weight = NULL, feature.weight = NULL,
                    control = list(), modstr = list()){
  ## X_1: covariate matrix
  ## X_2: expanded design matrix
  Call <- match.call()
  ## set control values for model fitting
  control <- do.call("TSLA.control", control)
  ## set tuning-parameter values
  modstr <- do.call("TSLA.modstr", modstr)
  ## get choice of penalty form and model family
  penalty <- match.arg(penalty)
  family <- match.arg(family)
  ## make sure the default for logistic regression is AUC
  if(family == 'logit' && pred.loss[1] == 'MSE'){
    pred.loss <- 'AUC'
  }
  ## get choice of loss function
  pred.loss <- match.arg(pred.loss)
  y <- as.matrix(y)
  X_2 <- as.matrix(X_2)
  if(!is.null(X_1)){
    X_1 <- as.matrix(X_1)
  }
  # run TSLA.fit for the first time to get the whole solution path
  TSLA.object <- TSLA.fit(y, X_1 = X_1, X_2 = X_2, treemat = treemat,
                          family = family, penalty = penalty,
                          gamma.init = gamma.init, weight = weight,
                          group.weight = group.weight,
                          feature.weight = feature.weight,
                          control = control, modstr = modstr)
  ## get lambda and alpha range
  # get alpha sequence
  alpha <- TSLA.object$alpha.seq
  nalpha <- length(alpha)
  # get the lambda sequence
  lambda <- TSLA.object$lambda.seq
  nlambda <- modstr$nlambda
  modstr$lambda <- lambda
  modstr$alpha <- alpha
  n <- nrow(X_2)
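  # lambda and alpha are now fixed in modstr, so every fold below is fit on
  # the same (lambda, alpha) grid and the fold losses can be averaged
  # cell by cell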
  # do cross-validation
  if(family == 'logit'){
    foldid0 <- sample(rep(seq(nfolds), length = n - sum(y)))
    foldid1 <- sample(rep(seq(nfolds), length = sum(y)))
    foldid <- rep(0, n)
    foldid[which(y == 0)] <- foldid0
    foldid[which(y == 1)] <- foldid1
  }else{
    foldid <- sample(rep(seq(nfolds), length = n))
  }
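  # (for 'logit', the 0/1 classes are assigned to folds separately above,
  # so each fold keeps roughly the overall class balance)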
  outlist <- as.list(seq(nfolds))
  for(i in seq(nfolds)){
    # train/test split
    trainid <- which(foldid != i)
    if(!is.null(X_1)){
      # drop = FALSE keeps X_1 a matrix when it has a single column
      X_1.train <- X_1[trainid, , drop = FALSE]
    }else{
      X_1.train <- NULL
    }
    y.train <- as.matrix(y)[trainid, ]
    X_2.train <- X_2[trainid, ]
    outlist[[i]] <- TSLA.fit(y.train, X_1.train, X_2.train, treemat,
                             family = family, penalty = penalty,
                             gamma.init = gamma.init, weight = weight,
                             group.weight = group.weight,
                             feature.weight = feature.weight,
                             control = control, modstr = modstr)
  }
  ## compute the CV loss; dispatches on family to cv.ls() or cv.logit() below
  fun <- paste("cv", family, sep = ".")
  cvstuff <- do.call(fun, list(outlist, y, X_1, X_2, foldid, pred.loss))
  cvm <- cvstuff$cvm
  cvsd <- cvstuff$cvsd
  paramin <- cvstuff$paramin
  if(nrow(paramin) == 1){
    lambda.min.index <- paramin[1, 1]
    alpha.min.index <- paramin[1, 2]
  }else{
    # break ties: take the smallest lambda index first, then the largest
    # alpha to get more features aggregated
    lambda.min.index <- min(paramin[, 1])
    row.index <- which(paramin[, 1] == lambda.min.index)
    alpha.min.index <- max(paramin[row.index, 2])
  }
  lambda.min <- lambda[lambda.min.index]
  alpha.min <- alpha[alpha.min.index]
  ### get the corresponding coefficients from TSLA.fit
  coefs <- coef_TSLA(TSLA.object)
  Intercept.min <- coefs$Intercept[, lambda.min.index, alpha.min.index]
  if(is.null(X_1)){
    cov.min <- NULL
  }else{
    cov.min <- coefs$cov.coef[, lambda.min.index, alpha.min.index]
  }
  beta.min <- coefs$beta.coef[, lambda.min.index, alpha.min.index]
  gamma.min <- coefs$gamma.coef[, lambda.min.index, alpha.min.index]
  groupnorm.min <- TSLA.object$groupnorm[, lambda.min.index, alpha.min.index]
  out <- list(lambda.min = lambda.min, alpha.min = alpha.min,
              cvm = cvm, cvsd = cvsd,
              TSLA.fit = TSLA.object,
              Intercept.min = Intercept.min,
              cov.min = cov.min, beta.min = beta.min,
              gamma.min = gamma.min,
              groupnorm.min = groupnorm.min,
              lambda.min.index = lambda.min.index,
              alpha.min.index = alpha.min.index)
  return(out)
}
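## A minimal sketch of how the selected indices map back into the full
## solution path (illustration only; `cvfit` stands for a cv.TSLA() result):
# sol <- coef_TSLA(cvfit$TSLA.fit)
# sol$beta.coef[, cvfit$lambda.min.index, cvfit$alpha.min.index]  # cvfit$beta.min
# cvfit$cvm[cvfit$lambda.min.index, cvfit$alpha.min.index]        # best CV loss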
### internal functions
# sub-function to calculate the loss in each fold
# default for ls: MSE
cv.ls <- function(outlist, y, X_1 = NULL, X_2, foldid, pred.loss = 'MSE'){
  pred.loss <- match.arg(pred.loss)
  nfolds <- max(foldid)
  fitdim <- dim(outlist[[1]]$gammacoef)
  out <- array(0, dim = c(nfolds, fitdim[2], fitdim[3]))
  for(i in seq(nfolds)){
    fitobj <- outlist[[i]]
    testid <- which(foldid == i)
    ni <- length(testid)
    y.test <- as.matrix(y)[testid, ]
    if(!is.null(X_1)){
      X_1.test <- as.matrix(X_1[testid, ])
    }else{
      X_1.test <- NULL
    }
    # remove zero columns identified in the training set
    X_2.test <- X_2[testid, ]
    if(length(fitobj$rmid) > 0){
      X_2.test <- X_2.test[, -fitobj$rmid]
    }
    # get predictions at each (lambda, alpha) combination in the ith fold
    ypre <- predict_TSLA(fitobj, X_1.test, X_2.test, type = "response")
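    # ypre is (number of test obs) x nlambda x nalpha; collapsing the first
    # dimension gives this fold's MSE at each (lambda, alpha) pair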
    out[i, , ] <- apply(ypre, c(2, 3), FUN = function(x) sum((x - y.test)^2))/ni
  }
  cvm <- apply(out, c(2, 3), FUN = mean)
  cvsd <- apply(out, c(2, 3), FUN = sd)
  paramin <- which(cvm == min(cvm), arr.ind = TRUE)
  return(list(cvraw = out, cvm = cvm, cvsd = cvsd, paramin = paramin))
}
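# Note: cvm and cvsd are nlambda x nalpha matrices of fold means and standard
# deviations; paramin can hold several rows when multiple (lambda, alpha)
# pairs tie at the optimum, and cv.TSLA() breaks such ties above.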
# sub-function to calculate the loss in each fold
# default for logistic: AUC
cv.logit <- function(outlist, y, X_1 = NULL, X_2, foldid,
                     pred.loss = c('AUC', 'deviance')){
  pred.loss <- match.arg(pred.loss)
  nfolds <- max(foldid)
  fitdim <- dim(outlist[[1]]$gammacoef)
  out <- array(0, dim = c(nfolds, fitdim[2], fitdim[3]))
  for(i in seq(nfolds)){
    fitobj <- outlist[[i]]
    testid <- which(foldid == i)
    ni <- length(testid)
    y.test <- as.matrix(y)[testid, ]
    if(!is.null(X_1)){
      X_1.test <- as.matrix(X_1[testid, ])
    }else{
      X_1.test <- NULL
    }
    # remove zero columns identified in the training set
    X_2.test <- X_2[testid, ]
    if(length(fitobj$rmid) > 0){
      X_2.test <- X_2.test[, -fitobj$rmid]
    }
    # get predictions at each (lambda, alpha) combination in the ith fold
    ypre <- predict_TSLA(fitobj, X_1.test, X_2.test, type = "response")
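    # AUC is larger-is-better; 'deviance' accumulates the negative Bernoulli
    # log-likelihood -sum(log(p[y == 1])) - sum(log(1 - p[y == 0]))
    # (proportional to the binomial deviance), which is smaller-is-better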
    outi <- switch(pred.loss,
                   "AUC" = apply(ypre, c(2, 3),
                                 FUN = function(x) as.numeric(auc(y.test, x))),
                   "deviance" = apply(ypre, c(2, 3), FUN = function(x){
                     sum(-log(x[y.test == 1])) + sum(-log(1 - x[y.test == 0]))
                   }))
    out[i, , ] <- outi
  }
  cvm <- apply(out, c(2, 3), FUN = mean)
  cvsd <- apply(out, c(2, 3), FUN = sd)
  paramin <- switch(pred.loss,
                    "AUC" = which(cvm == max(cvm), arr.ind = TRUE),
                    "deviance" = which(cvm == min(cvm), arr.ind = TRUE))
  return(list(cvraw = out, cvm = cvm, cvsd = cvsd, paramin = paramin))
}