cv <- function(y,
X,
rand,
id,
data,
lmeUpdate = FALSE,
K = 5,
se1 = FALSE,
smoothInit = NULL,
pathLength = 20,
useTestRanEff = TRUE,
tauMax = NULL,
revPath = FALSE,
paramOrder = NULL,
epsilonAbs = 1e-4,
epsilonRel = 1e-4,
iterMax = 1e3,
centerZ = FALSE,
verbose = 2) {
#' cross validation for l1-penalized additive mixed models
#'
#' This function runs cross-validation one path at a time
#' to estimate the smoothing parameters
#' @param y character string denoting output variable
#' @param X list of fixed effect design matrices setup by l1smooth
#' @param rand list of random effect terms setup by randomEffects
#' @param id character string denoting subject id variable
#' @param data dataset
#' @param se1 TRUE/FALSE if true, sets smoothing parameter to value with
#' maximum CV error within 1 sd of the value that minimizes CV error
#' @param smoothInit vector of initial smoothing parameter values
#' @param pathLength length of each smoothing parameter path
#' (evenly spaced on the log scale)
#' @param useTestRanEff TRUE/FALSE if true, estimates random effects with test sample
#' @param tauMax scalar max value to use in tau path. If NULL, set to maximum
#' value used for fixed effect smoothing parameters.
#' @param revPath TRUE/FALSE if true, estimates smoothing paths from smallest to largest
#' @param paramOrder order of smoothing parameters to be fit
#' @param epsilonAbs absolute error in ADMM (tolerance for convergence)
#' @param epsilonRel relative error in ADMM (tolerance for convergence)
#' @param iterMax maximum number of ADMM iterations
#' @param centerZ TRUE/FALSE If True, imposes centering constraints on random effects
#' @param verbose integer (0, 1, 2) amount of information printed to terminal. 0 = no information, 1 = path iteration, 2 = path iteration + fold results
#' @keywords ADMM, cross validation
#' @export
#' @examples
#' library(psplinesl1)
#' data(simData)
#'
#' # setup p-spline matrices
#' X <- list(ps(x = "x", data = simData,
#' norder = 2, k = 1, width = 0.05,
#' center = TRUE))
#'
#' # setup random effect matrices
#' rand <- re(x = "x", id = "id", data = simData,
#' randomCurves = FALSE)
#'
#' # run cross validation and view paths
#' cvOut <- cv(y = "y", id = "id", X = X, rand = rand,
#' K = 5,
#' pathLength = 20,
#' data = simData)
#' plot(cvOut)
#'
#' # fit model with all data
#' a1 <- admm(y = "y", X, Z = rand$Z, S = rand$S,
#' tau = cvOut$smoothOpt[1],
#' lambda = cvOut$smoothOpt[2:(length(X)+1)],
#' rho = min(5, max(cvOut$smoothOpt)),
#' data = simData)
#'
#' # get and plot fitted model with confidence bands
#' CI <- ci(model = a1, alpha = 0.05)
#' plot(CI)
#'
#' # extract values from ci object for custom plotting
#' CIpoly <- data.frame(x = c(CI[[1]]$x, rev(CI[[1]]$x)),
#' y = c(CI[[1]]$lower, rev(CI[[1]]$upper)))
#'
#' ggplot(aes(x = x, y = y), data = newDat)+
#' geom_polygon(data = CIpoly, fill = "grey")+
#' geom_line(aes(x = CI[[1]]$x, y = CI[[1]]$smooth))
ycol <- which(colnames(data) == y)
J <- length(X) # number of smooths
beta0 <- mean(data[, ycol])
lambdaMax <- rep(NA, J)
for (j in 1:J) {
lambdaMax[j] <- max(abs(ginv(tcrossprod(X[[j]]$D, X[[j]]$D)) %*% X[[j]]$D %*%
ginv(crossprod(X[[j]]$F, X[[j]]$F)) %*% t(X[[j]]$F) %*% (data[, ycol] - beta0)))
}
# By default use largest lambdaMax for tau (random effects)
# This might not be appropriate for all data
if(is.null(tauMax)) {
tauLambdaMax <- c(max(lambdaMax), lambdaMax)
} else {
tauLambdaMax <- c(tauMax, lambdaMax)
}
# TODO: allow for different smoothing paths values
smoothPath <- lapply(tauLambdaMax, function(x){
exp(seq(log(x),
log(x*1e-5),
length.out = pathLength))})
if (revPath) {
smoothPath <- lapply(smoothPath, rev)
}
idCol <- which(colnames(data) == id)
uniqId <- unique(data[, idCol])
# if indicator variable, setup folds with subjects in level
byVars <- unlist(sapply(X, function(x) {x$by}))
if (is.null(byVars)) {
# Create folds without regard to having subjects from each cell in each fold
fold <- list()
nid <- floor(length(uniqId) / K)
for (k in 1:K) {
if (k < K) {
fold[[k]] <- uniqId[((k-1)*nid + 1):(k*nid)]
} else {
fold[[k]] <- uniqId[((k-1)*nid + 1):length(uniqId)]
}
}
} else {
# split cells evenly across folds
byCols <- which(colnames(data) %in% byVars)
byIsInd <- sapply(byCols, function(i) {
length(unique(data[, i])) == 2})
# get variable patterns for each id
varPatterns <- matrix(NA, nrow = length(uniqId), ncol = sum(byIsInd))
rownames(varPatterns) <- uniqId
colnames(varPatterns) <- byVars[byIsInd]
for(i in 1:length(uniqId)) {
varPatterns[i, ] <- data[which(data[, idCol] == uniqId[i])[1],
c(byCols[byIsInd])]
}
# get unique variable patterns
uniqVarPatterns <- as.data.frame(unique(varPatterns))
rownames(uniqVarPatterns) <- NULL
# get ids with each unique variable pattern
ids <- list()
for(i in 1:nrow(uniqVarPatterns)) {
ind <- rep(TRUE, nrow(varPatterns))
for (d in 1:ncol(uniqVarPatterns)) {
ind <- ind * (varPatterns[, d] == uniqVarPatterns[i, d])
}
ids[[i]] <- rownames(varPatterns)[which(ind == 1)]
}
# to do: give warning if not enough subjects in each cell
minCellInd <- which.min(sapply(ids, length))
minCell <- min(sapply(ids, length))
if(floor(minCell / K) < 2) {
stop(paste("Not enough subjects for each combination of factor levels
to have at least 2 in each fold (only ", minCell, " subjects in smallest
cell). Try reducing the number of folds to K <= ", floor(minCell/2),".", sep = "")
)
}
# tab <- xtabs(as.formula(paste("~", paste(colnames(varPatterns), collapse = " + "), sep = " ")), data = mat)
fold <- list()
nid <- floor(sapply(ids, length) / K)
for (k in 1:K) {
if (k < K) {
fold[[k]] <- do.call(c,
lapply(1:length(ids), function(i) {
ids[[i]][((k-1)*nid[i] + 1):(k*nid[i])]
}))
} else {
fold[[k]] <- do.call(c,
lapply(1:length(ids), function(i) {
ids[[i]][((k-1)*nid[i] + 1):length(ids[[i]])]
}))
}
}
}
if (is.null(smoothInit)) {
smoothOpt <- rep(0, length(smoothPath))
# smoothOpt <- c(min(sapply(smoothPath, min), sapply(smoothPath, min))
} else {
smoothOpt <- smoothInit
}
names(smoothOpt) <- c("tau", paste("lambda", 1:J, sep = ""))
CVlong <- data.frame(param = factor(rep(c("tau", paste("lambda[", 1:J, "]", sep = "")),
each = pathLength),
levels = c("tau",
paste("lambda[", 1:J, "]", sep = ""))),
pathVal = do.call(c, smoothPath),
cv = NA,
sd = NA)
minInd <- rep(NA, J + 1)
if (is.null(paramOrder)) {
cvOrder <- 1:(J+1)
} else if(is.numeric(paramOrder) &
length(paramOrder) == J + 1 &
length(unique(paramOrder)) == J + 1) {
cvOrder <- paramOrder
} else {
stop("paramOrder must be either NULL or a vector of length J+1 that
contains each of 1 through J+1 exactly once")
}
paramNames <- c("tau", paste("lambda", 1:J, sep = ""))
if (lmeUpdate) {
cvOrder <- cvOrder - 1
cvOrder <- cvOrder[which(cvOrder > 0)]
smoothOpt <- smoothOpt[-1]
paramNames <- paramNames[-1]
smoothPath <- smoothPath[-1]
CVlong <- CVlong[which(CVlong$param != "tau"), ]
minInd <- minInd[-1]
}
# separate warm starts for each fold
warm <- vector("list", K)
for (j in cvOrder) {
for (pathIter in 1:length(smoothPath[[j]])) {
tauLambda <- smoothOpt
tauLambda[j] <- smoothPath[[j]][pathIter]
names(tauLambda) <- paramNames
if(verbose >= 1) {
print("")
print(paste(paramNames[j],
", iteration ", pathIter, " of ",
pathLength, sep = ""))
print(tauLambda)
}
rho <- min(5, max(tauLambda))
# k-fold cross-validation
cvTemp <- rep(0, K)
for (k in 1:K) {
testRec <- (data$id %in% fold[[k]])
train <- data[!testRec, ]
test <- data[testRec, ]
nTrain <- table(train$id)
nTest <- table(test$id)
Xtrain <- Xtest <- X
for (l in 1:J) {
Xtrain[[l]]$F <- X[[l]]$F[!testRec, ]
Xtest[[l]]$F <- X[[l]]$F[testRec, ]
}
randTrain <- list()
randTrain$Z <- rand$Z[which(!rownames(rand$Z) %in% fold[[k]]),
which(!colnames(rand$Z) %in% fold[[k]])]
randTrain$S <- rand$S[which(!rownames(rand$S) %in% fold[[k]]),
which(!colnames(rand$S) %in% fold[[k]])]
if (lmeUpdate) {
tau = NULL
lambda = tauLambda
} else {
tau = tauLambda[1]
lambda = tauLambda[2:(J + 1)]
}
m1 <- admm(y = y, id = id, X = Xtrain, rand = randTrain,
tau = tau,
lambda = lambda,
rho = rho,
epsilonAbs = epsilonAbs,
epsilonRel = epsilonRel,
iterMax = iterMax,
warm = warm[[k]],
data = train,
lmeUpdate = lmeUpdate,
centerZ = centerZ,
forCV = TRUE)
yTestHat <- m1$warm$beta0
for (l in 1:J) {
yTestHat <- yTestHat + Xtest[[l]]$F %*% m1$warm$beta[[l]]
}
if (useTestRanEff) {
Ztest <- rand$Z[which(rownames(rand$Z) %in% fold[[k]]),
which(colnames(rand$Z) %in% fold[[k]])]
Stest <- rand$S[which(rownames(rand$S) %in% fold[[k]]),
which(colnames(rand$S) %in% fold[[k]])]
bTest <- solve(crossprod(Ztest) + tauLambda[1] * Stest,
crossprod(Ztest, test[, ycol] - yTestHat))
yTestHat <- yTestHat + Ztest %*% bTest
}
cvTemp[k] <- sum((test[, ycol] - yTestHat)^2)
warm[[k]] <- m1$warm
if(verbose >= 2) {
print(paste("fold: ", k,
", ADMM iterations: ", m1$conv$iter,
", cumulative CV error: ", signif(sum(cvTemp), 4),
", training error: ", signif(sum(m1$fit$residuals^2), 4),
sep = ""))
}
}
CVlong$cv[(j-1)*pathLength + pathIter] <- sum(cvTemp)
CVlong$sd[(j-1)*pathLength + pathIter] <- sd(cvTemp)
}
smoothjcv <- CVlong$cv[((j-1)*pathLength + 1):(j*pathLength)]
smoothjsd <- CVlong$sd[((j-1)*pathLength + 1):(j*pathLength)]
minInd[j] <- which.min(smoothjcv)
if (se1) {
# TODO: fix
ind1se <- which.max(smoothjcv <= smoothjcv[minInd[j]] + smoothjsd[minInd[j]])
smoothOpt[j] <- smoothPath[[j]][ind1se]
} else {
# in case of tie, largest parameter selected (smallest df)
smoothOpt[j] <- smoothPath[[j]][which.min(smoothjcv)]
}
}
formattedNames <- paramNames
lambdaInd <- grep("lambda", formattedNames)
formattedNames[lambdaInd] <- paste0("lambda[",1:length(lambdaInd), "]")
cvVlines <- data.frame(pathVal = sapply(1:length(smoothPath), function(l) {
smoothPath[[l]][minInd[l]]}),
param = formattedNames)
gg <- ggplot(aes(x = log(pathVal), y = cv), data = CVlong)+
geom_point()+
geom_errorbar(aes(ymax = cv + sd, ymin = cv - sd))+
facet_wrap(~param, scale = "free", labeller = label_parsed)+
theme_bw(28)+
geom_vline(aes(xintercept = log(pathVal)),
size = 1, linetype = "dashed", data = cvVlines)+
labs(x = "log(smoothing parameter)", y = "CV error")
ret <- list(smoothOpt = smoothOpt, CVlong = CVlong, smoothPath = smoothPath,
minInd = minInd, cvVlines = cvVlines, gg = gg)
class(ret) <- "cv"
return(ret)
}
plot.cv <- function(out){
#' Plot function for cv
#'
#' This function plots the smoothing parameter paths
#' and prints the optimal smoothing parameters
#' @param out output from the cv function
#' @keywords cv print
#' @export
print(out$smoothOpt)
print(out$gg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.