#' Cross-Validation of BB Learning
#'
#' Run multiple fittings of \code{bbl} model with training/validation
#' division of data
#'
#' The \code{data} slot of \code{object} is split into training and validation
#' subsets of (\code{nfold}-1):1 ratio. The model is trained with the
#' former and validated on the latter. Individual division/fold results are
#' combined into validation result for all instances in the data set and
#' prediction score is evaluated using the known response group
#' identity.
#'
#' @param formula Formula for model. Note that intercept has no effect.
#' @param data Data frame of data. Column names must match \code{formula}.
#' @param freq Frequency vector of how many times each row of \code{data} must
#' be repeated. If \code{NULL}, defaults to vector of 1s.
#' Fractional values are not supported.
#' @param novarOk Proceed even when there are predictors with only one factor level.
#' @param lambda Vector of L2 penalizer values for \code{method = 'pseudo'}. Inferences
#' will be repeated for each value. Restricited to non-negative values.
#' @param lambdah L2 penalizer in \code{method = 'pseudo'} applied to
#' parameter \code{h}. In contrast to \code{lambda},
#' only a single value is allowed.
#' @param eps Vector of regularization parameters, \eqn{\epsilon\in[0,1]},
#' for \code{method = 'mf'}. Inference will be repeated
#' for each value.
#' @param nfold Number of folds for training/validation split.
#' @param method \code{c('pseudo','mf')} for pseudo-likelihood maximization or
#' mean field.
#' @param naive Naive Bayes (no interactions). Equivalent to \code{method = 'mf'}
#' together with \code{eps = 0}.
#' @param use.auc Use AUC as the measure of prediction accuracy. Only works
#' if response groups are binary. If \code{FALSE}, mean prediction group
#' accuracy will be used as score.
#' @param verbose Verbosity level. Downgraded when relayed into \code{\link{bbl}}.
#' @param progress.bar Display progress bar in \code{\link{predict}}.
#' @param storeOpt Store the optimal fitted object of class \code{\link{bbl}}.
#' @param ... Other parameters to \code{\link{mlestimate}}.
#' @return Object of class \code{cv.bbl} extending \code{\link{bbl}}, a list
#' with extra components
#' \item{regstar}{Value of regularization parameter, \code{lambda}
#' and \code{eps} for \code{method='pseudo'} and \code{method='mf'},
#' respectively, at which the accuracy score is maximized}
#' \item{maxscore}{Value of maximum accuracy score}
#' \item{cvframe}{Data frame of regularization parameters and scores scanned}
#' The components of \code{\link{bbl}} store the optimal model trained
#' if \code{storeOpt=TRUE}.
#' @examples
#' set.seed(513)
#' m <- 5
#' n <- 100
#' predictors <- list()
#' for(i in 1:m) predictors[[i]] <- c('a','c','g','t')
#' names(predictors) <- paste0('v',1:m)
#' par <- list(randompar(predictors), randompar(predictors, h0=0.1, J0=0.1))
#' dat <- randomsamp(predictors, response=c('ctrl','case'), par=par, nsample=n)
#' cv <- crossVal(y ~ .^2, data=dat, method='mf', eps=seq(0.1,0.9,0.1))
#' cv
#' @export
crossVal <- function(formula, data, freq=NULL, novarOk=FALSE,
lambda=1e-5, lambdah=0, eps=0.9, nfold=5,
method='pseudo', naive=FALSE, use.auc=TRUE,
verbose=1, progress.bar=FALSE, storeOpt=TRUE, ...){
cl <- match.call()
if(missing(data)) data <- environment(formula)
if(!is.null(freq)){
if(length(freq)!=NROW(data))
stop('Length of freq does not match data')
zero <- freq==0
data <- data[!zero,]
freq <- freq[!zero] # remove rows with zero freq
}
# data <- as.data.frame(lapply(data, function(x) if(is.numeric(x)) factor(x)))
term <- terms(formula, data=data)
idy <- attributes(term)$response
vars <- as.character(attributes(term)$variables)
resp <- vars[[idy+1]]
vars <- vars[!(vars %in% c('list',resp))]
# xlevels <- .getXlevels(term, m=data)
xlevels <- getxlevels(vars, data=data)
formula <- formula(term)
for(i in seq_along(xlevels)){
if(length(xlevels[[i]])==1){
mesg <- paste0('Predictor ',names(xlevels)[i],' has one level')
if(novarOk)
warning(mesg)
else
stop(mesg)
}
}
m <- length(xlevels)
y <- data[,resp]
yx <- data[,c(resp,names(xlevels))]
groups <- levels(factor(y))
Ly <- length(groups)
if(Ly==1) warning('Only one response group in data')
if(Ly!=2) use.auc <- FALSE
if(!is.null(freq)){
if(!all.equal(freq, as.integer(freq))) stop('Non-integer freq')
if(length(freq)!=NROW(data))
stop('Length of freq does not match data')
yx <- freq2raw(data=yx, freq=freq)
y <- yx[,resp]
}
if(naive){
method <- 'mf'
eps <- 0
}
if(method=='pseudo'){
reglist <- lambda
if(length(lambdah)>1)
stop('Only a single value of lambdah allowed')
}
else if(method=='mf'){
if(!naive) reglist <- eps
else reglist <- 0
}
else stop('Unknown method')
res <- NULL # data frame of cv result
bbopt <- NULL # optimally trained bbl object
maxscore <- -Inf
regstar <- 0
for(reg in reglist){
if(is.null(lambdah)) regh <- reg
else regh <- lambdah
if(verbose>0){
if(method=='pseudo'){
if(regh>0)
cat('Cross validation under lambda = (',reg,
',',regh,')\n',sep='')
else
cat('Cross validation under lambda = ',reg,'\n',sep='')
}
else cat('Cross validation under epsilon = ',reg,'\n',sep='')
}
pred <- NULL
for(k in seq_len(nfold)){
if(verbose>0) cat(' Fold no. ',k,'...\n',sep='')
itrain <- ival <- NULL
for(iy in seq_len(Ly)){
idy <- which(y==groups[iy])
ns <- length(idy)
nval <- max(1,floor(ns/nfold))
imax <- k*nval
if(imax > ns | k==nfold) imax <- ns
if(imax < 1) imax <- 1
iyval <- idy[seq((k-1)*nval+1, imax)]
iytrain <- idy[!idy %in% iyval]
ival <- c(ival, iyval)
itrain <- c(itrain, iytrain)
}
if(sum(is.na(ival)>0) | sum(is.na(itrain)>0))
stop('Error in train/validation division')
if(length(ival)==0 | length(itrain)==0)
stop('Sample size too small for nfold requested')
dval <- yx[ival, ,drop=FALSE]
dtrain <- yx[itrain, ,drop=FALSE]
if(method=='pseudo')
obtrain <- bbl(formula, data=dtrain, xlevels=xlevels,
method=method, novarOk=TRUE,
testNull=FALSE, lambda=reg,
verbose=verbose-1, lambdah=regh, ...)
else
obtrain <- bbl(formula, data=dtrain, xlevels=xlevels,
method=method, novarOk=TRUE,
testNull=FALSE, eps=reg,
verbose=verbose-1, lambdah=lambdah, ...)
pr <- predict(object=obtrain, newdata=dval, logit=!use.auc,
progress.bar=progress.bar, verbose=verbose-1)
yxv <- data.frame(ytrue=factor(yx[ival,1],levels=groups))
pred <- rbind(pred, cbind(yxv, pr))
}
if(use.auc){
score <- pROC::roc(response=pred[,1], levels=groups,
predictor=pred[,3], direction='<')$auc
if(verbose>0) cat(' AUC = ',score,'\n',sep='')
}
else{
score <- mean(pred[,1]==pred$yhat)
if(verbose>0) cat(' Prediction score = ',score,'\n',sep='')
}
if(method=='pseudo'){
if(use.auc) rx <- data.frame(lambda=reg, auc=score)
else rx <- data.frame(lambda=reg, score=score)
} else{
if(use.auc) rx <- data.frame(epsilon=reg, auc=score)
else rx <- data.frame(epsilon=reg, score=score)
}
if(score > maxscore){
maxscore <- score
regstar <- reg
if(storeOpt) bbopt <- obtrain
}
res <- rbind(res, rx)
}
cv <- c(bbopt, list(regstar=regstar, maxscore=maxscore, cvframe=res))
class(cv) <- 'cv.bbl'
return(cv)
}
getxlevels <- function(vars, data){
m <- length(vars)
xlevels <- vector('list',m)
names(xlevels) <- vars
for(i in seq_along(vars)){
if(!vars[i] %in% colnames(data))
stop(paste0("'",vars[i],"' not in data"))
v <- data[,vars[i]]
if(is.factor(v) | is.character(v)){
lv <- levels(factor(v))
xlevels[[i]] <- lv[order(lv)]
}
else{
v <- unique(v)
xlevels[[i]] <- v[order(v)]
}
}
return(xlevels)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.