Nothing
#' Parametric bootstrap testing to compare two models by analysis of variance (deviance)
#'
#' Compute analysis of variance (or deviance) tables for two fitted model objects,
#' with the \emph{P}-value estimated using a parametric bootstrap -- by repeatedly
#' simulating new responses from the fitted model under the null hypothesis.
#'
#' @param objectNull is the fitted model under the null hypothesis. This can be
#' \emph{any} object that responds to \code{simulate}, \code{update} and \code{anova}.
#' @param object is the fitted model under the alternative hypothesis. This can be
#' \emph{any} object that responds to \code{update}, \code{anova} and \code{model.frame}.
#' It must be larger than \code{objectNull} and its model frame should contain
#' all the terms required to fit \code{objectNull}.
#' @param n.sim the number of simulated datasets to generate for parametric
#' bootstrap testing. Defaults to 999.
#' @param rowRef the row number where the test statistic of interest can be found
#' in the standard \code{anova} output when calling \code{anova(objectNull,object)}.
#' Defaults to \code{2}, because it is on the second row for most common models.
#' @param colRef the column number where the test statistic of interest can be found
#' in the standard \code{anova} output when calling \code{anova(objectNull,object)}.
#' Default choices have been set for common models (\code{colRef=5} for \code{lm} objects,
#' \code{colRef=6} for \code{lmer} and \code{glmmTMB} objects and \code{colRef=4} otherwise, which is correct
#' for \code{glm} and \code{gam} objects).
#' @param ncpus the number of CPUs to use. Default (NULL) uses two less than the total
#' available.
#' @param ... further arguments sent through to \code{anova}.
#'
#' @details
#' The \code{anova} function typically relies on classical asymptotic results
#' which sometimes don't apply well, e.g. when using penalised likelihood to fit a
#' generalised additive model, or when testing whether a random effect should be
#' included in a mixed model. In such cases we can get a more accurate test by
#' using simulation to estimate the \emph{P}-value -- repeatedly simulating
#' new sets of simulated responses, refitting the null and alternative models, and
#' recomputing the test statistic. This process allows us to estimate the
#' distribution of the test statistic when the null hypothesis is true, hence
#' draw a conclusion about whether the observed test statistic is large relative
#' to what would be expected under the null hypothesis. The process is often
#' referred to as a \emph{parametric bootstrap} (Davison & Hinkley 1997), which
#' the \emph{PB} in the function name (\code{anovaPB}) is referring to.
#'
#' This function will take any pair of fitted models, a null (\code{objectNull})
#' and an alternative (\code{object}), and use simulation to estimate the
#' \emph{P}-value of the test of whether there is evidence against the model in
#' \code{objectNull} in favour of the model in \code{object}. This function works
#' by repeatedly performing an \code{anova} to compare \code{objectNull} to
#' \code{object}, where the responses in the \code{model.frame} have been replaced
#' with values simulated by applying \code{simulate} to \code{objectNull}. Hence
#' for this function to work, the objects being compared need to respond
#' to the \code{anova}, \code{simulate} and \code{model.frame} functions.
#'
#' This function needs to be able to find the test statistic in the \code{anova}
#' output, but it is stored in different places for different types of objects.
#' It is assumed that \code{anova(objectNull,object)} returns a matrix and that the
#' test statistic is stored in the \code{(rowRef,colRef)}th element, where both
#' \code{rowRef} and \code{colRef} have been chosen to be correct for common
#' model types (\code{glm}, \code{gam}, \code{lmer}).
#'
#' @return A matrix which has the appearance and behaviour of an object returned by
#' \code{anova(objectNull,object)}, except that the \emph{P}-value has been computed
#' by parametric bootstrap, and the matrix now inherits class \code{anovaPB}.
#'
#' @author David Warton <david.warton@@unsw.edu.au>
#' @references
#' Davison A.C. & Hinkley D. V. (1997) Bootstrap methods and their application, Cambridge University Press.
#' Warton DI (2022) Eco-Stats - Data Analysis in Ecology, from \emph{t}-tests to multivariate abundances. Springer, ISBN 978-3-030-88442-0
#'
#' @seealso \code{\link[stats]{anova}}
#' @examples
#' # fit a Poisson regression to random data:
#' y = rpois(50,lambda=1)
#' x = 1:50
#' rpois_glm = glm(y~x, family=poisson())
#' rpois_int = glm(y~1, family=poisson())
#' anovaPB(rpois_int, rpois_glm, n.sim=99, ncpus=1)
#' # this approach would run faster on some larger problems (but maybe not this one!):
#' \dontrun{anovaPB(rpois_int, rpois_glm, n.sim=99, ncpus=4)}
#'
#' @import stats
#' @importFrom parallel clusterApplyLB clusterExport detectCores makeCluster stopCluster
#' @importFrom methods .hasSlot
#' @export
anovaPB=function(objectNull, object, n.sim=999, colRef = switch(class(object)[1],"lm"=5,"lmerMod"=6,"glmmTMB"=6,4), rowRef=2, ncpus=NULL, ...)
{
# check the second model is the larger one... otherwise this will be a prob later
if(length(unlist(coef(objectNull)))>length(unlist(coef(object))))
stop(paste("Whoah, the first object fitted a larger model than the second object...","\n",
" it should be a smaller 'null' model fit, nested in the second object."))
# get dimnames of response
respDimnames = dimnames( model.response(model.frame(object)) )
# decide number of cores to use
if(is.null(ncpus))
{
ncpus=max(1,parallel::detectCores()-2)
parl = ifelse(ncpus==1,"no","snow")
}
# get the observed stat
targs <- match.call(expand.dots = FALSE)
anovaFn=anova
# statObs=try(anova(targs[[2]],targs[[3]],...)) #using targs[[c(2,3)]] instead of c(objectNull,object) so user names for models are in output
statObs=try(anova(objectNull,object,...))
if(inherits(statObs,"try-error"))
{
# OK so if that didn't work, let's define a new anova function via logLik
anovaFn=function(objectNull,object,...)
{
llAlt=logLik(object)
llNull=logLik(objectNull)
table=data.frame(df=c(attr(llNull,"df"),attr(llAlt,"df")),
deviance=-2*c(llNull,llAlt),
LRT = c(NA,-2*llNull+2*llAlt))
return(table)
}
# apply this function, add P-values column, change colRef to 3
statObs=anovaFn(objectNull,object)
statObs$P=c(NA,NA)
names(statObs)[4]='Pr(>LRT)'
colRef=3
# add model details so the printed output looks nice
modelnamelist = c(deparse(substitute(objectNull)),
deparse(substitute(object)))
Xnames <- list(paste(deparse(formula(objectNull),width.cutoff=500), collapse = "\n"),
paste(deparse(formula(object),width.cutoff=500), collapse = "\n"))
topnote <- paste(modelnamelist, ": ", Xnames, sep = "", collapse = "\n")
title <- "Analysis of Deviance Table\n"
rownames(statObs)=modelnamelist
attr(statObs, "heading") <- c(title, topnote)
}
# set up for the bootstrap: assign a stats vector and get the model frame
stats = rep(NA,n.sim+1)
stats[1]=statObs[rowRef,colRef]
# mf = model.frame(object)
if( inherits(object,c("lmerMod","glmerMod")) )
{
cll <- object@call
mf <- match.call(call=cll)
if(.hasSlot(object,"data"))
dat <- object@data
else
dat <- NULL
}
else
{
cll <- object$call
mf <- match.call(call=cll)
dat <- object$data
}
m <- match(c("formula", "data", "subset",
"weights", "na.action", "etastart",
"mustart", "offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
# mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
# try to coerce to a data frame
modelF <- try( eval(mf, parent.frame()), silent=TRUE )
# if for some reason this didn't work (mgcv::gam objects cause grief) then just call model.frame on object:
# also, do this for lme4 because it is so not a team player
if(inherits(modelF, "try-error") | inherits(object,c("lmerMod","glmerMod","glmmTMB")))
modelF <- model.frame(object)
respName <- names(model.frame(object))[1]
whichResp <- 1
# if data object available, add stuff to modelF that is not there... hack fix for offsets that can't be found by model.frame
if(is.null(dat)==FALSE)
if(is.list(dat)) # only try this when a list or data frame is provided
{
whichAdd = which( names(dat) %in% names(modelF) == FALSE)
if(length(whichAdd)>0)
for (iAdd in whichAdd)
if(is.list(dat[[iAdd]])==FALSE) #stuff added can't be a data frame!
modelF[[names(dat)[iAdd]]] = dat[[iAdd]]
}
# if there is an offset, add it, as a separate argument when updating
offs=NULL
modelF$offs=try(model.offset(modelF))
# if response has brackets in its name, it is some sort of expression,
# put quotes around it so it works (?)
if(regexpr("(",respName,fixed=TRUE)>0)
{
newResp = sprintf("`%s`", respName)
fm.update = reformulate(".", response = newResp)
}
else
fm.update = reformulate(".")
is.mva = ncol(as.matrix(modelF[[whichResp]]))>1
# now n.sim times, simulate new response, refit models and get anova again
yNew = simulate(objectNull,n.sim)
getStat = function(iSim, yNew, objectNull=objectNull, object=object, modelF=modelF, anovaFn=anovaFn, is.mva=is.mva, fm.update=fm.update,
whichResp=whichResp, respDimnames=respDimnames, rowRef=rowRef, colRef=colRef)
{
modelF[[whichResp]] =
if(is.mva)
yNew[,,iSim]
else
as.matrix(yNew[,iSim], dimnames=respDimnames) #matrix to fix lme4 issues
if(inherits(modelF$offs,"try-error") | is.null(modelF$offs))
{
objectiNull = update(objectNull, formula=fm.update, data=modelF)
objecti = update(object, formula=fm.update, data=modelF)
}
else
{
objectiNull = update(object, formula=fm.update, data=modelF,offset=offs)
objecti = update(object, formula=fm.update, data=modelF,offset=offs)
}
return(anovaFn(objectiNull,objecti,...)[rowRef,colRef])
}
if(ncpus>1)
{
cl <- parallel::makeCluster(ncpus)
parallel::clusterExport(cl,c("getStat","anovaFn","yNew","objectNull","object","modelF","is.mva","fm.update","whichResp","nbinom2",
"respDimnames","rowRef","colRef",as.character(cll[[1]])), envir=environment())
statList <- parallel::clusterApplyLB(cl, 1:n.sim, getStat, yNew=yNew, objectNull=objectNull, object=object, modelF=modelF,
anovaFn=anovaFn, is.mva=is.mva, fm.update=fm.update, whichResp=whichResp,
respDimnames=respDimnames, rowRef=rowRef, colRef=colRef)
parallel::stopCluster(cl)
stats[1:n.sim+1] = unlist(statList)
}
else
for(iBoot in 1:n.sim+1)
stats[iBoot] = getStat(iBoot-1,yNew=yNew,objectNull=objectNull,object=object,modelF=modelF,anovaFn=anovaFn,is.mva=is.mva,
fm.update=fm.update, whichResp=whichResp,respDimnames=respDimnames,rowRef=rowRef,colRef=colRef)
# {
# modelF[[whichResp]] = as.matrix(simulate(objectNull), dimnames=respDimnames) #matrix to fix lme4 issues
# if(inherits(modelF$offs,"try-error") | is.null(modelF$offs))
# {
# objectiNull = update(objectNull, formula=fm.update, data=modelF)
# objecti = update(object, formula=fm.update, data=modelF)
# }
# else
# {
# objectiNull = update(object, formula=fm.update, data=modelF,offset=offs)
# objecti = update(object, formula=fm.update, data=modelF,offset=offs)
# }
# stats[iBoot] = anovaFn(objectiNull,objecti,...)[rowRef,colRef]
# }
# now take the original anova table, get rid of unneeded columns, stick on P-value
statReturn=statObs[,1:colRef] #get rid of the extra columns we don't need
statReturn$'P-value'=NA
statReturn$'P-value'[rowRef] = mean(stats>(stats[1]-1.e-8), na.rm=TRUE)
attr(statReturn,"n.sim")=sum(is.na(stats)==FALSE)-1
# change the name of the P-value column to whatever it was in the original anova table
hasP = grep("Pr",colnames(statObs))
colnames(statReturn)[colRef+1]=colnames(statObs)[hasP[1]]
# add some bells and whistles to the anova table, useful for printing
attr(statReturn, "heading") = attr(statObs,"heading")
class(statReturn)=c("anovaPB",class(statObs))
return(statReturn)
}
#' @export
print.anovaPB=function(x, digits = max(getOption("digits") - 3, 3),
signif.stars = getOption("show.signif.stars"),
dig.tst = max(1, min(5, digits - 1)), ...)
{
if (!is.logical(signif.stars) || is.na(signif.stars)) {
warning("option \"show.signif.stars\" is invalid: assuming TRUE")
signif.stars <- TRUE
}
if (!is.null(heading <- attr(x, "heading")))
{ cat("\n")
cat(heading, sep = "\n")
}
printCoefmat(round(x, digits=dig.tst), digits = digits, signif.stars = signif.stars, P.values = TRUE, has.Pvalue=TRUE, cs.ind = NULL, tst.ind = NCOL(x)-1, zap.ind=NCOL(x)-1, na.print = "", ...)
cat("\n")
cat(paste("P-value calculated by simulating ", attr(x,"n.sim"), " samples from ", rownames(x)[1],".\n",sep=""))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.