#' ANOVA for GEEs
#'
#' Carry out ANOVA and do backward model selection for GEEs fitted with geepack::geeglm()
#'
#' @param model1 a fitted model object created by geepack::geeglm()
#'
#' @export
#' @return a list containing the best fitted model (with terms with p-values above 0.05 removed via backward selection) and a table of p-values for all variables in model1
gee_anova <- function(model1){
#model1 is the "full model" we are starting with. output object from geeglm (or other object that anova() will work on)
#note that terms are added sequentially if we use the anova() function.
#We want a p-value for each term *assuming all the others are included in the model*
#so we can do that with anova() by placing
#each term as the last term and applying anova() each time, keeping only the
#pvalue for the "last term". Then for model selection we
#remove the term which has the highest p-value.
#this can be done manually (time-consuming), or using the following automated code
#note: the code below will not work if the names of the predictor variables are "nested"
#for example if one predictor is called "x" and the others are "x2" and "x3".
#make sure names are unique, for example even "x1", "x2", "x3" would be ok!
#create a data frame called "model.seln" to hold the model selection results
model.seln <- data.frame(coefs=labels(stats::terms.formula(model1$formula)), p.vals=1, include=T)
#get a list of the names of all the coefficients in the full model model1
all.coefs <- labels(stats::terms.formula(model1$formula))
alfa <- 0.05 #desired significance level for hypothesis testing
best.model <- model1 #start with the full model - we will reduce it via anova-based model selection
#the following loop will run until all non-significant predictors have been removed from the model
while (any(model.seln[model.seln$include,"p.vals"] > alfa)){
#get a list of the predictors in the current best model
best.coefs <- labels(stats::terms.formula(best.model$formula))
for (k in 1:length(best.coefs)){ #loop over all coefficients except intercept
#for each predictor/coefficient, move it to the end of the model specification then run anova
coef <- best.coefs[k]
#create a string naming the predictor currently being tested, to feed to "update" function
f<-paste(".~.-",coef, "+", coef,sep="")
#create "test.model", which has the current predictor as the last-term-added
test.model <- stats::update(best.model,
stats::as.formula(f) )
#save the p-value (for the current predictor only) in the data frame "model.seln"
model.seln[model.seln$coefs==coef,"p.vals"] <-
stats::anova(test.model)$P[labels(stats::terms.formula(test.model$formula))==coef]
}
#first check interaction terms significance
ints <- grep(":", best.coefs) #indices of interaction terms in (current) best model coef list
ints.full <- grep(":", all.coefs)#indices of interaction terms in full model coef list
mains <- setdiff(c(1:length(best.coefs)),ints) #indices of main effect terms in current best model coef list
mains.full <- setdiff(c(1:length(all.coefs)), ints.full) #indices of main effect terms in full model coef list
maxp <- max(c(0.01,model.seln[intersect(ints.full, which(model.seln$include==T)),"p.vals"])) #the largest p-value for interaction terms
if(maxp > alfa) { #if the largest interaction p-value is larger than the significance threshold, then:
#find the index (in the full model coef list) for the interaction term with the biggest p value
j <- which(model.seln[ints.full,"p.vals"]==maxp)
#mark the least-significant variable for exclusion
model.seln[ints.full[j],"include"] <- F
#and remove it from "best.model"
j2 <- which(best.coefs[ints] == model.seln[ints.full[j], "coefs"])
f<-paste(".~.-",best.coefs[ints[j2]],sep="")
best.model <- stats::update(best.model,stats::as.formula(f))
}else{ #if all the interactions are significant, then can check the main effects
#goal: find the least-significant one with p>alfa.
#j is the indices of the main-effect p-values, ordered biggest to smallest
px <- sort(model.seln[model.seln$coefs %in% best.coefs,"p.vals"],
decreasing=T,
index.return=T)$ix
p.sort <- model.seln[which(model.seln$coefs %in% best.coefs)[px],
c("coefs", "p.vals")]
#exclude all the ones that are part of significant interactions
for (kk in 1:nrow(p.sort)) { #this loop will remove entries from j until the first one is no longer part of a significant interaction.
sig.int <- grep(pattern=p.sort[kk,"coefs"], best.coefs[ints])
if (length(sig.int) > 0) {
p.sort[kk,] <- NA
}
}
p.sort <- stats::na.omit(p.sort)
#now update the model by removing the main effect with the biggest p value (among non-significant ones)
if (nrow(p.sort) == 0){
break # if there are no mains that are not in significant interactions, we're done
}else{#if there are main effects that are non-significant and also not in a significant interaction, remove the least significant one
if (model.seln[model.seln$coefs==p.sort[1,"coefs"],"p.vals"] > alfa){
model.seln[model.seln$coefs==p.sort[1,"coefs"],"include"] <- F
f<-paste(".~.-",p.sort[1,"coefs"],sep="")
best.model <- stats::update(best.model,stats::as.formula(f))
}else{break}
}
}
}
return(list(best.model=best.model, pvals=model.seln))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.