#' Fit Generalize Estimating Equations with Small Sample Variance Estimators
#'
#'This function fits a GEE model using the \code{geeglm} function
#'from the \pkg{geepack}-package. Additionally, small sample variance
#'estimates are calculated using one of three methods proposed in the following
#'papers: Pan (2001), Mancl and Derouen (2001), and Wang and Long (2011).
#'
#' @param formula A two-sided linear formula object with the response
#' on the left of a ~ operator and the terms, separated by + operators, on the right.
#' @param family The error distribution to be used in the model. Only \code{guassian} and \code{poisson}
#' families are supported.
#' @param data an optional data frame containing the variables in the model and the id variable. If not found in data, the variables are taken from
#' \code{environment(formula)}, typically the environment from which glm is called.
#' @param id a vector or data column name which identifies the clusters. The length of
#' id should be the same as the number of observations. Data are
#' assumed to be sorted so that observations on each cluster appear
#' as contiguous rows in data. If data is not sorted this way, the
#' function will not identify the clusters correctly. If \code{sort=TRUE} (default),
#' the dataframe from the \code{data} argument is sorted by the id column to avoid
#' this issue.
#' @param corstr a character string specifying the correlation structure. The following are permitted: "independence", "exchangeable", "ar1", and "unstructured"
#' @param small.samp.method a character string specifying the
#' small sample method. The following are permitted: \code{"pan"} for the
#' Pan (2001) method, \code{"md"} for the Mancl and Derouen (2001) method, and \code{"wl"} for the Wang and Long (2011) method.
#' If \code{small.samp.method} is null, small sample variance estimates are not computed.
#' The resulting object will be identical to the object created if
#' \code{geeglm} from the \pkg{geepack}-package was used.
#' @param ... additional arguments passed on to \code{geepack::geeglm}.
#'
#' @details This function borrows heavily from the corresponding small sample variance estimating
#'functions in the \pkg{geesmv}-package
#'(\code{\link[geesmv]{GEE.var.pan}}, \code{\link[geesmv]{GEE.var.md}} and \code{\link[geesmv]{GEE.var.wl}}).
#'In addition to combining these functions into a single function and using the \code{geeglm} function
#'in model fitting, this function also varys from these functions in that it
#'ensures that model offsets are properly accounted for in the calculation of
#'small sample estimators.
#'
#' @import geepack geesmv
#'
#' @return This function returns a \code{geeglm} object with small modifications:
#'
#' \item{small.samp.var}{an additional item containing the small sample variance estimators is included in the fit object}
#' \item{geese$vbeta}{The variance covariance matrix in the geese item from the object fit contains the small sample adjusted variance covaraince matrix.}
#'
#'
#'@author Elizabeth Wynn, \pkg{geesmv}-authors for underlying code used in small sample size variance estimators.
#'
#' @seealso \code{\link[geepack]{geeglm}}, \code{\link[geesmv]{GEE.var.pan}}, \code{\link[geesmv]{GEE.var.md}} and \code{\link[geesmv]{GEE.var.wl}}
#'
#' @references
#' Mancl LA, DeRouen TA (2001). "A covariance estimator for GEE with improved small-sample properties." Biometrics, 57(1), 126-134. ISSN 0006341X. doi:10. 1111/j.0006-341X.2001.00126.x.
#'
#' Pan W (2001). "On the robust variance estimator in generalised estimating equations." \emph{Biometrika}, 88(3), 901-906. ISSN 00063444. doi:10.1093/biomet/88.3.901.
#'
#' Wang M, Long Q (2011). "Modified robust variance estimator for generalized estimating equations with improved small-sample performance." \emph{Statistics in Medicine}, 30(11), 1278-1291. ISSN 02776715. doi:10.1002/sim.4150
#'
#'
#'
#' @examples
#' data("simdata")
#' sample_meta_data <- simdata$metadata
#'
#' #Subset down to one observation (i.e. gene)
#' counts=simdata$counts[1,]
#'
#' #Combine counts, metadata into dataframe
#' df=cbind(counts, sample_meta_data)
#'
#' #Sort data by id (Function also does this if sort=T)
#' df=df[order(df$ids),]
#'
#' #Fit the Model-use Pan method for small sample variance
#' fit.gee.pan<-geeglm_small_samp(formula =counts ~ group * time,
#' family=poisson, data=df, id=ids,
#' corstr="exchangeable",
#' small.samp.method="pan", sort=T)
#' @export
#'
#NOTE: Added parameter mu throughout all functions because function doesn't account for offset
#Use geepack instead of gee model
#Doesn't support binomial model
geeglm_small_samp<-function (formula,
family = poisson,
data,
id,
corstr = "exchangeable",
small.samp.method=NULL,
sort=T,
...)
{
call <- match.call(expand.dots = TRUE)
if(sort) {
mf=model.frame(formula, data)
names(mf)=gsub("offset\\(|\\)$", "",names(mf))
## EW Only can sort if dataframe is provided with all formula/id variables
if(is.null(call$data)){
stop("Cannot sort by ID unless data is provided to the data argument")
}else if(any(!(c(names(mf), paste(call$id)) %in% names(data)))){
stop("All formula and id variables must be in provided data frame in order to sort by ID ")
}else{
data<-data[order(data[[paste(call$id)]]),]
}
}
call$data=quote(data)
init <- model.frame(formula, data)
init$num <- 1:length(init[, 1])
if (any(is.na(init))) {
index <- na.omit(init)$num
data <- data[index, ]
m <- model.frame(formula, data)
mt <- attr(m, "terms")
data$response <- model.response(m, "numeric")
mat <- as.data.frame(model.matrix(formula, m))
}
else {
m <- model.frame(formula, data)
mt <- attr(m, "terms")
data$response <- model.response(m, "numeric")
mat <- as.data.frame(model.matrix(formula, m))
}
#################Make sure data is sorted################
###Use geepack model (geesmv fits "gee" package model)###
#Calling geeglm
call_list<-as.list(call)
if(is.null(call_list$family)) call_list$family=quote(poisson)
if(is.null(call_list$corstr)) call_list$corstr="exchangeable"
call_list[[1]]<-call_list$small.samp.method<-call_list$sort<-NULL
gee.fit<-do.call("geeglm", call_list)
#If small samp method is provided
if(!is.null(small.samp.method)){
beta_est <- gee.fit$coefficients
alpha <- summary(gee.fit)$corr[[1]]
#########Drawing largely from code from geesmv package############
len <- length(beta_est)
len_vec <- len^2
data$id <- as.numeric(factor(gee.fit$id)) ## EAW: 1st make factor in case id is character vector
cluster <- cluster.size(data$id)
ncluster <- max(cluster$n)
size <- cluster$m
mat$subj <- rep(unique(data$id), cluster$n)
if (is.character(corstr)) {
var <- switch(corstr, independence = cormax.ind(ncluster),
exchangeable = cormax.exch(ncluster, alpha), `AR-M` = cormax.ar1(ncluster,
alpha), unstructured = summary(gee.fit)$working.correlation
)
}
else {
print(corstr)
stop("'working correlation structure' not recognized")
}
if (is.character(family)) {
family <- switch(family, gaussian = "gaussian",
poisson = "poisson")
}
else {
if (is.function(family)) {
family <- family()[[1]]
}
else {
print(family)
stop("'family' not recognized")
}
}
if(small.samp.method=="wl"){
m <- model.frame(formula, data)
mat <- as.data.frame(model.matrix(formula, m))
mat$subj <- rep(unique(data$id), cluster$n)
}
cov.beta <- unstr <- matrix(0, nrow = len, ncol = len)
if(small.samp.method=="pan") step <- matrix(0, nrow = cluster$n[1], ncol = cluster$n[1])
else if(small.samp.method=="md") step11 <- matrix(0, nrow = len, ncol = len)
else if(small.samp.method=="wl")step01 <- matrix(0, nrow = len, ncol = len)
for (i in 1:size) {
y <- as.matrix(data$response[data$id == unique(data$id)[i]])
covariate <- as.matrix(subset(mat[, -length(mat[1, ])],
mat$subj == unique(data$id)[i]))
#EAW added fitted values so don't have to solve using link
mu=as.matrix(gee.fit$fitted.values[data$id == unique(data$id)[i]])
if(small.samp.method=="pan"){
if (family == "gaussian") {
resid <- (y - mu) %*% t(y - mu)
step <- step + resid
}
else if (family == "poisson") {
resid <- (y - mu) %*% t(y -
mu)
B <- matrix(0, nrow = cluster$n[i], ncol = cluster$n[i])
diag(B) <- 1/sqrt(mu)
step <- step + B %*% resid %*% B
}
}else if(small.samp.method=="md"){
var_i = var[1:cluster$n[i], 1:cluster$n[i]]
if (family == "gaussian") {
xx <- t(covariate) %*% solve(var_i) %*% covariate
step11 <- step11 + xx
}
else if (family == "poisson") {
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
xx <- t(D) %*% solve(Vi) %*% D
step11 <- step11 + xx
}
}else if(small.samp.method=="wl"){
var_i = var[1:cluster$n[i], 1:cluster$n[i]]
if (family == "gaussian") {
xx <- t(covariate) %*% solve(var_i) %*% covariate
step01 <- step01 + xx
}
else if (family == "poisson") {
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
xx <- t(D) %*% solve(Vi) %*% D
step01 <- step01 + xx
}
}
}
if(small.samp.method=="wl"){
step <- matrix(0, nrow = cluster$n[i], ncol = cluster$n[i])
for (i in 1:size) {
y <- as.matrix(data$response[data$id == unique(data$id)[i]])
covariate <- as.matrix(subset(mat[, -length(mat[1, ])],
mat$subj == unique(data$id)[i]))
#EAW added fitted values so don't have to solve using link
mu=as.matrix(gee.fit$fitted.values[data$id == unique(data$id)[i]])
var_i = var[1:cluster$n[i], 1:cluster$n[i]]
if (family == "gaussian") {
resid <- solve(cormax.ind(cluster$n[i]) - covariate %*%
solve(step01) %*% t(covariate) %*% solve(var_i)) %*%
(y - mu)
step <- step + resid %*% t(resid)
}
else if (family == "poisson") {
B <- matrix(0, nrow = cluster$n[i], ncol = cluster$n[i])
diag(B) <- 1/sqrt(mu)
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
resid <- B %*% solve(cormax.ind(cluster$n[i]) - D %*%
solve(step01) %*% t(D) %*% solve(Vi)) %*% (y -mu)
step <- step + resid %*% t(resid)
}
}
}
if(small.samp.method%in% c("pan", "wl")){
unstr <- step/size
step11 <- matrix(0, nrow = len, ncol = len)
}
step12 <- matrix(0, nrow = len, ncol = len)
step13 <- matrix(0, nrow = len_vec, ncol = 1)
step14 <- matrix(0, nrow = len_vec, ncol = len_vec)
p <- matrix(0, nrow = len_vec, ncol = size)
for (i in 1:size) {
y <- as.matrix(data$response[data$id == unique(data$id)[i]])
covariate <- as.matrix(subset(mat[, -length(mat[1, ])],
mat$subj == unique(data$id)[i]))
#EAW added fitted values so don't have to solve using link
mu=as.matrix(gee.fit$fitted.values[data$id == unique(data$id)[i]])
var_i = var[1:cluster$n[i], 1:cluster$n[i]]
if(small.samp.method=="pan"){
if (family == "gaussian") {
xy <- t(covariate) %*% solve(var_i) %*% unstr %*%
solve(var) %*% covariate
xx <- t(covariate) %*% solve(var_i) %*% covariate
step11 <- step11 + xx
step12 <- step12 + xy
step13 <- step13 + matrixcalc::vec(xy)
p[, i] <- matrixcalc::vec(xy)
}
else if (family == "poisson") {
A <- matrix(0, nrow = cluster$n[i], ncol = cluster$n[i])
diag(A) <- mu
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
xy <- t(D) %*% solve(Vi) %*% sqrt(A) %*% unstr %*%
sqrt(A) %*% solve(Vi) %*% D
xx <- t(D) %*% solve(Vi) %*% D
step12 <- step12 + xy
step11 <- step11 + xx
step13 <- step13 + matrixcalc::vec(xy)
p[, i] <- matrixcalc::vec(xy)
}
}else if(small.samp.method=="md"){
if (family == "gaussian") {
xy <- t(covariate) %*% solve(var_i) %*% solve(cormax.ind(cluster$n[i]) -
covariate %*% solve(step11) %*% t(covariate) %*%
solve(var_i)) %*% (y - mu)
step12 <- step12 + xy %*% t(xy)
step13 <- step13 + matrixcalc::vec(xy %*% t(xy))
p[, i] <- matrixcalc::vec(xy %*% t(xy))
}
else if (family == "poisson") {
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
xy <- t(D) %*% solve(Vi) %*% solve(cormax.ind(cluster$n[i]) -
D %*% solve(step11) %*% t(D) %*% solve(Vi)) %*%
(y - mu)
step12 <- step12 + xy %*% t(xy)
step13 <- step13 + matrixcalc::vec(xy %*% t(xy))
p[, i] <- matrixcalc::vec(xy %*% t(xy))
}
}else if(small.samp.method=="wl"){
if (family == "gaussian") {
xy <- t(covariate) %*% solve(var_i) %*% unstr %*%
solve(var) %*% covariate
xx <- t(covariate) %*% solve(var_i) %*% covariate
step11 <- step11 + xx
step12 <- step12 + xy
step13 <- step13 + matrixcalc::vec(xy)
p[, i] <- matrixcalc::vec(xy)
}
else if (family == "poisson") {
B <- matrix(0, nrow = cluster$n[i], ncol = cluster$n[i])
diag(B) <- mu
D <- mat.prod(covariate, mu)
Vi <- diag(sqrt(c(mu)),
cluster$n[i]) %*% var_i %*% diag(sqrt(c(mu)), cluster$n[i])
xy <- t(D) %*% solve(Vi) %*% sqrt(B) %*% unstr %*%
sqrt(B) %*% solve(Vi) %*% D
xx <- t(D) %*% solve(Vi) %*% D
step11 <- step11 + xx
step12 <- step12 + xy
step13 <- step13 + matrixcalc::vec(xy)
p[, i] <- matrixcalc::vec(xy)
}
}
}
for (i in 1:size) {
dif <- (p[, i] - step13/size) %*% t(p[, i] - step13/size)
step14 <- step14 + dif
}
cov.beta <- solve(step11) %*% (step12) %*% solve(step11)
cov.var <- size/(size - 1) * kronecker(solve(step11), solve(step11)) %*%
step14 %*% kronecker(solve(step11), solve(step11))
gee.fit$small.samp.var<-diag(cov.beta)
gee.fit$geese$vbeta<- cov.beta
names(gee.fit$small.samp.var)<-names(gee.fit$coefficients)
}
#gee.fit$call<-NULL
return(gee.fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.