# ================================== cglm ====================================
#' Fit Generalized Linear Models to cluster-correlated data
#'
#' \code{cglm} has the same functionality as \code{\link[stats]{glm}}
#' except that cluster-correlated data can be analyzed using the methodology
#' of \href{http://dx.doi.org/10.1093/biomet/asm015}{Chandler and Bate (2007)},
#' implemented by the \code{\link[chandwich]{chandwich}} package.
#'
#' @inheritParams stats::glm
#' @param cluster A vector or factor indicating from which cluster each
#' observation, and hence each loglikelihood contribution, originates.
#' Must have length equal to the number of observations used in the fit.
#' If \code{cluster} is not supplied then it is set inside
#' \code{\link[chandwich]{adjust_loglik}} under the assumption that
#' each observation forms its own cluster.
#' @param use_alg A logical scalar. If \code{TRUE} then functions to evaluate
#' the derivatives (scores) of the loglikelihood with respect to the
#' parameters and the Hessian of the loglikelihood are passed to
#' \code{\link[chandwich]{adjust_loglik}} via its arguments
#' \code{alg_deriv} and \code{alg_hess} respectively.
#' @details Three adjustments to the independence loglikelihood described in
#' Chandler and Bate (2007) are available. The vertical adjustment is
#' described in Section 6 and two horizontal adjustments are described
#' in Sections 3.2 to 3.4. See the descriptions of \code{type} and, for the
#' horizontal adjustments, the descriptions of \code{C_cholesky} and
#' \code{C_spectral}, in the documentation of
#' \code{\link[chandwich]{adjust_loglik}}.
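#'
#' For example, the type of adjustment can be selected via the \code{type}
#' argument of functions such as \code{\link[chandwich]{conf_intervals}}.
#' The following is a minimal sketch, in which \code{mydata} and \code{id}
#' are placeholders for the user's data and cluster variable:
#' \preformatted{
#' fit <- cglm(y ~ x, family = poisson, data = mydata, cluster = mydata$id)
#' chandwich::conf_intervals(fit, type = "vertical")  # the default
#' chandwich::conf_intervals(fit, type = "cholesky")  # a horizontal adjustment
#' }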
#'
#' The adjustments involve first and second derivatives of the loglikelihood
#' with respect to the model parameters. If \code{use_alg = FALSE} then
#' these are estimated using \code{\link[numDeriv]{jacobian}} and
#' \code{\link[stats:optim]{optimHess}}.
#'
#' If the argument \code{cluster} is not provided then the adjustment to the
#' loglikelihood is based on a robust sandwich estimator for the covariance
#' matrix of the parameter estimators, under the assumption that each
#' observation forms its own cluster.
#'
#' If you only wish to adjust parameter covariance matrices, that is,
#' you are not interested in an adjusted loglikelihood, then the
#' \href{https://CRAN.R-project.org/package=sandwich}{sandwich} package
#' will provide equivalent results and is more flexible than
#' \code{chandwichGLM}. The vignettes in the sandwich package give a very
#' clear account of this general area.
#' @return An object of class \code{"chandwich"}, returned by
#' \code{\link[chandwich]{adjust_loglik}}. The function
#' \code{\link{summary}} (i.e. \code{\link[chandwich]{summary.chandwich}})
#' can be used to obtain or print a summary of the results.
#' The function \code{\link[chandwich]{conf_intervals}} can be used to
#' calculate confidence intervals for each parameter.
#' \code{\link[chandwich]{plot.confint}} can be used to plot an
#' individual confidence interval.
#' The function \code{\link[chandwich]{conf_region}}, and its plot method
#' \code{\link[chandwich]{plot.confreg}}, can be used to plot confidence
#' regions for pairs of parameters.
#' @references Chandler, R. E. and Bate, S. (2007). Inference for clustered
#' data using the independence loglikelihood. \emph{Biometrika},
#' \strong{94}(1), 167-183. \url{http://dx.doi.org/10.1093/biomet/asm015}
#' @seealso
#' \code{\link[stats]{glm}},
#' \code{\link[chandwich]{chandwich}},
#' \code{\link[chandwich]{adjust_loglik}}.
#' @examples
#'
#' ### Section 5.2 of sandwich vignette at
#' # https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich-OOP.pdf
#'
#' ## Load the dataset Affairs from the package AER
#' data("Affairs", package = "AER")
#'
#' ## Fit a binomial (probit) regression model
#' fm_probit <- glm(I(affairs > 0) ~ age + yearsmarried + religiousness +
#' occupation + rating, data = Affairs,
#' family = binomial(link = "probit"))
#'
#' # Fit the model using cglm(), adjusting the log-likelihood
#' # The default cluster is used: adjusted SEs viewed as model-robust SEs
#' c_probit <- cglm(I(affairs > 0) ~ age + yearsmarried + religiousness +
#' occupation + rating, data = Affairs,
#' family = binomial(link = "probit"))
#'
#' library(sandwich)
#' library(lmtest)
#' ## Check that
#' # (1) the unadjusted SEs are close to those from chandwichGLM
#' # (2) the adjusted SEs from sandwich are close to those from chandwichGLM
#' coeftest(fm_probit)
#' coeftest(fm_probit, vcov = sandwich)
#' summary(c_probit)
#'
#' ## Extra facilities provided by chandwich
#'
#' # Confidence intervals based on the adjusted log-likelihood
#' cints <- chandwich::conf_intervals(c_probit)
#' cints
#' plot(cints, which_par = "occupation")
#'
#' # Confidence region for a pair of parameters (slow to run)
#' conf <- chandwich::conf_region(c_probit, which_pars = 2:3)
#' plot(conf, conf = c(50, 75, 95, 99))
#'
#' # Adjusted likelihood ratio test to compare nested models
#' # The p-value is different from, but in this case very similar to,
#' # the p-value from the Wald test from coeftest() above
#' chandwich::compare_models(c_probit, fixed_pars = "occupation")
#'
#' ### Poisson GLM
#'
#' ## Example from the help file for stats::glm()
#' ## Dobson (1990) Page 93: Randomized Controlled Trial :
#' counts <- c(18,17,15,20,10,20,25,13,12)
#' outcome <- gl(3,1,9)
#' treatment <- gl(3,3)
#' d.AD <- data.frame(treatment, outcome, counts)
#'
#' glm.D93 <- glm(counts ~ outcome + treatment, family = poisson)
#' cglm1 <- cglm(counts ~ outcome + treatment, family = poisson)
#' cglm2 <- cglm(counts ~ outcome + treatment, family = poisson,
#' use_alg = FALSE)
#' summary(glm.D93)
#' summary(cglm1)
#' summary(cglm2)
#'
#' # Confidence intervals for each parameter
#' cints <- chandwich::conf_intervals(cglm1)
#' cints
#' plot(cints, which_par = 2)
#'
#' # A confidence region for a pair of parameters
#' conf <- chandwich::conf_region(cglm1, which_pars = 1:2)
#' plot(conf, conf = c(50, 75, 95, 99))
#'
#' ### Binomial GLM
#'
#' # Response vector (0/1 numeric or, in this case, a yes/no factor)
#' w <- glm(degree ~ religion + gender + age, data = carData::WVS,
#' family = binomial)
#' wc <- cglm(degree ~ religion + gender + age, data = carData::WVS,
#' family = binomial, cluster = carData::WVS$country)
#'
#' summary(w)
#' summary(wc)
#'
#' # Response (two-column) matrix (number of successes, number of failures)
#'
#' cuse <- read.table("http://data.princeton.edu/wws509/datasets/cuse.dat",
#' header=TRUE)
#' lrfit <- glm(cbind(using, notUsing) ~ age + education + wantsMore,
#' family = binomial, data = cuse)
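#'
#' # A corresponding cglm() fit (a sketch: the default cluster is used here,
#' # i.e. each observation is treated as its own cluster)
#' lrfit_c <- cglm(cbind(using, notUsing) ~ age + education + wantsMore,
#' family = binomial, data = cuse)
#' summary(lrfit_c)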
#' @export
cglm <- function(formula, family = gaussian, data, weights, subset, na.action,
start = NULL, etastart, mustart, offset, control = list(...),
model = TRUE, method = "glm.fit", x = FALSE, y = TRUE,
contrasts = NULL, cluster = NULL, use_alg = TRUE, ...) {
#
# 1. Fit the model using stats::glm()
#
# Extract the arguments from the user's call, including any in ...
glm_call <- match.call(expand.dots = TRUE)
# Change the first component (the function to call) to stats::glm()
glm_call[[1L]] <- quote(stats::glm)
# Set cluster and use_alg to NULL: they are not arguments of stats::glm()
glm_call$cluster <- NULL
glm_call$use_alg <- NULL
# Add x = TRUE so that the returned object will contain the design matrix x
glm_call$x <- TRUE
# Call stats::glm() with the user's arguments
glm_object <- eval.parent(glm_call)
# Extract the response object
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action",
"etastart", "mustart", "offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
response <- stats::model.response(mf, "any")
# Special treatment for binomial GLMs when the input response is a
# two-column matrix of (number of successes, number of failures)
#
if (is.matrix(response)) {
glm_object$n_successes <- response[, 1]
glm_object$n_trials <- rowSums(response)
} else {
glm_object$n_successes <- glm_object$y
glm_object$n_trials <- 1
}
#
# 2. Use chandwich::adjust_loglik() to adjust the log-likelihood
#
# Extract the name of the GLM family
glm_family <- glm_object$family$family
# Set the independence log-likelihood to be adjusted
loglik_for_chandwich <- switch(glm_family,
poisson = pois_glm_loglik,
binomial = binom_glm_loglik,
NULL)
if (use_alg) {
# Set a function to evaluate the score matrix
alg_deriv_for_chandwich <- switch(glm_family,
poisson = no_dispersion_glm_alg_deriv,
binomial = no_dispersion_glm_alg_deriv,
NULL)
# Set a function to evaluate the Hessian matrix of the loglikelihood
alg_hess_for_chandwich <- switch(glm_family,
poisson = no_dispersion_glm_alg_hess,
binomial = no_dispersion_glm_alg_hess,
NULL)
} else {
alg_deriv_for_chandwich <- NULL
alg_hess_for_chandwich <- NULL
}
# Perform the adjustment using chandwich::adjust_loglik()
# init is set to the MLE from stats::glm() so that the maximization of the
# independence loglikelihood starts from (and should return) those estimates
res <- chandwich::adjust_loglik(loglik = loglik_for_chandwich,
glm_object = glm_object,
cluster = cluster,
init = glm_object$coefficients,
p = length(glm_object$coefficients),
par_names = names(glm_object$coefficients),
alg_deriv = alg_deriv_for_chandwich,
alg_hess = alg_hess_for_chandwich)
return(res)
}
# Note: pars not used because we only need the derivatives at the MLE,
# which is stored in glm_object
no_dispersion_glm_alg_deriv <- function(pars, glm_object) {
return(sandwich::estfun(glm_object))
}
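# Note: the Hessian of the loglikelihood at the MLE is obtained as minus the
# inverse of the unadjusted variance-covariance matrix from stats::vcov(),
# i.e. minus the estimated information matrix evaluated at the MLE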
no_dispersion_glm_alg_hess <- function(pars, glm_object) {
return(-solve(stats::vcov(glm_object)))
}
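
# The per-observation loglikelihood functions pois_glm_loglik() and
# binom_glm_loglik() supplied to chandwich::adjust_loglik() above are defined
# elsewhere in the package.  The following is a minimal sketch (not the
# package's actual implementation, and ignoring prior weights) of the kind of
# function that adjust_loglik() expects, for the Poisson case, using only
# components stored in the fitted object by stats::glm()
pois_glm_loglik_sketch <- function(pars, glm_object) {
  # Linear predictor from the stored design matrix (x = TRUE is set in cglm)
  eta <- drop(glm_object$x %*% pars)
  if (!is.null(glm_object$offset)) {
    eta <- eta + glm_object$offset
  }
  # Fitted means on the response scale
  mu <- glm_object$family$linkinv(eta)
  # Return a vector of loglikelihood contributions, one per observation
  stats::dpois(glm_object$y, lambda = mu, log = TRUE)
}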