#' Print method for pseudoglm
#' @param x A pseudoglm object, as returned by \link{cumincglm} or \link{rmeanglm}
#' @param digits Number of significant digits
#' @param ... Not used
#' @return x, invisibly
#'
#' @export
#'
print.pseudoglm <- function(x, digits = max(3L, getOption("digits") - 3L), ...) {
    outcome <- switch(x$type,
                      rmean = "restricted mean",
                      cuminc = "cumulative incidence",
                      survival = "survival")
    junc <- "of"
    if (outcome == "restricted mean" && x$competing) {
        outcome <- "restricted mean time lost"
        junc <- "due to"
    }
    cat("\nCall: ", paste(deparse(x$call), sep = "\n", collapse = "\n"),
        "\n\n", sep = "")
    if (x$competing) {
        cat("\nModel for the", x$link, outcome, junc, "cause", x$cause,
            "at time", x$time, "\n\n")
    } else {
        cat("\nModel for the", x$link, outcome, "at time", x$time, "\n\n")
    }
    if (length(coef(x))) {
        cat("Coefficients")
        if (is.character(co <- x$contrasts)) {
            cat(" [contrasts: ", apply(cbind(names(co), co), 1L, paste,
                                       collapse = "="), "]")
        }
        cat(":\n")
        print.default(format(x$coefficients, digits = digits),
                      print.gap = 2, quote = FALSE)
    } else {
        cat("No coefficients\n\n")
    }
    cat("\nDegrees of Freedom:", x$df.null, "Total (i.e. Null); ",
        x$df.residual, "Residual\n")
    if (nzchar(mess <- naprint(x$na.action))) {
        cat(" (", mess, ")\n", sep = "")
    }
    cat("\n")
    invisible(x)
}
#' Compute covariance matrix of regression coefficient estimates
#'
#'
#' @param object A pseudoglm object, as returned by \link{cumincglm} or
#' \link{rmeanglm}.
#' @param type The method to use for variance estimation; one of "corrected",
#' "robust", "naive", or "cluster"
#' @param ... Not used
#' @return A numeric matrix containing the variance-covariance estimates
#'
#' @details The "corrected" variance estimate for the cumulative incidence is as
#' described in Overgaard et al. (2017) <doi:10.1214/16-AOS1516>, with code
#' adapted from Overgaard's Stata program. This method does not handle ties
#' and only has marginal benefits in reasonable sample sizes. The default is
#' "robust" which uses the sandwich estimator vcovHC as implemented in the
#' sandwich package. "cluster" is another option if you have clustered
#' observations that uses the vcovCL function in sandwich. Finally "naive"
#' uses the same method as glm to compute the variance, and is known to be
#' anti-conservative. The bootstrap is another recommended option that can be
#' implemented using other tools; there is an example in the vignette.
#'
#' @references Overgaard, Morten; Parner, Erik Thorlund; Pedersen, Jan.
#' Asymptotic theory of generalized estimating equations based on jack-knife
#' pseudo-observations. Ann. Statist. 45 (2017), no. 5, 1988--2015.
#' <doi:10.1214/16-AOS1516>.
#' @seealso \link[sandwich]{vcovHC}, \link[sandwich]{vcovCL}
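#' @examples
#' # Illustrative sketch (not run): comparing the available variance types on a
#' # fitted model. The model call mirrors the confint() example; x = TRUE keeps
#' # the design matrix, which the "corrected" estimator requires.
#' \dontrun{
#' fit <- cumincglm(survival::Surv(etime, event) ~ age + sex,
#'          time = 200, cause = "pcm", link = "identity",
#'          model.censoring = "independent", data = mgus2, x = TRUE)
#' vcov(fit, type = "robust")    # default, sandwich::vcovHC
#' vcov(fit, type = "naive")     # glm-style, known to be anti-conservative
#' vcov(fit, type = "corrected") # Overgaard et al. (2017) estimator
#' }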
#' @export
vcov.pseudoglm <- function(object, type = "robust", ...) {
if(object$method == "geese") {
if(type != "robust") {
message("Only robust variance estimate available for multiple time points.")
}
return(object$sandcov)
}
if(type == "corrected") {
if(!object$type %in% c("cuminc", "survival")) {
stop("Corrected variance estimator not available for", object$type)
}
if(length(object$time) > 1) {
stop("Corrected variance estimator not available for multiple time points.")
}
if(is.null(object$x)) {
stop("Corrected variance requires 'x = TRUE' in the model fit.")
}
datamat <- object$datamat
noobs <- len <- nrow(datamat)
datord <- order(datamat[, 1], -datamat[, 2], -datamat[, 3], -datamat[, 4])
datamat <- datamat[datord, ]
datain <- datamat[datamat[, 1] <= object$time, ]
if(any(table(datain[, 1]) > 1)) {
stop("Corrected variance not available when there are tied event times")
}
len2 <- nrow(datain)
timejump <- datain[1:(len2 - 1), 1] != datain[2:len2, 1]
times_id <- c(which(timejump == TRUE), len2)
n_times <- length(times_id)
times_nr <- cumsum(c(1, timejump))
Y_lag <- (noobs:1)[times_id]
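# datain columns (as used below): event time, any-event indicator,
# cause-of-interest event indicator, and censoring indicator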
N_all <- datain[, 2]
N_1 <- datain[, 3]
N_0 <- datain[, 4]
# Build the empirical hazard increments and the survival, censoring-survival, and cause-1 cumulative incidence functions (missing values mean the population has died out and should be interpreted as 0)
H_0 <- cumsum(N_0)/noobs
H_0_dif = c(H_0[times_id[1]], ((H_0[times_id])[2:n_times] - (H_0[times_id])[1:(n_times-1)]))
H_1 = cumsum(N_1)/noobs
H_1_dif = c(H_1[times_id[1]] , ((H_1[times_id])[2:n_times] - (H_1[times_id])[1:(n_times-1)]))
H_all = cumsum(N_all)/noobs
H_all_dif = c(H_all[times_id[1]] , ((H_all[times_id])[2:n_times] - (H_all[times_id])[1:(n_times-1)]))
H_lag = Y_lag / noobs
H_lag_inv = 1 / H_lag
H_lag_inv[is.na(H_lag_inv)] <- 0
Lambda_0_dif = H_0_dif / H_lag
Lambda_1_dif = H_1_dif / H_lag
Lambda_all_dif = H_all_dif / H_lag
Lambda_0_dif_comp_inv = 1 / (1 - Lambda_0_dif)
Lambda_0_dif_comp_inv[is.na(Lambda_0_dif_comp_inv)] <- 0
S = exp(cumsum(log(1 - Lambda_all_dif)))
S[is.na(S)] <- 0
S_lag = c(1 , S[1:(n_times-1)])
G = exp(cumsum(log(1 - Lambda_0_dif)))
G[is.na(G)] <- 0
G_lag = c(1 , G[1:(n_times-1)])
G_lag_inv = 1 / G_lag
G_lag_inv[is.na(G_lag_inv)] <- 0
F_1 = cumsum(S_lag * Lambda_1_dif)
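# First-order influence terms of the pseudo-observations (Overgaard et al. 2017):
# d_phi_1 is the inverse-probability-of-censoring weighted cause-1 event term,
# d_phi_2 the censoring-related correction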
d_phi_1 = c((N_1 * G_lag_inv[times_nr]) , rep(0, noobs - len2))
d_phi_2 = c(N_0 * (F_1[n_times] - F_1[times_nr]) * Lambda_0_dif_comp_inv[times_nr] * H_lag_inv[times_nr] -
cumsum(H_0_dif * (F_1[n_times] - F_1) * Lambda_0_dif_comp_inv * H_lag_inv^2)[times_nr] ,
rep(-sum(H_0_dif * (F_1[n_times] - F_1) * Lambda_0_dif_comp_inv * H_lag_inv^2), noobs-len2))
beta = coefficients(object)
k = length(beta)
z = object$x[datord,] ## what if the model contains po at multiple time points?
linpred <- z %*% beta
muhat = object$family$linkinv(linpred)
Ahat <- mu_derivhat <- z * as.vector(object$family$mu.eta(linpred))
Ahat_red = Ahat[1:len2,]
a_len <- a1_len <- a2_len <- a3_len <-
b_1 <- b_2 <- b_3 <- b_4 <- b_5 <- b_6 <-
matrix(numeric(length = noobs * k), nrow = noobs, ncol = k)
H_0z <- H_0z_dif <- H_1z <- H_1z_dif <- H_z <- H_z_lag <- matrix(numeric(len2 * k), nrow = len2, ncol = k)
## compute variance: loop over coefficients, with a_* collecting the first-order
## estimating-equation terms and b_* the correction terms
for(j in 1:k) {
a_len[,j] = Ahat[,j] * (d_phi_1-muhat+d_phi_2)
a1_len[,j] = Ahat[,j] * (d_phi_1)
a2_len[,j] = Ahat[,j] * (-muhat)
a3_len[,j] = Ahat[,j] * (d_phi_2)
H_0z[,j] = cumsum(Ahat_red[,j] * N_0)/noobs
H_0z_dif[,j] = c(H_0z[times_id[1],j] , ((H_0z[times_id,j])[2:n_times] - (H_0z[times_id,j])[1:(n_times-1)]))
H_1z[,j] = cumsum(Ahat_red[,j] * N_1)/noobs
H_1z_dif[,j] = c(H_1z[times_id[1],j] , ((H_1z[times_id,j])[2:n_times] - (H_1z[times_id,j])[1:(n_times-1)]))
H_z[,j] = (sum(Ahat[,j]) - cumsum(Ahat_red[,j]))/noobs
H_z_lag[,j] = c(sum(Ahat[,j]) / noobs , (H_z[1:(len2-1),j])[times_id[1:(n_times-1)]])
temp1 = cumsum( H_0z_dif[,j] * H_lag_inv * Lambda_0_dif_comp_inv - H_z_lag[,j] * H_lag_inv^2 * Lambda_0_dif_comp_inv * H_0_dif )
temp1_lag = c(0 , temp1[1:(n_times-1)])
temp1_dif = temp1 - temp1_lag
temp2 = cumsum( H_0z_dif[,j] * (F_1[n_times] - F_1) * H_lag_inv * Lambda_0_dif_comp_inv -
H_z_lag[,j] * (F_1[n_times] - F_1) * H_lag_inv^2 * Lambda_0_dif_comp_inv * H_0_dif )
temp3 = cumsum( H_1z_dif[,j] / G_lag )
b_1[,j] = c(N_1 * G_lag_inv[times_nr] * temp1_lag[times_nr] , rep(0, len-len2))
b_2[,j] = c(N_0 * (temp3[n_times] - temp3[times_nr]) * H_lag_inv[times_nr] * Lambda_0_dif_comp_inv[times_nr] -
cumsum( H_0_dif * (temp3[n_times] - temp3) * H_lag_inv^2 * Lambda_0_dif_comp_inv )[times_nr] ,
rep(-sum( H_0_dif * (temp3[n_times] - temp3) * H_lag_inv^2 * Lambda_0_dif_comp_inv ), len-len2))
temp4 = cumsum( H_1_dif * G_lag_inv * temp1 )
b_3[,j] = c(N_0 * (temp4[n_times] - temp4[times_nr]) * H_lag_inv[times_nr] * Lambda_0_dif_comp_inv[times_nr] -
cumsum( H_0_dif * (temp4[n_times] - temp4) * H_lag_inv^2 * Lambda_0_dif_comp_inv )[times_nr] ,
rep(-sum( H_0_dif * (temp4[n_times] - temp4) * H_lag_inv^2 * Lambda_0_dif_comp_inv ), len-len2 ))
b_4[,j] = c(- N_0 * (F_1[n_times] - F_1[times_nr]) * H_z_lag[times_nr,j] * H_lag_inv[times_nr]^2 * Lambda_0_dif_comp_inv[times_nr] +
cumsum( H_0_dif * (F_1[n_times] - F_1) * H_z_lag[,j] * H_lag_inv^3 * Lambda_0_dif_comp_inv )[times_nr] ,
rep(sum( H_0_dif * (F_1[n_times] - F_1) * H_z_lag[,j] * H_lag_inv^3 * Lambda_0_dif_comp_inv ), len-len2 ))
b_5[,j] = c(-cumsum( H_0z_dif[,j] * (F_1[n_times] - F_1) * H_lag_inv^2 * Lambda_0_dif_comp_inv -
H_z_lag[,j] * (F_1[n_times] - F_1) * H_lag_inv^3 * Lambda_0_dif_comp_inv * H_0_dif )[times_nr] ,
rep(- sum( H_0z_dif[,j] * (F_1[n_times] - F_1) * H_lag_inv^2 * Lambda_0_dif_comp_inv -
H_z_lag[,j] * (F_1[n_times] - F_1) * H_lag_inv^3 * Lambda_0_dif_comp_inv * H_0_dif ), len-len2 ))
b_6[,j] = c(N_0 * (F_1[n_times] - F_1[times_nr]) * temp1_dif[times_nr] * H_lag_inv[times_nr] * Lambda_0_dif_comp_inv[times_nr] -
cumsum( H_0_dif * (F_1[n_times] - F_1) * temp1_dif * H_lag_inv^2 * Lambda_0_dif_comp_inv )[times_nr] ,
rep(-sum( H_0_dif * (F_1[n_times] - F_1) * temp1_dif * H_lag_inv^2 * Lambda_0_dif_comp_inv ), len-len2 ))
}
b <- b_1 + b_2 + b_3 + b_4 + b_5 + b_6
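# Assemble the sandwich: Sigma is the middle ("meat") matrix built from the
# first-order terms (a_len) and the correction terms (b); Minvhat is the
# inverse of the "bread" matrix of weighted design cross-products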
Sigma <- t(a_len) %*% a_len / noobs + t(a_len) %*% b / noobs +
t(b) %*% a_len / noobs + t(b) %*% b / noobs
Minvhat <- chol2inv(chol(t(Ahat) %*% mu_derivhat / nrow(Ahat)))
Minvhat %*% Sigma %*% t(Minvhat) / noobs
} else if(type == "robust") {
class(object) <- c("glm", "lm")
sandwich::vcovHC(object)
} else if(type == "naive") {
class(object) <- c("glm", "lm")
stats::vcov(object)
} else if(type == "cluster") {
class(object) <- c("glm", "lm")
sandwich::vcovCL(object, cluster = object$cluster.id)
} else stop("unknown variance type")
}
#' Summary method
#'
#' @param object A pseudoglm object, as returned by \link{cumincglm} or \link{rmeanglm}
#' @param correlation logical; if TRUE, the correlation matrix of the estimated parameters is returned and printed.
#' @param symbolic.cor logical; if TRUE, print the correlations in a symbolic form rather than as numbers.
#' @param type The method to use for variance estimation; one of "corrected", "robust", "naive", or "cluster"
#' @param ... Not used
#' @return An object of class \link[stats]{summary.glm}
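#' @examples
#' # Illustrative sketch (not run): coefficient table with Wald tests based on
#' # the default robust (sandwich) variance. The model call mirrors the
#' # confint() example below.
#' \dontrun{
#' fit <- cumincglm(survival::Surv(etime, event) ~ age + sex,
#'          time = 200, cause = "pcm", link = "identity",
#'          model.censoring = "independent", data = mgus2)
#' summary(fit, type = "robust")
#' }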
#' @export
#'
summary.pseudoglm <- function (object, correlation = FALSE, symbolic.cor = FALSE,
type = "robust",
...)
{
df.r <- object$df.residual
aliased <- is.na(coef(object))
p <- object$rank
if (p > 0) {
p1 <- 1L:p
Qr <- object$qr
coef.p <- object$coefficients[Qr$pivot[p1]]
covmat.unscaled <- vcov(object, type = type)[p1, p1]
dimnames(covmat.unscaled) <- list(names(coef.p), names(coef.p))
covmat <- covmat.unscaled
var.cf <- diag(covmat)
s.err <- sqrt(var.cf)
tvalue <- coef.p/s.err
dn <- c("Estimate", "Std. Error")
if (df.r > 0) {
pvalue <- 2 * pnorm(-abs(tvalue))
coef.table <- cbind(coef.p, s.err, tvalue, pvalue)
dimnames(coef.table) <- list(names(coef.p), c(dn,
"z value", "Pr(>|z|)"))
} else {
coef.table <- cbind(coef.p, NaN, NaN, NaN)
dimnames(coef.table) <- list(names(coef.p), c(dn,
"z value", "Pr(>|z|)"))
}
df.f <- NCOL(Qr$qr)
}
else {
coef.table <- matrix(, 0L, 4L)
dimnames(coef.table) <- list(NULL, c("Estimate",
"Std. Error", "z value", "Pr(>|z|)"))
covmat.unscaled <- covmat <- matrix(, 0L, 0L)
df.f <- length(aliased)
}
keep <- match(c("call", "terms", "family",
"deviance", "aic", "contrasts", "df.residual",
"null.deviance", "df.null", "iter",
"na.action"), names(object), 0L)
ans <- c(object[keep], list(deviance.resid = residuals(object,
type = "deviance"), coefficients = coef.table,
aliased = aliased, dispersion = 1, df = c(object$rank,
df.r, df.f),
cov.unscaled = covmat.unscaled, cov.scaled = covmat))
if (correlation && p > 0) {
dd <- sqrt(diag(covmat.unscaled))
ans$correlation <- covmat.unscaled/outer(dd, dd)
ans$symbolic.cor <- symbolic.cor
}
class(ans) <- "summary.glm"
return(ans)
}
#' Pseudo-observation scaled residuals
#'
#' Computes residuals according to the recommendations of Pohar Perme and
#' Andersen (2008) <doi:10.1002/sim.3401>.
#'
#' @param object A pseudoglm object, as returned by \link{cumincglm} or
#' \link{rmeanglm}
#' @param type Either "scaled" (the default for cumulative incidence outcomes)
#' or one of the types available in \link[stats]{residuals.glm} for restricted mean outcomes, with the default being "deviance".
#' @param ... Arguments passed on to \link[stats]{residuals.glm}.
#' @return A numeric vector of residuals
#'
#' @details The scaled residuals are computed as \deqn{\hat{\epsilon}_i =
#' \frac{V_i - \hat{Y}_i}{\sqrt{\hat{Y}_i (1 - \hat{Y}_i)}}} where \eqn{V_i} is
#' the pseudo-observation and \eqn{\hat{Y}_i} the predicted value. When the
#' outcome is the cumulative incidence, the denominator corresponds to an
#' estimate of the standard error of the conditional estimate of the outcome
#' in the absence of censoring. For the restricted mean, no such rescaling is
#' done and the computation is passed off to \link[stats]{residuals.glm}.
#' @references Perme MP, Andersen PK. Checking hazard regression models using
#' pseudo-observations. Stat Med. 2008;27(25):5309-5328.
#' <doi:10.1002/sim.3401>
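#' @examples
#' # Illustrative sketch (not run): scaled residuals for a cumulative incidence
#' # model, plotted against the fitted values as an informal model check. The
#' # model call mirrors the confint() example.
#' \dontrun{
#' fit <- cumincglm(survival::Surv(etime, event) ~ age + sex,
#'          time = 200, cause = "pcm", link = "identity",
#'          model.censoring = "independent", data = mgus2)
#' sres <- residuals(fit, type = "scaled")
#' plot(predict(fit, type = "response"), sres)
#' }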
#' @export
residuals.pseudoglm <- function(object, type = NULL, ...){
if(object$type == "rmean") {
if(is.null(type)) {
type <- "deviance"
}
residuals.glm(object, type = type, ...)
} else if(object$type == "cuminc") {
if(is.null(type) || type == "scaled") {
cond.cuminc <- predict(object, type = "response")
(object$y - cond.cuminc) / sqrt(cond.cuminc * (1 - cond.cuminc))
} else {
residuals.glm(object, type = type, ...)
}
}
}
#' Confidence Intervals for pseudoglm Model Parameters
#'
#' Computes Wald confidence intervals for one or more parameters in a fitted model. Users can specify the type of variance estimate used, with the default being the robust sandwich variance estimator.
#'
#' @param object A fitted model object from \link{cumincglm} or \link{rmeanglm}
#' @param parm a specification of which parameters are to be given confidence intervals, either a vector of numbers or a vector of names. If missing, all parameters are considered.
#' @param level the confidence level required.
#' @param type The type of variance estimate to use, see \link{vcov.pseudoglm}
#' @param ... Not used
#'
#' @return A matrix (or vector) with columns giving lower and upper confidence limits for each parameter. These will be labelled as (1-level)/2 and 1 - (1-level)/2 in % (by default 2.5% and 97.5%).
#'
#' @export
#' @examples
#' cumincipcw <- cumincglm(survival::Surv(etime, event) ~ age + sex,
#' time = 200, cause = "pcm", link = "identity",
#' model.censoring = "independent", data = mgus2)
#' confint(cumincipcw)
#'
confint.pseudoglm <- function (object, parm, level = 0.95, type = "robust", ...)
{
cf <- coef(object)
ses <- sqrt(diag(vcov(object, type = type)))
pnames <- names(ses)
if (is.matrix(cf)){
cf <- setNames(as.vector(cf), pnames)
}
if (missing(parm)) {
parm <- pnames
} else if (is.numeric(parm)){
parm <- pnames[parm]
}
a <- (1 - level)/2
a <- c(a, 1 - a)
fac <- stats::qnorm(a)
pct <- paste(format(100 * a, trim = TRUE, scientific = FALSE, digits = 3), "%")
ci <- array(NA_real_, dim = c(length(parm), 2L), dimnames = list(parm,
pct))
ci[] <- cf[parm] + ses[parm] %o% fac
ci
}