broom.mixed: Tidying Methods for Mixed Models

Documented in tidy.merMod

#' Tidying methods for mixed effects models
#'
#' These methods tidy the coefficients of \code{lme4::lmer} and \code{lme4::glmer}
#' models (i.e., \code{merMod} objects). Methods are also provided for \code{allFit}
#' objects.
#'
#' @aliases glance.allFit
#' @aliases tidy.allFit
#' @param x An object of class \code{merMod}, such as those from \code{lmer},
#' \code{glmer}, or \code{nlmer}
#' @param ... Additional arguments (passed to \code{confint.merMod} for \code{tidy}; \code{augment_columns} for \code{augment}; ignored for \code{glance})
#'
#' @return All tidying methods return a \code{data.frame} without rownames.
#' The structure depends on the method chosen.
#'
#' @name lme4_tidiers
#'
#' @examples
#'
#' if (require("lme4")) {
#'     ## original model
#'     \dontrun{
#'         lmm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#'     }
#'     ## load stored object
#'     load(system.file("extdata", "lme4_example.rda", package="broom.mixed"))
#'     (tt <- tidy(lmm1))
#'     tidy(lmm1, effects = "fixed")
#'     tidy(lmm1, effects = "fixed", conf.int=TRUE)
#'     tidy(lmm1, effects = "fixed", conf.int=TRUE, conf.method="profile")
#'     ## lmm1_prof <- profile(lmm1) # generated by extdata/runexamples
#'     tidy(lmm1, conf.int=TRUE, conf.method="profile", profile=lmm1_prof)
#'     ## conditional modes (group-level deviations from population-level estimate)
#'     tidy(lmm1, effects = "ran_vals", conf.int=TRUE)
#'     ## coefficients (group-level estimates)
#'     (rcoef1 <- tidy(lmm1, effects = "ran_coefs"))
#'     if (require(tidyr) && require(dplyr)) {
#'        ## reconstitute standard coefficient-by-level table
#'        spread(rcoef1,key=term,value=estimate)
#'        ## split ran_pars into type + term; sort fixed/sd/cor
#'        (tt %>% separate(term,c("type","term"),sep="__",fill="left")
#'            %>% arrange(!is.na(type),desc(type)))
#'     }
#'     head(augment(lmm1, sleepstudy))
#'     glance(lmm1)
#'
#'     glmm1 <- glmer(cbind(incidence, size - incidence) ~ period + (1 | herd),
#'                   data = cbpp, family = binomial)
#'     tidy(glmm1)
#'     tidy(glmm1,exponentiate=TRUE)
#'     tidy(glmm1, effects = "fixed")
#'     ## suppress warning about influence.merMod
#'     head(suppressWarnings(augment(glmm1, cbpp)))
#'     glance(glmm1)
#'
#'     startvec <- c(Asym = 200, xmid = 725, scal = 350)
#'     nm1 <- nlmer(circumference ~ SSlogis(age, Asym, xmid, scal) ~ Asym|Tree,
#'                   Orange, start = startvec)
#'     ## suppress warnings about var-cov matrix ...
#'     op <- options(warn=-1)
#'     tidy(nm1)
#'     tidy(nm1, effects = "fixed")
#'     options(op)
#'     head(augment(nm1, Orange))
#'     glance(nm1)
#'     detach("package:lme4")
#' }
#' if (require("lmerTest")) {
#'    lmm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#'    tidy(lmm1)
#'    glance(lmm1)
#'    detach("package:lmerTest")  # clean up
#' }
NULL

#' @export
confint.rlmerMod <- function(object, parm, level,
                             method = "Wald", ...) {
  cc <- class(object)
  class(object) <- "merMod"
  if (method != "Wald") {
    warning("only Wald method implemented for rlmerMod objects")
  }
  return(confint(object, parm = parm, level = level, method = "Wald", ...))
}

## FIXME: relatively trivial/only used in one place, could
## probably be eliminated? OR move to utils (also in glmmTMB?)
fix_ran_vals <- function(g) {
  term <- level <- estimate <- NULL
  r <- (g
      %>% tibble::rownames_to_column("level")
      %>% gather(term, estimate, -level)
  )
  return(r)
}

#' @rdname lme4_tidiers
#'
#' @param debug print debugging output?
#' @param effects A character vector including one or more of "fixed" (fixed-effect parameters); "ran_pars" (variances and covariances or standard deviations and correlations of random effect terms); "ran_vals" (conditional modes/BLUPs/latent variable estimates); or "ran_coefs" (predicted parameter values for each group, as returned by \code{\link[lme4]{coef.merMod}})
#' @param conf.int whether to include a confidence interval
#' @param conf.level confidence level for CI
#' @param conf.method method for computing confidence intervals (see \code{lme4::confint.merMod})
#' @param scales scales on which to report the variables: for random effects, the choices are \sQuote{"sdcor"} (standard deviations and correlations: the default if \code{scales} is \code{NULL}) or \sQuote{"vcov"} (variances and covariances). \code{NA} means no transformation, appropriate e.g. for fixed effects.
#' @param exponentiate  whether to exponentiate the fixed-effect coefficient estimates and confidence intervals (common for logistic regression); if \code{TRUE}, also scales the standard errors by the exponentiated coefficient, transforming them to the new scale
#' @param exponentiate_ran_coefs whether to exponentiate the predicted paramater values for each group
#' @param ran_prefix a length-2 character vector specifying the strings to use as prefixes for self- (variance/standard deviation) and cross- (covariance/correlation) random effects terms
#' @param profile pre-computed profile object, for speed when using \code{conf.method="profile"}
#' @param ddf.method  the method for computing the degrees of freedom and t-statistics (only applicable when using the \pkg{lmerTest} package: see \code{\link[lmerTest]{summary.lmerModLmerTest}}
#'
#' @return \code{tidy} returns one row for each estimated effect, either
#' with groups depending on the \code{effects} parameter.
#' It contains the columns
#'   \item{group}{the group within which the random effect is being estimated: \code{"fixed"} for fixed effects}
#'   \item{level}{level within group (\code{NA} except for modes)}
#'   \item{term}{term being estimated}
#'   \item{estimate}{estimated coefficient}
#'   \item{std.error}{standard error}
#'   \item{statistic}{t- or Z-statistic (\code{NA} for modes)}
#'   \item{p.value}{P-value computed from t-statistic (may be missing/NA)}
#'
#' @importFrom dplyr mutate bind_rows bind_cols
#' @importFrom tibble rownames_to_column
#' @importFrom tidyr gather spread
#' @importFrom purrr map
#' @importFrom nlme VarCorr ranef
#' @importFrom methods is selectMethod
#' @importFrom stats cov2cor

#' @export
tidy.merMod <- function(x, effects = c("ran_pars", "fixed"),
                        scales = NULL, ## c("sdcor","vcov",NA),
                        exponentiate = FALSE,
                        exponentiate_ran_coefs = FALSE,
                        ran_prefix = NULL,
                        conf.int = FALSE,
                        conf.level = 0.95,
                        conf.method = "Wald",
                        ddf.method = NULL,
                        profile = NULL,
                        debug = FALSE,
                        ...) {

  ## R CMD check false positives
  variable <- level <- term <- estimate <- std.error <- grpvar <-
    .id <- grp <- condval <- condsd <- NULL

  if(is.null(ddf.method)) {
      ddf.method <- if (inherits(x,"lmerModLmerTest")) "Satterthwaite" else NULL
  }
  effect_names <- c("ran_pars", "fixed", "ran_vals", "ran_coefs")
  if (!is.null(scales)) {
    if (length(scales) != length(effects)) {
      stop(
        "if scales are specified, values (or NA) must be provided ",
        "for each effect"
      )
    }
  }
  if (length(miss <- setdiff(effects, effect_names)) > 0) {
    stop("unknown effect type ", miss)
  }

  if (conf.int && conf.method == "profile" && !is.null(profile)) {
     p <- profile
  } else {
     p <- x
  }

  ret_list <- list()
  if ("fixed" %in% effects) {
      ## return tidied fixed effects
      ## not quite sure why implicit/automatic method dispatch doesn't work,
      ##  but we seem to need this to get the right summary for  merModLmerTest
      ##  objects ...
      sum_fun <- selectMethod("summary", class(x))
      if (inherits(x,"lmerModLmerTest")) {
          ss <- sum_fun(x, ddf=ddf.method)
      } else {
          ss <- sum_fun(x)
      }
      ret <- stats::coef(ss) %>%
          data.frame(check.names = FALSE) %>%
          rename_regex_match()

    if (debug) {
      cat("output from coef(summary(x)):\n")
      print(coef(ss))
    }

    ## need to save rownames before dplyr (mutate(), bind_cols())
    ##   destroys them ...
    ret <- tibblify(ret)

    if (conf.int) {
      if (is(x, "merMod") || is(x, "rlmerMod")) {
          cifix <- cifun(p, parm = "beta_", method = conf.method,
                         level = conf.level, ddf.method = ddf.method, ...)
      } else {
        ## ?? for glmmTMB?  check ...
        cifix <- cifun(p, level = conf.level, ...)
      }
      ret <- dplyr::bind_cols(ret, cifix)
    }
    ran_effs <- sprintf("ran_%s", c("pars", "modes", "coefs"))


    ## if (any(purrr::map_lgl(ran_effs, ~. %in% effects))) {
    ## add group="fixed" to tidy table for fixed effects
    ## ret <- mutate(ret,group="fixed")
    ## }

    if (exponentiate) {
      vv <- intersect(c("estimate", "conf.low", "conf.high"), names(ret))
      ret <- (ret
        %>% mutate_at(vars(vv), ~exp(.))
        %>% mutate(across(std.error, ~ . * estimate))
      )
    }


    ret_list$fixed <- reorder_frame(ret)
  }
  if ("ran_pars" %in% effects) {
    if (is.null(scales)) {
      rscale <- "sdcor"
    } else {
      rscale <- scales[effects == "ran_pars"]
    }
    if (!rscale %in% c("sdcor", "vcov")) {
      stop(sprintf("unrecognized ran_pars scale %s", sQuote(rscale)))
    }
    vc <- VarCorr(x)
    if (!is(x, "merMod") && grepl("^VarCorr", class(vc)[1])) {
      if (!is(x, "rlmerMod")) {
        ## hack: attempt to augment glmmADMB (or other)
        ##   values so we can use as.data.frame.VarCorr.merMod
        vc <- lapply(
          vc,
          function(x) {
            attr(x, "stddev") <- sqrt(diag(x))
            attr(x, "correlation") <- stats::cov2cor(x)
            x
          }
        )
        attr(vc, "useScale") <- (stats::family(x)$family == "gaussian")
      }
      class(vc) <- "VarCorr.merMod"
    }
    ## n.b. use order="lower.tri" here so that term order matches
    ## that returned by confint() !
    ret <- as.data.frame(vc, order="lower.tri")
    ## purrr::map_at?
    ret[] <- lapply(ret, function(x) if (is.factor(x)) {
        as.character(x)
      } else {
        x
      } )
    if (is.null(ran_prefix)) {
      ran_prefix <- switch(rscale,
        vcov = c("var", "cov"),
        sdcor = c("sd", "cor")
      )
    }

    ## don't try to assign as rowname (non-unique anyway),
    ## make it directly into a term column
    ret[["term"]] <- apply(ret[c("var1", "var2")], 1,
      ran_pars_name,
      ran_prefix = ran_prefix
    )

    ## keep only desired term, rename
    ret <- setNames(
      ret[c("grp", "term", rscale)],
      c("group", "term", "estimate")
    )

    ## these are in 'lower.tri' order, need to make sure this
    ## is matched in as.data.frame() below
    if (conf.int) {
        ciran <- cifun(p, parm = "theta_", method = conf.method, level = conf.level, ...)
        ret <- data.frame(ret, ciran, stringsAsFactors = FALSE)
    }
    ret_list$ran_pars <- ret
  }

  if ("ran_vals" %in% effects) {
    ## fix each group to be a tidy data frame

    ret <- (ranef(x, condVar = TRUE)
        %>% as.data.frame(stringsAsFactors = FALSE)
        %>% dplyr::rename(
                       group = grpvar, level = grp,
                       estimate = condval, std.error = condsd
                   )
        %>% dplyr::mutate_if(is.factor, as.character)
    )

    if (conf.int) {
      if (conf.method != "Wald") {
        stop("only Wald CIs available for conditional modes")
      }

      mult <- qnorm((1 + conf.level) / 2)
      ret <- ret %>% mutate(
        conf.low = estimate - mult * std.error,
        conf.high = estimate + mult * std.error
      )
    }

    ret_list$ran_vals <- ret
  }
  if ("ran_coefs" %in% effects) {
    ret <- coef(x) %>%
      purrr::map(fix_ran_vals) %>%
      bind_rows(.id = "group")

    if (conf.int) {
      warning("CIs not available for random-effects coefficients: returning NA")
      ret <- ret %>% mutate(conf.low = NA, conf.high = NA)
    }

    if (exponentiate_ran_coefs) {
      rc <- intersect("estimate", names(ret))
      ret <- (ret %>% mutate_at(vars(rc), ~exp(.)))
    }
    
    ret_list$ran_coefs <- ret
  }
  
  if (exponentiate_ran_coefs) {
    if(!"ran_coefs" %in% effects) {
      warning("Exponentiated random coefficients were specified, but random coefficients were not requested. Not returning random coefficients.")
    }
  }

  ret <- (ret_list
  %>%
    dplyr::bind_rows(.id = "effect")
    %>%
    as_tibble()
    %>%
    reorder_cols()
  )
  return(ret)
}

#' @rdname lme4_tidiers
#' @export
tidy.rlmerMod <- tidy.merMod

#' @rdname lme4_tidiers
#'
#' @param data original data this was fitted on; if not given this will
#' attempt to be reconstructed
#' @param newdata new data to be used for prediction; optional
#'
#' @template augment_NAs
#'
#' @return \code{augment} returns one row for each original observation,
#' with columns (each prepended by a .) added. Included are the columns
#'   \item{.fitted}{predicted values}
#'   \item{.resid}{residuals}
#'   \item{.fixed}{predicted values with no random effects}
#'
#' Also added for "merMod" objects, but not for "mer" objects,
#' are values from the response object within the model (of type
#' \code{lmResp}, \code{glmResp}, \code{nlsResp}, etc). These include \code{".mu",
#' ".offset", ".sqrtXwt", ".sqrtrwt", ".eta"}.
#'
#' @importFrom broom augment augment_columns
#' @importFrom dplyr bind_cols
#' @export
augment.merMod <- function(x, data = stats::model.frame(x), newdata, ...) {
  # move rownames if necessary
  if (missing(newdata)) {
    newdata <- NULL
  }
  ret <- suppressMessages(augment_columns(x, data, newdata, se.fit = NULL, ...))

  # add predictions with no random effects (population means)
  predictions <- stats::predict(x, re.form = NA)
  # some cases, such as values returned from nlmer, return more than one
  # prediction per observation. Not clear how those cases would be tidied
  if (length(predictions) == nrow(ret)) {
    ret$.fixed <- predictions
  }

  # columns to extract from resp reference object
  # these include relevant ones that could be present in lmResp, glmResp,
  # or nlsResp objects

  respCols <- c(
    "mu", "offset", "sqrtXwt", "sqrtrwt", "weights",
    "wtres", "gam", "eta"
  )
  cols <- lapply(respCols, function(cc) x@resp[[cc]])
  names(cols) <- paste0(".", respCols)

  ## remove too-long fields and empty fields
  n_vals <- vapply(cols,length,1L)
  min_n <- min(n_vals[n_vals>0])

  cols <- dplyr::bind_cols(cols[n_vals==min_n])

  cols <- insert_NAs(cols, ret)
  if (length(cols) > 0) {
    ret <- dplyr::bind_cols(ret, cols)
  }

  return(unrowname(ret))
}


#' @rdname lme4_tidiers
#'
#' @return \code{glance} returns one row with the columns
#'   \item{nobs}{the number of observations}
#'   \item{sigma}{the square root of the estimated residual variance}
#'   \item{logLik}{the data's log-likelihood under the model}
#'   \item{AIC}{the Akaike Information Criterion}
#'   \item{BIC}{the Bayesian Information Criterion}
#'   \item{deviance}{deviance}
#'
#' @rawNamespace if(getRversion()>='3.3.0') importFrom(stats, sigma) else importFrom(lme4,sigma)
#' @importFrom broom glance
#' @export
glance.merMod <- function(x, ...) {
  deviance <- NULL ## false-positive code checks
  ff <- finish_glance(x = x)
  if (lme4::isREML(x)) ff <- rename(ff, REMLcrit = deviance)
  return(ff)
}

##' Augmentation for random effects (for caterpillar plots etc.)
##'
##' @param x ranef (conditional mode) information from an lme4 fit, using \code{ranef(.,condVar=TRUE)}
##' @param ci.level level for confidence intervals
##' @param reorder reorder levels by conditional mode values?
##' @param order.var numeric or character: which variable to use for ordering levels?
##' @param \dots additional arguments (unused: for generic consistency)
##' @examples
##' if (require("lme4")) {
##'    load(system.file("extdata","lme4_example.rda",package="broom.mixed"))
##'    rr <- ranef(lmm1,condVar=TRUE)
##'    aa <- broom::augment(rr)
##'    ## Q-Q plot:
##'    if (require(ggplot2) && require(dplyr)) {
##'       g0 <- ggplot(aa,aes(estimate,qq,xmin=lb,xmax=ub))+
##'           geom_errorbarh(height=0)+
##'           geom_point()+facet_wrap(~variable,scale="free_x")
##'       ## regular caterpillar plot:
##'       g1 <- ggplot(aa,aes(estimate,level,xmin=lb,xmax=ub))+
##'          geom_errorbarh(height=0)+
##'          geom_vline(xintercept=0,lty=2)+
##'          geom_point()+facet_wrap(~variable,scale="free_x")
##'       ## emphasize extreme values
##'       aa2 <- group_by(aa,grp,level)
##'       aa3 <- mutate(aa2, keep=any(estimate/std.error>2))
##'       ## Update caterpillar plot with extreme levels highlighted
##'       ##  (highlight all groups with *either* extreme intercept *or*
##'       ##   extreme slope)
##'       ggplot(aa3, aes(estimate,level,xmin=lb,xmax=ub,colour=factor(keep)))+
##'          geom_errorbarh(height=0)+
##'          geom_vline(xintercept=0,lty=2)+
##'          geom_point()+facet_wrap(~variable,scale="free_x")+
##'          scale_colour_manual(values=c("black","red"), guide=FALSE)
##'    }
##' }
##' @importFrom stats ppoints
##' @export
augment.ranef.mer <- function(x,
                              ci.level = 0.9,
                              reorder = TRUE,
                              order.var = 1, ...) {
  variable <- level <- estimate <- std.error <- grpvar <-
    grp <- condval <- condsd <- NULL
  tmpf <- function(z) {
    if (is.character(order.var) && !order.var %in% names(z)) {
      order.var <- 1
      warning("order.var not found, resetting to 1")
    }
    ## would use plyr::name_rows, but want levels first
    zz <- data.frame(level = rownames(z), z, check.names = FALSE)
    if (reorder) {
      ## if numeric order var, add 1 to account for level column
      ov <- if (is.numeric(order.var)) order.var + 1 else order.var
      zz$level <- reorder(zz$level, zz[, order.var + 1], FUN = identity)
    }
    ## Q-Q values, for each column separately
    qq <- c(apply(z, 2, function(y) {
      qnorm(stats::ppoints(nrow(z)))[order(order(y))]
    }))
    rownames(zz) <- NULL
    pv <- attr(z, "postVar")
    cols <- 1:(dim(pv)[1])
    se <- unlist(lapply(cols, function(i) sqrt(pv[i, i, ])))
    ## n.b.: depends on explicit column-major ordering of se/melt
    zzz <- zz %>%
           tidyr::pivot_longer(-level, values_to = "estimate", names_to = "variable") %>%
           dplyr::arrange(variable, level)
    zzz <- cbind(zzz, qq = qq, std.error = se)
    ## reorder columns:
    subset(zzz, select = c(variable, level, estimate, qq, std.error))
  }
  dd <- purrr::map_dfr(x, tmpf, .id = "grp")

  ci.val <- -qnorm((1 - ci.level) / 2)
  transform(dd,
    ## p=2*pnorm(-abs(estimate/std.error)), ## 2-tailed p-val
    lb = estimate - ci.val * std.error,
    ub = estimate + ci.val * std.error
  )
}

## experimental
##' @export
tidy.gamm4 <- function(x, ...) {
  ## gamm4 returns an *unclassed* list of length two
  ## (mer, gam)
  r <- tidy(x$mer, ...)
  r$term <- gsub("^X", "", r$term)
  return(r)
}

##' @export
augment.gamm4 <- function(x, ...) {
  return(augment(x$mer, ...))
}

##' @export
glance.gamm4 <- function(x, ...) {
  return(glance(x$mer, ...))
}

##' @export
tidy.lmList4 <- function(x, conf.int = FALSE,
                         conf.level = 0.95, ...) {

    cols <- estimate <- std.error <- NULL ## R CMD check false positives

    ss <- summary(x)$coefficients
    names(dimnames(ss)) <- c("group","cols","terms")

    # flatten results cube
    tmp <- list()
    for (i in 1:dim(ss)[3]) {
      tmp[[i]] <- ss[, , i, drop=TRUE] %>%
                  as.data.frame %>%
                  tibble::rownames_to_column(var = "group") %>%
                  rename_regex_match() %>%
                  dplyr::mutate(`terms` = dimnames(ss)$terms[i])
    }
    tmp <- dplyr::bind_rows(tmp)
    tmp <- tmp[, unique(c("group", "terms", sort(colnames(tmp))))]
    tmp <- tmp[order(tmp$group, tmp$terms),]
    ret <- tibble::as_tibble(tmp)

    if (conf.int) {
        qq <- qnorm((1+conf.level)/2)
        ret <- (ret %>%
                mutate(conf.low=estimate-qq*std.error,
                       conf.high=estimate+qq*std.error)
        )
        ## don't think reorder_cols is necessary ...?
    }
    return(ret)
}

#' @export
tidy.allFit <- function(x, ...) {
  bad <- purrr::map_lgl(x, is, "error")
  purrr::map_dfr(x[!bad], tidy, ..., .id = "optimizer")
}

##' @importFrom purrr possibly
##' @export
glance.allFit <- function(x, ...) {
  bad <- purrr::map_lgl(x, is, "error")
  if (all(bad)) stop("all models bad")
  ## find first non-bad and fill with NA values
  dummy <- glance(x[!bad][[1]]) %>% mutate(across(everything(), ~ NA))
  res <- (purrr::map_dfr(x, purrr::possibly(glance, ..., otherwise = dummy),
                         .id = "optimizer")
    ## compute relative negative log-likelihood
    %>% mutate(NLL_rel = ifelse(is.na(logLik), Inf, max(logLik, na.rm=TRUE) - logLik))
  )
  return(res)
}