```
#' @title Weighted Variance
#' @description Calculate the weighted variance of `x`.
#' @param x A numeric vector.
#' @param w A numeric vector of weights of the same length as `x`.
#' @param na.rm Logical, whether NAs in `x` should be removed.
#' @details Calculate the weighted variance of `x` via the weighted covariance
#' matrix ([cov.wt()]). If no weights are supplied, the simple variance is
#' returned instead. As in [weighted.mean()], `NA`s in `w` are not handled
#' specially and will result in `NA` being returned.
#' @return A numeric value, the weighted variance of `x`.
#' @seealso [var()]
#' @examples
#' # Weighted variance
#' x <- rnorm(30)
#' w <- runif(30, 0, 1)
#' varW(x, w)
#'
#' # Simple variance
#' varW(x)
#' stopifnot(varW(x) == var(x))
#'
#' # NA handling
#' varW(c(x[1:29], NA), w, na.rm = TRUE) # NA in x (removed)
#' varW(c(x[1:29], NA), w, na.rm = FALSE) # NA in x (NA returned)
#' varW(x[1:29], w = c(w[1:29], NA)) # NA in w (NA returned)
#' @export
varW <- function(x, w = NULL, na.rm = FALSE) {
if (!is.null(w)) {
x.na <- is.na(x)
w.na <- is.na(w)
if (!((any(x.na) && !na.rm) || any(w.na))) {
x <- cbind(x[!x.na])
w <- as.vector(w)[!x.na]
cov.wt(x, w)$cov[1, ]
} else NA
} else var(x, na.rm = na.rm)
}
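# Illustrative check (added sketch, not package source): with weights,
# varW() matches the weighted covariance matrix from cov.wt() directly,
# as described in the details above.
x <- rnorm(50)
w <- runif(50)
stopifnot(all.equal(unname(varW(x, w)), cov.wt(cbind(x), w)$cov[1, 1]))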
#' @title Weighted Standard Deviation
#' @description Calculate the weighted standard deviation of `x`.
#' @param ... Arguments to [varW()].
#' @details This is simply a wrapper for [varW()], applying the square root to
#' the output.
#' @return A numeric value, the weighted standard deviation of `x`.
#' @seealso [sd()]
#' @export
sdW <- function(...) {
sqrt(varW(...))
}
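# Minimal usage sketch (added, not package source): sdW() is simply the
# square root of varW(), so without weights it reduces to sd().
x <- rnorm(30)
stopifnot(all.equal(sdW(x), sd(x)))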
#' @title Get Model Data
#' @description Extract the data used to fit a model.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param subset Logical. If `TRUE`, only observations used to fit the model(s)
#' are returned (i.e. missing observations (`NA`) or those with zero weight
#' are removed).
#' @param merge Logical. If `TRUE`, and `mod` is a list or nested list, a single
#' dataset containing all variables used to fit models is returned (variables
#' must be the same length).
#' @param env Environment in which to look for data (passed to [eval()]).
#' Defaults to the [formula()] environment.
#' @details This is a simple convenience function to return the data used to fit
#' a model, by evaluating the 'data' slot of the model call object. If the
#' 'data' argument of the model call was not specified, or is not a data frame
#' (or coercible to such) containing all variables referenced in the model
#' formula, an error will be thrown — this restriction is largely to ensure
#' that a single coherent dataset of all model variables can be made available
#' for resampling purposes.
#'
#' If `mod` is a list of models and `merge = TRUE`, all (unique) variables
#' used to fit models are merged into a single data frame. This will return an
#' error if `subset = TRUE` results in datasets with different numbers of
#' observations (rows).
#' @return A data frame of the variables used to fit the model(s), or a list or
#' nested list of same.
#' @seealso [getCall()]
#' @examples
#' # Get data used to fit SEM from Shipley (2009)
#' head(getData(shipley.sem[[1]])) # from single model
#' lapply(getData(shipley.sem), head) # from SEM (list)
#' head(getData(shipley.sem, merge = TRUE)) # from SEM (single dataset)
#' @export
getData <- function(mod, subset = FALSE, merge = FALSE, env = NULL) {
m <- mod
# Function
getData <- function(m) {
# Data from 'data' argument of model call
mc <- getCall(m)
f <- formula(m)
if (is.null(env)) env <- environment(f)
d <- eval(mc$data, env)
if (!is.null(d)) d <- data.frame(d) else
stop("'data' argument of model call not specified, or data is NULL.")
# All var names from model call
f <- c(f, mc$subset, mc$weights, mc$offset, mc$correlation)
vn <- unlist(lapply(f, all.vars))
if (!all(vn %in% names(d)))
stop("'data' does not contain all variables used to fit model.")
# Subset data for model observations?
if (subset) {
obs <- names(fitted(m))
w <- weights(m)
if (!is.null(w)) obs <- obs[w > 0 & !is.na(w)]
d[obs, ]
} else d
}
# Apply recursively
rgetData <- function(m) {
if (isList(m)) {
d <- lapply(m, rgetData)
if (merge) {
d <- do.call(cbind, unname(d))
d[unique(names(d))]
} else d
} else getData(m)
}
rgetData(m)
}
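# Illustrative sketch (added; hypothetical model): getData() re-evaluates
# the 'data' argument of the model call, optionally subsetting to the
# observations actually used in fitting.
d <- data.frame(y = c(NA, rnorm(19)), x = rnorm(20))
m <- lm(y ~ x, data = d)
nrow(getData(m)) # 20 (full dataset)
nrow(getData(m, subset = TRUE)) # 19 (row with missing 'y' dropped)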
#' @title Get Model Design Matrix
#' @description Return the design matrix for a fitted model, with some
#' additional options.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' Can also be one or more model formulas or character vectors of term names
#' (in which case `data` must be supplied).
#' @param data An optional dataset, used to refit the model(s) and/or construct
#' the design matrix.
#' @param contrasts Optional, a named list of contrasts to apply to factors (see
#' the `contrasts.arg` argument of [model.matrix()] for specification). These
#' will override any existing contrasts in the data or model call.
#' @param add.data Logical, whether to append data not specified in the model
#' formula (with factors converted to dummy variables).
#' @param centre,scale Logical, whether to mean-centre and/or scale terms by
#' standard deviations (for interactions, this is carried out prior to
#' construction of product terms). Alternatively, a numeric vector of
#' means/standard deviations (or other statistics) can be supplied, whose
#' names must match term names.
#' @param as.df Logical, whether to return the matrix as a data frame (without
#' modifying names).
#' @param merge Logical. If `TRUE`, and `mod` is a list or nested list, a single
#' matrix containing all terms is returned (variables must be the same
#' length).
#' @param env Environment in which to look for model data (if none supplied).
#' Defaults to the [formula()] environment.
#' @details This is primarily a convenience function to enable more flexible
#' construction of design matrices, usually for internal use and for further
#' processing. Use cases include processing and/or return of terms which may
#' not be present in a typical design matrix (e.g. constituents of product
#' terms, dummy variables).
#' @return A matrix or data frame of model(s) terms, or a list or nested list of
#' same.
#' @seealso [model.matrix()]
#' @examples
#' # Model design matrix (original)
#' m <- shipley.growth[[3]]
#' x1 <- model.matrix(m)
#' x2 <- getX(m)
#' stopifnot(all.equal(x1, x2, check.attributes = FALSE))
#'
#' # Using formula or term names (supply data)
#' d <- shipley
#' x1 <- getX(formula(m), data = d)
#' x2 <- getX(labels(terms(m)), data = d)
#' stopifnot(all.equal(x1, x2))
#'
#' # Scaled terms
#' head(getX(m, centre = TRUE, scale = TRUE))
#'
#' # Combined matrix for SEM
#' head(getX(shipley.sem, merge = TRUE))
#' head(getX(shipley.sem, merge = TRUE, add.data = TRUE)) # add other variables
#' @export
getX <- function(mod, data = NULL, contrasts = NULL, add.data = FALSE,
centre = FALSE, scale = FALSE, as.df = FALSE, merge = FALSE,
env = NULL) {
m <- mod; d <- data; ct <- contrasts; cn <- centre; sc <- scale
# Function
getX <- function(m) {
# Function to convert to data frame without modifying names
dF <- function(...) {
data.frame(..., check.names = FALSE)
}
# Function to evaluate a term name (x) in data (d)
eT <- function(x, d) {
eT <- function(x) {
if (!x %in% names(d)) {
eval(parse(text = x), d)
} else d[[x]]
}
if (grepl("poly\\(.*[0-9]$", x)) {
n <- nchar(x)
xd <- eT(substr(x, 1, n - 1))
xd[, substr(x, n, n)]
} else eT(x)
}
# Refit model with any supplied data
if (isMod(m) && !is.null(d)) {
m <- eval(update(m, data = d, evaluate = FALSE))
env <- environment()
}
# Data
if (isMod(m)) d <- getData(m, subset = TRUE, env = env)
if (is.null(d))
stop("'data' must be supplied.")
# Factor contrasts
if (isMod(m) && is.null(ct)) {
ct <- suppressWarnings(
attr(model.matrix(m, data = d), "contrasts")
)
}
# Formula(s)
i <- if (isMod(m)) attr(terms(m), "intercept") else any(isInt(m))
xn <- if (!is.character(m)) labels(terms(m, data = d)) else m
xn <- xn[!isInt(xn) & !grepl("[|]", xn)]
fd <- reformulate(c(xn, names(d)), intercept = i)
fx <- reformulate(xn, intercept = i)
# Design matrix (x)
d <- model.frame(fd, data = d, na.action = na.pass)
x <- suppressWarnings(
dF(model.matrix(fx, data = d, contrasts.arg = ct))
)
xn <- names(x)
obs <- rownames(x)
# Add other variables from data (convert factors to dummy vars)
d <- sapply(names(d), function(i) {
di <- d[[i]]
if (!is.numeric(di)) {
di <- as.factor(di)
ci <- if (!is.null(ct)) list(di = ct[[i]])
di <- cbind(
model.matrix( ~ 0 + di),
model.matrix( ~ di, contrasts.arg = ci)
)
colnames(di) <- gsub("^di", i, colnames(di))
}
di
}, simplify = FALSE)
d <- dF(do.call(cbind, d))
d <- d[unique(names(d))]
x <- dF(x, d[!names(d) %in% names(x)])
# Means/SDs (for centring/scaling)
xm <- sapply(names(x), function(i) {
if (!isFALSE(cn)) {
if (i %in% names(cn)) cn[[i]] else mean(x[[i]])
} else 0
})
xs <- sapply(names(x), function(i) {
if (!isFALSE(sc)) {
if (i %in% names(sc)) sc[[i]] else sd(x[[i]])
} else 1
})
# Names of terms to return
if (add.data) {
rn <- if (isMer(m)) {
unlist(lapply(lme4::VarCorr(m), rownames))
}
xn <- unique(c(names(x), rn))
}
# Construct final matrix
x <- sapply(xn, function(i) {
if (!isInt(i)) {
ii <- unlist(strsplit(i, "(?<!:):(?!:)", perl = TRUE))
xi <- sapply(ii, eT, x)
xn <- colnames(xi)
xi <- sweep(xi, 2, xm[xn])
xi <- sweep(xi, 2, xs[xn], "/")
apply(xi, 1, prod)
} else x[, i]
})
rownames(x) <- obs
# Return
if (as.df) dF(x) else x
}
# Apply recursively
rgetX <- function(m) {
if (isList(m)) {
x <- lapply(m, rgetX)
if (merge) {
x <- do.call(cbind, unname(x))
x[, unique(colnames(x)), drop = FALSE]
} else x
} else getX(m)
}
rgetX(m)
}
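# Brief sketch (added; hypothetical model): getX() reproduces the design
# matrix of a fitted model, with optional centring of terms.
d <- data.frame(y = rnorm(20), x = rnorm(20))
m <- lm(y ~ x, data = d)
stopifnot(all.equal(model.matrix(m), getX(m), check.attributes = FALSE))
head(getX(m, centre = TRUE)) # 'x' centred on its mean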
#' @title Get Model Term Names
#' @description Extract term names from a fitted model object.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param intercept Logical, whether the intercept should be included.
#' @param aliased Logical, whether names of aliased terms should be included
#' (see Details).
#' @param list Logical, whether names should be returned as a list, with all
#' multi-coefficient terms grouped under their main term names.
#' @param env Environment in which to look for model data (used to construct the
#' model frame). Defaults to the [formula()] environment.
#' @details Extract term names from a fitted model. Names of terms for which
#' coefficients cannot be estimated are also included if `aliased = TRUE`
#' (default). These may be terms which are perfectly correlated with other
#' terms in the model, so that the model design matrix is rank deficient.
#' @return A character vector or list/nested list of term names.
#' @examples
#' # Term names from Shipley SEM
#' m <- shipley.sem
#' xNam(m)
#' xNam(m, intercept = FALSE)
#'
#' # Model with different types of predictor (some multi-coefficient terms)
#' d <- data.frame(
#' y = rnorm(100),
#' x1 = rnorm(100),
#' x2 = as.factor(rep(c("a", "b", "c", "d"), each = 25)),
#' x3 = rep(1, 100)
#' )
#' m <- lm(y ~ poly(x1, 2) + x2 + x3, data = d)
#' xNam(m)
#' xNam(m, aliased = FALSE) # drop term that cannot be estimated (x3)
#' xNam(m, aliased = FALSE, list = TRUE) # names as list
#' @export
xNam <- function(mod, intercept = TRUE, aliased = TRUE, list = FALSE,
env = NULL) {
m <- mod
# Function
xNam <- function(m) {
# Term names
tt <- terms(m)
xn <- labels(tt)
int <- attr(tt, "intercept")
if (int) xn <- c("(Intercept)", xn)
# Expand interaction terms (list)
XN <- sapply(xn, function(i) {
unlist(strsplit(i, "(?<!:):(?!:)", perl = TRUE))
}, simplify = FALSE)
# Predictors
d <- getData(m, env = env)
x <- suppressWarnings(model.frame(m, data = d))
x <- x[names(x) %in% unlist(unname(XN))]
xn2 <- unique(c(xn, names(x)))
# Factor contrasts
ct <- suppressWarnings(
attr(model.matrix(m, data = d), "contrasts")
)
# Expand factor/matrix terms (list)
XN2 <- sapply(xn2, function(i) {
if (i %in% names(x)) {
xi <- x[[i]]
j <- colnames(xi)
n <- ncol(xi)
if (!is.numeric(xi)) {
xi <- as.factor(xi)
ci <- list(xi = ct[[i]])
# Build matrix under a temporary name so 'xi' stays a factor for
# levels()/nlevels() below
mm <- model.matrix( ~ xi, contrasts.arg = ci)
j <- gsub("^xi", "", colnames(mm)[-1])
j <- c(levels(xi), j)
n <- nlevels(xi)
}
j <- unique(c("", seq(n), j))
paste0(i, j)
} else i
}, simplify = FALSE)
# Expand interaction terms involving factors/matrices (list)
XN <- sapply(xn, function(i) {
if (isInx(i)) {
j <- expand.grid(XN2[XN[[i]]])
apply(j, 1, paste, collapse = ":")
} else XN2[[i]]
}, simplify = FALSE)
# Drop some names? (aliased, subsetted factor levels, etc.)
b <- if (isMer(m)) lme4::fixef(m, add.dropped = TRUE) else coef(m)
bn <- names(b)
if (!aliased) bn <- bn[!is.na(b)]
XN <- lapply(XN, function(i) bn[bn %in% i])
XN <- XN[sapply(XN, length) > 0]
# Drop intercept?
if (!intercept && int) XN <- XN[-1]
# Return names
if (!list) unlist(unname(XN)) else XN
}
# Apply recursively
rMapply(xNam, m, SIMPLIFY = FALSE)
}
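# Quick sketch (added; hypothetical model): multi-coefficient terms, such
# as the factor 'f' here, expand into their coefficient names.
d <- data.frame(y = rnorm(30), f = gl(3, 10))
m <- lm(y ~ f, data = d)
xNam(m) # "(Intercept)" "f2" "f3" (default treatment contrasts)
xNam(m, list = TRUE) # same names, grouped under "f"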
#' @title Generalised Link Transformation
#' @description Transform a numeric variable using a GLM link function, or
#' return an estimate of same.
#' @param x a positive numeric vector, corresponding to a variable to be
#' transformed. Can also be a list or nested list of such vectors.
#' @param family Optional, the error distribution family containing the link
#' function which will be used to transform `x` (see [family()] for
#' specification details). If not supplied, it is determined from `x` (see
#' Details).
#' @param force.est Logical, whether to force the return of the estimated rather
#' than direct transformation, where the latter is available (i.e. does not
#' contain undefined values).
#' @details `glt()` can be used to provide a 'generalised' transformation of a
#' numeric variable, using the link function from a generalised linear model
#' (GLM) fit to the variable. The transformation is generalised in the sense
#' that it can extend even to cases where a standard link transformation would
#' produce undefined values. It achieves this by using an estimate based on
#' the 'working' response variable of the GLM (see below). If the error
#' distribution `family` is not specified (default), then it is determined
#' (roughly) from `x`, with `binomial(link = "logit")` used when all x <= 1
#' and `poisson(link = "log")` otherwise. Although the function is generally
#' intended for variables with a binomial or Poisson distribution, any
#' variable which can be fit using [glm()] can be supplied. One of the key
#' purposes of `glt()` is to allow the calculation of fully standardised
#' effects (coefficients) for GLMs (in which case `x` = the response
#' variable), while it can also facilitate the proper calculation of SEM
#' indirect effects (see below).
#'
#' **Estimating the direct link transformation**
#'
#' A key challenge in generating fully standardised effects for a GLM with a
#' non-gaussian link function is the difficulty in calculating appropriate
#' standardised ranges (typically the standard deviation) for the response
#' variable in the link scale. This is because a direct transformation of the
#' response will often produce undefined values. Although methods for
#' circumventing this issue by indirectly estimating the variance of the
#' response on the link scale have been proposed, including a
#' latent-theoretic approach for binomial models (McKelvey & Zavoina 1975) and
#' a more general variance-based method using pseudo R-squared (Menard 2011),
#' here an alternative approach is used. Where transformed values are
#' undefined, the function will instead return the synthetic 'working'
#' response from the iteratively reweighted least squares (IRLS) algorithm of
#' the GLM (McCullagh & Nelder 1989). This is reconstructed as the sum of the
#' linear predictor and the working residuals — with the latter comprising the
#' error of the model on the link scale. The advantage of this approach is
#' that a relatively straightforward 'transformation' of any non-gaussian
#' response is readily attainable in all cases. The standard deviation (or
#' other relevant range) can then be calculated using values of the
#' transformed response and used to scale the effects. An additional benefit
#' for piecewise SEMs is that the transformed rather than original response
#' can be specified as a predictor in other models, ensuring that standardised
#' indirect and total effects are calculated correctly (i.e. using the same
#' units).
#'
#' To ensure a high level of 'accuracy' in the working response – in the sense
#' that the inverse-transformation is practically indistinguishable from the
#' original response variable – the function uses the following iterative
#' fitting procedure to calculate a 'final' working response:
#'
#' 1. A new GLM of the same error family is fit with the original response
#' variable as both predictor and response, and using a single IRLS iteration.
#'
#' 2. The working response is reconstructed from this model.
#'
#' 3. The inverse transformation of the working response is calculated.
#'
#' 4. If the inverse transformation is 'effectively equal' to the original
#' response (tested using [all.equal()] with the default tolerance of
#' `1.5e-8`), the working response is returned; otherwise, the GLM is refit
#' with the working response now as the predictor, and steps 2-4 are repeated
#' — each time with an additional IRLS iteration.
#'
#' This approach will generate a very reasonable transformation of the
#' response variable, which will also be practically indistinguishable from
#' the direct transformation, where this can be compared (see Examples). It
#' also ensures that the transformed values, and hence the standard deviation,
#' are the same for any GLM fitting the same response (provided it uses the
#' same link function) — facilitating model comparisons, selection, and
#' averaging.
#'
#' @note As we often cannot directly observe the GLM response variable on the
#' link scale, any method estimating its values or statistics will be wrong to
#' some degree. The heuristic approach described here aims to reduce this
#' error as far as (reasonably) possible, while also generating standardised
#' effects whose interpretation most closely resembles those of the ordinary
#' linear model. The solution of using the working response from the GLM to
#' scale effects is a pragmatic yet reasonable one, taking advantage of
#' modern computing power to minimise error through iterative
#' model fitting. An added bonus is that the estimated variance is constant
#' across models fit to the same response variable, which cannot be said of
#' previous methods (Menard 2011). The overall approach would be classed as
#' 'observed-empirical' by Grace *et al.* (2018), as it utilises model error
#' variance (the estimated working residuals) rather than theoretical
#' distribution-specific variance.
#' @return A numeric vector of the transformed values, or an array, list of
#' vectors/arrays, or nested list.
#' @references Grace, J.B., Johnson, D.J., Lefcheck, J.S. and Byrnes, J.E.K.
#' (2018) Quantifying relative importance: computing standardized effects in
#' models with binary outcomes. *Ecosphere*, **9**, e02283. \doi{10/gdm5bj}
#'
#' McCullagh P. and Nelder, J. A. (1989) *Generalized Linear Models* (2nd
#' Edition). London: Chapman and Hall.
#'
#' McKelvey, R. D., & Zavoina, W. (1975). A statistical model for the analysis
#' of ordinal level dependent variables. *The Journal of Mathematical
#' Sociology*, **4**(1), 103-120. \doi{10/dqfhpp}
#'
#' Menard, S. (2011) Standards for Standardized Logistic Regression
#' Coefficients. *Social Forces*, **89**, 1409-1428. \doi{10/bvxb6s}
#' @examples
#' # Compare estimate with a direct link transformation
#' # (test with a poisson variable, log link)
#' set.seed(13)
#' y <- rpois(30, lambda = 10)
#' yl <- unname(glt(y, force.est = TRUE))
#'
#' # Effectively equal?
#' all.equal(log(y), yl)
#' # TRUE
#'
#' # Actual difference...
#' all.equal(log(y), yl, tolerance = .Machine$double.eps)
#' # "Mean relative difference: 2.489317e-10"
#' @export
glt <- function(x, family = NULL, force.est = FALSE) {
f <- family
# Function
glt <- function(x) {
# Error family
if (is.character(f)) {
f <- get(f, mode = "function", envir = parent.frame())
}
if (is.function(f)) f <- f()
if (is.null(f)) {
f <- if (all(x <= 1)) binomial() else poisson()
}
# Transform variable to link scale
xl <- f$linkfun(as.numeric(x))
# Return transformation or estimate (GLM working response)
if (any(is.infinite(xl)) || force.est) {
x2 <- x
m <- suppressWarnings(
do.call(glm, list(x ~ x2, family = f, control = list(maxit = 1),
na.action = na.exclude))
)
i <- 0
repeat {
xl <- predict(m) + resid(m, type = "working")
xli <- f$linkinv(xl)
eql <- isTRUE(all.equal(x, xli, tolerance = sqrt(.Machine$double.eps),
check.names = FALSE))
if (eql) {
return(xl)
}
i <- i + 1
m <- suppressWarnings(
update(m, . ~ xl, control = list(maxit = i))
)
}
} else xl
}
# Apply recursively
rMapply(glt, x)
}
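# Sketch of the underlying identity (added; standard GLM theory): the
# 'working' response equals the linear predictor plus the working
# residuals, which is what glt() iterates on where the direct link
# transformation is undefined.
y <- rpois(50, lambda = 5)
m <- glm(y ~ 1, family = poisson)
z <- predict(m) + resid(m, type = "working")
head(cbind(direct = log(y), working = z)) # log(y) is -Inf where y == 0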
#' @title Get Model Response Variable
#' @description Extract the response variable from a fitted model on the
#' original or link scale.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param data An optional dataset, used to first refit the model(s).
#' @param link Logical. If `TRUE`, return the GLM response variable on the link
#' scale (see Details).
#' @param offset Logical. If `TRUE`, include model offset(s) in the response.
#' @param env Environment in which to look for model data (if none supplied).
#' Defaults to the [formula()] environment.
#' @details `getY()` will return the response variable from a model by summing
#' the fitted values and the response residuals. If `link = TRUE` and the
#' model is a GLM, the response is transformed to the link scale. If this
#' results in undefined values, an estimate based on the 'working' response
#' variable of the GLM is returned instead (see [glt()]).
#'
#' Any offset variables are subtracted from the response by default. This
#' means that, for example, rates rather than raw counts will be returned for
#' poisson GLMs (where applicable).
#' @return A numeric vector comprising the response variable on the original or
#' link scale, or an array, list of vectors/arrays, or nested list.
#' @examples
#' # All SEM responses (original scale)
#' head(getY(shipley.sem))
#'
#' # Estimated response in link scale from binomial model
#' head(getY(shipley.sem$Live, link = TRUE))
#' @export
getY <- function(mod, data = NULL, link = FALSE, offset = FALSE, env = NULL) {
m <- mod; d <- data
# Function
getY <- function(m) {
# Refit model with any supplied data
if (!is.null(d)) {
m <- eval(update(m, data = d, evaluate = FALSE))
env <- environment()
}
# Model error family
f <- if (isBet(m)) m$link$mean else family(m)
# Model weights and offset
w <- weights(m)
o <- if (!offset) {
d <- getData(m, subset = TRUE, env = env)
mf <- suppressWarnings(model.frame(m, data = d))
model.offset(mf)
}
# Model response
y <- fitted(m) + resid(m, type = "response")
if (!is.null(w)) y <- y[w > 0 & !is.na(w)]
if (!is.null(o)) y <- f$linkinv(f$linkfun(y) - o)
y[zapsmall(y) == 0] <- 0
an <- names(attributes(y))
attributes(y)[an != "names"] <- NULL
# Return response on original or link scale
if (isGlm(m) && link) glt(y, family = f) else y
}
# Apply recursively
rMapply(getY, m)
}
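# Minimal check (added; hypothetical model): the response is recovered as
# the fitted values plus the response-scale residuals.
d <- data.frame(y = rpois(30, 5), x = rnorm(30))
m <- glm(y ~ x, family = poisson, data = d)
stopifnot(all.equal(unname(getY(m)), d$y))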
#' @title Generalised Variance Inflation Factors
#' @description Calculate generalised variance inflation factors for terms in a
#' fitted model(s) via the variance-covariance matrix of coefficients.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param data An optional dataset, used to first refit the model(s).
#' @param env Environment in which to look for model data (if none supplied).
#' Defaults to the [formula()] environment.
#' @details `VIF()` calculates generalised variance inflation factors (GVIF) as
#' described in Fox & Monette (1992), and also implemented in
#' [`car::vif()`](https://rdrr.io/cran/car/man/vif.html). However, whereas
#' `vif()` returns both GVIF and GVIF^(1/(2*Df)) values, `VIF()` simply
#' returns the squared result of the latter measure, which equals the standard
#' VIF for single-coefficient terms and is the equivalent measure for
#' multi-coefficient terms (e.g. categorical or polynomial). Also, while
#' `vif()` returns values per model term (i.e. predictor variable), `VIF()`
#' returns values per coefficient, meaning that the same value will be
#' returned per coefficient for multi-coefficient terms. Finally, `NA` is
#' returned for any terms which could not be estimated in the model (e.g.
#' aliased).
#' @return A numeric vector of the VIFs, or an array, list of vectors/arrays, or
#' nested list.
#' @references Fox, J. and Monette, G. (1992) Generalized Collinearity
#' Diagnostics. *Journal of the American Statistical Association*, **87**,
#' 178-183. \doi{10/dm9wbw}
#' @examples
#' # Model with two correlated terms
#' m <- shipley.growth[[3]]
#' VIF(m) # Date & DD somewhat correlated
#' VIF(update(m, . ~ . - DD)) # drop DD
#'
#' # Model with different types of predictor (some multi-coefficient terms)
#' d <- data.frame(
#' y = rnorm(100),
#' x1 = rnorm(100),
#' x2 = as.factor(rep(c("a", "b", "c", "d"), each = 25)),
#' x3 = rep(1, 100)
#' )
#' m <- lm(y ~ poly(x1, 2) + x2 + x3, data = d)
#' VIF(m)
#' @export
VIF <- function(mod, data = NULL, env = NULL) {
m <- mod; d <- data
# Function
VIF <- function(m) {
# Refit model with any supplied data
if (!is.null(d)) {
m <- eval(update(m, data = d, evaluate = FALSE))
env <- environment()
}
# Term names
XN <- xNam(m, intercept = FALSE, list = TRUE, env = env)
xn <- xNam(m, intercept = FALSE, aliased = FALSE, env = env)
# VIFs
if (length(xn) > 1) {
# T/F for terms as matrices
if (is.null(d)) d <- getData(m, env = env)
mf <- suppressWarnings(model.frame(m, data = d))
mat <- sapply(names(XN), function(i) {
if (i %in% names(mf)) class(mf[, i])[1] == "matrix" else FALSE
})
# Var-cov/cor matrix
V <- as.matrix(vcov(m))[xn, xn]
R <- cov2cor(V)
det.R <- det(R)
# Function
VIF <- function(i) {
if (all(i %in% xn)) {
j <- !xn %in% i
Ri <- R[i, i, drop = FALSE]
Rj <- R[j, j, drop = FALSE]
vif <- det(Ri) * det(Rj) / det.R
p <- 1 / (2 * length(i))
(vif^p)^2
} else NA
}
# Calculate
vif <- Map(function(i, j) {
if (j) sapply(i, VIF) else {
vif <- VIF(i)
sapply(i, function(x) vif)
}
}, XN, mat)
unlist(unname(vif))
} else {
sapply(unlist(unname(XN)), function(i) {
if (i %in% xn) 1 else NA
})
}
}
# Apply recursively
rMapply(VIF, m, SIMPLIFY = FALSE)
}
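# Worked check (added; classical result): for single-coefficient terms,
# the generalised VIF reduces to 1 / (1 - R^2) from regressing that
# predictor on the others.
d <- data.frame(y = rnorm(100), x1 = rnorm(100))
d$x2 <- d$x1 + rnorm(100) # x2 correlated with x1
m <- lm(y ~ x1 + x2, data = d)
r2 <- summary(lm(x1 ~ x2, data = d))$r.squared
stopifnot(all.equal(unname(VIF(m)["x1"]), 1 / (1 - r2)))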
#' @title Root Variance Inflation Factors
#' @description Calculate root variance inflation factors (RVIF) for terms in a
#' fitted model(s), i.e. the square root of the (generalised) VIFs.
#' @param ... Arguments to [VIF()].
#' @details RVIFs quantify the inflation of estimate standard errors due to
#' multicollinearity among predictors, and also of estimates themselves
#' compared to the 'unique' (residualised) effects. RVIFs may often be more
#' practical than VIFs for assessing multicollinearity, relating more directly
#' to the parameters of interest.
#' @return A numeric vector of the RVIFs, or an array, list of vectors/arrays,
#' or nested list.
#' @export
RVIF <- function(...) {
rMapply(sqrt, VIF(...), SIMPLIFY = FALSE)
}
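# Relationship sketch (added): RVIF() is the square root of VIF(), i.e.
# the factor by which a coefficient's standard error is inflated by
# multicollinearity.
d <- data.frame(y = rnorm(100), x1 = rnorm(100))
d$x2 <- d$x1 + rnorm(100)
m <- lm(y ~ x1 + x2, data = d)
stopifnot(all.equal(RVIF(m), sqrt(VIF(m))))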
#' @title R-squared
#' @description Calculate (Pseudo) R-squared for a fitted model, defined here as
#' the squared multiple correlation between the observed and fitted values for
#' the response variable. 'Adjusted' and 'Predicted' versions are also
#' calculated (see Details).
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param data An optional dataset, used to first refit the model(s).
#' @param adj,pred Logical. If `TRUE` (default), adjusted and/or predicted
#' R-squared are also returned (the latter is not available for all model
#' types).
#' @param offset Logical. If `TRUE`, include model offset(s) in the calculations
#' (i.e. in the response variable and fitted values).
#' @param re.form For mixed models of class `"merMod"`, the formula for random
#' effects to condition on when generating fitted values used in the
#' calculation of R-squared. Defaults to `NULL`, meaning all random effects
#' are included. See [predict.merMod()] for further specification details.
#' @param type The type of correlation coefficient to use. Can be `"pearson"`
#' (default) or `"spearman"`.
#' @param adj.type The type of adjusted R-squared estimator to use. Can be
#' `"olkin-pratt"` (default) or `"ezekiel"`. See Details.
#' @param positive.only Logical, whether to return only positive values for
#' R-squared (negative values replaced with zero).
#' @param env Environment in which to look for model data (if none supplied).
#' Defaults to the [formula()] environment.
#' @details Various approaches to the calculation of a goodness of fit measure
#' for GLMs analogous to R-squared in the ordinary linear model have been
#' proposed. Generally termed 'pseudo R-squared' measures, they include
#' variance-based, likelihood-based, and distribution-specific approaches.
#' Here however, a more straightforward definition is used, which can be
#' applied to any model for which fitted values of the response variable are
#' generated: R-squared is calculated as the squared (weighted) correlation
#' between the observed and fitted values of the response (in the original
#' units). This is simply the squared version of the correlation measure
#' advocated by Zheng & Agresti (2000), itself an intuitive measure of
#' goodness of fit describing the predictive power of a model. As the measure
#' does not depend on any specific error distribution or model estimating
#' procedure, it is also generally comparable across many different types of
#' model (Kvalseth 1985). In the case of the ordinary linear model, the
#' measure is exactly equal to the traditional R-squared based on sums of
#' squares.
#'
#' If `adj = TRUE` (default), the 'adjusted' R-squared value is also returned,
#' which provides an estimate of the population — as opposed to sample —
#' R-squared. This is achieved via an analytical formula which adjusts
#' R-squared using the 'degrees of freedom' of the model (i.e. the ratio of
#' observations to parameters), helping to counter multiple R-squared's
#' positive bias and guard against overfitting of the model to noise in the
#' original sample. By default, this is calculated via the exact 'Olkin-Pratt'
#' estimator, shown in recent simulations to be the optimal unbiased
#' population R-squared estimator across a range of estimators and
#' specification scenarios (Karch 2020), and thus a good general first choice,
#' even for smaller sample sizes. Setting `adj.type = "ezekiel"` however will
#' use the simpler and more common 'Ezekiel' formula, which can be more
#' appropriate where minimising the mean squared error (MSE) of the estimate
#' is more important than strict unbiasedness (Hittner 2019, Karch 2020).
#'
#' If `pred = TRUE` (default), a 'predicted' R-squared is also returned, which
#' is calculated via the same formula as for R-squared but using
#' cross-validated, rather than original, fitted values. These are obtained by
#' dividing the model residuals (in the response scale) by the complement of
#' the observation leverages (diagonals of the hat matrix), then subtracting
#' these inflated 'predicted' residuals from the response variable. This is
#' essentially a short cut to obtaining 'out-of-sample' predictions, normally
#' arising via a 'leave-one-out' cross-validation procedure (they are not
#' exactly equal for GLMs). The resulting R-squared is an estimate of the
#' R-squared that would result were the model fit to new data, and will be
#' lower than the original — and likely also the adjusted — R-squared,
#' highlighting the loss of explanatory power due to sample noise. Predicted
#' R-squared [may be a more powerful and general indicator of overfitting than
#' adjusted
#' R-squared](https://stats.stackexchange.com/questions/242770/difference-between-adjusted-r-squared-and-predicted-r-squared),
#' as it provides a true out-of-sample test. This measure is a variant of an
#' [existing
#' one](https://www.r-bloggers.com/2014/05/can-we-do-better-than-r-squared/),
#' calculated by substituting the 'PRESS' statistic, i.e. the sum of squares
#' of the predicted residuals (Allen 1974), for the residual sum of squares in
#' the classic R-squared formula. It is not calculated here for GLMMs, as the
#' interpretation of the hat matrix is not reliable (see
#' [hatvalues.merMod()]).
#'
#' For models fitted with one or more offsets, these will be removed by
#' default from the response variable and fitted values prior to calculations.
#' Thus R-squared will measure goodness of fit only for the predictors with
#' estimated, rather than fixed, coefficients. This is likely to be the
#' intended behaviour in most circumstances, though if users wish to include
#' variation due to the offset(s) they can set `offset = TRUE`.
#'
#' For mixed models, the function will, by default, calculate all R-squared
#' measures using fitted values incorporating both the fixed and random
#' effects, thus encompassing all variation captured by the model. This is
#' equivalent to the 'conditional' R-squared of Nakagawa *et al.* (2017)
#' (though see that reference for a more advanced approach to R-squared for
#' mixed models). To include only some or no random effects, simply set the
#' appropriate formula using the argument `re.form`, which is passed directly
#' to [predict.merMod()]. If `re.form = NA`, R-squared is calculated for the
#' fixed effects only, i.e. the 'marginal' R-squared of Nakagawa *et al.*
#' (2017).
#'
#' As R-squared is calculated here as a squared correlation, the `type` of
#' correlation coefficient can also be specified. Setting this to `"spearman"`
#' may be desirable in some cases, such as where the relationship between
#' response and fitted values is not necessarily bivariate normal or linear,
#' and a correlation of the ranks may be more informative and/or general. This
#' purely monotonic R-squared can also be considered a [useful goodness of fit
#' measure](https://stats.stackexchange.com/questions/44268/reporting-coefficient-of-determination-using-spearmans-rho),
#' and may be more appropriate for comparisons between GLMs and ordinary
#' linear models in some scenarios.
#'
#' R-squared values produced by this function will by default be in the range
#' 0-1, meaning that any negative values arising from calculations will be
#' converted to zero. Negative values essentially mean that the fit is 'worse'
#' than the null expectation of no relationship between the variables, which
#' can be difficult to interpret in practice and in any case usually only
#' occurs in rare situations, such as where the intercept is suppressed or
#' where a low value of R-squared is adjusted downwards via an analytic
#' estimator. Such values are also 'impossible' in practice, given that
#' R-squared, as generally understood, is a strictly non-negative measure.
#' Hence, for
#' simplicity and ease of interpretation, values less than zero are presented
#' as a complete lack of model fit. This is also recommended by Shieh (2008),
#' who shows for adjusted R-squared that such 'positive-part' estimators have
#' lower MSE in estimating the population R-squared (though higher bias). To
#' allow return of negative values however, set `positive.only = FALSE`. This
#' may be desirable for simulation purposes, and/or where strict unbiasedness
#' is prioritised.
#'
#' @note Caution must be exercised in interpreting the values of any pseudo
#' R-squared measure calculated for a GLM or mixed model (including those
#' produced by this function), as such measures do not hold all the properties
#' of R-squared in the ordinary linear model and as such may not always behave
#' as expected. Care must also be taken in comparing the measures to their
#' equivalents from ordinary linear models, particularly the adjusted and
#' predicted versions, as assumptions and/or calculations may not generalise
#' well. With that being said, the value of standardised R-squared measures
#' for even 'rough' model fit assessment and comparison may outweigh such
#' reservations, and the adjusted and predicted versions in particular may aid
#' the user in diagnosing and preventing overfitting. They should NOT,
#' however, replace other measures such as AIC or BIC for formally comparing
#' and/or ranking competing models fit to the same response variable.
#' @return A numeric vector of the R-squared value(s), or an array, list of
#' vectors/arrays, or nested list.
#' @references Allen, D. M. (1974). The Relationship Between Variable Selection
#' and Data Augmentation and a Method for Prediction. *Technometrics*,
#' **16**(1), 125-127. \doi{10/gfgv57}
#'
#' Hittner, J. B. (2019). Ezekiel’s classic estimator of the population
#' squared multiple correlation coefficient: Monte Carlo-based extensions and
#' refinements. *The Journal of General Psychology*, **147**(3), 213–227.
#' \doi{10/gk53wb}
#'
#' Karch, J. (2020). Improving on Adjusted R-Squared. *Collabra: Psychology*,
#' **6**(1). \doi{10/gkgk2v}
#'
#' Kvalseth, T. O. (1985) Cautionary Note about R2. *The American
#' Statistician*, **39**(4), 279-285. \doi{10/b8b782}
#'
#' Nakagawa, S., Johnson, P.C.D. and Schielzeth, H. (2017) The coefficient of
#' determination R2 and intra-class correlation coefficient from generalized
#' linear mixed-effects models revisited and expanded. *Journal of the Royal
#' Society Interface*, **14**(134). \doi{10/gddpnq}
#'
#' Shieh, G. (2008). Improved Shrinkage Estimation of Squared Multiple
#' Correlation Coefficient and Squared Cross-Validity Coefficient.
#' *Organizational Research Methods*, **11**(2), 387–407. \doi{10/bcwqf3}
#'
#' Zheng, B. and Agresti, A. (2000) Summarizing the predictive power of a
#' generalized linear model. *Statistics in Medicine*, **19**(13), 1771-1781.
#' \doi{10/db7rfv}
#' @examples
#' # Pseudo R-squared for mixed models
#' R2(shipley.sem) # fixed + random ('conditional')
#' R2(shipley.sem, re.form = ~ (1 | tree)) # fixed + 'tree'
#' R2(shipley.sem, re.form = ~ (1 | site)) # fixed + 'site'
#' R2(shipley.sem, re.form = NA) # fixed only ('marginal')
#' R2(shipley.sem, re.form = NA, type = "spearman") # using Spearman's Rho
#'
#' # Predicted R-squared: compare cross-validated predictions calculated/
#' # approximated via the hat matrix to standard method (leave-one-out)
#'
#' # Fit test models using Shipley data — compare lm vs glm
#' d <- na.omit(shipley)
#' m <- lm(Live ~ Date + DD + lat, d)
#' # m <- glm(Live ~ Date + DD + lat, binomial, d)
#'
#' # Manual CV predictions (leave-one-out)
#' cvf1 <- sapply(1:nrow(d), function(i) {
#' m.ni <- update(m, data = d[-i, ])
#' predict(m.ni, d[i, ], type = "response")
#' })
#'
#' # Short-cut via the hat matrix
#' y <- getY(m)
#' f <- fitted(m)
#' cvf2 <- y - (y - f) / (1 - hatvalues(m))
#'
#' # Compare predictions (not exactly equal for GLMs)
#' all.equal(cvf1, cvf2)
#' # lm: TRUE; glm: "Mean relative difference: 1.977725e-06"
#' cor(cvf1, cvf2)
#' # lm: 1; glm: 0.9999987
#'
#' # NOTE: comparison not tested here for mixed models, as hierarchical data can
#' # complicate the choice of an appropriate leave-one-out procedure. However,
#' # there is no obvious reason why use of the leverage values (diagonals of the
#' # hat matrix) to estimate CV predictions shouldn't generalise, roughly, to
#' # the mixed model case (at least for LMMs). In any case, users should
#' # exercise the appropriate caution in interpretation.
#' @export
R2 <- function(mod, data = NULL, adj = TRUE, pred = TRUE, offset = FALSE,
re.form = NULL, type = c("pearson", "spearman"),
adj.type = c("olkin-pratt", "ezekiel"), positive.only = TRUE,
env = NULL) {
m <- mod; d <- data; rf <- re.form; type <- match.arg(type);
adj.type <- match.arg(adj.type)
# Function
R2 <- function(m) {
# Refit model with any supplied data
if (!is.null(d)) {
m <- eval(update(m, data = d, evaluate = FALSE))
env <- environment()
}
# Degrees of freedom
n <- nobs(m)
i <- attr(terms(m), "intercept")
ndf <- n - i
rdf <- df.residual(m)
# No. predictors
k <- ndf - rdf
# R-squared
R2 <- if (k > 0) {
# Model link function
f <- if (isBet(m)) m$link$mean else family(m)
lF <- f$linkfun
lI <- f$linkinv
# Model weights
w <- weights(m)
if (is.null(w)) w <- rep(1, n)
w <- w[w > 0 & !is.na(w)]
# Model offset
d <- getData(m, subset = TRUE, env = env)
o <- if (!offset) {
mf <- suppressWarnings(model.frame(m, data = d))
model.offset(mf)
}
if (is.null(o)) o <- 0
# Response
y <- getY(m, offset = offset, env = env)
obs <- names(y)
# Fitted values
f <- predict(m, re.form = rf)[obs]
f <- lI(f - o)
# Correlation
yf <- cbind(y, f)
if (type == "spearman") yf <- apply(yf, 2, rank)
cm <- cov.wt(yf, w, cor = TRUE, center = as.logical(i))
R <- cm$cor[1, 2]
if (is.na(R)) R <- 0
# Return R-squared
if (positive.only) R <- max(R, 0)
if (R < 0) -R^2 else R^2
} else 0
# Adjusted R-squared
R2a <- if (adj) {
if (k > 0) {
# Variance unexplained (model 'error')
e <- 1 - R2
# Estimator (some code from altR2:::OPExactEstimator)
R2a <- if (adj.type == "olkin-pratt") {
1 - e * (ndf - 2) / rdf * gsl::hyperg_2F1(1, 1, (rdf + 2) / 2, e)
} else {
1 - e * ndf / rdf
}
# Return adjusted R-squared
if (positive.only) R2a <- max(R2a, 0)
R2a
} else 0
}
# Predicted R-squared
R2p <- if (pred) {
if (!isGls(m) && !(isMer(m) && isGlm(m))) {
if (k > 0) {
# Leverage values (diagonals of the hat matrix, hii)
hii <- hatvalues(m)[obs]
s <- hii < 1
# CV fitted values (response minus 'predicted' residuals)
rp <- (y - f) / (1 - hii)
f <- y - rp
# Correlation
yf[, 2] <- if (type == "spearman") rank(f) else f
cm <- cov.wt(yf[s, ], w[s], cor = TRUE, center = as.logical(i))
Rp <- cm$cor[1, 2]
# Return predicted R-squared
if (positive.only) Rp <- max(Rp, 0)
if (Rp < 0) -Rp^2 else Rp^2
} else 0
} else NA
}
# Return values
c("R.squared" = R2, "R.squared.adj" = R2a, "R.squared.pred" = R2p)
}
# Apply recursively
rMapply(R2, m)
}
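# Quick check (added; classical result): for an ordinary linear model, the
# squared correlation between observed and fitted values equals the usual
# sums-of-squares R-squared.
d <- data.frame(y = rnorm(50), x = rnorm(50))
m <- lm(y ~ x, data = d)
stopifnot(all.equal(unname(R2(m, adj = FALSE, pred = FALSE)), summary(m)$r.squared))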
#' @title Weighted Average of Model Estimates
#' @description Calculate a weighted average of model estimates (e.g. effects,
#' fitted values, residuals) for a set of models.
#' @param est A list or nested list of numeric vectors, comprising the model
#' estimates. In the latter case, these should correspond to estimates for
#' candidate models for each of a set of different response variables.
#' @param weights An optional numeric vector of weights to use for model
#' averaging, or a named list of such vectors. The former should be supplied
#' when `est` is a list, and the latter when it is a nested list (with
#' matching list names). If `weights = "equal"` (default), a simple average is
#' calculated instead.
#' @param est.names An optional vector of names used to extract and/or sort
#' estimates from the output.
#' @details This function can be used to calculate a weighted average of model
#' estimates such as effects, fitted values, or residuals, where models are
#' typically competing candidate models fit to the same response variable.
#' Weights are typically a 'weight of evidence' type metric such as Akaike
#' model weights (Burnham & Anderson 2002, Burnham *et al.* 2011), which can
#' be conveniently calculated in *R* using packages such as
#' [MuMIn](https://cran.r-project.org/package=MuMIn) or
#' [AICcmodavg](https://cran.r-project.org/package=AICcmodavg). However,
#' numeric weights of any sort can be used. If none are supplied, a simple
#' average is calculated instead.
#'
#' Averaging is performed via the 'full'/'zero' rather than
#' 'subset'/'conditional'/'natural' method, meaning that zero is substituted
#' for estimates for any 'missing' parameters (e.g. effects) prior to
#' calculations. This provides a form of shrinkage and thus reduces [estimate
#' bias](https://stackoverflow.com/questions/53055050/predicted-values-with-mumin-throwing-error-when-full-false)
#' (Burnham & Anderson 2002, Grueber *et al.* 2011).
#' @return A numeric vector of the model-averaged estimates, or a list of such
#' vectors.
#' @references Burnham, K. P., & Anderson, D. R. (2002). *Model Selection and
#' Multimodel Inference: A Practical Information-Theoretic Approach* (2nd
#' ed.). New York: Springer-Verlag. Retrieved from
#' <https://www.springer.com/gb/book/9780387953649>
#'
#' Burnham, K. P., Anderson, D. R., & Huyvaert, K. P. (2011). AIC model
#' selection and multimodel inference in behavioral ecology: some background,
#' observations, and comparisons. *Behavioral Ecology and Sociobiology*,
#' **65**(1), 23-35. \doi{10/c4mrns}
#'
#' Dormann, C. F., Calabrese, J. M., Guillera-Arroita, G., Matechou, E., Bahn,
#' V., Barton, K., ... Hartig, F. (2018). Model averaging in ecology: a review
#' of Bayesian, information-theoretic, and tactical approaches for predictive
#' inference. *Ecological Monographs*, **88**(4), 485-504. \doi{10/gfgwrv}
#'
#' Grueber, C. E., Nakagawa, S., Laws, R. J., & Jamieson, I. G. (2011).
#' Multimodel inference in ecology and evolution: challenges and solutions.
#' *Journal of Evolutionary Biology*, **24**(4), 699-711. \doi{10/b7b5d4}
#'
#' Walker, J. A. (2019). Model-averaged regression coefficients have a
#' straightforward interpretation using causal conditioning. *BioRxiv*,
#' 133785. \doi{10/c8zt}
#' @examples
#' # Model-averaged effects (coefficients)
#' m <- shipley.growth # candidate models
#' e <- lapply(m, function(i) coef(summary(i))[, 1])
#' avgEst(e)
#'
#' # Using weights
#' w <- runif(length(e), 0, 1)
#' avgEst(e, w)
#'
#' # Model-averaged predictions
#' f <- lapply(m, predict)
#' head(avgEst(f, w))
#' @export
avgEst <- function(est, weights = "equal", est.names = NULL) {
e <- est; w <- weights; en <- est.names
# Weights
if (all(w == "equal")) {
eqW <- function(x) {
rep(1, length(x))
}
w <- if (any(sapply(e, isList))) lapply(e, eqW) else eqW(e)
}
# Function
avgEst <- function(e, w) {
# Sort names (for effects)
en2 <- unique(unlist(lapply(e, names)))
num <- suppressWarnings(as.numeric(en2))
if (all(is.na(num))) {
s1 <- isInt(en2)
s2 <- isPhi(en2) | isR2(en2)
en3 <- sort(en2[!s1 & !s2])
en3 <- names(sort(sapply(en3, function(i) {
lengths(regmatches(i, gregexpr(":", i)))
})))
en2 <- c(en2[s1], en3, en2[s2])
}
en <- if (!is.null(en)) en[en %in% en2] else en2
# Combine estimates into table (missing = zero)
e <- do.call(cbind, lapply(e, function(i) {
sapply(en, function(j) {
if (j %in% names(i)) i[[j]] else 0
})
}))
# Weighted average
apply(e, 1, weighted.mean, w)
}
# Apply recursively
rMapply(avgEst, e, w, SIMPLIFY = FALSE)
}
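# Toy sketch (added; hypothetical estimates): under 'full' averaging, a
# term missing from a model contributes zero for that model.
e <- list(m1 = c("(Intercept)" = 1, x1 = 0.5, x2 = 0.2),
m2 = c("(Intercept)" = 1.2, x1 = 0.4))
avgEst(e, weights = c(0.6, 0.4)) # x2: 0.6 * 0.2 + 0.4 * 0 = 0.12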
#' @title Standardised Effects
#' @description Calculate fully standardised effects (model coefficients) in
#' standard deviation units, adjusted for multicollinearity.
#' @param mod A fitted model object, or a list or nested list of such objects.
#' @param weights An optional numeric vector of weights to use for model
#' averaging, or a named list of such vectors. The former should be supplied
#' when `mod` is a list, and the latter when it is a nested list (with
#' matching list names). If set to `"equal"`, a simple average is calculated
#' instead.
#' @param data An optional dataset, used to first refit the model(s).
#' @param term.names An optional vector of names used to extract and/or sort
#' effects from the output.
#' @param unique.eff Logical, whether unique effects should be calculated
#' (adjusted for multicollinearity among predictors).
#' @param cen.x,cen.y Logical, whether effects should be calculated as if from
#' mean-centred variables.
#' @param std.x,std.y Logical, whether effects should be scaled by the standard
#' deviations of variables.
#' @param refit.x Logical, whether the model should be refit with mean-centred
#' predictor variables (see Details).
#' @param incl.raw Logical, whether to append the raw (unstandardised) effects
#' to the output.
#' @param R.squared Logical, whether R-squared values should also be calculated
#' (via [R2()]).
#' @param R2.arg A named list of additional arguments to [R2()] (where
#' applicable), excepting argument `env`. Ignored if `R.squared = FALSE`.
#' @param env Environment in which to look for model data (if none supplied).
#' Defaults to the [formula()] environment.
#' @details `stdEff()` will calculate fully standardised effects (coefficients)
#' in standard deviation units for a fitted model or list of models. It
#' achieves this via adjusting the 'raw' model coefficients, so no
#' standardisation of input variables is required beforehand. Users can simply
#' specify the model with all variables in their original units and the
#' function will do the rest. However, the user is free to scale and/or centre
#' any input variables should they choose, which should not affect the outcome
#' of standardisation (provided any scaling is by standard deviations). This
#' may be desirable in some cases, such as to increase numerical stability
#' during model fitting when variables are on widely different scales.
#'
#' If arguments `cen.x` or `cen.y` are `TRUE`, effects will be calculated as
#' if all predictors (x) and/or the response variable (y) were mean-centred
#' prior to model-fitting (including any dummy variables arising from
#' categorical predictors). Thus, for an ordinary linear model where centring
#' of x and y is specified, the intercept will be zero — the mean (or weighted
#' mean) of y. In addition, if `cen.x = TRUE` and there are interacting terms
#' in the model, all effects for lower order terms of the interaction are
#' adjusted using an expression which ensures that each main effect or lower
#' order term is estimated at the mean values of the terms they interact with
#' (zero in a 'centred' model) — typically improving the interpretation of
#' effects. The expression used comprises a weighted sum of all the effects
#' that contain the lower order term, with the weight for the term itself
#' being zero and those for 'containing' terms being the product of the means
#' of the other variables involved in that term (i.e. those not in the lower
#' order term itself). For example, for a three-way interaction (x1 * x2 *
#' x3), the expression for main effect \eqn{\beta1} would be:
#'
#' \deqn{\beta_{1} + \beta_{12}\bar{x}_{2} + \beta_{13}\bar{x}_{3} +
#' \beta_{123}\bar{x}_{2}\bar{x}_{3}}{\beta1 + (\beta12 * x̄2) + (\beta13 *
#' x̄3) + (\beta123 * x̄2 * x̄3)} (adapted from
#' [here](https://stats.stackexchange.com/questions/65898/why-could-centering-independent-variables-change-the-main-effects-with-moderatio))
#'
#' In addition, if `std.x = TRUE` or `unique.eff = TRUE` (see below), product
#' terms for interactive effects will be recalculated using mean-centred
#' variables, to ensure that standard deviations and variance inflation
#' factors (VIF) for predictors are calculated correctly (the model must be
#' refit for this latter purpose, to recalculate the variance-covariance
#' matrix).
#'
#' If `std.x = TRUE`, effects are scaled by multiplying by the standard
#' deviations of predictor variables (or terms), while if `std.y = TRUE` they
#' are divided by the standard deviation of the response variable (minus any
#' offsets). If the model is a GLM, this latter is calculated using the
#' link-transformed response (or an estimate of same) generated using the
#' function [glt()]. If both arguments are true, the effects are regarded as
#' 'fully' standardised in the traditional sense, often referred to as
#' 'betas'.
#'
#' If `unique.eff = TRUE` (default), effects are adjusted for
#' multicollinearity among predictors by dividing by the square root of the
#' VIFs (Dudgeon 2016, Thompson *et al.* 2017; [RVIF()]). If they have also
#' been scaled by the standard deviations of x and y, this converts them to
#' semipartial correlations, i.e. the correlation between the unique
#' components of predictors (residualised on other predictors) and the
#' response variable. This measure of effect size is arguably much more
#' interpretable and useful than the traditional standardised coefficient, as
#' it always represents the unique effects of predictors and so can more
#' readily be compared both within and across models. Values range from zero
#' to +/- one rather than +/- infinity (as in the case of betas) — putting
#' them on the same scale as the bivariate correlation between predictor and
#' response. In the case of GLMs however, the measure is analogous but not
#' exactly equal to the semipartial correlation, so its values may not always
#' be bound between +/- one (such cases are likely rare). Importantly, for
#' ordinary linear models, the square of the semipartial correlation equals
#' the increase in R-squared when that variable is included last in the model
#' — directly linking the measure to unique variance explained. See
#' [here](https://www.daviddisabato.com/blog/2016/4/8/on-effect-sizes-in-multiple-regression)
#' for additional arguments in favour of the use of semipartial correlations.
#'
#' If `refit.x`, `cen.x`, and `unique.eff` are `TRUE` and there are
#' interaction terms in the model, the model will be refit with any
#' (newly-)centred continuous predictors, in order to calculate correct VIFs
#' from the variance-covariance matrix. However, refitting may not be
#' necessary in some circumstances, for example where predictors have already
#' been mean-centred, and whose values will not subsequently be resampled
#' (e.g. parametric bootstrap). Setting `refit.x = FALSE` in such cases will
#' save time, especially with larger/more complex models and/or bootstrap
#' runs.
#'
#' If `incl.raw = TRUE`, raw (unstandardised) effects can also be appended,
#' i.e. those with all centring and scaling options set to `FALSE` (though
#' still adjusted for multicollinearity, where applicable). These may be of
#' interest in some cases, for example to compare their bootstrapped
#' distributions with those of standardised effects.
#'
#' If `R.squared = TRUE`, model R-squared values are appended to effects via
#' the [R2()] function, with any additional arguments passed via `R2.arg`.
#'
#' Finally, if `weights` are specified, the function calculates a weighted
#' average of standardised effects across a set (or sets) of different
#' candidate models for a particular response variable(s) (Burnham & Anderson
#' 2002), via the [avgEst()] function.
#' @return A numeric vector of the standardised effects, or a list or nested
#' list of such vectors.
#' @references Burnham, K. P., & Anderson, D. R. (2002). *Model Selection and
#' Multimodel Inference: A Practical Information-Theoretic Approach* (2nd
#' ed.). New York: Springer-Verlag. Retrieved from
#' <https://www.springer.com/gb/book/9780387953649>
#'
#' Dudgeon, P. (2016). A Comparative Investigation of Confidence Intervals for
#' Independent Variables in Linear Regression. *Multivariate Behavioral
#' Research*, **51**(2-3), 139-153. \doi{10/gfww3f}
#'
#' Thompson, C. G., Kim, R. S., Aloe, A. M., & Becker, B. J. (2017).
#' Extracting the Variance Inflation Factor and Other Multicollinearity
#' Diagnostics from Typical Regression Results. *Basic and Applied Social
#' Psychology*, **39**(2), 81-90. \doi{10/gfww2w}
#' @examples
#' library(lme4)
#'
#' # Standardised (direct) effects for SEM
#' m <- shipley.sem
#' stdEff(m)
#' stdEff(m, cen.y = FALSE, std.y = FALSE) # x-only
#' stdEff(m, std.x = FALSE, std.y = FALSE) # centred only
#' stdEff(m, cen.x = FALSE, cen.y = FALSE) # scaled only
#' stdEff(m, unique.eff = FALSE) # include multicollinearity
#' stdEff(m, R.squared = TRUE) # add R-squared
#' stdEff(m, incl.raw = TRUE) # add unstandardised
#'
#' # Demonstrate equality with effects from manually-standardised variables
#' # (gaussian models only)
#' m <- shipley.growth[[3]]
#' d <- data.frame(scale(na.omit(shipley)))
#' e1 <- stdEff(m, unique.eff = FALSE)
#' e2 <- coef(summary(update(m, data = d)))[, 1]
#' stopifnot(all.equal(e1, e2))
#'
#' # Demonstrate equality with square root of increment in R-squared
#' # (ordinary linear models only)
#' m <- lm(Growth ~ Date + DD + lat, data = shipley)
#' r2 <- summary(m)$r.squared
#' e1 <- stdEff(m)[-1]
#' en <- names(e1)
#' e2 <- sapply(en, function(i) {
#' f <- reformulate(en[!en %in% i])
#' r2i <- summary(update(m, f))$r.squared
#' sqrt(r2 - r2i)
#' })
#' stopifnot(all.equal(e1, e2))
#'
#' # Model-averaged standardised effects
#' m <- shipley.growth # candidate models
#' w <- runif(length(m), 0, 1) # weights
#' stdEff(m, w)
#' @export
stdEff <- function(mod, weights = NULL, data = NULL, term.names = NULL,
unique.eff = TRUE, cen.x = TRUE, cen.y = TRUE, std.x = TRUE,
std.y = TRUE, refit.x = TRUE, incl.raw = FALSE,
R.squared = FALSE, R2.arg = NULL, env = NULL) {
m <- mod; w <- weights; d <- data; en <- term.names
# Function
stdEff <- function(m) {
# Refit model with any supplied data
if (!is.null(d)) {
m <- eval(update(m, data = d, evaluate = FALSE))
env <- environment()
}
# Model coefficients
b <- if (isMer(m)) lme4::fixef(m, add.dropped = TRUE) else coef(m)
bn <- names(b)
int <- any(isInt(bn)) # intercept?
# Effects (subset only relevant parameters)
e <- na.omit(b[!isPhi(bn)])
en <- names(e)[!isInt(names(e))]
k <- length(en)
# Model weights
w <- weights(m)
if (is.null(w)) w <- rep(1, nobs(m))
w <- w[w > 0 & !is.na(w)]
# Response
y <- getY(m, env = env)
obs <- names(y)
# Centre/scale (x)
if (k > 0) {
# Predictors
x <- getX(m, add.data = TRUE, as.df = TRUE, env = env)
xm <- colMeans(x)
inx <- any(isInx(en)) # interactions?
# Adjust intercept (and possibly effects/predictors)
if (cen.x) {
# Model offset
d <- getData(m, subset = TRUE, env = env)
mf <- suppressWarnings(model.frame(m, data = d))
o <- model.offset(mf)
if (is.null(o)) o <- 0
# Adjust intercept (set to mean of predicted y)
if (int) {
f <- predict(m, re.form = NA)[obs] - o
e[1] <- weighted.mean(f, w)
}
# For interactions, adjust effects and centre predictors
if (inx) {
# Adjust lower-order terms
# (ti = terms containing term i; ni = non-i components of ti)
sI <- function(x) {
unlist(strsplit(x, "(?<!:):(?!:)", perl = TRUE))
}
EN <- sapply(en, sI)
e[en] <- sapply(en, function(i) {
ei <- e[[i]]
ii <- EN[[i]]
ti <- en[en != i & sapply(en, function(j) all(ii %in% EN[[j]]))]
if (length(ti) > 0) {
ei + sum(sapply(ti, function(j) {
jj <- EN[[j]]
ni <- jj[!jj %in% ii]
prod(e[j], xm[ni])
}))
} else ei
})
# Centre predictors (for correct SDs for product terms)
if (std.x) {
x <- getX(m, centre = TRUE, as.df = TRUE, env = env)
}
}
}
# Scale effects (x)
if (std.x) {
xs <- sapply(x[en], sdW, w)
e[en] <- e[en] * xs
}
# Calculate unique effects of predictors (adjust for multicollinearity)
if (unique.eff && k > 1) {
# Refit model with centred predictors
# (to calculate correct VIFs for interacting terms)
m2 <- if (cen.x && inx && refit.x) {
# Update dataset
# (add response, weights, offset; set sum contrasts for factors)
d2 <- data.frame(y, w, o, d)
d2 <- data.frame(sapply(d2, function(i) {
if (!is.numeric(i)) {
i <- factor(i)
contrasts(i) <- contr.sum(levels(i))
}
return(i)
}))
# Update term names (scale numeric predictors)
XN <- lapply(labels(terms(m)), sI)
xn <- unlist(lapply(XN, function(i) {
paste(sapply(i, function(j) {
if (is.numeric(mf[, j])) paste0("scale(", j, ")") else j
}), collapse = ":")
}))
# Add random effects
if (isMer(m)) {
re <- lme4::findbars(formula(m))
re <- sapply(re, function(i) paste0("(", deparse(i), ")"))
xn <- c(xn, re)
}
# Rename any existing terms named "y", "w", "o"
if (any(c("y", "w", "o") %in% names(d))) {
xn <- sapply(xn, function(i) {
i <- gsub("([^\\w.])", "~\\1~", i, perl = TRUE)
i <- unlist(strsplit(i, "~"))
s <- i %in% c("y", "w", "o")
i[s] <- paste0(i[s], ".1")
paste(i, collapse = "")
})
}
# Refit model
eval(update(m, reformulate(xn, "y", int), data = d2, weights = w,
offset = o, contrasts = NULL, evaluate = FALSE))
} else m
# Divide effects by RVIFs
rvif <- na.omit(RVIF(m2, env = environment()))
e[en] <- e[en] / rvif
if (incl.raw) b[en] <- b[en] / rvif
}
}
# Centre intercept (y)
if (cen.y && int) {
ym <- weighted.mean(y, w)
if (isGlm(m)) {
f <- if (isBet(m)) m$link$mean else family(m)
ym <- f$linkfun(ym)
}
e[1] <- e[1] - ym
}
# Scale effects (y)
if (std.y) {
if (isGlm(m)) y <- getY(m, link = TRUE, env = env)
ys <- sdW(y, w)
e <- e / ys
}
# Return standardised effects (optionally append R-squared/raw effects)
e <- sapply(bn, function(i) {
if (i %in% names(e)) e[[i]] else b[[i]]
})
if (R.squared) {
r2 <- do.call(R2, c(list(m, env = env), R2.arg))
names(r2) <- paste0("(", names(r2), ")")
e <- c(e, r2)
}
if (incl.raw) {
b <- c(b, e[isR2(names(e))])
names(b) <- paste0("(raw)_", names(b))
c(e, b)
} else e
}
# Apply recursively
e <- rMapply(stdEff, m, SIMPLIFY = FALSE)
# Output effects or weighted average
if (isList(e) && !is.null(w)) avgEst(e, w, en) else {
if (!is.null(en)) {
rMapply(function(i) {
i[en[en %in% names(i)]]
}, e, SIMPLIFY = FALSE)
} else e
}
}
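# Minimal sketch (added; hypothetical model, mirroring the roxygen example
# above): fully standardised effects equal the coefficients from a refit
# on pre-standardised data (gaussian case, unique.eff = FALSE).
d <- data.frame(y = rnorm(50), x = rnorm(50))
m <- lm(y ~ x, data = d)
e1 <- stdEff(m, unique.eff = FALSE)
e2 <- coef(lm(y ~ x, data = data.frame(scale(d))))
stopifnot(all.equal(e1, e2))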
```
