##############################################################################
# Copyright (c) 2012-2020 Russell V. Lenth #
# #
# This file is part of the emmeans package for R (*emmeans*) #
# #
# *emmeans* is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 2 of the License, or #
# (at your option) any later version. #
# #
# *emmeans* is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with R and *emmeans*. If not, see #
# <https://www.r-project.org/Licenses/> and/or #
# <http://www.gnu.org/licenses/>. #
##############################################################################
# Reference grid code
# Change to cov.reduce specification: can be...
# a function: is applied to all covariates
# named list of functions: applied to those covariates (else mean is used)
# TRUE - same as mean
# FALSE - same as function(x) sort(unique(x))
#' Create a reference grid from a fitted model
#'
#' Using a fitted model object, determine a reference grid for which estimated
#' marginal means are defined. The resulting \code{ref_grid} object encapsulates
#' all the information needed to calculate EMMs and make inferences on them.
#'
#' To users, the \code{ref_grid} function itself is important because most of
#' its arguments are in effect arguments of \code{\link{emmeans}} and related
#' functions, in that those functions pass their \code{...} arguments to
#' \code{ref_grid}.
#'
#' The reference grid consists of combinations of independent variables over
#' which predictions are made. Estimated marginal means are defined as these
#' predictions, or marginal averages thereof. The grid is determined by first
#' reconstructing the data used in fitting the model (see
#' \code{\link{recover_data}}), or by using the \code{data.frame} provided in
#' \code{data}. The default reference grid is determined by the observed levels
#' of any factors, the ordered unique values of character-valued predictors, and
#' the results of \code{cov.reduce} for numeric predictors. These may be
#' overridden using \code{at}. See also the section below on
#' recovering/overriding model information.
#'
#'
#' @param object An object produced by a supported model-fitting function, such
#' as \code{lm}. Many models are supported. See
#' \href{../doc/models.html}{\code{vignette("models", "emmeans")}}.
#' @param at Optional named list of levels for the corresponding variables
#' @param cov.reduce A function, logical value, or formula; or a named list of
#' these. Each covariate \emph{not} specified in \code{cov.keep} or \code{at}
#' is reduced according to these specifications. See the section below on
#' \dQuote{Using \code{cov.reduce} and \code{cov.keep}}.
#' @param cov.keep Character vector: names of covariates that are \emph{not}
#' to be reduced; these are treated as factors and used in weighting calculations.
#' \code{cov.keep} may also include integer value(s), and if so, the maximum
#' of these is used to set a threshold such that any covariate having no more
#' than that many unique values is automatically included in \code{cov.keep}.
#' @param mult.names Character value: the name(s) to give to the
#' pseudo-factor(s) whose levels delineate the elements of a multivariate
#' response. If this is provided, it overrides the default name(s) used for
#' \code{class(object)} when it has a multivariate response (e.g., the default
#' is \code{"rep.meas"} for \code{"mlm"} objects).
#' @param mult.levs A named list of levels for the dimensions of a multivariate
#' response. If there is more than one element, the combinations of levels are
#' used, in \code{\link{expand.grid}} order. The (total) number of levels must
#' match the number of dimensions. If \code{mult.name} is specified, this
#' argument is ignored.
#' @param options If non-\code{NULL}, a named \code{list} of arguments to pass
#' to \code{\link{update.emmGrid}}, just after the object is constructed.
#' @param data A \code{data.frame} to use to obtain information about the
#' predictors (e.g. factor levels). If missing, then
#' \code{\link{recover_data}} is used to attempt to reconstruct the data. See
#' the note with \code{\link{recover_data}} for an important precaution.
#' @param df Numeric value. This is equivalent to specifying \code{options(df =
#' df)}. See \code{\link{update.emmGrid}}.
#' @param type Character value. If provided, this is saved as the
#' \code{"predict.type"} setting. See \code{\link{update.emmGrid}} and the
#' section below on prediction types and transformations.
#' @param regrid Character, logical, or list. If non-missing, the reference
#' grid is reconstructed via \code{\link{regrid}} with the argument
#' \code{transform = regrid}. See the section below on prediction types and
#' transformations. \emph{Note:} This argument was named \code{transform} in
#' version 1.7.2 and earlier. For compatibility with old code, \code{transform}
#' is still accepted if found among \code{...},
#' as long as it doesn't match \code{tran}.
#' @param nesting If the model has nested fixed effects, this may be specified
#' here via a character vector or named \code{list} specifying the nesting
#' structure. Specifying \code{nesting} overrides any nesting structure that
#' is automatically detected. See the section below on Recovering or Overriding
#' Model Information.
#' @param offset Numeric scalar value (if a vector, only the first element is
#' used). This may be used to add an offset, or override offsets based on the
#' model. A common usage would be to specify \code{offset = 0} for a Poisson
#' regression model, so that predictions from the reference grid become rates
#' relative to the offset that had been specified in the model.
#' @param sigma Numeric value to use for subsequent predictions or
#' back-transformation bias adjustments. If not specified, we use
#' \code{sigma(object)}, if available, and \code{NULL} otherwise.
#' Note: This applies only when the family is \code{"gaussian"}; for other families,
#' \code{sigma} is set to \code{NA} and cannot be overridden.
#' @param counterfactuals,wt.counter,avg.counter \code{counterfactuals} specifies character
#' names of counterfactual factors. If this is non-missing, a reference grid
#' is created consisting of combinations of counterfactual levels and a constructed
#' factor \code{.obs.no.} having a level for each observation in the dataset.
#' By default, this grid is re-gridded with the response transformation
#' and averaged over \code{.obs.no.} (by default, with equal weights, but
#' a vector of weights may be specified in \code{wt.counter}; it must be
#' of length equal to the number of observations in the dataset).
#' If \code{avg.counter} is set to \code{FALSE}, this averaging is disabled.
#' See the section below on counterfactuals.
#' @param nuisance,non.nuisance,wt.nuis If \code{nuisance} is a vector of predictor names,
#' those predictors are omitted from the reference grid. Instead, the result
#' will be as if we had averaged over the levels of those factors, with either
#' equal or proportional weights as specified in \code{wt.nuis} (see the
#' \code{weights} argument in \code{\link{emmeans}}). The factors in
#' \code{nuisance} must not interact with other factors, not even other
#' nuisance factors. Specifying nuisance factors can save considerable
#' storage and computation time, and help avoid exceeding the maximum
#' reference-grid size (\code{get_emm_option("rg.limit")}).
#' @param rg.limit Integer limit on the number of reference-grid rows to allow
#' (checked before any multivariate responses are included).
#' @param ... Optional arguments passed to \code{\link{summary.emmGrid}},
#' \code{\link{emm_basis}}, and
#' \code{\link{recover_data}}, such as \code{params}, \code{vcov.} (see
#' \bold{Covariance matrix} below), or options such as \code{mode} for
#' specific model types (see \href{../doc/models.html}{vignette("models",
#' "emmeans")}).
#'
#' @section Using \code{cov.reduce} and \code{cov.keep}:
#' The \code{cov.keep} argument was not available in \pkg{emmeans} versions
#' 1.4.1 and earlier. Any covariates named in this list are treated as if they
#' are factors: all the unique levels are kept in the reference grid. The user
#' may also specify an integer value, in which case any covariate having no more
#' than that number of unique values is implicitly included in \code{cov.keep}.
#' The default for \code{cov.keep} is set and retrieved via the
#' \code{\link{emm_options}} framework, and the system default is \code{"2"},
#' meaning that covariates having only two unique values are automatically
#' treated as two-level factors. See also the Note below on backward compatibility.
#'
#' There is a subtle distinction between including a covariate in \code{cov.keep}
#' and specifying its values manually in \code{at}: Covariates included in
#' \code{cov.keep} are treated as factors for purposes of weighting, while
#' specifying levels in \code{at} will not include the covariate in weighting.
#' See the \code{mtcars.lm} example below for an illustration.
#'
#' \code{cov.reduce} may be a function,
#' logical value, formula, or a named list of these.
#' If a single function, it is applied to each covariate.
#' If logical and \code{TRUE}, \code{mean} is used. If logical and
#' \code{FALSE}, it is equivalent to including all covariates in
#' \code{cov.keep}. Use of \samp{cov.reduce = FALSE} is inadvisable because it
#' can result in a huge reference grid; it is far better to use
#' \code{cov.keep}.
#'
#' If a formula (which must be two-sided), then a model is fitted to that
#' formula using \code{\link{lm}}; then in the reference grid, its response
#' variable is set to the results of \code{\link{predict}} for that model,
#' with the reference grid as \code{newdata}. (This is done \emph{after} the
#' reference grid is determined.) A formula is appropriate here when you think
#' experimental conditions affect the covariate as well as the response.
#'
#' To allow for situations where a simple \code{lm()} call as described above won't
#' be adequate, a formula of the form \code{ext ~ fcnname} is also supported,
#' where the left-hand side may be \code{ext}, \code{extern}, or
#' \code{external} (and must \emph{not} be a predictor name) and the
#' right-hand side is the name of an existing function. The function is called
#' with one argument, a data frame with columns for each variable in the
#' reference grid. The function is expected to use that frame as new data to
#' be used to obtain predictions for one or more models; and it should return
#' a named list or data frame with replacement values for one or more of the
#' covariates.
#'
#' If \code{cov.reduce} is a named list, then the above criteria are used to
#' determine what to do with covariates named in the list. (However, formula
#' elements do not need to be named, as those names are determined from the
#' formulas' left-hand sides.) Any unresolved covariates are reduced using
#' \code{"mean"}.
#'
#' Any \code{cov.reduce} of \code{cov.keep} specification for a covariate
#' also named in \code{at} is ignored.
#'
#'
#' @section Interdependent covariates: Care must be taken when covariate values
#' depend on one another. For example, when a polynomial model was fitted
#' using predictors \code{x}, \code{x2} (equal to \code{x^2}), and \code{x3}
#' (equal to \code{x^3}), the reference grid will by default set \code{x2} and
#' \code{x3} to their means, which is inconsistent. The user should instead
#' use the \code{at} argument to set these to the square and cube of
#' \code{mean(x)}. Better yet, fit the model using a formula involving
#' \code{poly(x, 3)} or \code{I(x^2)} and \code{I(x^3)}; then there is only
#' \code{x} appearing as a covariate; it will be set to its mean, and the
#' model matrix will have the correct corresponding quadratic and cubic terms.
#'
#' @section Matrix covariates: Support for covariates that appear in the dataset
#' as matrices is very limited. If the matrix has but one column, it is
#' treated like an ordinary covariate. Otherwise, with more than one column,
#' each column is reduced to a single reference value -- the result of
#' applying \code{cov.reduce} to each column (averaged together if that
#' produces more than one value); you may not specify values in \code{at}; and
#' they are not treated as variables in the reference grid, except for
#' purposes of obtaining predictions.
#'
#' @section Recovering or overriding model information: Ability to support a
#' particular class of \code{object} depends on the existence of
#' \code{recover_data} and \code{emm_basis} methods -- see
#' \link{extending-emmeans} for details. The call
#' \code{methods("recover_data")} will help identify these.
#'
#' \bold{Data.} In certain models, (e.g., results of
#' \code{\link[lme4]{glmer.nb}}), it is not possible to identify the original
#' dataset. In such cases, we can work around this by setting \code{data}
#' equal to the dataset used in fitting the model, or a suitable subset. Only
#' the complete cases in \code{data} are used, so it may be necessary to
#' exclude some unused variables. Using \code{data} can also help save
#' computing, especially when the dataset is large. In any case, \code{data}
#' must represent all factor levels used in fitting the model. It
#' \emph{cannot} be used as an alternative to \code{at}. (Note: If there is a
#' pattern of \code{NAs} that caused one or more factor levels to be excluded
#' when fitting the model, then \code{data} should also exclude those levels.)
#'
#' \bold{Covariance matrix.} By default, the variance-covariance matrix for
#' the fixed effects is obtained from \code{object}, usually via its
#' \code{\link{vcov}} method. However, the user may override this via a
#' \code{vcov.} argument, specifying a matrix or a function. If a matrix, it
#' must be square and of the same dimension and parameter order of the fixed
#' effects. If a function, must return a suitable matrix when it is called
#' with arguments \code{(object, ...)}. Be careful with possible
#' unintended conflicts with arguments in \code{...}; for example,
#' \code{sandwich::vcovHAC()} has optional arguments \code{adjust} and \code{weights}
#' that may be intended for \code{emmeans()} but will also be passed to \code{vcov.()}.
#'
#' \bold{Nested factors.} Having a nesting structure affects marginal
#' averaging in \code{emmeans} in that it is done separately for each level
#' (or combination thereof) of the grouping factors. \code{ref_grid} tries to
#' discern which factors are nested in other factors, but it is not always
#' obvious, and if it misses some, the user must specify this structure via
#' \code{nesting}; or later using \code{\link{update.emmGrid}}. The
#' \code{nesting} argument may be a character vector, a named \code{list},
#' or \code{NULL}.
#' If a \code{list}, each name should be the name of a single factor in the
#' grid, and its entry a character vector of the name(s) of its grouping
#' factor(s). \code{nested} may also be a character value of the form
#' \code{"factor1 \%in\% (factor2*factor3)"} (the parentheses are optional).
#' If there is more than one such specification, they may be appended
#' separated by commas, or as separate elements of a character vector. For
#' example, these specifications are equivalent: \code{nesting = list(state =
#' "country", city = c("state", "country")}, \code{nesting = "state \%in\%
#' country, city \%in\% (state*country)"}, and \code{nesting = c("state \%in\%
#' country", "city \%in\% state*country")}.
#'
#' @section Predictors with subscripts and data-set references: When the fitted
#' model contains subscripts or explicit references to data sets, the
#' reference grid may optionally be post-processed to simplify the variable
#' names, depending on the \code{simplify.names} option (see
#' \code{\link{emm_options}}), which by default is \code{TRUE}. For example,
#' if the model formula is \code{data1$resp ~ data1$trt + data2[[3]] +
#' data2[["cov"]]}, the simplified predictor names (for use, e.g., in the
#' \code{specs} for \code{\link{emmeans}}) will be \code{trt},
#' \code{data2[[3]]}, and \code{cov}. Numerical subscripts are not simplified;
#' nor are variables having simplified names that coincide, such as if
#' \code{data2$trt} were also in the model.
#'
#' Please note that this simplification is performed \emph{after} the
#' reference grid is constructed. Thus, non-simplified names must be used in
#' the \code{at} argument (e.g., \code{at = list(`data2["cov"]` = 2:4)}.
#'
#' If you don't want names simplified, use \code{emm_options(simplify.names =
#' FALSE)}.
#'
#'
#'
#' @section Prediction types and transformations:
#' Transformations can exist because of a link function in a generalized linear model,
#' or as a response transformation, or even both. In many cases, they are auto-detected,
#' for example a model formula of the form \code{sqrt(y) ~ ...}. Even transformations
#' containing multiplicative or additive constants, such as \code{2*sqrt(y + pi) ~ ...},
#' are auto-detected. A response transformation of \code{y + 1 ~ ...} is \emph{not}
#' auto-detected, but \code{I(y + 1) ~ ...} is interpreted as \code{identity(y + 1) ~ ...}.
#' A warning is issued if it gets too complicated.
#' Complex transformations like the Box-Cox transformation are not auto-detected; but see
#' the help page for \code{\link{make.tran}} for information on some advanced methods.
#'
#' There is a subtle difference
#' between specifying \samp{type = "response"} and \samp{regrid =
#' "response"}. While the summary statistics for the grid itself are the same,
#' subsequent use in \code{\link{emmeans}} will yield different results if
#' there is a response transformation or link function. With \samp{type =
#' "response"}, EMMs are computed by averaging together predictions on the
#' \emph{linear-predictor} scale and then back-transforming to the response
#' scale; while with \samp{regrid = "response"}, the predictions are
#' already on the response scale so that the EMMs will be the arithmetic means
#' of those response-scale predictions. To add further to the possibilities,
#' \emph{geometric} means of the response-scale predictions are obtainable via
#' \samp{regrid = "log", type = "response"}. See also the help page for
#' \code{\link{regrid}}.
#'
#' \emph{Order-of-processing issues:}
#' The \code{regrid} argument, if present, is acted on immediately after the reference
#' grid is constructed, while some of the \code{...} arguments may be used to
#' update the object at the very end. Thus, code like
#' \code{ref_grid(mod, tran = "sqrt", regrid = "response")} will not work correctly
#' if the intention was to specify the response transformation, because the re-grid
#' is done \emph{before} it processes \code{tran = "sqrt"}. To get the intended
#' result, do
#' \code{regrid(ref_grid(mod, tran = "sqrt"), transform = "response")}.
#'
#' @section Counterfactuals:
#' If \code{counterfactuals} is specified, the rows of the entire dataset
#' become a factor in the reference grid, and the other reference levels are
#' confined to those named in \code{counterfactuals}. In this type of analysis
#' (called G-computation), we substitute each combination of counterfactual
#' levels into the entire dataset. Thus, predictions from this grid are those
#' of each observation under each of the counterfactual levels. For this to
#' make sense, we require an assumption of exchangeability of these levels.
#'
#' By default, this grid is converted to the response scale (unless otherwise
#' specified in \code{regrid}) and averaged over the observations in the dataset.
#' Averaging can be disabled by setting \code{avg.counter = FALSE}, but
#' be warned that the resulting reference grid is potentially huge -- the
#' number of observations in the dataset times the number of counterfactual
#' combinations, times the number of multivariate levels.
#'
#' The counterfactuals code is still fairly rudimentary and we can't guarantee
#' it will always work, such as in cases of nested models. Sometimes, an error
#' can be averted by specifying \code{avg.counter = FALSE}.
#'
#' @section Optional side effect: If the \code{save.ref_grid} option is set to
#' \code{TRUE} (see \code{\link{emm_options}}),
#' The most recent result of \code{ref_grid}, whether
#' called directly or indirectly via \code{\link{emmeans}},
#' \code{\link{emtrends}}, or some other function that calls one of these, is
#' saved in the user's environment as \code{.Last.ref_grid}. This facilitates
#' checking what reference grid was used, or reusing the same reference grid
#' for further calculations. This automatic saving is disabled by default, but
#' may be enabled via \samp{emm_options(save.ref_grid = TRUE)}.
#'
#'
#'
#' @return An object of the S4 class \code{"emmGrid"} (see
#' \code{\link{emmGrid-class}}). These objects encapsulate everything needed
#' to do calculations and inferences for estimated marginal means, and contain
#' nothing that depends on the model-fitting procedure.
#'
#' @seealso Reference grids are of class \code{\link[=emmGrid-class]{emmGrid}},
#' and several methods exist for them -- for example
#' \code{\link{summary.emmGrid}}. Reference grids are fundamental to
#' \code{\link{emmeans}}. Supported models are detailed in
#' \href{../doc/models.html}{\code{vignette("models", "emmeans")}}.
#' See \code{\link{update.emmGrid}} for details of arguments that can be in
#' \code{options} (or in \code{...}).
#'
#' @note The system default for \code{cov.keep} causes models
#' containing indicator variables to be handled differently than in
#' \pkg{emmeans} version 1.4.1 or earlier. To replicate older
#' analyses, change the default via
#' \samp{emm_options(cov.keep = character(0))}.
#'
#' @note Some earlier versions of \pkg{emmeans} offer a \code{covnest} argument.
#' This is now obsolete; if \code{covnest} is specified, it is harmlessly
#' ignored. Cases where it was needed are now handled appropriately via the
#' code associated with \code{cov.keep}.
#'
#' @export
#'
#' @examples
#' fiber.lm <- lm(strength ~ machine*diameter, data = fiber)
#' ref_grid(fiber.lm)
#'
#' ref_grid(fiber.lm, at = list(diameter = c(15, 25)))
#'
#' \dontrun{
#' # We could substitute the sandwich estimator vcovHAC(fiber.lm)
#' # as follows:
#' summary(ref_grid(fiber.lm, vcov. = sandwich::vcovHAC))
#' }
#'
#' # If we thought that the machines affect the diameters
#' # (admittedly not plausible in this example), then we should use:
#' ref_grid(fiber.lm, cov.reduce = diameter ~ machine)
#'
#' ### Model with indicator variables as predictors:
#' mtcars.lm <- lm(mpg ~ disp + wt + vs * am, data = mtcars)
#' (rg.default <- ref_grid(mtcars.lm))
#' (rg.nokeep <- ref_grid(mtcars.lm, cov.keep = character(0)))
#' (rg.at <- ref_grid(mtcars.lm, at = list(vs = 0:1, am = 0:1)))
#'
#' # Two of these have the same grid but different weights:
#' rg.default@grid
#' rg.at@grid
#'
#' ### Using cov.reduce formulas...
#' # Above suggests we can vary disp indep. of other factors - unrealistic
#' rg.alt <- ref_grid(mtcars.lm, at = list(wt = c(2.5, 3, 3.5)),
#' cov.reduce = disp ~ vs * wt)
#' rg.alt@grid
#'
#' # Alternative to above where we model sqrt(disp)
#' disp.mod <- lm(sqrt(disp) ~ vs * wt, data = mtcars)
#' disp.fun <- function(dat)
#' list(disp = predict(disp.mod, newdata = dat)^2)
#' rg.alt2 <- ref_grid(mtcars.lm, at = list(wt = c(2.5, 3, 3.5)),
#' cov.reduce = external ~ disp.fun)
#' rg.alt2@grid
#'
#'
#' # Multivariate example
#' MOats.lm = lm(yield ~ Block + Variety, data = MOats)
#' ref_grid(MOats.lm, mult.names = "nitro")
#' # Silly illustration of how to use 'mult.levs' to make comb's of two factors
#' ref_grid(MOats.lm, mult.levs = list(T=LETTERS[1:2], U=letters[1:2]))
#'
#' # Comparing estimates with and without counterfactuals
#' neuralgia.glm <- glm(Pain ~ Treatment + Sex + Age + Duration,
#' family = binomial(), data = neuralgia)
#' emmeans(neuralgia.glm, "Treatment", type = "response")
#'
#' emmeans(neuralgia.glm, "Treatment", counterfactuals = "Treatment")
#'
#'
#' # Using 'params'
#' require("splines")
#' my.knots = c(2.5, 3, 3.5)
#' mod = lm(Sepal.Length ~ Species * ns(Sepal.Width, knots = my.knots), data = iris)
#' ## my.knots is not a predictor, so need to name it in 'params'
#' ref_grid(mod, params = "my.knots")
#'
ref_grid <- function(object, at, cov.reduce = mean, cov.keep = get_emm_option("cov.keep"),
mult.names, mult.levs,
options = get_emm_option("ref_grid"), data, df, type,
regrid, nesting, offset, sigma,
counterfactuals, wt.counter, avg.counter = TRUE,
nuisance = character(0), non.nuisance, wt.nuis = "equal",
rg.limit = get_emm_option("rg.limit"), ...)
{
# hack to ignore 'tran' in dots arguments and interpret 'transform' as `regrid` :
.foo = function(t,tr,tra,tran, transform = NULL, ...) transform
.bar = .foo(...)
if (!is.null(.bar)) {
regrid = .bar
message("In 'ref_grid()', use 'regrid = ...' rather than 'transform = ...' ",
"to avoid this message.")
}
if (!missing(df)) {
if(is.null(options)) options = list()
options$df = df
}
# recover the data
if (missing(data)) {
data = try(recover_data (object, data = NULL, ...))
if (inherits(data, "try-error"))
stop("Perhaps a 'data' or 'params' argument is needed")
}
# note if options$delts non-null, we called 2nd time from emtrends and data is ok as-is
else if (is.null(options$delts)) # attach needed attributes to given data
data = recover_data(object, data = as.data.frame(data), ...)
if(is.character(data)) # 'data' is in fact an error message
stop(data)
## undocumented hook to use ref_grid as slave to get data
if (!is.null(options$just.data))
return(data)
trms = attr(data, "terms")
# find out if any variables are coerced to factors or vice versa
coerced = .find.coerced(trms, data) # now list with members 'factors' and 'covariates'
# convenience function
sort.unique = function(x) sort(unique(x))
# Get threshold for max #levels to keep a covariate
if (is.null(cov.keep))
cov.keep = character(0)
cov.thresh = max(c(1, suppressWarnings(as.integer(cov.keep))), na.rm = TRUE)
if (is.logical(cov.reduce)) {
if (!cov.reduce)
cov.thresh = 99 # big enough!
cov.reduce = mean
}
# Ensure cov.reduce is a function or list thereof
dep.x = list() # list of formulas to fit later
fix.cr = function(cvr) {
if (inherits(cvr, "formula")) {
if (length(cvr) < 3)
stop("Formulas in 'cov.reduce' must be two-sided")
lhs = .all.vars(cvr)[1]
dep.x[[lhs]] <<- cvr
cvr = mean
}
else if (!inherits(cvr, c("function","list")))
stop("Invalid 'cov.reduce' argument")
cvr
}
# IMPORTANT: following stmts may also affect x.dep
if (is.list(cov.reduce))
cov.reduce = lapply(cov.reduce, fix.cr)
else
cov.reduce = fix.cr(cov.reduce)
# zap any formulas that are also in 'at'
if (!missing(at))
for (xnm in names(at)) dep.x[[xnm]] = NULL
# local cov.reduce function that works with function or named list
cr = function(x, nm) {
if (is.function(cov.reduce))
cov.reduce(x)
else if (hasName(cov.reduce, nm))
cov.reduce[[nm]](x)
else
mean(x)
}
# initialize empty lists
ref.levels = matlevs = xlev = chrlev = list()
for (nm in attr(data, "responses")) {
y = data[[nm]]
if (is.matrix(y))
matlevs[[nm]] = apply(y, 2, mean)
else
ref.levels[[nm]] = mean(y)
}
for (nm in attr(data, "predictors")) {
x = data[[nm]]
if (is.matrix(x) && ncol(x) == 1) # treat 1-column matrices as covariates
x = as.numeric(x)
# Save the original levels of factors, no matter what
if (is.factor(x) && !(nm %in% coerced$covariates))
xlev[[nm]] = levels(factor(x))
# (applying factor drops any unused levels)
else if (is.character(x)) # just like a factor
xlev[[nm]] = sort(unique(x))
# Now go thru and find reference levels...
# mentioned in 'at' list but not coerced factor
if (!(nm %in% coerced$factors) && !missing(at) && (hasName(at, nm)))
ref.levels[[nm]] = at[[nm]]
# factors not in 'at'
else if (is.factor(x) && !(nm %in% coerced$covariates))
ref.levels[[nm]] = levels(factor(x))
else if (is.character(x) || is.logical(x))
ref.levels[[nm]] = chrlev[[nm]] = sort.unique(x)
# matrices
else if (is.matrix(x)) {
# Matrices -- reduce columns thereof, but don't add to baselevs
matlevs[[nm]] = apply(x, 2, cr, nm)
# if cov.reduce returns a vector, average its columns
if (is.matrix(matlevs[[nm]]))
matlevs[[nm]] = apply(matlevs[[nm]], 2, mean)
}
# covariate coerced, or not mentioned in 'at'
else {
# single numeric pred but coerced to a factor - use unique values
# even if in 'at' list. We'll fix this up later
if (nm %in% coerced$factors)
ref.levels[[nm]] = sort.unique(x)
# Ordinary covariates - summarize based on cov.keep
else {
if ((length(uval <- sort.unique(x)) > cov.thresh) && !(nm %in% cov.keep))
ref.levels[[nm]] = cr(as.numeric(x), nm)
else {
ref.levels[[nm]] = uval
cov.keep = c(cov.keep, nm)
}
}
}
}
if (!missing(non.nuisance))
nuisance = setdiff(names(ref.levels), non.nuisance)
# Now create the reference grid
if(no.nuis <- (length(nuisance) == 0)) {
if (!missing(counterfactuals)) {
cfac = intersect(counterfactuals, names(ref.levels))
ref.levels = ref.levels[cfac]
ref.levels$.obs.no. = seq_len(nrow(data))
.check.grid(ref.levels, rg.limit)
grid = .setup.cf(ref.levels, data)
}
else {
.check.grid(ref.levels, rg.limit)
grid = do.call(expand.grid, ref.levels)
}
}
else {
nuis.info = .setup.nuis(nuisance, ref.levels, trms, rg.limit)
grid = nuis.info$grid
}
# undocumented hook to expand grid by increments of 'var' (needed by emtrends)
if (!is.null(delts <- options$delts)) {
var = options$var
n.orig = nrow(grid) # remember how many rows we had
grid = grid[rep(seq_len(n.orig), length(delts)), , drop = FALSE]
options$var = options$delts = NULL
# grid[[var]] = grid[[var]] + rep(delts, each = n.orig)
# (we have to wait until after covariate calcs to do this)
}
# add any matrices
for (nm in names(matlevs)) {
tmp = matrix(rep(matlevs[[nm]], each=nrow(grid)), nrow=nrow(grid))
dimnames(tmp) = list(NULL, names(matlevs[[nm]]))
grid[[nm]] = tmp
}
# resolve any covariate formulas
for (xnm in names(dep.x)) {
if ((xnm %in% c("ext", "extern", "external")) && !(xnm %in% names(grid))) { # use external fcn
fun = get(as.character(dep.x[[xnm]][[3]]), inherits = TRUE)
rslts = fun(grid) # should be some thing that supports names() and [[]], e.g., a list or d.f.
for (nm in intersect(names(rslts), names(grid))) {
grid[[nm]] = rslts[[nm]]
ref.levels[[nm]] = NULL
}
}
else if (!all(.all.vars(dep.x[[xnm]]) %in% names(grid)))
stop("Formulas in 'cov.reduce' must predict covariates actually in the model")
else { # Use lm() to predict this covariate
xmod = lm(dep.x[[xnm]], data = data)
grid[[xnm]] = predict(xmod, newdata = grid)
ref.levels[[xnm]] = NULL
}
}
# finish-up our hook for expanding the grid
if (!is.null(delts)) # add increments if any
grid[[var]] = grid[[var]] + rep(delts, each = n.orig)
if (!is.null(attr(data, "pass.it.on"))) # a hook needed by emm_basis.gamlss et al.
attr(object, "data") = data
###!! Prevent a warning like in https://stackoverflow.com/questions/68969384/emmeans-warning-in-model-frame-defaultformula-data-data-variable-gr/68990172#68990172
xl = xlev
modnm = rownames(attr(trms, "factors"))
chk = sapply(modnm, function(mn) mn %in% names(xl))
for(i in which(!chk)) { # replace names in xl - e.g., as.factor(trt) where trt already a factor
fn = all.vars(reformulate(modnm[i]))
if (length(fn) == 1)
names(xl)[names(xl) == fn] = modnm[i]
}
###!! If we remove this code, also need to change xl back to xlev in 'basis =' call below
# we've added args `misc` and `options` so emm_basis methods can access and use these if they want
basis = emm_basis(object, trms, xl, grid, misc = attr(data, "misc"), options = options, ...)
environment(basis$dffun) = baseenv() # releases unnecessary storage
if(length(basis$bhat) != ncol(basis$X))
stop("Something went wrong:\n",
" Non-conformable elements in reference grid.",
call. = TRUE)
collapse = NULL
if (!missing(counterfactuals)) {
grid = do.call(expand.grid, ref.levels)
if (missing(regrid))
regrid = "response"
if (avg.counter) collapse = ".obs.no."
}
if(!no.nuis) {
basis = .basis.nuis(basis, nuis.info, wt.nuis, ref.levels, data, grid, ref.levels)
grid = basis$grid
nuisance = ref.levels[nuis.info$nuis] # now nuisance has the levels info
ref.levels = basis$ref.levels
}
misc = basis$misc
### Figure out if there is a response transformation...
# next stmt assumes that model formula is 1st argument (2nd element) in call.
# if not, we probably get an error or something that isn't a formula
# and it is silently ignored
frm = try(formula(eval(attr(data, "call")[[2]])), silent = TRUE)
if (inherits(frm, "formula")) { # response may be transformed
lhs = if(length(frm) == 2) NULL
else frm[-3]
tran = setdiff(.all.vars(lhs, functions = TRUE), c(.all.vars(lhs), "~", "cbind", "+", "-", "*", "/", "^", "%%", "%/%"))
if(length(tran) > 0) {
# we are now supporting scale() as well as some functions from datawizard
if (tran[1] %in% c("scale", "center", "centre", "standardize", "standardise")) { # we'll try to set it up based on terms component
pv = try(attr(terms(object), "predvars"), silent = TRUE)
if (!inherits(pv, "try-error") && !is.null(pv)) {
scal = which(sapply(c(sapply(pv, as.character), "foo"),
function(x) x[1]) == tran[1])
if(length(scal) > 0) {
pv = pv[[scal[1]]]
ctr = ifelse(is.null(pv$center), 0, ifelse(pv$center, pv$center, 0))
scl = ifelse(is.null(pv$scale), 1, ifelse(pv$scale, pv$scale, 1))
tran = make.tran("scale", y = 0, center = ctr, scale = scl)
}
}
if (is.character(tran)) { # didn't manage to find params
tran = NULL
message("NOTE: Unable to recover scale() parameters. See '? make.tran'")
}
}
else if (tran[1] == "linkfun")
tran = as.list(environment(trms))[c("linkfun","linkinv","mu.eta","valideta","name")]
else {
if (tran[1] == "I")
tran = "identity"
tran = paste(tran, collapse = ".")
# length > 1: Almost certainly unsupported, but facilitates a more informative error message
const.warn = "There are unevaluated constants in the response formula\nAuto-detection of the response transformation may be incorrect"
# Look for a multiplier, e.g. 2*sqrt(y)
tst = strsplit(strsplit(as.character(lhs[2]), "\\(")[[1]][1], "\\*")[[1]]
if(length(tst) > 1) {
mul = try(eval(parse(text = tst[1])), silent = TRUE)
if(!inherits(mul, "try-error")) {
misc$tran.mult = mul
tran = gsub("\\*\\.", "", tran)
}
else
warning(const.warn)
}
# look for added constant, e.g. log(y + 1)
tst = strsplit(as.character(lhs[2]), "\\(|\\)|\\+")[[1]]
if (length(tst) > 2) {
const = try(eval(parse(text = tst[3])), silent = TRUE)
if (!inherits(const, "try-error") && (length(tst) == 3))
misc$tran.offset = const
else
warning(const.warn)
}
}
if(is.null(misc[["tran"]]))
misc$tran = tran
else
misc$tran2 = tran
misc$inv.lbl = "response"
}
}
# Take care of multivariate response
multresp = character(0) ### ??? was list()
ylevs = misc$ylevs
if(!is.null(ylevs)) { # have a multivariate situation
if (missing(mult.levs))
mult.levs = ylevs
if(!missing(mult.names)) {
k = seq_len(min(length(ylevs), length(mult.names)))
names(mult.levs)[k] = mult.names[k]
}
if (length(ylevs) > 1)
ylevs = list(seq_len(prod(sapply(mult.levs, length))))
k = prod(sapply(mult.levs, length))
if (k != length(ylevs[[1]]))
stop("supplied 'mult.levs' is of different length ",
"than that of multivariate response")
for (nm in names(mult.levs))
ref.levels[[nm]] = mult.levs[[nm]]
multresp = names(mult.levs)
MF = do.call("expand.grid", mult.levs)
grid = merge(grid, MF)
}
# add any matrices
for (nm in names(matlevs))
grid[[nm]] = matrix(rep(matlevs[[nm]],
each=nrow(grid)), nrow=nrow(grid))
# Here's a complication. If a numeric predictor was coerced to a factor, we had to
# include all its levels in the reference grid, even if altered in 'at'
# Moreover, whatever levels are in 'at' must be a subset of the unique values
# So we now need to subset the rows of the grid and linfct based on 'at'
problems = if (!missing(at))
intersect(c(multresp, coerced$factors), names(at))
else character(0)
if (length(problems) > 0) {
incl.flags = rep(TRUE, nrow(grid))
for (nm in problems) {
if (is.numeric(ref.levels[[nm]])) {
dig = 3 - log10(max(abs(ref.levels[[nm]])))
at[[nm]] = round(at[[nm]], digits = dig)
ref.levels[[nm]] = round(ref.levels[[nm]], digits = dig)
grid[[nm]] = round(grid[[nm]], digits = dig)
}
# get only "legal" levels
at[[nm]] = ref.levels[[nm]] = at[[nm]][at[[nm]] %in% ref.levels[[nm]]]
rows = numeric(0)
for(x in at[[nm]])
rows = c(rows, which(grid[[nm]] == x))
grid = grid[rows, , drop = FALSE]
grid[[nm]] = factor(grid[[nm]], levels = at[[nm]])
basis$X = basis$X[rows, , drop = FALSE]
}
}
# Any offsets??? (misc$offset.mult might specify removing or reversing the offset)
om = ifelse(is.null(misc$offset.mult), 1, misc$offset.mult)
oval = 0
if (!missing(offset)) { # For safety, we always treat it as scalar
if (offset[1] != 0)
oval = offset[1]
}
else {
if(".static.offset." %in% names(grid)) { # static offset is available
oval = om * grid[[".static.offset."]]
# grid$.static.offset. = NULL
}
#else
if(!is.null(attr(trms,"offset"))) {
if (any(om != 0))
oval = om * (oval + .get.offset(trms, grid))
}
if(any(oval != 0))
grid[[".offset."]] = oval
}
### --- Determine weights for each grid point
if (!hasName(data, "(weights)"))
data[["(weights)"]] = 1
cov.keep = intersect(unique(cov.keep), names(ref.levels))
nms = union(union(union(names(xlev), names(chrlev)), coerced$factors), cov.keep)
nms = intersect(nms, names(grid))
#### Old code...
# if (!covnest)
# nms = union(union(names(xlev), names(chrlev)), coerced$factors) # only factors, no covariates or mult.resp
# else
# nms = setdiff(names(ref.levels)[sapply(ref.levels, length) > 1], multresp) # all names (except multiv) for which there is > 1 level
if (length(nms) == 0)
wgt = rep(1, nrow(grid)) # all covariates; give each weight 1
else {
id = .my.id(data[, nms, drop = FALSE])
uid = !duplicated(id)
key = do.call(paste, unname(data[uid, nms, drop = FALSE]))
key = key[order(id[uid])]
tgt = do.call(paste, unname(grid[, nms, drop = FALSE]))
wgt = rep(0, nrow(grid))
for (i in seq_along(key))
wgt[tgt == key[i]] = sum(data[["(weights)"]][id==i])
}
grid[[".wgt."]] = wgt
model.info = list(call = attr(data,"call"), terms = trms, xlev = xlev)
if (!is.null(mm <- basis$model.matrix)) { # submodel support
attr(mm, "factors") = .smpFT(trms)
model.info$model.matrix = mm
}
# Detect any nesting structures
nst = .find_nests(grid, trms, coerced$orig, ref.levels)
if (length(nst) > 0)
model.info$nesting = nst
misc$is.new.rg = TRUE
misc$ylevs = NULL # No longer needed
misc$estName = "prediction"
misc$estType = "prediction"
misc$infer = c(FALSE,FALSE)
misc$level = .95
misc$adjust = "none"
misc$famSize = nrow(grid)
if(is.null(misc$avgd.over))
misc$avgd.over = character(0)
# emm_basis may have provided a sigma. If not, use sigma value from model or args
if(is.null(misc$sigma) && missing(sigma)) { # Get 'sigma(object)' if available, else NULL
sigma = suppressWarnings(try(stats::sigma(object), silent = TRUE))
if (inherits(sigma, "try-error"))
sigma = NULL
misc$sigma = sigma
}
# ... but override it only if sigma != NA
if(is.null(misc$sigma) || (length(misc$sigma) == 0) || !is.na(misc$sigma[1]))
misc$sigma = sigma
post.beta = basis$post.beta
if (is.null(post.beta))
post.beta = matrix(NA)
predictors = intersect(attr(data, "predictors"), names(grid))
if(!missing(counterfactuals)) predictors = c(predictors, ".obs.no.")
simp.tbl = environment(trms)$.simplify.names.
if (! is.null(simp.tbl)) {
names(grid) = .simplify.names(names(grid), simp.tbl)
predictors = .simplify.names(predictors, simp.tbl)
names(ref.levels) = .simplify.names(names(ref.levels), simp.tbl)
if (!is.null(post.beta)) names(post.beta) = .simplify.names(names(post.beta), simp.tbl)
if (!is.null(model.info$nesting)) {
model.info$nesting = lapply(model.info$nesting, .simplify.names, simp.tbl)
names(model.info$nesting) = .simplify.names(names(model.info$nesting), simp.tbl)
}
environment(trms)$.simplify.names. = NULL
}
result = new("emmGrid",
model.info = model.info,
roles = list(predictors = predictors,
responses = attr(data, "responses"),
multresp = multresp,
nuisance = nuisance),
grid = grid, levels = ref.levels, matlevs = matlevs,
linfct = basis$X, bhat = basis$bhat, nbasis = basis$nbasis, V = basis$V,
dffun = basis$dffun, dfargs = basis$dfargs,
misc = misc, post.beta = post.beta)
if (!missing(type)) {
if (is.null(options)) options = list()
options$predict.type = type
}
if (!missing(nesting)) {
result@model.info$nesting = lst = .parse_nest(nesting)
if(!is.null(lst)) {
nms = union(names(lst), unlist(lst))
if(!all(nms %in% names(result@grid)))
stop("Nonexistent variables specified in 'nesting'")
result@misc$display = .find.nonempty.nests(result, nms)
}
}
else if (!is.null(nst <- result@model.info$nesting)) {
result@misc$display = .find.nonempty.nests(result)
if (get_emm_option("msg.nesting"))
message("NOTE: A nesting structure was detected in the ",
"fitted model:\n ", .fmt.nest(nst))
}
result = .update.options(result, options, ...)
if(!is.null(hook <- misc$postGridHook)) {
if (is.character(hook))
hook = get(hook)
result@misc$postGridHook = NULL
result = hook(result, ...)
}
if(!missing(regrid)) {
if(missing(wt.counter)) wt.counter = 1
result = regrid(result, transform = regrid, sigma = sigma,
.collapse = collapse, wt.counter = wt.counter, ...)
if(!is.null(collapse))
result@misc$avgd.over = collapse
}
.save.ref_grid(result)
result
}
#### End of ref_grid ------------------------------------------
# local utility to save each newly constructed ref_grid, if enabled
# Goes into global environment unless .Last.ref_grid is found further up
.save.ref_grid = function(object) {
if (is.logical(isnewrg <- object@misc$is.new.rg))
if(isnewrg && get_emm_option("save.ref_grid"))
assign(".Last.ref_grid", object, inherits = TRUE)
}
# This function figures out which covariates in a model
# have been coerced to factors. And also which factors have been coerced
# to be covariates
.find.coerced = function(trms, data) {
if (ncol(data) == 0)
return(list(factors = integer(0), covariates = integer(0)))
isfac = sapply(data, function(x) inherits(x, "factor"))
# Character vectors of factors and covariates in the data...
facs.d = names(data)[isfac]
covs.d = names(data)[!isfac]
lbls = attr(trms, "term.labels")
M = model.frame(trms, utils::head(data, 2)) #### just need a couple rows
isfac = sapply(M, function(x) inherits(x, "factor"))
# Character vector of terms in the model frame that are factors ...
facs.m = names(M)[as.logical(isfac)]
covs.m = setdiff(names(M), facs.m)
# Exclude the terms that are already factors
# What's left will be things like "factor(dose)", "interact(dose,treat)", etc
# we're saving these in orig
orig = cfac = setdiff(facs.m, facs.d)
if(length(cfac) != 0) {
cvars = lapply(cfac, function(x) .all.vars(stats::reformulate(x))) # Strip off the function calls
cfac = intersect(unique(unlist(cvars)), covs.d) # Exclude any variables that are already factors
}
# Do same with covariates
ccov = setdiff(covs.m, covs.d)
orig = c(orig, ccov)
if(length(ccov) > 0) {
cvars = lapply(ccov, function(x) .all.vars(stats::reformulate(x)))
ccov = intersect(unique(unlist(cvars)), facs.d)
}
list(factors = cfac, covariates = ccov, orig = orig)
}
# My replacement for plyr::id(, drop = TRUE)
.my.id = function(data){
p = do.call(paste, data)
u = unique(p)
match(p, u)
}
# Utility to error-out when potential reference grid size is too big
.check.grid = function(levs, limit = get_emm_option("rg.limit")) {
size = prod(sapply(levs, length))
if (size > limit)
stop("The rows of your requested reference grid would be ", size, ", which exceeds\n",
"the limit of ", limit, " (not including any multivariate responses).\n",
"Your options are:\n",
" 1. Specify some (or more) nuisance factors using the 'nuisance' argument\n",
" (see ?ref_grid). These must be factors that do not interact with others.\n",
" 2. Add the argument 'rg.limit = <new limit>' to the call. Be careful,\n",
" because this could cause excessive memory use and performance issues.\n",
" Or, change the default via 'emm_options(rg.limit = <new limit>)'.\n",
call. = FALSE)
}
# Utility to set up the grid for nuisance factors. This consists of two or more
# data.frames rbinded together:
# * the expanded grid for all factors *not* in nuis, with the nuis factors set
# at their first levels
# * for each factor in f nuis, a set of rows for (levs$f), with the other
# factors at their first levels (this is arbitrary)
# In addition, we return a character vector 'row.assign' corresponding to the rows
# in the grid, telling which is what: ".main.grid." for the first part of the grid,
# otherwise factor names from nuis.
# We also return 'nuis' itself - which may be reduced since illegal
# entries are silently removed
.setup.nuis = function(nuis, levs, trms, rg.limit) {
firsts = args = lapply(levs, function(x) x[1])
nuis = intersect(nuis, names(levs))
# sanity checks on terms, and term indexes
fsum = rep(99, length(nuis))
tbl = attr(trms, "factors")
rn = row.names(tbl) = sapply(row.names(tbl), function(nm)
paste(all.vars(reformulate(nm)), collapse = ","))
for (i in seq_along(nuis)) {
f = nuis[i]
if(f %in% rn)
fsum[i] = sum(tbl[f, ])
}
nuis = nuis[fsum == 1] # silently remove any unfound or interacting factors
# top part...
non.nuis = setdiff(names(levs), nuis)
for (n in non.nuis)
args[[n]] = levs[[n]]
.check.grid(args, rg.limit)
grid = do.call(expand.grid, args)
ra = rep(".main.grid.", nrow(grid))
# bottom parts
for (f in nuis) {
args = firsts
args[[f]] = levs[[f]]
grid = rbind(grid, do.call(expand.grid, args))
ra = c(ra, rep(f, length(levs[[f]])))
}
list(grid = grid, row.assign = ra, nuis = nuis)
}
# Do the required post-processing for nuisance factors...
# After we get the model matrix for this grid, we'll average each set of rows in the
# bottom part, and substitute those averages in the required columns in the top part
# of the model matrix.
.basis.nuis = function(basis, info, wt, levs, data, grid, ref.levels) {
ra = info$row.assign
r. = rep(".", length(ra)) # fillers
X = basis$X
n = sum(ra == ".main.grid.")
k = nrow(X) / length(ra) # multivariate dimension
nuis = info$nuis
wts = lapply(nuis, function(f) {
if (wt == "equal")
w = rep(1, length(levs[[f]]))
else {
x = data[[f]]
w = sapply(levs[[f]], function(lev) sum(x == lev))
}
w / sum(w)
})
names(wts) = nuis
# In a multivariate case, we have to repeat the same operations for each block of X rows
for (m in 1:k) {
RA = c(rep(r., m - 1), ra, rep(r., k - m))
for (f in nuis) {
subX = X[RA == f, , drop = FALSE]
cols = which(apply(subX, 2, function(x) diff(range(x)) > 0))
subX = sweep(subX[, cols, drop = FALSE], 1, wts[[f]], "*")
avg = apply(subX, 2, sum)
avg = matrix(rep(avg, each = n), nrow = n) # now several copies
X[RA == ".main.grid.", cols] = avg
}
}
basis$misc$nuis = nuis
basis$misc$avgd.over = paste(length(nuis), "nuisance factors")
RA = rep(ra, k)
basis$X = X[RA == ".main.grid.", , drop = FALSE]
basis$grid = grid[RA == ".main.grid.", , drop = FALSE]
non.nuis = setdiff(names(ref.levels), info$nuis)
basis$ref.levels = ref.levels[non.nuis]
basis
}
# Set up grid for counterfactuals - i.e., copies of whole dataset with
# counterfactual levels substituted, with obs index varying the slowest
.setup.cf = function(levs, data) {
lv = arg = levs[-length(levs)] # omit .obs.no.
arg$stringsAsFactors = FALSE
g = do.call(expand.grid, arg)
idx = rep(seq_len(nrow(data)), each = nrow(g))
xdata = data[idx, , drop = FALSE]
idx = matrix(seq_len(nrow(xdata)), nrow = nrow(g))
for (i in seq_along(g[, 1]))
for (j in names(lv))
xdata[idx[i, ], j] = g[i, j]
xdata
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.