R/amce.R
In cregg: Simple Conjoint Tidying, Analysis, and Visualization

Documented in amce amce_by_reference

#' @rdname amce
#' @title Tidy estimation of AMCEs
#' @description Estimate AMCEs for a conjoint analysis and return a tidy data frame of results
#' @param data A data frame containing variables specified in \code{formula}. All RHS variables should be factors; the base level for each will be used in estimation and its reported AMCE will be NA (for printing). Optionally, this can instead be an object of class \dQuote{survey.design} returned by \code{\link[survey]{svydesign}}.
#' @param formula A formula specifying an AMCE model to be estimated. All variables should be factors; all levels across features should be unique. Two-way constraints can be specified with an asterisk (*) between RHS features. The specific constrained level pairs within these features are then detected automatically. Higher-order constraints are not allowed.
#' @param variable An RHS formula containing a single factor variable from \code{formula}. This will be used by \code{amce_by_reference} to estimate AMCEs relative to each possible factor level as a reference category. If more than one RHS variables are specified, the first will be used.
#' @template id
#' @template weights
#' @template feature_order
#' @template feature_labels
#' @template level_order
#' @template alpha
#' @param \dots For \code{amce}: additional arguments to \code{\link[stats]{glm}} or \code{\link[survey]{svyglm}}, the latter being used if \code{weights} is non-NULL. For \code{amce_by_reference}: additional arguments passed to \code{amce}.
#' @return A data frame of class \dQuote{cj_amce}
#' @details \code{amce} provides estimates of AMCEs (or rather, average marginal effects for each feature level). Two-way constraints can be specified with an asterisk (*) between features. The specific constrained level pairs within these features are then detected automatically. The function can also be used for calculating average component interaction effects when combined with \code{interaction}, and for balance testing by specifying a covariate rather outcome on the left-hand side of \code{formula}. See examples.
#' 
#' \code{amce_by_reference} provides a tool for quick sensitivity analysis. AMCEs are defined relative to an arbitrary reference category (i.e., feature level). This function will loop over all feature levels (for a specified feature) to show how interpretation will be affected by choice of reference category. The resulting data frame will be a stacked result from \code{amce}, containing an additional \code{REFERENCE} column specifying which level of \code{variable} was used as the reference category. In unconstrained conjoint designs, only AMCEs for \code{variable} will vary by reference category; in constrained designs, AMCEs for any factor constrained by \code{variable} may also vary.
#' 
#' Users may desire to specify a \code{family} argument via \code{\dots}, which should be a \dQuote{family} object such as \code{gaussian}. Sensible alternatives are \code{binomial} (for binary outcomes) and quasibinomial (for weighted survey data). See \code{\link[stats]{family}} for details. In such cases, effects are always reported on the link (not outcome) scale.
#'
#' @examples
#' data("taxes")
#' # estimating AMCEs
#' amce(taxes, chose_plan ~ taxrate1 + taxrate2 + taxrate3 + 
#'      taxrate4 + taxrate5 + taxrate6 + taxrev, id = ~ ID)
#' 
#' \donttest{
#' data("immigration")
#' # estimating AMCEs with constraints
#' amce(immigration, ChosenImmigrant ~ Gender + ReasonForApplication * CountryOfOrigin,
#'      id = ~CaseID)
#'
#'# estimating average component interaction effects (AMCEs of feature combinations)
#' immigration$language_entry <- interaction(immigration$LanguageSkills, 
#'                                           immigration$PriorEntry, sep = "_")
#' amce(immigration,ChosenImmigrant ~ language_entry, id = ~CaseID)
#'
#' # balance testing example
#' plot(amce(immigration[!is.na(immigration$ethnocentrism),],
#'      ethnocentrism ~ Gender + Education + LanguageSkills, id = ~ CaseID))
#' 
#' # reference category sensitivity
#' x <- amce_by_reference(immigration, ChosenImmigrant ~ LanguageSkills + Education, 
#'        variable = ~ LanguageSkills, id = ~ CaseID)
#' # plot
#' plot(x)
#' }
#' @seealso \code{\link{amce_diffs}} \code{\link{mm}} \code{\link{plot.cj_amce}}
#' @import stats
#' @import survey
#' @importFrom sandwich vcovCL
#' @export
amce <- 
function(
  data,
  formula,
  id = ~ 0,
  weights = NULL,
  feature_order = NULL,
  feature_labels = NULL,
  level_order = c("ascending", "descending"),
  alpha = 0.05,
  ...
) {

    # get outcome variable
    outcome <- all.vars(stats::update(formula, . ~ 0))
    if (!length(outcome) || outcome == ".") {
        stop("'formula' is missing a left-hand outcome variable")
    }
    
    # get RHS variables, variable labels, and factor levels
    RHS <- all.vars(stats::update(formula, 0 ~ . ))
    
    # detect constraints from 'formula'
    formula_terms <- terms(formula)
    if (any(attr(formula_terms, "order") >= 2)) {
        # constraints are present
        constrained_terms <- attr(formula_terms, "factors")[ , attr(formula_terms, "order") == 2, drop = FALSE]
        constrained_vars <- colnames(constrained_terms)
        constraints <- lapply(constrained_vars, function(tmp) {
            as.formula(paste0("~", tmp))
        })
        # vector of constrained variables
        constrained_vars <- unlist(lapply(constraints, all.vars))
        ## check that constraints do not include duplicated terms
        if (any(duplicated(constrained_vars))) {
            stop("All variables in constraints must be unique and constraints may only be two-way")
        }
        constrained_vars <- unique(constrained_vars)
        
        # vector of unconstrained variables
        unconstrained_vars <- RHS[!RHS %in% constrained_vars]
    } else {
        constraints <- NULL
        constrained_vars <- NULL
        unconstrained_vars <- RHS
    }
    
    # process feature_order argument
    feature_order <- check_feature_order(feature_order, RHS)
    
    # set level_order (within features) to ascending or descending
    level_order <- match.arg(level_order)
    
    # function to produce "fancy" feature labels
    feature_labels <- clean_feature_labels(data = data, RHS = RHS, feature_labels = feature_labels)
    
    # convert feature labels and levels to data frame
    term_labels_df <- make_term_labels_df(data, feature_order, level_order = level_order)
    
    # estimate model
    if (inherits(data, "data.frame") && is.null(weights)) {
        data <- cj_df(data)
        svydesign <- survey::svydesign(ids = id, weights = ~ 1, data = data)
        mod <- survey::svyglm(formula, design = svydesign, ...)
    } else if (inherits(data, "data.frame")) {
        data <- cj_df(data)
        svydesign <- survey::svydesign(ids = id, weights = weights, data = data)
        mod <- survey::svyglm(formula, design = svydesign, ...)
    } else if (inherits(data, "survey.design")) {
        svydesign <- data
        data <- cj_df(data[["variables"]])
        mod <- survey::svyglm(formula, design = svydesign, ...)
    } else {
        stop("'data' is not a 'data.frame' or 'survey.design' object")
    }
    
    # empty AMCEs (used in two places below)
    empty_amces <- data.frame(outcome = character(),
                              statistic = character(),
                              feature = character(),
                              level = character(),
                              estimate = numeric(),
                              std.error = numeric(),
                              z = numeric(),
                              p = numeric(),
                              lower = numeric(),
                              upper = numeric(),
                              check.names = FALSE,
                              stringsAsFactors = FALSE)
    
    # get coefficients as data frame (correct, if needed, for clustering)
    coef_summary <- get_coef_summary(mod = mod, data = data, id = NULL, alpha = alpha)
    
    # first estimate unconstrained terms
    if (length(unconstrained_vars)) {
        # extract 'unconstrained_vars'
        out <- coef_summary
        names(out) <- c("estimate", "std.error", "z", "p", "lower", "upper", "factor")
        
        # cleanup term names
        out <- cbind(clean_term_names(out[["factor"]], RHS), out[names(out) != "factor"])
        
        # fill in missing base levels with 0s
        out <- merge(out[!out$feature %in% c("", "(Intercept)"),],
                          term_labels_df, 
                          by = c("feature", "level"), all = TRUE)
        out[["estimate"]][is.na(out[["estimate"]])] <- 0
        out[["outcome"]] <- outcome
        out[["statistic"]] <- "amce"
        
        # subset to variables in 'unconstrained_vars'
        unconstrained_amces <- out[out[["feature"]] %in% unconstrained_vars, , drop = FALSE]
        
    } else {
        # if no 'unconstrained_vars', return empty data frame
        unconstrained_amces <- empty_amces
    }
    
    # then calculate constrained terms
    if (!is.null(constraints)) {
        ## for each constraint, we need to:
        ## > figure out what estimated coefficients go with each of the two terms (using `get_coef_metadata()` to identify terms and `get_coef_summary()` to get a clean model summary)
        ## > average effects of first constrained variable across levels of second variable
        ## > average effects of second constrained variable across levels of first variable
        ##     > if we were to generalize this to higher-order constraints, the code would simply have to be made more complex
        ## > `rbind()` the two sets of results back together
        constrained_amces <- lapply(constraints, function(one) {
            # identify variables
            variables_in_this_constraint <- setdiff(unlist(lapply(one, all.vars)), "~")
            var1 <- variables_in_this_constraint[1L]
            var2 <- variables_in_this_constraint[2L]
            
            # function to calculate AMCEs by averaging coefficients
            calculate_amce <- function(to_average, feature) {
                
                # subset to estimable coefficients
                to_average <- to_average[!is.na(to_average[["Estimate"]]), , drop = FALSE]
                # make sure estimates are sorted in correct order
                to_average <- to_average[order(to_average[["_order"]]),]
                
                # var-cov matrix of these estimates
                this_varcov <- vcov(mod)[to_average[["_coef"]], to_average[["_coef"]]]
                
                # calculate AMCE, giving uniform weight to features
                ## calculate weights (base term is weighted 1; other alters equally)
                wts <- c(1L, rep(1L/nrow(to_average), nrow(to_average) - 1L))
                est <- sum(to_average[["Estimate"]] * wts)
                ## variance is weights %*% vcov %*% weights
                variance <- (wts %*% this_varcov %*% wts)[1L,1L,drop = TRUE]
                
                # populate output
                averaged <- data.frame(outcome = outcome,
                                       statistic = "amce",
                                       feature = feature,
                                       level = to_average[to_average[["_order"]] == 1L, "_base_level", drop = TRUE],
                                       estimate = est,
                                       std.error = sqrt(variance),
                                       z = est/sqrt(variance),
                                       p = 2L*(stats::pnorm(-abs(est/sqrt(variance)))),
                                       check.names = FALSE,
                                       stringsAsFactors = FALSE
                                       )
                averaged[["lower"]] <- est - (stats::qnorm(1-alpha) * sqrt(variance))
                averaged[["upper"]] <- est + (stats::qnorm(1-alpha) * sqrt(variance))
                return(averaged)
            }
            
            terms_df <- get_coef_metadata(mod)
            # calculate MEs for first variable, constraining second
            ## subset terms_df to terms with 'var1' and/or 'var2'
            terms_df1 <- terms_df[terms_df[[var1]] | terms_df[[var2]], , drop = FALSE]
            # add term levels to 'term_df1'
            terms_df1 <- identify_term_levels(terms_df1, data, base_var = var1, by_var = var2)
            ## merge terms_df with `coef_summary`
            terms_df1 <- merge(coef_summary, terms_df1, by = "_coef")
            ## split merged object by 'var1' levels, excluding base level, calculating AMCE
            terms_df1 <- terms_df1[terms_df1[["_base_level"]] != levels(data[[var1]])[1L], , drop = FALSE]
            one_out1_list <- lapply(split(terms_df1, terms_df1[["_base_level"]], drop = TRUE), calculate_amce, feature = var1)
            one_out1 <- do.call("rbind", one_out1_list)
            
            # calculate MEs for second variable, constraining first
            ## subset terms_df to terms with 'var1' and/or 'var2'
            terms_df2 <- terms_df[terms_df[[var1]] | terms_df[[var2]], , drop = FALSE]
            # add term levels to 'term_df1'
            terms_df2 <- identify_term_levels(terms_df2, data, base_var = var2, by_var = var1)
            ## merge terms_df with `coef_summary`
            terms_df2 <- merge(coef_summary, terms_df2, by = "_coef")
            ## split merged object by 'var1' levels, excluding base level, calculating AMCE
            terms_df2 <- terms_df2[terms_df2[["_base_level"]] != levels(data[[var2]])[1L], , drop = FALSE]
            one_out2_list <- lapply(split(terms_df2, terms_df2[["_base_level"]], drop = TRUE), calculate_amce, feature = var2)
            one_out2 <- do.call("rbind", one_out2_list)
            
            # NOTE: IF WE HAD HIGHER-ORDER CONSTRAINTS, WE WOULD NEED ADDITIONAL STEPS HERE
            
            # combine both sets of AMCEs for this constraint
            one_out <- rbind(one_out1, one_out2)
            one_out[["BY"]] <- NULL
            
            # restore base levels of both variables
            one_out <- rbind(one_out,
                             data.frame(statistic = rep("amce", 2L),
                                        outcome = rep(outcome, 2L),
                                        feature = c(var1,var2),
                                        level = c(levels(data[[var1]])[1L], levels(data[[var2]])[1L]),
                                        estimate = c(0L, 0L),
                                        std.error = NA_real_,
                                        z = NA_real_,
                                        p = NA_real_,
                                        lower = NA_real_,
                                        upper = NA_real_))
            rownames(one_out) <- seq_len(nrow(one_out))
            return(one_out)
        })
        
        # rbind all constraint pairs together
        constrained_amces <- do.call("rbind", constrained_amces)
        
    } else {
        # if no 'constrained_vars', return empty data frame
        constrained_amces <- empty_amces
    }
    
    # update out with constrained AMCEs
    out <- rbind(unconstrained_amces, constrained_amces)
    
    # label features and levels
    out[["level"]] <- factor(out[["level"]], levels = term_labels_df[["level"]])
    out[["feature"]] <- factor(out[["feature"]],
                               levels = feature_order,
                               labels = feature_labels[feature_order])
    out <- out[c("outcome", "statistic", "feature", "level", "estimate", "std.error", "z", "p", "lower", "upper")]
    out <- out[order(out$level),]
    rownames(out) <- seq_len(nrow(out))
    
    # return
    return(structure(out, class = c("cj_amce", "data.frame")))
}

#' @rdname amce
#' @export
amce_by_reference <- function(data, formula, variable, ...) {
    
    # get outcome variable
    variable <- all.vars(stats::update(variable, 0 ~ .))[[1L]]
    
    # get levels
    if (!inherits(data[[variable]], "factor")) {
        stop("'variable' must be a factor variable")
    }
    levs <- levels(data[[variable]])
    
    # loop over levels
    out <- list()
    for (i in seq_along(levs)) {
        data[[variable]] <- relevel(data[[variable]], levs[i])
        out[[i]] <- amce(data, formula, ...)
        out[[i]][["REFERENCE"]] <- levs[i]
    }
    
    # return value
    ## stack
    out <- do.call("rbind", out)
    ## add reference category column
    out[["REFERENCE"]] <- factor(out[["REFERENCE"]], levels = levs)
    return(structure(out, class = c("cj_amce", "data.frame"), by = "REFERENCE"))
}