R/est_change_approx.R
In semfindr: Influential Cases in Structural Equation Modeling

Documented in est_change_approx

#' @title Standardized Case Influence on Parameter Estimates (Approximate)
#'
#' @description Gets a [lavaan::lavaan()] output and computes the
#' approximate standardized changes in selected parameters for each case
#' if included.
#'
#' @details For each case, [est_change_approx()] computes the
#' approximate differences in the estimates of selected parameters
#' with and without this case:
#'
#' (Estimate with all case) - (Estimate without this case)
#'
#' The differences are standardized by dividing
#' the approximate raw differences by their standard errors. This is a
#' measure of the standardized influence of a case on the parameter estimates
#' if it is included.
#'
#' If the value of a case is positive, including the case increases an estimate.
#'
#' If the value of a case is negative, including the case decreases an estimate.
#'
#' The model is not refitted. Therefore, the result is only an
#' approximation of that of [est_change()]. However, this
#' approximation is useful for identifying potentially influential
#' cases when the sample size is very large or the model takes a long
#' time to fit. This function can be used to identify potentially
#' influential cases quickly and then select them to conduct the
#' leave-one-out sensitivity analysis using [lavaan_rerun()] and
#' [est_change()].
#'
#' This function also computes the approximate generalized Cook's
#' distance (gCD). To avoid confusion, it is labelled `gcd_approx`.
#'
#' For the technical details, please refer to the vignette
#' on this approach: \code{vignette("casewise_scores", package = "semfindr")}
#'
#' The approximate approach supports a model with
#' equality constraints (available in 0.1.4.8 and later version).
#'
#' Supports both single-group and multiple-group models.
#' (Support for multiple-group models available in 0.1.4.8 and later version).
#'
#' @param fit The output from [lavaan::lavaan()] or its wrappers (e.g.,
#' [lavaan::cfa()] and [lavaan::sem()]).
#'
#' @param parameters A character vector to specify the selected
#' parameters. Each parameter is named as in `lavaan` syntax, e.g.,
#' `x ~ y` or `x ~~ y`, as appeared in the columns `lhs`, `op`, and `rhs`
#' in the output of [lavaan::parameterEstimates()].
#' Supports specifying an operator to select all parameters with these
#' operators: `~`, `~~`, `=~`, and `~1`. This vector can contain
#' both parameter names and operators. More details can be found
#' in the help of [pars_id()].
#' If omitted or `NULL`, the
#' default, changes on all free parameters will be computed.
#'
#' @param case_id If it is a character vector of length equals to the
#' number of cases (the number of rows in the data in `fit`), then it
#' is the vector of case identification values. If it is `NULL`, the
#' default, then `case.idx` used by `lavaan` functions will be used
#' as case identification values.
#'
#' @param allow_inadmissible If `TRUE`, accepts a fit object with
#' inadmissible results (i.e., `post.check` from
#' [lavaan::lavInspect()] is `FALSE`). Default is `FALSE`.
#'
#' @param skip_all_checks If `TRUE`, skips all checks and allows
#' users to run this function on any object of the `lavaan` class.
#' For users to experiment this and other functions on models
#' not officially supported. Default is `FALSE`.
#'
#' @return An `est_change`-class object, which is
#' matrix with the number of columns equals to the number of
#' requested parameters plus one, the last column being the
#' approximate generalized Cook's
#' distance. The number of rows equal to the number
#' of cases. The row names are the case identification values used in
#' [lavaan_rerun()]. The elements are approximate standardized
#' differences.
#' A print method is available for user-friendly output.
#'
#' @author Idea by Mark Hok Chio Lai <https://orcid.org/0000-0002-9196-7406>,
#' implemented by Shu Fai Cheung <https://orcid.org/0000-0002-9871-9448>.
#'
#'
#' @examples
#' library(lavaan)
#'
#' # A path model
#'
#' dat <- pa_dat
#' mod <-
#' "
#' m1 ~ a1 * iv1 + a2 * iv2
#' dv ~ b * m1
#' a1b := a1 * b
#' a2b := a2 * b
#' "
#' # Fit the model
#' fit <- lavaan::sem(mod, dat)
#' summary(fit)
#'
#' # Approximate standardized changes and gCD
#' out_approx <- est_change_approx(fit)
#' head(out_approx)
#'
#' # Fit the model several times. Each time with one case removed.
#' # For illustration, do this only for the first 10 cases.
#' fit_rerun <- lavaan_rerun(fit, parallel = FALSE,
#'                           to_rerun = 1:10)
#' # Compute the changes in chisq if a case is removed
#' out <- est_change(fit_rerun)
#' head(out)
#'
#' # Compare the results
#' plot(out_approx[1:10, 1], out[, 1])
#' abline(a = 0, b = 1)
#' plot(out_approx[1:10, 2], out[, 2])
#' abline(a = 0, b = 1)
#' plot(out_approx[1:10, 3], out[, 3])
#' abline(a = 0, b = 1)
#' plot(out_approx[1:10, "gcd_approx"], out[, "gcd"])
#' abline(a = 0, b = 1)
#'
#' # A CFA model
#'
#' dat <- cfa_dat
#' mod <-
#' "
#' f1 =~  x1 + x2 + x3
#' f2 =~  x4 + x5 + x6
#' f1 ~~ f2
#' "
#' # Fit the model
#' fit <- lavaan::cfa(mod, dat)
#' summary(fit)
#'
#' # Approximate standardized changes and gCD
#' # Compute gCD only for free loadings
#' out_approx <- est_change_approx(fit,
#'                                 parameters = "=~")
#' head(out_approx)
#'
#' # A latent variable model
#'
#' dat <- sem_dat
#' mod <-
#' "
#' f1 =~  x1 + x2 + x3
#' f2 =~  x4 + x5 + x6
#' f3 =~  x7 + x8 + x9
#' f2 ~   a * f1
#' f3 ~   b * f2
#' ab := a * b
#' "
#' # Fit the model
#' fit <- lavaan::sem(mod, dat)
#' summary(fit)
#'
#' # Approximate standardized changes and gCD
#' # Compute gCD only for structural paths
#' out_approx <- est_change_approx(fit,
#'                                 parameters = "~")
#' head(out_approx)
#'
#'
#'
#' @export
#' @importMethodsFrom lavaan vcov

est_change_approx <- function(fit,
                       parameters = NULL,
                       case_id = NULL,
                       allow_inadmissible = FALSE,
                       skip_all_checks = FALSE
                       ) {
  if (missing(fit)) {
      stop("No lavaan output supplied.")
    }
  if (!inherits(fit, "lavaan")) {
      stop("The fit object is not a lavaan output.")
    }

  if (!skip_all_checks) {
    check_out <- approx_check(fit, print_messages = FALSE,
                              multiple_group = TRUE,
                              equality_constraint = TRUE)

    if (check_out != 0) {
        if ((check_out == -1) &&
            !(suppressWarnings(lavaan::lavInspect(fit, "post.check"))) &&
            allow_inadmissible) {
          } else {
            stop(attr(check_out, "info"))
          }
      }
    }

  ngroups <- lavaan::lavInspect(fit, "ngroups")
  if (ngroups > 1) {
      n_j <- sapply(lavaan::lavInspect(fit, "data"), nrow)
      n <- sum(n_j)
    } else {
      n <- nrow(lavaan::lavInspect(fit, "data"))
      n_j <- n
    }
  if (is.null(case_id)) {
      case_ids <- lavaan::lavInspect(fit, "case.idx",
                                     drop.list.single.group = FALSE)
      case_ids <- sort(unlist(case_ids, use.names = FALSE))
    } else {
      case_ids <- lavaan::lavInspect(fit, "case.idx",
                                    drop.list.single.group = FALSE)
      if (length(case_id) != n) {
          stop("The length of case_id is not equal to the number of cases.")
        } else {
          case_ids <- case_id
        }
    }
  est0 <- lavaan::parameterTable(fit)
  # Do not use user labels except for user-defined parameters
  # Ensure that plabels are not used as lavlabels
  tmp1 <- est0$plabel[est0$plabel != ""]
  tmp2 <- est0$label %in% tmp1
  tmp3 <- est0$label
  est0$label[tmp2] <- ""
  est0$label[est0$op != ":="] <- ""
  est0$lavlabel <- lavaan::lav_partable_labels(est0,
                                               type = "user")
  parameters_names <- est0[est0$free > 0, "lavlabel"]
  if (!is.null(parameters)) {
      parameters_selected <- pars_id(parameters,
                                    fit = fit,
                                    where = "coef")
    } else {
      parameters_selected <- seq_len(length(parameters_names))
    }
  param_idx <- parameters_selected
  x0 <- est_change_raw_approx(fit = fit,
                              parameters = parameters, case_id = case_id)
  vcov0 <- lavaan::vcov(fit)
  scores0 <- lavaan::lavScores(fit,
                              ignore.constraints = TRUE,
                              remove.duplicated = FALSE)
  s0 <- scores0[, param_idx, drop = FALSE]
  v0 <- vcov0[param_idx, param_idx, drop = FALSE]
  v1 <- diag(1 / sqrt(diag(v0)), nrow = nrow(v0), ncol = ncol(v0))
  out0 <- x0 %*% v1 * n / (n - 1)
  if (((ngroups > 1) || fit@Model@eq.constraints)) {
      v0_full <- full_rank(v0)
      v1_full <- v0_full$final
      p_kept <- seq_len(ncol(v0))
      if (length(v0_full$dropped) > 0) {
          p_kept <- p_kept[-v0_full$dropped]
        }
      x0 <- x0[, p_kept, drop = FALSE]
      gcd_approx <- rowSums(
          (x0 %*% solve(v1_full * n) * (n - 1)) * x0
        )
    } else {
      info0 <- lavaan::lavInspect(fit, what = "information")[param_idx, param_idx,
                                                            drop = FALSE]
      gcd_approx <- rowSums(
          (x0 %*% info0 * (n - 1)) * x0
        )
    }
  colnames(out0) <- parameters_names[parameters_selected]
  out <- cbind(out0, gcd_approx)
  colnames(out) <- c(parameters_names[parameters_selected], "gcd_approx")
  rownames(out) <- case_ids

  attr(out, "call") <- match.call()
  attr(out, "change_type") <- "standardized"
  attr(out, "method") <- "approx"
  attr(out, "standardized") <- FALSE

  class(out) <- c("est_change", class(out))

  out
}