Custom loss functions for SEM with tensorsem

knitr::opts_chunk$set(
  collapse = TRUE,
  cache = TRUE,
  comment = "#>"
)
library(tensorsem)
# lavaan and torch functions are called directly below;
# attach them explicitly in case tensorsem does not attach them for you
library(lavaan)
library(torch)

With tensorsem we can estimate structural equation models using maximum likelihood (ML), but we can also go beyond standard multivariate normal ML to estimate the parameters. For example, we can create a robust form of SEM by using a multivariate t-distribution instead of a multivariate normal distribution for the observations. We can also perform unweighted least squares or weighted least squares. Lastly, we can put custom penalties on any parameter in the model.

The machinery that enables all these custom objective functions is automatic differentiation (through torch) and adaptive gradient-based optimization algorithms, such as optim_adam. In this vignette, we show how this can be done.
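
To make this mechanism concrete, here is a minimal, self-contained sketch of the pattern used throughout this vignette: define a loss with torch operations, let automatic differentiation compute the gradients, and let optim_adam update the parameters. The toy quadratic loss here is just an illustration, not part of tensorsem.

library(torch)

# a single parameter we want to optimize
x <- torch_tensor(5, requires_grad = TRUE)
opt <- optim_adam(list(x), lr = 0.1)

for (i in 1:100) {
  opt$zero_grad()      # reset the gradients from the previous iteration
  loss <- (x - 2)^2    # any differentiable loss built from torch operations
  loss$backward()      # automatic differentiation fills x$grad
  opt$step()           # adam update of x based on x$grad
}
x$item()               # close to the minimizer, 2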

Maximum likelihood estimation

Maximum likelihood estimation is the default in lavaan, so that's what we'll compare to.

# Create model syntax
syntax <- "
  # three-factor model
  visual  =~ x1 + x2 + x3
  textual =~ x4 + x5 + x6
  speed   =~ x7 + x8 + x9
"

# Fit lavaan model
fit_lavaan <- sem(
  model = syntax,
  data = HolzingerSwineford1939,
  std.lv = TRUE,
  information = "observed",
  fixed.x = FALSE
)

pt_lavaan <- partable(fit_lavaan)

To visually compare the different estimation methods, we create a custom plotting function:

param_plot <- function(...) {
  # combine the named parameter tables into one data frame,
  # labelling each row with the name of the method it came from
  ptli <- list(...)
  ptli <- lapply(names(ptli), function(n) {
    out <- ptli[[n]]
    out$method <- n
    return(out)
  })
  Reduce(rbind, ptli) |> 
    ggplot2::ggplot(ggplot2::aes(x = id, y = est, colour = forcats::as_factor(method),
               ymin = est - 1.96*se, ymax = est + 1.96*se)) + 
    ggplot2::geom_pointrange(position = ggplot2::position_dodge(width = .8)) +
    ggplot2::theme_minimal() +
    ggplot2::labs(x = "Parameter", y = "Value", colour = "Method")
}

param_plot(lavaan = pt_lavaan)

Now, we will estimate this same model using tensorsem:

# initialize the SEM model object
mod_ml <- torch_sem(syntax, dtype = torch_float64())

# create a data object as a torch tensor (mean-centered observed variables)
dat_torch <- torch_tensor(
  data = scale(HolzingerSwineford1939[, 7:15], scale = FALSE),
  requires_grad = FALSE,
  dtype = torch_float64()
)

# estimate the model using default settings
mod_ml$fit(dat = dat_torch, verbose = FALSE)

# re-compute the log-likelihood & create partable
ll <- mod_ml$loglik(dat_torch)
pt_ml <- mod_ml$partable(-ll)

# compare to lavaan
param_plot(
  lavaan = pt_lavaan,
  torch_ml = pt_ml
)

As we can see, the two methods yield practically the same parameter estimates and standard errors.
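
We can also inspect the numbers directly. A quick sketch, assuming both parameter tables are ordered identically (they are generated from the same model syntax):

# side-by-side comparison of the first few parameters
head(data.frame(
  par        = paste(pt_lavaan$lhs, pt_lavaan$op, pt_lavaan$rhs),
  est_lavaan = round(pt_lavaan$est, 3),
  est_torch  = round(pt_ml$est, 3),
  se_lavaan  = round(pt_lavaan$se, 3),
  se_torch   = round(pt_ml$se, 3)
), 10)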

Robust t-distributed SEM

Next, we'll change the loss function to the negative log-likelihood of a multivariate t-distribution with 2 degrees of freedom for the observations. Because the t-distribution has heavier tails than the normal distribution, this yields estimates that are more robust to outlying observations.
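
For reference, the log-density implemented below (for a zero-mean observation $x \in \mathbb{R}^p$, which suffices here because the data are centered) is that of the standard multivariate t-distribution:

$$\log p(x) = \log\Gamma\left(\frac{\nu + p}{2}\right) - \log\Gamma\left(\frac{\nu}{2}\right) - \frac{p}{2}\log(\nu\pi) - \frac{1}{2}\log\lvert\Sigma\rvert - \frac{\nu + p}{2}\log\left(1 + \frac{x^\top \Sigma^{-1} x}{\nu}\right)$$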

# Create a multivariate t log-likelihood distribution using torch operations
# inspiration: https://docs.pyro.ai/en/stable/_modules/pyro/distributions/multivariate_studentt.html
# note that that code is APACHE-2.0 licensed
mvt_loglik <- function(x, Sigma, nu = 2) {
  p <- x$shape[2]
  n <- x$shape[1]

  Schol <- linalg_cholesky(Sigma)

  # constant term
  C <- Schol$diag()$log()$sum() +  p/2*torch_log(nu) + 
    p/2*log(pi) + torch_lgamma(nu / 2) - 
    torch_lgamma((p + nu) / 2)

  # data-dependent term
  y <- x$t()$triangular_solve(Schol, upper = FALSE)[[1]]
  D <- torch_log1p(y$square()$sum(1) / nu)

  return(-0.5 * (nu + p) * D - C)
}
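
Before using this function, we could sanity-check it against an existing implementation. A minimal sketch, assuming the mvtnorm package is available (the covariance matrix and data below are arbitrary):

# compare per-observation log-densities to mvtnorm::dmvt on a small example
set.seed(45)
S_test <- diag(3) + 0.3
x_test <- matrix(rnorm(12), nrow = 4, ncol = 3)

ll_torch <- mvt_loglik(
  torch_tensor(x_test, dtype = torch_float64()),
  torch_tensor(S_test, dtype = torch_float64())
)
ll_ref <- mvtnorm::dmvt(x_test, sigma = S_test, df = 2, log = TRUE)

max(abs(as_array(ll_torch) - ll_ref))  # should be very close to zero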

Now, we will use the negative of this log-likelihood as the loss function and optimize it with optim_adam.

# initialize the SEM model object
mod_t <- torch_sem(syntax, dtype = torch_float64())

# initialize at the ml estimates for faster convergence
mod_t$load_state_dict(mod_ml$state_dict())

# initialize the optimizer
opt <- optim_adam(mod_t$parameters, lr = 0.01)

# start the training loop
iters <- 200L
loglik <- numeric(iters)
for (i in 1:iters) {
  opt$zero_grad()
  Sigma <- mod_t()
  loss <- -mvt_loglik(dat_torch, Sigma)$sum()
  loglik[i] <- -loss$item()
  loss$backward()
  opt$step()
}

Then, we can plot the optimization trajectory to see whether it has converged:

opt_plot <- function(losses) {
  ggplot2::ggplot(data.frame(x = 1:length(losses), y = losses), ggplot2::aes(x, y)) + 
    ggplot2::geom_line() +
    ggplot2::theme_minimal()
}
opt_plot(loglik) +
  ggplot2::labs(x = "Epochs", y = "Log-Likelihood", title = "ML estimation of multivariate T SEM")

Then, we can again compare the parameter estimates:

# re-compute the log-likelihood & create partable
ll <- mvt_loglik(dat_torch, mod_t())$sum()
pt_t <- mod_t$partable(-ll)

# compare to the multivariate normal ML estimates
param_plot(mvn = pt_ml, t = pt_t)

Unweighted and diagonally weighted least squares

By half-vectorizing the model-implied and observed covariance matrices, we can construct least-squares objective functions for estimating the SEM parameters.
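
As a reminder, half-vectorization (vech) stacks the lower-triangular elements of a symmetric matrix, including the diagonal, into a single vector. A plain-R sketch of what torch_vech computes, assuming the usual column-major vech convention:

# illustrative vech in base R: lower triangle (incl. diagonal), column-major
vech <- function(m) m[lower.tri(m, diag = TRUE)]

S_example <- matrix(c(4, 2, 1,
                      2, 3, 0,
                      1, 0, 5), nrow = 3, byrow = TRUE)
vech(S_example)
#> [1] 4 2 1 3 0 5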

uls_loss <- function(s, Sigma) {
  # unweighted least squares: sum of squared differences between the observed
  # (s) and model-implied (vech(Sigma)) covariance elements
  r <- s - torch_vech(Sigma)
  r$t()$mm(r)
}

Let's try it out!

# initialize the SEM model object
mod_uls <- torch_sem(syntax, dtype = torch_float64())

# initialize at the ml estimates for faster convergence
mod_uls$load_state_dict(mod_ml$state_dict())

# initialize the optimizer
opt <- optim_adam(mod_uls$parameters, lr = 0.01)

# compute s: the half-vectorized sample covariance matrix (ML version, divided by n = 301)
s <- torch_vech(dat_torch$t()$mm(dat_torch) / 301)

# start the training loop
iters <- 200L
losses <- numeric(iters)
for (i in 1:iters) {
  opt$zero_grad()
  Sigma <- mod_uls()
  loss <- uls_loss(s, Sigma)
  losses[i] <- loss$item()
  loss$backward()
  opt$step()
}

Then, we can plot the loss path; we will compare the parameter estimates after also fitting the DWLS model below.

opt_plot(losses) +
  ggplot2::labs(x = "Epochs", y = "ULS loss", title = "ULS estimation of SEM")

Similarly, we can perform diagonally weighted least squares (DWLS), weighting each residual by the inverse of the corresponding diagonal element of the asymptotic distribution-free (ADF) Gamma matrix.
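
In symbols, with $s_i$ the observed and $\sigma_i(\theta)$ the model-implied covariance elements, and $\Gamma_{ii}$ the diagonal elements of the ADF Gamma matrix:

$$F_{\text{DWLS}}(\theta) = \sum_{i} \frac{\left(s_i - \sigma_i(\theta)\right)^2}{\Gamma_{ii}}$$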

dwls_loss <- function(s, Sigma, w) {
  # diagonally weighted least squares: squared residuals weighted by w
  r <- s - torch_vech(Sigma)
  r$t()$mul(w)$mm(r)
}

# initialize the SEM model object
mod_dwls <- torch_sem(syntax, dtype = torch_float64())

# initialize at the ml estimates for faster convergence
mod_dwls$load_state_dict(mod_ml$state_dict())

# initialize the optimizer
opt <- optim_adam(mod_dwls$parameters, lr = 0.01)

# compute weights for DWLS, based on
# asymptotic distribution free Gamma matrix
Gamma_ADF <- lavaan:::lavGamma(fit_lavaan)[[1]]
w <- torch_tensor(1 / diag(Gamma_ADF), dtype = torch_float64(), requires_grad = FALSE)

# start the training loop
iters <- 200L
losses <- numeric(iters)
for (i in 1:iters) {
  opt$zero_grad()
  Sigma <- mod_dwls()
  loss <- dwls_loss(s, Sigma, w)
  losses[i] <- loss$item()
  loss$backward()
  opt$step()
}

Then, we can again plot the loss path and compare the parameter estimates:

opt_plot(losses) +
  ggplot2::labs(x = "Epochs", y = "DWLS loss", title = "DWLS estimation of SEM")

pt_uls <- mod_uls$partable()
pt_dwls <- mod_dwls$partable()

# compare the ML, ULS, and DWLS parameter estimates
param_plot(mvn = pt_ml, uls = pt_uls, dwls = pt_dwls)

Custom penalties

Using a penalty, we can adjust the maximum likelihood objective. For example, it is possible to specify an exploratory factor analysis (EFA) with L1 (LASSO) penalties on the factor loadings (the Lambda matrix), which shrinks small loadings towards zero.
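
Concretely, the loss minimized below is the multivariate normal negative log-likelihood, as computed by mvn_negloglik and scaled by the sample size $n = 301$, plus an L1 penalty on all entries of the loading matrix $\Lambda$, with penalty strength $\lambda$:

$$F(\theta) = \frac{1}{n}\,\mathrm{NLL}(\theta) + \lambda \sum_{j,k} \left\lvert\Lambda_{jk}\right\rvert$$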

# Create model syntax
syntax <- "
  # three-factor model
  visual + textual + speed =~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9

  # no correlations between the factors
  visual ~~ 0*textual
  visual ~~ 0*speed
  textual ~~ 0*speed
"

mod_efa <- torch_sem(syntax, dtype = torch_float64())
mod_efa$load_state_dict(mod_ml$state_dict())

# lambda is the penalty hyperparameter
lambda <- torch_tensor(0.15)

# initialize the optimizer
opt <- optim_adam(mod_efa$parameters, lr = 0.01)


# start the training loop
iters <- 200L
losses <- numeric(iters)
for (i in 1:iters) {
  opt$zero_grad()
  Sigma <- mod_efa()
  loss <- mvn_negloglik(dat_torch, Sigma) / 301 + lambda * mod_efa$Lam$abs()$sum()
  losses[i] <- loss$item()
  loss$backward()
  opt$step()
}

As always, we can plot the loss over training iterations:

opt_plot(losses) +
  ggplot2::labs(x = "Epochs", y = "Penalized SEM loss", title = "Penalized estimation of SEM")

Let's see whether the procedure allowed a cross-loading:

load <- as_array(mod_efa$Lam)
colnames(load) <- c("visual", "textual", "speed")
rownames(load) <- paste0("x", 1:9)
round(load, 2)

It allowed quite a strong cross-loading between the factor "speed" and item "x9", as well as between "textual" and "x1".


