In Gengrui-Zhang/R2spa: An R package for two-stage path analysis (2S-PA) to adjust for measurement errors

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(lavaan)
library(R2spa)
library(OpenMx)
library(umx)
library(mirt)

With a Linear Factor Model

The example is from https://lavaan.ugent.be/tutorial/sem.html.

# CFA
my_cfa <- "
  # latent variables
    ind60 =~ x1 + x2 + x3
    dem60 =~ y1 + y2 + y3 + y4
"
(fs_dat <- get_fs(PoliticalDemocracy, model = my_cfa, std.lv = TRUE)) |> head()

tspa_fit <- tspa(model = "dem60 ~ ind60", data = fs_dat,
                 fsT = attr(fs_dat, "fsT"),
                 fsL = attr(fs_dat, "fsL"))
parameterestimates(tspa_fit)

Using OpenMx

Create OpenMx model without latent variables. Here we also use the umx package.

fsreg_umx <- umxLav2RAM(
    "
    dem60 ~ ind60
    dem60 + ind60 ~ 1
    ",
    printTab = FALSE)

Create loading and error covariance matrices (need reordering)

# Loading
matL <- mxMatrix(
    type = "Full", nrow = 2, ncol = 2,
    free = FALSE,
    values = attr(fs_dat, "fsL")[2:1, 2:1],
    name = "L"
)
# Error
matE <- mxMatrix(
    type = "Symm", nrow = 2, ncol = 2,
    free = FALSE,
    values = attr(fs_dat, "fsT")[2:1, 2:1],
    name = "E"
)

Run 2S-PA

tspa_mx <- tspa_mx_model(fsreg_umx, data = fs_dat,
                         mat_ld = matL, mat_ev = matE,
                         fs_lv_names = c(ind60 = "fs_ind60",
                                         dem60 = "fs_dem60"))
# Run OpenMx
tspa_mx_fit <- mxRun(tspa_mx)
# Summarize the results (takes 20 seconds or so)
summary(tspa_mx_fit)
# Standardize coefficients
mxStandardizeRAMpaths(tspa_mx_fit, SE = TRUE)$m1[1, ]

# Alternative specification
tspa_mx <- tspa_mx_model(fsreg_umx, data = fs_dat,
                         mat_ld = attr(fs_dat, "fsL"),
                         mat_ev = attr(fs_dat, "fsT"))

With Mean Structure

my_cfa <- "
  # latent variables
    ind60 =~ x1 + x2 + x3
    dem60 =~ y1 + y2 + y3 + y4
    x1 + y1 ~ 0
    ind60 + dem60 ~ 1
"
fs_dat <- get_fs(PoliticalDemocracy, model = my_cfa,
                 meanstructure = TRUE)

lavaan

tspa_fit <- tspa(model = "dem60 ~ ind60\ndem60 + ind60 ~ 1", data = fs_dat,
                 fsT = attr(fs_dat, "fsT"),
                 fsL = attr(fs_dat, "fsL"),
                 fsb = attr(fs_dat, "fsb"))
parameterEstimates(tspa_fit)
standardizedSolution(tspa_fit)

OpenMx

# Force symmetric E
matE <- attr(fs_dat, "fsT")
matE <- (matE + t(matE)) / 2
matb <- t(as.matrix(attr(fs_dat, "fsb")))
tspa_mx <- tspa_mx_model(fsreg_umx, data = fs_dat,
                         mat_ld = attr(fs_dat, "fsL"),
                         mat_ev = matE,
                         mat_int = matb)
# Run OpenMx
tspa_mx_fit <- mxRun(tspa_mx)
# Summarize the results (takes 20 seconds or so)
summary(tspa_mx_fit)
# Standardize coefficients
mxStandardizeRAMpaths(tspa_mx_fit, SE = TRUE)$m1[1, ]

Compare to joint model with OpenMx

jreg_umx_fit <- umxRAM(
    "
    # latent variables
    ind60 =~ x1 + x2 + x3
    dem60 =~ y1 + y2 + y3 + y4
    # latent regression
    dem60 ~ ind60
    ",
    data = PoliticalDemocracy)
mxStandardizeRAMpaths(jreg_umx_fit, SE = TRUE)[4, ]

Combined with IRT

Example from Lai & Hsiao (2021, Psychological Methods)

Not accounting for error

# Simulate data with mirt
set.seed(1235)
num_obs <- 1000
# Simulate theta
eta <- MASS::mvrnorm(num_obs, mu = c(0, 0), Sigma = diag(c(1, 1 - 0.5^2)),
                     empirical = TRUE)
th1 <- eta[, 1]
th2 <- -1 + 0.5 * th1 + eta[, 2]
# items and response data
a1 <- matrix(1, 10)
d1 <- matrix(rnorm(10))
a2 <- matrix(runif(10, min = 0.5, max = 1.5))
d2 <- matrix(rnorm(10))
dat1 <- simdata(a = a1, d = d1, N = num_obs, itemtype = "2PL", Theta = th1)
dat2 <- simdata(a = a2, d = d2, N = num_obs, itemtype = "2PL", Theta = th2)
# Factor scores
mod1 <- mirt(dat1, model = 1, itemtype = "Rasch", verbose = FALSE)
mod2 <- mirt(dat2, model = 1, itemtype = "2PL", verbose = FALSE)
fs1 <- fscores(mod1, full.scores.SE = TRUE)
fs2 <- fscores(mod2, full.scores.SE = TRUE)
lm(fs2[, 1] ~ fs1[, 1])  # attenuated coefficient

Joint model

With WLS

dat <- cbind(dat1, dat2)
colnames(dat) <- paste0("i", 1:20)
wls_fit <- sem("
f1 =~ i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + i10
f2 =~ i11 + i12 + i13 + i14 + i15 + i16 + i17 + i18 + i19 + i20
f2 ~ f1
", data = dat, ordered = TRUE, std.lv = TRUE)
coef(wls_fit)["f2~f1"]

With Maximum Likelihood

Note: This is extremely computational intensive

Using `tspa_mx_model()`

Because EAP (shrinkage) scores are used, we use the following properties for these scores ($\tilde \eta_i$ for person $i$):

Standard error (SE, from software output) = $\sqrt{V(\eta) (1 - \rho_{\tilde \eta, i})}$, where $\rho_{\tilde \eta, i}$ is the reliability of $\tilde \eta_i$
Loading of $\tilde \eta_i$ on $\eta$ is equal to $\rho_{\tilde \eta, i}$ = $1 - \text{SE}^2 / V(\eta)$
Standard error of measurement for $\tilde \eta_i$ is equal to $\rho_{\tilde \eta, i} \text{SE}$
Based on the above, the total variance of $\tilde \eta_i$ is also $\rho_{\tilde \eta, i}$.

# Combine into data set
fs_dat <- cbind(fs1, fs2) |>
    as.data.frame() |>
    setNames(c("fs1", "se_fs1", "fs2", "se_fs2")) |>
    # Compute reliability and error variances
    within(expr = {
        rel_fs1 <- 1 - se_fs1^2
        rel_fs2 <- 1 - se_fs2^2
        ev_fs1 <- se_fs1^2 * (1 - se_fs1^2)
        ev_fs2 <- se_fs2^2 * (1 - se_fs2^2)
    })

# Loading
matL <- mxMatrix(
    type = "Diag", nrow = 2, ncol = 2,
    free = FALSE,
    labels = c("data.rel_fs2", "data.rel_fs1"),
    name = "L"
)
# Error
matE <- mxMatrix(
    type = "Diag", nrow = 2, ncol = 2,
    free = FALSE,
    labels = c("data.ev_fs2", "data.ev_fs1"),
    name = "E"
)

fsreg_umx <- umxLav2RAM(
    "
      fs2 ~ fs1
      fs2 + fs1 ~ 1
    ",
    printTab = FALSE)

tspa_mx <- tspa_mx_model(fsreg_umx, data = fs_dat,
                         mat_ld = matL, mat_ev = matE)
# Run OpenMx
tspa_mx_fit <- mxRun(tspa_mx)
# Summarize the results
summary(tspa_mx_fit)

Alternatively, one can use named matrices (mat_ld and mat_ev) to specify the names of the columns for the loading and error covariance matrices.

cross_load <- matrix(c("rel_fs2", NA, NA, "rel_fs1"), nrow = 2) |>
    `dimnames<-`(rep(list(c("fs2", "fs1")), 2))
err_cov <- matrix(c("ev_fs2", NA, NA, "ev_fs1"), nrow = 2) |>
    `dimnames<-`(rep(list(c("fs2", "fs1")), 2))
tspa_mx <- tspa_mx_model(fsreg_umx, data = fs_dat,
                         mat_ld = cross_load, mat_ev = err_cov)
# Run OpenMx
tspa_mx_fit <- mxRun(tspa_mx)
# Summarize the results
summary(tspa_mx_fit)

Compare to joint model with OpenMx

Note: FIML couldn't finish within 12 hours

jreg_umx_fit <- umxRAM(
    "
    f1 =~ a * i1 + a * i2 + a * i3 + a * i4 + a * i5 +
          a * i6 + a * i7 + a * i8 + a* i9 + a * i10
    f2 =~ i11 + i12 + i13 + i14 + i15 + i16 + i17 + i18 + i19 + i20
    f2 ~ f1
    i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + i10 ~ 0
    i11 + i12 + i13 + i14 + i15 + i16 + i17 + i18 + i19 + i20 ~ 0
    i1 ~~ 1 * i1
    i2 ~~ 1 * i2
    i3 ~~ 1 * i3
    i4 ~~ 1 * i4
    i5 ~~ 1 * i5
    i6 ~~ 1 * i6
    i7 ~~ 1 * i7
    i8 ~~ 1 * i8
    i9 ~~ 1 * i9
    i10 ~~ 1 * i10
    i11 ~~ 1 * i11
    i12 ~~ 1 * i12
    i13 ~~ 1 * i13
    i14 ~~ 1 * i14
    i15 ~~ 1 * i15
    i16 ~~ 1 * i16
    i17 ~~ 1 * i17
    i18 ~~ 1 * i18
    i19 ~~ 1 * i19
    i20 ~~ 1 * i20
    ",
    data = lapply(as.data.frame(dat), FUN = mxFactor,
                 levels = c(0, 1)) |>
               as.data.frame(),
    type = "DWLS")
jreg_std <- mxStandardizeRAMpaths(jreg_umx_fit, SE = TRUE)
jreg_std[jreg_std$label == "f1_to_f2", ]

Multidimensional Measurement Model

When a multidimensional model is used, for EAP scores, we use the following properties generalized from the unidimensional case. Specifically, let $V(\boldsymbol \eta)$ = $\boldsymbol{\psi}$.

Loading of $\tilde{\boldsymbol \eta}i$ on $\boldsymbol \eta$ is $\boldsymbol{\Lambda}{\tilde \eta, i}$, which is also the reliability matrix (i.e., the squared covariance between $\tilde{\boldsymbol \eta}_i$ and $\boldsymbol \eta$).
Error matrix ($\mathbf{E}$, from software output) = $(\mathbf{I} - \boldsymbol{\Lambda}{\tilde \eta, i})\boldsymbol{\psi}$. Therefore, $\boldsymbol{\Lambda}{\tilde \eta, i} = \mathbf{I} - \mathbf{E} \boldsymbol{\psi}^{-1}$.
Error variance of $\tilde \eta_i$ is $\boldsymbol{\Lambda}_{\tilde \eta, i} \mathbf{E}$.

dat <- cbind(dat1, dat2)
colnames(dat) <- paste0("Item_", 1:20)
# Multidimensional IRT
mod <- "
F1 = 1-10
F2 = 11-20
COV = F1*F2
CONSTRAIN = (1-10, a1)
"
mfit <- mirt(dat, model = mod, itemtype = "2PL", verbose = FALSE)

# Factor scores
fs <- fscores(mfit)
# Error portion of factor scores
fs_ecov <- fscores(mfit, return.acov = TRUE)
# Latent variable covariance
psi <- diag(2)
psi[1, 2] <- psi[2, 1] <- coef(mfit)$GroupPars[1, "COV_21"]
inv_psi <- solve(psi)
load <- lapply(fs_ecov, FUN = function(err) {
    diag(2) - err %*% inv_psi
})
fs_err <- Map(`%*%`, load, fs_ecov)
# Convert to data frame
load_dat <- lapply(load, FUN = c) |>
    do.call(what = rbind) |>
    as.data.frame() |>
    setNames(c("F1_by_fs1", "F1_by_fs2", "F2_by_fs1", "F2_by_fs2"))
fs_err_dat <- lapply(fs_err, FUN = `[`, c(1, 2, 4)) |>
    do.call(what = rbind) |>
    as.data.frame() |>
    setNames(c("ev_fs1", "ecov_fs1_fs2", "ev_fs2"))

# Combine into data set
fs_dat <- cbind(fs, load_dat, fs_err_dat)
# Build OpenMx model
fsreg_umx <- umxLav2RAM(
    "
      F2 ~ F1
      F2 + F1 ~ 1
    ",
    printTab = FALSE)
cross_load <- matrix(c("F1_by_fs1", "F1_by_fs2",
                       "F2_by_fs1", "F2_by_fs2"), nrow = 2) |>
    `dimnames<-`(rep(list(c("F1", "F2")), 2))
err_cov <- matrix(c("ev_fs1", "ecov_fs1_fs2",
                    "ecov_fs1_fs2", "ev_fs2"), nrow = 2) |>
    `dimnames<-`(rep(list(c("F1", "F2")), 2))
tspa_mx2 <- tspa_mx_model(fsreg_umx, data = fs_dat,
                          mat_ld = cross_load, mat_ev = err_cov)
# Run OpenMx
tspa_mx2_fit <- mxRun(tspa_mx2)
# Summarize the results
summary(tspa_mx2_fit)
# Standardize coefficients
mxStandardizeRAMpaths(tspa_mx2_fit, SE = TRUE)$m1[1, ]