knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
When computing the reliability or the standard errors of factor scores, we typically condition on the estimates of the model parameters, aka assuming that they are known. However, when sample size is small, ignoring those can be problematic as the model parameters are imprecise. This vignette provides details on how to obtain more realistic estimates of standard errors of factor scores.
Consider the conditional variance of the factor scores, $Var(\tilde \eta \mid \eta)$, with $\tilde \eta = \boldsymbol{a}^\top \boldsymbol{y}$. From the factor model, we have
$$Var(\tilde \eta \mid \eta) = Var(\boldsymbol{a}^\top \boldsymbol{\lambda} \eta + \boldsymbol{a}^\top \boldsymbol{\varepsilon} \mid \eta) = \eta^2 Var(\boldsymbol{a}^\top \boldsymbol{\lambda}) + Var(\boldsymbol{a}^\top \boldsymbol{\varepsilon}).$$
The second term is the error component. Here is a small simulation:
#' Load package and set seed library(lavaan) library(R2spa) set.seed(191254) #' Fixed conditions num_obs <- 100 lambda <- c(.3, .5, .7, .9) theta <- c(.5, .4, .5, .7) #' Simulate true score, and standardized eta <- rnorm(num_obs) eta <- (eta - mean(eta)) eta <- eta / sqrt(mean(eta^2)) #' Function for simulating y simy <- function(eta, lambda) { t(tcrossprod(lambda, eta) + rnorm(length(lambda) * length(eta), sd = sqrt(theta))) }
#' Simulation nsim <- 2500 tfsy_sim <- fsy_sim <- matrix(NA, nrow = num_obs, ncol = nsim) # Also save the scoring matrix a_sim <- matrix(NA, nrow = length(lambda), ncol = nsim) for (i in seq_len(nsim)) { y <- simy(eta = eta, lambda = lambda) tfsy_sim[, i] <- R2spa::compute_fscore( y, lambda = lambda, theta = diag(theta), psi = matrix(1) ) fsy <- R2spa::get_fs(y, std.lv = TRUE) fsy_sim[, i] <- fsy[, 1] a_sim[, i] <- attr(fsy, which = "scoring_matrix") }
saveRDS(list(tfsy_sim, fsy_sim, a_sim), file = "sim_correction-error.RDS")
tfsy_sim <- readRDS("sim_correction-error.RDS")[[1]] fsy_sim <- readRDS("sim_correction-error.RDS")[[2]] a_sim <- readRDS("sim_correction-error.RDS")[[3]]
#' Average conditional variance # a known apply(tfsy_sim, 1, var) |> mean() # compare to theoretical value true_a <- R2spa:::compute_a_reg(lambda, psi = matrix(1), theta = diag(theta)) true_a %*% diag(theta) %*% t(true_a) # a estimated apply(fsy_sim, 1, var) |> mean()
The standard errors are larger when using sample estimates of weights.
When $\boldsymbol{a}$ is known, the error variance is simply $\boldsymbol{a}^\top \boldsymbol{\Theta} \boldsymbol{a}$. However, when $\boldsymbol{a}$ is unknown and estimated as $\tilde{\boldsymbol{a}}$ from the data,
$$Var(\tilde{\boldsymbol{a}}^\top \boldsymbol{\varepsilon}) = E[Var(\tilde{\boldsymbol{a}}^\top \boldsymbol{\varepsilon} \mid \tilde{\boldsymbol{a}})] + Var[E(\tilde{\boldsymbol{a}}^\top \boldsymbol{\varepsilon} \mid \tilde{\boldsymbol{a}})].$$
Under the assumption of $E(\boldsymbol{\varepsilon})$ = $\boldsymbol{0}$, the second term is 0. The first term is an expectation of a quadratic form,
$$E(\tilde{\boldsymbol{a}}^\top \boldsymbol{\Theta} \tilde{\boldsymbol{a}}) = E(\tilde{\boldsymbol{a}}^\top) \boldsymbol{\Theta} E(\tilde{\boldsymbol{a}}) + \mathrm{tr}(\boldsymbol{\Theta} \boldsymbol{V}_{\tilde{\boldsymbol{a}}})$$
where $\boldsymbol{V}_{\tilde{\boldsymbol{a}}}$ is the covariance matrix of $\tilde{\boldsymbol{a}}$. We can see this using the simulated data
correct_fac <- sum(diag(diag(theta) %*% cov(t(a_sim)))) true_a %*% diag(theta) %*% t(true_a) + correct_fac
We can approximate $\boldsymbol{V}_{\tilde{\boldsymbol{a}}}$ using the delta method, and the other terms in the above using sample estimates
$$\tilde{\boldsymbol{a}}^\top \hat{\boldsymbol{\Theta}} \tilde{\boldsymbol{a}} + \mathrm{tr}(\hat{\boldsymbol{\Theta}} \hat{\boldsymbol{V}}_{\tilde{\boldsymbol{a}}}).$$
The delta method estimate, $\hat{\boldsymbol{V}}_{\tilde{\boldsymbol{a}}}$, is
$$\boldsymbol{J}{\tilde{\boldsymbol{a}}}(\boldsymbol{\theta})^\top \hat{\boldsymbol{V}}{\tilde{\boldsymbol{\theta}}} \boldsymbol{J}_{\tilde{\boldsymbol{a}}}(\boldsymbol{\theta}),$$
where $\boldsymbol{J}{\tilde{\boldsymbol{a}}}(\boldsymbol{\theta})$ is the Jacobian matrix, and $\hat{\boldsymbol{V}}{\tilde{\boldsymbol{\theta}}}$ is the sampling variance of the sample estimator of $\boldsymbol{\theta}$. The Jacobian can be approximated using finite difference with lavaan::lav_func_jacobian_complex()
.
y <- simy(eta = eta, lambda = lambda) cfa_fit <- cfa("f =~ y1 + y2 + y3 + y4", data = setNames(data.frame(y), paste0("y", 1:4)), std.lv = TRUE) # Jacobian R2spa:::compute_a(coef(cfa_fit), lavobj = cfa_fit) jac_a <- lavaan::lav_func_jacobian_complex( \(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]], coef(cfa_fit) ) sum(diag(lavInspect(cfa_fit, what = "est")$theta %*% jac_a %*% vcov(cfa_fit) %*% t(jac_a))) fsy <- R2spa::get_fs(y, std.lv = TRUE) attr(fsy, which = "fsT") # Using `get_fs(..., corrected_fsT = TRUE) fsy2 <- R2spa::get_fs(y, std.lv = TRUE, corrected_fsT = TRUE) attr(fsy2, which = "fsT")
#' Set seed set.seed(251329) #' Fixed conditions num_obs <- 100 lambda <- Matrix::bdiag(list(c(.3, .5, .7, .9), c(.7, .6, .7))) |> as.matrix() theta <- c(.5, .4, .5, .7, .7, .8, .5) psi <- matrix(c(1, -.4, -.4, 1), nrow = 2) #' Simulate true score (exact means and covariances) eta <- MASS::mvrnorm(num_obs, mu = rep(0, 2), Sigma = psi, empirical = TRUE) #' Function for simulating y simy <- function(eta, lambda) { tcrossprod(eta, lambda) + MASS::mvrnorm(num_obs, mu = rep(0, length(theta)), Sigma = diag(theta)) }
#' Simulation nsim <- 2500 tfsy_sim <- fsy_sim <- array(NA, dim = c(num_obs, ncol(lambda), nsim)) # Also save the scoring matrix a_sim <- array(NA, dim = c(ncol(lambda), nrow(lambda), nsim)) for (i in seq_len(nsim)) { y <- simy(eta = eta, lambda = lambda) tfsy_sim[, , i] <- R2spa::compute_fscore( y, lambda = lambda, theta = diag(theta), psi = psi ) fsy <- R2spa::get_fs( data.frame(y) |> setNames(paste0("y", 1:7)), model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7", std.lv = TRUE) fsy_sim[, , i] <- as.matrix(fsy[, 1:2]) a_sim[, , i] <- attr(fsy, which = "scoring_matrix") }
saveRDS(list(tfsy_sim, fsy_sim, a_sim), file = "sim_correction-error-multi.RDS")
tfsy_sim <- readRDS("sim_correction-error-multi.RDS")[[1]] fsy_sim <- readRDS("sim_correction-error-multi.RDS")[[2]] a_sim <- readRDS("sim_correction-error-multi.RDS")[[3]]
#' Average conditional variance # a known apply(tfsy_sim, MARGIN = 1, FUN = \(x) cov(t(x))) |> rowMeans() |> matrix(ncol = 2, nrow = 2) # compare to theoretical value true_a <- R2spa:::compute_a_reg(lambda, psi = psi, theta = diag(theta)) true_a %*% diag(theta) %*% t(true_a) # a estimated apply(fsy_sim, MARGIN = 1, FUN = \(x) cov(t(x))) |> rowMeans() |> matrix(ncol = 2, nrow = 2)
correct_fac11 <- sum(diag(diag(theta) %*% cov(t(a_sim[1, , ])))) correct_fac22 <- sum(diag(diag(theta) %*% cov(t(a_sim[2, , ])))) correct_fac21 <- sum(diag(diag(theta) %*% cov(t(a_sim[2, , ]), t(a_sim[1, , ])))) correct_fac <- matrix(c(correct_fac11, correct_fac21, correct_fac21, correct_fac22), nrow = 2) true_a %*% diag(theta) %*% t(true_a) + correct_fac
The $i$, $j$ element of the correction matrix can be approximated by $\mathrm{tr}(\hat{\boldsymbol{\Theta}} \hat{\boldsymbol{V}}{{\tilde{\boldsymbol{a}}_i}{\tilde{\boldsymbol{a}}_j}})$, with $\hat{\boldsymbol{V}}{{\tilde{\boldsymbol{a}}_i}{\tilde{\boldsymbol{a}}_j}})$ obtained by
$$\boldsymbol{J}{\tilde{\boldsymbol{a}}_i}(\boldsymbol{\theta})^\top \hat{\boldsymbol{V}}{\tilde{\boldsymbol{\theta}}} \boldsymbol{J}_{\tilde{\boldsymbol{a}}_j}(\boldsymbol{\theta}),$$
y <- simy(eta = eta, lambda = lambda) cfa_fit <- cfa("f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7", data = setNames(data.frame(y), paste0("y", 1:7)), std.lv = TRUE) # Jacobian R2spa:::compute_a(coef(cfa_fit), lavobj = cfa_fit) jac_a1 <- lavaan::lav_func_jacobian_complex( \(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]][1, ], coef(cfa_fit) ) jac_a2 <- lavaan::lav_func_jacobian_complex( \(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]][2, ], coef(cfa_fit) ) # Correction[1, 1] sum(diag(lavInspect(cfa_fit, what = "est")$theta %*% jac_a1 %*% vcov(cfa_fit) %*% t(jac_a1))) # Correction[2, 2] sum(diag(lavInspect(cfa_fit, what = "est")$theta %*% jac_a2 %*% vcov(cfa_fit) %*% t(jac_a2))) # Correction[2, 1] sum(diag(lavInspect(cfa_fit, what = "est")$theta %*% jac_a2 %*% vcov(cfa_fit) %*% t(jac_a1))) fsy <- R2spa::get_fs( data.frame(y) |> setNames(paste0("y", 1:7)), model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7", std.lv = TRUE) attr(fsy, which = "fsT") # Using `get_fs(..., corrected_fsT = TRUE) fsy2 <- R2spa::get_fs( data.frame(y) |> setNames(paste0("y", 1:7)), model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7", std.lv = TRUE, corrected_fsT = TRUE) attr(fsy2, which = "fsT")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.