Nothing
#' Repeated holdout (Monte Carlo) cross-validation of R^2 for structural equation models ('lavaan' objects)
#'
#' Estimate out-of-sample predictive performance for structural relations in a
#' fitted 'lavaan' model using repeated holdout (Monte Carlo cross-validation,
#' leave-group-out CV). At each repetition, the model is refitted on a random
#' training subset and evaluated on a disjoint test subset.
#'
#' For observed outcomes, R^2 is computed by comparing test-set observed values
#' with predictions obtained by applying the training-set structural coefficients
#' to the test-set predictors.
#'
#' For latent outcomes, the outcome is not directly observed in the test set.
#' Factor scores for the outcome are first computed in the test set using the
#' measurement model learned on the training set; these scores serve as the
#' outcome values. Predictions are then formed by applying the training-set
#' structural coefficients to the test-set predictors (including factor scores
#' for any latent predictors). R^2 is computed by comparing the test-set factor
#' scores of the outcome with these predicted scores.
#'
#' The in-sample baseline R^2 is computed on the full dataset using the same
#' metric as in cross-validation: observed outcomes use observed-versus-predicted
#' R^2; latent outcomes use score-versus-predicted-score R^2.
#'
#' By default, repetitions continue until the running mean R^2 for each outcome
#' stabilizes within a specified tolerance over a trailing window of successful
#' splits, or until a maximum number of splits is reached.
#'
#' The summary table reports the in-sample baseline R^2, the median cross-validated
#' R^2, its standard deviation, and the percent drop (baseline vs. median CV) with
#' heuristic threshold markers. The percent drop is suppressed when the in-sample
#' R^2 is very small.
#'
#' @usage
#' lav_cv(
#' fit,
#' data = NULL,
#' times = "auto",
#' train_prop = 0.8,
#' seed = 42L,
#' quiet = TRUE,
#' digits = 3L,
#' plot = TRUE,
#' tol = 0.001,
#' window = 50L,
#' max_times = 3000L,
#' min_r2_for_pct = 0.05
#' )
#'
#' @param fit A fitted 'lavaan' object (required).
#' @param data The data frame used to fit the model; if NULL, it is extracted from 'fit' when available (default: NULL).
#' @param times Integer indicating the number of random splits, or "auto" for stabilization-based early stopping (default: "auto").
#' @param train_prop Numeric in (0,1). Proportion of cases in the training split for each repetition (default: 0.8).
#' @param seed Integer. Random seed for reproducibility of the splits (default: 42).
#' @param quiet Logical. Suppress 'lavaan' refit messages when TRUE (default: TRUE).
#' @param digits Integer. Number of digits to print in summaries (default: 3).
#' @param plot Logical. Show convergence plots of the running mean R^2 per outcome (default: TRUE).
#' @param tol Numeric. Tolerance for the auto-stop rule on the running mean (default: 0.001).
#' @param window Integer. Trailing window size (number of successful splits) used by the auto-stop rule (default: 50).
#' @param max_times Integer. Maximum number of splits when \code{times} = "auto" (default: 3000).
#' @param min_r2_for_pct Numeric in (0,1). Minimum in-sample R^2 required to compute percent drop; below this, \%_drop is set to NA (default: 0.05).
#'
#' @return A list with class 'lav_cv' and elements:
#' \describe{
#' \item{\code{table}}{Data frame with columns:
#' \code{outcome}, \code{type} ("observed" or "latent"),
#' \code{r2_in}, \code{r2_cv_mean}, \code{r2_cv_median}, \code{r2_cv_sd},
#' \code{drop_mean_pct}, \code{drop_med_pct}, \code{splits_used}.}
#' \item{\code{split_matrix}}{Matrix of split-wise test-set R^2 values (rows = splits, columns = outcomes).}
#' \item{\code{times}}{Character or integer indicating the number of splits used (e.g., \code{"auto(534)"} or \code{500}).}
#' \item{\code{train_prop}}{Numeric. Training proportion used in each split.}
#' \item{\code{N}}{Integer. Number of rows in the input data.}
#' \item{\code{seed}}{Integer. Random seed used to generate the splits.}
#' \item{\code{tol}}{Numeric. Tolerance used by the auto-stop rule.}
#' \item{\code{window}}{Integer. Trailing window size for the auto-stop rule.}
#' \item{\code{min_r2_for_pct}}{Numeric. Minimum in-sample R^2 required to compute percent drop.}
#' \item{\code{call}}{\code{match.call()} of the function call.}
#' \item{\code{digits}}{Integer. Default number of digits for printing.}
#' }
#'
#' @seealso \code{\link[lavaan]{sem}}, \code{\link[lavaan]{lavPredict}},
#' \code{\link[lavaan]{inspect}}
#'
#' @references
#' Cudeck, R., & Browne, M. W. (1983). Cross-Validation Of Covariance Structures. Multivariate Behavioral Research, 18(2), 147-167. \doi{10.1207/s15327906mbr1802_2}
#'
#' Hastie, T., Friedman, J., & Tibshirani, R. (2001). The Elements of Statistical Learning. In Springer Series in Statistics. Springer New York. \doi{10.1007/978-0-387-21606-5}
#'
#' Kvalseth, T. O. (1985). Cautionary Note about R2. The American Statistician, 39(4), 279-285. \doi{10.1080/00031305.1985.10479448}
#'
#' Shmueli, G. (2010). To Explain or to Predict? Statistical Science, 25(3). \doi{10.1214/10-sts330}
#'
#' Yarkoni, T., & Westfall, J. (2017). Choosing Prediction Over Explanation in Psychology: Lessons From Machine Learning. Perspectives on Psychological Science, 12(6), 1100-1122. \doi{10.1177/1745691617693393}
#'
#' @examplesIf requireNamespace("lavaan", quietly = TRUE)
#' library("lavaan")
#' model <- "
#' ind60 =~ x1 + x2 + x3
#' dem60 =~ y1 + y2 + y3 + y4
#' dem65 =~ y5 + y6 + y7 + y8
#'
#' dem60 ~ ind60
#' dem65 ~ ind60 + dem60
#'
#' y1 ~~ y5
#' y2 ~~ y6
#' "
#' fit <- lavaan::sem(
#' model = model,
#' data = lavaan::PoliticalDemocracy,
#' std.lv = TRUE,
#' estimator = "MLR",
#' meanstructure = TRUE)
#'
#' result <- lav_cv(
#' fit = fit,
#' data = lavaan::PoliticalDemocracy,
#' times = 5)
#' print(result)
#'
#' @export
lav_cv <- function(
fit,
data = NULL,
times = "auto",
train_prop = 0.8,
seed = 42L,
quiet = TRUE,
digits = 3L,
plot = TRUE,
tol = 0.001,
window = 50L,
max_times = 3000L,
min_r2_for_pct = 0.05
) {
if (!inherits(fit, "lavaan")) stop("`fit` must be a fitted lavaan object.", call. = FALSE)
if (!(is.numeric(times) || identical(times, "auto"))) stop("`times` must be integer or 'auto'.", call. = FALSE)
if (train_prop <= 0 || train_prop >= 1) stop("`train_prop` must be in (0,1).", call. = FALSE)
OPT <- fit@Options
if (!is.null(OPT$group) && nzchar(OPT$group)) stop("Multigroup models are not supported yet.", call. = FALSE)
data_in <- data %||%
tryCatch(lavaan::lavInspect(fit, "data"), error = function(e) NULL) %||%
tryCatch(lavaan::lavInspect(fit, "data.original"), error = function(e) NULL)
if (is.null(data_in)) stop("Could not locate data; pass it via `data =`.", call. = FALSE)
if (is.matrix(data_in)) data_in <- as.data.frame(data_in)
pt <- lavaan::parTable(fit)
is_struct <- pt$op == "~" & pt$rhs != "1"
is_interaction <- grepl(":", pt$rhs, fixed = TRUE) | pt$op == "XWITH"
struct <- pt[is_struct & !is_interaction, c("lhs", "rhs")]
outcomes_all <- unique(struct$lhs)
if (!length(outcomes_all)) stop("No structural regressions found.", call. = FALSE)
latents_all <- tryCatch(lavaan::lavNames(fit, type = "lv"), error = function(e) character(0))
outcome_is_latent <- outcomes_all %in% latents_all
any_latents <- length(latents_all) > 0L
estimator_use <- "ML" # vabbe non serve ancora ma ok
missing_opt <- OPT$missing
std_lv <- isTRUE(OPT$std.lv)
parameterization <- OPT$parameterization
ordered_vars <- tryCatch(lavaan::lavNames(fit, type = "ov.ord"), error = function(e) character(0))
model_obj <- lavaan::parTable(fit)
r2_fun <- function(y, yhat) {
ok <- is.finite(y) & is.finite(yhat)
y <- y[ok]; yhat <- yhat[ok]
if (length(y) < 3L) return(NA_real_)
ss_res <- sum((y - yhat)^2)
ss_tot <- sum((y - mean(y))^2)
if (ss_tot <= 0) return(NA_real_)
1 - ss_res / ss_tot
}
get_struct_coefs <- function(f) {
pe <- lavaan::parameterEstimates(f, standardized = FALSE)
out <- lapply(outcomes_all, function(y) {
b <- pe[pe$op == "~" & pe$lhs == y, c("rhs", "est")]
ic <- pe[pe$op == "~1" & pe$lhs == y, "est"]; if (!length(ic)) ic <- 0
list(betas = stats::setNames(b$est, b$rhs), intercept = as.numeric(ic))
})
names(out) <- outcomes_all
out
}
compute_lv_scores <- function(fit_obj, newdata) {
if (!any_latents) return(NULL)
fs <- tryCatch(lavaan::lavPredict(fit_obj, newdata = newdata, type = "lv", method = "regression"),
error = function(e) NULL)
if (is.null(fs)) fs <- tryCatch(lavaan::lavPredict(fit_obj, newdata = newdata, type = "lv", method = "Bartlett"),
error = function(e) NULL)
fs
}
build_X <- function(pred_names, fs, newdata) {
M <- NULL
if (any_latents && !is.null(fs)) {
X_lat <- intersect(pred_names, colnames(fs))
if (length(X_lat)) M <- cbind(M, fs[, X_lat, drop = FALSE])
}
X_obs <- intersect(pred_names, colnames(newdata))
if (length(X_obs)) M <- cbind(M, as.matrix(newdata[, X_obs, drop = FALSE]))
M
}
# estrazione R^2 di baseline
coefs_full <- get_struct_coefs(fit)
fs_full <- if (any_latents) compute_lv_scores(fit, data_in) else NULL
baseline_r2 <- stats::setNames(rep(NA_real_, length(outcomes_all)), outcomes_all) # ok call diretta da namespace
for (y in outcomes_all) {
cY <- coefs_full[[y]]
preds <- names(cY$betas)
X <- build_X(preds, fs_full, data_in)
if (!is.null(X)) {
yhat <- drop(cY$intercept + X %*% cY$betas[colnames(X)])
if (!outcome_is_latent[match(y, outcomes_all)]) {
if (y %in% colnames(data_in)) baseline_r2[y] <- r2_fun(data_in[[y]], yhat)
} else if (!is.null(fs_full) && y %in% colnames(fs_full)) {
baseline_r2[y] <- r2_fun(fs_full[, y], yhat)
}
}
}
# aiuto, funziona questo! lascia cosi
set.seed(seed)
N <- nrow(data_in); n_train <- max(3L, floor(train_prop * N))
maxN <- if (is.numeric(times)) as.integer(times) else as.integer(max_times)
res_mat <- matrix(NA_real_, nrow = maxN, ncol = length(outcomes_all))
colnames(res_mat) <- outcomes_all
used <- 0L
done <- function(vmat) {
for (j in seq_len(ncol(vmat))) {
v <- vmat[, j]; v <- v[is.finite(v)]
if (length(v) < window + 5L) return(FALSE)
cm <- cumsum(v)/seq_along(v)
if (abs(cm[length(cm)] - cm[length(cm) - window]) > tol) return(FALSE)
}
TRUE
}
target_splits <- if (is.numeric(times)) as.integer(times) else as.integer(max_times)
while (used < target_splits) {
tr_idx <- sort(sample.int(N, size = n_train, replace = FALSE))
te_idx <- setdiff(seq_len(N), tr_idx); if (length(te_idx) < 3L) next
dat_train <- data_in[tr_idx, , drop = FALSE]
dat_test <- data_in[te_idx, , drop = FALSE]
fit_train <- tryCatch(
lavaan::sem(
model = model_obj, data = dat_train,
estimator = estimator_use, missing = missing_opt,
se = "none", test = "none",
std.lv = std_lv, parameterization = parameterization,
ordered = if (length(ordered_vars)) ordered_vars else NULL,
warn = !quiet
),
error = function(e) NULL
)
if (is.null(fit_train) || !isTRUE(lavaan::inspect(fit_train, "converged"))) next
used <- used + 1L
coefs_tr <- get_struct_coefs(fit_train)
fs_te <- if (any_latents) compute_lv_scores(fit_train, dat_test) else NULL
for (j in seq_along(outcomes_all)) {
y <- outcomes_all[j]
cY <- coefs_tr[[y]]
preds <- names(cY$betas)
X <- build_X(preds, fs_te, dat_test)
if (is.null(X)) { res_mat[used, j] <- NA_real_; next }
yhat <- drop(cY$intercept + X %*% cY$betas[colnames(X)])
if (!outcome_is_latent[j]) {
if (y %in% colnames(dat_test)) res_mat[used, j] <- r2_fun(dat_test[[y]], yhat)
} else {
if (!is.null(fs_te) && y %in% colnames(fs_te)) res_mat[used, j] <- r2_fun(fs_te[, y], yhat)
}
}
if (identical(times, "auto") && used >= (window + 5L)) {
if (done(res_mat[seq_len(used), , drop = FALSE])) break
}
}
if (used < nrow(res_mat)) res_mat <- res_mat[seq_len(used), , drop = FALSE]
# estrai sommari (esplicitare stats senno fa casino non so perche)
r2_cv_mean <- apply(res_mat, 2L, function(v) { v <- v[is.finite(v)]; if (!length(v)) NA_real_ else mean(v) })
r2_cv_median <- apply(res_mat, 2L, function(v) { v <- v[is.finite(v)]; if (!length(v)) NA_real_ else stats::median(v) })
r2_cv_sd <- apply(res_mat, 2L, function(v) { v <- v[is.finite(v)]; if (!length(v)) NA_real_ else stats::sd(v) })
n_ok <- apply(res_mat, 2L, function(v) sum(is.finite(v)))
drop_mean_pct <- rep(NA_real_, length(outcomes_all))
drop_median_pct <- rep(NA_real_, length(outcomes_all))
for (j in seq_along(outcomes_all)) {
if (is.finite(baseline_r2[j]) && baseline_r2[j] >= min_r2_for_pct) {
drop_mean_pct[j] <- 100 * (baseline_r2[j] - r2_cv_mean[j]) / baseline_r2[j]
drop_median_pct[j] <- 100 * (baseline_r2[j] - r2_cv_median[j]) / baseline_r2[j]
}
}
# ok maybe it works...spero v5
out_type <- ifelse(outcome_is_latent, "latent", "observed")
res_table <- data.frame(
outcome = outcomes_all,
type = out_type,
r2_in = as.numeric(baseline_r2),
r2_cv_mean = as.numeric(r2_cv_mean),
r2_cv_median = as.numeric(r2_cv_median),
r2_cv_sd = as.numeric(r2_cv_sd),
drop_mean_pct = as.numeric(drop_mean_pct),
drop_med_pct = as.numeric(drop_median_pct),
splits_used = as.integer(n_ok),
stringsAsFactors = FALSE
)
res <- list(
table = res_table[order(res_table$type, res_table$outcome), ],
split_matrix = res_mat,
times = if (identical(times, "auto")) paste0("auto(", used, ")") else as.integer(times),
train_prop = train_prop,
N = nrow(data_in),
seed = seed,
digits = digits,
call = match.call(),
tol = tol,
window = window,
min_r2_for_pct = min_r2_for_pct
)
class(res) <- c("lav_cv", "list")
# devi indicare graphics::!
if (isTRUE(plot)) {
op <- graphics::par(no.readonly = TRUE); on.exit(graphics::par(op), add = TRUE)
for (y in res$table$outcome) {
v <- res$split_matrix[, y]; v <- v[is.finite(v)]
if (!length(v)) next
m <- seq_along(v)
cm <- cumsum(v)/m
s2 <- c(NA_real_, cumsum((v[-1] - cm[-length(cm)]) * (v[-1] - cm[-1])) / pmax(1, m[-1] - 1))
se <- sqrt(s2 / m); se[1] <- NA_real_
upper <- cm + 1.96 * se
lower <- cm - 1.96 * se
graphics::plot(m, cm, type = "l", xlab = "Splits used", ylab = "Running mean of R^2",
main = paste0("CV convergence: ", y))
graphics::lines(m, upper, lty = 2)
graphics::lines(m, lower, lty = 2)
graphics::abline(h = utils::tail(cm, 1L), lty = 3) # mancava la call da utils!!!
}
}
res
}
### metodi S3 print e summary:
#' @param x A 'lav_cv' object.
#' @param ... Additional arguments; unused.
#' @rdname lav_cv
#' @export
print.lav_cv <- function(x, digits = x$digits %||% 3L, ...) {
stopifnot(is.list(x), !is.null(x$table))
u2 <- if (isTRUE(l10n_info()[["UTF-8"]])) "\u00B2" else "^2"
# header con i dettagli giusti!!
auto_txt <- if (grepl("^auto\\(", as.character(x$times))) {
paste0("auto-stop at ", gsub("[^0-9]", "", x$times), " splits")
} else {
paste0(x$times, " splits")
}
cat(sprintf(
"\nRepeated holdout (Monte Carlo) cross-validation of R%s\n", u2
))
cat(sprintf("Splits: %s (train proportion = %.2f; N = %d; seed = %s)\n\n",
auto_txt, x$train_prop, x$N, as.character(x$seed)))
tbl <- x$table
star_fun <- function(d){
if (is.na(d)) "" else if (d >= 50) "***" else if (d >= 30) "**" else if (d >= 10) "*" else ""
}
stars <- vapply(tbl$drop_med_pct, star_fun, character(1))
dash <- "-" # al posto di NA meglio un trattino
fmt <- function(z, d = digits) {
ifelse(is.na(z), dash, formatC(z, digits = d, format = "f"))
}
fmt1 <- function(z) {
ifelse(is.na(z), dash, formatC(z, digits = 1, format = "f"))
}
for (tp in c("observed", "latent")) {
sub <- tbl[tbl$type == tp, , drop = FALSE]
if (!nrow(sub)) next
out_names <- sub$outcome
r2_infit <- round(sub$r2_in, digits)
r2_med <- round(sub$r2_cv_median, digits)
sd_cv <- round(sub$r2_cv_sd, digits)
drop_pct <- ifelse(is.finite(sub$drop_med_pct), round(sub$drop_med_pct, 1), NA)
stars_tp <- stars[tbl$type == tp]
cat(sprintf("%s outcomes\n", tools::toTitleCase(tp)))
cat(rep("-", 86), "\n", sep = "")
cat(sprintf("%-16s %14s %16s %10s %9s %3s\n",
"Outcome", paste0("R", u2, " in-sample"),
paste0("CV R", u2), "CV SD", "% drop", ""))
cat(rep("-", 86), "\n", sep = "")
for (i in seq_len(nrow(sub))) {
cat(sprintf("%-16s %14s %16s %10s %9s %3s\n",
out_names[i],
fmt(r2_infit[i]),
fmt(r2_med[i]),
fmt(sd_cv[i]),
fmt1(drop_pct[i]),
stars_tp[i]))
}
cat("\n")
}
# notine a pie tabella
cat(sprintf("Auto-stop rule: window = %d, tol = %.3f (applied when times = 'auto').\n", x$window, x$tol))
cat("Stars indicate % drop thresholds: * >=10%, ** >=30%, *** >=50%.\n")
cat(sprintf("%% drop compares in-sample R%s to median CV R%s; suppressed when in-sample R%s < %.2f.\n",
u2, u2, u2, x$min_r2_for_pct %||% 0.05))
cat(sprintf("CV SD is the standard deviation of split-wise R%s across successful splits.\n", u2))
invisible(x)
}
#' @param object A 'lav_cv' object.
#' @param ... Additional arguments; unused.
#' @rdname lav_cv
#' @export
summary.lav_cv <- function(object, ...) {
print.lav_cv(object, ...);
invisible(object)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.