Nothing
#' Internal function for the Mixed-Effects Model Calibration Curve (MIXC)
#'
#' Estimates the calibration curve using a logistic generalized linear mixed model.
#'
#' @param data optional data frame containing the variables \code{p}, \code{y},
#' and \code{cluster}. If supplied, variable names should be given without
#' quotation marks.
#' @param p predicted probabilities (numeric vector) or name of the column in
#' \code{data}.
#' @param y binary outcome variable or the name of the column in \code{data}.
#' @param cluster Cluster identifier (factor, character, or integer) or name of
#' the column in \code{data}.
#' @param grid the grid for the calibration curve evaluation
#' @param method character, type of mixed-effects model: \code{"intercept"} (random intercept)
#' or \code{"slope"} (random slope). Default is \code{"slope"}.
#' @param plot logical, indicating whether to generate a calibration plot. Default is \code{TRUE}.
#' @param cluster_curves logical, whether to include cluster-specific curves in the plot.
#' Default is \code{FALSE}.
#' @param nsims_pi integer, number of simulations for prediction intervals. Default is \code{10000}.
#' @param CI logical, whether to calculate confidence intervals. Default is \code{TRUE}.
#' @param CI_method character, method for computing the confidence intervals of the observed proportions.
#' If \code{"delta"}, the delta method is applied. Conversely, when \code{CI_method == "naive"}, no correction is applied.
#' Default is \code{"naive"}.
#' @param cl.level the confidence level for the calculation of the confidence interval. Default is \code{0.95}.
#'
#' @details
#' This function estimates the calibration curves using a logistic generalized linear mixed model.
#'
#' @return A list containing:
#' \describe{
#' \item{\code{model}}{The fitted mixed-effects model object}
#' \item{\code{cluster_data}}{Data frame with calibration data for each cluster}
#' \item{\code{plot_data}}{Data frame with calibration data for the average cluster}
#' \item{\code{observed_data}}{Data frame with calibration data for individual observations}
#' \item{\code{plot}}{A \code{ggplot2} object if \code{plot = TRUE}, otherwise \code{NULL}}
#' }
MIXC <- function(data = NULL,
p,
y,
cluster,
grid,
method = c("slope", "intercept"),
plot = TRUE,
cluster_curves = FALSE,
nsims_pi = 10000,
CI = TRUE,
CI_method = c("naive", "delta"),
cl.level = 0.95) {
# --- Extract from data if provided ---
callFn = match.call()
method = match.arg(method)
CI_method = match.arg(CI_method)
if (!is.null(data)) {
if(!all(sapply(c("p", "y", "cluster"), function(a) as.character(callFn[a])) %in% colnames(data)))
stop(paste("Variables", paste0(
callFn[c("p", "y", "cluster")], collapse = ", "
), "were not found in the data.frame."))
p = eval(callFn$p, data)
logit = Logit(p)
y = eval(callFn$y, data)
cluster = eval(callFn$cluster, data)
}
alpha = 1 - cl.level
# --- Base dataframe ---
df = data.frame(
logit_preds = Logit(as.numeric(p)),
y = as.factor(y),
cluster = as.factor(cluster)
)
# --- Model fitting ---
args = list(
data = df,
family = "binomial",
verbose = 0
)
args$formula =
if(method == "intercept") {
y ~ rcs(logit_preds, 3) + (1 | cluster)
} else {
y ~ rcs(logit_preds, 3) + (rcs(logit_preds, 3) | cluster)
}
# Suppress "boundary (singular)" warning: arises from restricted cubic spline basis multicollinearity in GLMM,
# not from model misspecification, expected in this context.
fit_model = suppressMessages(do.call("glmer", args))
# --- Predictions for original data ---
df$re_preds = predict(fit_model, df)
df$obs_prob = Ilogit(df$re_preds)
# --- Cluster range calculation ---
cluster_ranges <- df %>%
group_by(cluster) %>%
summarise(
min_logit = min(logit_preds),
max_logit = max(logit_preds),
.groups = "drop"
)
# --- Cluster-specific calibration data ---
cluster_cal_data <- data.frame()
for (i in seq_len(nrow(cluster_ranges))) {
current_cluster = cluster_ranges$cluster[i]
cluster_logit_preds = Logit(grid)
temp_data <- data.frame(
cluster = current_cluster,
logit_preds = cluster_logit_preds,
x = grid
)
temp_data$p_obs_logit = predict(fit_model, newdata = temp_data)
temp_data$pred_prob = Ilogit(temp_data$logit_preds)
temp_data$obs_prob = Ilogit(temp_data$p_obs_logit)
cluster_cal_data = rbind(cluster_cal_data, temp_data)
}
# --- Average calibration data ---
avg_logit_preds = Logit(grid)
avg_cal_data = data.frame(logit_preds = avg_logit_preds)
avg_cal_data$p_obs_logit = eta = predict(fit_model, newdata = avg_cal_data, re.form = NA)
avg_cal_data$pred_prob = Ilogit(avg_cal_data$logit_preds)
avg_cal_data$obs_prob = Ilogit(avg_cal_data$p_obs_logit)
avg_cal_data$x = grid
# --- Confidence intervals ---
if (CI) {
deltaCI <- function(eta, se, alpha) {
muHat = Ilogit(eta)
seMu = binomial()$mu.eta(eta) * se
data.frame(
eta = eta,
etaLCL = eta + qnorm(alpha / 2) * se,
etaUCL = eta + qnorm(1 - alpha / 2) * se,
yHat = muHat,
yHatLCL = pmax(0, pmin(muHat + qnorm(alpha / 2) * seMu, 1)),
yHatUCL = pmax(0, pmin(muHat + qnorm(1 - alpha / 2) * seMu, 1))
)
}
Z = mm = model.matrix(~ rcs(logit_preds, 3), avg_cal_data)
# pvar <- diag(mm %*% Matrix::tcrossprod(vcov(fit_model), mm))
V = vcov(fit_model) # dense
pvar = rowSums(as.array((mm %*% V) * mm))
D = VarCorr(fit_model)$cluster
rvar = rowSums((Z %*% D) * Z)
tvar = pvar + rvar
if (CI_method == "delta") {
deltaResults = deltaCI(eta, sqrt(tvar), alpha)
avg_cal_data = within(
avg_cal_data, {
p_lower_ci = deltaResults$yHatLCL
p_upper_ci = deltaResults$yHatUCL
logit_lower_ci = deltaResults$etaLCL
logit_upper_ci = deltaResults$etaUCL
}
)
} else {
avg_cal_data = within(
avg_cal_data, {
logit_lower_ci = p_obs_logit + qnorm(alpha / 2) * sqrt(tvar)
logit_upper_ci = p_obs_logit + qnorm(1 - alpha / 2) * sqrt(tvar)
p_lower_ci = Ilogit(logit_lower_ci)
p_upper_ci = Ilogit(logit_upper_ci)
}
)
}
avg_cal_data$cluster <- "average"
# --- Prediction intervals ---
# Here, we suppress the following warnings:
# Warning messages:
# 1: The following levels of cluster from newdata
# -- average -- are not in the model data.
# Currently, predictions for these values are based only on the
# fixed coefficients and the observation-level error.
# 2: executing %dopar% sequentially: no parallel backend registered
PI <- suppressWarnings(predictInterval(
merMod = fit_model,
newdata = avg_cal_data,
level = cl.level,
n.sims = nsims_pi,
stat = "median",
type = "probability",
include.resid.var = TRUE
))
avg_cal_data <- cbind(avg_cal_data, PI)
}
# --- Plotting ---
plot_obj <- NULL
if (plot) {
plot_obj <- ggplot(avg_cal_data) +
geom_abline(linetype = "dashed", alpha = 0.1) +
geom_ribbon(aes(
x = pred_prob, ymin = p_lower_ci, ymax = p_upper_ci,
fill = "CI 95%"
), alpha = 0.3) +
geom_ribbon(aes(
x = pred_prob, ymin = lwr, ymax = upr,
fill = "PI 95%"
), alpha = 0.2) +
geom_line(aes(x = pred_prob, y = obs_prob),
linewidth = 1, linetype = "dashed", color = "black"
) +
xlab("Estimated probability") +
ylab("Observed proportion") +
theme_classic(base_size = 8, base_family = "serif") +
scale_x_continuous(breaks = seq(0, 1, 0.1)) +
scale_y_continuous(breaks = seq(0, 1, 0.2)) +
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
scale_fill_manual(name = "Uncertainty", values = c("green4", "green")) +
theme(
legend.key.size = unit(0.3, "cm"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
if (cluster_curves) {
plot_obj <- plot_obj +
geom_line(
data = cluster_cal_data,
aes(x = pred_prob, y = obs_prob, color = cluster),
linewidth = 0.2, linetype = "dotted", show.legend = FALSE
)
}
}
# --- Patient-level calibration data ---
patient_cal_data <- data.frame()
for (current_cluster in unique(df$cluster)) {
cluster_subset <- df %>% filter(cluster == current_cluster)
temp_patient_data <- data.frame(
cluster = cluster_subset$cluster,
logit_preds = cluster_subset$logit_preds
)
temp_patient_data$p_obs_logit <- predict(fit_model, newdata = temp_patient_data)
temp_patient_data$pred_prob <- Ilogit(temp_patient_data$logit_preds)
temp_patient_data$obs_prob <- Ilogit(temp_patient_data$p_obs_logit)
patient_cal_data <- rbind(patient_cal_data, temp_patient_data)
}
# --- Return ---
list(
model = fit_model,
cluster_data = cluster_cal_data,
plot_data = avg_cal_data,
observed_data = patient_cal_data,
plot = plot_obj
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.