R/CGC.R
In CalibrationCurves: Calibration Performance

Documented in CGC

#' Internal function for the Clustered Grouped Calibration Curve (CGC)
#'
#' Estimates the calibration curves using the CGC approach. The function supports two grouping methods:
#' equal-sized groups (\code{"grouped"}) or interval-based groups (\code{"interval"}).
#' Optionally, a calibration plot can be produced with cluster-specific curves.
#'
#' @param data optional data frame containing the variables \code{p}, \code{y},
#'   and \code{cluster}. If supplied, variable names should be given without
#'   quotation marks.
#' @param p predicted probabilities (numeric vector) or name of the column in
#'   \code{data}.
#' @param y binary outcome variable or the name of the column in \code{data}.
#' @param cluster cluster identifier (factor, character, or integer) or name of
#'   the column in \code{data}.
#' @param ntiles integer, number of groups (tiles) for calibration. Default is \code{10}.
#' @param cluster_curves logical, whether to include cluster-specific calibration
#'   curves in the plot. Default is \code{FALSE}.
#' @param plot logical, whether to return a calibration plot. Default is \code{TRUE}.
#' @param size numeric, point size for plotted curves. Default is \code{1}.
#' @param linewidth numeric, line width for plotted curves. Default is \code{0.4}.
#' @param univariate logical, whether to use univariate meta-analysis. Default is \code{FALSE}.
#' @param method character, grouping method: \code{"grouped"} (equal-sized groups) or
#'   \code{"interval"} (interval-based). Default is \code{"grouped"}.
#' @param cl.level the confidence level for the calculation of the confidence interval. Default is \code{0.95}.
#'
#' @details
#' When \code{method = "grouped"}, the predictions are divided into equal-sized bins using quantiles.
#' Conversely, if \code{method ="interval"}, the predictions are divided into fixed-width bins across [0, 1].
#'
#' The function performs a meta-analysis within each group. This can be either a univariate or bivariate analysis,
#' which is specified in the \code{univariate} argument. The univariate analysis is performed using the
#' \code{\link[meta]{metaprop}} function and the bivariate analysis employs the \code{\link[metafor]{rma.mv}} function.
#' Hereafter, the results are aggregated and plotted as calibration curves.
#'
#' @return A list containing:
#' \describe{
#'   \item{\code{plot_data}}{Data frame of meta-analysis calibration estimates.}
#'   \item{\code{trad_grouped}}{Data frame with traditional grouped calibration results.}
#'   \item{\code{observed_data}}{Data frame with per-observation calibration data.}
#'   \item{\code{cluster_data}}{Data frame with cluster-specific calibration summaries.}
#'   \item{\code{plot}}{A \code{ggplot2} object if \code{plot = TRUE}, otherwise \code{NULL}.}
#' }
#'
CGC <- function(data = NULL,
                p,
                y,
                cluster,
                cl.level = 0.95,
                ntiles = 10,
                cluster_curves = FALSE,
                plot = TRUE,
                size = 1,
                linewidth = 0.4,
                univariate = FALSE,
                method = c("grouped", "interval")) {
  # --- Arguments check ---
  callFn = match.call()
  method = match.arg(method)
  alpha  = 1 - cl.level

  if (!is.null(data)) {
    if(!all(sapply(c("p", "y", "cluster"), function(a) as.character(callFn[a])) %in% colnames(data)))
      stop(paste("Variables", paste0(
        callFn[c("p", "y", "cluster")], collapse = ", "
      ), "were not found in the data.frame."))
    p   = eval(callFn$p, data)
    logit   = Logit(p)
    y       = eval(callFn$y, data)
    cluster = eval(callFn$cluster, data)
  }

  if(ntiles < 5) {
    warning(sprintf("Number of groups is %s, which is too low. Will be set to 5", ntiles),
            immediate. = TRUE)
    ntiles = 5
  }

  # --- Base dataframe ---
  df = data.frame(
    p       = as.numeric(p),
    y       = as.numeric(y),
    cluster = as.factor(cluster)
  )

  # --- Assign decile/interval groups ---
  if (method == "grouped") {
    df <- df %>%
      group_by(cluster) %>%
      mutate(decile_group = ntile(p, ntiles))
  } else if (method == "interval") {
    df <- df %>%
      group_by(cluster) %>%
      mutate(decile_group = cut(p, breaks = seq(0, 1, length.out = ntiles + 1)))
  }

  # --- Cluster-level summaries ---
  deciles_cluster <- df %>%
    group_by(cluster, decile_group) %>%
    mutate(n_tile = n()) %>%
    ungroup() %>%
    mutate(N_tile = n())

  if(min(deciles_cluster$n_tile) < 50)
    warning("There is a group with less than 50 observations. Consider decreasing the number of groups.")

  decilesbycluster <- deciles_cluster %>%
    group_by(cluster, decile_group) %>%
    summarise(
      n_tile = n(),
      y_mean = mean(y),
      x_mean = mean(p),
      .groups = "drop"
    )

  # --- Traditional grouped calibration ---
  deciles_all <- df %>%
    ungroup() %>%
    mutate(decile_group = if (method == "grouped") ntile(p, ntiles) else cut(p, breaks = seq(0, 1, length.out = ntiles + 1))) %>%
    group_by(decile_group) %>%
    summarise(
      std_error_y = sd(y) / sqrt(length(y)),
      std_error_x = sd(p) / sqrt(length(p)),
      var_x = var(p),
      var_y = var(y),
      p = mean(p),
      y = mean(y),
      .groups = "drop"
    )

  # --- Within-cluster variances ---
  var_150_cluster_decile <- deciles_cluster %>%
    group_by(cluster, decile_group) %>%
    summarise(
      var_x_cluster_tile = var(p),
      var_y_cluster_tile = var(y),
      preds_150 = mean(p),
      y_150 = mean(y),
      ntile_150 = n(),
      .groups = "drop"
    )

  # --- Meta-analysis across groups ---
  data_all <- data.frame()
  for (i in unique(var_150_cluster_decile$decile_group)) {
    data_meta <- var_150_cluster_decile %>% filter(decile_group == i)

    if (univariate) {
      x_meta <- metaprop(
        data = data_meta, event = preds_150 * ntile_150,
        n = ntile_150, studlab = cluster,
        method = "Inverse", backtransf = TRUE
      )
      y_meta <- metaprop(
        data = data_meta, event = y_150 * ntile_150,
        n = ntile_150, studlab = cluster,
        method = "Inverse", backtransf = TRUE
      )
      data_meta_curve <- data.frame(
        te_x = x_meta$TE.random,
        ci_up_x = x_meta$upper.random,
        ci_low_x = x_meta$lower.random,
        pre_up_x = x_meta$upper.predict,
        pre_low_x = x_meta$lower.predict,
        te_y = y_meta$TE.random,
        ci_up_y = y_meta$upper.random,
        ci_low_y = y_meta$lower.random,
        pre_up_y = y_meta$upper.predict,
        pre_low_y = y_meta$lower.predict,
        decile_group = i,
        ntile_plot = sum(data_meta$ntile_150)
      )
    } else {
      preds_escalc <- escalc(
        measure = "PLO", xi = preds_150 * ntile_150,
        ni = ntile_150, data = data_meta
      )
      preds_escalc$group <- "p"
      y_escalc <- escalc(
        measure = "PLO", xi = y_150 * ntile_150,
        ni = ntile_150, data = data_meta
      )
      y_escalc$group <- "y"
      dat <- rbind(preds_escalc, y_escalc)

      res <- try(rma.mv(yi, vi,
        mods = ~ group - 1,
        random = ~ group | cluster,
        struct = "UN", data = dat,
        control = list(iter.max = 10000, rel.tol = 1e-6)
      ))

      if (inherits(res, "try-error") | nrow(dat) == 2) {
        warning("Meta-analysis failed for decile group ", i, ". Returning NA values.")
        next
      }

      pred_up  = res$b + qt(1 - alpha / 2, df = nrow(y_escalc) - 2) * sqrt(res$tau2 + res$se^2)
      pred_low = res$b + qt(alpha / 2, df = nrow(y_escalc) - 2) * sqrt(res$tau2 + res$se^2)

      data_meta_curve <- data.frame(
        te_x = res$b[1],
        ci_up_x = res$ci.ub[1],
        ci_low_x = res$ci.lb[1],
        pre_up_x = pred_up[1],
        pre_low_x = pred_low[1],
        te_y = res$b[2],
        ci_up_y = res$ci.ub[2],
        ci_low_y = res$ci.lb[2],
        pre_up_y = pred_up[2],
        pre_low_y = pred_low[2],
        decile_group = i,
        ntile_plot = sum(data_meta$ntile_150)
      )
    }

    data_all <- rbind(data_all, data_meta_curve)
  }

  data_all <- data_all %>% mutate(across(-c(decile_group, ntile_plot), Ilogit))

  # --- Plotting ---
  curve <- NULL
  if (plot) {
    curve <- ggplot(data_all, aes(x = te_x, y = te_y)) +
      geom_abline(linetype = "dashed", alpha = 0.1) +
      geom_ribbon(aes(ymax = pre_up_y, ymin = pre_low_y, fill = "PI 95%"), alpha = 1) +
      geom_ribbon(aes(ymax = ci_up_y, ymin = ci_low_y, fill = "CI 95%"), alpha = 1) +
      geom_errorbar(aes(x = te_x, y = te_y, xmin = pre_low_x, xmax = pre_up_x),
        alpha = 0.2, width = 0, lwd = linewidth, lty = "dashed"
      ) +
      scale_fill_manual(name = "Heterogeneity", values = c("cornflowerblue", "lightblue")) +
      geom_point(data = deciles_all, aes(x = p, y = y, color = "Traditional grouped"), size = size) +
      geom_line(data = deciles_all, aes(x = p, y = y, color = "Traditional grouped"), linewidth = linewidth) +
      geom_point(data = data_all, aes(color = paste0("CG-C(", method, ")")), size = size * 1.3) +
      geom_line(data = data_all, aes(color = paste0("CG-C(", method, ")")), linewidth = linewidth) +
      scale_color_manual(name = "Methodology", values = c("black", "gold")) +
      xlab("Estimated probability") +
      ylab("Observed proportion") +
      scale_x_continuous(breaks = seq(0, 1, 0.1)) +
      scale_y_continuous(breaks = seq(0, 1, 0.2)) +
      coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
      theme_classic(base_size = 8, base_family = "serif") +
      theme(
        legend.key.size = unit(0.3, "cm"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
      )

    if (cluster_curves) {
      curve <- curve +
        geom_line(
          data = decilesbycluster, aes(x_mean, y_mean, group = cluster),
          lwd = linewidth, alpha = 0.5, show.legend = FALSE
        ) +
        geom_point(
          data = decilesbycluster, aes(x_mean, y_mean, group = cluster),
          alpha = 0.8, size = size / 3, show.legend = FALSE
        )
    }
  }

  # --- Return ---
  list(
    plot_data = data_all,
    trad_grouped = deciles_all,
    observed_data = deciles_cluster,
    cluster_data = decilesbycluster,
    plot = curve
  )
}