R/classification_stability.R
In TrustworthyMLR: Stability and Robustness Evaluation for Machine Learning Models

Documented in classification_stability

#' Stability Index for Classification Models
#'
#' Computes the stability of classification predictions across multiple
#' runs. Stability is measured as the raw pairwise agreement rate: for
#' every pair of runs, the observations on which both runs predicted the
#' same class are counted, and the total is divided by the number of
#' comparisons made (observations times run pairs). No correction for
#' chance agreement (such as Cohen's Kappa) is applied.
#'
#' @param class_matrix A matrix or data.frame where each row represents
#'   an observation and each column holds the predicted classes (factor
#'   or character) from a single model run. The input is converted with
#'   \code{as.matrix()} before the runs are compared. It must contain at
#'   least two columns (runs) and at least one row (observation).
#'
#' @return A numeric scalar between 0 and 1: the proportion of
#'   (observation, run pair) comparisons in which the two runs predicted
#'   the same class. 1 indicates that all runs agree on every
#'   observation and 0 indicates that no two runs ever agree. The result
#'   is \code{NA} if any prediction is missing, because missing values
#'   are not removed before counting agreements.
#'
#' @examples
#' # Simulate classification predictions from 3 runs
#' preds <- data.frame(
#'     run1 = c("A", "A", "B", "C"),
#'     run2 = c("A", "B", "B", "C"),
#'     run3 = c("A", "A", "B", "C")
#' )
#' # 10 of the 12 pairwise comparisons agree, so the result is 10 / 12
#' classification_stability(preds)
#'
#' @export
classification_stability <- function(class_matrix) {
    if (!is.matrix(class_matrix) && !is.data.frame(class_matrix)) {
        stop("'class_matrix' must be a matrix or data.frame.", call. = FALSE)
    }

    mat <- as.matrix(class_matrix)
    n_obs <- nrow(mat)
    n_runs <- ncol(mat)

    if (n_runs < 2L) {
        stop("At least two runs are required.", call. = FALSE)
    }
    # Without this guard the final division would be 0 / 0 and the
    # function would silently return NaN.
    if (n_obs < 1L) {
        stop("At least one observation is required.", call. = FALSE)
    }

    # Count agreeing observations over every unordered pair of runs (i < j)
    agreements <- 0
    n_pairs <- 0L

    for (i in seq_len(n_runs - 1L)) {
        for (j in (i + 1L):n_runs) {
            # sum() is deliberately called without na.rm: a missing
            # prediction makes the overall result NA rather than being
            # silently counted as agreement or disagreement.
            agreements <- agreements + sum(mat[, i] == mat[, j])
            n_pairs <- n_pairs + 1L
        }
    }

    # Normalise by the total number of comparisons performed
    agreements / (n_obs * n_pairs)
}