R/senser.R
In senseR: Proxy Indicator Diagnostic Tool for Analytical and Policy Use

Documented in senser

#' Proxy Indicator Diagnostic Tool
#'
#' @description
#' `senser()` is a statistical diagnostic function designed to evaluate
#' whether one or more proxy indicators are suitable representations of an
#' underlying construct that cannot be directly observed or measured.
#'
#' Each proxy is assessed on seven statistical dimensions: monotonicity,
#' information content, elasticity, variability, stagnation, ceiling
#' effects, and coefficient stability.
#'
#' The output is returned as a structured data.frame containing quantitative
#' scores and qualitative classifications suitable for applied research and
#' policy diagnostics.
#'
#' @param data
#' A data.frame containing the target construct and proxy variables.
#'
#' @param proxy
#' Character vector specifying one or more proxy variable names contained
#' in `data`. A proxy is skipped with a warning when it is not numeric, has
#' fewer than 10 observations that are complete for both proxy and target,
#' or when the proxy or the target shows no variation on those observations.
#'
#' @param target
#' Single character string specifying the target construct variable name
#' contained in `data`. The variable must be numeric.
#'
#' @param lang
#' Language for status labels and interpretation text.
#' Must be either `"english"` or `"indonesia"`.
#'
#' @param stagnation_cut
#' Threshold used to detect stagnation (very small average absolute change).
#' Default is `0.01`.
#'
#' @param cv_cut
#' Threshold for the coefficient of variation (CV).
#' Default is `0.02`.
#'
#' @param ceiling_cut
#' Threshold used to detect ceiling effects.
#' Default is `0.95`.
#'
#' @details
#' The diagnostic score for each proxy is computed from seven components,
#' each lying in the interval from 0 to 1:
#'
#' \itemize{
#'   \item \strong{monotonicity}: absolute Spearman rank correlation between
#'     proxy and target.
#'   \item \strong{information_content}: squared Pearson correlation, i.e.
#'     the R-squared of the simple linear regression of target on proxy.
#'   \item \strong{elasticity_score}: 0.3 if the elasticity index (absolute
#'     regression slope multiplied by sd(proxy) / sd(target)) is below 0.1,
#'     otherwise 1.
#'   \item \strong{variability_score}: 0.3 if the coefficient of variation
#'     is below `cv_cut`, otherwise 1.
#'   \item \strong{stagnation_score}: 0.2 if the average absolute change
#'     between consecutive observations (in row order) is below
#'     `stagnation_cut`, otherwise 1.
#'   \item \strong{ceiling_score}: 0.3 if the mean-to-maximum ratio exceeds
#'     `ceiling_cut`, otherwise 1.
#'   \item \strong{stability_score}: one minus the standard deviation of the
#'     regression slope across the full sample and its first and second
#'     halves, relative to the absolute full-sample slope, clamped to the
#'     interval from 0 to 1. If a half-sample slope cannot be estimated the
#'     score is set to 0.
#' }
#'
#' The final proxy score is computed as the median of all seven components.
#'
#' The coefficient of variation and the ceiling ratio are computed as
#' sd / mean and mean / max respectively, so they are only meaningful for
#' proxies measured on a positive scale.
#'
#' Classification categories:
#'
#' \itemize{
#'   \item Score >= 0.70: Suitable proxy
#'   \item 0.40 <= Score < 0.70: Conditionally suitable
#'   \item Score < 0.40: Not suitable proxy
#' }
#'
#' @return
#' A data.frame with one row per evaluated proxy variable containing:
#'
#' \describe{
#'   \item{target}{Target construct name.}
#'   \item{proxy}{Proxy variable name.}
#'   \item{monotonicity}{Spearman correlation (absolute).}
#'   \item{information_content}{R-squared value.}
#'   \item{elasticity}{Elasticity index.}
#'   \item{cv}{Coefficient of variation.}
#'   \item{avg_change}{Average absolute change.}
#'   \item{ceiling_ratio}{Mean-to-maximum ratio.}
#'   \item{stability_score}{Coefficient stability index.}
#'   \item{final_score}{Median diagnostic score.}
#'   \item{classification}{Qualitative proxy category.}
#'   \item{interpretation}{Plain-language interpretation.}
#' }
#'
#' Skipped proxies do not appear in the result. If no proxy could be
#' evaluated, `NULL` is returned.
#'
#' @examples
#' set.seed(123)
#' df <- data.frame(
#'   gdp = rnorm(100, 10, 2),
#'   ntl = rnorm(100, 50, 10),
#'   road_density = rnorm(100, 3, 0.5)
#' )
#'
#' senser(
#'   data = df,
#'   proxy = c("ntl", "road_density"),
#'   target = "gdp",
#'   lang = "english"
#' )
#'
#' @references
#' Spearman, C. (1904). The proof and measurement of association between two
#' things. \emph{American Journal of Psychology}, 15(1), 72--101.
#'
#' Chow, G. C. (1960). Tests of equality between sets of coefficients
#' in two linear regressions. \emph{Econometrica}.
#'
#' OECD (2008). Handbook on Constructing Composite Indicators:
#' Methodology and User Guide.
#'
#' @author
#' Joko Nursiyono
#'
#' @seealso
#' \code{\link[stats]{lm}}, \code{\link[stats]{cor}},
#' \code{\link[stats]{median}}
#'
#' @importFrom stats cor coef lm sd median complete.cases
#' @export
senser <- function(data,
                   proxy,
                   target,
                   lang = c("english", "indonesia"),
                   stagnation_cut = 0.01,
                   cv_cut = 0.02,
                   ceiling_cut = 0.95) {

  lang <- match.arg(lang)

  if (!is.data.frame(data)) {
    stop("data must be a data.frame")
  }

  if (!all(proxy %in% names(data))) {
    stop("proxy not found in data")
  }

  # A vector-valued `target` would otherwise surface as an obscure
  # "the condition has length > 1" error further down.
  if (!is.character(target) || length(target) != 1L || is.na(target)) {
    stop("target must be a single character string")
  }

  if (!target %in% names(data)) {
    stop("target not found in data")
  }

  Y <- data[[target]]
  if (!is.numeric(Y)) {
    stop("target must be numeric")
  }

  # The thresholds feed scalar `if` conditions below, so reject anything
  # that is not a single usable number up front.
  cuts <- list(
    stagnation_cut = stagnation_cut,
    cv_cut = cv_cut,
    ceiling_cut = ceiling_cut
  )
  for (nm in names(cuts)) {
    val <- cuts[[nm]]
    if (!is.numeric(val) || length(val) != 1L || is.na(val)) {
      stop(nm, " must be a single non-missing number")
    }
  }

  # Label tables indexed by tier: 1 = suitable, 2 = conditional, 3 = not.
  labels <- switch(
    lang,
    english = list(
      classif = c(
        "Suitable proxy",
        "Conditionally suitable",
        "Not suitable proxy"
      ),
      interp = c(
        "Proxy demonstrates adequate variability and statistical stability.",
        "Proxy partially represents the construct with limitations.",
        "Proxy lacks sufficient statistical reliability."
      )
    ),
    indonesia = list(
      classif = c(
        "Proxy layak",
        "Layak bersyarat",
        "Proxy tidak layak"
      ),
      interp = c(
        "Proxy memiliki variabilitas dan stabilitas yang memadai.",
        "Proxy menangkap sebagian konstruk dengan keterbatasan.",
        "Proxy tidak cukup andal secara statistik."
      )
    )
  )

  # Slope of the simple regression y ~ x. `unname()` matters: a named
  # value placed in data.frame() would be picked up as the row name.
  slope <- function(y, x) {
    unname(coef(lm(y ~ x))[2])
  }

  # Report why a proxy is dropped instead of discarding it silently.
  skip <- function(p, why) {
    warning("proxy '", p, "' skipped: ", why, call. = FALSE)
    NULL
  }

  results <- lapply(proxy, function(p) {

    X <- data[[p]]
    if (!is.numeric(X)) {
      return(skip(p, "not numeric"))
    }

    keep <- complete.cases(X, Y)
    X <- X[keep]
    Yc <- Y[keep]
    n <- length(X)

    if (n < 10) {
      return(skip(p, "fewer than 10 complete observations"))
    }

    sd_x <- sd(X)
    sd_y <- sd(Yc)

    # With zero variance the correlations and the slope are undefined (NA),
    # which previously aborted the whole call at the score comparison.
    if (!isTRUE(sd_x > 0) || !isTRUE(sd_y > 0)) {
      return(skip(p, "proxy or target has no variation"))
    }

    mono <- abs(cor(X, Yc, method = "spearman"))
    info <- cor(X, Yc)^2

    # Fit the full-sample regression once; it feeds both the elasticity
    # index and the stability score.
    b_full <- slope(Yc, X)

    elast <- abs(b_full) * sd_x / sd_y
    elast_score <- if (elast < 0.1) 0.3 else 1

    cv <- sd_x / mean(X)
    cv_score <- if (cv < cv_cut) 0.3 else 1

    avg_change <- mean(abs(diff(X)), na.rm = TRUE)
    stagnation_score <- if (avg_change < stagnation_cut) 0.2 else 1

    ceiling_ratio <- mean(X) / max(X)
    ceiling_score <- if (ceiling_ratio > ceiling_cut) 0.3 else 1

    # Split-half stability: compare the slope on each half of the sample
    # (in row order) with the full-sample slope.
    half <- floor(n / 2)
    first <- seq_len(half)
    second <- seq.int(half + 1L, n)
    b_sub1 <- slope(Yc[first], X[first])
    b_sub2 <- slope(Yc[second], X[second])

    sens <- 1 - sd(c(b_full, b_sub1, b_sub2)) /
      max(abs(b_full), 1e-8)
    # A half with a constant proxy has no estimable slope; treat the
    # stability as unestablished rather than letting NA poison the median.
    if (is.na(sens)) {
      sens <- 0
    }
    sens <- max(min(sens, 1), 0)

    components <- c(mono, info, elast_score,
                    cv_score, stagnation_score,
                    ceiling_score, sens)

    score <- median(components)

    # Last line of defence so a numerically degenerate proxy can never
    # reach the scalar comparisons below with a missing score.
    if (is.na(score)) {
      return(skip(p, "diagnostic score could not be computed"))
    }

    tier <- if (score >= 0.7) {
      1L
    } else if (score >= 0.4) {
      2L
    } else {
      3L
    }

    data.frame(
      target = target,
      proxy = p,
      monotonicity = mono,
      information_content = info,
      elasticity = elast,
      cv = cv,
      avg_change = avg_change,
      ceiling_ratio = ceiling_ratio,
      stability_score = sens,
      final_score = score,
      classification = labels$classif[tier],
      interpretation = labels$interp[tier],
      stringsAsFactors = FALSE
    )
  })

  # Skipped proxies are NULL and vanish in rbind(); the result is NULL when
  # every proxy was skipped.
  do.call(rbind, results)
}