R/statistics.R
In SQIpro: Comprehensive Soil Quality Index Computation and Visualization

Documented in sqi_anova sqi_sensitivity

#' One-Way ANOVA and Tukey HSD Post-Hoc Test for SQI
#'
#' @description
#' Performs a one-way ANOVA to test whether Soil Quality Index values differ
#' significantly across land-use groups, followed by Tukey's Honest
#' Significant Difference (HSD) test for pairwise comparisons.
#'
#' @param scored A scored data frame from \code{\link{score_all}}.
#' @param sqi_col Character. Name of the SQI column to test (e.g.,
#'   \code{"SQI_linear"}).  This must be a column in \code{scored} or
#'   returned by one of the indexing functions joined back to the data.
#'   Alternatively pass the output of \code{\link{sqi_compare}} with
#'   individual observations merged in.
#' @param group_col Character. Grouping variable column name (e.g.,
#'   \code{"LandUse"}).
#' @param alpha Numeric. Significance level for the ANOVA. Default 0.05.
#'
#' @return A list with:
#'   \describe{
#'     \item{anova_table}{An \code{anova} object.}
#'     \item{tukey}{A \code{TukeyHSD} object.}
#'     \item{significant}{Logical. Whether the ANOVA is significant at
#'       \code{alpha}.}
#'     \item{compact_letters}{Data frame of compact letter display for
#'       plotting.}
#'   }
#'
#' @references
#' Tukey, J.W. (1949). Comparing individual means in the analysis of
#' variance. \emph{Biometrics}, 5(2), 99--114. \doi{10.2307/3001913}
#'
#' @examples
#' data(soil_data)
#' cfg <- make_config(
#'   variable = c("pH","EC","BD","OC","MBC","Clay"),
#'   type     = c("opt","less","less","more","more","opt"),
#'   opt_low  = c(6.0, NA, NA, NA, NA, 20),
#'   opt_high = c(7.0, NA, NA, NA, NA, 35)
#' )
#' scored <- score_all(soil_data, cfg, group_cols = c("LandUse","Depth"))
#' # Compute per-observation linear SQI for ANOVA
#' scored$SQI_obs <- rowMeans(scored[, cfg$variable], na.rm = TRUE)
#' aov_result <- sqi_anova(scored, sqi_col = "SQI_obs",
#'                          group_col = "LandUse")
#' print(aov_result$tukey)
#'
#' @export
sqi_anova <- function(scored, sqi_col, group_col, alpha = 0.05) {

  if (!sqi_col %in% names(scored))
    stop("`sqi_col` '", sqi_col, "' not found in data.", call. = FALSE)
  if (!group_col %in% names(scored))
    stop("`group_col` '", group_col, "' not found in data.", call. = FALSE)

  df         <- scored[, c(group_col, sqi_col)]
  df[[group_col]] <- as.factor(df[[group_col]])
  df         <- stats::na.omit(df)

  formula    <- stats::as.formula(paste(sqi_col, "~", group_col))
  aov_fit    <- stats::aov(formula, data = df)
  aov_table  <- summary(aov_fit)
  tukey_res  <- stats::TukeyHSD(aov_fit)

  p_val      <- aov_table[[1]][["Pr(>F)"]][1]
  significant <- !is.na(p_val) && p_val < alpha

  # Compact letter display (simple)
  tk_df  <- as.data.frame(tukey_res[[1]])
  tk_df$comparison <- rownames(tk_df)
  tk_df$significant <- tk_df[["p adj"]] < alpha

  groups   <- levels(df[[group_col]])
  letters_ <- stats::setNames(rep("a", length(groups)), groups)
  letter_counter <- 1L
  for (g in seq_along(groups)) {
    for (h in seq_along(groups)) {
      if (h <= g) next
      comp1 <- paste(groups[h], groups[g], sep = "-")
      comp2 <- paste(groups[g], groups[h], sep = "-")
      row_  <- tk_df[tk_df$comparison %in% c(comp1, comp2), ]
      if (nrow(row_) > 0 && row_$significant[1]) {
        if (letters_[groups[h]] == letters_[groups[g]]) {
          letter_counter  <- letter_counter + 1L
          letters_[groups[h]] <- letters[letter_counter]  # letters is a vector
        }
      }
    }
  }
  cld <- data.frame(Group = names(letters_), Letter = unname(letters_),
                    stringsAsFactors = FALSE)

  list(anova_table     = aov_table,
       tukey           = tukey_res,
       significant     = significant,
       compact_letters = cld)
}


#' Sensitivity Analysis for SQI
#'
#' @description
#' Quantifies the contribution of each soil variable to the overall Soil
#' Quality Index by a leave-one-out approach: each variable is removed in
#' turn and the resulting index is compared to the full index.  A larger
#' change indicates a higher-sensitivity (more important) variable.
#'
#' @param scored A scored data frame from \code{\link{score_all}}.
#' @param config A \code{sqi_config} object.
#' @param group_cols Character vector of grouping columns.
#' @param method Character. Which indexing method to use for sensitivity
#'   analysis: \code{"linear"} (default), \code{"fuzzy"}, \code{"entropy"},
#'   or \code{"topsis"}.
#' @param mds_vars Character vector of MDS variable names.  If \code{NULL},
#'   all config variables are used.
#'
#' @return A data frame with columns \code{variable}, \code{mean_change}
#'   (mean absolute change in SQI when variable is removed),
#'   \code{sd_change}, and \code{relative_importance} (0--1, normalised).
#'
#' @references
#' Saltelli, A., Ratto, M., Andres, T., Campolongo, F., Cariboni, J.,
#' Gatelli, D., Saisana, M., & Tarantola, S. (2008).
#' \emph{Global Sensitivity Analysis: The Primer}.
#' John Wiley & Sons, Chichester. \doi{10.1002/9780470725184}
#'
#' @examples
#' data(soil_data)
#' cfg <- make_config(
#'   variable = c("pH","EC","BD","OC","MBC","Clay"),
#'   type     = c("opt","less","less","more","more","opt"),
#'   opt_low  = c(6.0, NA, NA, NA, NA, 20),
#'   opt_high = c(7.0, NA, NA, NA, NA, 35)
#' )
#' scored  <- score_all(soil_data, cfg, group_cols = c("LandUse","Depth"))
#' sa      <- sqi_sensitivity(scored, cfg, group_cols = c("LandUse","Depth"))
#' print(sa)
#'
#' @export
sqi_sensitivity <- function(scored, config, group_cols = "LandUse",
                             method = c("linear","fuzzy","entropy","topsis"),
                             mds_vars = NULL) {

  method <- match.arg(method)
  vars   <- if (!is.null(mds_vars)) mds_vars else config$variable
  vars   <- intersect(vars, names(scored))

  run_method <- function(v) {
    switch(method,
      "linear"  = sqi_linear(scored, config, group_cols, mds_vars = v),
      "fuzzy"   = sqi_fuzzy(scored, config, group_cols, mds_vars = v),
      "entropy" = sqi_entropy(scored, config, group_cols, mds_vars = v),
      "topsis"  = sqi_topsis(scored, config, group_cols, mds_vars = v)
    )
  }

  sqi_col  <- paste0("SQI_", method)
  baseline <- run_method(vars)[[sqi_col]]

  results <- lapply(vars, function(drop_var) {
    sub_vars <- setdiff(vars, drop_var)
    if (length(sub_vars) < 1) return(data.frame(variable = drop_var,
                                                 mean_change = NA, sd_change = NA))
    res    <- run_method(sub_vars)[[sqi_col]]
    change <- abs(baseline - res)
    data.frame(variable    = drop_var,
               mean_change = mean(change, na.rm = TRUE),
               sd_change   = stats::sd(change, na.rm = TRUE))
  })

  sa <- do.call(rbind, results)
  sa$relative_importance <- round(sa$mean_change /
                                    max(sa$mean_change, na.rm = TRUE), 4)
  sa$mean_change <- round(sa$mean_change, 4)
  sa$sd_change   <- round(sa$sd_change,   4)
  sa[order(-sa$relative_importance), ]
}