R/mds.R

Defines functions print.sqi_mds select_mds

Documented in select_mds

#' Select a Minimum Data Set (MDS) of Soil Quality Indicators
#'
#' @description
#' Identifies the most informative subset of soil variables (the Minimum
#' Data Set, MDS) using Principal Component Analysis (PCA).  Only variables
#' with high factor loadings on principal components explaining eigenvalue
#' > 1 (Kaiser criterion) are retained.  Where multiple variables load
#' highly on the same component, the one with the highest correlation to
#' others in that component is selected to minimise redundancy.
#'
#' This approach follows the widely cited method of Andrews et al. (2004)
#' and Sharma et al. (2008), and is equivalent to the \code{PCAIndex}
#' algorithm in Wani et al. (2023).
#'
#' @details
#' **Algorithm steps:**
#' \enumerate{
#'   \item Standardise all numeric variables (mean = 0, sd = 1).
#'   \item Perform PCA; retain components with eigenvalue > 1.
#'   \item For each retained component, identify variables with absolute
#'     loading \eqn{\ge} \code{load_threshold}.
#'   \item Among those, select the variable with the highest sum of
#'     absolute Pearson correlations to all others in the set (i.e., the
#'     most correlated, least redundant variable).
#'   \item Optionally, remove variables with high Variance Inflation Factor
#'     (VIF > \code{vif_threshold}) among the MDS candidates.
#' }
#'
#' @param data A data frame of scored or raw soil variables (numeric
#'   columns only, or with group columns specified in \code{group_cols}).
#' @param group_cols Character vector of grouping columns to exclude from
#'   the analysis.  Default: \code{"LandUse"}.
#' @param load_threshold Numeric in (0, 1).  Minimum absolute factor
#'   loading for a variable to be considered for MDS membership.
#'   Default: \code{0.6} (Andrews et al., 2004).
#' @param vif_threshold Numeric.  Maximum allowable Variance Inflation
#'   Factor among MDS variables.  Variables exceeding this are iteratively
#'   removed.  Set to \code{Inf} to skip VIF filtering.  Default: \code{10}.
#' @param n_pc Integer or \code{"auto"}.  Number of principal components to
#'   consider.  \code{"auto"} (default) uses the Kaiser criterion
#'   (eigenvalue > 1).
#' @param verbose Logical.  Print MDS selection summary.  Default \code{TRUE}.
#'
#' @return A list of class \code{sqi_mds} with:
#'   \describe{
#'     \item{mds_vars}{Character vector of selected MDS variable names.}
#'     \item{all_vars}{Character vector of all candidate variable names.}
#'     \item{pca}{The \code{\link[FactoMineR]{PCA}} result object.}
#'     \item{loadings}{Matrix of factor loadings.}
#'     \item{eigenvalues}{Numeric vector of eigenvalues.}
#'     \item{var_explained}{Numeric vector of variance explained (\%) per
#'       component.}
#'   }
#'
#' @references
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework: A quantitative soil quality evaluation
#' method. \emph{Soil Science Society of America Journal}, 68(6),
#' 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' Kaiser, H.F. (1960). The application of electronic computers to factor
#' analysis. \emph{Educational and Psychological Measurement}, 20(1),
#' 141--151. \doi{10.1177/001316446002000116}
#'
#' Sharma, K.L., et al. (2008). Long-term soil management effects on soil
#' quality indices. \emph{Geoderma}, 144, 290--300.
#' \doi{10.1016/j.geoderma.2007.11.019}
#'
#' @examples
#' data(soil_data)
#' cfg <- make_config(
#'   variable = c("pH","EC","BD","OC","MBC","PMN","Clay","WHC","DEH","AP","TN"),
#'   type     = c("opt","less","less","more","more","more",
#'                "opt","more","more","more","more"),
#'   opt_low  = c(6.0, NA, NA, NA, NA, NA, 20, NA, NA, NA, NA),
#'   opt_high = c(7.0, NA, NA, NA, NA, NA, 35, NA, NA, NA, NA)
#' )
#' scored <- score_all(soil_data, cfg, group_cols = c("LandUse","Depth"))
#' mds    <- select_mds(scored, group_cols = c("LandUse","Depth"))
#' mds$mds_vars
#'
#' @export
select_mds <- function(data, group_cols = "LandUse",
                       load_threshold = 0.5,
                       vif_threshold  = 10,
                       n_pc           = "auto",
                       verbose        = TRUE) {

  num_data <- data[, setdiff(names(data), group_cols), drop = FALSE]
  num_data <- num_data[, sapply(num_data, is.numeric), drop = FALSE]
  num_data <- stats::na.omit(num_data)

  if (ncol(num_data) < 2)
    stop("At least 2 numeric variables are required for MDS selection.",
         call. = FALSE)

  # ── PCA ────────────────────────────────────────────────────────────────────
  scaled   <- scale(num_data)
  pca_res  <- FactoMineR::PCA(scaled, graph = FALSE, ncp = ncol(num_data))
  eig      <- pca_res$eig[, 1]          # eigenvalues
  var_expl <- pca_res$eig[, 2]          # % variance

  n_comp <- if (identical(n_pc, "auto")) sum(eig > 1) else as.integer(n_pc)
  if (n_comp == 0) {
    warning("No eigenvalue > 1 found; defaulting to first 2 components.")
    n_comp <- min(2L, ncol(num_data))
  }

  loadings <- pca_res$svd$V[, seq_len(n_comp), drop = FALSE]
  rownames(loadings) <- colnames(num_data)
  colnames(loadings) <- paste0("PC", seq_len(n_comp))

  # ── Variable selection per component ───────────────────────────────────────
  mds_vars <- character(0)
  for (pc in seq_len(n_comp)) {
    ld    <- abs(loadings[, pc])
    cands <- names(ld[ld >= load_threshold])
    # Fallback: if no variable passes threshold, take the top loader
    if (length(cands) == 0) cands <- names(which.max(ld))
    if (length(cands) == 1) {
      mds_vars <- union(mds_vars, cands)
      next
    }
    # Select variable with highest sum of absolute correlation to others
    cor_mat       <- abs(cor(num_data[, cands, drop = FALSE]))
    diag(cor_mat) <- 0
    cor_sums      <- colSums(cor_mat)
    best          <- names(which.max(cor_sums))
    mds_vars      <- union(mds_vars, best)
  }

  # ── VIF filtering ──────────────────────────────────────────────────────────
  if (length(mds_vars) >= 2 && is.finite(vif_threshold)) {
    repeat {
      tmp <- num_data[, mds_vars, drop = FALSE]
      if (ncol(tmp) < 2) break
      vif_vals <- tryCatch({
        lm_fit <- stats::lm(
          stats::reformulate(mds_vars[-1], response = mds_vars[1]),
          data = as.data.frame(tmp))
        car::vif(lm_fit)
      }, error = function(e) rep(0, length(mds_vars) - 1))

      if (all(vif_vals <= vif_threshold)) break
      drop_var <- names(which.max(vif_vals))
      mds_vars <- setdiff(mds_vars, drop_var)
      if (length(mds_vars) < 2) break
    }
  }

  # ── Report ─────────────────────────────────────────────────────────────────
  if (verbose) {
    cat("\n=== MDS Selection Summary ===\n")
    cat(sprintf("  Components retained (eigenvalue > 1): %d\n", n_comp))
    cat(sprintf("  Total variance explained: %.1f%%\n",
                sum(var_expl[seq_len(n_comp)])))
    cat(sprintf("  MDS variables selected  : %d\n", length(mds_vars)))
    cat("  MDS:", paste(mds_vars, collapse = ", "), "\n\n")
  }

  structure(
    list(mds_vars    = mds_vars,
         all_vars    = colnames(num_data),
         pca         = pca_res,
         loadings    = loadings,
         eigenvalues = eig,
         var_explained = var_expl),
    class = "sqi_mds"
  )
}


#' @export
print.sqi_mds <- function(x, ...) {
  cat("SQIpro MDS Selection\n")
  cat("  All variables :", paste(x$all_vars, collapse = ", "), "\n")
  cat("  MDS selected  :", paste(x$mds_vars, collapse = ", "), "\n")
  cat("  PCs retained  :", ncol(x$loadings), "\n")
  invisible(x)
}

Try the SQIpro package in your browser

Any scripts or data that you put into this service are public.

SQIpro documentation built on April 20, 2026, 5:06 p.m.