Nothing
#' Select a Minimum Data Set (MDS) of Soil Quality Indicators
#'
#' @description
#' Identifies the most informative subset of soil variables (the Minimum
#' Data Set, MDS) using Principal Component Analysis (PCA). Only variables
#' with high factor loadings on principal components explaining eigenvalue
#' > 1 (Kaiser criterion) are retained. Where multiple variables load
#' highly on the same component, the one with the highest correlation to
#' others in that component is selected to minimise redundancy.
#'
#' This approach follows the widely cited method of Andrews et al. (2004)
#' and Sharma et al. (2008), and is equivalent to the \code{PCAIndex}
#' algorithm in Wani et al. (2023).
#'
#' @details
#' **Algorithm steps:**
#' \enumerate{
#' \item Standardise all numeric variables (mean = 0, sd = 1).
#' \item Perform PCA; retain components with eigenvalue > 1.
#' \item For each retained component, identify variables with absolute
#' loading \eqn{\ge} \code{load_threshold}.
#' \item Among those, select the variable with the highest sum of
#' absolute Pearson correlations to all others in the set (i.e., the
#' most correlated, least redundant variable).
#' \item Optionally, remove variables with high Variance Inflation Factor
#' (VIF > \code{vif_threshold}) among the MDS candidates.
#' }
#'
#' @param data A data frame of scored or raw soil variables (numeric
#' columns only, or with group columns specified in \code{group_cols}).
#' @param group_cols Character vector of grouping columns to exclude from
#' the analysis. Default: \code{"LandUse"}.
#' @param load_threshold Numeric in (0, 1). Minimum absolute factor
#' loading for a variable to be considered for MDS membership.
#' Default: \code{0.6} (Andrews et al., 2004).
#' @param vif_threshold Numeric. Maximum allowable Variance Inflation
#' Factor among MDS variables. Variables exceeding this are iteratively
#' removed. Set to \code{Inf} to skip VIF filtering. Default: \code{10}.
#' @param n_pc Integer or \code{"auto"}. Number of principal components to
#' consider. \code{"auto"} (default) uses the Kaiser criterion
#' (eigenvalue > 1).
#' @param verbose Logical. Print MDS selection summary. Default \code{TRUE}.
#'
#' @return A list of class \code{sqi_mds} with:
#' \describe{
#' \item{mds_vars}{Character vector of selected MDS variable names.}
#' \item{all_vars}{Character vector of all candidate variable names.}
#' \item{pca}{The \code{\link[FactoMineR]{PCA}} result object.}
#' \item{loadings}{Matrix of factor loadings.}
#' \item{eigenvalues}{Numeric vector of eigenvalues.}
#' \item{var_explained}{Numeric vector of variance explained (\%) per
#' component.}
#' }
#'
#' @references
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework: A quantitative soil quality evaluation
#' method. \emph{Soil Science Society of America Journal}, 68(6),
#' 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' Kaiser, H.F. (1960). The application of electronic computers to factor
#' analysis. \emph{Educational and Psychological Measurement}, 20(1),
#' 141--151. \doi{10.1177/001316446002000116}
#'
#' Sharma, K.L., et al. (2008). Long-term soil management effects on soil
#' quality indices. \emph{Geoderma}, 144, 290--300.
#' \doi{10.1016/j.geoderma.2007.11.019}
#'
#' @examples
#' data(soil_data)
#' cfg <- make_config(
#' variable = c("pH","EC","BD","OC","MBC","PMN","Clay","WHC","DEH","AP","TN"),
#' type = c("opt","less","less","more","more","more",
#' "opt","more","more","more","more"),
#' opt_low = c(6.0, NA, NA, NA, NA, NA, 20, NA, NA, NA, NA),
#' opt_high = c(7.0, NA, NA, NA, NA, NA, 35, NA, NA, NA, NA)
#' )
#' scored <- score_all(soil_data, cfg, group_cols = c("LandUse","Depth"))
#' mds <- select_mds(scored, group_cols = c("LandUse","Depth"))
#' mds$mds_vars
#'
#' @export
select_mds <- function(data, group_cols = "LandUse",
load_threshold = 0.5,
vif_threshold = 10,
n_pc = "auto",
verbose = TRUE) {
num_data <- data[, setdiff(names(data), group_cols), drop = FALSE]
num_data <- num_data[, sapply(num_data, is.numeric), drop = FALSE]
num_data <- stats::na.omit(num_data)
if (ncol(num_data) < 2)
stop("At least 2 numeric variables are required for MDS selection.",
call. = FALSE)
# ── PCA ────────────────────────────────────────────────────────────────────
scaled <- scale(num_data)
pca_res <- FactoMineR::PCA(scaled, graph = FALSE, ncp = ncol(num_data))
eig <- pca_res$eig[, 1] # eigenvalues
var_expl <- pca_res$eig[, 2] # % variance
n_comp <- if (identical(n_pc, "auto")) sum(eig > 1) else as.integer(n_pc)
if (n_comp == 0) {
warning("No eigenvalue > 1 found; defaulting to first 2 components.")
n_comp <- min(2L, ncol(num_data))
}
loadings <- pca_res$svd$V[, seq_len(n_comp), drop = FALSE]
rownames(loadings) <- colnames(num_data)
colnames(loadings) <- paste0("PC", seq_len(n_comp))
# ── Variable selection per component ───────────────────────────────────────
mds_vars <- character(0)
for (pc in seq_len(n_comp)) {
ld <- abs(loadings[, pc])
cands <- names(ld[ld >= load_threshold])
# Fallback: if no variable passes threshold, take the top loader
if (length(cands) == 0) cands <- names(which.max(ld))
if (length(cands) == 1) {
mds_vars <- union(mds_vars, cands)
next
}
# Select variable with highest sum of absolute correlation to others
cor_mat <- abs(cor(num_data[, cands, drop = FALSE]))
diag(cor_mat) <- 0
cor_sums <- colSums(cor_mat)
best <- names(which.max(cor_sums))
mds_vars <- union(mds_vars, best)
}
# ── VIF filtering ──────────────────────────────────────────────────────────
if (length(mds_vars) >= 2 && is.finite(vif_threshold)) {
repeat {
tmp <- num_data[, mds_vars, drop = FALSE]
if (ncol(tmp) < 2) break
vif_vals <- tryCatch({
lm_fit <- stats::lm(
stats::reformulate(mds_vars[-1], response = mds_vars[1]),
data = as.data.frame(tmp))
car::vif(lm_fit)
}, error = function(e) rep(0, length(mds_vars) - 1))
if (all(vif_vals <= vif_threshold)) break
drop_var <- names(which.max(vif_vals))
mds_vars <- setdiff(mds_vars, drop_var)
if (length(mds_vars) < 2) break
}
}
# ── Report ─────────────────────────────────────────────────────────────────
if (verbose) {
cat("\n=== MDS Selection Summary ===\n")
cat(sprintf(" Components retained (eigenvalue > 1): %d\n", n_comp))
cat(sprintf(" Total variance explained: %.1f%%\n",
sum(var_expl[seq_len(n_comp)])))
cat(sprintf(" MDS variables selected : %d\n", length(mds_vars)))
cat(" MDS:", paste(mds_vars, collapse = ", "), "\n\n")
}
structure(
list(mds_vars = mds_vars,
all_vars = colnames(num_data),
pca = pca_res,
loadings = loadings,
eigenvalues = eig,
var_explained = var_expl),
class = "sqi_mds"
)
}
#' @export
print.sqi_mds <- function(x, ...) {
cat("SQIpro MDS Selection\n")
cat(" All variables :", paste(x$all_vars, collapse = ", "), "\n")
cat(" MDS selected :", paste(x$mds_vars, collapse = ", "), "\n")
cat(" PCs retained :", ncol(x$loadings), "\n")
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.