Nothing
#' Validate Input Data for SQI Analysis
#'
#' @description
#' Checks that a data frame meets requirements for Soil Quality Index (SQI)
#' computation: correct column types, sufficient sample sizes, absence of
#' infinite values, and appropriate variable configuration.
#'
#' @param data A data frame. The first column(s) should be grouping factors
#' (character or factor); remaining columns should be numeric soil
#' variables.
#' @param group_cols Character vector. Names of grouping columns (e.g.,
#' \code{c("LandUse", "Depth")}). Defaults to the first column.
#' @param config A data frame produced by \code{\link{make_config}} or
#' manually created, with columns \code{variable}, \code{type},
#' \code{opt_low}, \code{opt_high}, \code{min_val}, \code{max_val}.
#' If \code{NULL}, only basic data checks are performed.
#' @param min_n Integer. Minimum number of observations per group.
#' Default is 3.
#' @param verbose Logical. If \code{TRUE} (default), prints a validation
#' summary to the console.
#'
#' @return Invisibly returns a list with components:
#' \describe{
#' \item{valid}{Logical. \code{TRUE} if all checks pass.}
#' \item{messages}{Character vector of warning/info messages.}
#' \item{n_per_group}{Data frame of group sizes.}
#' }
#'
#' @references
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework: A quantitative soil quality evaluation
#' method. \emph{Soil Science Society of America Journal}, 68(6),
#' 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' @examples
#' data(soil_data)
#' result <- validate_data(soil_data, group_cols = c("LandUse", "Depth"))
#' result$valid
#' result$n_per_group
#'
#' @export
validate_data <- function(data, group_cols = NULL, config = NULL,
min_n = 3, verbose = TRUE) {
messages <- character(0)
valid <- TRUE
# ── 1. Basic checks ────────────────────────────────────────────────────────
if (!is.data.frame(data)) {
stop("`data` must be a data frame.", call. = FALSE)
}
if (nrow(data) == 0) stop("`data` has zero rows.", call. = FALSE)
if (ncol(data) < 2) stop("`data` must have at least 2 columns.",
call. = FALSE)
# ── 2. Identify grouping columns ───────────────────────────────────────────
if (is.null(group_cols)) group_cols <- names(data)[1]
missing_gc <- setdiff(group_cols, names(data))
if (length(missing_gc) > 0) {
stop("Group column(s) not found in data: ",
paste(missing_gc, collapse = ", "), call. = FALSE)
}
num_cols <- setdiff(names(data), group_cols)
non_num <- num_cols[!sapply(data[num_cols], is.numeric)]
if (length(non_num) > 0) {
msg <- paste("Non-numeric variable column(s) detected (will be ignored):",
paste(non_num, collapse = ", "))
messages <- c(messages, paste("WARNING:", msg))
valid <- FALSE
}
num_cols <- setdiff(num_cols, non_num)
# ── 3. Missing values ──────────────────────────────────────────────────────
na_counts <- colSums(is.na(data[num_cols]))
na_vars <- na_counts[na_counts > 0]
if (length(na_vars) > 0) {
msg <- paste0("Missing values detected in: ",
paste(names(na_vars), "(", na_vars, "NA)",
collapse = ", "))
messages <- c(messages, paste("WARNING:", msg))
}
# ── 4. Infinite values ─────────────────────────────────────────────────────
inf_vars <- num_cols[sapply(data[num_cols], function(x)
any(is.infinite(x)))]
if (length(inf_vars) > 0) {
msg <- paste("Infinite values in:", paste(inf_vars, collapse = ", "))
messages <- c(messages, paste("ERROR:", msg))
valid <- FALSE
}
# ── 5. Zero-variance columns ───────────────────────────────────────────────
zero_var <- num_cols[sapply(data[num_cols], function(x)
stats::var(x, na.rm = TRUE) == 0)]
if (length(zero_var) > 0) {
msg <- paste("Zero-variance columns (constant values):",
paste(zero_var, collapse = ", "))
messages <- c(messages, paste("WARNING:", msg))
}
# ── 6. Group sizes ─────────────────────────────────────────────────────────
group_sym <- rlang::syms(group_cols)
n_per_group <- data %>%
dplyr::group_by(!!!group_sym) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop")
small_groups <- n_per_group[n_per_group$n < min_n, ]
if (nrow(small_groups) > 0) {
msg <- paste0("Groups with fewer than ", min_n, " observations: ",
nrow(small_groups), " group(s). Consider merging or ",
"increasing sample size for reliable index estimation.")
messages <- c(messages, paste("WARNING:", msg))
}
# ── 7. Config validation ───────────────────────────────────────────────────
if (!is.null(config)) {
req_cols <- c("variable", "type")
miss_cfg <- setdiff(req_cols, names(config))
if (length(miss_cfg) > 0) {
stop("Config is missing required columns: ",
paste(miss_cfg, collapse = ", "), call. = FALSE)
}
valid_types <- c("more", "less", "opt", "trap", "custom")
bad_types <- config$type[!config$type %in% valid_types]
if (length(bad_types) > 0) {
stop("Invalid scoring type(s): ",
paste(unique(bad_types), collapse = ", "),
". Must be one of: ", paste(valid_types, collapse = ", "),
call. = FALSE)
}
cfg_vars <- config$variable
missing_var <- setdiff(cfg_vars, num_cols)
if (length(missing_var) > 0) {
msg <- paste("Config variables not found in data:",
paste(missing_var, collapse = ", "))
messages <- c(messages, paste("WARNING:", msg))
}
# Check opt/trap have required parameters
opt_rows <- config[config$type %in% c("opt", "trap"), ]
if (nrow(opt_rows) > 0) {
missing_opt <- opt_rows$variable[
is.na(opt_rows$opt_low) | is.na(opt_rows$opt_high)]
if (length(missing_opt) > 0) {
stop("Variables with type 'opt' or 'trap' must have opt_low and ",
"opt_high: ", paste(missing_opt, collapse = ", "), call. = FALSE)
}
}
trap_rows <- config[config$type == "trap", ]
if (nrow(trap_rows) > 0) {
missing_trap <- trap_rows$variable[
is.na(trap_rows$min_val) | is.na(trap_rows$max_val)]
if (length(missing_trap) > 0) {
stop("Variables with type 'trap' must have min_val and max_val: ",
paste(missing_trap, collapse = ", "), call. = FALSE)
}
}
messages <- c(messages,
paste("INFO: Config validated for", nrow(config),
"variable(s)."))
}
# ── 8. Report ──────────────────────────────────────────────────────────────
if (verbose) {
cat("\n=== SQIpro Data Validation ===\n")
cat(sprintf(" Data: %d rows x %d columns\n", nrow(data), ncol(data)))
cat(sprintf(" Group columns : %s\n", paste(group_cols, collapse=", ")))
cat(sprintf(" Numeric variables: %d\n", length(num_cols)))
cat(sprintf(" Groups detected : %d\n", nrow(n_per_group)))
if (length(messages) > 0) {
cat("\nMessages:\n")
for (m in messages) cat(" ", m, "\n")
}
cat(sprintf("\nResult: %s\n\n",
if (valid) "PASS - data ready for SQI computation"
else "FAIL - please fix errors above"))
}
invisible(list(valid = valid, messages = messages,
n_per_group = n_per_group))
}
#' Build a Variable Configuration Table
#'
#' @description
#' Constructs a variable configuration data frame that specifies the scoring
#' function type and relevant parameters for each soil indicator. This
#' configuration table is the central object passed to all scoring and
#' indexing functions in \pkg{SQIpro}.
#'
#' @param variable Character vector of variable names (must match column names
#' in the data).
#' @param type Character vector of scoring types, one per variable. Must be
#' one of:
#' \describe{
#' \item{\code{"more"}}{Higher values are better (e.g., organic carbon,
#' CEC, microbial biomass).}
#' \item{\code{"less"}}{Lower values are better (e.g., bulk density, EC,
#' heavy metals).}
#' \item{\code{"opt"}}{A specific optimum value or range is best (e.g.,
#' pH, clay content). Requires \code{opt_low} and \code{opt_high}.}
#' \item{\code{"trap"}}{A trapezoidal function with a flat optimum plateau
#' and tapered shoulders. Requires all four boundary parameters.}
#' \item{\code{"custom"}}{User-supplied scoring function via
#' \code{\link{score_custom}}.}
#' }
#' @param opt_low Numeric vector. Lower bound of optimum range (required for
#' \code{"opt"} and \code{"trap"} types; \code{NA} otherwise).
#' @param opt_high Numeric vector. Upper bound of optimum range (required for
#' \code{"opt"} and \code{"trap"} types; \code{NA} otherwise).
#' @param min_val Numeric vector. Absolute minimum value (required for
#' \code{"trap"}; \code{NA} otherwise). Values at or below this receive
#' a score of 0.
#' @param max_val Numeric vector. Absolute maximum value (required for
#' \code{"trap"}; \code{NA} otherwise). Values at or above this receive
#' a score of 0.
#' @param weight Numeric vector of user-defined weights (0--1). Used only
#' when \code{method = "weighted"} in \code{\link{sqi_linear}}.
#' Defaults to \code{1} (equal weights).
#' @param description Character vector. Optional human-readable description
#' of each variable (units, rationale). Useful for automated reports.
#'
#' @return A data frame (class \code{sqi_config}) with one row per variable.
#'
#' @references
#' Doran, J.W., & Parkin, T.B. (1994). Defining and assessing soil quality.
#' In J.W. Doran et al. (Eds.), \emph{Defining Soil Quality for a
#' Sustainable Environment}, pp. 1--21. SSSA Special Publication 35.
#' \doi{10.2136/sssaspecpub35.c1}
#'
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework. \emph{Soil Science Society of America
#' Journal}, 68(6), 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' @examples
#' cfg <- make_config(
#' variable = c("pH", "EC", "BD", "OC", "MBC", "Clay"),
#' type = c("opt", "less","less","more","more","opt"),
#' opt_low = c(6.0, NA, NA, NA, NA, 20),
#' opt_high = c(7.0, NA, NA, NA, NA, 35),
#' description = c("Soil pH (H2O)",
#' "Electrical Conductivity (dS/m)",
#' "Bulk Density (g/cm3)",
#' "Organic Carbon (%)",
#' "Microbial Biomass Carbon (mg/kg)",
#' "Clay content (%)")
#' )
#' print(cfg)
#'
#' @export
make_config <- function(variable, type,
opt_low = rep(NA_real_, length(variable)),
opt_high = rep(NA_real_, length(variable)),
min_val = rep(NA_real_, length(variable)),
max_val = rep(NA_real_, length(variable)),
weight = rep(1, length(variable)),
description = rep(NA_character_, length(variable))) {
n <- length(variable)
if (!all(lengths(list(type, opt_low, opt_high,
min_val, max_val, weight,
description)) == n)) {
stop("All arguments must have the same length as `variable`.",
call. = FALSE)
}
cfg <- data.frame(
variable = variable,
type = type,
opt_low = as.numeric(opt_low),
opt_high = as.numeric(opt_high),
min_val = as.numeric(min_val),
max_val = as.numeric(max_val),
weight = as.numeric(weight),
description = as.character(description),
stringsAsFactors = FALSE
)
class(cfg) <- c("sqi_config", "data.frame")
cfg
}
#' @export
print.sqi_config <- function(x, ...) {
cat("SQIpro Variable Configuration\n")
cat(sprintf(" %d variable(s) defined\n\n", nrow(x)))
print.data.frame(x, row.names = FALSE, ...)
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.