R/validate.R

Defines functions print.sqi_config make_config validate_data

Documented in make_config validate_data

#' Validate Input Data for SQI Analysis
#'
#' @description
#' Checks that a data frame meets requirements for Soil Quality Index (SQI)
#' computation: correct column types, sufficient sample sizes, absence of
#' infinite values, and appropriate variable configuration.
#'
#' @param data A data frame. The first column(s) should be grouping factors
#'   (character or factor); remaining columns should be numeric soil
#'   variables.
#' @param group_cols Character vector. Names of grouping columns (e.g.,
#'   \code{c("LandUse", "Depth")}). Defaults to the first column.
#' @param config A data frame produced by \code{\link{make_config}} or
#'   manually created, with columns \code{variable}, \code{type},
#'   \code{opt_low}, \code{opt_high}, \code{min_val}, \code{max_val}.
#'   If \code{NULL}, only basic data checks are performed.
#' @param min_n Integer. Minimum number of observations per group.
#'   Default is 3.
#' @param verbose Logical. If \code{TRUE} (default), prints a validation
#'   summary to the console.
#'
#' @return Invisibly returns a list with components:
#'   \describe{
#'     \item{valid}{Logical. \code{TRUE} if all checks pass.}
#'     \item{messages}{Character vector of warning/info messages.}
#'     \item{n_per_group}{Data frame of group sizes.}
#'   }
#'
#' @references
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework: A quantitative soil quality evaluation
#' method. \emph{Soil Science Society of America Journal}, 68(6),
#' 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' @examples
#' data(soil_data)
#' result <- validate_data(soil_data, group_cols = c("LandUse", "Depth"))
#' result$valid
#' result$n_per_group
#'
#' @export
validate_data <- function(data, group_cols = NULL, config = NULL,
                          min_n = 3, verbose = TRUE) {

  messages <- character(0)
  valid    <- TRUE

  # ── 1. Basic checks ────────────────────────────────────────────────────────
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (nrow(data) == 0) stop("`data` has zero rows.", call. = FALSE)
  if (ncol(data) < 2)  stop("`data` must have at least 2 columns.",
                             call. = FALSE)

  # ── 2. Identify grouping columns ───────────────────────────────────────────
  if (is.null(group_cols)) group_cols <- names(data)[1]
  missing_gc <- setdiff(group_cols, names(data))
  if (length(missing_gc) > 0) {
    stop("Group column(s) not found in data: ",
         paste(missing_gc, collapse = ", "), call. = FALSE)
  }

  num_cols <- setdiff(names(data), group_cols)
  non_num  <- num_cols[!sapply(data[num_cols], is.numeric)]
  if (length(non_num) > 0) {
    msg <- paste("Non-numeric variable column(s) detected (will be ignored):",
                 paste(non_num, collapse = ", "))
    messages <- c(messages, paste("WARNING:", msg))
    valid <- FALSE
  }
  num_cols <- setdiff(num_cols, non_num)

  # ── 3. Missing values ──────────────────────────────────────────────────────
  na_counts <- colSums(is.na(data[num_cols]))
  na_vars   <- na_counts[na_counts > 0]
  if (length(na_vars) > 0) {
    msg <- paste0("Missing values detected in: ",
                  paste(names(na_vars), "(", na_vars, "NA)",
                        collapse = ", "))
    messages <- c(messages, paste("WARNING:", msg))
  }

  # ── 4. Infinite values ─────────────────────────────────────────────────────
  inf_vars <- num_cols[sapply(data[num_cols], function(x)
    any(is.infinite(x)))]
  if (length(inf_vars) > 0) {
    msg <- paste("Infinite values in:", paste(inf_vars, collapse = ", "))
    messages <- c(messages, paste("ERROR:", msg))
    valid <- FALSE
  }

  # ── 5. Zero-variance columns ───────────────────────────────────────────────
  zero_var <- num_cols[sapply(data[num_cols], function(x)
    stats::var(x, na.rm = TRUE) == 0)]
  if (length(zero_var) > 0) {
    msg <- paste("Zero-variance columns (constant values):",
                 paste(zero_var, collapse = ", "))
    messages <- c(messages, paste("WARNING:", msg))
  }

  # ── 6. Group sizes ─────────────────────────────────────────────────────────
  group_sym <- rlang::syms(group_cols)
  n_per_group <- data %>%
    dplyr::group_by(!!!group_sym) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop")

  small_groups <- n_per_group[n_per_group$n < min_n, ]
  if (nrow(small_groups) > 0) {
    msg <- paste0("Groups with fewer than ", min_n, " observations: ",
                  nrow(small_groups), " group(s). Consider merging or ",
                  "increasing sample size for reliable index estimation.")
    messages <- c(messages, paste("WARNING:", msg))
  }

  # ── 7. Config validation ───────────────────────────────────────────────────
  if (!is.null(config)) {
    req_cols <- c("variable", "type")
    miss_cfg <- setdiff(req_cols, names(config))
    if (length(miss_cfg) > 0) {
      stop("Config is missing required columns: ",
           paste(miss_cfg, collapse = ", "), call. = FALSE)
    }
    valid_types <- c("more", "less", "opt", "trap", "custom")
    bad_types   <- config$type[!config$type %in% valid_types]
    if (length(bad_types) > 0) {
      stop("Invalid scoring type(s): ",
           paste(unique(bad_types), collapse = ", "),
           ". Must be one of: ", paste(valid_types, collapse = ", "),
           call. = FALSE)
    }
    cfg_vars    <- config$variable
    missing_var <- setdiff(cfg_vars, num_cols)
    if (length(missing_var) > 0) {
      msg <- paste("Config variables not found in data:",
                   paste(missing_var, collapse = ", "))
      messages <- c(messages, paste("WARNING:", msg))
    }
    # Check opt/trap have required parameters
    opt_rows <- config[config$type %in% c("opt", "trap"), ]
    if (nrow(opt_rows) > 0) {
      missing_opt <- opt_rows$variable[
        is.na(opt_rows$opt_low) | is.na(opt_rows$opt_high)]
      if (length(missing_opt) > 0) {
        stop("Variables with type 'opt' or 'trap' must have opt_low and ",
             "opt_high: ", paste(missing_opt, collapse = ", "), call. = FALSE)
      }
    }
    trap_rows <- config[config$type == "trap", ]
    if (nrow(trap_rows) > 0) {
      missing_trap <- trap_rows$variable[
        is.na(trap_rows$min_val) | is.na(trap_rows$max_val)]
      if (length(missing_trap) > 0) {
        stop("Variables with type 'trap' must have min_val and max_val: ",
             paste(missing_trap, collapse = ", "), call. = FALSE)
      }
    }
    messages <- c(messages,
                  paste("INFO: Config validated for", nrow(config),
                        "variable(s)."))
  }

  # ── 8. Report ──────────────────────────────────────────────────────────────
  if (verbose) {
    cat("\n=== SQIpro Data Validation ===\n")
    cat(sprintf("  Data: %d rows x %d columns\n", nrow(data), ncol(data)))
    cat(sprintf("  Group columns   : %s\n", paste(group_cols, collapse=", ")))
    cat(sprintf("  Numeric variables: %d\n", length(num_cols)))
    cat(sprintf("  Groups detected : %d\n", nrow(n_per_group)))
    if (length(messages) > 0) {
      cat("\nMessages:\n")
      for (m in messages) cat(" ", m, "\n")
    }
    cat(sprintf("\nResult: %s\n\n",
                if (valid) "PASS - data ready for SQI computation"
                else "FAIL - please fix errors above"))
  }

  invisible(list(valid = valid, messages = messages,
                 n_per_group = n_per_group))
}


#' Build a Variable Configuration Table
#'
#' @description
#' Constructs a variable configuration data frame that specifies the scoring
#' function type and relevant parameters for each soil indicator.  This
#' configuration table is the central object passed to all scoring and
#' indexing functions in \pkg{SQIpro}.
#'
#' @param variable Character vector of variable names (must match column names
#'   in the data).
#' @param type Character vector of scoring types, one per variable.  Must be
#'   one of:
#'   \describe{
#'     \item{\code{"more"}}{Higher values are better (e.g., organic carbon,
#'       CEC, microbial biomass).}
#'     \item{\code{"less"}}{Lower values are better (e.g., bulk density, EC,
#'       heavy metals).}
#'     \item{\code{"opt"}}{A specific optimum value or range is best (e.g.,
#'       pH, clay content). Requires \code{opt_low} and \code{opt_high}.}
#'     \item{\code{"trap"}}{A trapezoidal function with a flat optimum plateau
#'       and tapered shoulders. Requires all four boundary parameters.}
#'     \item{\code{"custom"}}{User-supplied scoring function via
#'       \code{\link{score_custom}}.}
#'   }
#' @param opt_low Numeric vector. Lower bound of optimum range (required for
#'   \code{"opt"} and \code{"trap"} types; \code{NA} otherwise).
#' @param opt_high Numeric vector. Upper bound of optimum range (required for
#'   \code{"opt"} and \code{"trap"} types; \code{NA} otherwise).
#' @param min_val Numeric vector. Absolute minimum value (required for
#'   \code{"trap"}; \code{NA} otherwise). Values at or below this receive
#'   a score of 0.
#' @param max_val Numeric vector. Absolute maximum value (required for
#'   \code{"trap"}; \code{NA} otherwise). Values at or above this receive
#'   a score of 0.
#' @param weight Numeric vector of user-defined weights (0--1). Used only
#'   when \code{method = "weighted"} in \code{\link{sqi_linear}}.
#'   Defaults to \code{1} (equal weights).
#' @param description Character vector. Optional human-readable description
#'   of each variable (units, rationale). Useful for automated reports.
#'
#' @return A data frame (class \code{sqi_config}) with one row per variable.
#'
#' @references
#' Doran, J.W., & Parkin, T.B. (1994). Defining and assessing soil quality.
#' In J.W. Doran et al. (Eds.), \emph{Defining Soil Quality for a
#' Sustainable Environment}, pp. 1--21. SSSA Special Publication 35.
#' \doi{10.2136/sssaspecpub35.c1}
#'
#' Andrews, S.S., Karlen, D.L., & Cambardella, C.A. (2004). The soil
#' management assessment framework. \emph{Soil Science Society of America
#' Journal}, 68(6), 1945--1962. \doi{10.2136/sssaj2004.1945}
#'
#' @examples
#' cfg <- make_config(
#'   variable    = c("pH",  "EC",  "BD",  "OC",  "MBC", "Clay"),
#'   type        = c("opt", "less","less","more","more","opt"),
#'   opt_low     = c(6.0,   NA,    NA,    NA,    NA,    20),
#'   opt_high    = c(7.0,   NA,    NA,    NA,    NA,    35),
#'   description = c("Soil pH (H2O)",
#'                   "Electrical Conductivity (dS/m)",
#'                   "Bulk Density (g/cm3)",
#'                   "Organic Carbon (%)",
#'                   "Microbial Biomass Carbon (mg/kg)",
#'                   "Clay content (%)")
#' )
#' print(cfg)
#'
#' @export
make_config <- function(variable, type,
                        opt_low     = rep(NA_real_, length(variable)),
                        opt_high    = rep(NA_real_, length(variable)),
                        min_val     = rep(NA_real_, length(variable)),
                        max_val     = rep(NA_real_, length(variable)),
                        weight      = rep(1,        length(variable)),
                        description = rep(NA_character_, length(variable))) {

  n <- length(variable)
  if (!all(lengths(list(type, opt_low, opt_high,
                        min_val, max_val, weight,
                        description)) == n)) {
    stop("All arguments must have the same length as `variable`.",
         call. = FALSE)
  }

  cfg <- data.frame(
    variable    = variable,
    type        = type,
    opt_low     = as.numeric(opt_low),
    opt_high    = as.numeric(opt_high),
    min_val     = as.numeric(min_val),
    max_val     = as.numeric(max_val),
    weight      = as.numeric(weight),
    description = as.character(description),
    stringsAsFactors = FALSE
  )
  class(cfg) <- c("sqi_config", "data.frame")
  cfg
}


#' @export
print.sqi_config <- function(x, ...) {
  cat("SQIpro Variable Configuration\n")
  cat(sprintf("  %d variable(s) defined\n\n", nrow(x)))
  print.data.frame(x, row.names = FALSE, ...)
  invisible(x)
}

Try the SQIpro package in your browser

Any scripts or data that you put into this service are public.

SQIpro documentation built on April 20, 2026, 5:06 p.m.