R/itemrest_core.R

Defines functions print.itemrest_result itemrest test_removals get_combinations identify_problem_items sort_item_ids efa_custom determine_n_factors cor_matrix_custom descriptive_stats howard

Documented in cor_matrix_custom descriptive_stats determine_n_factors efa_custom get_combinations howard identify_problem_items itemrest print.itemrest_result sort_item_ids test_removals

# ===================================================================
# NOTE ON PACKAGE DEPENDENCIES
# ===================================================================
# Required packages (e.g., psych, gtools, utils, stats) should be
# listed under the 'Imports:' field in the package's DESCRIPTION file.
# ===================================================================


# ===================================================================
# HELPER (INTERNAL) FUNCTIONS
# ===================================================================

#' Check for cross-loading using Howard and Costello & Osborne criteria.
#' @keywords internal
howard <- function(primary, secondary) {
  # An item is considered "clean" if:
  # Primary loading >= 0.40, secondary loading < 0.30, and difference >= 0.20.
  # Otherwise, it is identified as a cross-loading item.
  return(!(primary >= 0.40 && secondary < 0.30 && (primary - secondary) >= 0.20))
}

#' Calculate basic descriptive statistics for a dataset.
#' @keywords internal
descriptive_stats <- function(data) {
  list(
    n_items = ncol(data),
    n_obs = nrow(data),
    min_value = min(data, na.rm = TRUE),
    max_value = max(data, na.rm = TRUE)
  )
}

#' Calculate a correlation matrix.
#' @keywords internal
cor_matrix_custom <- function(data, method = "polychoric") {
  if (method == "polychoric") {
    cor_mat <- suppressMessages(qgraph::cor_auto(data))
  } else {
    cor_mat <- stats::cor(data, use = "pairwise.complete.obs", method = method)
  }
  return(cor_mat)
}

#' Determine the number of factors using Parallel Analysis.
#' @keywords internal
determine_n_factors <- function(data, cor_method = "pearson") {
  cor_mat <- cor_matrix_custom(data, cor_method)
  n_obs <- nrow(data)
  fa_parallel <- suppressMessages(psych::fa.parallel(
    cor_mat, fa = "fa", n.iter = 20, n.obs = n_obs,
    show.legend = FALSE, main = NULL
  ))
  return(fa_parallel$nfact)
}

#' Run a custom EFA.
#' @keywords internal
efa_custom <- function(data, n_factors = 1, cor_method = "polychoric", extract = "uls", rotate = "oblimin") {
  cor_mat <- cor_matrix_custom(data, cor_method)
  efa <- NULL
  invisible(utils::capture.output({
    efa <- psych::fa(r = cor_mat, nfactors = n_factors, rotate = rotate, fm = extract)
  }))
  alpha_val <- psych::alpha(data, check.keys = FALSE)$total$raw_alpha
  loading <- efa$loadings[]
  explained_var <- sum(efa$Vaccounted["SS loadings", 1:n_factors]) / ncol(data)
  list(efa = efa, alpha = alpha_val, explained_var = explained_var, loadings = loading)
}

#' Sort item IDs numerically.
#' @keywords internal
sort_item_ids <- function(x) {
  gtools::mixedsort(x)
}

#' Identify problematic items (low-loading or cross-loading).
#' @keywords internal
identify_problem_items <- function(efa_res, min_loading = 0.30, loading_diff = 0.10) {
  loadings <- abs(efa_res$loadings)
  items <- rownames(loadings)
  cross_3 <- c()
  cross_2 <- c()
  low_loading <- c()

  if (ncol(loadings) == 0) return(list(cross_3 = c(), cross_2 = c(), low_loading = c()))

  for (i in 1:nrow(loadings)) {
    item_loads <- loadings[i, ]
    primary <- max(item_loads, na.rm = TRUE)

    # 1. LOW LOADING CHECK:
    # If it is smaller than the user-defined `min_loading` value, mark it as `low_loading`.
    if (all(item_loads < min_loading, na.rm = TRUE)) {
      low_loading <- c(low_loading, items[i])
      next
    }

    if (ncol(loadings) > 1) {
      sorted_loads <- sort(item_loads, decreasing = TRUE)
      secondary <- sorted_loads[2]

      # 2. CROSS-LOADING CHECK:
      # If the user selected `howard`, use the classic criterion (0.40-0.30-0.20).
      # Otherwise, check whether the difference between the two loadings is smaller than `loading_diff`.
      if (loading_diff == "howard") {
        is_cross <- howard(primary, secondary)
      } else {
        is_cross <- (primary - secondary) < as.numeric(loading_diff)
      }

      if (is_cross) {
        # 0.30 threshold is retained for categorization.
        if (sum(item_loads >= 0.30, na.rm = TRUE) >= 3) {
          cross_3 <- c(cross_3, items[i])
        } else if (sum(item_loads >= 0.30, na.rm = TRUE) == 2) {
          cross_2 <- c(cross_2, items[i])
        }
      }
    }
  }
  list(cross_3 = unique(cross_3), cross_2 = unique(cross_2), low_loading = unique(low_loading))
}


#' Generate combinations of items to remove.
#' @keywords internal
get_combinations <- function(items) {
  n <- length(items)
  if (n == 0) return(list(list()))
  all_combs <- unlist(lapply(1:n, function(i) utils::combn(items, i, simplify = FALSE)), recursive = FALSE)
  return(c(list(list()), all_combs)) # Add empty set for the baseline (no removal) case
}

#' Test removal strategies and build a summary table.
#' @keywords internal
test_removals <- function(data, base_items, combs, n_factors, cor_method, extract, rotate, min_loading, loading_diff) {
  summary_list <- list()
  total <- length(combs)
  pb <- utils::txtProgressBar(min = 0, max = total, style = 3)

  for (i in seq_along(combs)) {
    items_to_remove <- combs[[i]]
    remaining_items <- setdiff(base_items, items_to_remove)

    if (length(remaining_items) > n_factors) {
      efa_out <- efa_custom(data[, remaining_items, drop = FALSE], n_factors, cor_method, extract, rotate)

      # Pass the new settings to the identify_problem_items function.
      prob_items_test <- identify_problem_items(efa_out, min_loading, loading_diff)

      has_cross_loading <- length(prob_items_test$cross_2) > 0 || length(prob_items_test$cross_3) > 0

      removed_label <- if (length(items_to_remove) == 0) "None" else paste(sort_item_ids(items_to_remove), collapse = "-")
      load_mat <- as.matrix(efa_out$loadings)
      threshold <- 0.30
      valid_loads <- abs(load_mat)[abs(load_mat) > threshold]
      load_min <- if (length(valid_loads) > 0) formatC(min(valid_loads), digits = 2, format = "f") else NA
      load_max <- if (length(valid_loads) > 0) formatC(max(valid_loads), digits = 2, format = "f") else NA
      loading_range <- if(is.na(load_min)) "N/A" else paste0(load_min, "-", load_max)

      summary_list[[i]] <- data.frame(
        Removed_Items = removed_label, Total_Explained_Var = efa_out$explained_var,
        Factor_Loading_Range = loading_range, Cronbachs_Alpha = efa_out$alpha,
        Cross_Loading = ifelse(has_cross_loading, "Yes", "No"), stringsAsFactors = FALSE
      )
    }
    utils::setTxtProgressBar(pb, i)
  }
  close(pb)
  summary_table <- do.call(rbind, summary_list)
  summary_table <- summary_table[!duplicated(summary_table$Removed_Items), ]

  if (nrow(summary_table) > 0) {
    sorted_indices <- order(summary_table$Cross_Loading, -summary_table$Total_Explained_Var)
    summary_table <- summary_table[sorted_indices, ]
  }

  return(summary_table)
}


# ===================================================================
# MAIN (COMPUTATION) FUNCTION
# ===================================================================

#' Evaluate Item Removal Strategies for Exploratory Factor Analysis (EFA)
#'
#' @description
#' This function automates the process of identifying low-quality items (those
#' with low factor loadings or significant cross-loadings) in an Exploratory
#' Factor Analysis (EFA). It systematically tests various combinations of
#' removing these problematic items and evaluates the impact on model fit,
#' returning a comprehensive summary of all tested strategies.
#'
#' @param data A numeric `data.frame` or `matrix` for the analysis.
#' @param cor_method The correlation method to use, e.g., `"pearson"` or `"polychoric"`.
#' @param n_factors The number of factors. If `NULL`, it's determined automatically by parallel analysis.
#' @param extract The factor extraction (estimation) method. See `psych::fa`. Default is `"uls"`.
#' @param rotate The rotation method. See `psych::fa`. Default is `"oblimin"`.
#' @param min_loading Minimum factor loading threshold. Default is 0.30.
#' @param loading_diff Threshold for cross-loading identification. Default is 0.10. Can be "howard".
#'
#' @return An object of class `itemrest_result`. This is a list containing the
#'   following components:
#'   \item{descriptive_stats}{Basic descriptive statistics of the input data.}
#'   \item{initial_efa}{The results of the initial EFA before any items are removed.}
#'   \item{problem_items}{A list of items identified as low-loading or cross-loading.}
#'   \item{removal_summary}{A data.frame summarizing the results of all tested removal strategies.}
#'   \item{optimal_strategy}{The best-performing strategy that resulted in a clean factor structure (no cross-loadings).}
#'   \item{settings}{A list of the settings used for the analysis.}
#'
#' @export
#'
#' @examples
#' \donttest{
#' # We will use the 'bfi' dataset from the 'psych' package.
#' # This requires the 'psych' package to be installed.
#' if (requireNamespace("psych", quietly = TRUE)) {
#'   data(bfi, package = "psych")
#'
#'   # 1. Prepare the data: Select the personality items (first 25 columns)
#'   #    and remove rows with missing values for this example.
#'   example_data <- bfi[, 1:25]
#'   example_data <- na.omit(example_data)
#'
#'   # 2. Run the item removal analysis.
#'   #    Based on theory, the Big Five model has 5 factors.
#'   results <- itemrest(
#'     data = example_data,
#'     n_factors = 5,
#'     cor_method = "pearson" # Data is not ordinal, so pearson is appropriate
#'   )
#'
#'   # 3. Print the report for optimal strategies (default).
#'   print(results)
#'
#'   # 4. Print the report for all tested strategies.
#'   print(results, report = "all")
#' }
#' }
itemrest <- function(data,
                     cor_method = "pearson",
                     n_factors = NULL,
                     extract = "uls",
                     rotate = "oblimin",
                     min_loading = 0.30,
                     loading_diff = 0.10) {

  # --- Parameter Validation and Setup ---
  colnames(data) <- gsub("\\.", "_", colnames(data))
  cor_method <- tolower(cor_method)
  # ... (other parameter validations would go here) ...

  n_factors_determined <- n_factors
  auto_n_factors_flag <- is.null(n_factors)
  if (auto_n_factors_flag) {
    n_factors_determined <- determine_n_factors(data, cor_method)
  }
  descriptives <- descriptive_stats(data)
  initial_efa <- efa_custom(data, n_factors_determined, cor_method, extract, rotate)

  problem_items <- identify_problem_items(initial_efa, min_loading = min_loading, loading_diff = loading_diff)
  all_problem_items <- sort_item_ids(unique(c(problem_items$cross_3, problem_items$cross_2, problem_items$low_loading)))

  message("--- Settings and Descriptive Statistics ---")
  if (auto_n_factors_flag) {
    message("Number of Factors (Auto): ", n_factors_determined, " (Determined by Parallel Analysis)")
  } else {
    message("Number of Factors (Manual): ", n_factors_determined)
  }
  message("Number of Items: ", descriptives$n_items)
  message("Number of Observations: ", descriptives$n_obs)
  message("Minimum Value: ", descriptives$min_value)
  message("Maximum Value: ", descriptives$max_value)
  message("\n--- Initial EFA Results (No items removed) ---")
  message("Cronbach's Alpha: ", formatC(initial_efa$alpha, digits = 3, format = "f"))
  message("Total Explained Variance: % ", formatC(initial_efa$explained_var * 100, digits = 2, format = "f"))
  message("Low-loading Items: ", ifelse(length(problem_items$low_loading) == 0, "None", paste(problem_items$low_loading, collapse = ", ")))
  message("Minimum Loading Threshold: ", formatC(min_loading, digits = 2, format = "f"))
  message("Loading Difference Criterion: ",
          ifelse(loading_diff == "howard",
                 "Howard (2016)",
                 formatC(as.numeric(loading_diff), digits = 2, format = "f")))
  cross_items_combined <- c(problem_items$cross_2, problem_items$cross_3)
  message("Cross-loading Items: ", ifelse(length(cross_items_combined) == 0, "None", paste(unique(sort_item_ids(cross_items_combined)), collapse = ", ")))

  message("\nAll Identified Low-Quality Items: ", ifelse(length(all_problem_items) == 0, "None", paste(all_problem_items, collapse = ", ")))

  # --- Test Removal Strategies ---
  removal_summary <- NULL
  optimal_strategy <- NULL

  if (length(all_problem_items) > 0) {
    all_combs <- get_combinations(all_problem_items)

    # --- Display the progress bar ---
    message("\n[Info] Testing ", length(all_combs) - 1, " different removal combinations for low-quality items...")
    removal_summary <- test_removals(data, colnames(data), all_combs, n_factors_determined,
                                     cor_method, extract, rotate,
                                     min_loading = min_loading, loading_diff = loading_diff)

    optimal_candidates <- removal_summary[ removal_summary[["Cross_Loading"]] == "No",
                                           , drop = FALSE ]

    if (nrow(optimal_candidates) > 0) {
      optimal_strategy <- optimal_candidates[1, , drop = FALSE]
    } else {
      optimal_strategy <- NULL
    }
  }

  # --- Create the output object ---
  output <- list(
    descriptive_stats = descriptives,
    initial_efa = initial_efa,
    problem_items = problem_items,
    all_problem_items_combined = all_problem_items,
    removal_summary = removal_summary,
    optimal_strategy = if (exists("optimal_strategy")) optimal_strategy else NULL,
    settings = list(
      n_factors = n_factors_determined, cor_method = cor_method,
      extract = extract, rotate = rotate,
      auto_n_factors = auto_n_factors_flag
    )
  )

  # Assign the custom class
  class(output) <- "itemrest_result"

  # Return the object
  return(output)
}


# ===================================================================
# PRESENTATION (PRINT) METHOD
# ===================================================================

#' Print method for `itemrest_result` class
#'
#' @param x An object of class `itemrest_result`.
#' @param report The type of report to generate: `"optimal"` (default) or `"all"`.
#' @param ... Other arguments (not used).
#'
#' @return No return value, called for side effects (prints the report to the console).
#' @export
#'
#' @examples
#' \donttest{
#' if (requireNamespace("psych", quietly = TRUE)) {
#'   data(bfi, package = "psych")
#'   example_data <- na.omit(bfi[, 1:25])
#'   results <- itemrest(example_data, n_factors = 5)
#'
#'   # Print the default optimal report
#'   print(results)
#'
#'   # Print the report of all strategies
#'   print(results, report = "all")
#' }
#' }
print.itemrest_result <- function(x, report = "optimal", ...) {

  # Initial info is printed by itemrest() during computation.
  # This function only prints the final result tables and the final message.

  # Header
  cat("\n==============================\n")
  cat(" Item Removal Strategy Report\n")
  cat("==============================\n")

  if (is.null(x$removal_summary)) {
    cat("\n>> Result: No low-quality items were detected. No additional report table available.\n")
  } else {
    report_table <- NULL
    header <- ""

    # Choose which table to display based on the 'report' argument
    if (report == "all") {
      header <- "\n--- Results of All Removal Strategies ---\n"
      report_table <- x$removal_summary
    } else { # default to optimal
      header <- "\n--- Optimal Removal Strategies (No Cross-Loadings) ---\n"

      # For the optimal report, we only show strategies with No cross-loading
      report_table <- x$removal_summary[x$removal_summary$Cross_Loading == "No", ]
    }

    cat(header)
    if (is.null(report_table) || nrow(report_table) == 0) {
      cat("No strategy matching this criterion was found.\n")
    } else {

      # Make the table more readable before printing
      display_table <- report_table
      display_table$Total_Explained_Var <- paste0("% ", formatC(display_table$Total_Explained_Var * 100, format = "f", digits = 2))
      display_table$Cronbachs_Alpha <- round(display_table$Cronbachs_Alpha, 3)
      row.names(display_table) <- NULL
      print(display_table)
    }
  }

  # Final Message
  cat("\n-----------------------------------------------------\n")
  message("Final Reminder: Let algorithms be your compass, not your captain. Valid item removal also requires theoretical competence.")

  invisible(x)
}

Try the ItemRest package in your browser

Any scripts or data that you put into this service are public.

ItemRest documentation built on April 13, 2026, 5:07 p.m.