Nothing
#' Estimate Number of Dimensions using Factor Forest
#'
#' Estimates the number of dimensions in data using the
#' pre-trained Random Forest model from Goretzko and Buhner
#' (2020, 2022). See examples to get started
#'
#' @param data Matrix or data frame.
#' Either a dataset with all numeric values
#' (rows = cases, columns = variables) or
#' a symmetric correlation matrix
#'
#' @param sample_size Numeric (length = 1).
#' If input into \code{data} is a correlation matrix,
#' then specifying the sample size is required
#'
#' @param maximum_factors Numeric (length = 1).
#' Maximum number of factors to search over.
#' Defaults to \code{8}
#'
#' @param PA_correlation Character (length = 1).
#' Type of correlation used in \code{\link[psych]{fa.parallel}}.
#' Must be set:
#'
#' \itemize{
#'
#' \item \code{"cor"} --- Pearson's correlation
#'
#' \item \code{"poly"} --- Polychoric correlation
#'
#' \item \code{"tet"} --- Tetrachoric correlation
#'
#' }
#'
#' @return Returns a list containing:
#'
#' \item{dimensions}{Number of dimensions identified}
#'
#' \item{probabilities}{Probability that the number of dimensions
#' is most likely}
#'
#' @examples
#' # Generate factor data
#' two_factor <- simulate_factors(
#' factors = 2, # factors = 2
#' variables = 6, # variables per factor = 6
#' loadings = 0.55, # loadings between = 0.45 to 0.65
#' cross_loadings = 0.05, # cross-loadings N(0, 0.05)
#' correlations = 0.30, # correlation between factors = 0.30
#' sample_size = 1000 # number of cases = 1000
#' )
#'
#' \dontrun{
#' # Perform Factor Forest
#' factor_forest(two_factor$data)}
#'
#' @author
#' # Authors of Factor Forest \cr
#' David Goretzko and Markus Buhner
#'
#' # Authors of \{latentFactoR\} \cr
#' Alexander P. Christensen <alexpaulchristensen@gmail.com>,
#' Hudson Golino <hfg9s@virginia.edu>,
#' Luis Eduardo Garrido <luisgarrido@pucmm.edu>
#'
#' @references
#' Goretzko, D., & Buhner, M. (2022).
#' Factor retention using machine learning with ordinal data.
#' \emph{Applied Psychological Measurement}, 01466216221089345.
#'
#' Goretzko, D., & Buhner, M. (2020).
#' One model to rule them all? Using machine learning
#' algorithms to determine the number of factors
#' in exploratory factor analysis.
#' \emph{Psychological Methods}, \emph{25}(6), 776-786.
#'
#' @import xgboost
#' @import mlr
#' @importFrom stats sd predict
#'
#' @export
#'
# Factor Forest
# Updated 18.04.2024
factor_forest <- function(
data, sample_size,
maximum_factors = 8,
PA_correlation = c("cor", "poly", "tet")
)
{
# Check for appropriate data
object_error(data, c("matrix", "data.frame", "array"));
sink <- apply(data, 2, type_error, expected_type = "numeric");
# Ensure data is matrix
data <- as.matrix(data)
# Check for variable names
if(is.null(colnames(data))){
colnames(data) <- paste0("V", 1:ncol(data))
}
# Obtain correlation matrix (if not already)
if(!isSymmetric(data)){
# Compute correlations
correlation <- EGAnet::auto.correlate(data, verbose = FALSE)
# Set sample size
sample_size <- nrow(data)
}else{
# Set data as correlations
correlation <- data
# Check for sample size
if(missing(sample_size)){
stop("Input for 'sample_size' is required when input for 'data' is a correlation (symmetric) matrix.")
}
}
# Check for appropriate sample size
type_error(sample_size, "numeric"); length_error(sample_size, 1);
range_error(sample_size, c(2, Inf))
# Check for appropriate maximum factors
type_error(maximum_factors, "numeric");
length_error(maximum_factors, 1);
range_error(maximum_factors, c(1, Inf));
# REORGANIZE EVENTUALLY...
newdata <- data
# Message user
message("Generating features...")
# calculate all necessary features:
N <- sample_size
p <- ncol(newdata)
# dat_cor <- cor(newdata, use = use)
dat_cor <- correlation
eigval <- eigen(dat_cor)$values
vareig <- cumsum(eigval)/p
# eigenvalue features
eiggreater1 <- sum(eigval > 1)
releig1 <- eigval[1]/p
releig2 <- sum(eigval[1:2])/p
releig3 <- sum(eigval[1:3])/p
eiggreater07 <- sum(eigval > 0.7)
sdeigval <- sd(eigval)
var50 <- min(which(vareig > 0.50))
var75 <- min(which(vareig > 0.75))
# matrix norm features
onenorm <- norm(dat_cor,"O")
frobnorm <- norm(dat_cor,"F")
maxnorm <- norm(dat_cor-diag(p),"M")
avgcor <- sum(abs(dat_cor-diag(p)))/(p*(p-1))
specnorm <- sqrt(eigen(t(dat_cor)%*%dat_cor)$values[1])
smlcor <- sum(dat_cor <= 0.1)
avgcom <- mean(psych::smc(dat_cor))
det <- det(dat_cor)
KMO <- psych::KMO(dat_cor)$MSA
Gini <- ineq::ineq(lower.tri(dat_cor), type = "Gini")
Kolm <- ineq::ineq(lower.tri(dat_cor), type = "Kolm")
# parallel analysis
sink <- capture.output(
pa <- psych::fa.parallel(
x = newdata,
fa = "fa", cor = PA_correlation,
plot = FALSE
)
)
pa_solution <- pa$nfact
fa_eigval <- pa$fa.values
# empirical kaiser criterion
# Obtain number of variables
variables <- ncol(correlation)
# Obtain eigenvalues
eigenvalues <- eigen(correlation)$values
# Initialize reference criterion
reference <- numeric(length = variables)
# Loop over variables
for(i in 1:variables){
reference[i] <- max(
((1 + sqrt(variables / sample_size))^2) *
(variables - sum(reference)) / # note difference with `EKC`
(variables - i + 1), 1
)
}
# Model was trained with `reference` but proper uses `eigenvalues`
# Identify last eigenvalue greater than reference
ekc <- max(which(eigenvalues >= reference))
# setting missing eigenvalues to -1000
eigval[(length(eigval)+1):80] <- -1000
fa_eigval[(length(fa_eigval)+1):80] <- -1000
names(eigval) <- paste("eigval", 1:80, sep = "")
names(fa_eigval) <- paste("fa_eigval", 1:80, sep = "")
cd <- EFA.Comp.Data(Data = newdata, F.Max = maximum_factors, use = "pairwise.complete.obs")
# combination of features
features <- cbind(data.frame(N,p,eiggreater1,releig1,releig2,releig3,eiggreater07,sdeigval,var50,var75,onenorm,frobnorm,
maxnorm, avgcor, specnorm, smlcor, avgcom,det, KMO, Gini, Kolm, pa_solution, ekc, cd), t(eigval), t(fa_eigval))
# Check if model is stored in the environment
if(!exists("factor_forest_model", envir = globalenv())){
# Download Factor Forest model from Google Drive
drive_link <- "1bK-lMOh2lO7sGIVxHy1jI4kr3LjOURDD"
# Check if Factor Forest model exists
if(
!"factor_forest_model.rdata" %in%
tolower(list.files(tempdir()))
){
# Let user know Factor Forest model is downloading
message("Downloading Factor Forest model...", appendLF = FALSE)
# Download Factor Forest model
model_file <- suppressMessages(
googledrive::drive_download(
googledrive::as_id(drive_link),
path = paste(tempdir(), "factor_forest_model.Rdata", sep = "\\"),
overwrite = TRUE
)
)
}else{
# Create dummy space file list
model_file <- list()
model_file$local_path <- paste(
tempdir(), "\\",
"factor_forest_model.RData",
sep = ""
)
}
# Let user know Factor Forest model is loading
message("Loading Factor Forest model...", appendLF = FALSE)
# Initialize object
factor_forest_model <- NULL
# Load Factor Forest model
load(model_file$local_path)
# Let user know loading is finished
message("done")
}
# Obtain output
sink <- capture.output(
suppressWarnings(
out <- predict(factor_forest_model, newdata = features)$data
)
)
# Organize output
## Dimensions
dimensions <- as.numeric(
as.character(
out$response
)
)
## Obtain probabilities
probabilities <- out[,grep("prob.", colnames(out))]
probabilities <- round(
as.vector(as.matrix(probabilities)), 4
)
names(probabilities) <- 1:length(probabilities)
# Set up return list
results <- list(
dimensions = dimensions,
probabilities = probabilities
)
# Let user know that they can store model in workspace
if(!exists("factor_forest_model", envir = globalenv())){
# Adjust file path
adjusted_path <- gsub("\\\\", "/", model_file$local_path)
# Let user know how to get the model
message(
paste0(
"\nFor most efficient use, you can load the model ",
"into your workspace. Run the following code to ",
"load the model:\n\n",
"load(\"", adjusted_path, "\")\n"
)
)
}
# Return results
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.