#' @title Entropy Fit Index
#'
#' @description Computes the fit of a dimensionality structure using empirical entropy.
#' Lower values suggest better fit of a structure to the data.
#'
#' @param data Matrix or data frame.
#' Contains variables to be used in the analysis
#'
#' @param structure Numeric or character vector (length = \code{ncol(data)}).
#' A vector representing the structure (numbers or labels for each item).
#' Can be theoretical factors or the structure detected by \code{\link[EGAnet]{EGA}}
#'
#' @return Returns a list containing:
#'
#' \item{Total.Correlation}{The total correlation of the dataset}
#'
#' \item{Total.Correlation.MM}{Miller-Madow correction for the total correlation of the dataset}
#'
#' \item{Entropy.Fit}{The Entropy Fit Index}
#'
#' \item{Entropy.Fit.MM}{Miller-Madow correction for the Entropy Fit Index}
#'
#' \item{Average.Entropy}{The average entropy of the dataset}
#'
#' @examples
#' # Load data
#' wmt <- wmt2[,7:24]
#'
#' \dontrun{
#' # Estimate EGA model
#' ega.wmt <- EGA(data = wmt)}
#'
#' # Compute entropy indices
#' entropyFit(data = wmt, structure = ega.wmt$wc)
#'
#' @references
#'\strong{Initial formalization and simulation} \cr
#' Golino, H., Moulder, R. G., Shi, D., Christensen, A. P., Garrido, L. E., Nieto, M. D., Nesselroade, J., Sadana, R., Thiyagarajan, J. A., & Boker, S. M. (2020).
#' Entropy fit indices: New fit measures for assessing the structure and dimensionality of multiple latent variables.
#' \emph{Multivariate Behavioral Research}.
#'
#' @author Hudson F. Golino <hfg9s at virginia.edu>, Alexander P. Christensen <alexpaulchristensen@gmail.com> and Robert Moulder <rgm4fd@virginia.edu>
#'
#' @export
# Entropy Fit Index ----
# Updated 09.08.2023
entropyFit <- function (data, structure)
{
# Argument errors (return data in case of tibble)
data <- entropyFit_errors(data, structure)
# Ensure data is a matrix
data <- as.matrix(data)
# Get data dimensions
dimensions <- dim(data)
# Get number of communities
communities <- unique_length(structure)
# Get number of bins
bins <- floor(sqrt(dimensions[1] / 5))
# Obtain summed data
if(communities == dimensions[2L]){
summed_data <- data # scores are already summed
}else{
# Get sums by community
summed_data <- nvapply(seq_len(communities), function(community){
rowSums(data[,structure == community, drop = FALSE], na.rm = TRUE)
}, LENGTH = dimensions[1L])
}
# Set bin length
bin_length <- bins + 1
# Get bin cuts
bin_cuts <- lapply(seq_len(communities), function(community){
# Get range
data_range <- range(summed_data[,community], na.rm = TRUE)
# Return cuts
return(
cut(
summed_data[,community],
breaks = seq.int(data_range[1L], data_range[2L], length.out = bin_length),
include.lowest = TRUE
)
)
})
# Get frequencies
bin_frequencies <- nvapply(bin_cuts, table, LENGTH = bins) / dimensions[1L]
# Get entropies
H <- nvapply(seq_len(communities), function(community){
# Get non-zero frequencies
bin_non_zero <- bin_frequencies[bin_frequencies[,community] > 0, community]
# Return entropy
return(entropy(bin_non_zero))
})
# Get joint frequency table
joint_frequency <- count_table(
do.call(cbind, bin_cuts)
)$Value / dimensions[1L]
# Get non-zero frequencies
joint_non_zero <- joint_frequency[joint_frequency > 0]
# Get joint entropy
H_joint <- entropy(joint_non_zero)
# Get maximum sums
max_sum <- rowSums(data, na.rm = TRUE)
# Obtain range
max_range <- range(max_sum, na.rm = TRUE)
# Count the cuts
max_frequency <- count_table(
cut(
max_sum,
breaks = seq.int(max_range[1], max_range[2], length.out = bin_length),
include.lowest = TRUE
)
)$Value / dimensions[1L]
# Get non-zero frequencies
max_non_zero <- max_frequency[max_frequency > 0]
# Get maximum entropy
H_max <- entropy(max_non_zero)
# Miller-Madow Bias Correction (for individual communities)
## Pre-compute denominator for corrections
MM_denominator <- 2L * dimensions[1L]
## Entropy
MM_non_zero <- colSums(bin_frequencies != 0, na.rm = TRUE)
MM_H <- H + (MM_non_zero - 1) / MM_denominator
## Joint Entropy
MM_joint_non_zero <- sum(joint_frequency != 0, na.rm = TRUE)
MM_H_joint <- H_joint + (MM_joint_non_zero - 1) / MM_denominator
# Compute mean entropy
H_mean <- mean(H, na.rm = TRUE)
# Pre-compute denominator for entropy fit measures
EF_denominator <- (H_max - H_mean) * sqrt(communities)
# Set up data frame
return(
fast.data.frame(
c(
sum(H, na.rm = TRUE) - H_joint,
sum(MM_H, na.rm = TRUE) - MM_H_joint,
H_mean - H_joint + EF_denominator,
mean(MM_H, na.rm = TRUE) - MM_H_joint + EF_denominator,
H_mean - H_joint
), ncol = 5L,
colnames = c(
"Total.Correlation", "Total.Correlation.MM",
"Entropy.Fit", "Entropy.Fit.MM",
"Average.Entropy"
)
)
)
}
# Bug Checking ----
# ## Basic input
# data <- wmt2[,7:24]; ega.wmt <- EGA(data, plot.EGA = FALSE)
# structure <- ega.wmt$wc
#' @noRd
# Argument errors ----
# Updated 13.08.2023
entropyFit_errors <- function(data, structure)
{
# 'data' errors
object_error(data, c("matrix", "data.frame", "tibble"), "entropyFit")
# Check for tibble
if(get_object_type(data) == "tibble"){
data <- as.data.frame(data)
}
# 'structure' errors
object_error(structure, "vector", "entropyFit")
length_error(structure, dim(data)[2], "entropyFit")
# Return data in case of tibble
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.