#' Conduct Parallel Analysis
#'
#' Computes the eigenvalues of the sample correlation matrix and the eigenvalues
#' obtained from a random correlation matrix for which no factors/components are
#' assumed. By default, the function utilizes a modified Horn's (1965) method,
#' which -- instead of mean -- uses 95th percentile of each item eigenvalues
#' sampling distribution as a threshold to find the optimal number of
#' factors/components.
#'
#' Horn proposed a solution to the problem of optimal factor number
#' identification using an approach based on a Monte Carlo simulation.
#'
#' First, several (20 by default) zero-factor `p`-variate normal
#' distributions (where `p` is the number of columns) are obtained, and
#' `p` × `p` correlation matrices are computed for them. Eigenvalues
#' of each matrix is then calculated in order to get an eigenvalues sampling
#' distribution for each simulated variable.
#'
#' Traditionally, Horn obtains an average of each sampling distribution and
#' these averages are used as a threshold which is compared with eigenvalues of
#' the original, real data. However, *usage of the mean was later disputed*
#' by Buja & Eyuboglu (1992), and 95th percentile of eigenvalues sampling
#' distribution was suggested as a more accurate threshold. This, more recent
#' method is used by default in the function.
#'
#' @param Data *data.frame* or *matrix*, dataset (where rows are
#' observations and columns items) or correlation matrix (recognized
#' automatically).
#' @param plot *logical*, if `TRUE` (the default), show the plot along
#' with the function results. To create the plot from the resulting object
#' afterwards, call `plot()`.
#' @param method *character*, either `fa`, `pca`, or `both`
#' (the default). Which method to use for the eigenvalues simulation and
#' computation.
#' @param cor *character*, how to calculate the correlation matrix of the
#' real data. Can be either `pearson` (default), `tetrachoric` or
#' `polychoric`. Unambiguous abbreviations accepted.
#' @param n_obs *integer*, in case you provided the correlation matrix
#' directly as the input, you have to provide the number of observations in
#' the original dataset.
#' @param threshold *character*, whether to use traditionall Horn's method
#' or more recent and well-performing quantile one. Either `mean` or
#' `quantile` (default). Can be abbreviated.
#' @param p *numeric* (0--1), probability for which the sample quantile is
#' produced. Defaults to `.95`. Ignored if `threshold = "mean"`.
#' @param n_iter *integer*, number of iterations, i.e. the number of
#' zero-factor multivariate normal distributions to sample. Defaults to
#' `20`.
#' @param fm *character*, factoring method. See [psych::fa()]
#' from the package [psych::psych()].
#' @param show_kaiser *logical*, whether to show Kaiser boundary in the
#' plot (the default) or not.
#' @param use an optional character string giving a method for computing
#' covariances in the presence of missing values. This must be (an
#' abbreviation of) one of the strings "everything", "all.obs",
#' "complete.obs", "na.or.complete", or "pairwise.complete.obs".
#' @inheritDotParams psych::polychoric -x -y -na.rm -polycor -std.err
#'
#' @return An object of class `data.frame` and `sia_parallel`. Can be
#' plotted using `plot()`.
#'
#' @examples
#' fa_parallel(TestAnxietyCor, n_obs = 335, method = "pca")
#'
#' \dontrun{
#' data("bfi", package = "psych")
#' items <- bfi[, 1:25]
#'
#' fa_parallel(items)
#' fa_parallel(items, threshold = "mean") # traditional Horn's method
#' }
#'
#' @author
#' Jan Netik \cr
#' Institute of Computer Science of the Czech Academy of Sciences \cr
#' \email{netik@@cs.cas.cz}
#'
#' Patricia Martinkova \cr
#' Institute of Computer Science of the Czech Academy of Sciences \cr
#' \email{martinkova@@cs.cas.cz}
#'
#' @references
#' Horn, J. L. (1965). A rationale and test for the number of factors in factor
#' analysis. Psychometrika, 30, 179--185. \doi{10.1007/BF02289447}
#'
#' Buja, A., & Eyuboglu, N. (1992). Remarks on parallel analysis. Multivariate
#' Behavioral Research, 27, 509--540. \doi{10.1207/s15327906mbr2704_2}
#'
#' @encoding UTF-8
#'
#' @importFrom stats rnorm
#' @importFrom psych fa tetrachoric polychoric
#' @importFrom parallel mclapply
#'
#' @export
#'
fa_parallel <- function(Data, cor = "pearson", n_obs = NULL,
method = "pca",
threshold = "quantile", p = .95, n_iter = 20,
plot = TRUE, show_kaiser = TRUE,
fm = "minres", use = "pairwise", ...) {
method <- match.arg(method, c("fa", "pca", "both"))
method_vec <- if (method == "both") c("fa", "pca") else method
# dims
n_subj <- nrow(Data)
n_vars <- ncol(Data)
if (p < 0 || p > 1) {
stop("Probability must lie between 0 and 1!", call. = FALSE)
}
# circumvent the correlation estimation when Data is recognized as a corr. already
if (is_corr(Data)) {
if (is.null(n_obs)) {
stop(c(
"You have to specify the number of observations of the original ",
"dataset when using a correlation matrix directly as the input."
), call. = FALSE)
}
message(
"The input was recognized as a correlation matrix.\nAssuming ",
n_obs,
" observations in the original data."
)
data_corr <- Data
n_subj <- n_obs # needed for random-valued matrix simulation
} else {
# real data part
data_corr <- switch(match.arg(cor, c("pearson", "tetrachoric", "polychoric")),
"pearson" = cor(Data, use = use),
"polychoric" = tryCatch(polychoric(Data, na.rm = TRUE, ...)$rho,
error = function(e) {
stop(
paste(
"Calculation of polychoric correlations returned an error:\n", e
),
call. = FALSE
)
}
),
"tetrachoric" = tryCatch(tetrachoric(Data, na.rm = TRUE, ...)$rho,
error = function(e) {
if (max(Data, na.rm = TRUE) > 1L) {
stop("Tetrachoric correlation requires dichotomous data.", # typo in original psych error
call. = FALSE
)
} else {
stop(e)
}
}
)
)
}
if ("fa" %in% method_vec) {
data_eigen_fa <- fa(data_corr, fm = fm, warnings = FALSE)$values
}
if ("pca" %in% method_vec) {
data_eigen_pca <- eigen(data_corr, symmetric = TRUE, only.values = TRUE)$values
}
# simulated part
# corr matrices from p-variate normal random matrices - for both pca and fa
sim_corr <- mclapply(seq_len(n_iter), function(x) {
sim_data <- matrix(rnorm(n_subj * n_vars), nrow = n_subj, ncol = n_vars)
cor(sim_data, use = "everything")
})
if ("fa" %in% method_vec) {
sim_eigen_list_fa <- mclapply(sim_corr, function(x) {
fa(x, rotate = "none", warnings = FALSE)$values
})
}
if ("pca" %in% method_vec) {
sim_eigen_list_pca <- mclapply(sim_corr, function(x) {
eigen(x, symmetric = TRUE, only.values = TRUE)$values
})
}
# get eigenvals function
get_eigenvals <- function(sim_eigen_list, threshold, n_iter, p) {
# turn the list of eigenvalues to a matrix
# we don't use byrow = TRUE nor t() because is slower
sim_eigen_df <- matrix(unlist(sim_eigen_list), ncol = n_iter)
# get a mean or percentile for every item (i.e., row)
switch(match.arg(threshold, c("mean", "quantile")),
mean = apply(sim_eigen_df, 1, mean),
quantile = apply(sim_eigen_df, 1, function(x) quantile(x, p))
)
}
if ("fa" %in% method_vec) {
sim_eigen_fa <- get_eigenvals(sim_eigen_list_fa, threshold, n_iter, p)
}
if ("pca" %in% method_vec) {
sim_eigen_pca <- get_eigenvals(sim_eigen_list_pca, threshold, n_iter, p)
}
out <- expand.grid(
fact_or_comp = seq_len(n_vars),
data_type = c("real", "simulated"),
method = method_vec
)
out[["eigenvalue"]] <- switch(method,
"fa" = c(data_eigen_fa, sim_eigen_fa),
"pca" = c(data_eigen_pca, sim_eigen_pca),
"both" = c(data_eigen_fa, sim_eigen_fa, data_eigen_pca, sim_eigen_pca)
)
attr(out, "row.names") <- seq_len(nrow(out))
attr(out, "class") <- c("sia_parallel", "data.frame")
if ("fa" %in% method_vec) {
factors_below_thr <- which(data_eigen_fa <= sim_eigen_fa)
n_factors <- max(factors_below_thr[1L] - 1L, 1L)
kaiser_fa <- sum(data_eigen_fa >= 0)
}
if ("pca" %in% method_vec) {
comp_below_thr <- which(data_eigen_pca <= sim_eigen_pca)
n_comp <- max(comp_below_thr[1L] - 1L, 1L)
kaiser_pca <- sum(data_eigen_pca >= 1)
}
if (plot) {
print(plot(out, show_kaiser = show_kaiser))
}
cat(
"According to the parallel analysis, the optimal number of",
switch(method,
"fa" = paste0("factors is ", n_factors, "."),
"pca" = paste0("principal components is ", n_comp, "."),
"both" = paste0(
"factors is ", n_factors,
" and the optimal number of principal components is ", n_comp, "."
)
), "\nFollowing the Kaiser rule,",
switch(method,
"fa" = paste0(
kaiser_fa, ifelse(kaiser_fa > 1L, " factors are", " factor is"),
" recommended."
),
"pca" = paste0(
kaiser_pca, ifelse(kaiser_pca > 1L, " components are", " component is"),
" recommended."
),
"both" = paste0(
kaiser_fa, ifelse(kaiser_fa > 1L, " factors", " factor"),
" and ",
kaiser_pca, ifelse(kaiser_pca > 1L, " components are", " component is"),
" recommended."
)
)
)
invisible(out)
}
#' Plot Method for Parallel Analysis Output
#'
#' You can call this method to plot an existing object resulting from
#' `fa_paralell()` function, which behaves as a standard `data.frame`,
#' but can be automatically recognized and processed with a dedicated plot
#' method. Also, you can *post-hoc* disable the Kaiser boundaries shown by
#' default.
#'
#' @param x object of class `sia_parallel` to plot.
#' @param y *ignored*
#' @param ... additional argument:
#' \describe{\item{`show_kaiser`}{*logical*, whether to show
#' horizonal lines denoting Kaiser boundaries (eigenvalue 0 and/or 1 for FA
#' and/or PCA, respectively). Defaults to `TRUE`.}}
#'
#' @examples
#' \dontrun{
#' fa_parallel_result <- BFI2[, 1:60] %>% fa_parallel(plot = FALSE) # without plot
#' fa_parallel_result %>% plot() # generate plot from "fitted" object
#' fa_parallel_result %>% plot(show_kaiser = FALSE) # hide Kaiser boundaries
#' }
#'
#' @importFrom ggplot2 ggplot aes geom_line geom_point geom_hline annotate
#' scale_x_continuous scale_color_manual theme_bw theme expansion element_line
#' element_rect scale_alpha_manual xlab element_blank element_text
#' coord_cartesian
#' @importFrom rlang .data
#'
#' @export
plot.sia_parallel <- function(x, y, ...) {
max_fact <- max(x[["fact_or_comp"]])
max_eigenvalue <- max(x[["eigenvalue"]])
method <- if (nlevels(x[["method"]]) == 2) "both" else levels(x[["method"]])
x[["method"]] <- toupper(x[["method"]])
y_lim_max <- ifelse(max_eigenvalue < 1.5, 1.5, max_eigenvalue)
kaiser_boundary <- function(method, show_kaiser = TRUE) {
if (show_kaiser) {
positions <- switch(method,
"fa" = 0,
"pca" = 1,
"both" = c(0, 1)
)
labels <- switch(method,
"fa" = "Kaiser boundary",
"pca" = "Kaiser boundary",
"both" = c("Kaiser boundary for FA", "Kaiser boundary for PCA")
)
list(
geom_hline(yintercept = positions, linetype = "dashed", alpha = .4, size = .8),
annotate("text",
x = max_fact, y = positions, label = labels, hjust = 1, vjust = -.75,
alpha = .4
)
)
}
}
x %>%
ggplot(aes(.data$fact_or_comp, .data$eigenvalue,
col = .data$method, alpha = .data$data_type
)) +
kaiser_boundary(method, ...) +
geom_line(size = .8) +
geom_point(size = 2) +
scale_x_continuous(
breaks = function(x) seq(1, x[2], 3),
expand = expansion(add = .75)
) +
scale_color_manual(
values = c("#00BFC4", "#F8766D"),
guide = ifelse(method == "both", "legend", "none")
) +
scale_alpha_manual(
values = c(real = 1, simulated = .25),
labels = c(real = "Real", simulated = "Simulated")
) +
xlab(switch(method,
"fa" = "Factor number",
"pca" = "Component number",
"both" = "Factor/component number"
)) +
ylab("Eigenvalue") +
coord_cartesian(ylim = c(NA, y_lim_max)) +
theme_bw(
base_size = 15, base_family = ""
) +
theme(
panel.grid.major = element_blank(), # element_line(size = .4),
panel.grid.minor = element_blank(), # element_line(size = .25),
panel.border = element_rect(),
legend.position = c(1, 1),
legend.justification = c(1.2, 1.25),
legend.box = "horizontal",
legend.title = element_blank(),
legend.background = element_blank()
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.