#' Exploratory Graph Model
#'
#' @description Function to fit the Exploratory Graph Model
#'
#' @param data Matrix or data frame.
#' Should consist only of variables to be used in the analysis.
#' Can be raw data or a correlation matrix
#'
#' @param EGM.model Character vector (length = 1).
#' Sets the procedure to conduct \code{EGM}.
#' Available options:
#'
#' \itemize{
#'
#' \item \code{"EGA"} (default) --- Applies \code{\link[EGAnet]{EGA}} to obtain the
#' (sparse) regularized network structure, communities, and memberships
#'
#' \item \code{"standard"} --- Applies the standard EGM model which
#' estimates communities based on the non-regularized empirical partial
#' correlation matrix and sparsity is set using \code{p.in} and \code{p.out}
#'
#' }
#'
#' @param communities Numeric vector (length = 1).
#' Number of communities to use for the \code{"standard"} type of EGM.
#' Defaults to \code{NULL}.
#' When no input is provided, communities and memberships are obtained from
#' the Walktrap algorithm (\code{\link[igraph]{cluster_walktrap}}) applied to
#' the empirical non-regularized partial correlation matrix
#'
#' @param structure Numeric or character vector (length = \code{ncol(data)}).
#' Can be theoretical factors or the structure detected by \code{\link[EGAnet]{EGA}}.
#' Defaults to \code{NULL}
#'
#' @param search Boolean (length = 1).
#' Whether a search over parameters should be conducted.
#' Defaults to \code{FALSE}.
#' Set to \code{TRUE} to search over a variety of parameters and select the
#' model that optimizes the \code{opt} objective
#'
#' @param p.in Numeric vector (length = 1).
#' Probability that a node is randomly linked to other nodes in the same community.
#' Within-community edges are set to zero based on \code{quantile(x, probs = 1 - p.in)},
#' ensuring that the lowest edge values are set to zero (i.e., most probable to \emph{not}
#' be randomly connected).
#' Only used for \code{EGM.model = "standard"}.
#' Defaults to \code{NULL} but must be set
#'
#' @param p.out Numeric vector (length = 1).
#' Probability that a node is randomly linked to other nodes \emph{not} in the same community.
#' Between-community edges are set to zero based on \code{quantile(x, probs = 1 - p.out)},
#' ensuring that the lowest edge values are set to zero (i.e., most probable to \emph{not}
#' be randomly connected).
#' Only used for \code{EGM.model = "standard"}.
#' Defaults to \code{NULL} but must be set
#'
#' @param opt Character vector (length = 1).
#' Fit index used to select the best model when searching over models
#' (only applies when \code{search = TRUE}).
#' Available options include:
#'
#' \itemize{
#'
#' \item \code{"AIC"}
#'
#' \item \code{"BIC"}
#'
#' \item \code{"CFI"}
#'
#' \item \code{"chisq"}
#'
#' \item \code{"logLik"}
#'
#' \item \code{"RMSEA"}
#'
#' \item \code{"SRMR"}
#'
#' \item \code{"TEFI"}
#'
#' \item \code{"TEFI.adj"}
#'
#' \item \code{"TLI"}
#'
#' }
#'
#' Defaults to \code{"SRMR"}
#'
#' @param constrained Boolean (length = 1).
#' Whether memberships of the communities should
#' be added as a constraint when optimizing the network loadings.
#' Defaults to \code{TRUE}, which ensures that each variable's assigned loading
#' is never smaller than any of its cross-loadings.
#' Set to \code{FALSE} to freely estimate each loading, similar to exploratory factor analysis
#'
#' @param verbose Boolean (length = 1).
#' Should progress be displayed?
#' Defaults to \code{TRUE}.
#' Set to \code{FALSE} to not display progress
#'
#' @param ... Additional arguments to be passed on to
#' \code{\link[EGAnet]{auto.correlate}},
#' \code{\link[EGAnet]{network.estimation}},
#' \code{\link[EGAnet]{community.detection}},
#' \code{\link[EGAnet]{community.consensus}},
#' \code{\link[EGAnet]{community.unidimensional}},
#' \code{\link[EGAnet]{EGA}}, and
#' \code{\link[EGAnet]{net.loads}}
#'
#' @examples
#' # Get depression data
#' data <- depression[,24:44]
#'
#' # Estimate EGM (using EGA)
#' egm_ega <- EGM(data)
#'
#' # Estimate EGM (using EGA) specifying communities
#' egm_ega_communities <- EGM(data, communities = 3)
#'
#' # Estimate EGM (using EGA) specifying structure
#' egm_ega_structure <- EGM(
#'   data, structure = c(
#'     1, 1, 1, 2, 1, 1, 1,
#'     1, 1, 1, 3, 2, 2, 2,
#'     2, 3, 3, 3, 3, 3, 2
#'   )
#' )
#'
#' # Estimate EGM (using standard)
#' egm_standard <- EGM(
#'   data, EGM.model = "standard",
#'   communities = 3, # specify number of communities
#'   p.in = 0.95, # probability of edges *in* each community
#'   p.out = 0.80 # probability of edges *between* each community
#' )
#'
#' \dontrun{
#' # Estimate EGM (using EGA search)
#' egm_ega_search <- EGM(
#'   data, EGM.model = "EGA", search = TRUE
#' )
#'
#' # Estimate EGM (using EGA search and AIC criterion)
#' egm_ega_search_AIC <- EGM(
#'   data, EGM.model = "EGA", search = TRUE, opt = "AIC"
#' )
#'
#' # Estimate EGM (using search)
#' egm_search <- EGM(
#'   data, EGM.model = "standard", search = TRUE,
#'   communities = 3, # need communities or structure
#'   p.in = 0.95 # only need 'p.in'
#' )}
#'
#' @author Hudson F. Golino <hfg9s at virginia.edu> and Alexander P. Christensen <alexpaulchristensen@gmail.com>
#'
#' @export
#'
# Estimate EGM ----
# Updated 06.11.2024
EGM <- function(
    data, EGM.model = c("standard", "EGA"),
    communities = NULL, structure = NULL, search = FALSE,
    p.in = NULL, p.out = NULL,
    opt = c(
      "AIC", "BIC", "CFI",
      "chisq", "logLik", "RMSEA",
      "SRMR", "TEFI", "TEFI.adj", "TLI"
    ), constrained = TRUE, verbose = TRUE, ...
)
{

  # Set default
  EGM.model <- set_default(EGM.model, "ega", EGM)
  opt <- set_default(opt, "srmr", EGM)

  # Set up EGM type internally
  EGM.type <- switch(
    EGM.model,
    "ega" = swiftelse(search, "ega.search", "ega"),
    "standard" = swiftelse(search, "search", "standard")
  )

  # TEFI-adjusted fit is experimental
  if(opt == "tefi.adj"){
    experimental("EGM")
  }

  # Check data and structure
  data <- EGM_errors(
    data, EGM.type, communities, structure,
    p.in, p.out, constrained, verbose, ...
  )

  # Switch and return results based on type
  return(
    switch(
      EGM.type,
      "standard" = EGM.standard(data, communities, structure, p.in, p.out, opt, constrained, ...),
      "search" = EGM.search(data, communities, structure, p.in, opt, constrained, verbose, ...),
      "ega" = EGM.EGA(data, structure, opt, constrained, ...),
      "ega.search" = EGM.EGA.search(data, communities, structure, opt, constrained, verbose, ...)
    )
  )

}

#' @noRd
# EGM Errors ----
# Updated 05.11.2024
EGM_errors <- function(
    data, EGM.type, communities, structure,
    p.in, p.out, constrained, verbose, ...
)
{

  # 'data' errors
  object_error(data, c("matrix", "data.frame", "tibble"), "EGM")

  # Check for tibble
  if(get_object_type(data) == "tibble"){
    data <- as.data.frame(data)
  }

  # Check for NULL communities
  if(!is.null(communities)){

    # If not NULL, check 'communities' errors
    typeof_error(communities, "numeric", "EGM")
    object_error(communities, "vector", "EGM")
    length_error(communities, 1, "EGM")

  }

  # Check for NULL structure
  if(!is.null(structure)){

    # If not NULL, check 'structure' errors
    typeof_error(structure, c("character", "numeric"), "EGM")
    object_error(structure, "vector", "EGM")
    length_error(structure, dim(data)[2], "EGM")

  }

  # Check first for parameters involved in both search and standard
  if(!grepl("ega", EGM.type)){

    # Check for NULL in 'p.in'
    if(is.null(p.in)){
      .handleSimpleError(
        h = stop,
        msg = paste0(
          "Input for 'p.in' is `NULL`. A value between 0 and 1 for 'p.in' must be provided."
        ),
        call = "EGM"
      )
    }

    # Check 'p.in' errors
    typeof_error(p.in, "numeric", "EGM")
    range_error(p.in, c(0, 1), "EGM")
    length_error(p.in, c(1, communities), "EGM")

  }

  # 'constrained' errors
  length_error(constrained, 1, "EGM")
  typeof_error(constrained, "logical", "EGM")

  # Check for EGM type
  if(EGM.type == "search"){

    # 'verbose' errors
    length_error(verbose, 1, "EGM")
    typeof_error(verbose, "logical", "EGM")

  }else if(EGM.type == "standard"){

    # Check for NULL in 'p.out'
    if(is.null(p.out)){
      .handleSimpleError(
        h = stop,
        msg = paste0(
          "Input for 'p.out' is `NULL`. A value between 0 and 1 for 'p.out' must be provided."
        ),
        call = "EGM"
      )
    }

    # Check 'p.out' errors
    typeof_error(p.out, "numeric", "EGM")
    range_error(p.out, c(0, 1), "EGM")
    length_error(p.out, c(1, communities), "EGM")

  }

  # Check for usable data
  if(needs_usable(list(...))){
    data <- usable_data(data, FALSE)
  }

  # Return data in case of tibble
  return(data)

}

#' @exportS3Method
# S3 Print Method ----
# Updated 09.11.2024
print.EGM <- function(x, digits = 3, ...)
{

  # Return EGA results
  print(x$EGA)

  # Add break
  cat("\n\n----\n\n")

  # Send title
  cat("Fit based on Optimized Loadings\n\n")

  # Add fit metrics
  print(format_decimal(x$model$optimized$fit, digits), quote = FALSE)

}

#' @exportS3Method
# S3 Summary Method ----
# Updated 09.11.2024
summary.EGM <- function(object, digits = 3, ...)
{
  print(object) # same as print
}

#' @exportS3Method
# S3 Plot Method ----
# Updated 22.10.2024
plot.EGM <- function(x, ...)
{

  # Return EGA plot
  plot(x$EGA)

}

# LOADINGS ----

#' @noRd
# Network loadings to partial correlations ----
# Updated 14.10.2024
nload2pcor <- function(loadings)
{

  # Compute partial correlation
  P <- tcrossprod(loadings)

  # Obtain interdependence
  interdependence <- sqrt(rowSums(loadings^2))

  # Set diagonal to interdependence
  diag(P) <- interdependence

  # Compute matrix I
  I <- diag(sqrt(1 / interdependence))

  # Compute zero-order correlations
  R <- I %*% P %*% I

  # Obtain inverse covariance matrix
  INV <- solve(R)

  # Compute matrix D
  D <- diag(sqrt(1 / diag(INV)))

  # Compute partial correlations
  P <- -D %*% INV %*% D

  # Set diagonal to zero
  diag(P) <- 0

  # Return partial correlation
  return(P)

}

#' @noRd
# Network loadings to correlations ----
# Updated 14.10.2024
nload2cor <- function(loadings)
{

  # Compute partial correlation
  P <- tcrossprod(loadings)

  # Obtain interdependence
  interdependence <- sqrt(rowSums(loadings^2))

  # Set diagonal to interdependence
  diag(P) <- interdependence

  # Compute matrix I
  I <- diag(sqrt(1 / interdependence))

  # Return correlation
  return(I %*% P %*% I)

}
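
# Worked sketch: loadings to model-implied matrices ----
# A minimal illustration (kept unevaluated) of how `nload2cor` and
# `nload2pcor` map a network loading matrix to model-implied zero-order and
# partial correlations. The toy loading matrix below is hypothetical
if(FALSE){

  # Two communities, six variables (rows = variables, columns = communities)
  toy_loadings <- cbind(
    c(0.60, 0.55, 0.50, 0.10, 0.00, 0.00),
    c(0.00, 0.00, 0.15, 0.55, 0.60, 0.50)
  )

  # Model-implied zero-order correlations (unit diagonal)
  implied_R <- nload2cor(toy_loadings)

  # Model-implied partial correlations (zero diagonal)
  implied_P <- nload2pcor(toy_loadings)

}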

# OPTIMIZATION ----

#' @noRd
# Estimated loadings cost (based on SRMR) ----
# Updated 06.11.2024
srmr_N_cost <- function(
    loadings_vector, zeros, R,
    loading_structure, rows,
    constrained, lower_triangle, ...
)
{

  # Check for constraint
  if(constrained){

    # Assemble loading matrix
    loadings_matrix <- matrix(loadings_vector * zeros, nrow = rows, byrow = TRUE)

    # Obtain assigned loadings
    assign_loadings <- loadings_matrix[loading_structure]

    # Transpose loadings matrix
    loadings_matrix <- t(loadings_matrix)

    # Obtain differences
    differences <- abs(loadings_matrix) - assign_loadings

    # Obtain difference values
    difference_values <- (differences * (differences > 0))^2

    # Convert loadings to implied correlations following `nload2cor`
    # Uses raw code to avoid overhead of additional function calls

    # Compute partial correlation (decrease loadings based on constraints)
    P <- tcrossprod(loadings_matrix)

    # Obtain interdependence
    interdependence <- sqrt(rowSums(loadings_matrix^2))

    # Set diagonal to interdependence
    diag(P) <- interdependence

    # Compute matrix I
    I <- diag(sqrt(1 / interdependence))

    # Get implied R
    implied_R <- I %*% P %*% I

    # Check for positive definite
    return(
      swiftelse(
        !anyNA(implied_R) && is_positive_definite(implied_R), # return SRMR
        srmr(R, implied_R) + sqrt(mean(difference_values)), Inf
      )
    )

  }else{ # Without constraints, send it

    # Assemble loading matrix
    loadings_matrix <- matrix(loadings_vector * zeros, ncol = rows)

    # Compute partial correlation
    P <- tcrossprod(loadings_matrix)

    # Obtain interdependence
    interdependence <- sqrt(rowSums(loadings_matrix^2))

    # Set diagonal to interdependence
    diag(P) <- interdependence

    # Compute matrix I
    I <- diag(sqrt(1 / interdependence))

    # Get implied R
    implied_R <- I %*% P %*% I

    # Check for positive definite
    return(
      swiftelse(
        !anyNA(implied_R) && is_positive_definite(implied_R), # return SRMR
        srmr(R, implied_R), Inf
      )
    )

  }

}

#' @noRd
# Estimated loadings gradient (based on SRMR) ----
# Updated 06.11.2024
srmr_N_gradient <- function(
    loadings_vector, zeros, R,
    loading_structure, rows,
    constrained, lower_triangle, ...
)
{

  # Check for constraint
  if(constrained){

    # Assemble loading matrix
    loadings_matrix <- matrix(loadings_vector * zeros, nrow = rows, byrow = TRUE)

    # Obtain assigned loadings
    assign_loadings <- loadings_matrix[loading_structure]

    # Transpose loadings matrix
    loadings_matrix <- t(loadings_matrix)

    # Obtain differences
    differences <- abs(loadings_matrix) - assign_loadings

    # Obtain difference values
    difference_values <- (differences * (differences > 0))

    # Convert loadings to implied correlations following `nload2cor`
    # Uses raw code to avoid overhead of additional function calls

    # Compute partial correlation
    implied_P <- tcrossprod(loadings_matrix)

    # Obtain interdependence
    interdependence <- sqrt(rowSums(loadings_matrix^2))

    # Set diagonal to interdependence
    diag(implied_P) <- interdependence

    # Compute matrix I
    I <- diag(sqrt(1 / interdependence))

    # Compute error
    error <- (I %*% implied_P %*% I - R)[lower_triangle]

    # Derivative of error with respect to P (covariance)
    dError <- matrix(0, nrow = ncol(R), ncol = ncol(R))
    dError[lower_triangle] <- 2 * error / length(error)
    dError <- dError + t(dError)

    # Return gradient
    return(
      as.vector( # (2x leads to fewer iterations)
        t(crossprod(2 * loadings_matrix, I %*% tcrossprod(dError, I))) +
        difference_values
      ) * zeros
    )

  }else{ # Without constraints, send it

    # Assemble loading matrix
    loadings_matrix <- matrix(loadings_vector * zeros, ncol = rows)

    # Compute partial covariance
    implied_P <- tcrossprod(loadings_matrix)

    # Obtain interdependence
    interdependence <- sqrt(rowSums(loadings_matrix^2))

    # Set diagonal to interdependence
    diag(implied_P) <- interdependence

    # Compute matrix I
    I <- diag(sqrt(1 / interdependence))

    # Compute error
    error <- (I %*% implied_P %*% I - R)[lower_triangle]

    # Derivative of error with respect to P (covariance)
    dError <- matrix(0, nrow = ncol(R), ncol = ncol(R))
    dError[lower_triangle] <- 2 * error / length(error)
    dError <- dError + t(dError)

    # Return gradient (2x leads to fewer iterations)
    return(as.vector(t(crossprod(2 * loadings_matrix, I %*% tcrossprod(dError, I)))) * zeros)

  }

}
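
# Sketch: calling the SRMR cost and gradient directly ----
# Kept unevaluated; illustrates the argument layout these two functions
# expect, mirroring the `nlminb()` calls in `EGM.standard`. Assumes the
# internal helpers used by the cost (`srmr`, `is_positive_definite`,
# `swiftelse`) are available; the toy loading matrix is hypothetical
if(FALSE){

  # Toy setup: six variables, two communities
  toy_loadings <- cbind(
    c(0.60, 0.55, 0.50, 0.10, 0.00, 0.00),
    c(0.00, 0.00, 0.15, 0.55, 0.60, 0.50)
  )
  R <- nload2cor(toy_loadings)

  # Vectorized loadings and sparsity pattern
  loadings_vector <- as.vector(toy_loadings)
  zeros <- loadings_vector != 0

  # Transposed assignment structure (communities x variables)
  loading_structure <- matrix(FALSE, nrow = 2, ncol = 6)
  loading_structure[1, 1:3] <- TRUE
  loading_structure[2, 4:6] <- TRUE

  # One evaluation each of the cost and its gradient (unconstrained branch);
  # the cost returns SRMR and `Inf` when the implied matrix is not positive definite
  srmr_N_cost(
    loadings_vector, zeros = zeros, R = R,
    loading_structure = loading_structure, rows = 2,
    constrained = FALSE, lower_triangle = lower.tri(R)
  )
  srmr_N_gradient(
    loadings_vector, zeros = zeros, R = R,
    loading_structure = loading_structure, rows = 2,
    constrained = FALSE, lower_triangle = lower.tri(R)
  )

}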

# LIKELIHOOD ----

#' @noRd
# Log-likelihood only ----
# Updated 06.10.2024
log_likelihood <- function(n, p, R, S, type = c("partial", "zero"))
{

  # Set default to zero-order
  if(missing(type)){
    type <- "zero"
  }else{
    type <- match.arg(type)
  }

  # Return
  return(
    swiftelse(
      type == "zero",
      -(n / 2) * (p * log(2 * pi) + log(det(R)) + sum(diag(S %*% solve(R)))),
      (n / 2) * (log(det(R)) - sum(diag((S %*% R)))) - (n * p / 2) * log(2 * pi)
    )
  )

}
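
# Sketch: `log_likelihood` sanity check ----
# Kept unevaluated. When the model-implied matrix equals the sample matrix
# (R = S), tr(S %*% solve(R)) = p, so the zero-order log-likelihood reduces
# to -(n / 2) * (p * log(2 * pi) + log(det(R)) + p). Toy values are hypothetical
if(FALSE){

  R <- matrix(c(1, 0.5, 0.5, 1), nrow = 2)

  # These two lines should agree
  log_likelihood(n = 100, p = 2, R = R, S = R, type = "zero")
  -(100 / 2) * (2 * log(2 * pi) + log(det(R)) + 2)

}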

#' @noRd
# Compute log-likelihood metrics ----
# Updated 06.10.2024
likelihood <- function(n, p, R, S, loadings, type)
{

  # Get number of communities
  m <- dim(loadings)[2]

  # Log-likelihood
  loglik <- log_likelihood(n, p, R, S, type)

  # Total number of parameters
  parameters <- (p * m) + p + ((m * (m - 1)) / 2)

  # Model parameters
  model_parameters <- parameters - sum(loadings == 0)

  # Return log-likelihood
  return(
    c(
      logLik = loglik,
      AIC = -2 * loglik + 2 * model_parameters, # -2L + 2k
      BIC = -2 * loglik + model_parameters * log(n) # -2L + klog(n)
      # EBIC = -2 * loglik + model_parameters * log(n) + 2 * gamma * log(
      #   choose(parameters, model_parameters)
      # ), # -2L + klog(n) + 2 gamma log(binom(pk))
      # GFI = 1 - sum((R - S)^2) / sum(S^2)
    )
  )

}
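
# Sketch: parameter counting used by `likelihood` ----
# Kept unevaluated. For p variables and m communities, the saturated count is
# p * m loadings, plus p variable-level parameters, plus m * (m - 1) / 2
# community correlations; loadings fixed to zero are then subtracted to get
# the model parameters. Hypothetical numbers for illustration
if(FALSE){

  p <- 21; m <- 3

  # Saturated parameter count: 21 * 3 + 21 + 3 = 87
  parameters <- (p * m) + p + ((m * (m - 1)) / 2)

  # With, say, 30 loadings fixed to zero: 87 - 30 = 57 model parameters
  model_parameters <- parameters - 30

}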

#' @noRd
# Compute model parameters for TEFI adjustment ----
# Updated 22.10.2024
compute_tefi_adjustment <- function(loadings, correlations)
{

  # Obtain dimensions
  dimensions <- dim(loadings)

  # Total number of parameters
  parameters <- (dimensions[1] * dimensions[2]) + dimensions[1] +
                ((dimensions[2] * (dimensions[2] - 1)) / 2)

  # Obtain model parameters
  model_parameters <- parameters - sum(loadings == 0)

  # Return adjustment
  return(-2 * log(model_parameters) + mean(abs(correlations)))

}
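
# Sketch: `compute_tefi_adjustment` inputs ----
# Kept unevaluated. Takes the (possibly sparse) loading matrix, where zeros
# count as fixed parameters to subtract, and the community correlation
# matrix. Toy values are hypothetical
if(FALSE){

  toy_loadings <- cbind(c(0.6, 0.5, 0.0), c(0.0, 0.2, 0.7))
  toy_correlations <- matrix(c(1, 0.3, 0.3, 1), nrow = 2)

  # parameters = 3 * 2 + 3 + 1 = 10; model parameters = 10 - 2 zeros = 8
  compute_tefi_adjustment(toy_loadings, toy_correlations)

}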

# EGM MODELS ----

#' @noRd
# EGM | Standard ----
# Updated 06.11.2024
EGM.standard <- function(data, communities, structure, p.in, p.out, opt, constrained, ...)
{

  # Get ellipse
  ellipse <- list(...)

  # Get dimensions
  data_dimensions <- dim(data)
  dimension_names <- dimnames(data)

  # Determine if sample size was provided
  if(data_dimensions[1] == data_dimensions[2]){

    # Check if sample size was provided
    if("n" %in% names(ellipse)){

      # Set sample size
      data_dimensions[1] <- ellipse$n

      # Set empirical zero-order correlations
      empirical_R <- data

    }else{

      # Stop and send error
      .handleSimpleError(
        h = stop,
        msg = paste0(
          "A symmetric (", data_dimensions[1], " x ",
          data_dimensions[2], ") matrix was input but sample size was not provided.\n\n",
          "If you'd like to use a correlation matrix, please set the argument 'n' to your sample size"
        ),
        call = "EGM"
      )

    }

  }else{

    # Estimate zero-order correlations
    empirical_R <- auto.correlate(data, ...)

  }

  # Obtain partial correlations
  empirical_P <- cor2pcor(empirical_R)

  # Check for whether structure is provided
  if(is.null(structure)){

    # Obtain structure if not provided
    walktrap <- igraph::cluster_walktrap(convert2igraph(abs(empirical_P)))

    # Obtain memberships based on number of communities
    structure <- swiftelse(
      is.null(communities) || unique_length(walktrap$membership) == communities,
      walktrap$membership, cutree(as.hclust(walktrap), k = communities)
    )

  }

  # Set number of communities
  communities <- unique_length(structure)

  # Set up community variables
  community_variables <- lapply(
    seq_len(communities), function(community){structure == community}
  )

  # Obtain community structure
  community_P <- create_community_structure(
    P = empirical_P, total_variables = data_dimensions[2],
    communities = communities,
    community_variables = community_variables,
    p.in = p.in, p.out = p.out
  )

  # Estimate network loadings
  output <- silent_call(net.loads(community_P, structure, ...))
  output$std <- output$std[dimension_names[[2]],, drop = FALSE]

  # Keep a copy of the standard loadings before optimization overwrites 'output$std'
  standard_loadings <- output$std

  # Obtain dimension names
  loading_names <- dimnames(output$std)

  # Compute network scores
  standard_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Compute community correlations
  standard_correlations <- cor(standard_scores, use = "pairwise")

  # Compute model-implied correlations
  standard_R <- nload2cor(output$std)
  standard_P <- cor2pcor(standard_R)

  # Obtain loadings vector and get bounds
  loadings_vector <- as.vector(output$std)
  zeros <- loadings_vector != 0
  loadings_length <- length(loadings_vector)

  # Set up loading structure
  # Uses transpose for 2x speed up in optimization
  loading_structure <- matrix(FALSE, nrow = communities, ncol = data_dimensions[2])

  # Fill structure
  for(i in seq_along(structure)){
    loading_structure[structure[i], i] <- TRUE
  }

  # Optimize over loadings
  result <- silent_call(
    nlminb(
      start = loadings_vector, objective = srmr_N_cost,
      gradient = srmr_N_gradient,
      zeros = zeros, R = empirical_R,
      loading_structure = loading_structure,
      rows = communities, n = data_dimensions[1],
      constrained = constrained,
      lower_triangle = lower.tri(empirical_R),
      lower = rep(-1, loadings_length),
      upper = rep(1, loadings_length),
      control = list(eval.max = 10000, iter.max = 10000)
    )
  )

  # Set estimate
  result$estimate <- result$par

  # Extract optimized loadings
  optimized_loadings <- matrix(
    result$estimate, nrow = data_dimensions[2],
    dimnames = loading_names
  )

  # Replace loadings output with optimized loadings
  output$std <- optimized_loadings

  # Compute network scores
  optimized_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Compute community correlations
  optimized_correlations <- cor(optimized_scores, use = "pairwise")

  # Compute model-implied correlations
  optimized_R <- nload2cor(optimized_loadings)
  optimized_P <- cor2pcor(optimized_R)

  # Set up EGA
  ega_list <- list(
    dim.variables = data.frame(
      items = dimension_names[[2]],
      dimension = structure
    ),
    network = community_P, wc = structure,
    n.dim = unique_length(structure), correlation = empirical_R,
    n = data_dimensions[1], TEFI = tefi(empirical_R, structure)$VN.Entropy.Fit
  ); class(ega_list) <- "EGA"

  # Attach methods to network
  attr(ega_list$network, which = "methods") <- list(
    model = "egm", communities = communities, p.in = p.in, p.out = p.out
  )

  # Attach class to memberships
  class(ega_list$wc) <- "EGA.community"

  # Attach methods to memberships
  names(ega_list$wc) <- dimension_names[[2]]
  attr(ega_list$wc, which = "methods") <- list(
    algorithm = swiftelse(is.null(structure), NULL, "Walktrap"),
    objective_function = NULL
  )

  # Set up results
  results <- list(
    EGA = ega_list, structure = structure,
    model = list( # general list
      standard = list( # using standard parameters
        loadings = standard_loadings,
        scores = standard_scores,
        correlations = standard_correlations,
        fit = fit(
          n = data_dimensions[1], p = data_dimensions[2],
          R = standard_R, S = empirical_R,
          loadings = standard_loadings, correlations = standard_correlations,
          structure = structure, ci = 0.95
        ),
        implied = list(R = standard_R, P = standard_P)
      ),
      optimized = list( # using optimized parameters
        loadings = optimized_loadings,
        scores = optimized_scores,
        correlations = optimized_correlations,
        fit = fit(
          n = data_dimensions[1], p = data_dimensions[2],
          R = optimized_R, S = empirical_R,
          loadings = optimized_loadings, correlations = optimized_correlations,
          structure = structure, ci = 0.95
        ),
        implied = list(R = optimized_R, P = optimized_P)
      )
    )
  )

  # Set class
  class(results) <- "EGM"

  # Return results
  return(results)

}

#' @noRd
# EGM | Search ----
# Updated 06.11.2024
EGM.search <- function(data, communities, structure, p.in, opt, constrained, verbose, ...)
{

  # Build the parameter grid based on 'p.in'
  p_grid <- expand.grid(
    p_in = seq(p.in, 1, 0.05), p_out = seq(0.00, p.in, 0.05)
  )

  # Get number of parameter combinations
  p_number <- dim(p_grid)[1]

  # Fit name based on fit
  fit_name <- switch(
    opt,
    "aic" = "AIC", "bic" = "BIC",
    "cfi" = "CFI", "chisq" = "chisq",
    "loglik" = "logLik", "rmsea" = "RMSEA",
    "srmr" = "SRMR", "tefi" = "TEFI",
    "tefi.adj" = "TEFI.adj", "tli" = "TLI"
  )

  # Loop over grid search
  grid_search <- parallel_process(
    iterations = p_number, datalist = seq_len(p_number),
    FUN = function(i, data, p_grid, communities, structure, opt, constrained, ...){

      # First, try
      output <- silent_call(
        try(
          EGM.standard(
            data = data, communities = communities,
            structure = structure, p.in = p_grid$p_in[i],
            p.out = p_grid$p_out[i], opt = opt,
            constrained = constrained, ...
          ), silent = TRUE
        )
      )

      # Return result
      return(swiftelse(is(output, "try-error"), NULL, output))

    },
    # `EGM.standard` arguments
    data = data, p_grid = p_grid,
    communities = communities, structure = structure,
    opt = opt, constrained = constrained, ...,
    # `parallel_process` arguments
    ncores = 1, progress = verbose
  )

  # Obtain fits
  optimized_fits <- nvapply(
    grid_search, function(x){
      swiftelse(is.null(x), NA, x$model$optimized$fit[[fit_name]])
    }
  )

  # Check for all NA
  if(all(is.na(optimized_fits))){

    # Set error if no solution is found
    .handleSimpleError(
      h = stop,
      msg = paste0(
        "Search could not converge to any solutions. \n\n",
        "Try:\n",
        "  +  increasing 'p.in'\n",
        "  +  changing 'communities'\n",
        "  +  changing 'structure'"
      ),
      call = "EGM"
    )

  }

  # Index function based on fit
  index_FUN <- switch(
    opt,
    "aic" = which.min, "bic" = which.min,
    "cfi" = which.max, "chisq" = which.min,
    "loglik" = which.max, "rmsea" = which.min,
    "srmr" = which.min, "tefi" = which.min,
    "tefi.adj" = which.min, "tli" = which.max
  )

  # Obtain index
  opt_index <- index_FUN(optimized_fits)

  # Set up final model
  results <- grid_search[[opt_index]]

  # Add 'p.in' and 'p.out' parameters
  results$search <- c(
    p.in = p_grid[opt_index, "p_in"],
    p.out = p_grid[opt_index, "p_out"]
  )

  # Overwrite class
  class(results) <- "EGM"

  # Return results
  return(results)

}

#' @noRd
# EGM | EGA ----
# Updated 06.11.2024
EGM.EGA <- function(data, structure, opt, constrained, ...)
{

  # Obtain data dimensions
  data_dimensions <- dim(data)

  # Estimate EGA
  ega <- EGA(data, plot.EGA = FALSE, ...)
  empirical_P <- cor2pcor(ega$correlation)

  # Update number of rows based on EGA
  data_dimensions[1] <- ega$n

  # Obtain variable names from the network
  variable_names <- dimnames(ega$network)[[2]]

  # Set memberships based on structure
  if(!is.null(structure)){

    # Update EGA features
    ega$wc[] <- structure
    ega$dim.variables$dimension <- structure

  }else{
    structure <- ega$wc
  }

  # Set communities
  ega$n.dim <- communities <- unique_length(structure)

  # Obtain standard network loadings
  output <- silent_call(net.loads(A = ega$network, wc = ega$wc, ...))

  # Obtain standard loadings
  standard_loadings <- output$std[variable_names,, drop = FALSE]

  # Compute network scores
  standard_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Compute community correlations
  standard_correlations <- cor(standard_scores, use = "pairwise")

  # Obtain model-implied correlations
  standard_R <- nload2cor(standard_loadings)
  standard_P <- cor2pcor(standard_R)

  # Get loading dimensions
  dimensions <- dim(standard_loadings)
  dimension_names <- dimnames(standard_loadings)

  # Obtain loadings vector and get bounds
  loadings_vector <- as.vector(standard_loadings)
  zeros <- loadings_vector != 0
  loadings_length <- length(loadings_vector)

  # Set up loading structure
  # Uses transpose for 2x speed up in optimization
  loading_structure <- matrix(
    FALSE, nrow = dimensions[2],
    ncol = dimensions[1],
    dimnames = list(dimension_names[[2]], dimension_names[[1]])
  )

  # Fill structure
  for(i in seq_along(structure)){
    loading_structure[structure[i], i] <- TRUE
  }

  # Optimize over loadings
  result <- silent_call(
    nlminb(
      start = loadings_vector, objective = srmr_N_cost,
      gradient = srmr_N_gradient,
      zeros = zeros, R = ega$correlation,
      loading_structure = loading_structure,
      rows = communities, n = data_dimensions[1],
      constrained = constrained,
      lower_triangle = lower.tri(ega$correlation),
      lower = rep(-1, loadings_length),
      upper = rep(1, loadings_length),
      control = list(eval.max = 10000, iter.max = 10000)
    )
  )

  # Set estimate
  result$estimate <- result$par

  # Extract optimized loadings
  optimized_loadings <- matrix(
    result$estimate, nrow = dimensions[1],
    dimnames = dimension_names
  )

  # Replace loadings output with optimized loadings
  output$std <- optimized_loadings

  # Compute network scores
  optimized_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Compute community correlations
  optimized_correlations <- cor(optimized_scores, use = "pairwise")

  # Obtain model-implied correlations
  optimized_R <- nload2cor(optimized_loadings)
  optimized_P <- cor2pcor(optimized_R)

  # Set up results
  results <- list(
    EGA = ega, structure = structure,
    model = list( # general list
      standard = list( # using standard parameters
        loadings = standard_loadings,
        scores = standard_scores,
        correlations = standard_correlations,
        fit = fit(
          n = data_dimensions[1], p = data_dimensions[2],
          R = standard_R, S = ega$correlation,
          loadings = standard_loadings, correlations = standard_correlations,
          structure = ega$wc, ci = 0.95
        ),
        implied = list(R = standard_R, P = standard_P)
      ),
      optimized = list( # using optimized parameters
        loadings = optimized_loadings,
        scores = optimized_scores,
        correlations = optimized_correlations,
        fit = fit(
          n = data_dimensions[1], p = data_dimensions[2],
          R = optimized_R, S = ega$correlation,
          loadings = optimized_loadings,
          correlations = optimized_correlations,
          structure = ega$wc, ci = 0.95
        ),
        implied = list(R = optimized_R, P = optimized_P)
      )
    )
  )

  # Set class
  class(results) <- "EGM"

  # Return results
  return(results)

}

#' @noRd
# EGM | EGA search ----
# Updated 06.11.2024
EGM.EGA.search <- function(data, communities, structure, opt, constrained, verbose, ...)
{

  # Fit name based on fit
  fit_name <- switch(
    opt,
    "aic" = "AIC", "bic" = "BIC",
    "cfi" = "CFI", "chisq" = "chisq",
    "loglik" = "logLik", "rmsea" = "RMSEA",
    "srmr" = "SRMR", "tefi" = "TEFI",
    "tefi.adj" = "TEFI.adj", "tli" = "TLI"
  )

  # Get number of dimensions
  data_dimensions <- dim(data)

  # Put data through `EBICglasso.qgraph`
  glasso_output <- network.estimation(data, network.only = FALSE, ...)

  # Obtain attributes
  glasso_attr <- attributes(glasso_output$estimated_network)

  # Obtain fits
  fits <- lapply(
    seq_len(glasso_attr$methods$nlambda), function(i){

      # Convert to partial correlations
      P <- wi2net(glasso_output$output$results$wi[,,i])
      dimnames(P) <- glasso_attr$dimnames

      # Obtain organized output
      output <- try(
        silent_call(
          glasso_fit(
            data = data, S = glasso_output$output$S,
            glasso_attr = glasso_attr, P = P,
            data_dimensions = data_dimensions,
            constrained = constrained, opt = opt,
            ...
          )
        ), silent = TRUE
      )

      # Return output
      return(swiftelse(is(output, "try-error"), NULL, output))

    }
  )

  # Add names to fits
  names(fits) <- glasso_output$output$lambda

  # Obtain fit index
  fit_index <- nvapply(fits[!lvapply(fits, is.null)], function(x){
    return(x$model$optimized$fit[[fit_name]])
  })

  # Index function based on fit
  index_FUN <- switch(
    opt,
    "aic" = min, "bic" = min,
    "cfi" = max, "chisq" = min,
    "loglik" = max, "rmsea" = min,
    "srmr" = min, "tefi" = min,
    "tefi.adj" = min, "tli" = max
  )

  # Obtain optimum value
  opt_value <- index_FUN(fit_index)

  # Obtain index
  opt_index <- max(which(fit_index == opt_value))

  # Obtain lambda (as character)
  lambda <- names(fit_index)[[opt_index]]

  # Obtain results
  results <- fits[[lambda]]

  # Set up 'dim.variables' for EGA
  dim.variables <- fast.data.frame(
    c(dimnames(results$EGA$network)[[2]], results$structure),
    nrow = data_dimensions[2], ncol = 2,
    colnames = c("items", "dimension")
  )

  # Add to EGA results
  results$EGA$dim.variables <- dim.variables[order(dim.variables$dimension),]

  # Compute TEFI for EGA
  results$EGA$TEFI <- tefi(glasso_output$output$S, results$EGA$wc)$VN.Entropy.Fit

  # Set up network methods
  attributes(results$EGA$network) <- glasso_attr
  attributes(results$EGA$network)$methods[
    c("model.selection", "lambda", "criterion")
  ] <- list(
    model.selection = fit_name, lambda = as.numeric(lambda), criterion = opt_value
  )

  # Overwrite class
  class(results) <- "EGM"

  # Return results
  return(results)

}

# EGM UTILITIES ----

#' @noRd
# Creates community structure ----
# Updated 08.10.2024
create_community_structure <- function(
    P, total_variables, communities, community_variables, p.in, p.out
)
{

  # Set vectors of 'p.in' and 'p.out'
  p.in <- swiftelse(length(p.in) == 1, rep(p.in, communities), p.in)
  p.out <- swiftelse(length(p.out) == 1, rep(p.out, communities), p.out)

  # Set diagonal to missing
  diag(P) <- NA

  # Set community blocks
  for(i in seq_len(communities)){

    # Extract within-community block
    indices <- P[community_variables[[i]], community_variables[[i]]]

    # Check whether current density exceeds 'p.in'
    if(compute_density(indices) > p.in[i]){

      # Get lower triangle
      lower_triangle <- lower.tri(indices)

      # Set the lowest absolute edges to zero
      indices[
        abs(indices) < quantile(
          abs(indices[lower_triangle]), probs = 1 - p.in[i], na.rm = TRUE
        )
      ] <- 0

      # Set back into block
      P[community_variables[[i]], community_variables[[i]]] <- indices

    }

    # Set between-community indices (variables outside community i)
    indices <- P[community_variables[[i]], !community_variables[[i]]]

    # Check whether current density exceeds 'p.out'
    if(compute_density(indices) > p.out[i]){

      # Get threshold value
      threshold_value <- quantile(abs(indices), probs = 1 - p.out[i], na.rm = TRUE)

      # Set edges below the threshold to zero
      indices[abs(indices) < threshold_value] <- 0

      # Set back into block
      P[community_variables[[i]], !community_variables[[i]]] <- indices

      # Mirror on the other side to keep the matrix symmetric
      indices <- P[!community_variables[[i]], community_variables[[i]]]

      # Set edges below the threshold to zero
      indices[abs(indices) < threshold_value] <- 0

      # Set back into block
      P[!community_variables[[i]], community_variables[[i]]] <- indices

    }

  }

  # Set diagonal to zero
  diag(P) <- 0

  # Return community network
  return(P)

}
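
# Sketch: how the 'p.in'/'p.out' thresholding works ----
# Kept unevaluated. `create_community_structure` keeps roughly the strongest
# `p.in` (or `p.out`) proportion of edges in a block by zeroing everything
# below the `quantile(abs(x), probs = 1 - p)` cutoff. Toy values are hypothetical
if(FALSE){

  # Ten absolute edge weights
  edges <- c(0.02, 0.05, 0.08, 0.10, 0.12, 0.15, 0.20, 0.25, 0.30, 0.35)

  # Keep ~80% of edges: zero those below the 20th percentile
  cutoff <- quantile(edges, probs = 1 - 0.80)
  edges[edges < cutoff] <- 0

}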

#' @noRd
# Computes density ----
# Updated 08.10.2024
compute_density <- function(network)
{

  # Get dimensions
  dimensions <- dim(network)

  # Obtain total number of cells
  cells <- prod(dimensions)

  # Check for square (within-community) block
  if(dimensions[1] == dimensions[2]){

    # Total possible edges (off-diagonal pairs, counted once)
    total_possible <- (cells - dimensions[2]) / 2

    # Compute density (proportion of nonzero edges)
    return((total_possible - (sum(network == 0, na.rm = TRUE) / 2)) / total_possible)

  }else{ # Otherwise, treat as rectangular (between-community) block

    # Compute density (proportion of nonzero edges)
    return((cells - sum(network == 0, na.rm = TRUE)) / cells)

  }

}
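
# Sketch: `compute_density` on toy blocks ----
# Kept unevaluated. Density is the proportion of nonzero edges: a square
# (within-community) block counts each off-diagonal pair once, while a
# rectangular (between-community) block counts every cell. Toy matrices are
# hypothetical
if(FALSE){

  # Square block: 3 possible edges, 2 nonzero -> 2/3
  square_block <- matrix(
    c(NA, 0.4, 0, 0.4, NA, 0.3, 0, 0.3, NA), nrow = 3
  )
  compute_density(square_block)

  # Rectangular block: 6 cells, 2 nonzero -> 1/3
  rectangular_block <- matrix(c(0.2, 0, 0, 0, 0.1, 0), nrow = 2)
  compute_density(rectangular_block)

}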

#' @noRd
# GLASSO fit ----
# Updated 06.11.2024
glasso_fit <- function(data, S, glasso_attr, P, data_dimensions, constrained, opt, ...)
{

  # Obtain structure based on community detection
  structure <- community.detection(P, ...)

  # Obtain communities
  communities <- unique_length(structure)

  # Initialize loadings
  output <- silent_call(net.loads(A = P, wc = structure, ...))
  standard_loadings <- output$std[
    glasso_attr$dimnames[[2]], seq_len(communities), drop = FALSE
  ]

  # Obtain scores
  standard_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Extract standard correlations
  standard_correlations <- cor(standard_scores, use = "pairwise")

  # Obtain model-implied correlations
  standard_R <- nload2cor(standard_loadings)
  standard_P <- cor2pcor(standard_R)

  # Get dimension names
  dimensions <- dim(standard_loadings)
  dimension_names <- dimnames(standard_loadings)

  # Obtain loadings vector and get bounds
  loadings_vector <- as.vector(standard_loadings)
  zeros <- loadings_vector != 0
  loadings_length <- length(loadings_vector)

  # Set up loading structure
  # Uses transpose for 2x speed up in optimization
  loading_structure <- matrix(
    FALSE, nrow = dimensions[2],
    ncol = dimensions[1],
    dimnames = list(dimension_names[[2]], dimension_names[[1]])
  )

  # Fill structure
  for(i in seq_along(structure)){
    loading_structure[structure[i], i] <- TRUE
  }

  # Optimize over loadings
  result <- try(
    nlminb(
      start = loadings_vector, objective = srmr_N_cost,
      gradient = srmr_N_gradient,
      zeros = zeros, R = S,
      loading_structure = loading_structure,
      rows = communities, n = data_dimensions[1],
      constrained = constrained, lower_triangle = lower.tri(S),
      lower = rep(-1, loadings_length),
      upper = rep(1, loadings_length),
      control = list(eval.max = 10000, iter.max = 10000)
    ), silent = TRUE
  )

  # Check for error
  if(is(result, "try-error")){
    stop("bad result")
  }

  # Set estimate
  result$estimate <- result$par

  # Extract optimized loadings
  optimized_loadings <- matrix(
    result$estimate, nrow = dimensions[1],
    dimnames = dimension_names
  )

  # Update output
  output$std <- optimized_loadings

  # Extract optimized scores
  optimized_scores <- compute_scores(output, data, "network", "simple")$std.scores

  # Extract optimized correlations
  optimized_correlations <- cor(optimized_scores, use = "pairwise")

  # Obtain model-implied correlations
  optimized_R <- nload2cor(optimized_loadings)
  optimized_P <- cor2pcor(optimized_R)

  # Set up EGA object
  ega <- list(
    network = P, wc = structure, n.dim = communities,
    correlation = S, n = data_dimensions[1]
  ); class(ega) <- "EGA"

  # Set up results
  return(
    list(
      EGA = ega, structure = structure,
      model = list( # general list
        standard = list( # using standard parameters
          loadings = standard_loadings,
          scores = standard_scores,
          correlations = standard_correlations,
          fit = fit(
            n = data_dimensions[1], p = data_dimensions[2],
            R = standard_R, S = ega$correlation,
            loadings = standard_loadings, correlations = standard_correlations,
            structure = ega$wc, ci = 0.95
          ),
          implied = list(R = standard_R, P = standard_P)
        ),
        optimized = list( # using optimized parameters
          loadings = optimized_loadings,
          scores = optimized_scores,
          correlations = optimized_correlations,
          fit = fit(
            n = data_dimensions[1], p = data_dimensions[2],
            R = optimized_R, S = ega$correlation,
            loadings = optimized_loadings,
            correlations = optimized_correlations,
            structure = ega$wc, ci = 0.95
          ),
          implied = list(R = optimized_R, P = optimized_P)
        )
      )
    )
  )

}