R/full_factor.R

Defines functions full_factor

Documented in full_factor

#' Factor analysis (PCA)
#'
#' @details See \url{https://radiant-rstats.github.io/docs/multivariate/full_factor.html} for an example in Radiant
#'
#' @param dataset Dataset
#' @param vars Variables to include in the analysis
#' @param method Factor extraction method to use
#' @param nr_fact Number of factors to extract
#' @param rotation Apply varimax rotation or no rotation ("varimax" or "none")
#' @param hcor Use polycor::hetcor to calculate the correlation matrix
#' @param data_filter Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "price > 10000")
#' @param envir Environment to extract data from
#'
#' @return A list with all variables defined in the function as an object of class full_factor
#'
#' @examples
#' full_factor(shopping, "v1:v6") %>% str()
#'
#' @seealso \code{\link{summary.full_factor}} to summarize results
#' @seealso \code{\link{plot.full_factor}} to plot results
#'
#' @importFrom psych principal fa factor.scores
#' @importFrom GPArotation quartimax oblimin simplimax
#' @importFrom polycor hetcor
#' @importFrom psych scoreIrt
#'
#' @export
full_factor <- function(dataset, vars, method = "PCA", hcor = FALSE, nr_fact = 1,
                        rotation = "varimax", data_filter = "",
                        envir = parent.frame()) {
  df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset))
  dataset <- get_data(dataset, vars, filt = data_filter, envir = envir) %>%
    mutate_if(is.Date, as.numeric)
  rm(envir)

  ## in case : is used
  if (length(vars) < ncol(dataset)) {
    vars <- colnames(dataset)
  }

  anyCategorical <- sapply(dataset, function(x) is.numeric(x) || is.Date(x)) == FALSE
  nrObs <- nrow(dataset)
  nrFac <- max(1, as.numeric(nr_fact))
  if (nrFac > ncol(dataset)) {
    return("The number of factors cannot exceed the number of variables" %>%
      add_class("full_factor"))
    nrFac <- ncol(dataset)
  }

  if (hcor) {
    cmat <- try(sshhr(polycor::hetcor(dataset, ML = FALSE, std.err = FALSE)), silent = TRUE)
    dataset <- mutate_all(dataset, radiant.data::as_numeric)
    if (inherits(cmat, "try-error")) {
      warning("Calculating the heterogeneous correlation matrix produced an error.\nUsing standard correlation matrix instead")
      hcor <- "Calculation failed"
      cmat <- cor(dataset)
    } else {
      cmat <- cmat$correlations
    }
  } else {
    dataset <- mutate_all(dataset, radiant.data::as_numeric)
    cmat <- cor(dataset)
  }

  if (method == "PCA") {
    fres <- psych::principal(
      cmat,
      nfactors = nrFac, rotate = rotation, scores = FALSE,
      oblique.scores = FALSE
    )
    m <- fres$loadings[, colnames(fres$loadings)]
    cscm <- m %*% solve(crossprod(m))
    fres$scores <- as.matrix(mutate_all(dataset, radiant.data::standardize)) %*% cscm
  } else {
    fres <- try(psych::fa(cmat, nfactors = nrFac, rotate = rotation, oblique.scores = FALSE, fm = "ml"), silent = TRUE)
    if (inherits(fres, "try-error")) {
      return(
        "An error occured. Increase the number of variables or reduce the number of factors" %>%
          add_class("full_factor")
      )
    }
    if (sum(anyCategorical) == ncol(dataset) && isTRUE(hcor)) {
      ## necessary to deal with psych::irt.tau qnorm issue
      isMaxMinOne <- sapply(dataset, function(x) (max(x, na.rm = TRUE) - min(x, na.rm = TRUE) == 1))
      dataset <- mutate_if(dataset, isMaxMinOne, ~ (. - min(., na.rm = TRUE)) / (max(., na.rm = TRUE) - min(., na.rm = TRUE)))
      .irt.tau <- function() {
        tau <- psych::irt.tau(dataset)
        m <- fres$loadings[, colnames(fres$loadings), drop = FALSE]
        nf <- dim(m)[2]
        max_dat <- max(dataset)
        min_dat <- min(dataset)
        if (any(tau == Inf)) {
          tau[tau == Inf] <- max((max_dat - min_dat) * 5, 4)
          warning("Tau values of Inf found. Adjustment applied")
        }
        if (any(tau == -Inf)) {
          tau[tau == -Inf] <- min(-(max_dat - min_dat) * 5, -4)
          warning("Tau values of -Inf found. Adjustment applied")
        }
        diffi <- list()
        for (i in 1:nf) diffi[[i]] <- tau / sqrt(1 - m[, i]^2)
        discrim <- m / sqrt(1 - m^2)
        new.stats <- list(difficulty = diffi, discrimination = discrim)
        psych::score.irt.poly(new.stats, dataset, cut = 0, bounds = c(-4, 4))
      }
      scores <- try(.irt.tau(), silent = TRUE)
      # scores <- psych::scoreIrt(fres, dataset, cut = 0)
      rm(.irt.tau)
      if (inherits(scores, "try-error")) {
        return(
          paste0("An error occured estimating latent factor scores using psychIrt. The error message was:\n\n", attr(scores, "condition")$message) %>% add_class("full_factor")
        )
      } else {
        fres$scores <- apply(scores[, 1:nrFac, drop = FALSE], 2, radiant.data::standardize)
        rm(scores)
        colnames(fres$scores) <- colnames(fres$loadings)
      }
    } else {
      fres$scores <- psych::factor.scores(as.matrix(dataset), fres, method = "Thurstone")$scores
    }
  }

  ## convert loadings object to data.frame
  floadings <-
    fres$loadings %>%
    {
      dn <- dimnames(.)
      matrix(., nrow = length(dn[[1]])) %>%
        set_colnames(., dn[[2]]) %>%
        set_rownames(., dn[[1]]) %>%
        data.frame(stringsAsFactors = FALSE)
    }

  as.list(environment()) %>% add_class("full_factor")
}

#' Summary method for the full_factor function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/multivariate/full_factor.html} for an example in Radiant
#'
#' @param object Return value from \code{\link{full_factor}}
#' @param cutoff Show only loadings with (absolute) values above cutoff (default = 0)
#' @param fsort Sort factor loadings
#' @param dec Number of decimals to show
#' @param ... further arguments passed to or from other methods
#'
#' @examples
#' result <- full_factor(shopping, "v1:v6", nr_fact = 2)
#' summary(result)
#' summary(result, cutoff = .5, fsort = TRUE)
#'
#' @seealso \code{\link{full_factor}} to calculate results
#' @seealso \code{\link{plot.full_factor}} to plot results
#'
#' @importFrom psych fa.sort
#'
#' @export
summary.full_factor <- function(object, cutoff = 0, fsort = FALSE,
                                dec = 2, ...) {
  if (is.character(object)) {
    return(cat(object))
  }

  cat("Factor analysis\n")
  cat("Data        :", object$df_name, "\n")
  if (!is.empty(object$data_filter)) {
    cat("Filter      :", gsub("\\n", "", object$data_filter), "\n")
  }
  cat("Variables   :", paste0(object$vars, collapse = ", "), "\n")
  cat("Factors     :", object$nr_fact, "\n")
  cat("Method      :", object$method, "\n")
  cat("Rotation    :", object$rotation, "\n")
  cat("Observations:", format_nr(object$nrObs, dec = 0), "\n")
  if (is.character(object$hcor)) {
    cat(paste0("Correlation : Pearson (adjustment using polycor::hetcor failed)\n"))
  } else if (isTRUE(object$hcor)) {
    if (sum(object$anyCategorical) > 0) {
      cat(paste0("Correlation : Heterogeneous correlations using polycor::hetcor\n"))
    } else {
      cat(paste0("Correlation : Pearson\n"))
    }
  } else {
    cat("Correlation : Pearson\n")
  }
  if (sum(object$anyCategorical) > 0) {
    if (isTRUE(object$hcor)) {
      cat("** Variables of type {factor} are assumed to be ordinal **\n")
      if (object$method == "PCA") {
        cat("** Factor scores are biased when using PCA when one or more {factor} variables are included **\n\n")
      } else if (sum(object$anyCategorical) == length(object$vars)) {
        cat("** Factor scores calculated using psych::scoreIrt **\n\n")
      } else if (sum(object$anyCategorical) < length(object$vars)) {
        cat("** Factor scores are biased when a mix of {factor} and numeric variables are used **\n\n")
      }
    } else {
      cat("** Variables of type {factor} included without adjustment **\n\n")
    }
  } else if (isTRUE(object$hcor)) {
    cat("** No variables of type {factor} selected. No adjustment applied **\n\n")
  } else {
    cat("\n")
  }

  cat("Factor loadings:\n")

  ## show only the loadings > cutoff
  clean_loadings(object$floadings, cutoff = cutoff, fsort = fsort, dec = dec, repl = "") %>%
    print()

  ## fit measures
  cat("\nFit measures:\n")
  colSums(object$floadings^2) %>%
    rbind(., 100 * (. / nrow(object$floadings))) %>%
    rbind(., cumsum(.[2, ])) %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    format_df(dec = dec) %>%
    set_rownames(c("Eigenvalues", "Variance %", "Cumulative %")) %>%
    print()

  # results from psych - uncomment to validate results
  # object$fres$loadings %>%
  # { if (fsort) psych::fa.sort(.) else . } %>%
  # print(cutoff = cutoff, digits = 2)

  cat("\nAttribute communalities:")
  data.frame(1 - object$fres$uniqueness, stringsAsFactors = FALSE) %>%
    format_df(dec = dec, perc = TRUE) %>%
    set_rownames(object$vars) %>%
    set_colnames("") %>%
    print()

  cat("\nFactor scores (max 10 shown):\n")
  as.data.frame(object$fres$scores, stringsAsFactors = FALSE) %>%
    .[1:min(nrow(.), 10), , drop = FALSE] %>%
    format_df(dec = dec) %>%
    print(row.names = FALSE)
}

#' Plot method for the full_factor function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/multivariate/full_factor.html} for an example in Radiant
#'
#' @param x Return value from \code{\link{full_factor}}
#' @param plots Include attribute ("attr"), respondents ("resp") or both in the plot
#' @param shiny Did the function call originate inside a shiny app
#' @param custom Logical (TRUE, FALSE) to indicate if ggplot object (or list of ggplot objects) should be returned. This option can be used to customize plots (e.g., add a title, change x and y labels, etc.). See examples and \url{https://ggplot2.tidyverse.org/} for options.
#' @param ... further arguments passed to or from other methods
#'
#' @examples
#' result <- full_factor(shopping, "v1:v6", nr_fact = 2)
#' plot(result)
#'
#' @seealso \code{\link{full_factor}} to calculate results
#' @seealso \code{\link{plot.full_factor}} to plot results
#'
#' @importFrom ggrepel geom_text_repel
#' @importFrom rlang .data
#'
#' @export
plot.full_factor <- function(x, plots = "attr", shiny = FALSE, custom = FALSE, ...) {

  ## when no analysis was conducted (e.g., no variables selected)
  if (is.character(x)) {
    return(plot(x = 1, type = "n", main = x, axes = FALSE, xlab = "", ylab = ""))
  } else if (x$fres$factors < 2) {
    x <- "Plots require two or more factors"
    return(plot(x = 1, type = "n", main = x, axes = FALSE, xlab = "", ylab = ""))
  }

  df <- x$floadings
  scores <- as.data.frame(x$fres$scores)
  plot_scale <- if ("resp" %in% plots) max(scores) else 1
  rnames <- rownames(df)
  cnames <- colnames(df)
  plot_list <- list()
  for (i in 1:(length(cnames) - 1)) {
    for (j in (i + 1):length(cnames)) {
      i_name <- cnames[i]
      j_name <- cnames[j]
      df2 <- cbind(df[, c(i_name, j_name)], rnames)

      p <- ggplot(df2, aes(x = .data[[i_name]], y = .data[[j_name]])) +
        theme(legend.position = "none") +
        coord_cartesian(xlim = c(-plot_scale, plot_scale), ylim = c(-plot_scale, plot_scale)) +
        geom_vline(xintercept = 0) +
        geom_hline(yintercept = 0)

      if ("resp" %in% plots) {
        p <- p + geom_point(data = scores, aes(x = .data[[i_name]], y = .data[[j_name]]), alpha = 0.5)
      }

      if ("attr" %in% plots) {
        p <- p + geom_point(aes(color = .data$rnames)) +
          ggrepel::geom_text_repel(aes(color = .data$rnames, label = .data$rnames)) +
          geom_segment(
            aes(x = 0, y = 0, xend = .data[[i_name]], yend = .data[[j_name]], color = .data$rnames),
            linewidth = 0.5, linetype = "dashed", alpha = 0.5
          )
      }

      plot_list[[paste0(i_name, "_", j_name)]] <- p
    }
  }

  if (length(plot_list) > 0) {
    if (custom) {
      if (length(plot_list) == 1) plot_list[[1]] else plot_list
    } else {
      patchwork::wrap_plots(plot_list, ncol = min(length(plot_list), 2)) %>%
        (function(x) if (shiny) x else print(x))
    }
  }
}

#' Store factor scores to active dataset
#'
#' @details See \url{https://radiant-rstats.github.io/docs/multivariate/full_factor.html} for an example in Radiant
#'
#' @param dataset Dataset to append to factor scores to
#' @param object Return value from \code{\link{full_factor}}
#' @param name Name of factor score variables
#' @param ... Additional arguments
#'
#' @examples
#' full_factor(shopping, "v1:v6", nr_fact = 3) %>%
#'   store(shopping, .) %>%
#'   head()
#'
#' @seealso \code{\link{full_factor}} to generate results
#' @seealso \code{\link{summary.full_factor}} to summarize results
#' @seealso \code{\link{plot.full_factor}} to plot results
#'
#' @export
store.full_factor <- function(dataset, object, name = "", ...) {
  if (is.empty(name)) name <- "factor"
  fscores <- as.data.frame(object$fres$scores, stringsAsFactors = FALSE)
  indr <- indexr(dataset, object$vars, object$data_filter)
  fs <- data.frame(matrix(NA, nrow = indr$nr, ncol = ncol(fscores)), stringsAsFactors = FALSE)
  fs[indr$ind, ] <- fscores
  dataset[, sub("^[a-zA-Z]+([0-9]+)$", paste0(name, "\\1"), colnames(fscores))] <- fs
  dataset
}

#' Sort and clean loadings
#'
#' @details See \url{https://radiant-rstats.github.io/docs/multivariate/full_factor.html} for an example in Radiant
#'
#' @param floadings Data frame with loadings
#' @param fsort Sort factor loadings
#' @param cutoff Show only loadings with (absolute) values above cutoff (default = 0)
#' @param dec Number of decimals to show
#' @param repl Replace loadings below the cutoff by NA (or "")
#'
#' @examples
#' result <- full_factor(shopping, "v1:v6", nr_fact = 2)
#' clean_loadings(result$floadings, fsort = TRUE, cutoff = .5, dec = 2)
#'
#' @importFrom psych fa.sort
#'
#' @export
clean_loadings <- function(floadings, cutoff = 0, fsort = FALSE, dec = 8, repl = NA) {
  if (fsort) {
    floadings <- select(psych::fa.sort(floadings), -order)
  }

  if (cutoff == 0) {
    floadings %<>% round(dec)
  } else {
    ind <- abs(floadings) < cutoff
    floadings %<>% round(dec)
    floadings[ind] <- repl
  }
  floadings
}
radiant-rstats/radiant.multivariate documentation built on Nov. 29, 2023, 9:52 p.m.