# R/factor_analysis.R
#
# Defines functions: fa_nfactors fa_loadings fa_congruence_matrix fa_splitsample_repeat fa_CAFL fa_robust_fa fa_robust_cormatrix fa_rank_fa fa_mixedness fa_all_methods fa_CFS fa_MAR fa_residuals
#
# Documented in: fa_all_methods fa_CAFL fa_CFS fa_congruence_matrix fa_loadings fa_MAR fa_mixedness fa_nfactors fa_rank_fa fa_residuals fa_robust_cormatrix fa_robust_fa fa_splitsample_repeat

#' Find residuals on case-level basis for all indicators in a factor analysis.
#'
#' Extracts the first factor, then uses the factor scores to predict the indicator values for each indicator and for each case. Returns a data.frame with residuals.
#' @param data (data.frame) The data.
#' @param standardize (log scalar) Whether to standardize the residuals (default true). If not done, they may not have the same standard deviation.
#' @param ... (arguments to fa) Further arguments to \code{\link{fa}}.
#' @export
#' @examples
#' fa_residuals(iris[-5])
fa_residuals = function(data, standardize = T, ...) {

  #standardize the input and extract the first factor
  data = df_standardize(data)
  fa = psych::fa(data, ...)

  #copy of the data with the factor scores attached as a predictor column
  data2 = data
  data2$factor.scores = as.numeric(fa$scores)

  #empty container for the residuals, with the input's dimnames
  resids.table = data.frame(matrix(nrow = nrow(data), ncol = ncol(data)))
  colnames(resids.table) = colnames(data)
  rownames(resids.table) = rownames(data)

  #regress each indicator on the factor scores; keep the residuals
  for (indicator in colnames(data)) {
    model = lm(str_c(indicator, " ~ factor.scores"), data2, na.action = "na.exclude")
    indicator_resids = residuals(model)

    #optionally standardize so all indicators have the same spread
    #(the parameter shadows the standardize() function, but R resolves
    #function calls separately from variables, so this is safe)
    if (standardize) indicator_resids = standardize(indicator_resids)

    resids.table[, indicator] = indicator_resids
  }

  resids.table
}

#' Calculates the mean absolute residual at the case-level.
#'
#' Extracts the first factor, then uses the factor scores to predict the indicator values for each indicator and for each case. Returns a data.frame with residuals. Then calculates the mean absolute value of these by case.
#' @param data a data.frame.
#' @param sort whether to sort the results or not. Defaults to true.
#' @param ... Further arguments passed to \code{\link{fa_residuals}}.
#' @export
#' @examples
#' fa_MAR(iris[-5])
fa_MAR = function(data, sort = T, ...) {
  #case-level residuals from a 1-factor analysis
  resids = fa_residuals(data, ...)

  #mean absolute residual for each case
  MAR = apply(resids, 1, function(case_resids) mean(abs(case_resids)))

  #optionally put the most aberrant cases first
  if (sort) MAR = sort(MAR, decreasing = T)

  data.frame(MAR = MAR)
}


#' Change in factor size by case exclusion
#'
#' For finding problematic cases in factor analysis. Runs FA on the dataset without each case, in turn. Returns the proportion of variance explained for each subset analysis and the difference from the full-sample analysis.
#'
#' Returns a numerical vector of the change in the size of the first factor by removing that case.
#' @param data A data.frame.
#' @param sort Whether to sort the results or not. Defaults to true.
#' @param include_full_sample Whether to include the 'case' with the full sample. Defaults to true.
#' @export
#' @examples
#' fa_CFS(iris[-5])
fa_CFS = function(data, sort = T, include_full_sample = T) {
  #results: one row per case plus one row for the full sample
  #columns: proportion of variance explained, and change vs. full sample (IPV)
  prop.vars = as.data.frame(matrix(nrow = nrow(data) + 1, ncol = 2))
  colnames(prop.vars) = c("Prop.var%", "IPV")

  #full-sample analysis
  fa = psych::fa(data) #factor analyze
  prop.var = mean(fa$communality) #proportion of variance accounted for
  prop.vars[nrow(prop.vars), ] = c(prop.var, 0)
  rownames(prop.vars)[nrow(prop.vars)] = "All cases"

  #leave-one-out analysis for each case
  for (case in seq_len(nrow(data))) {
    data2 = data[-case, ] #subset without that case

    #bug fix: the old code assigned into prop.vars inside the error handler,
    #which only modified a handler-local copy; instead let tryCatch() return
    #the row values (or NAs on failure) and assign them here
    row_vals = tryCatch({
      fa = psych::fa(data2)
      prop.var = round(mean(fa$communality), 3)
      c(prop.var, prop.var - prop.vars["All cases", 1])
    },
    error = function(e) c(NA, NA)
    )
    prop.vars[case, ] = row_vals

    rownames(prop.vars)[case] = rownames(data)[case] #set rowname
  }

  #sort by the change in factor size?
  if (sort) {
    prop.vars = prop.vars[order(prop.vars[, 2], decreasing = T), ]
  }

  #bug fix: after sorting, the full-sample row is not necessarily last,
  #so drop it by name rather than by position
  if (!include_full_sample) {
    prop.vars = prop.vars[rownames(prop.vars) != "All cases", ]
  }

  return(prop.vars)
}

#' Factor analyze with all methods
#'
#' Runs factor analysis to extract a single factor on a dataset with all possible combinations of extraction and scoring methods. Returns a list with all scores and loadings for further use.
#' @param DF A data.frame to extract factors from.
#' @param ... Parameters to fa().
#' @param skip_methods A character vector of methods to skip. Defaults to none.
#' @export
#' @examples
#' fa_all_methods(iris[-5])
fa_all_methods = function(DF, ..., skip_methods = "", messages = T) {

  #scoring and extraction methods supported by psych::fa()
  score.methods = c("regression", "Thurstone", "tenBerge", "Anderson", "Bartlett")
  extraction.methods = c("minres", "ols", "wls", "gls", "pa", "ml", "minchi", "minrank")
  #all combinations of the above: 5 scoring x 8 extraction methods
  perms = as.matrix(expand.grid(seq_along(score.methods), seq_along(extraction.methods)))

  scores = data.frame(matrix(nrow = nrow(DF), ncol = 0))
  loadings = data.frame(matrix(nrow = ncol(DF), ncol = 0))

  #main loop over every scoring x extraction combination
  for (row in seq_len(nrow(perms))) {
    #combination name
    score_meth_name = score.methods[perms[row, 1]]
    extract_meth_name = extraction.methods[perms[row, 2]]
    name = str_c(score_meth_name, "_", extract_meth_name)
    silence({message(stringr::str_c(row, " out of ", nrow(perms), " - ", name))}, messages = messages)

    #skip methods
    #bug fix: identical() keeps the condition length 1 even when a vector
    #of patterns is supplied (length > 1 conditions error in modern R)
    if (!identical(skip_methods, "")) {
      if (any(str_detect(name, skip_methods))) {
        silence({message(str_c("Skipping method: ", name))}, messages = messages)
        next
      }
    }

    #analyze; return NULL explicitly on success so err is a character
    #string only when extraction actually failed
    err = tryCatch({
      silence({.fa = psych::fa(DF, fm = extract_meth_name, scores = score_meth_name)}, messages = messages)
      NULL
    },
    error = function(e) "error"
    )

    #skip on error
    #bug fix: the NA columns are recorded here; in the old code they were
    #assigned inside the error handler, which only changed handler-local copies
    if (is.character(err)) {
      scores[, name] = NA
      loadings[, name] = NA
      silence({message(str_c("Skipping ", name, " due to extraction error"))}, messages = messages)
      next
    }

    #skip on Heywood case (any loading outside [-1, 1])
    if (any(abs(as.vector(.fa$loadings)) > 1)) {
      silence({message(stringr::str_c("Heywood case found for ", name))}, messages = messages)
      next
    }

    #save results; scores are stored per combination, loadings per extraction
    #method only, since the scoring method does not change the loadings
    silence({message(stringr::str_c("Saving results from ", name))}, messages = messages)
    scores[, name] = as.vector(.fa$scores)
    loadings[, extract_meth_name] = as.vector(.fa$loadings)
  }

  return(list(scores = scores,
              loadings = loadings))
}


#' Calculate mixedness metrics
#'
#' Returns 5 metrics that attempt to identify cases that are structural outliers/mixed in their structure.
#'
#' @details
#' MAR, mean absolute residuals. Measures how well indicator scores can be predicted from the factor scores.
#'
#' CFS, change in factor size. Measures how much the factor size changes with direction.
#'
#' ACFS, absolute change in factor size. Measures how much the factor size changes without direction.
#'
#' MeanALC, mean absolute loading change. Measures how much loadings are affected in general.
#'
#' MaxALC, max absolute loading change. Measures the maximal loading change.
#' @param df A data.frame to calculate mixedness metrics for.
#' @param ... Parameters to \code{\link{fa_MAR}}. These get passed on to \code{\link{fa}}.
#' @export
#' @examples
#' fa_mixedness(iris[-5])
fa_mixedness = function(df, ...){

  #check if colnames contain illegal characters
  #(names end up in regression formulas downstream, where spaces/& break parsing)
  if (any(stringr::str_detect(colnames(df), " "))) stop("Colnames contain spaces. Remove and try again.")
  if (any(stringr::str_detect(colnames(df), "&"))) stop("Colnames contain ambersands. Remove and try again.")

  #for results: one row per case; metric columns are added below
  return_df = data.frame(matrix(nrow=nrow(df), ncol=0))

  #MAR: mean absolute residual per case from a 1-factor analysis
  return_df$MAR = fa_MAR(df, sort = F, ...) %>% unlist %>% as.vector

  #full-sample baseline: factor size (mean communality) and loadings
  fa = psych::fa(df, ...) #factor analyze
  prop_var_full = mean(fa$communality)
  loads_full = as.numeric(fa$loadings)

  #leave-one-out subsets, one per case
  each_subset = get_each_subset_minus_1(df)

  #perform fa on each subset
  #NOTE(review): unqualified fa() here — presumably relies on psych being
  #attached; confirm, or qualify as psych::fa
  each_fa = lapply(each_subset, fa, ...)

  #factor size (mean communality) from each subset analysis
  each_propVar = plyr::ldply(each_fa, function(x) {
    return(as.numeric(mean(x$communality)))
  })

  #CFS: signed change in factor size relative to the full sample
  return_df$CFS = (each_propVar - prop_var_full) %>% unlist %>% as.vector

  #ACFS: unsigned change in factor size
  return_df$ACFS = abs(return_df$CFS)

  #loadings from each subset analysis (one row per subset)
  each_loadingset = plyr::ldply(each_fa, function(x) {
    return(as.numeric(x$loadings))
  })

  #detect and fix reversed factors: a subset factor whose loadings correlate
  #negatively with the full-sample loadings is flipped back
  reversed = apply(each_loadingset, 1, function(x) {cor(x, loads_full)}) < 0
  each_loadingset[which(reversed), ] = each_loadingset[which(reversed), ] * -1

  #change in loadings: appears to subtract the full-sample loading vector from
  #each row via a double transpose — TODO confirm the pipe parses as intended
  each_loadingset_change = each_loadingset %>% t %>% - loads_full %>% t

  #MeanALC: mean absolute loading change per case
  return_df$MeanALC = each_loadingset_change %>% abs %>% apply(., 1, mean)

  #MaxALC: largest absolute loading change per case
  return_df$MaxALC = each_loadingset_change %>% abs %>% apply(., 1, max)

  #carry the case names over to the results
  rownames(return_df) = rownames(df)

  return(return_df)
}



#' Rank order factor analysis
#'
#'
#' Runs factor analysis on a rank-ordered dataset which prevents outliers from having strong effects.
#' @param x A data.frame to extract factors from.
#' @param ... Parameters to fa().
#' @export
#' @examples
#' fa_rank_fa(iris[-5])
fa_rank_fa = function(x, ...) {

  #convert each column to ranks so outliers cannot dominate the solution
  ranked = as.data.frame(apply(x, 2, rank))
  rownames(ranked) = rownames(x)
  colnames(ranked) = colnames(x)

  #factor analyze the rank-transformed data
  psych::fa(ranked, ...)
}

#' Robust correlation matrix
#'
#'
#' Returns a correlation matrix with robust correlations. These are derived from either rlm() [MASS] or lmrob() [robustbase].
#' @param x A data.frame to correlate.
#' @param method Which robust method to use. Options are "lmrob" [robustbase] or "rlm" [MASS]. Defaults to the first.
#' @export
#' @examples
#' fa_robust_cormatrix(iris[-5])
fa_robust_cormatrix = function(x, method = "lmrob") {
  #bug fix: validate method up front; an unrecognized value previously
  #surfaced later as a confusing "object 'r_fit' not found" error
  method = match.arg(method, c("lmrob", "rlm"))

  #standardize so the regression slopes are on the correlation scale
  x.std = as.data.frame(scale(x))

  #all unordered pairs of variable indices
  combos = gtools::combinations(ncol(x), 2)

  #df for the resulting correlation matrix
  r_mat = as.data.frame(matrix(nrow = ncol(x), ncol = ncol(x)))
  rownames(r_mat) = colnames(r_mat) = colnames(x)

  #fit a robust regression in both directions for each pair
  #(robust slopes are not symmetric, so we average the two)
  for (row_idx in seq_len(nrow(combos))) {
    pred = colnames(x)[combos[row_idx, 1]]
    outcome = colnames(x)[combos[row_idx, 2]]

    #model formulas, both directions
    model = str_c(outcome, " ~ ", pred)
    model2 = str_c(pred, " ~ ", outcome)

    #fit models with the chosen robust estimator
    if (method == "lmrob") {
      r_fit = robustbase::lmrob(as.formula(model), x.std)
      r_fit2 = robustbase::lmrob(as.formula(model2), x.std)
    }

    if (method == "rlm") {
      r_fit = MASS::rlm(as.formula(model), x.std)
      r_fit2 = MASS::rlm(as.formula(model2), x.std)
    }

    #mean of the two slope coefficients; the original wrote
    #(coef(r_fit)[2] + coef(r_fit2))[2], which evaluates to the same
    #number via recycling but obscures the intent
    r_mean_coef = (coef(r_fit)[2] + coef(r_fit2)[2]) / 2

    #save into the lower triangle
    r_mat[outcome, pred] = r_mean_coef
  }

  #mirror the lower triangle into the upper and set the diagonal to 1
  r_mat = combine_upperlower(t(r_mat), r_mat, .diag = 1)

  return(r_mat)
}


#' Robust factor analysis.
#'
#'
#' Performs a robust factor analysis. It is done using the output from fa_robust_cormatrix(). Note that this uses a correlation matrix, so factor scores are not available.
#' @param x A data.frame to factor analyze.
#' @export
#' @examples
#' fa_robust_fa(iris[-5])
fa_robust_fa = function(x, ..., .method = "lmrob") {
  #build the robust correlation matrix, then factor analyze it
  #(a correlation matrix carries no case data, so no factor scores result)
  robust_r = fa_robust_cormatrix(x, method = .method)
  psych::fa(robust_r, ...)
}


#' Mixedness detection with a change in absolute factor loadings approach
#'
#'
#' Examines how the exclusion of each particular case influences the factor loadings.
#' @param x A data.frame to factor analyze.
#' @param ... Settings for fa().
#' @param sort Which column to sort results after. Defaults to the first column (mean absolute change). Set to anything else than 1 or 2 to avoid sorting.
#' @param include_full_sample Whether to include the full sample as a case. Defaults to true.
#' @export
#' @examples
#' fa_CAFL(iris[-5])
fa_CAFL = function(x, ..., sort = 1, include_full_sample = T) {

  #factor analysis on the full dataset
  #NOTE(review): unqualified fa() — presumably relies on psych being attached
  full_fa = fa(x, ...)

  #loadings table: one row per leave-one-out subset plus one for the full data
  #(the variable shadows the loadings() function; R resolves calls separately,
  #so loadings(full_fa) below still finds the function)
  loadings = data.frame(matrix(ncol = ncol(x), nrow = nrow(x) + 1))
  rownames(loadings) = c(rownames(x), "Full data")
  colnames(loadings) = colnames(x)

  #insert loadings from full dataset into the last row
  loadings[nrow(loadings), ] = loadings(full_fa)

  #try each leave-one-out subset
  for (row_idx in 1:nrow(x)) {
    #subset data without this case
    data_subset = x[-row_idx, ]

    #factor analyze the subset
    subset_fa = fa(data_subset, ...)

    #get its loadings
    subset_loadings = loadings(subset_fa)

    #insert into the corresponding row
    loadings[row_idx, ] = subset_loadings
  }


  #change in loadings by row: subtract the full-data row from every row
  #(the full-data row itself becomes all zeros)
  AC = plyr::adply(loadings, 1, function(k) {
    return(k - loadings[nrow(loadings), ])
  })

  #absolute value of the changes
  AC = abs(AC)

  #sensible rownames (adply resets them)
  rownames(AC) = rownames(loadings)

  #mean absolute loading change per subset
  MeCAFL = apply(AC, 1, mean)

  #max absolute loading change per subset
  MaCAFL = apply(AC, 1, max)

  #assemble the result
  r = data.frame(mean_change = MeCAFL,
                 max_change = MaCAFL)

  #sort by the requested column? (values other than 1/2 mean no sorting)
  if (sort == 1) {
    r = r[order(r[1], decreasing = T), ]
  }

  if (sort == 2) {
    r = r[order(r[2], decreasing = T), ]
  }

  #drop the full-sample row?
  #NOTE(review): this drops the LAST row, but after sorting the "Full data"
  #row may no longer be last — possible bug; consider dropping by name
  if (!include_full_sample) {
    r = r[-nrow(r), ]
  }

  return(r)
}


#' Repeated splithalf reliability with factor analysis
#'
#' Divides a dataset into 2 at random, extracts a factor from each and correlates them. Then saves the correlation. Repeats this any desired number of times. Can also return the factor scores instead of correlations.
#' @param data (data.frame or matrix) The data.
#' @param runs (integer scalar) The number of runs to do.
#' @param save_scores (boolean scalar) Whether to save scores.
#' @param messages (lgl scalar) Whether to display messages.
#' @param progress (lgl scalar) Whether to display progress bar.
#' @param seed (num scalar) Seed to use for reproducible results. Pass \code{NULL} to not use a seed.
#' @param ... Extra parameters to pass to psych::fa().
#' @export
#' @examples
#' fa_splitsample_repeat(iris[-5])
fa_splitsample_repeat = function(data, runs = 100, save_scores = F, messages = T, progress = T, seed = 1, ...) {

  #work on a local copy under a short name
  x = data; rm(data)

  #seed for reproducible splits
  set.seed(seed)

  #input validation
  if (!inherits(x, c("data.frame", "matrix"))) stop("data was not a data.frame or matrix", call. = F)
  x = as.data.frame(x)

  #drop cases with missing values, notifying if requested
  if (any(is.na(x)) && messages) message("Cases with missing values were removed.")
  x = na.omit(x)

  #one result per run
  results = list()

  #main loop
  if (progress) pb <- txtProgressBar(min = 1, max = runs, initial = 1, style = 3)
  for (i in 1:runs) {
    if (progress) setTxtProgressBar(pb, value = i)

    #shuffle the columns at random
    x = sample(x)

    #split the indicators into two halves
    half = ceiling(ncol(x) / 2)
    first_half = x[1:half]
    second_half = x[(half + 1):ncol(x)]

    #factor analyze each half
    fa_a = fa(first_half, ...)
    fa_b = fa(second_half, ...)

    #factor scores from both halves side by side
    run_scores = data.frame(fa1 = as.vector(fa_a$scores),
                            fa2 = as.vector(fa_b$scores))

    #keep either the raw scores or just their correlation
    if (save_scores) {
      results[[i]] = run_scores
    } else {
      results[[i]] = cor(run_scores)[1, 2]
    }
  }
  if (progress) close(pb)

  #simplify to a one-column data.frame when only correlations were kept
  if (!save_scores) results = data.frame(r = unlist(results))

  return(results)
}


#' Calculate congruence coefficient matrix.
#'
#' Takes an input either a list of factor analysis objects from fa() (psych package) or a data.frame/matrix of loadings. Returns a matrix of congruence coefficients. These are calculated using factor.congruence() from psych package.
#' @param x A list of factor analysis objects from [psych::fa()] or a data.frame/matrix of factor loadings.
#' @export
fa_congruence_matrix = function(x) {

  #input check
  #bug fix: class(x) is c("matrix", "array") for matrices since R 4.0, so the
  #old class(x) %in% ... test produced a length-2 condition (an error in
  #R >= 4.2); inherits() handles multiple classes correctly
  if (!inherits(x, c("list", "data.frame", "matrix"))) stop("Input was not a list, data.frame or matrix!")

  #if input is a list, it must be a list of fa objects
  if (inherits(x, "list")) {
    #are they all factor analysis objects?
    if (all(vapply(x, inherits, logical(1), what = "fa"))) {
      #get the loadings matrix from each analysis
      all_loadings = lapply(x, function(obj) as.matrix(obj$loadings))

      return(psych::factor.congruence(all_loadings)) #return congruence matrix

    } else {
      stop("Input was a list of something other than factor analysis objects!")
    }
  }

  #data.frame/matrix of loadings: split by column into a list of one-column
  #matrices and compute the congruence matrix
  #(qualified call; the original relied on psych being attached here)
  psych::factor.congruence(plyr::alply(x, 2, as.matrix))
}


#' Get loadings from a factor analysis object.
#'
#' Extract loadings from a factor analysis object (of class "fa") into a data.frame.
#' @param fa (object of class "fa") A factor analysis object.
#' @param threshold (num) A threshold for loadings to include.
#' @param long_form (lgl) Whether to return data frame in long format. Default is no. Useful for plotting.
#' @export
#' @examples
#' mpg_numeric = mpg %>% select(where(is.numeric))
#' fa_loadings(psych::fa(mpg_numeric))
#' fa_loadings(psych::fa(mpg_numeric, 2))
#' fa_loadings(psych::fa(mpg_numeric, 2), long_form = T)
#' fa_loadings(psych::fa(mpg_numeric, 2), .40)
#' fa_loadings(psych::fa(mpg_numeric, 2), .20, long_form = T)
fa_loadings = function(fa, threshold = NA, long_form = F) {
  #pull the loadings and strip the "loadings" class so it behaves as a matrix
  load_mat = loadings(fa)
  class(load_mat) = "matrix"
  result = as.data.frame(load_mat)

  #censor loadings below the threshold, if one was given
  if (!is.na(threshold)) result[abs(result) < threshold] = NA

  #reshape to long format (indicator, factor, loading), e.g. for plotting
  if (long_form) {
    result = cbind(indicator = rownames(result), result)
    result = tidyr::gather(result, key = factor, value = loading, !!names(result)[-1]) %>%
      na.omit
  }

  result
}


#' Determine number of factors
#'
#' Convenience function for determining the number of factors in a fa object. Vectorized.
#' @param .fa (fa or list of fa objects) The object from which to determine the number of factors.
#' @export
#' @examples
#' fa(iris[-5]) %>% fa_nfactors
#' fa(iris[-5], nfactors = 2) %>% fa_nfactors
fa_nfactors = function(.fa) {
  #single fa object: number of factors = number of loading columns
  if (inherits(.fa, what = "fa")) return(ncol(.fa$loadings))

  #otherwise it must be a list of fa objects
  if (!all(purrr::map_lgl(.fa, inherits, what = "fa"))) stop("Input was not a single fa object or list or fa objects.")
  purrr::map_int(.fa, function(x) ncol(x$loadings))
}




#' Plot factor loadings
#'
#' Returns a ggplot2 plot with sorted loadings colored by the analysis they belong to. Supports reversing of any factors that are reversed. Dodges to avoid overplotting.
#'
#' Non-overlapping indicators are put in the bottom. Note that internally, x and y coords have been flipped, so to modify the scales, use the opposite command, e.g. ylim to modify x axis limits.
#' @param fa_objects (list of fa-class objects) Factor analyses objects from the fa() function from the \code{\link{psych}} package.
#' @param fa_labels (chr vector) Names of the analyses.
#' @param factor_names (chr vector) Names of the factors if multiple.
#' @param reverse_vector (num vector) Vector of numbers to use for reversing factors. Use e.g. c(1, -1) to reverse the second factor. Defaults not reversing.
#' @param reorder (chr scalar or NA) Which factor analysis to order the loadings by. Can be integers that represent each factor analysis. Can also be "mean" or "median" to use the means or medians of the loadings. Use "all" for the old method. Default = "mean".
#' @param clean_factor_labels (lgl scalar) Whether to replace dots/underscores in factor labels with spaces.
#' @param clean_indicator_labels (lgl scalar) Whether to replace dots/underscores in indicator labels with spaces.
#' @export
#' @examples
#' library(psych)
#' fa_plot_loadings(fa(iris[-5])) #extract 1 factor and plot
#' fa_plot_loadings(fa(iris[-5], 2)) #extract 2 factors and plot
#' #list of FAs
#' fa_list = list(part1 = fa(iris[1:50, -c(1, 5)]),
#'                 part2 = fa(iris[51:100, -c(2, 5)]),
#'                 part3 = fa(iris[101:150, -c(3, 5)]))
#' #notice that it handles non-overlapping indicators
#' fa_plot_loadings(fa_list)
#' #reorder by a particular FA
#' fa_plot_loadings(fa_list, reorder = 1)
fa_plot_loadings = function (fa_objects, fa_labels = NA, factor_names = NA, reverse_vector = NA, reorder = "mean", clean_factor_labels = T, clean_indicator_labels = T) {

  #label cleaner: replace dots/underscores with spaces
  cleaner_func = function(x) {
    stringr::str_replace_all(x, pattern = "[._]", replacement = " ")
  }

  #which kind of plot?
  #1 unifactor analysis - 11
  #1 multifactor analysis - 1m
  #multiple unifactor analyses - m1
  #multiple multifactor analyses - mm
  if (inherits(fa_objects, "fa")) fa_objects = list(fa_objects) #put it in a list for consistency
  if (!all(purrr::map_lgl(fa_objects, inherits, what = "fa"))) stop("Could not recognize input. Input must be a single fa object, or a list of fa objects.")
  fa_num = length(fa_objects)
  factor_num = purrr::map_int(fa_objects, fa_nfactors)

  if (fa_num == 1 && all(1 == factor_num)) {
    plot_type = "11"
  } else if (fa_num == 1) {
    plot_type = "1m"
  } else if (all(factor_num == 1)) {
    plot_type = "m1"
  } else {
    plot_type = "mm"
  }

  #labels to use?
  fa_names = stringr::str_c("fa.", 1:fa_num)
  if (is_scalar_NA(fa_labels)) {
    #names on list?
    if (!is.null(names(fa_objects))) fa_labels = names(fa_objects) else fa_labels = 1:fa_num
  } else {
    if (length(fa_labels) != fa_num) stop("Factor analysis labels length is not identical to number of analyses.")
    #bug fix: valid user-supplied labels were previously discarded and
    #overwritten with the internal fa_names; now they are kept
  }

  #one or more unifactor analyses
  if (plot_type %in% c("11", "m1")) {

    #check reverse_vector: one entry per analysis
    if (anyNA(reverse_vector)) {
      reverse_vector = rep(1, fa_num)
    } else if (length(reverse_vector) != fa_num) {
      stop("Length of reversing vector does not match number of factor analyses.")
    }

    #extract data
    d = data.frame()
    for (fa_idx in 1:fa_num) {
      #extract and reverse loadings if necessary
      loads = fa_objects[[fa_idx]]$loadings * reverse_vector[fa_idx]
      #indicators names
      rnames = rownames(loads)
      #as df
      loads = as.data.frame(as.vector(loads))
      #set dimnames
      rownames(loads) = rnames
      colnames(loads) = fa_names[fa_idx]
      #merge data (handles non-overlapping indicators across analyses)
      silence({
        d = merge_datasets(d, loads)
      })
    }

    #reshape data to long form
    d2 = reshape(d, varying = 1:fa_num, direction = "long", ids = rownames(d))
    d2$time = as.factor(d2$time)
    d2$id = as.factor(d2$id)
    colnames(d2)[2] = "fa"

    #clean indicators?
    if (clean_indicator_labels) d2$id %<>% cleaner_func

    #reorder factor?
    if (reorder == "mean") {
      v_aggregate_values = plyr::daply(d2, .variables = "id", function(x) {
        mean(x$fa, na.rm=T)
      })

      #re-level
      d2$id = factor(d2$id, levels = names(sort(v_aggregate_values, decreasing = F)))

    } else if (reorder == "median") {
      v_aggregate_values = plyr::daply(d2, .(id), function(x) {
        median(x$fa, na.rm=T)
      })

      #re-level
      d2$id = factor(d2$id, levels = names(sort(v_aggregate_values, decreasing = F)))

    } else {
      d2_sub = d2[d2$time == reorder, ] #subset the analysis whose loading is to be used for the reorder

      #get vector of the chosen analysis
      v_values = d2_sub$fa; names(v_values) = d2_sub$id

      #re-level
      d2$id = factor(d2$id, levels = names(sort(v_values, decreasing = F, na.last = F)))
    }

    #plot; coordinates are flipped so indicators go on the vertical axis
    if (fa_num > 1) {
      g = ggplot2::ggplot(d2, aes(x = id, y = fa, color = time, group = time)) +
        ggplot2::geom_point(position = position_dodge(width = 0.5)) +
        ggplot2::ylab("Loading") +
        ggplot2::xlab("Indicator") +
        ggplot2::scale_color_discrete(name = "Analysis", labels = fa_labels) +
        ggplot2::coord_flip()
    } else {
      g = ggplot2::ggplot(d2, aes(x = id, y = fa)) +
        ggplot2::geom_point(position = position_dodge(width = 0.5)) +
        ggplot2::ylab("Loading") +
        ggplot2::xlab("Indicator") +
        ggplot2::coord_flip()
    }
  }

  #1 fa with multiple factors
  if (plot_type == "1m") {

    #check reverse_vector
    #bug fix: for a single multifactor analysis the reversing vector has one
    #entry per factor, not per analysis (the old check used fa_num)
    if (anyNA(reverse_vector)) {
      reverse_vector = rep(1, factor_num)
    } else if (length(reverse_vector) != factor_num) {
      stop("Length of reversing vector does not match number of factors.")
    }

    #extract data
    d = fa_objects[[1]]$loadings %>% unclass %>% as.data.frame

    #bug fix: actually apply the reversing vector (it was previously
    #validated but never used in this branch)
    for (j in seq_len(ncol(d))) d[[j]] = d[[j]] * reverse_vector[j]

    #rename
    colnames(d) = 1:ncol(d)
    #indicator col
    d = cbind(indicator = rownames(d), d)
    #clean indicator labels?
    #consistency fix: respect the clean_indicator_labels flag as the
    #unifactor branch does (it was applied unconditionally before)
    if (clean_indicator_labels) d$indicator %<>% cleaner_func
    #spread
    d = tidyr::gather_(d, gather_cols = colnames(d)[-1], key_col = "factor", value_col = "loading")

    #reorder indicators
    if (!is_scalar_NA(reorder)) {
      #sort by mean abs loading
      #this orders the indicators by their importance, overall
      d$indicator %<>% forcats::fct_reorder(d$loading, fun = function(x) {
        mean(abs(x))
      })
    }

    #factor names
    if (is_scalar_NA(factor_names)) factor_names = 1:factor_num

    #clean factor labels?
    #bug fix: the cleaner was applied to the logical flag itself
    #(clean_factor_labels %<>% cleaner_func) instead of the labels
    if (clean_factor_labels) factor_names %<>% cleaner_func

    #reorder factor?
    #not clear how to do this

    #plot
    g = ggplot2::ggplot(d, aes(x = indicator, y = loading, color = factor, group = factor)) +
      ggplot2::geom_point(position = position_dodge(width = 0.1)) +
      ggplot2::ylab("Loading") +
      ggplot2::xlab("Indicator") +
      ggplot2::scale_color_discrete(labels = factor_names, name = "Factor") +
      ggplot2::coord_flip()
  }

  if (plot_type == "mm") {
    stop("Multiple multifactor plotting is not implemented yet.")
    }


  return(g + ggplot2::theme_bw())
}


#vectorized
#vectorized: TRUE/FALSE per variable for whether Pearson correlation is suitable
suited_for_pearson = function(x, unique_min = 5) {
  #check a single atomic vector: factors and logicals are out, as are
  #variables with too few distinct values
  check_one = function(v) {
    if (inherits(v, c("factor", "logical"))) return(F)
    length(unique(v)) >= unique_min
  }

  #atomic input: check it directly
  if (purrr::is_atomic(x)) return(check_one(x))

  #otherwise check each element/column
  purrr::map_lgl(x, check_one)
}

#all together
#TRUE only when every variable in x is suited for Pearson correlation
suited_for_pearson_all = function(x, unique_min = 5) {
  suitability = suited_for_pearson(x, unique_min = unique_min)
  all(suitability)
}

#calculate cors for mixed dfs
#avoid re-doing necessary calculations
#calculate criterion-indicator correlations for mixed data frames without
#computing the full hetcor matrix: one 2-variable hetcor per secondary.
#returns the correlations, with the correlation types as a "type" attribute
hetcor_partial = function(primary, secondaries) {
  #one small hetcor per secondary variable
  cors = purrr::map(seq_along(secondaries), function(i) {
    polycor::hetcor(cbind(primary, secondaries[i]))
  })

  #bug fix: qualify map/map_dbl/map_chr with purrr:: — the unqualified calls
  #failed unless purrr happened to be attached (the rest of the file uses
  #purrr:: consistently)
  vals = purrr::map(cors, "correlations") %>% purrr::map_dbl(~magrittr::extract(., 2, 1))
  attr(vals, "type") = purrr::map(cors, "type") %>% purrr::map_chr(~magrittr::extract(., 2, 1))
  vals
}


#' Scatter plot of Jensen's method
#'
#' Takes a factor analysis, data frame and name of the criterion variable as inputs and returns a ggplot2 scatter plot with Jensen's method applied.
#'
#' #indicator_criterion_method
#' By default the function tries to automatically detect which kind of indicators are supplied. If indicators have any factors, logicals or numeric variables with less than 5 unique values, it it will call the hetcor function from polycor. If all are numeric with many levels, it will call the Pearson correlation (wtd.cors) from package weights with the supplied weights. Alternatively, one can input a numeric vector with the desired indicator-criterion relationships. These could be betas from complicated models.
#' @param fa (fa obj) A factor analysis object from fa().
#' @param df (data frame) A data frame that contains indicators and criterion.
#' @param criterion (chr) The name of the criterion variable.
#' @param reverse_factor (lgl) Whether to reverse the factor. Used when factor analysis results in a reversed factor.
#' @param loading_reversing (lgl) Whether to use loading reversing to avoid inflated results. Defaults to TRUE.
#' @param n_factor (int) Which factor to use? Default is 1st factor. Only relevant for multifactor analysis.
#' @param indicator_criterion_method (chr / num) Which method to use to compute indicator-criterion relationships?
#' @export
#' @examples
#' #load a subset of okcupid data
#' data(okcupid_social_prudence)
#' library(psych); library(polycor)
#' #estimate latent correlations
#' cors = polycor::hetcor(okcupid_social_prudence[-1]) %>% magrittr::extract2("correlations")
#' #factor analyze normally
#' fa = fa(cors)
#' #apply Jensen's method
#' fa_Jensens_method(fa, okcupid_social_prudence, criterion = "CA")
#' #output shows that indicators with larger loadings tend to be more positively related to cognitive ability
fa_Jensens_method = function(fa, df, criterion, reverse_factor = F, loading_reversing = T, check_overlap = TRUE, n_factor = 1, indicator_criterion_method = "auto", .weights = NA, ...) {
  args = list(...)

  #bug fix: check the names of the extra arguments, not their values
  if ("criteria" %in% names(args)) stop("'criteria' changed name to 'criterion'")

  #get loadings for the chosen factor
  if (ncol(fa$loadings) < n_factor) stop(sprintf("You tried to extract a factor that doesn't exist. n_factor was %d but fa only has %d factors.", n_factor, ncol(fa$loadings)))
  fa_loadings = fa$loadings %>% unclass() %>% .[, n_factor]

  #weights: default to unit weights
  #bug fix: plain is.na() on a supplied weight vector produced a
  #length > 1 condition, which errors in modern R
  if (length(.weights) == 1 && is.na(.weights)) .weights = rep(1, nrow(df))

  #reverse factor if desired
  if (reverse_factor) fa_loadings = fa_loadings * -1

  #get indicator names
  indicator_names = rownames(fa$loadings)
  indicator_num = length(indicator_names)

  #criterion supplied as precomputed values (e.g. betas from other models)
  if (is.numeric(criterion)) {
    #fitting length?
    #bug fix: the message used %d placeholders without sprintf, so the
    #counts were never interpolated
    if (!indicator_num == length(criterion)) stop(sprintf("The criterion values supplied did not have the right length. They must match the number of indicators. Criterion %d vs. indicators %d", length(criterion), indicator_num))

    #put into the value vector
    criterion_vals = criterion
  }

  #criterion supplied as a column name in df
  if (is.character(criterion)) {
    #is it there?
    if (!criterion %in% names(df)) stop("Criterion variable not found in the data")

    #subset df to indicators + criterion
    df2 = df[c(indicator_names, criterion)]

    #get criterion x indicator relationships
    if (indicator_criterion_method == "auto") {
      #can we use Pearson correlations for all of these?
      if (!suited_for_pearson_all(df2)) {
        message("Using latent correlations for the criterion-indicators relationships.")
        criterion_vals = silence(hetcor_partial(df2[criterion], df2[-which(names(df2) == criterion)]))
      } else {
        message("Using Pearson correlations for the criterion-indicators relationships.")
        #convert all to numeric
        df2 = purrr::map_df(df2, as.numeric)
        df2_cors = weights::wtd.cors(df2, weight = .weights)
        criterion_vals = df2_cors[ncol(df2_cors), -ncol(df2_cors)]
      }
    } else if (indicator_criterion_method == "pearson") {
      message("Using Pearson correlations for the criterion-indicators relationships.")
      df2 = df_colFunc(df2, func = as.numeric)
      df2_cors = weights::wtd.cors(df2, weight = .weights)
      criterion_vals = df2_cors[ncol(df2_cors), -ncol(df2_cors)]
    } else if (indicator_criterion_method == "latent") {
      message("Using latent correlations for the criterion-indicators relationships.")
      #bug fix: use df2 (indicators + criterion) as in the "auto" branch;
      #using the full df pulled in unrelated columns and produced a value
      #vector whose length did not match the indicators
      criterion_vals = silence(hetcor_partial(df2[criterion], df2[-which(names(df2) == criterion)]))
    } else stop(sprintf("Could not recognize indicator_criterion_method: %s", indicator_criterion_method))

  }

  #make df for plotting
  df3 = data.frame(loading = fa_loadings,
                   crit_vals = criterion_vals)

  #reverse indicators with negative loadings to avoid inflated results
  if (loading_reversing) {
    for (i in 1:nrow(df3)) {
      if (df3[i, "loading"] < 0) {
        #reverse both the loading and the criterion relationship
        df3[i, ] = df3[i, ] * -1

        #bug fix: "+" does not concatenate strings in R; the old code
        #errored at runtime whenever a loading was negative
        indicator_names[i] = stringr::str_c(indicator_names[i], "_r")
      }
    }
  }

  #plot
  g = GG_scatter(df3, x_var = "loading", y_var = "crit_vals", case_names = indicator_names, ...) +
    ylab("Criterion-indicator relationship") +
    xlab("Loading")

  #return ggplot object
  return(g)
}
# Deleetdk/kirkegaard documentation built on June 7, 2024, 7:49 a.m.