R/principalcomponentsanalysis.R

#' \code{PrincipalComponentsAnalysis}
#' @description Calculate a Principal Component Analysis
#' @param data A data frame with numeric columns which contains the data to be
#'   analyzed.
#' @param weights A numeric vector containing the weight for each case in data.
#' @param subset A logical vector which describes the subset of \code{data} to
#'   be analyzed.
#' @param missing A string specifying what to do when the \code{data} contains
#'   missing values. The valid options are \code{"Error if missing data"},
#'   \code{"Exclude cases with missing data"}, \code{"Use partial data (pairwise
#'   correlations)"}, and \code{"Imputation (replace missing values with
#'   estimates)"}.
#' @param use.correlation A logical value specifying whether to use the
#'   correlation matrix (\code{TRUE}), or the covariance matrix (\code{FALSE}).
#' @param rotation A string specifying the type of rotation to be used. Valid
#'   options are \code{"none"}, \code{"varimax"}, \code{"quartimax"},
#'   \code{"equamax"}, \code{"promax"}, and \code{"oblimin"}.
#' @param oblimin.delta A parameter supplied for oblimin rotations.
#' @param promax.kappa A parameter supplied for promax rotations.
#' @param select.n.rule Method for selecting the number of principal
#'   components to keep. May be one of \code{"Kaiser rule"},
#'   \code{"Eigenvalues over"}, or \code{"Number of components"}.
#' @param n.factors An integer specifying the number of principal components
#'   to keep. Used if \code{select.n.rule} is \code{"Number of components"}.
#' @param eigen.min Cut-off above which eigenvalues are selected. Used if
#'   \code{select.n.rule} is \code{"Eigenvalues over"}.
#' @param sort.coefficients.by.size A logical value determining whether loadings
#'   should be sorted when printed.
#' @param suppress.small.coefficients A logical value specifying whether
#'   loadings that are less than \code{min.display.loading.value} in magnitude
#'   will be replaced with blanks when printing.
#' @param min.display.loading.value Loadings smaller than this value will not be
#'   displayed when printed.
#' @param print.type A string specifying the type of printing that should be
#'   done. Valid options are \code{"loadings"} to display a (rotated) loading table,
#'   \code{"structure"} to display a component structure matrix (which is the loadings
#'   multiplied by the component correlations), \code{"details"} to display a plain-text
#'   output containing more details from the analysis, \code{"variance"} to display a
#'   table showing the original eigenvalues of the input, and the corresponding variance
#'   explained,
#'   \code{"scree"} to display a Scree Plot, \code{"scatter"} to display a
#'   plot of the first two dimensions of the final loadings, and \code{"2d"} to plot the
#'   first two dimensions of the data, grouped by a categorical variable.
#'   The latter three options make use of HTML widgets.
#' @param show.labels If \code{TRUE}, labels are shown rather than names in outputs.
#' @param plot.labels A logical value which determines whether or not the
#'   scatter plot will show the labels of the input data, or just integers
#'   specifying the column number of each variable.
#' @param data.groups A \code{\link{vector}} of labels used to group the cases
#' when \code{print.type} is \code{"2d"}.
#' @param tol When the correlation (or covariance) matrix has any singular
#'   values below this number the analysis will stop. Note that the function
#'   \code{principal} from package \code{psych} has its own internal cutoff
#'   as well.
#'
#' @details This uses \code{\link[psych]{principal}} from package \code{psych} to compute the unrotated
#' PCA, and uses package \code{GPArotation} to find a rotated solution when one is requested, so as to
#' match SPSS's PCA. The rotation includes a Kaiser normalization, and the Promax implementation matches
#' that of SPSS. Components with large negative loadings have their signs flipped after rotation so that
#' the dominant loadings are positive. Missing data, weighting, and filtering are handled.
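#' @examples
#' # A minimal sketch on simulated data (not a definitive workflow): two
#' # variables sharing a common factor, varimax rotation, and the Kaiser rule
#' # for component selection.
#' \dontrun{
#' set.seed(1)
#' f <- rnorm(100)
#' dat <- data.frame(a = f + rnorm(100), b = f + rnorm(100),
#'                   c = rnorm(100), d = rnorm(100))
#' pca <- PrincipalComponentsAnalysis(dat, rotation = "varimax",
#'                                    select.n.rule = "Kaiser rule")
#' }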
#' @importFrom flipFormat Labels
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix StandardDeviation
#' @importFrom psych principal factor.scores
#' @importFrom flipU InterceptExceptions
#' @importFrom verbs Sum SumEachRow
#' @export
PrincipalComponentsAnalysis <- function(data,
                               weights = NULL,
                               subset = NULL,
                               missing = "Exclude cases with missing data",
                               use.correlation = TRUE,
                               rotation = "none",
                               oblimin.delta = 0,
                               promax.kappa = 4,
                               select.n.rule = "Number of components",
                               eigen.min = 1.0,
                               n.factors = 2,
                               sort.coefficients.by.size = FALSE,
                               suppress.small.coefficients = FALSE,
                               min.display.loading.value = 0.1,
                               print.type = "loadings",
                               show.labels = TRUE,
                               plot.labels = TRUE,
                               data.groups = NULL,
                               tol = 1e-13)
{
    if (is.null(rownames(data)))
        rownames(data) <- 1:nrow(data)
    if (select.n.rule == "Kaiser rule")
        eigen.min <- 1.0
    if (show.labels && !is.null(Labels(data)))
        colnames(data) <- Labels(data)
    if (rotation != "Promax" && rotation != "promax")
        promax.kappa <- NULL
    if (rotation != "Oblimin" && rotation != "oblimin")
        oblimin.delta <- NULL
    if (print.type %in% c("Component Plot", "Scree Plot", "Variance Explained", "2D Scatterplot"))
    {
        sort.coefficients.by.size <- FALSE
        suppress.small.coefficients <- FALSE
        min.display.loading.value <- 0.1
    }
    if (print.type == "2D Scatterplot" && select.n.rule == "Number of components" && n.factors < 2L)
    {
        throwWarningAbout2DScatterplotNotPossible(select.n.rule)
        n.factors <- 2L
    }
    if (print.type != "Component Plot")
        plot.labels <- TRUE

    # Generate the data that will be input to the correlation/covariance
    # matrix by filtering and imputing if specified.
    prepared.data <- prepareDataForFactorAnalysis(data, weights, subset, missing)

    # If any variables have a standard deviation of 0, remove them from the analysis
    stddevs <- StandardDeviation(prepared.data$subset.data, weights = prepared.data$subset.weights)
    ind.zero.variance <- which(stddevs == 0)
    if (length(ind.zero.variance) > 0)
    {
        warning("Some of your variables have no variation and have been removed for principal components analysis: ", paste(names(stddevs)[ind.zero.variance], sep = "", collapse = ", "))
        prepared.data$subset.data <- prepared.data$subset.data[,-ind.zero.variance]
        stddevs <- stddevs[-ind.zero.variance]
    }

    correlation.matrix <- CovarianceAndCorrelationMatrix(data = prepared.data$subset.data,
                                                         weights = prepared.data$subset.weights,
                                                         pairwise = missing == "Use partial data (pairwise correlations)",
                                                         use.correlation = TRUE)

    # SPSS computes the covariance matrix that it uses as an input to PCA
    # by first calculating the pairwise correlation matrix and then
    # scaling by the products of the standard deviations of each pair of
    # variables. When there is no missing data this ought to match the
    # covariance matrix which would be computed by the usual covariance
    # formula, but when there is missing data the two results will differ.
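    # For example (not run), with complete data the two constructions agree:
    #   m <- as.matrix(mtcars[, 1:3]); s <- apply(m, 2, sd)
    #   all.equal(cor(m) * s %o% s, cov(m))  # TRUE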
    if (!use.correlation)
        input.matrix <- correlation.matrix * stddevs %o% stddevs
    else
        input.matrix <- correlation.matrix

    # Compute eigenvalues for component selection
    if (select.n.rule %in% c("Kaiser rule", "Eigenvalues over"))
    {
        eigens <- eigen(input.matrix, only.values = TRUE)
        if (!use.correlation)
        {
            warning("Select components with eigenvalues > ",
                    eigen.min, " times mean of eigenvalues as we are using unscaled covariance matrix\n")
            eigen.min <- mean(eigens$values)
        }
        values.above.threshold <- eigens$values > eigen.min
        if (print.type == "2D Scatterplot" && sum(values.above.threshold, na.rm = TRUE) < 2L)
        {
            throwWarningAbout2DScatterplotNotPossible(select.n.rule, eigens$values, eigen.min)
            eigen.min <- eigens$values[2] - .Machine$double.eps
        }
        n.factors <- sum(eigens$values > eigen.min, na.rm = TRUE)
    }
    stdmat <- matrix(rep(stddevs, n.factors), ncol = n.factors)

    # Unrotated loadings
    # Don't return all of the properties returned by
    # principal() as they are not designed to account
    # for weighting
    initial.results <- try(suppressWarnings(principal(input.matrix,
                                 nfactors = n.factors,
                                 rotate = "none",
                                 covar = !use.correlation,
                                 scores = FALSE)), silent = TRUE)
    if (inherits(initial.results, "try-error"))
        stop("Could not perform PCA on the input matrix")
    unrotated.loadings <- initial.results$loadings

    # Flip the sign of each eigenvector so that its dominant loadings are positive
    unrotated.loadings <- apply(unrotated.loadings, 2, function(x) {
        x * sign(Sum(sign(x) * x^2, remove.missing = FALSE))
    })
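    # For example (not run), a column with loadings c(-0.9, 0.2) has
    # Sum(sign(x) * x^2) = -0.77 < 0, so the column is negated to c(0.9, -0.2).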
    loadings <- unrotated.loadings

    # Work out which rotation to use.
    # Convert from the strings used in the menus, which begin with uppercase letters.
    substr(rotation, 1, 1) <- tolower(substr(rotation, 1, 1))

    oblique.rotation <- rotation == "oblimin" || rotation == "promax"

    # Rotate the loadings
    if (n.factors == 1 && rotation != "none")
    {
        warning("No rotation for single-component analysis\n")
        rotation <- "none"
    }

    if (rotation != "none")
    {
        rotation.results <- RotateLoadings(unrotated.loadings,
                                           rotation = rotation,
                                           delta = oblimin.delta,
                                           kappa = promax.kappa,
                                           covar = !use.correlation,
                                           stds = stddevs)
        sign.loadings <- apply(rotation.results$rotated.loadings, 2, function(x) {
            rep(sign(Sum(sign(x) * x^2, remove.missing = FALSE)), length(x))
        })
        rotated.loadings <- rotation.results$rotated.loadings * sign.loadings

        loadings <- rotated.loadings
        if (oblique.rotation)
            structure.matrix <- rotation.results$structure.matrix * sign.loadings
        else
            structure.matrix <- rotated.loadings
        sign.matrix <- tcrossprod(sign.loadings[1,], sign.loadings[1,])
        component.correlations <- rotation.results$component.correlations
        if (!is.null(component.correlations))
            component.correlations <- component.correlations * sign.matrix
        colnames(structure.matrix) <- colnames(loadings)

    } else {
        sign.loadings <- apply(unrotated.loadings, 2, function(x) {
            rep(sign(Sum(sign(x) * x^2, remove.missing = FALSE)), length(x))
        })
        unrotated.loadings <- unrotated.loadings * sign.loadings
        loadings <- unrotated.loadings
        rotated.loadings <- unrotated.loadings
        structure.matrix <- loadings
        component.correlations <- NULL
    }
    comp.names <- paste("Component", 1:n.factors)
    colnames(rotated.loadings) <- comp.names
    colnames(structure.matrix) <- comp.names
    colnames(loadings) <- comp.names
    if (!is.null(component.correlations))
    {
        rownames(component.correlations) <- comp.names
        colnames(component.correlations) <- comp.names
    }

    # Rescale the loadings
    raw.loadings <- loadings
    raw.unrotated.loadings <- unrotated.loadings
    raw.rotated.loadings <- rotated.loadings
    raw.structure.matrix <- structure.matrix
    if (!use.correlation)
    {
        loadings <- loadings/stdmat
        unrotated.loadings <- unrotated.loadings/stdmat
        rotated.loadings <- rotated.loadings/stdmat
        structure.matrix <- structure.matrix/stdmat
    }

    # Generate score weights using the regression method.
    # For PCA, all methods give the same answer as scores
    # are well-defined, so there is no need to use Bartlett
    # or Anderson.

    # For oblique rotations, use structure matrix
    # otherwise use pattern matrix
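    # In matrix terms, the regression method solves R %*% W = S for the score
    # weight matrix W (i.e. W = solve(R, S), as below), and the scores are
    # Z %*% W, where Z holds the standardized data and R the (smoothed)
    # correlation matrix.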
    if (rotation == "promax" || rotation == "oblimin")
    {
        S <- structure.matrix
    } else {
        S <- loadings
    }

    # Smooth non-positive definite correlation in the same way as psych::principal
    cor <- InterceptExceptions(cor.smooth2(correlation.matrix),
                               error.handler = function(e) {
                                   warning("I am sorry, there is something seriously wrong with the correlation matrix,",
                                           "\ncor.smooth failed to smooth it because some of the eigen values are NA.",
                                           "\nAre you sure you specified the data correctly?")
                               },
                               warning.handler = function(w) {
                                 smoothing.done <- startsWith(w$message, "The analysis has failed to satisfy a technical assumption of PCA")
                                 if (smoothing.done)
                                     warning(w$message, checkDataAfterCorrelationSmoothing(data, prepared.data, missing))
                                 else
                                     warning(w$message)
                               })
    score.weights <- try(solve(cor, S), silent = TRUE)
    if (inherits(score.weights, "try-error"))
        stop("Component scores could not be computed as the correlation or correlation matrix is singular.")

    # Original data is scaled before generating scores
    if (!is.null(weights))
        scaled.data <- scaleDataUsingWeights(data = prepared.data$subset.data, weights = prepared.data$subset.weights)
    else
        scaled.data <- scale(prepared.data$subset.data)

    # Multiply the scaled data by the weights to produce scores
    scores <- as.matrix(scaled.data) %*% score.weights

    # Fill out any additional cases with missing values, so that the size of the output scores
    # matches the number of respondents in the original data set and the cases are
    # matched up correctly.
    new.data <- matrix(NA, nrow = nrow(data), ncol = ncol(scores))
    row.names(new.data) <- row.names(data)
    colnames(new.data) <- colnames(scores)
    if (!any(duplicated(row.names(data))) && !"" %in% row.names(data))
    {
        common.names <- intersect(rownames(new.data), row.names(scores))
        new.data[common.names,] <- scores[common.names,]
    } else if (nrow(prepared.data$subset.data) == nrow(data))
        new.data <- scores
    else # subset should always be non-null at this point.
        new.data[subset, ] <- scores
    scores <- new.data


    # Communalities: initial communalities are the diagonal of the input matrix, and
    # extracted communalities are the row sums of the squared unrotated loadings (the
    # variance of each variable reproduced by the retained components).
    initial.communalities <- diag(input.matrix)
    extracted.communalities <- initial.results$communality
    if (!use.correlation)
    {
        rescaled.initial.communalities <- rep(1, nrow(input.matrix))
        rescaled.extracted.communalities <- SumEachRow(as.matrix(unrotated.loadings)^2, remove.missing = FALSE)
    }

    # Results
    results <- list()
    results$unrotated.loadings <- unrotated.loadings
    # (When rotation is "none", rotated.loadings equals unrotated.loadings.)
    results$rotated.loadings <- rotated.loadings
    results$loadings <- loadings
    results$structure.matrix <- structure.matrix

    results$raw.loadings <- raw.loadings
    results$raw.unrotated.loadings <- raw.unrotated.loadings
    results$raw.rotated.loadings <- raw.rotated.loadings
    results$raw.structure.matrix <- raw.structure.matrix

    results$sort.coefficients.by.size <- sort.coefficients.by.size
    results$suppress.small.coefficients <- suppress.small.coefficients
    results$min.display.loading.value <- min.display.loading.value
    results$original.data <- data
    results$original.weights <- weights
    results$data.used <- prepared.data
    results$use.correlation <- use.correlation
    results$print.type <- print.type
    results$plot.labels <- plot.labels
    results$input.matrix <- input.matrix
    results$values <- initial.results$values
    results$scores <- scores
    results$score.weights <- score.weights
    results$rotation <- rotation
    results$missing <- missing
    results$component.correlations <- component.correlations
    results$data.groups <- data.groups


    results$initial.communalities <- initial.communalities
    results$extracted.communalities <- extracted.communalities
    if (!use.correlation)
    {
        results$rescaled.initial.communalities <- rescaled.initial.communalities
        results$rescaled.extracted.communalities  <- rescaled.extracted.communalities
    }


    class(results) <- "flipFactorAnalysis"
    attr(results, "ChartData") <- ExtractChartData(results)
    if (results$print.type == "2d" || results$print.type == "2D Scatterplot" ||
        results$print.type == "Component Plot")
        attr(results, "ChartType") <- "X Y Scatter"
    return(results)
}

#' @rdname PrincipalComponentsAnalysis
#' @param object Object of class \code{"flipFactorAnalysis"} created using \code{PrincipalComponentsAnalysis}.
#' @param ... Not used.
#' @export
fitted.flipFactorAnalysis <- function(object, ...)
{
    return(object$scores)
}


#' \code{ScreePlot}
#' @description Plot the eigenvalues from an existing principal component or
#'   factor analysis or plot the eigenvalues from the correlation or covariance
#'   matrix of a data frame.
#' @param x Either a data frame, a numeric vector of eigenvalues, or an
#'   analysis of class \code{flipFactorAnalysis} from
#'   \code{\link{PrincipalComponentsAnalysis}}, or of class \code{fa} or
#'   \code{principal} from package \code{psych}. When \code{x} is a data frame,
#'   additional arguments can be supplied specifying how to compute the
#'   covariance or correlation matrix.
#' @param trim.padding Logical; whether to remove extra padding around the HTML widget.
#'   By default this is \code{FALSE}, matching the padding of older charts.
#' @inheritParams PrincipalComponentsAnalysis
#'
#' @return An HTML widget object from plotly containing the Scree Plot.
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix
#' @importFrom plotly plot_ly layout
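#' @examples
#' # A sketch: a Scree Plot can be drawn directly from a numeric vector of
#' # eigenvalues, with no fitted model required.
#' \dontrun{
#' ScreePlot(c(3.1, 1.4, 0.9, 0.4, 0.2))
#' }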
#' @export
ScreePlot <- function(x, weights = NULL, subset = NULL, missing = "Exclude cases with missing data",
    use.correlation = TRUE, trim.padding = FALSE)
{
    if ("data.frame" %in% class(x))
    {
        prepared.data <- prepareDataForFactorAnalysis(data = x, weights = weights, subset = subset, missing = missing)
        input.matrix <- CovarianceAndCorrelationMatrix(
            data = prepared.data$subset.data,
            weights = prepared.data$subset.weights,
            pairwise = missing == "Use partial data (pairwise correlations)",
            use.correlation = use.correlation)
        input.values <- eigen(input.matrix)$values
    }
    else if ("numeric" %in% class(x))
    {
        input.values <- x
    }
    else if (inherits(x, c("flipFactorAnalysis", "fa", "principal")))
    {
        input.values <- x$values
    }
    else
    {
        stop("Can't make a Scree Plot for object of class ",
             paste(class(x), collapse = ", "),
             ". Select a Principal Components Analysis output.")
    }

    input.values <- sort(input.values, decreasing = TRUE)

    `Component Number` <- 1:length(input.values)
    Eigenvalue <- input.values

    my.plot <- plot_ly(x = ~`Component Number`,
                       y = ~Eigenvalue,
                       mode = "lines+markers", type="scatter")
    my.plot <- layout(p = my.plot,
                      title = "Scree Plot",
                      yaxis = list(range = c(0, max(input.values) + 1)),
                      xaxis = list(title = "Component Number"))
    my.plot$sizingPolicy$browser$padding <- if (trim.padding) 0 else 40
    my.plot <- plotly::config(p = my.plot, displayModeBar = FALSE)
    class(my.plot) <- c(class(my.plot), "visualization-selector")
    return(my.plot)
}


#' \code{ComponentPlot}
#'
#' @description Create a scatter plot showing the loadings of each variable on
#'   the first two principal components.
#'
#' @param x An object of class \code{flipFactorAnalysis}.
#' @param show.labels Label the points with the row names.
#' @importFrom rhtmlLabeledScatter LabeledScatter
#' @importFrom verbs SumEachColumn
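#' @examples
#' # A sketch, assuming `pca` is the result of a call to
#' # PrincipalComponentsAnalysis with at least two components kept.
#' \dontrun{
#' ComponentPlot(pca, show.labels = TRUE)
#' }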
#' @export
ComponentPlot <- function(x, show.labels = TRUE)
{
    if (is.null(x$loadings))
    {
        stop("Input should be created by Data Reduction - Principal Components Analysis")
    }

    if (ncol(x$loadings) < 2)
    {
        stop("There aren't enough components to plot.")
    }

    labels <- as.character(1:nrow(x$loadings))
    if (show.labels)
    {
        if (is.null(row.names(x$loadings)))
            warning("The loadings do not contain labels.")
        else
            labels <- row.names(x$loadings)
    }

    x.label <- "Component 1"
    y.label <- "Component 2"
    add.ve <- FALSE
    if ("princomp" %in% class(x))
        add.ve <- TRUE
    if (any(c("psych", "flipFactorAnalysis") %in% class(x)) && !(x$rotation %in% c("promax", "oblimin")))
        add.ve <- TRUE

    if (add.ve)
    {
        ss.loadings <- SumEachColumn(x$loadings^2, remove.missing = FALSE)
        variance.explained <- ss.loadings/nrow(x$loadings)
        x.label <- sprintf("%s (%.1f%% variance explained)", x.label, variance.explained[1]*100)
        y.label <- sprintf("%s (%.1f%% variance explained)", y.label, variance.explained[2]*100)
    }


    coords <- x$loadings
    groups <- 1:nrow(coords)
    colors <- rep(c('#5B9BD5', '#ED7D31', '#A5A5A5', '#1EC000', '#4472C4', '#70AD47','#255E91','#9E480E','#636363','#997300','#264478','#43682B','#FF2323'),
                  length = length(groups))
    # Append a transparent point to force the origin to be shown
    # Note that axis bounds cannot be set with fixed.aspect
    res <- LabeledScatter(X = c(0, coords[, 1]),
                   Y = c(0, coords[, 2]),
                   label = c(" ", labels),
                   group = c("Origin", groups),
                   colors = c("transparent", colors),
                   fixed.aspect = TRUE,
                   title = "Component Plot",
                   x.title = x.label,
                   y.title = y.label,
                   axis.font.size = 10,
                   labels.font.size = 12,
                   title.font.size = 20,
                   y.title.font.size = 16,
                   x.title.font.size = 16,
                   legend.show = FALSE)
    class(res) <- c(class(res), "visualization-selector")
    res
}

#' @importFrom verbs Sum SumEachColumn
#' @export
ExtractChartData.flipFactorAnalysis <- function(x)
{
    if (x$print.type == "scree" || x$print.type == "Scree Plot")
        return(sort(x$values, decreasing = TRUE))
    if (x$print.type == "2d" || x$print.type == "2D Scatterplot")
    {
        tmp <- convertFactorAnalysisTo2D(x)
        if (is.null(tmp$data.groups))
        {
            data <- tmp$embedding
            attr(data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = NA)
            return(data)
        }
        data <- data.frame(tmp$embedding, Group = tmp$data.groups, stringsAsFactors = FALSE,
            check.names = FALSE, check.rows = FALSE)
        attr(data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = 3)
        return(data)
    }
    if (x$print.type == "loadings" || x$print.type == "Loadings Table")
        return(.tidy.loadings(x, input.matrix = x$loadings))
    if (x$print.type == "structure" || x$print.type == "Structure Matrix")
        return(.tidy.loadings(x, input.matrix = x$structure.matrix))
    if (x$print.type =="variance" || x$print.type == "Variance Explained")
    {
        eigenvalues <- x$values
        variance.proportions = eigenvalues / Sum(eigenvalues, remove.missing = FALSE)
        cumulative.proportions = cumsum(variance.proportions)
        result <- cbind('Eigenvalue' = eigenvalues,
                     '% of Variance' = variance.proportions,
                     'Cumulative %' = cumulative.proportions)
        rownames(result) <- paste("Component", 1:length(eigenvalues))
        return(result)
    }
    if (x$print.type == "details" || x$print.type == "Detailed Output")
    {
        headings <- matrix("", nrow = 10, ncol = ncol(x$loadings) + 1)
        headings[,1] <- c("", "Loadings", "", "Structure matrix", "", "",
                          "", "Communalities", "", "Score Cofficient Matrix")

        .print <- function(x, digits = 3)
        {
            n <- nrow(x)
            m <- ncol(x)
            res <- matrix("", nrow = n+1, ncol = m+1, dimnames = list(rep("", n+1), rep("", m+1)))
            res[1,(1:m)+1] <- colnames(x)
            res[(1:n)+1,1] <- rownames(x)
            res[(1:n)+1,(1:m)+1] <- round(x, digits)
            return(res)
        }

        ss.loadings <- SumEachColumn(x$loadings ^ 2, remove.missing = FALSE)
        nvar <- ncol(x$original.data)
        ve.table <- rbind(`Sum of Square Loadings` = ss.loadings)
        ve.table <- rbind(ve.table, `% of Variance` = ss.loadings/nvar*100)
        if (ncol(x$loadings) > 1)
            ve.table <- rbind(ve.table, `Cumulative %` = cumsum(ss.loadings/nvar*100))

        if (x$use.correlation)
            communality.table <- cbind("Initial" = x$initial.communalities,
                                       "Extraction" = x$extracted.communalities)
        else
            communality.table <- cbind("Initial" = x$rescaled.initial.communalities,
                                       "Extraction" = x$rescaled.extracted.communalities)

        empty.mat <- matrix("", nrow = length(x$initial.communalities) + 1, ncol = ncol(headings)-3)
        colnames(empty.mat) <- rep("", ncol(headings) - 3)
        all.tables <- rbind(headings[1:2,], .print(.tidy.loadings(x, input.matrix = x$loadings)),
             headings[3:4,], .print(.tidy.loadings(x, input.matrix = x$structure.matrix)),
             headings[5:6,], .print(ve.table),
             headings[7:9,], cbind(.print(communality.table), empty.mat),
             headings[9:10,], .print(x$score.weights))
        return(all.tables)
    }

    # Otherwise return data for a component plot
    if (NCOL(x$loadings) < 2)
        return(x$loadings)
    component.data <- x$loadings[,1:2]
    if (!(x$rotation %in% c("promax", "oblimin")))
    {
        var.exp <- SumEachColumn(x$loadings^2, remove.missing = FALSE)/nrow(x$loadings)
        colnames(component.data) <- sprintf("Component %d (%.1f%% variance explained)",
                                            1:2, var.exp[1:2] * 100)
    }
    attr(component.data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = NA)
    return(component.data)
}


#' \code{prepareDataForFactorAnalysis}
#' @description Filter data, remove cases with missing values, and impute where requested.
#' @inheritParams PrincipalComponentsAnalysis
#' @return A list containing \code{subset.data}, a data frame to which the subset has been applied
#' and missing values removed or imputed as specified by the parameter \code{missing};
#' \code{subset.weights}, a numeric vector containing the weight values that correspond to the
#' remaining cases (or \code{NULL} when the input weight is \code{NULL}); and
#' \code{imputation.label}, describing the imputation method used (or \code{NULL} if none).
#' @importFrom flipImputation Imputation
#' @importFrom flipData ExcludeCasesWithCompletelyMissingData ExcludeCasesWithAnyMissingData ErrorIfMissingDataFound
#' @importFrom flipTransformations ProcessQVariables
prepareDataForFactorAnalysis <- function(data, weights, subset, missing)
{
    data <- ProcessQVariables(data)

    row.names <- rownames(data)

    # Create the input data by filtering and removing missing values or
    # imputing where specified.

    # If no filter specified, create a subset containing all rows
    if (is.null(subset))
        subset <- rep(TRUE, nrow(data))

    # Check filtered rows for missing data
    subset.data <- data[subset, ]
    imputation.label <- NULL
    if (missing == "Error if missing data")
    {
        ErrorIfMissingDataFound(subset.data)
    } else if (missing == "Imputation (replace missing values with estimates)") {
        imputed.data <- Imputation(data)[[1]]
        imputation.label <- attr(imputed.data, "imputation.method")
        subset.data <- imputed.data[subset, ]
    } else if (missing == "Exclude cases with missing data") {
        # Ensure only complete responses remain
        subset.data <- ExcludeCasesWithAnyMissingData(subset.data)
    } else if (missing == "Use partial data (pairwise correlations)") {
        subset.data <- ExcludeCasesWithCompletelyMissingData(subset.data)
    } else {
        stop("Don't recognize the missing data option: ", missing)
    }

    # Figure out which of the total set of weight values correspond to the
    # remaining respondents.
    subset.weights <- NULL
    if (!is.null(weights))
    {
        if (missing == "Exclude cases with missing data")
            row.to.remove <- apply(data, 1, function(x) any(is.na(x)))
        else if (missing == "Use partial data (pairwise correlations)")
            row.to.remove <- apply(data, 1, function(x) all(is.na(x)))
        else
            row.to.remove <- rep(FALSE, nrow(data))
        subset.weights <- weights[subset & !row.to.remove]
    }
    return(list(subset.data = subset.data,
                subset.weights = subset.weights,
                imputation.label = imputation.label))
}


#' \code{BartlettTestOfSphericity}
#'
#' @description Conduct the Bartlett Test of Sphericity for a set of data, which
#'   tests that the correlation matrix of the data is not the identity matrix.
#' @param data A data frame containing the data to test.
#' @inheritParams PrincipalComponentsAnalysis
#' @return A list containing the Chi-Square value, degrees of freedom
#'   (\code{df}), and p-value for the test.
#' @details This function wraps \code{\link[psych]{cortest.bartlett}}. In
#'   particular, it extends the existing functionality to weighted data, and it
#'   computes the test using a more conservative value of the sample size when
#'   there is missing data. The value used for the sample size is the
#'   size of the smallest pairwise-complete set of cases among all pairs of
#'   variables. This is consistent with SPSS.
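#'   The test statistic is chi-squared = -((n - 1) - (2p + 5) / 6) * log(det(R)),
#'   where R is the p x p correlation matrix of the p variables, and it is
#'   compared against a chi-squared distribution with p(p - 1) / 2 degrees of
#'   freedom.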
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix
#' @importFrom psych cortest.bartlett
#' @importFrom verbs Sum
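#' @examples
#' # A sketch on simulated data; a large chi-square (small p-value) indicates
#' # that the correlation matrix differs from the identity.
#' \dontrun{
#' set.seed(2)
#' f <- rnorm(200)
#' dat <- data.frame(a = f + rnorm(200), b = f + rnorm(200), c = rnorm(200))
#' BartlettTestOfSphericity(dat)
#' }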
#' @export
BartlettTestOfSphericity <- function(data,
                         weights = NULL,
                         subset = NULL,
                         missing = "Exclude cases with missing data")
{
    prepared.data <- prepareDataForFactorAnalysis(data, weights, subset, missing)
    correlation.matrix <- CovarianceAndCorrelationMatrix(
        data = prepared.data$subset.data,
        weights = prepared.data$subset.weights,
        pairwise = missing == "Use partial data (pairwise correlations)",
        use.correlation = TRUE)

    # If using a weight, supply the effective sample size, which is the sum of the weights; otherwise
    # supply the actual sample size of the prepared data.
    # When missing is set to "Use partial data (pairwise correlations)", the sample size can vary
    # between cells of the correlation matrix. In this case, use the smallest (effective) sample size.
    if (missing == "Use partial data (pairwise correlations)")
    {
        sample.size.matrix <- sampleSizeMatrix(data, weights)
        sample.size <- min(sample.size.matrix)
    }
    else if (!is.null(weights))
        sample.size <- Sum(prepared.data$subset.weights, remove.missing = FALSE)
    else
        sample.size <- nrow(prepared.data$subset.data)
    test.results <- cortest.bartlett(correlation.matrix, n = sample.size)
    test.results$n.estimation <- nrow(prepared.data$subset.data)
    test.results$imputation.label <- prepared.data$imputation.label
    class(test.results) <- "flipBartlett"

    return(test.results)
}

# Scale and center data using the weighted mean and standard deviation
#' @importFrom verbs Sum
scaleDataUsingWeights <- function(data, weights)
{
    .weightedMeanAndSD <- function(x, weights)
    {
        complete.cases <- !is.na(x) & weights > 0
        wx <- x * weights
        weighted.mean <- Sum(wx[complete.cases], remove.missing = FALSE) / Sum(weights[complete.cases], remove.missing = FALSE)
        sx <- x - weighted.mean
        wsx2 <- weights * sx * sx
        weighted.variance <- Sum(wsx2[complete.cases], remove.missing = FALSE) / (Sum(weights[complete.cases], remove.missing = FALSE) - 1)
        return(list(weighted.mean = weighted.mean, weighted.sd = sqrt(weighted.variance)))
    }

    for (j in 1L:ncol(data))
    {
        weighted.stats <- .weightedMeanAndSD(data[,j], weights)
        data[,j] <- (data[,j] - weighted.stats$weighted.mean) / weighted.stats$weighted.sd
    }
    return(data)
}
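
# A sketch of the expected behaviour (not run): with unit weights this reduces
# to base::scale().
#   x <- data.frame(a = 1:5, b = c(2, 4, 6, 8, 10))
#   all.equal(as.matrix(scaleDataUsingWeights(x, rep(1, 5))), scale(x),
#             check.attributes = FALSE)  # TRUE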


# Calculate the (effective) sample size for each pair of variables in data
#' @importFrom verbs Sum SumEmptyHandling
sampleSizeMatrix <- function(data, weights)
{
    if (is.null(weights))
    {
        weights <- rep(1, nrow(data))
    }
    numvars <- ncol(data)
    sample.size.matrix <- matrix(NA, nrow = numvars, ncol = numvars)
    for (row in 1L:numvars)
    {
        for (col in 1L:numvars)
        {
            if (row == col) {
                # Sample size would be zero if
                # length of summand is zero
                sample.size.matrix[row, row] <- SumEmptyHandling(weights[!is.na(data[, row])], remove.missing = FALSE)
            }
            else
            {
                indicator <- !is.na(data[, row] * data[, col])
                # Sample size would be zero if
                # length of summand is zero
                sample.size <- SumEmptyHandling(weights[indicator], remove.missing = FALSE)
                sample.size.matrix[row, col] <- sample.size
                sample.size.matrix[col, row] <- sample.size
            }
        }
    }
    return(sample.size.matrix)
}
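
# A sketch (not run): with unweighted data and a single missing value,
#   sampleSizeMatrix(data.frame(a = 1:3, b = c(1, NA, 3)), weights = NULL)
# returns 3 for the a-a cell and 2 for every other cell.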

# Convert a variable to the appropriate form for use in a PCA.
# - Numeric variables are unchanged.
# - Ordered factors are replaced with their underlying integer codes.
# - Non-ordered factors are converted to a set of indicator variables,
#   with one variable for each level except the first.
#' @importFrom flipTransformations FactorToNumeric
convertVariableForFactorAnalysis <- function(variable, include.question.name = TRUE)
{
    if (is.numeric(variable))
    {
        return(variable)
    }

    if (is.ordered(variable))
    {
        return(unclass(variable))
    }

    # Non-ordered factors
#     if (include.question.name)
#     {
#         vn <- paste0(attr(variable, "question"), ":", attr(variable, "label"))
#     } else {
#         vn <- attr(variable, "label")
#     }
    vn <- attr(variable, "label")
    if (is.null(vn))
    {
        vn <- deparse(substitute(variable))
    }
    indicator.matrix <- FactorToNumeric(variable, name = vn)
    if (ncol(indicator.matrix) > 2)
    {
        indicator.matrix <- indicator.matrix[, 2:ncol(indicator.matrix)]
    } else if (ncol(indicator.matrix) == 2) {
        cn <- colnames(indicator.matrix)[2]
        indicator.matrix <- as.matrix(indicator.matrix[,2])
        colnames(indicator.matrix) <- cn
    }
    return(indicator.matrix)
}
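
# A sketch (not run): a three-level unordered factor becomes two indicator
# columns, with the first level dropped as the reference category:
#   f <- factor(c("a", "b", "c", "b"))
#   convertVariableForFactorAnalysis(f)  # indicator columns for "b" and "c"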

#' @param select.n.rule Same parameter used in PrincipalComponentsAnalysis to determine the number of components
#' @param eigenvalues The eigenvalues of the PCA before component selection
#' @param eigen.min The boundary for selection used in the Kaiser rule and Eigenvalues over specifications
#' @noRd
throwWarningAbout2DScatterplotNotPossible <- function(select.n.rule, eigenvalues, eigen.min)
{
    if (select.n.rule == "Number of components")
    {
        warn.prefix <- "Only a single component was requested in the PCA. "
    }
    if (select.n.rule %in% c("Kaiser rule", "Eigenvalues over"))
    {
        n.eigen <- sum(eigenvalues > eigen.min, na.rm = TRUE)
        if (select.n.rule == "Kaiser rule")
        {
            warn.prefix <- "The Kaiser rule was specified to "
        } else
        {
            warn.prefix <- "The eigenvalue selection rule was chosen to "
        }
        warn.prefix <- paste0(warn.prefix,
                              "keep a component when its eigenvalue is larger than ", eigen.min,
                              ". With this setting, only ", n.eigen, " component(s) were chosen. ")
    }
    warning(warn.prefix, "However, two components are required to create a 2D Scatterplot. ",
            "Consequently, the first two components were computed and used to create the 2D Scatterplot.")
}