# R/principalcomponentsanalysis.R In NumbersInternational/flipDimensionReduction: Algorithms for dimension reduction

#### Documented in BartlettTestOfSphericityComponentPlotfitted.flipFactorAnalysisprepareDataForFactorAnalysisPrincipalComponentsAnalysisScreePlot

```#' \code{PrincipalComponentsAnalysis}
#' @description Calculate a Principal Component Analysis
#' @param data A data frame with numeric columns which contains the data to be
#'   analyzed.
#' @param weights A numeric vector containing the weight for each case in data.
#' @param subset A logical vector which describes the subset of \code{data} to
#'   be analyzed.
#' @param missing A string specifiying what to do when the \code{data} contains
#'   missing values. The valid options are \code{"Error if missing data"},
#'   \code{"Exclude cases with missing data"}, \code{"Use partial data (pairwise
#'   correlations)"}, and \code{"Imputation (replace missing values with
#'   estimates)"}.
#' @param use.correlation A logical value specifying whether to use the
#'   correlation matrix (\code{TRUE}), or the covariance matrix (\code{FALSE}).
#' @param rotation A string specifying the type of rotation to be used. Valid
#'   options are  \code{"none"}, \code{"varimax"}, \code{"quartimax"},
#'   \code{"equamax"}, \code{"promax"}, and \code{"oblimin"}.
#' @param oblimin.delta A parameter supplied for oblimin rotations.
#' @param promax.kappa A parameter supplied for promax rotations.
#' @param select.n.rule Method for selecting the number of principal components to keep. May be one of \code{"Kaiser rule"}, \code{"Eigenvalues over"}, or \code{"Number of components"}.
#' @param n.factors An integer specifying the number of principal components to keep. Used if \code{select.n.rule} is \code{"Number of components"}.
#' @param eigen.min Cut-off above which eigenvalues are selected. Used if \code{select.n.rule} is \code{"Eigenvalues over"}.
#' @param sort.coefficients.by.size A logical value determining whether loadings
#'   should be sorted when printed.
#' @param suppress.small.coefficients A logical value specifying whether components
#' that are less than \code{min.display.loading.value} in magnitude will be replaced
#' with blanks when printing.
#'   displayed when printed.
#' @param print.type A string specifying the type of printing that should be
#'   \code{"structure"} to display a component structure matrix (which is the loadings
#'   multiplied by the component correlations), \code{"details"} to display a plain-text
#'   output containing more details from the analysis, \code{"variance"} to display a
#'   table showing the original eigenvalues of the input, and the corresponding variance
#'   explained,
#'   \code{"scree"} to display a Scree Plot, \code{"scatter"} to display a
#'   plot of the first two dimensions of the final loadings, and \code{"2d"} to plot the
#'   first two dimensions of the data, grouped by a categorical variable.
#'   The latter three options make use of HTML widgets.
#' @param show.labels If \code{TRUE}, labels are shown rather than name in outputs.
#' @param plot.labels A logical value which determines whether or not the
#'   scatter plot will show the labels of the input data, or just integers
#'   specifying the column number of each variable.
#' @param data.groups A \code{\link{vector}} of labels used to group the cases
#' when \code{"print.type"} is \code{"2d"}.
#' @param tol When the correlation martrix (or covariance) matrix has any singular values below this number
#'   the analysis will stop. Note that the function \code{principal} from package \code{psych} has its
#'   own internal cuttoff as well.
#'
#' @details This uses \code{\link[psych]{principal}} from package \code{psych} to compute the unrotated
#' PCA, and uses package \code{GPArotation} to find a rotated solution if required, to match SPSS' PCA. The
#' rotation includes a Kaiser normalization and a method of Promax which matches what SPSS does.
#' Components with large negative loadings will have signs flipped to give positive components after rotation.
#' Includes handling of missing data, weighting, and filtering.
#' @importFrom flipFormat Labels
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix StandardDeviation
#' @importFrom psych principal factor.scores
#' @importFrom verbs Sum SumEachRow
#' @export
PrincipalComponentsAnalysis <- function(data,
weights = NULL,
subset = NULL,
missing = "Exclude cases with missing data",
use.correlation = TRUE,
rotation = "none",
oblimin.delta = 0,
promax.kappa = 4,
select.n.rule = "Number of factors",
eigen.min = 1.0,
n.factors = 2,
sort.coefficients.by.size = FALSE,
suppress.small.coefficients = FALSE,
show.labels = TRUE,
plot.labels = TRUE,
data.groups = NULL,
tol = 1e-13)
{
if (is.null(rownames(data)))
rownames(data) <- 1:nrow(data)
if (select.n.rule == "Kaiser rule")
eigen.min <- 1.0
if (show.labels && !is.null(Labels(data)))
colnames(data) <- Labels(data)
if (rotation != "Promax" && rotation != "promax")
promax.kappa = NULL
if (rotation != "Oblimin" && rotation != "oblimin")
oblimin.delta = NULL
if (print.type %in% c("Component Plot", "Scree Plot", "Variance Explained", "2D Scatterplot"))
{
sort.coefficients.by.size = FALSE
suppress.small.coefficients = FALSE
}
if (print.type == "2D Scatterplot" && select.n.rule == "Number of components" && n.factors < 2L)
{
n.factors <- 2L
}
if (print.type != "Component Plot")
plot.labels = TRUE

# Generate the data that will be input to the correlation/covariance
# matrix by filtering and imputing if specified.
prepared.data <- prepareDataForFactorAnalysis(data, weights, subset, missing)

# If any variables have a standard deviation of 0 remove from analysis
stddevs <- StandardDeviation(prepared.data\$subset.data, weights = prepared.data\$subset.weights)
ind.zero.variance <- which(stddevs == 0)
if (length(ind.zero.variance) > 0)
{
warning("Some of your variables have no variation and have been removed for principal components analysis: ", paste(names(stddevs)[ind.zero.variance], sep = "", collapse = ", "))
prepared.data\$subset.data <- prepared.data\$subset.data[,-ind.zero.variance]
stddevs <- stddevs[-ind.zero.variance]
}

correlation.matrix <- CovarianceAndCorrelationMatrix(data = prepared.data\$subset.data,
weights = prepared.data\$subset.weights,
pairwise = missing == "Use partial data (pairwise correlations)",
use.correlation = TRUE)

# SPSS computes the covariance matrix that it uses as an imput to PCA
# by first calculating the pairwise correlation matrix and then
# scaling by the products of the standard deviations of each pair of
# variables. When there is no missing data this ought to match the
# covariance matrix which would be computed by the usual covariance
# formula, but when there is missing data the two results will differ.
if (!use.correlation)
input.matrix <- correlation.matrix * stddevs %o% stddevs
else
input.matrix <- correlation.matrix

# Compute eigenvalues for component selection
if (select.n.rule %in% c("Kaiser rule", "Eigenvalues over"))
{
eigens <- eigen(input.matrix, only.values = TRUE)
if (!use.correlation)
{
warning("Select components with eigenvalues > ",
eigen.min, " times mean of eigenvalues as we are using unscaled covariance matrix\n")
eigen.min <- mean(eigens\$values)
}
values.above.threshold <- eigens\$values > eigen.min
if (print.type == "2D Scatterplot" && sum(values.above.threshold, na.rm = TRUE) < 2L)
{
eigen.min <- eigens\$values[2] - .Machine\$double.eps
}
n.factors <- sum(eigens\$values > eigen.min, na.rm = TRUE)
}
stdmat <- matrix(rep(stddevs, n.factors), ncol = n.factors)

# Don't return all of the properties returned by
# principal() as they are not designed to account
# for weighting
initial.results <- try(suppressWarnings(principal(input.matrix,
nfactors = n.factors,
rotate = "none",
covar = !use.correlation,
scores = FALSE)), silent = TRUE)
if (inherits(initial.results, "try-error"))
stop("Could not perform PCA on the input matrix")

# Flip eigenvectors so the largest loadings are positive
function(x){sg=sign(x); ss=Sum(sg*x^2, remove.missing = FALSE); return(x*sign(ss))})

# Work out which rotation to use
# Convert from the strings that are to be used in the menus, which begin with upppercase letters
substr(rotation, 1, 1) <- tolower(substr(rotation, 1, 1))

oblique.rotation <- rotation == "oblimin" || rotation == "promax"

if (n.factors == 1 & rotation != "none")
{
warning("No rotation for single-component analysis\n")
rotation <- "none"
}

if (rotation != "none")
{
rotation = rotation,
delta = oblimin.delta,
kappa = promax.kappa,
covar = !use.correlation,
stds = stddevs)
function(x){sg=sign(x); ss=Sum(sg*x^2, remove.missing = FALSE); return(rep(sign(ss), length(x)))})

if (oblique.rotation)
else
component.correlations <- rotation.results\$component.correlations
if (!is.null(component.correlations))
component.correlations <- component.correlations * sign.matrix

} else {
function(x){sg=sign(x); ss=Sum(sg*x^2, remove.missing = FALSE); return(rep(sign(ss), length(x)))})
component.correlations <- NULL
}
comp.names <- paste("Component", 1:n.factors)
colnames(structure.matrix) <- comp.names
if (!is.null(component.correlations))
{
rownames(component.correlations) <- comp.names
colnames(component.correlations) <- comp.names
}

raw.structure.matrix <- structure.matrix
if (!use.correlation)
{
structure.matrix <- structure.matrix/stdmat
}

# Generate score weights using the regression method.
# For PCA, all methods give the same answer as scores
# are well-defined, so there is no need to use Bartlett
# or Anderson.

# For oblique rotations, use structure matrix
# otherwise use pattern matrix
if (rotation == "promax" || rotation == "oblimin")
{
S <- structure.matrix
} else {
}

# Smooth non-positive definite correlation in the same way as psych::principal
cor <- InterceptExceptions(cor.smooth2(correlation.matrix),
error.handler = function(e) {
warning("I am sorry, there is something seriously wrong with the correlation matrix,",
"\ncor.smooth failed to smooth it because some of the eigen values are NA.",
"\nAre you sure you specified the data correctly?")
},
warning.handler = function(w) {
smoothing.done <- startsWith(w\$message, "The analysis has failed to satisfy a technical assumption of PCA")
if (smoothing.done)
warning(w\$message, checkDataAfterCorrelationSmoothing(data, prepared.data, missing))
else
warning(w\$message)
})
score.weights <- try(solve(cor, S), silent = TRUE)
if (inherits(score.weights, "try-error"))
stop("Component scores could not be computed as the correlation or correlation matrix is singular.")

# Original data is scaled before generating scores
if (!is.null(weights))
scaled.data <- scaleDataUsingWeights(data = prepared.data\$subset.data, weights = prepared.data\$subset.weights)
else
scaled.data <- scale(prepared.data\$subset.data)

# Multiply the scaled data by the weights to produce scores
scores <- as.matrix(scaled.data) %*% score.weights

# Fill out any additional cases with missing values, so that the size of the output scores
# matches the number of respondents in the original data set, and that the cases are
# matched up correctly
new.data <- matrix(NA, nrow = nrow(data), ncol = ncol(scores))
row.names(new.data) <- row.names(data)
colnames(new.data) <- colnames(scores)
if (!any(duplicated(row.names(data))) && !"" %in% row.names(data))
{
common.names <- intersect(rownames(new.data), row.names(scores))
new.data[common.names,] <- scores[common.names,]
} else if (nrow(prepared.data\$subset.data) == nrow(data))
new.data <- scores
else # subset should always be non-null at this point.
new.data[subset, ] <- scores
scores <- new.data

# Communalities
initial.communalities <- diag(input.matrix)
extracted.communalities <- initial.results\$communality
if (!use.correlation)
{
rescaled.initial.communalities <- rep(1, nrow(input.matrix))
}

# Variance explained by factors

# Results
results <- list()
if (rotation != "none")
{
}

results\$structure.matrix <- structure.matrix

results\$raw.structure.matrix <- raw.structure.matrix

results\$sort.coefficients.by.size <- sort.coefficients.by.size
results\$suppress.small.coefficients <- suppress.small.coefficients
results\$original.data <- data
results\$original.weights <- weights
results\$data.used <- prepared.data
results\$use.correlation <- use.correlation
results\$print.type <- print.type
results\$plot.labels <- plot.labels
results\$input.matrix <- input.matrix
results\$values <- initial.results\$values
results\$scores <- scores
results\$score.weights <- score.weights
results\$rotation <- rotation
results\$missing <- missing
results\$component.correlations <- component.correlations
results\$data.groups <- data.groups

results\$initial.communalities <- initial.communalities
results\$extracted.communalities <- extracted.communalities
if (!use.correlation)
{
results\$rescaled.initial.communalities <- rescaled.initial.communalities
results\$rescaled.extracted.communalities  <- rescaled.extracted.communalities
}

class(results) <- "flipFactorAnalysis"
attr(results, "ChartData") <- ExtractChartData(results)
if (results\$print.type == "2d" || results\$print.type == "2D Scatterplot" ||
results\$print.type == "Component Plot")
attr(results, "ChartType") <- "X Y Scatter"
return(results)
}

#' @rdname PrincipalComponentsAnalysis
#' @param object Object of class \code{"flipFactorAnalysis"} created using \code{PrincipalComponentsAnalysis}.
#' @param ... Not used.
#' @export

fitted.flipFactorAnalysis <- function(object, ...)
{
return(object\$scores)
}

#' \code{ScreePlot}
#' @description Plot the eigenvalues from an existing principal component or
#'   factor analysis or plot the eigenvalues from the correlation or covariance
#'   matrix of a data frame.
#' @param x Either a data frame, a numeric vector of eigenvalues, or the
#'   eigenvalues from an analysis of class \code{flipFactorAnalysis} from
#'   \code{\link{PrincipalComponentsAnalysis}}, or \code{fa} or \code{principal} from package
#'   psych. When x is a data frame, additional arguments can be supplied as to
#'   how to compute the covariance or correlation matrix.
#' @param trim.padding Logical; whether to remove extra padding around the htmlwidget.
#'   By default this is set to \code{FALSE} to be the same as old charts
#' @inheritParams PrincipalComponentsAnalysis
#'
#' @return An HTML widget object from plotly containing the Scree Plot.
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix
#' @importFrom plotly plot_ly layout
#' @export
ScreePlot <- function(x, weights = NULL, subset = NULL, missing = "Exclude cases with missing data",
use.correlation = TRUE, trim.padding = FALSE)
{
if ("data.frame" %in% class(x))
{
prepared.data <- prepareDataForFactorAnalysis(data = x, weights = weights, subset = subset, missing = missing)
input.matrix <- CovarianceAndCorrelationMatrix(
data = prepared.data\$subset.data,
weights = prepared.data\$subset.weights,
pairwise = missing == "Use partial data (pairwise correlations)",
use.correlation = use.correlation)
input.values <- eigen(input.matrix)\$values
}
else if ("numeric" %in% class(x))
{
input.values <- x
}
else if (inherits(x, "flipFactorAnalysis") || any(class(x) == "fa") || any(class(x) == "principal"))
{
input.values <- x\$values
}
else
{
stop(paste0("Can't make a Scree Plot for object of class ", paste(class(x), collapse = ", ")),
". Select a Principal Components Analysis output.")
}

input.values <- sort(input.values, decreasing = TRUE)

df = data.frame(eig.num = 1:length(input.values), eig.vals = input.values)

`Component Number` <- 1:length(input.values)
Eigenvalue <- input.values

my.plot <- plot_ly(x = ~`Component Number`,
y = ~Eigenvalue,
mode = "lines+markers", type="scatter")
my.plot <- layout(p = my.plot,
title = "Scree Plot",
yaxis = list(range = c(0, max(input.values) + 1)),
xaxis = list(title = "Component Number"))
my.plot\$sizingPolicy\$browser\$padding <- if (trim.padding) 0 else 40
my.plot <- plotly::config(p = my.plot, displayModeBar = FALSE)
class(my.plot) <- c(class(my.plot), "visualization-selector")
return(my.plot)
}

#' \code{ComponentPlot}
#'
#' @description Create a scatter plot showing the loadings of each variable on
#'   the first two principal components.
#'
#' @param x An object of class \code{flipFactorAnalysis}.
#' @param show.labels Label the points with the row names.
#' @importFrom rhtmlLabeledScatter LabeledScatter
#' @importFrom verbs SumEachColumn
#' @export
ComponentPlot <- function(x, show.labels = TRUE)
{
{
stop("Input should be created by Data Reduction - Principal Components Analysis")
}

{
stop("There aren't enough components to plot.")
}

if (show.labels)
{
}

x.label <- "Component 1"
y.label <- "Component 2"
if ("princomp" %in% class(x))
if (any(c("psych", "flipFactorAnalysis") %in% class(x)) && !(x\$rotation %in% c("promax", "oblimin")))

{
x.label <- sprintf("%s (%.1f%% variance explained)", x.label, variance.explained[1]*100)
y.label <- sprintf("%s (%.1f%% variance explained)", y.label, variance.explained[2]*100)
}

groups <- 1:nrow(coords)
colors <- rep(c('#5B9BD5', '#ED7D31', '#A5A5A5', '#1EC000', '#4472C4', '#70AD47','#255E91','#9E480E','#636363','#997300','#264478','#43682B','#FF2323'),
length = length(groups))
# Append a transparent point to force the origin to be shown
# Note that axis bounds cannot be set with fixed.aspect
res <- LabeledScatter(X = c(0, coords[, 1]),
Y = c(0, coords[, 2]),
label = c(" ", labels),
group = c("Origin", groups),
colors = c("transparent", colors),
fixed.aspect = TRUE,
title = "Component Plot",
x.title = x.label,
y.title = y.label,
axis.font.size = 10,
labels.font.size = 12,
title.font.size = 20,
y.title.font.size = 16,
x.title.font.size = 16,
legend.show = FALSE)
class(res) <- c(class(res), "visualization-selector")
res
}

#' @importFrom verbs Sum SumEachColumn
#' @export
ExtractChartData.flipFactorAnalysis <- function(x)
{
if (x\$print.type == "scree" || x\$print.type == "Scree Plot")
return(sort(x\$values, decreasing = TRUE))
if (x\$print.type == "2d" || x\$print.type == "2D Scatterplot")
{
tmp <- convertFactorAnalysisTo2D(x)
if (is.null(tmp\$data.groups))
{
data <- tmp\$embedding
attr(data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = NA)
return(data)
}
data <- data.frame(tmp\$embedding, Group = tmp\$data.groups, stringsAsFactors = FALSE,
check.names = FALSE, check.rows = FALSE)
attr(data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = 3)
return(data)
}
if (x\$print.type == "structure" || x\$print.type == "Structure Matrix")
if (x\$print.type =="variance" || x\$print.type == "Variance Explained")
{
eigenvalues <- x\$values
variance.proportions = eigenvalues / Sum(eigenvalues, remove.missing = FALSE)
cumulative.proportions = cumsum(variance.proportions)
result <- cbind('Eigenvalue' = eigenvalues,
'% of Variance' = variance.proportions,
'Cumulative %' = cumulative.proportions)
rownames(result) <- paste("Component", 1:length(eigenvalues))
return(result)
}
if (x\$print.type == "details" || x\$print.type == "Detailed Output")
{
headings <- matrix("", nrow = 10, ncol = ncol(x\$loadings) + 1)
"", "Communalities", "", "Score Cofficient Matrix")

.print <- function(x, digits = 3)
{
n <- nrow(x)
m <- ncol(x)
res <- matrix("", nrow = n+1, ncol = m+1, dimnames = list(rep("", n+1), rep("", m+1)))
res[1,(1:m)+1] <- colnames(x)
res[(1:n)+1,1] <- rownames(x)
res[(1:n)+1,(1:m)+1] <- round(x, digits)
return(res)
}

nvar <- ncol(x\$original.data)
ve.table <- rbind(ve.table, `% of Variance` = ss.loadings/nvar*100)
ve.table <- rbind(ve.table, `Cumulative %` = cumsum(ss.loadings/nvar*100))

if (x\$use.correlation)
communality.table <- cbind("Initial" = x\$initial.communalities,
"Extraction" = x\$extracted.communalities)
else
communality.table <- cbind("Initial" = x\$rescaled.initial.communalities,
"Extraction" = x\$rescaled.extracted.communalities)

empty.mat <- matrix("", nrow = length(x\$initial.communalities) + 1, ncol = ncol(headings)-3)
colnames(empty.mat) <- rep("", ncol(headings) - 3)
return(all.tables)
}

# Otherwise return data for a component plot
if (!(x\$rotation %in% c("promax", "oblimin")))
{
colnames(component.data) <- sprintf("Component %d (%.1f%% variance explained)",
1:2, var.exp[1:2] * 100)
}
attr(component.data, "scatter.variable.indices") <- c(x = 1, y = 2, sizes = NA, colors = NA)
return(component.data)
}

#' \code{prepareDataForFactorAnalysis}
#' @description Filter data, remove cases with missing values, and impute where requested.
#' @inheritParams PrincipalComponentsAnalysis
#' @return A list containing \code{subset.data}, which is a data frame which has had subset applied
#' and missing values removed or imputed as specified by the parameter \code{missing}, and \code{prepared.weights}
#' which is a nuneric vector containing the weight values that correspond to the remaining cases (or NULL when
#' the input weight is NULL).
#' @importFrom flipImputation Imputation
#' @importFrom flipData ExcludeCasesWithCompletelyMissingData ExcludeCasesWithAnyMissingData ErrorIfMissingDataFound
#' @importFrom flipTransformations ProcessQVariables
prepareDataForFactorAnalysis <- function(data, weights, subset, missing)
{
data <- ProcessQVariables(data)

row.names <- rownames(data)

# Create the input data by filtering and removing missing values or
# imputing where specified.

# If no filter specified, create a subset containing all rows
if (is.null(subset))
subset <- rep(TRUE, nrow(data))

# Check filtered rows for missing data
subset.data <- data[subset, ]
imputation.label <- NULL
if (missing == "Error if missing data")
{
ErrorIfMissingDataFound(subset.data)
} else if (missing == "Imputation (replace missing values with estimates)") {
imputed.data <- Imputation(data)[[1]]
imputation.label <- attr(imputed.data, "imputation.method")
subset.data <- imputed.data[subset, ]
} else if (missing == "Exclude cases with missing data") {
# Ensure only complete responses remain
subset.data <- ExcludeCasesWithAnyMissingData(subset.data)
} else if (missing == "Use partial data (pairwise correlations)") {
subset.data <- ExcludeCasesWithCompletelyMissingData(subset.data)
} else {
stop(paste0("Don't recognize the missing data option", missing))
}

# Figure out which of the total set of weight values correspond to the
# remaining respondents.
subset.weights <- NULL
if (!is.null(weights))
{
if (missing == "Exclude cases with missing data")
row.to.remove <- apply(data, 1, function(x) any(is.na(x)))
else if (missing == "Use partial data (pairwise correlations)")
row.to.remove <- apply(data, 1, function(x) all(is.na(x)))
else
row.to.remove <- rep(FALSE, nrow(data))
subset.weights <- weights[subset & !row.to.remove]
}
return(list(subset.data = subset.data,
subset.weights = subset.weights,
imputation.label = imputation.label))
}

#' \code{BartlettTestOfSphericity}
#'
#' @description Conduct the Bartlett Test of Sphericity for a set of data, which
#'   tests that the correlation matrix of the data is not the identity matrix.
#' @param data A data frame containing the data to test.
#' @inheritParams PrincipalComponentsAnalysis
#' @return A list containing the Chi-Square value, degrees of freedom
#'   (\code{df}), and p-value for the test.
#' @details This function wraps \code{\link[psych]{cortest.bartlett}}. In
#'   particular, it extends the existing funcitonality to weighted data, and it
#'   computes the test using a more conservative value of the sample size when
#'   there is missing data. The value for the sample size that is used is the
#'   size of the smallest pairwise-complete set of cases among all pairs of
#'   variables. This is consistent with SPSS.
#' @importFrom flipStatistics CovarianceAndCorrelationMatrix
#' @importFrom psych cortest.bartlett
#' @importFrom verbs Sum
#' @export

# For the sample size, use the min sample size of the correlation matrix
BartlettTestOfSphericity <- function(data,
weights = NULL,
subset = NULL,
missing = "Exclude cases with missing data")
{
prepared.data <- prepareDataForFactorAnalysis(data, weights, subset, missing)
correlation.matrix <- CovarianceAndCorrelationMatrix(
data = prepared.data\$subset.data,
weights = prepared.data\$subset.weights,
pairwise = missing == "Use partial data (pairwise correlations)",
use.correlation = TRUE)

# If using a weight, supply the Effective Sample Size, which is the sum of the weights, otherwise
# supply the actual sample size of the prepared data.
# When missing is set to "Use partial data (pairwise correlations)" then the sample size can vary between cells in
# correlation matrix. In this case, use the smallest sample size, or effective sameple size.
if (missing == "Use partial data (pairwise correlations)")
{
sample.size.matrix <- sampleSizeMatrix(data, weights)
sample.size <- min(sample.size.matrix)
}
else if (!is.null(weights))
sample.size <- Sum(prepared.data\$subset.weights, remove.missing = FALSE)
else
sample.size <- nrow(prepared.data\$subset.data)
test.results <- cortest.bartlett(correlation.matrix, n = sample.size)
test.results\$n.estimation <- nrow(prepared.data\$subset.data)
test.results\$imputation.label <- prepared.data\$imputation.label
class(test.results) <- "flipBartlett"

return(test.results)
}

# Scale and center data using the weighted mean and standard deviation
#' @importFrom verbs Sum
scaleDataUsingWeights <- function(data, weights)
{
.weightedMeanAndSD <- function(x, weights)
{
complete.cases <- !is.na(x) & weights > 0
wx <- x * weights
weighted.mean <- Sum(wx[complete.cases], remove.missing = FALSE) / Sum(weights[complete.cases], remove.missing = FALSE)
sx <- x - weighted.mean
wsx2 <- weights * sx * sx
weighted.variance <- Sum(wsx2[complete.cases], remove.missing = FALSE) / (Sum(weights[complete.cases], remove.missing = FALSE) - 1)
return(list(weighted.mean = weighted.mean, weighted.sd = sqrt(weighted.variance)))
}

for (j in 1L:ncol(data))
{
weighted.stats <- .weightedMeanAndSD(data[,j], weights)
data[,j] <- (data[,j] - weighted.stats\$weighted.mean) / weighted.stats\$weighted.sd
}
return(data)
}

# Calculate the (effective) sample size for each pair of variables in data
#' @importFrom verbs Sum SumEmptyHandling
sampleSizeMatrix <- function(data, weights)
{
if (is.null(weights))
{
weights <- rep(1, nrow(data))
}
numvars <- ncol(data)
sample.size.matrix <- matrix(NA, nrow = numvars, ncol = numvars)
for (row in 1L:numvars)
{
for (col in 1L:numvars)
{
if (row == col) {
# Sample size would be zero if
# length of summand is zero
sample.size.matrix[row, row] <- SumEmptyHandling(weights[!is.na(data[, row])], remove.missing = FALSE)
}
else
{
indicator <- !is.na(data[, row] * data[, col])
# Sample size would be zero if
# length of summand is zero
sample.size <- SumEmptyHandling(weights[indicator], remove.missing = FALSE)
sample.size.matrix[row, col] <- sample.size
sample.size.matrix[col, row] <- sample.size
}
}
}
return(sample.size.matrix)
}

# Convert a variable to the appropriate form for use in a PCA.
# - Numeric variables are unchanged.
# - Ordered factors are replaced with their levels
# - Non-ordered factors are converted to a set of indicator variables
# with one variable for each level but the first.
#' @importFrom flipTransformations FactorToNumeric
convertVariableForFactorAnalysis <- function(variable, include.question.name = TRUE)
{
if (is.numeric(variable))
{
return(variable)
}

if (is.ordered(variable))
{
return(unclass(variable))
}

# Non-ordered factors
#     if (include.question.name)
#     {
#         vn <- paste0(attr(variable, "question"), ":", attr(variable, "label"))
#     } else {
#         vn <- attr(variable, "label")
#     }
vn <- attr(variable, "label")
if (is.null(vn))
{
vn <- deparse(substitute(x))
}
indicator.matrix <- FactorToNumeric(variable, name = vn)
if (ncol(indicator.matrix) > 2)
{
indicator.matrix <- indicator.matrix[, 2:ncol(indicator.matrix)]
} else if (ncol(indicator.matrix) == 2) {
cn <- colnames(indicator.matrix)[2]
indicator.matrix <- as.matrix(indicator.matrix[,2])
colnames(indicator.matrix) <- cn
}
return(indicator.matrix)
}

#' @param select.n.rule Same parameter used in PrincipalComponentsAnalysis to determine the number of components
#' @param eigenvalues The eigenvalues of the PCA before component selection
#' @param eigen.min The boundary for selection used in the Kaiser rule and Eigenvalues over specifications
#' @noRd
throwWarningAbout2DScatterplotNotPossible <- function(select.n.rule, eigenvalues, eigen.min)
{
if (select.n.rule == "Number of components")
{
warn.prefix <- "Only a single component was requested in the PCA. "
}
if (select.n.rule %in% c("Kaiser rule", "Eigenvalues over"))
{
n.eigen <- sum(eigenvalues > eigen.min, na.rm = TRUE)
if (select.n.rule == "Kaiser rule")
{
warn.prefix <- "The Kaiser rule was specified to "
} else
{
warn.prefix <- "The eigenvalue selection rule was chosen to "
}
warn.prefix <- paste0(warn.prefix,
"keep a component when its Eigenvalue is larger than ", eigen.min,
". With this setting, only ", n.eigen, " component was chosen. ")
}
warning(warn.prefix, "However, two components are required to create a 2D Scatterplot. ",
"Consequently, the first two components were computed and used to create the 2D Scatterplot.")
}
```
NumbersInternational/flipDimensionReduction documentation built on June 12, 2024, 11:30 a.m.