Nothing
#' Create plot of variances explained from `pca_test` object
#'
#' The variance explained by each PC in a dataset is plotted with confidence
#' intervals generated by bootstrapping and a null distribution generated by
#' permutation. The function accepts the result of calling the `pca_test`
#' function.
#'
#' By default, variance explained is represented as a percentage. If the
#' argument `percent` is set to FALSE, then the variance explained is
#' represented by the eigenvalues corresponding to each PC.
#'
#' @param pca_test an object of class pca_test_results generated by `pca_test`.
#' @param pc_max the maximum number of PCs to plot. If NA, plot all PCs.
#' @param percent if TRUE, represent variance explained as a percentage. If
#' FALSE, represent as eigenvalues.
#' @return `ggplot` object.
#' @importFrom dplyr if_else lead rename mutate arrange filter
#' @importFrom stringr str_sub str_detect
#' @importFrom tidyr pivot_longer
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot geom_errorbar geom_point scale_x_discrete aes
#' guide_axis labs
#' @importFrom tidyselect all_of
#' @examples
#' onze_pca <- pca_test(onze_intercepts |> dplyr::select(-speaker), n = 10)
#' # Plot with percentages
#' plot_variance_explained(onze_pca)
#' # Plot with eigenvalues and only the first 5 PCs.
#' plot_variance_explained(onze_pca, pc_max = 5, percent = FALSE)
#' @export
plot_variance_explained <- function(pca_test, pc_max = NA, percent = TRUE) {
stopifnot(
"Data must come from an object of class pca_test_results" =
class(pca_test) == "pca_test_results",
"pc_max must be a number or NA." =
is.numeric(pc_max) | is.na(pc_max)
)
if (percent) {
desired_variables <- c(
'low_confint_var',
'high_confint_var',
'low_null_var',
'high_null_var',
'variance_explained',
'PC',
'sig_PC'
)
base_mapping <- aes(
x = .data$PC,
y = .data$variance_explained * 100,
colour = .data$distribution
)
error_mapping <- aes(
ymin = .data$low_limit * 100,
ymax = .data$high_limit * 100
)
var_formant <- '%'
} else {
desired_variables <- c(
'low_confint',
'high_confint',
'low_null',
'high_null',
'eigenvalue',
'PC',
'sig_PC'
)
base_mapping <- aes(
x = .data$PC,
y = .data$eigenvalue,
colour = .data$distribution
)
error_mapping <- aes(
ymin = .data$low_limit,
ymax = .data$high_limit
)
var_formant <- 'Eigenvalue'
}
# filter high PCs
if (is.numeric(pc_max)) {
plot_data <- pca_test$variance |>
filter(
as.numeric(str_sub(.data$PC, start = 3L)) <= pc_max
)
} else {
plot_data <- pca_test$variance
}
# Output plot
plot_data |>
select(all_of(desired_variables)) |>
pivot_longer(
cols = contains('low'),
names_to = "distribution",
values_to = 'low_limit',
names_pattern = "_(.*)"
) |>
pivot_longer(
cols = contains('high'),
names_to = "distribution_2",
values_to = 'high_limit',
names_pattern = "_(.*)"
) |>
filter(
.data$distribution == .data$distribution_2
) |>
select(-"distribution_2") |>
mutate(
distribution = if_else(
str_detect(.data$distribution, 'null'),
"Null",
"Sampling"
),
distribution = factor(.data$distribution, levels = c("Sampling", "Null"))
) |>
ggplot(
mapping = base_mapping
) +
geom_errorbar(
mapping = error_mapping,
width = 0.2
) +
geom_point(colour = "red") +
scale_x_discrete(guide = guide_axis(angle = 90)) +
labs(
title = "Variance Explained by Principal Components",
colour = "Test Distribution",
x = "Principal Component",
y = glue("Variance Explained ({var_formant})"),
caption = glue(
"Bars indicate {pca_test$variance_confint * 100}% confidence intervals."
)
)
}
#' Plot PC index loadings from `pca_test` object.
#'
#' Index loadings (Vieira 2012) are presented with confidence intervals on the
#' sampling distribution generated by bootstrapping and a null distribution
#' generated by permutation.
#'
#' If PCs are unstable, there is an option (`filter_boots`) to take only the
#' bootstrap iterations in which the variable with the highest median loading
#' across all iterations is above `quantile_threshold` (default: 0.25). This
#' helps to reveal reliable connections of this variable with other variables in
#' the data set.
#'
#' @param pca_test an object of class pca_test_results generated by `pca_test`.
#' @param pc_no An integer indicating which PC to plot.
#' @param violin If TRUE, violin plots are added for the confidence intervals of
#' the sampling distribution.
#' @param filter_boots if TRUE, only bootstrap iterations in which the variable
#' with the highest median loading is above `quantile_threshold`.
#' @param quantile_threshold a real value between 0 and 1. Use this to change
#' the threshold used for filtering bootstrap iterations. The default is 0.25.
#' @return `ggplot` object.
#' @importFrom dplyr if_else lead rename mutate arrange filter
#' @importFrom stringr str_sub str_detect
#' @importFrom tidyr pivot_longer
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot geom_errorbar geom_point scale_x_discrete aes
#' guide_axis labs
#' @importFrom tidyselect all_of
#' @examples
#' onze_pca <- pca_test(onze_intercepts |> dplyr::select(-speaker), n = 10)
#' # Plot PC1
#' plot_loadings(onze_pca, pc_no=1)
#' # Plot PC2 with violins (not particularly useful in this case!)
#' plot_loadings(onze_pca, pc_no=2, violin = TRUE)
#' @references
#' Vieira, Vasco (2012): Permutation tests to estimate significances on
#' Principal Components Analysis. _Computational Ecology and Software_ 2.
#' 103–123.
#' @export
plot_loadings <- function(
pca_test, pc_no = 1, violin=FALSE, filter_boots = FALSE,
quantile_threshold = 0.25
) {
stopifnot(
"Data must come from an object of class pca_test_results" =
class(pca_test) == "pca_test_results",
"pc_no must be a number." =
is.numeric(pc_no),
"violin must be either TRUE or FALSE." =
is.logical(violin),
"filter_boots must be either TRUE or FALSE." =
is.logical(filter_boots)
)
if (filter_boots) {
plot_data <- pca_test$raw_data |>
filter(
as.numeric(str_sub(.data$PC, start = 3L)) == pc_no,
source != "original"
) |>
group_by(.data$source, .data$variable) |>
mutate(
median_index = stats::median(.data$index_loading),
quant_threshold = stats::quantile(
.data$index_loading,
quantile_threshold
)
) |>
ungroup() |>
mutate(
largest_loading = .data$median_index == base::max(.data$median_index),
) |>
group_by(.data$source, .data$iteration) |>
filter(
source != "bootstrapped" |
any(
.data$largest_loading & .data$index_loading >= .data$quant_threshold
)
) |>
group_by(.data$source, .data$variable) |>
mutate(
low_limit = stats::quantile(
.data$index_loading,
(1 - pca_test$loadings_confint)/2
),
high_limit = stats::quantile(
.data$index_loading,
1 - (1 - pca_test$loadings_confint)/2
)
) |>
ungroup() |>
mutate(
distribution = if_else(
.data$source == "bootstrapped",
"Sampling",
"Null"
)
) |>
select(-.data$index_loading, -.data$loading) |>
left_join(
pca_test$loadings |> select(
.data$PC,
.data$variable,
.data$index_loading,
.data$loading
),
by = c("PC", "variable")
)
# Calculate count of kept iterations for subtitle
kept_iterations <- plot_data |>
filter(
.data$distribution == "Sampling"
) |>
pull(.data$iteration) |>
base::unique() |>
base::length()
subtitle = glue(
"Filtered Bootstrap Sampling ({kept_iterations} Iterations) ",
"and Permutation-Based Null Distributions"
)
} else {
plot_data <- pca_test$loadings |>
# Filter so we only have data from desired PC.
filter(
as.numeric(str_sub(.data$PC, start = 3L)) == pc_no
) |>
# Reshape so that both permutation and bootstrapped limits are on same
# variables.
pivot_longer(
cols = contains('low'),
names_to = "distribution",
values_to = 'low_limit',
names_pattern = "_(.*)"
) |>
pivot_longer(
cols = contains('high'),
names_to = "distribution_2",
values_to = 'high_limit',
names_pattern = "_(.*)"
) |>
filter(
.data$distribution == .data$distribution_2
) |>
select(-"distribution_2") |>
mutate(
distribution = if_else(
str_detect(.data$distribution, 'null'),
"Null",
"Sampling"
)
)
subtitle = glue(
"Bootstrapped Sampling and Permutation-Based Null Distributions"
)
}
plot_data <- plot_data |>
mutate(
# Reorder 'variable' column so variables plotted in ascending order by
# loading.
variable = fct_reorder(.data$variable, .data$index_loading),
loading_sign = if_else(.data$loading < 0, "-", "+")
)
if (violin) {
violin_data <- pca_test$raw_data |>
filter(
source == "bootstrapped",
as.numeric(str_sub(.data$PC, start = 3L)) == pc_no
) |>
mutate(
distribution = "Sampling"
) |>
select(
.data$distribution,
.data$variable,
.data$index_loading,
.data$loading,
.data$iteration
) |>
# Reorder 'variable' column so variables plotted in ascending order by
# median bootstrapped loading.
group_by(.data$variable) |>
mutate(
median_index = stats::median(.data$index_loading)
) |>
ungroup() |>
mutate(
variable = fct_reorder(.data$variable, .data$median_index)
)
if (filter_boots) {
violin_data <- violin_data |>
group_by(.data$variable) |>
mutate(
first_quartile = stats::quantile(.data$index_loading, 0.25)
) |>
ungroup() |>
mutate(
largest_loading = .data$median_index == max(.data$median_index),
) |>
group_by(.data$iteration) |>
filter(
any(.data$largest_loading & .data$index_loading >= .data$first_quartile)
)
kept_iterations <- base::length(base::unique(violin_data$iteration))
}
violin_element <- geom_violin(
data = violin_data,
alpha = 0.5
)
} else {
violin_element <- NULL
}
out_plot <- plot_data |>
ggplot(
aes(
x = .data$variable,
y = .data$index_loading,
colour = .data$distribution
)
) +
violin_element +
geom_errorbar(
aes(
ymin = .data$low_limit,
ymax = .data$high_limit
)
) +
# geom_point(colour = "red") +
geom_text(aes(label = .data$loading_sign), size = 8, colour = "black") +
scale_x_discrete(guide = guide_axis(angle = 90)) +
scale_colour_manual(
values = c("Sampling" = "#F8766D", "Null" = "#00BFC4")
) +
labs(
title = glue("Index Loadings for PC{pc_no}"),
subtitle = subtitle,
colour = "Distribution",
y = "Index Loading",
x = "Variable"
)
out_plot
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.