# R/pca_test_plots.R

#' Create plot of variances explained from `pca_test` object
#'
#' The variance explained by each PC in a dataset is plotted with confidence
#' intervals generated by bootstrapping and a null distribution generated by
#' permutation. The function accepts the result of calling the `pca_test`
#' function.
#'
#' By default, variance explained is represented as a percentage. If the
#' argument `percent` is set to FALSE, then the variance explained is
#' represented by the eigenvalues corresponding to each PC.
#'
#' @param pca_test an object of class pca_test_results generated by `pca_test`.
#' @param pc_max the maximum number of PCs to plot. If NA, plot all PCs.
#' @param percent if TRUE, represent variance explained as a percentage. If
#'   FALSE, represent as eigenvalues.
#' @return `ggplot` object.
#' @importFrom dplyr if_else lead rename mutate arrange filter select
#' @importFrom stringr str_sub str_detect
#' @importFrom tidyr pivot_longer
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot geom_errorbar geom_point scale_x_discrete aes
#'   guide_axis labs
#' @importFrom tidyselect all_of contains
#' @importFrom glue glue
#' @importFrom rlang .data
#' @examples
#'   onze_pca <- pca_test(onze_intercepts |> dplyr::select(-speaker), n = 10)
#'   # Plot with percentages
#'   plot_variance_explained(onze_pca)
#'   # Plot with eigenvalues and only the first 5 PCs.
#'   plot_variance_explained(onze_pca, pc_max = 5, percent = FALSE)
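#'   # The returned ggplot object can be modified further, e.g.:
#'   plot_variance_explained(onze_pca) + ggplot2::theme_bw()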
#' @export
plot_variance_explained <- function(pca_test, pc_max = NA, percent = TRUE) {

  stopifnot(
    "Data must come from an object of class pca_test_results" =
      class(pca_test) == "pca_test_results",
    "pc_max must be a number or NA." =
      is.numeric(pc_max) | is.na(pc_max)
  )

  if (percent) {
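    # Plot variance explained as a percentage: use the *_var columns and scale
    # the proportions by 100 in the axis mappings.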

    desired_variables <- c(
      'low_confint_var',
      'high_confint_var',
      'low_null_var',
      'high_null_var',
      'variance_explained',
      'PC',
      'sig_PC'
    )

    base_mapping <- aes(
      x = .data$PC,
      y = .data$variance_explained * 100,
      colour = .data$distribution
    )

    error_mapping <- aes(
      ymin = .data$low_limit * 100,
      ymax = .data$high_limit * 100
    )

    var_format <- '%'

  } else {
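    # Plot raw eigenvalues instead of percentages.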

    desired_variables <- c(
      'low_confint',
      'high_confint',
      'low_null',
      'high_null',
      'eigenvalue',
      'PC',
      'sig_PC'
    )

    base_mapping <- aes(
      x = .data$PC,
      y = .data$eigenvalue,
      colour = .data$distribution
    )

    error_mapping <- aes(
      ymin = .data$low_limit,
      ymax = .data$high_limit
    )

    var_format <- 'Eigenvalue'

  }

  # Filter out PCs above pc_max (if supplied).
  if (is.numeric(pc_max)) {
    plot_data <- pca_test$variance |>
      filter(
        as.numeric(str_sub(.data$PC, start = 3L)) <= pc_max
      )
  } else {
    plot_data <- pca_test$variance
  }

  # Output plot
  plot_data |>
    select(all_of(desired_variables)) |>
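    # Reshape so the lower and upper interval limits for the sampling and null
    # distributions share the 'low_limit' and 'high_limit' columns, with a
    # 'distribution' column recording which interval each row belongs to.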
    pivot_longer(
      cols = contains('low'),
      names_to = "distribution",
      values_to = 'low_limit',
      names_pattern = "_(.*)"
    ) |>
    pivot_longer(
      cols = contains('high'),
      names_to = "distribution_2",
      values_to = 'high_limit',
      names_pattern = "_(.*)"
    ) |>
    filter(
      .data$distribution == .data$distribution_2
    ) |>
    select(-"distribution_2") |>
    mutate(
      distribution = if_else(
        str_detect(.data$distribution, 'null'),
        "Null",
        "Sampling"
      ),
      distribution = factor(.data$distribution, levels = c("Sampling", "Null"))
    ) |>
    ggplot(
      mapping = base_mapping
    ) +
    geom_errorbar(
      mapping = error_mapping,
      width = 0.2
    ) +
    geom_point(colour = "red") +
    scale_x_discrete(guide = guide_axis(angle = 90)) +
    labs(
      title = "Variance Explained by Principal Components",
      colour = "Test Distribution",
      x = "Principal Component",
      y = glue("Variance Explained ({var_format})"),
      caption = glue(
        "Bars indicate {pca_test$variance_confint * 100}% confidence intervals."
      )
    )

}

#' Plot PC index loadings from `pca_test` object.
#'
#' Index loadings (Vieira 2012) are presented with confidence intervals on the
#' sampling distribution generated by bootstrapping and a null distribution
#' generated by permutation.
#'
#' If PCs are unstable, the `filter_boots` option keeps only the bootstrap
#' iterations in which the index loading of the variable with the highest
#' median loading across all iterations is above its `quantile_threshold`
#' quantile (default: 0.25). This helps to reveal reliable relationships
#' between this variable and the other variables in the data set.
#'
#' @param pca_test an object of class pca_test_results generated by `pca_test`.
#' @param pc_no An integer indicating which PC to plot.
#' @param violin If TRUE, violin plots are added for the confidence intervals of
#'   the sampling distribution.
#' @param filter_boots if TRUE, use only the bootstrap iterations in which the
#'   index loading of the variable with the highest median loading is above the
#'   `quantile_threshold` quantile.
#' @param quantile_threshold a real value between 0 and 1, giving the quantile
#'   used as the threshold when filtering bootstrap iterations. The default is
#'   0.25.
#' @return `ggplot` object.
#' @importFrom dplyr if_else lead rename mutate arrange filter select group_by
#'   ungroup left_join pull
#' @importFrom stringr str_sub str_detect
#' @importFrom tidyr pivot_longer
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot geom_errorbar geom_point geom_text geom_violin
#'   scale_x_discrete scale_colour_manual aes guide_axis labs
#' @importFrom tidyselect all_of contains
#' @importFrom glue glue
#' @importFrom rlang .data
#' @examples
#'   onze_pca <- pca_test(onze_intercepts |> dplyr::select(-speaker), n = 10)
#'   # Plot PC1
#'   plot_loadings(onze_pca, pc_no=1)
#'   # Plot PC2 with violins (not particularly useful in this case!)
#'   plot_loadings(onze_pca, pc_no=2, violin = TRUE)
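#'   # Keep only stable bootstrap iterations (see `filter_boots`)
#'   plot_loadings(onze_pca, pc_no=2, filter_boots = TRUE, quantile_threshold = 0.5)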
#' @references
#'   Vieira, Vasco (2012): Permutation tests to estimate significances on
#'   Principal Components Analysis. _Computational Ecology and Software_ 2.
#'   103–123.
#' @export
plot_loadings <- function(
    pca_test, pc_no = 1, violin=FALSE, filter_boots = FALSE,
    quantile_threshold = 0.25
) {

  stopifnot(
    "Data must come from an object of class pca_test_results" =
      class(pca_test) == "pca_test_results",
    "pc_no must be a number." =
      is.numeric(pc_no),
    "violin must be either TRUE or FALSE." =
      is.logical(violin),
    "filter_boots must be either TRUE or FALSE." =
      is.logical(filter_boots)
  )

  if (filter_boots) {
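    # For each variable, find its median index loading and its
    # `quantile_threshold` quantile across bootstrap iterations, then keep only
    # the iterations in which the variable with the highest median loading
    # reaches that quantile. Permuted (null) iterations are always kept.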

    plot_data <- pca_test$raw_data |>
      filter(
        as.numeric(str_sub(.data$PC, start = 3L)) == pc_no,
        .data$source != "original"
      ) |>
      group_by(.data$source, .data$variable) |>
      mutate(
        median_index = stats::median(.data$index_loading),
        quant_threshold = stats::quantile(
          .data$index_loading,
          quantile_threshold
        )
      ) |>
      ungroup() |>
      mutate(
        largest_loading = .data$median_index == base::max(.data$median_index),
      ) |>
      group_by(.data$source, .data$iteration) |>
      filter(
        .data$source != "bootstrapped" |
        any(
          .data$largest_loading & .data$index_loading >= .data$quant_threshold
        )
      ) |>
      group_by(.data$source, .data$variable) |>
      mutate(
        low_limit = stats::quantile(
          .data$index_loading,
          (1 - pca_test$loadings_confint)/2
        ),
        high_limit = stats::quantile(
          .data$index_loading,
          1 - (1 - pca_test$loadings_confint)/2
        )
      ) |>
      ungroup() |>
      mutate(
        distribution = if_else(
          .data$source == "bootstrapped",
          "Sampling",
          "Null"
        )
      ) |>
      select(-"index_loading", -"loading") |>
      left_join(
        pca_test$loadings |> select(
          "PC",
          "variable",
          "index_loading",
          "loading"
        ),
        by = c("PC", "variable")
      )

    # Calculate count of kept iterations for subtitle
    kept_iterations <- plot_data |>
      filter(
        .data$distribution == "Sampling"
      ) |>
      pull(.data$iteration) |>
      base::unique() |>
      base::length()

    subtitle <- glue(
      "Filtered Bootstrap Sampling ({kept_iterations} Iterations) ",
      "and Permutation-Based Null Distributions"
    )

  } else {

    plot_data <- pca_test$loadings |>
      # Filter so we only have data from desired PC.
      filter(
        as.numeric(str_sub(.data$PC, start = 3L)) == pc_no
      ) |>
      # Reshape so that both permutation and bootstrapped limits are on same
      # variables.
      pivot_longer(
        cols = contains('low'),
        names_to = "distribution",
        values_to = 'low_limit',
        names_pattern = "_(.*)"
      ) |>
      pivot_longer(
        cols = contains('high'),
        names_to = "distribution_2",
        values_to = 'high_limit',
        names_pattern = "_(.*)"
      ) |>
      filter(
        .data$distribution == .data$distribution_2
      ) |>
      select(-"distribution_2") |>
      mutate(
        distribution = if_else(
          str_detect(.data$distribution, 'null'),
          "Null",
          "Sampling"
        )
      )

    subtitle <- glue(
      "Bootstrapped Sampling and Permutation-Based Null Distributions"
    )

  }

  plot_data <- plot_data |>
    mutate(
      # Reorder the 'variable' column so variables are plotted in ascending
      # order by loading.
      variable = fct_reorder(.data$variable, .data$index_loading),
      loading_sign = if_else(.data$loading < 0, "-", "+")
    )

  if (violin) {
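    # Build the full bootstrapped distribution of index loadings for the violin
    # layer, ordered by median loading so the violins line up with the error
    # bars for each variable.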

    violin_data <- pca_test$raw_data |>
      filter(
        source == "bootstrapped",
        as.numeric(str_sub(.data$PC, start = 3L)) == pc_no
      ) |>
      mutate(
        distribution = "Sampling"
      ) |>
      select(
        "distribution",
        "variable",
        "index_loading",
        "loading",
        "iteration"
      ) |>
      # Reorder the 'variable' column so variables are plotted in ascending
      # order by median bootstrapped index loading.
      group_by(.data$variable) |>
      mutate(
        median_index = stats::median(.data$index_loading)
      ) |>
      ungroup() |>
      mutate(
        variable = fct_reorder(.data$variable, .data$median_index)
      )

    if (filter_boots) {
      violin_data <- violin_data |>
        group_by(.data$variable) |>
        mutate(
          quant_threshold = stats::quantile(.data$index_loading, quantile_threshold)
        ) |>
        ungroup() |>
        mutate(
          largest_loading = .data$median_index == max(.data$median_index)
        ) |>
        group_by(.data$iteration) |>
        filter(
          any(.data$largest_loading & .data$index_loading >= .data$quant_threshold)
        )

      kept_iterations <- base::length(base::unique(violin_data$iteration))

    }

    violin_element <- geom_violin(
        data = violin_data,
        alpha = 0.5
      )

  } else {

    violin_element <- NULL

  }

  out_plot <- plot_data |>
    ggplot(
      aes(
        x = .data$variable,
        y = .data$index_loading,
        colour = .data$distribution
      )
    ) +
    violin_element +
    geom_errorbar(
      aes(
        ymin = .data$low_limit,
        ymax = .data$high_limit
      )
    ) +
    # geom_point(colour = "red") +
    geom_text(aes(label = .data$loading_sign), size = 8, colour = "black") +
    scale_x_discrete(guide = guide_axis(angle = 90)) +
    scale_colour_manual(
      values = c("Sampling" = "#F8766D", "Null" = "#00BFC4")
    ) +
    labs(
      title = glue("Index Loadings for PC{pc_no}"),
      subtitle = subtitle,
      colour = "Distribution",
      y = "Index Loading",
      x = "Variable"
    )

  out_plot

}
