R/uplot_pca.R

Defines functions uplot_pca

Documented in uplot_pca

#' Principal Component Analysis (PCA) Plotting
#'
#' @title uplot_pca
#' @name uplot_pca
#' @family plots
#' @description
#' This function performs Principal Component Analysis (PCA) on a dataset, and visualizes the results in various ways, including a scatter plot of the first two principal components (PC1 vs PC2) and a Van Krevelen plot projected using PC1 values. The PCA is performed on the molecular formula data, aggregated by a grouping variable, and handles cases where columns exhibit zero variance (which cannot be included in PCA).
#' @inheritParams main_docu
#' @import data.table stats ggplot2
# @importFrom ggplot2 scale_color_identity scale_color_viridis_d geom_hline ggplot geom_bar geom_histogram coord_cartesian geom_point scale_color_gradientn labs theme_minimal theme annotation_custom
#' @importFrom graphics text
#'
#' @return A list containing:
#' \item{pca}{The PCA model object (class `prcomp`).}
#' \item{t_score}{A data table of PCA scores (principal component values for each sample).}
#' \item{fig_vk}{A Van Krevelen plot projected with PC1 values.}
#' \item{fig_pca}{A scatter plot of the first two principal components (PC1 vs PC2).}
#' \item{mfd}{The input data table, augmented with principal component values.}
#'
# @examples
# # Example usage assuming mfd is a data table with appropriate columns:
# pca_results <- uplot_pca(mfd = mf_data_demo, grp = "file_id", int_col = "norm_int")
#'
#' @note The function uses `prcomp` for PCA and `uplot_vk` for the Van Krevelen plot.
#'
#' @seealso \code{\link{uplot_vk}} for the Van Krevelen plot function.
#' @export

uplot_pca <- function(mfd, grp, int_col = "norm_int",
                      palname = "viridis",
                      col_bar = TRUE,
                      ...) {

  files <- PC1 <- NULL

  # --- Input checks ---------------------------------------------------------
  if (!"mf" %in% names(mfd)) {
    stop("Missing required column: 'mf' (molecular formula).")
  }
  if (!int_col %in% names(mfd)) {
    stop("Column not found: ", int_col)
  }
  if (!grp %in% names(mfd)) {
    stop("Grouping column not found: ", grp)
  }

  # --- Pivot table: samples × molecular formulas ----------------------------
  df_pivot <- dcast(
    mfd,
    formula = get(grp) ~ mf,
    value.var = int_col,
    fun.aggregate = mean,
    fill = 0
  )

  df_pivot <- data.frame(df_pivot)
  rownames(df_pivot) <- df_pivot[[1]]
  df_pivot[[1]] <- NULL

  # --- Remove zero-variance columns -----------------------------------------
  zero_var <- sapply(df_pivot, function(x) var(x) == 0)
  if (any(zero_var)) {
    df_pivot <- df_pivot[, !zero_var, drop = FALSE]
  }

  # --- PCA -------------------------------------------------------------------
  pca <- stats::prcomp(df_pivot, scale. = TRUE)

  t_score <- as.data.table(pca$x)
  t_score[, files := rownames(pca$x)]

  # --- Prepare axis labels ---------------------------------------------------
  pc1_var <- pca$sdev[1]^2 / sum(pca$sdev^2) * 100
  pc2_var <- pca$sdev[2]^2 / sum(pca$sdev^2) * 100

  xlab <- sprintf("PC1 (%.3g%%)", pc1_var)
  ylab <- sprintf("PC2 (%.3g%%)", pc2_var)

  # --- Plotly PCA score plot -------------------------------------------------
  x_min <- min(t_score$PC1)
  x_max <- max(t_score$PC1) * 1.5

  fig_pca <- plotly::plot_ly(
    data = t_score,
    x = ~PC1,
    y = ~PC2,
    type = "scatter",
    mode = "markers+text",
    text = ~files,
    textposition = "right",
    textfont = list(color = "black"),
    marker = list(size = 8)
  ) |>
    layout(
      xaxis = list(title = xlab, range = c(x_min, x_max)),
      yaxis = list(title = ylab),
      title = ""
    )

  # --- Prepare PCA loadings for VK plot -------------------------------------
  loadings <- as.data.table(pca$rotation)
  loadings[, mf := rownames(pca$rotation)]

  # Merge PC1 loading onto original mfd
  mfd2 <- merge(mfd, loadings[, .(mf, PC1)], by = "mf", all.x = TRUE)
  mfd2 <- mfd2[!is.na(PC1)]

  # --- Van Krevelen plot projected by PC1 -----------------------------------
  fig_vk <- uplot_vk(
    mfd = mfd2,
    z_var = "PC1",
    palname = palname,
    col_bar = col_bar,
    main = "Van Krevelen Diagram (colored by PC1)"
  )

  # --- Return results --------------------------------------------------------
  list(
    pca = pca,
    t_score = t_score,
    fig_vk = fig_vk,
    fig_pca = fig_pca,
    mfd = mfd2
  )
}

Try the ume package in your browser

Any scripts or data that you put into this service are public.

ume documentation built on Dec. 13, 2025, 1:06 a.m.