R/pca_plot.R

Defines functions olink_pca_plot.internal olink_calculate_pca olink_pca_plot

Documented in olink_pca_plot

#' Function to plot a PCA of the data
#'
#' @description
#' Generates a PCA projection of all samples from NPX data along two
#' principal components (default PC2 vs. PC1) including the explained
#' variance and dots colored by QC_Warning using
#' \code{stats::prcomp} and \code{ggplot2::ggplot}.
#'
#' The values are by default scaled and centered in the PCA and
#' proteins with missing NPX values are by default removed from the
#' corresponding assay.
#'
#' Unique sample names are required.
#'
#' Imputation by the median is done for assays with missingness <10\%
#' for multi-plate projects and <5\% for single plate projects.
#'
#' The plot is printed, and a list of ggplot objects is returned.
#' If byPanel = TRUE, the data processing (imputation of missing
#' values etc) and subsequent PCA is performed separately per panel.
#' A faceted plot is printed, while the individual ggplot objects
#' are returned.
#'
#' The arguments outlierDefX and outlierDefY can be used to identify
#' outliers in the PCA. Samples more than +/- outlierDefX and
#' outlierDefY standard deviations from the mean of the plotted PC
#' will be labelled. Both arguments have to be specified.
#'
#' @param df data frame in long format with Sample Id, NPX and column
#' of choice for colors.
#' @param check_log A named list returned by [`check_npx()`]. If `NULL`,
#' [`check_npx()`] will be run internally using `df`.
#' @param color_g Character value indicating which column to use for
#' colors (default QC_Warning). Continuous color scale for Olink(R)
#' Sample Index (OSI) columns OSITimeToCentrifugation,
#' OSIPreparationTemperature and OSISummary is also supported.
#' @param x_val Integer indicating which principal component to plot
#' along the x-axis (default 1)
#' @param y_val Integer indicating which principal component to plot
#' along the y-axis (default 2)
#' @param label_samples Logical. If TRUE, points are replaced with
#' SampleID (default FALSE)
#' @param drop_assays Logical. All assays with any missing values will
#' be dropped. Takes precedence over sample drop.
#' @param drop_samples Logical. All samples with any missing values
#' will be dropped.
#' @param n_loadings Integer. Will plot the top n_loadings based on
#' size.
#' @param loadings_list Character vector indicating for which
#' OlinkID's to plot as loadings. It is possible to use n_loadings
#' and loadings_list simultaneously.
#' @param byPanel Perform the PCA per panel (default FALSE)
#' @param outlierDefX The number standard deviations along the PC
#' plotted on the x-axis that defines an outlier. See also 'Details"
#' @param outlierDefY The number standard deviations along the PC
#' plotted on the y-axis that defines an outlier. See also 'Details"
#' @param outlierLines Draw dashed lines at +/- outlierDefX and
#' outlierDefY standard deviations from the mean of the plotted PCs
#' (default FALSE)
#' @param label_outliers Use ggrepel to label samples lying outside
#' the limits set by the outlierLines (default TRUE)
#' @param quiet Logical. If TRUE, the resulting plot is not printed
#' @param verbose Logical. Whether warnings about the number of
#' samples and/or assays dropped or imputed should be printed to
#' the console.
#' @param ... coloroption passed to specify color order.
#'
#' @return A list of objects of class "ggplot", each plot contains
#' scatter plot of PCs
#'
#' @keywords NPX PCA
#'
#' @export
#' @examples
#' \donttest{
#' if (rlang::is_installed(pkg = c("ggrepel", "ggpubr"))) {
#'   npx_data <- npx_data1 |>
#'     dplyr::filter(
#'       !grepl(pattern = "CONTROL",
#'              x = .data[["SampleID"]],
#'              ignore.case = TRUE)
#'     )
#'
#'   check_log <- check_npx(npx_data)
#'
#'   # PCA using all the data
#'   OlinkAnalyze::olink_pca_plot(
#'     df = npx_data,
#'     check_log = check_log,
#'     color_g = "QC_Warning"
#'   )
#'
#'   # PCA per panel
#'   g <- OlinkAnalyze::olink_pca_plot(
#'     df = npx_data,
#'     check_log = check_log,
#'     color_g = "QC_Warning",
#'     byPanel = TRUE
#'   )
#'   g[[2L]] # Plot only the second panel
#'
#'   # Label outliers
#'   OlinkAnalyze::olink_pca_plot(
#'     df = npx_data,
#'     check_log = check_log,
#'     color_g = "QC_Warning",
#'     outlierDefX = 2L,
#'     outlierDefY = 4L
#'    ) # All data
#'
#'   OlinkAnalyze::olink_pca_plot(
#'     df = npx_data,
#'     check_log = check_log,
#'     color_g = "QC_Warning",
#'     outlierDefX = 2.5,
#'     outlierDefY = 4L,
#'     byPanel = TRUE
#'   ) # Per panel
#'
#'   # Retrieve the outliers
#'   g <- OlinkAnalyze::olink_pca_plot(
#'     df = npx_data,
#'     check_log = check_log,
#'     color_g = "QC_Warning",
#'     outlierDefX = 2.5,
#'     outlierDefY = 4L,
#'     byPanel = TRUE
#'   )
#'   outliers <- lapply(g, function(x) {
#'     return(x$data)
#'   }) |>
#'   dplyr::bind_rows() |>
#'   dplyr::filter(
#'     .data[["Outlier"]] == 1L
#'   )
#' }
#' }
#'
olink_pca_plot <- function(df,
                           check_log = NULL,
                           color_g = "QC_Warning",
                           x_val = 1L,
                           y_val = 2L,
                           label_samples = FALSE,
                           drop_assays = FALSE,
                           drop_samples = FALSE,
                           n_loadings = 0,
                           loadings_list = NULL,
                           byPanel = FALSE, # nolint: object_name_linter
                           outlierDefX = NA, # nolint: object_name_linter
                           outlierDefY = NA, # nolint: object_name_linter
                           outlierLines = FALSE, # nolint: object_name_linter
                           label_outliers = TRUE,
                           quiet = FALSE,
                           verbose = TRUE,
                           ...) {
  # checking ellipsis
  if (length(list(...)) > 0L) {
    ellipsis_variables <- names(list(...))

    if (length(ellipsis_variables) == 1L) {
      if (!(ellipsis_variables == "coloroption")) {
        cli::cli_abort(
          "The ... option only takes the coloroption argument. ... currently
          contains the variable {ellipsis_variables}."
        )
      }
    } else {
      cli::cli_abort(
        "The ... option only takes the coloroption argument. ... currently
        contains the variables {paste(ellipsis_variables, collapse = ', ')}."
      )
    }
  }

  # Check if check_log is correct
  check_log <- run_check_npx(df = df, check_log = check_log)

  # other checks
  check_is_dataset(x = df, error = TRUE)
  check_is_scalar_character(x = color_g, error = TRUE)
  check_is_scalar_boolean(x = label_samples, error = TRUE)
  check_is_scalar_boolean(x = drop_assays, error = TRUE)
  check_is_scalar_boolean(x = drop_samples, error = TRUE)
  check_is_scalar_boolean(x = byPanel, error = TRUE)
  check_is_scalar_boolean(x = outlierLines, error = TRUE)
  check_is_scalar_boolean(x = label_outliers, error = TRUE)
  check_is_scalar_boolean(x = quiet, error = TRUE)
  check_is_scalar_boolean(x = verbose, error = TRUE)

  # Stop if duplicate sample ID's detected
  if (length(check_log$sample_id_dups) > 0L) {
    cli::cli_abort(
      "Duplicate SampleID(s) detected:
      {paste(check_log$sample_id_dups, collapse = ', ')}.
      Each sample ID must be unique. Please check your data and ensure that
      each sample has a unique identifier."
    )
  }

  # Remove invalid OlinkID, assays with all NA values, and convert non-unique
  # Uniprot IDs. Note that we do not remove samples with duplicate SampleID,
  # control samples or assays, or samples/assays with QC warnings, as this
  # would be the user's decision.
  df <- run_clean_npx(
    df = df,
    check_log = check_log,
    remove_assay_na = TRUE,
    remove_invalid_oid = TRUE,
    remove_dup_sample_id = FALSE,
    remove_control_assay = FALSE,
    remove_control_sample = FALSE,
    remove_qc_warning = FALSE,
    remove_assay_warning = FALSE,
    convert_nonunique_uniprot = TRUE,
    out_df = "tibble",
    verbose = FALSE
  )

  # OSI checks - ran only if OSI columns selected to color
  osi_cat_cols <- c("OSICategory")
  osi_cont_cols <- c(
    "OSITimeToCentrifugation",
    "OSIPreparationTemperature",
    "OSISummary"
  )

  if (color_g %in% c(osi_cat_cols, osi_cont_cols)) {
    # Check for invalid values and NA columns
    df <- check_osi(
      df = df,
      check_log = check_log,
      osi_score = color_g
    )
  }

  # Check that the user didn't specify just one of outlierDefX and outlierDefY
  if (sum(c(is.numeric(outlierDefX), is.numeric(outlierDefY))) == 1L) {
    cli::cli_abort(
      "To label outliers, both outlierDefX and outlierDefY have to be specified
      as numerical values."
    )
  }

  # If outlierLines == TRUE, both outlierDefX and outlierDefY have to be
  # specified
  if (outlierLines) {
    if (!all(is.numeric(outlierDefX), is.numeric(outlierDefY))) {
      cli::cli_abort(
        "outlierLines requested but boundaries not specified. To draw lines,
        both outlierDefX and outlierDefY have to be specified as numerical
        values"
      )
    }
  }

  if (byPanel) {
    # Keep continuous OSI cols unchanged; only convert other color_g to factor
    if (!color_g %in% osi_cont_cols) {
      if (!is.factor(df[[color_g]])) {
        df[[color_g]] <- as.factor(df[[color_g]])
      }
    }

    df <- df |>
      dplyr::mutate(
        Panel = stringr::str_replace(
          .data[[check_log$col_names$panel]], "Olink ", ""
        )
      ) # Strip "Olink" from the panel names

    plotList <- lapply(unique(df[["Panel"]]), function(x) { # nolint: object_name_linter
      g <- olink_pca_plot.internal(
        df = df |> dplyr::filter(.data[["Panel"]] == x),
        check_log = check_log,
        color_g = color_g,
        x_val = x_val,
        y_val = y_val,
        label_samples = label_samples,
        drop_assays = drop_assays,
        drop_samples = drop_samples,
        n_loadings = n_loadings,
        loadings_list = loadings_list,
        outlierDefX = outlierDefX,
        outlierDefY = outlierDefY,
        outlierLines = outlierLines,
        label_outliers = label_outliers,
        verbose = verbose,
        ...
      ) +
        ggplot2::labs(title = x)

      # Add Panel info inside the ggplot object
      g$data <- g$data |>
        dplyr::mutate(Panel = .env[["x"]])

      return(g)
    })
    names(plotList) <- unique(df[["Panel"]]) # nolint: object_name_linter
    if (quiet == FALSE) {
      print(
        ggpubr::ggarrange(
          plotlist = plotList,
          common.legend = TRUE
        )
      )
    }
  } else {
    pca_plot <- olink_pca_plot.internal(
      df = df,
      check_log = check_log,
      color_g = color_g,
      x_val = x_val,
      y_val = y_val,
      label_samples = label_samples,
      drop_assays = drop_assays,
      drop_samples = drop_samples,
      n_loadings = n_loadings,
      loadings_list = loadings_list,
      outlierDefX = outlierDefX,
      outlierDefY = outlierDefY,
      outlierLines = outlierLines,
      label_outliers = label_outliers,
      verbose = verbose,
      ...
    )
    if (quiet == FALSE) {
      print(pca_plot)
    }
    # For consistency, return a list even when there's just one plot
    plotList <- list(pca_plot) # nolint: object_name_linter
  }
  return(invisible(plotList))
}

olink_calculate_pca <- function(procData, # nolint: object_name_linter
                                x_val = 1L,
                                y_val = 2L,
                                outlierDefX = NA, # nolint: object_name_linter
                                outlierDefY = NA ) { # nolint: object_name_linter

  #### PCA ####
  pca_fit <- stats::prcomp(
    procData$df_wide_matrix,
    scale. = TRUE,
    center = TRUE
  )

  # Standardizing and selecting components
  scaling_factor_lambda <- pca_fit$sdev * sqrt(nrow(procData$df_wide_matrix))

  PCX <- pca_fit$x[, x_val] / scaling_factor_lambda[x_val] # nolint: object_name_linter
  PCY <- pca_fit$x[, y_val] / scaling_factor_lambda[y_val] # nolint: object_name_linter
  PoV <- pca_fit$sdev^2L / sum(pca_fit$sdev^2L) # nolint: object_name_linter
  LX <- pca_fit$rotation[, x_val] # nolint: object_name_linter
  LY <- pca_fit$rotation[, y_val] # nolint: object_name_linter


  # Sort order is dependent on locale -> set locale here to make code
  # deterministic
  old_collate <- Sys.getlocale("LC_COLLATE")
  Sys.setlocale("LC_COLLATE", "C")

  scores <- tibble::tibble(
    SampleID = rownames(pca_fit$x),
    PCX = PCX,
    PCY = PCY
  ) |>
    dplyr::arrange(.data[["SampleID"]], .locale = "C") |>
    tibble::column_to_rownames(var = "SampleID")

  loadings <- tibble::tibble(
    variables = rownames(pca_fit$rotation),
    LX = LX,
    LY = LY
  )

  Sys.setlocale("LC_COLLATE", old_collate)

  range_PX <- c(-abs(min(PCX, na.rm = TRUE)), abs(max(PCX, na.rm = TRUE))) # nolint: object_name_linter
  range_PY <- c(-abs(min(PCY, na.rm = TRUE)), abs(max(PCY, na.rm = TRUE))) # nolint: object_name_linter
  range_LX <- c(-abs(min(LX, na.rm = TRUE)), abs(max(LX, na.rm = TRUE))) # nolint: object_name_linter
  range_LY <- c(-abs(min(LY, na.rm = TRUE)), abs(max(LY, na.rm = TRUE))) # nolint: object_name_linter

  loadings_scaling_factor <- 0.8 / max(range_LX / range_PX, range_LY / range_PY)

  # Identify outliers
  if (!is.na(outlierDefX) && !is.na(outlierDefY)) {
    scores <- scores |>
      tibble::rownames_to_column(var = "SampleID") |>
      dplyr::mutate(
        PCX_low = mean(.data[["PCX"]], na.rm = TRUE) -
          outlierDefX * stats::sd(.data[["PCX"]], na.rm = TRUE),
        PCX_high = mean(.data[["PCX"]], na.rm = TRUE) +
          outlierDefX * stats::sd(.data[["PCX"]], na.rm = TRUE),
        PCY_low = mean(.data[["PCY"]], na.rm = TRUE) -
          outlierDefY * stats::sd(.data[["PCY"]], na.rm = TRUE),
        PCY_high = mean(.data[["PCY"]], na.rm = TRUE) +
          outlierDefY * stats::sd(.data[["PCY"]], na.rm = TRUE)
      ) |>
      dplyr::mutate(
        Outlier = dplyr::if_else(
          .data[["PCX"]] < .data[["PCX_high"]] &
            .data[["PCX"]] > .data[["PCX_low"]] &
            .data[["PCY"]] > .data[["PCY_low"]] &
            .data[["PCY"]] < .data[["PCY_high"]],
          0L,
          1L
        )
      )
  }

  return(
    list(
      scores = scores,
      loading = loadings,
      loadings_scaling_factor = loadings_scaling_factor,
      PoV = PoV
    )
  )
}

olink_pca_plot.internal <- function(df, # nolint: object_name_linter
                                    check_log = NULL,
                                    color_g = "QC_Warning",
                                    x_val = 1L,
                                    y_val = 2L,
                                    label_samples = FALSE,
                                    drop_assays = FALSE,
                                    drop_samples = FALSE,
                                    n_loadings = 0,
                                    loadings_list = NULL,
                                    outlierDefX, # nolint: object_name_linter
                                    outlierDefY, # nolint: object_name_linter
                                    outlierLines, # nolint: object_name_linter
                                    label_outliers,
                                    verbose = verbose,
                                    ...) {
  # Check if check_log is correct
  check_log <- run_check_npx(df = df, check_log = check_log)

  # Ensure one unique color value per SampleID (required by
  # npxProcessing_forDimRed)
  df <- df |>
    dplyr::group_by(.data[["SampleID"]]) |>
    dplyr::mutate(
      !!rlang::sym(color_g) := dplyr::first(stats::na.omit(.data[[color_g]]))
    ) |>
    dplyr::ungroup()

  # Data pre-processing
  procData <- npxProcessing_forDimRed( # nolint: object_name_linter
    df = df,
    check_log = check_log,
    color_g = color_g,
    drop_assays = drop_assays,
    drop_samples = drop_samples,
    verbose = verbose
  )

  # Did we drop any of the of the assays specified in the loadings_list?
  if (!is.null(loadings_list)) {
    # Dropped because of NAs
    dropped_loadings <- intersect(
      procData$dropped_assays.na,
      loadings_list
    )

    if (length(dropped_loadings) > 0L) {
      if (verbose) {
        cli::cli_warn(
          "The loading(s) {paste0(dropped_loadings, collapse = ', ')} from the
          loadings_list contain NA and are dropped."
        )
      }

      loadings_list <- setdiff(loadings_list, dropped_loadings)
    }


    # Dropped because of to high missingness
    dropped_loadings <- intersect(
      procData$dropped_assays.missingness,
      loadings_list
    )

    if (length(dropped_loadings) > 0L) {
      if (verbose) {
        cli::cli_warn(
          "The loading(s) {paste0(dropped_loadings, collapse = ', ')} from the
          loadings_list are dropped due to high missingness."
        )
      }

      loadings_list <- setdiff(loadings_list, dropped_loadings)
    }

    if (length(loadings_list) == 0L) {
      loadings_list <- NULL
    }
  }

  #### PCA ####
  pca_results <- olink_calculate_pca(
    procData = procData,
    x_val = x_val,
    y_val = y_val,
    outlierDefX = outlierDefX,
    outlierDefY = outlierDefY
  )

  scores <- pca_results$scores

  if (!"SampleID" %in% colnames(scores)) {
    scores <- scores |>
      dplyr::mutate(SampleID = rownames(scores))
  }

  if (is.numeric(scores[["SampleID"]])) {
    message("SampleID converted to character.")
    scores[["SampleID"]] <- as.character(scores[["SampleID"]])
  }
  if (is.numeric(procData$df_wide[["SampleID"]])) {
    message("SampleID converted to character.")
    procData$df_wide[["SampleID"]] <-  # nolint: object_name_linter
      as.character(procData$df_wide[["SampleID"]])
  }

  scores <- scores |>
    dplyr::left_join(
      procData$df_wide[, c("SampleID", "colors")],
      by = c("SampleID")
    )

  loadings <- pca_results$loading
  loadings_scaling_factor <- pca_results$loadings_scaling_factor
  PoV <- pca_results[["PoV"]] # nolint: object_name_linter

  # Plotting
  pca_plot <- ggplot2::ggplot(
    data = scores,
    mapping = ggplot2::aes(
      x = .data[["PCX"]],
      y = .data[["PCY"]]
    )
  ) +
    ggplot2::xlab(
      paste0("PC", x_val, " (", round(x = PoV[x_val] * 100L, digits = 2L), "%)")
    ) +
    ggplot2::ylab(
      paste0("PC", y_val, " (", round(x = PoV[y_val] * 100L, digits = 2L), "%)")
    )

  # Drawing scores
  if (label_samples) {
    pca_plot <- pca_plot +
      ggplot2::geom_text(
        mapping = ggplot2::aes(
          label = .data[["SampleID"]],
          color = .data[["colors"]]
        ),
        size = 3L
      ) +
      ggplot2::labs(
        color = color_g
      ) +
      ggplot2::guides(
        size = "none"
      )
  } else {
    pca_plot <- pca_plot +
      ggplot2::geom_point(
        mapping = ggplot2::aes(
          color = .data[["colors"]]
        ),
        size = 2.
      ) +
      ggplot2::labs(
        color = color_g
      ) +
      ggplot2::guides(
        size = "none"
      )
  }

  # Continuous color scale for OSI columns
  osi_cont_cols <- c(
    "OSITimeToCentrifugation",
    "OSIPreparationTemperature",
    "OSISummary"
  )

  if (color_g %in% osi_cont_cols) {
    # Ensure numeric for continuous scale (handles character/factor inputs)
    if (is.factor(scores[["colors"]])) {
      scores[["colors"]] <- as.character(scores[["colors"]])
    }
    scores[["colors"]] <- suppressWarnings(as.numeric(scores[["colors"]]))

    pca_plot <- pca_plot +
      ggplot2::scale_color_gradient(
        low = "#FFB200FF",
        high = "#332D56FF",
        limits = c(0L, 1L),
        breaks = seq(from = 0L, to = 1L, by = 0.25),
        oob = scales::squish
      )
  }

  # Drawing loadings

  if (n_loadings > 0 || !is.null(loadings_list)) {
    N_loadings <- data.frame( # nolint: object_name_linter
      matrix(vector(), 0, ncol(loadings)),
      stringsAsFactors = FALSE
    )
    colnames(N_loadings) <- colnames(loadings) # nolint: object_name_linter

    L_loadings <- N_loadings # nolint: object_name_linter

    if (n_loadings > 0) {
      # Largest loadings based on Pythagoras

      N_loadings <- loadings |> # nolint: object_name_linter
        dplyr::mutate(
          abs_loading = sqrt(.data[["LX"]]^2L + .data[["LY"]]^2L)
        ) |>
        dplyr::arrange(
          dplyr::desc(.data[["abs_loading"]])
        ) |>
        utils::head(n_loadings) |>
        dplyr::select(
          -dplyr::all_of("abs_loading")
        )
    }

    if (!is.null(loadings_list)) {
      # Selected loadings

      L_loadings <- loadings |> # nolint: object_name_linter
        dplyr::filter(
          .data[["variables"]] %in% .env[["loadings_list"]]
        )
    }

    loadings <- rbind(
      N_loadings,
      L_loadings
    ) |>
      dplyr::distinct()

    pca_plot <- pca_plot +
      ggplot2::geom_segment(
        data = loadings,
        ggplot2::aes(
          x = 0,
          y = 0,
          xend = .data[["LX"]] * loadings_scaling_factor,
          yend = .data[["LY"]] * loadings_scaling_factor
        ),
        arrow = ggplot2::arrow(
          length = grid::unit(1 / 2, "picas")
        ),
        color = "black"
      ) +
      ggrepel::geom_label_repel(
        data = loadings,
        ggplot2::aes(
          x = .data[["LX"]] * loadings_scaling_factor,
          y = .data[["LY"]] * loadings_scaling_factor,
          label = .data[["variables"]]
        ),
        box.padding = 1,
        show.legend = FALSE,
        segment.colour = "gray"
      )
  }

  # Label outliers in figure
  if (!is.na(outlierDefX) && !is.na(outlierDefY) && label_outliers) {
    pca_plot <- pca_plot +
      ggrepel::geom_label_repel(
        data = pca_plot$data |>
          dplyr::mutate(
            SampleIDPlot = dplyr::if_else(
              .data[["Outlier"]] == 1L,
              .data[["SampleID"]],
              ""
            )
          ),
        ggplot2::aes(label = .data[["SampleIDPlot"]]),
        box.padding = 0.5,
        min.segment.length = 0.1,
        show.legend = FALSE,
        size = 3L
      )
  }

  # Add outlier lines
  if (outlierLines) {
    pca_plot <- pca_plot +
      ggplot2::geom_hline(
        ggplot2::aes(
          yintercept = .data[["PCY_low"]]
        ),
        linetype = "dashed",
        color = "grey"
      ) +
      ggplot2::geom_hline(
        ggplot2::aes(
          yintercept = .data[["PCY_high"]]
        ),
        linetype = "dashed",
        color = "grey"
      ) +
      ggplot2::geom_vline(
        ggplot2::aes(
          xintercept = .data[["PCX_low"]]
        ),
        linetype = "dashed",
        color = "grey"
      ) +
      ggplot2::geom_vline(
        ggplot2::aes(
          xintercept = .data[["PCX_high"]]
        ),
        linetype = "dashed",
        color = "grey"
      )
  }

  pca_plot <- pca_plot +
    OlinkAnalyze::set_plot_theme()

  if (!(color_g %in% osi_cont_cols)) {
    pca_plot <- pca_plot +
      OlinkAnalyze::olink_color_discrete(..., drop = FALSE)
  }

  return(pca_plot)
}

Try the OlinkAnalyze package in your browser

Any scripts or data that you put into this service are public.

OlinkAnalyze documentation built on June 24, 2026, 1:06 a.m.