R/pca.R

Defines functions pca_plot

Documented in pca_plot

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2021  Kevin Lu
# Parts modified from EpigenCentral

#' Generate principal component analysis plots from normalized methylation beta values.
#' By default, this will do this for all CpGs, however, you can pass in a data frame
#' of specific ones of interest to only consider differentially methylated CpGs.
#'
#' @param grset minfi GenomicRatioSet containing beta values
#' @param df_dmps optional data frame of CpGs of interest
#'
#' @examples
#' \dontrun{
#' grset <- read_geo_tsv("extdata/GSE55491/GSE55491_series_matrix.txt")
#' pca_plot(grset)
#' }
#' @references
#' H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag
#' New York, 2016.
#'
#' @return ggplot2 PCA plot
#' @export
pca_plot <- function(grset, df_dmps = NULL) {
  if (is.null(df_dmps)) {
    betas <- minfi::getBeta(grset)
  } else {
    betas <- minfi::getBeta(grset)[df_dmps$CpG.Site,]
  }

  # Perform the principal component analysis
  epsilon = 0.0001
  betas[betas < epsilon] <- epsilon
  betas[betas > 1 - epsilon] <- 1 - epsilon
  x <- scale(t(betas))
  x[is.nan(x)] <- 0
  pca <- stats::prcomp(x[, matrixStats::colSds(x, na.rm = T) > 0])

  plot_df <- as.data.frame(pca$x)

  # Centres the plot on the data points while keeping the plot square
  limits_x <- c(min(plot_df$PC1), max(plot_df$PC1))
  limits_y <- c(min(plot_df$PC2), max(plot_df$PC2))
  width_x <- limits_x[2] - limits_x[1]
  width_y <- limits_y[2] - limits_y[1]
  padding <- abs(width_y - width_x) / 2
  if (width_y > width_x) {
    limits_x <- limits_x + c(-padding, padding)
  } else {
    limits_y <- limits_y + c(-padding, padding)
  }

  # Create the centred PCA plot
  if (is.null(grset$Sample_Group)) {
    # No sample sheet to discriminate
    aesthetic_map <- ggplot2::aes(plot_df$PC1, plot_df$PC2)
  } else {
    plot_df$Groups <- grset$Sample_Group
    aesthetic_map <- ggplot2::aes(plot_df$PC1, plot_df$PC2, colour = plot_df$Groups)
  }
  ggplot2::ggplot(plot_df, x = plot_df$PC1, y = plot_df$PC2, aesthetic_map) +
    ggplot2::geom_point() +
    ggplot2::coord_fixed(ratio = 1, xlim = limits_x, ylim = limits_y)
  # Show sample labels if there aren't too many
  if (length(Biobase::pData(grset)$Sample_Name) < 50) {
    ggplot2::last_plot() +
      ggplot2::geom_text(
        label = rownames(plot_df),
        size = 1.9,
        nudge_y = (limits_y[2] - limits_y[1]) / 100)
  }
  ggplot2::last_plot()
}

# [END]
kevinlul/EpigeneLite documentation built on Dec. 21, 2021, 6:35 a.m.