R/RestrictedPCA.R
In MetabolomicsBasics: Basic Functions to Investigate Metabolomics Data Matrices

Documented in RestrictedPCA

#' @title RestrictedPCA.
#' @description \code{RestrictedPCA} combines an ANOVA based on 'fmod' and
#'   restricts a PCA using the ANOVA result as a filter.
#' @details `fmod` should be something like 'GT*TR+Batch' to perform
#'   an ANOVA with these factors defined as columns in sam.
#' @param dat Metabolite matrix (samples x metabolites).
#' @param sam Sample definition dataframe.
#' @param use.sam Numeric index vector (or logical) to select specific samples to be included in the analysis or NULL to include all.
#' @param fmod ANOVA model to calculate before PCA.
#' @param sign.col Which column(s) of the ANOVA result shall be used for P-value filtering (specify column names or leave on NULL to filter on all).
#' @param p.adjust.method Method use to adjust P-values (e.g. none, BH or bonferroni).
#' @param P P-value threshold used as a cutoff after P-value adjustment.
#' @param pcaMethods.scale pcaMethods scale parameter (usually pareto for metabolite data).
#' @param n.metab.min Minimum number of metabolites kept for PCA calculation (even if they exceed P).
#' @param group.col Column used for legend creation (column name from sam).
#' @param text.col Column used for text annotation of data points (column name from sam).
#' @param ... Handed through to \code{\link{PlotMetabolitePCA}}.
#' @return Will generate a PCA plot (generated by \link{PlotMetabolitePCA} internally)
#'   restricted based on an ANOVA result based on \link{MetaboliteANOVA}.
#' @examples
#' # load raw data and sample description
#' raw <- MetabolomicsBasics::raw
#' sam <- MetabolomicsBasics::sam
#' # standard behavior
#' RestrictedPCA(dat = raw, sam = sam, group.col = "GT")
#' \dontrun{
#' # apply multiple testing using a strict P-value cutoff,
#' # dont show a legend but plot group mean values and sd's as overlay
#' RestrictedPCA(
#'   dat = raw, sam = sam, group.col = "GT", p.adjust.method = "BH", P = 10^-10,
#'   fmod = "GT+Batch+Order", sign.col = "GT", medsd = T, legend.x = NULL
#' )
#' # limit to a subset of samples, switching the ANOVA selection of by setting P=1
#' # and adding text (from \code{sam}) to each data point
#' RestrictedPCA(
#'   dat = raw, sam = sam, use.sam = which(sam$GT %in% c("Mo17", "B73")), group.col = "GT",
#'   fmod = "GT+Batch+Order", P = 1, sign.col = "GT", legend.x = NULL, text.col = "Batch"
#' )
#' }
#' @export
#' @importFrom pcaMethods pca
RestrictedPCA <- function(dat = NULL, sam = NULL, use.sam = NULL, group.col = NULL, text.col = NULL, fmod = NULL, sign.col = NULL, p.adjust.method = "none", P = 0.01, pcaMethods.scale = "pareto", n.metab.min = 20, ...) {
  # helper function to identify cols/rows with all NA data
  NAcrs <- function(x) {
    list("r" = unlist(apply(x, 1, function(y) {
      !all(is.na(y))
    })), "c" = unlist(apply(x, 2, function(y) {
      !all(is.na(y))
    })))
  }
  stopifnot(length(group.col %in% colnames(sam)) == 1)
  if (is.null(use.sam)) use.sam <- 1:nrow(dat)
  if (is.factor(use.sam)) use.sam <- which(use.sam)
  stopifnot(length(use.sam) >= 2)
  filt <- NAcrs(dat[use.sam, ])
  dat <- dat[use.sam, ][filt$r, filt$c]
  sam <- as.data.frame(sam[use.sam, ][filt$r, , drop = FALSE]) # convert potential tibble to df
  if (!is.null(text.col) && !is.character(sam[, text.col])) sam[, text.col] <- as.character(sam[, text.col])
  if (!is.null(fmod)) {
    colnames(dat) <- 1:ncol(dat)
    out <- MetaboliteANOVA(dat = dat, sam = sam, model = fmod)
  } else {
    out <- data.frame("none" = rep(0, ncol(dat)))
    fmod <- "none"
  }
  if (is.null(sign.col)) sign.col <- colnames(out)
  if (length(sign.col) >= 2) {
    p_adj <- apply(apply(out[, sign.col], 2, p.adjust, p.adjust.method), 1, function(x) {
      ifelse(all(is.na(x)), NA, min(x, na.rm = T))
    })
  } else {
    p_adj <- p.adjust(out[, sign.col], p.adjust.method)
  }
  if (sum(is.finite(p_adj)) < n.metab.min + 1) {
    P <- 1
  } else {
    P <- ifelse(sort(p_adj)[n.metab.min + 1] >= P, sort(p_adj)[n.metab.min + 1], P)
  }
  use.met <- which(p_adj <= P)
  filt <- NAcrs(dat[, use.met])
  PlotMetabolitePCA(
    pca_res = pcaMethods::pca(dat[, use.met][filt$r, filt$c], scale = pcaMethods.scale, method = "rnipals"),
    sam = sam[, c("cols", "pchs", text.col)][filt$r, , drop = FALSE], g = factor(sam[, group.col][filt$r]),
    comm = c(paste("m=", length(use.met), sep = ""), p.adjust.method, paste("P<=", ifelse(P < 0.01, formatC(P, digits = 2, format = "E"), round(P, 2)), sep = "")),
    text.col = text.col, ...
  )
  invisible(data.frame(out, "selected" = p_adj <= P))
}