rtemis: Machine Learning and Visualization

Documented in massGLM plot.massGLM print.massGLM summary.massGLM

# massGLM.R
# ::rtemis::
# 2021-2 E.D. Gennatas www.lambdamd.org

#' Mass-univariate GLM Analysis
#'
#' Run a mass-univariate analysis with either:
#' a) single outome (y) and multiple predictors (x), one at a time, with an
#' optional common set of covariates in each model - "massx"
#' b) multiple different outcomes (y) with a fixed set of predictors (x) - "massy"
#' Therefore, the term mass-univariate refers to looking at one variable of
#' interest (with potential covariates of no interest) at a time
#'
#' @param x Matrix / data frame of features
#' @param y Matrix / data frame of outcomes
#' @param scale.x Logical: If TRUE, scale and center `x`
#' @param scale.y Logical: If TRUE, scale and center `y`
#' @param type Character: "massx" or "massy". Default = NULL,
#' where if (NCOL(x) > NCOL(y)) "massx" else "massy"
#' @param xnames Character vector: names of `x` feature(s)
#' @param ynames Character vector: names of `y` feature(s)
#' @param coerce.y.numeric Logical: If `TRUE`, coerce `y` to numeric
#' @param save.mods Logical: If TRUE, save models.
# @param p.adjust.method Character: p-value adjustment method. See \code{p.adjust}.
#' @param print.plot Logical: If TRUE, print plot.
#' @param include_anova_pvals Logical: If TRUE, include ANOVA p-values,
#' (generated by `glm2table`)
#' @param trace Integer: If > 0, print more verbose output to console.
#' @param verbose Logical: If TRUE, print messages during run
#'
#' @author E.D. Gennatas
#' @export
#'
#' @examples
#' \dontrun{
#' # Common usage is "reversed":
#' # x: outcome of interest as first column, optional covariates
#' # in the other columns
#' # y: features whose association with x we want to study
#' set.seed(2022)
#' features <- rnormmat(500, 40)
#' outcome <- features[, 3] - features[, 5] + features[, 14] + rnorm(500)
#' massmod <- massGLM(outcome, features)
#' plot(massmod)
#' plot(massmod, what = "coef")
#' plot(massmod, what = "volcano")
#' }
#'
massGLM <- function(x, y,
                    scale.x = FALSE,
                    scale.y = FALSE,
                    type = NULL,
                    xnames = NULL,
                    ynames = NULL,
                    coerce.y.numeric = FALSE,
                    save.mods = FALSE,
                    print.plot = FALSE,
                    include_anova_pvals = NA,
                    verbose = TRUE,
                    trace = 0) {
  # Intro ----
  .call <- match.call()
  .call[2] <- list(str2lang("dat"))
  dependency_check("progressr")
  start_time <- intro(verbose = verbose)

  # Data ----
  if (is.null(type)) type <- if (NCOL(x) > NCOL(y)) "massx" else "massy"
  if (trace > 0) msg20('massGLM type is "', type, '"')
  if (type == "massx") {
    if (is.null(colnames(x))) colnames(x) <- paste0("Feature_", seq_len(NCOL(x)))
    nmods <- NCOL(x)
  } else {
    if (is.null(colnames(y))) colnames(y) <- paste0("Outcome_", seq_len(NCOL(y)))
    nmods <- NCOL(y)
  }

  if (is.null(xnames)) {
    xnames <- if (is.null(colnames(x))) {
      paste(deparse(substitute(x)), seq_len(NCOL(x)), sep = "_")
    } else {
      colnames(x)
    }
  }
  if (trace > 0) msg2("Feature names:", paste(xnames, collapse = ", "))
  if (is.null(ynames)) {
    ynames <- if (is.null(colnames(y))) deparse(substitute(y)) else colnames(y)
  }
  if (trace > 0) msg2("Outcome names:", paste(ynames, collapse = ", "))

  if (coerce.y.numeric) {
    y <- sapply(y, as.numeric)
  }
  if (scale.x) {
    x <- preprocess(x, scale = TRUE, verbose = verbose)
  }
  if (scale.y) {
    y <- preprocess(y, scale = TRUE, verbose = verbose)
  }
  dat <- data.frame(x, y)
  colnames(dat) <- c(xnames, ynames)

  # fit1: Loop function ----
  p <- progressr::progressor(along = seq_len(nmods))
  fit1 <- function(index, dat, type) {
    if (type == "massx") {
      .formula <- as.formula(paste(ynames, "~", xnames[index]))
      .family <- if (is.factor(dat[[ynames]])) "binomial" else "gaussian"
      out <- glm(.formula, family = .family, data = dat)
      p()
      out
    } else {
      .formula <- as.formula(paste(ynames[index], "~", paste(xnames, collapse = " + ")))
      .family <- if (is.factor(dat[[ynames[index]]])) "binomial" else "gaussian"
      out <- glm(.formula, family = .family, data = dat)
      p()
      out
    }
  }

  # Models ----
  if (verbose) msg2("Training", nmods, "mass-GLM models...")

  mods <- lapply(
    seq_len(nmods),
    fit1,
    dat = dat,
    type = type
  )

  names(mods) <- if (type == "massx") xnames else ynames

  # Outro ----
  out <- list(
    mods = if (save.mods) mods else NULL,
    summary = glm2table(mods, include_anova_pvals = include_anova_pvals),
    xnames = xnames,
    coefnames = names(coef(mods[[1]])),
    ynames = ynames,
    type = type,
    call = .call
  )
  class(out) <- c("massGLM", "list")
  if (print.plot) print(plot(out))
  outro(start_time, verbose = verbose)
  out
} # rtemis::massGLM


#' `print`[massGLM] object
#'
#' @method print massGLM
#' @param x [massGLM] object
#' @param ... Not used
#'
#' @author E.D. Gennatas
#' @export

print.massGLM <- function(x, ...) {
  nx <- length(x$xnames)
  ny <- length(x$ynames)
  .text <- paste(
    "Mass-univariate GLM analysis with", nx,
    ngettext(nx, "predictor", "predictors"),
    "and", ny, ngettext(ny, "outcome", "outcomes\n")
  )
  cat(.text)
  invisible(.text)
}

#' `massGLM` object summary
#'
#' @param object An object created by [massGLM]
#' @param ... Not used
#'
#' @author E.D. Gennatas
#' @export

summary.massGLM <- function(object, ...) {
  print(object$summary, row.names = FALSE, class = FALSE)
}

#' Plot `massGLM` object
#'
#' Plots a `massGLM` object using [dplot3_bar]
#'
#' @method plot massGLM
#' @param x `massGLM` object
#' @param what Character: "adjusted" or "raw" p-values to plot
#' @param xlab Character: x-axis label for volcano plot
#'
#' @author E.D. Gennatas
#' @export

plot.massGLM <- function(x,
                         predictor = NULL,
                         main = NULL,
                         what = c("volcano", "coefs", "pvals"),
                         # which_pvals = c("glm", "anova2", "anova3"),
                         p.adjust.method = "holm",
                         p.transform = \(x) -log10(x),
                         show = c("all", "signif"),
                         xnames = NULL,
                         pval.hline = c(.05, .001),
                         hline.col = "#ffffff",
                         hline.dash = "dash",
                         hline.annotate = as.character(pval.hline),
                         ylim = NULL,
                         xlab = NULL,
                         ylab = NULL,
                         group = NULL,
                         col.neg = "#43A4AC",
                         col.pos = "#FA9860",
                         col.ns = "#7f7f7f",
                         theme = rtTheme,
                         alpha = NULL,
                         volcano.annotate = TRUE,
                         volcano.annotate.n = 7,
                         volcano.hline = NULL,
                         volcano.hline.dash = "dot",
                         volcano.hline.annotate = NULL,
                         volcano.p.transform = \(x) -log10(x),
                         margin = NULL,
                         displayModeBar = TRUE,
                         trace = 0,
                         filename = NULL, ...) {
  what <- match.arg(what)
  # which_pvals <- match.arg(which_pvals)
  which_pvals <- "glm"
  show <- match.arg(show)

  if (x$type == "massy") {
    if (is.null(predictor)) {
      if (which_pvals == "glm") {
        predictor <- x$coefnames[2]
      } else {
        predictor <- x$xnames[1]
      }
    }
    what <- match.arg(what)

    # pval_idi <- switch(which_pvals,
    #     glm = which(names(x$summary) == paste("p_value", predictor)),
    #     anova2 = which(names(x$summary) == paste("p_value type II", predictor)),
    #     anova3 = which(names(x$summary) == paste("p_value type III", predictor))
    # )
    pval_idi <- which(names(x$summary) == paste("p_value", predictor))
    if (what == "pvals") {
      # p-values ----
      if (is.null(main)) main <- "p-values"
      pval_idi <- grep(paste("p_value", predictor), names(x$summary))[1]
      .name <- gsub("p_value ", "", names(x$summary)[pval_idi])
      .pvals <- p.adjust(x$summary[[pval_idi]], method = p.adjust.method)
      .coefname <- getnames(x$summary, paste("Coefficient", .name))
      .cols <- rep(col.ns, length(x$summary[[.coefname]]))
      .cols[x$summary[[.coefname]] < 0 & .pvals < .05] <- col.neg
      .cols[x$summary[[.coefname]] > 0 & .pvals < .05] <- col.pos

      if (is.null(ylab)) {
        ylab <- paste(
          # print_transform(deparse(p.transform)[2]),
          print_fn(p.transform),
          what, .name, "p-value"
        )
      }
      if (is.null(xnames)) {
        xnames <- if (x$type == "massy") x$ynames else x$xnames
      }
      dplot3_bar(
        p.transform(.pvals),
        group.names = xnames,
        main = main,
        # ylim = c(0, 1),
        ylim = ylim,
        legend = FALSE,
        ylab = ylab,
        col = .cols,
        hline = p.transform(pval.hline),
        hline.col = hline.col,
        hline.dash = hline.dash,
        hline.annotate = hline.annotate,
        theme = theme,
        margin = margin,
        displayModeBar = displayModeBar,
        filename = filename, ...
      )
    } else if (what == "coefs") {
      # Coefficients ----
      if (is.null(main)) main <- "Coefficients"
      coef_idi <- which(names(x$summary) == paste("Coefficient", predictor))
      .name <- gsub("Coefficient ", "", names(x$summary)[coef_idi])
      .pvals <- p.adjust(x$summary[[pval_idi]], method = p.adjust.method)
      .coefname <- getnames(x$summary, paste("Coefficient", .name))
      .cols <- rep(col.ns, length(x$summary[[.coefname]]))
      .cols[x$summary[[.coefname]] < 0 & .pvals < .05] <- col.neg
      .cols[x$summary[[.coefname]] > 0 & .pvals < .05] <- col.pos
      if (is.null(xnames)) xnames <- x$ynames
      dplot3_bar(
        x$summary[[coef_idi]],
        # group.names = if (x$type == "massy") x$ynames else x$xnames,
        group.names = xnames,
        main = main,
        legend = FALSE,
        ylab = paste(.name, "Coefficients"),
        col = .cols,
        theme = theme,
        margin = margin,
        displayModeBar = displayModeBar,
        filename = filename, ...
      )
    } else {
      # Volcano ----
      if (is.null(xlab)) xlab <- paste(predictor, "Coefficient")
      coef_idi <- which(names(x$summary) == paste("Coefficient", predictor))
      if (is.null(xnames)) xnames <- x$ynames
      dplot3_volcano(
        x = x$summary[[coef_idi]],
        pvals = x$summary[[pval_idi]],
        group = group,
        x.thresh = 0,
        label.lo = "Neg",
        label.hi = "Pos",
        xnames = xnames,
        xlab = xlab,
        main = main,
        p.adjust.method = p.adjust.method,
        p.transform = volcano.p.transform,
        annotate = volcano.annotate,
        annotate.n = volcano.annotate.n,
        hline = volcano.hline,
        hline.annotate = volcano.hline.annotate,
        hline.dash = volcano.hline.dash,
        theme = theme,
        alpha = alpha,
        displayModeBar = displayModeBar,
        verbose = trace > 0,
        filename = filename, ...
      )
    }
  } else {
    cat('"massx" support not yet implemented')
  }
} # rtemis::plot.massGLM

# print_transform <- function(x) gsub("[x()]", "", x)
print_fn <- function(x) {
  fn <- deparse(x)[2]
  fn <- gsub("\\(.*", "", fn)
  if (fn == "I") fn <- ""
  fn
}