rtemis: Machine Learning and Visualization

Documented in massGAM

# massGAM.R
# ::rtemis::
# 2015 E.D. Gennatas www.lambdamd.org

#' Mass-univariate GAM Analysis
#'
#' Fits a GAM for each of multiple outcomes using a fixed set of features (many y's, one X).
# To fit GAMs for a fixed outcome using multiple different features, see \link{multiGAM}
#'
#' NA in the input will be kept as NA in the results, maintaining n of cases.
#'
#' @inheritParams s_GAM.default
#' @param y Numeric matrix / data frame: Outcomes
#' @param covariates Numeric matrix / data.frame of additional covariates
#' @param x.name Character: Name of the predictor
#' @param y.name Character, vector: Names of the outcomes
#' @param k Integer: Basis dimension for smoothing spline
#' @param family `family` argument for `mgcv::gam`
#' @param weights Vector, numeric: Weights for GAM
#' @param method Estimation method for GAM
#' @param n.cores Integer. Number of cores to use
#' @param save.mods Logical. Should models be saved
#' @param save.summary Logical. Should model summary be saved
#' @param print.plots Logical Should plots be shown
#' @param outdir Path to save output
#  @param labeledNifti String. Path to labeled nifti file.
#' @param save.plots Logical. Should plots be saved
#' @param new.x.breaks Integer. Number of splits in the range of x
#'   to form vector of features for estimation of fitted values
#'
#' @author E.D. Gennatas
#' @export

massGAM <- function(x, y,
                    covariates = NULL,
                    x.name = NULL, y.name = NULL,
                    k = NULL,
                    family = gaussian(),
                    weights = NULL,
                    method = "REML",
                    n.cores = rtCores,
                    save.mods = FALSE,
                    save.summary = TRUE,
                    print.plots = FALSE,
                    outdir = NULL,
                    # labeledNifti = NULL,
                    save.plots = FALSE,
                    new.x.breaks = 9) {
  # Dependencies ----
  dependency_check("mgcv")

  # Arguments ----
  if (missing(x)) {
    print(args(massGAM))
    stop("x is missing")
  }
  if (missing(y) && NCOL(x) == 1) {
    print(args(massGAM))
    stop("y is missing")
  }
  if (is.null(n.cores)) n.cores <- future::availableCores()
  if (!is.null(covariates)) {
    if (!is.list(covariates)) covariates <- as.list(as.data.frame(covariates))
  }

  # Intro ----
  ptm <- proc.time()
  scriptVersion <- 0.2
  start.date <- date()
  cat(start.date, "\n::: massGAM version ", scriptVersion, sep = "")

  # Data ----
  # Name cols first, because data.frame() or as.data.frame()
  # will assign colnames X1 ... Xn and V1 ... Vn respectively
  if (is.null(colnames(y))) colnames(y) <- paste0("Outcome.", seq_len(NCOL(y)))
  y <- data.frame(y)

  x <- data.frame(x)
  if (!is.null(x.name) && NCOL(x) == length(x.name)) {
    colnames(x) <- x.name
  } else {
    colnames(x) <- paste0("Predictor.", seq_len(NCOL(x)))
  }

  ### Predictors
  if (!is.null(k)) {
    features <- paste0("s(", colnames(x), ", k = ", k, ")", collapse = " + ")
  } else {
    warning("k was not specified: I recommend you check results and rerun with specified k if necessary.")
    features <- paste0("s(", colnames(x), ")", collapse = " + ")
  }

  ### Covariates: ordered factors -- currently works with one main predictor
  if (!is.null(covariates)) {
    covs <- covariates
    if (is.null(names(covs))) names(covs) <- paste0("Covariate.", seq_along(covs))
    covariates <- ""
    for (j in seq_along(covs)) {
      if (is.null(k)) {
        covariates <- paste(covariates, paste0(
          names(covs)[j], " + s(", colnames(x)[1],
          ", by = ", names(covs)[j], ")"
        ), sep = " + ")
      } else {
        covariates <- paste(covariates, paste0(
          names(covs)[j], " + s(", colnames(x)[1],
          ", by = ", names(covs)[j], ", k = ", k, ")"
        ), sep = " + ")
      }
    }
  }

  ### Data frame
  if (is.null(covariates)) df.train <- data.frame(y, x) else df.train <- data.frame(y, x, covs)

  ### Formulae
  formulae <- lapply(
    seq_len(NCOL(y)),
    function(index) as.formula(paste0(colnames(y)[index], " ~ ", features, covariates))
  )

  ### Output
  s.out <- list()

  # GAM ----
  # use na.exclude so that NAs give NAs when resid() is used. $residuals will still be shorter, don't use
  cat(
    ">>> I will regress", NCOL(y), "outcomes on the following features:\n",
    paste0(features, covariates), "\n"
  )
  cat(">>> Running", length(formulae), "GAMs on", n.cores, "cores...\n")
  mod.gam <- pbapply::pblapply(formulae, function(f) {
    mgcv::gam(
      formula = f,
      family = family,
      data = df.train,
      weights = weights,
      na.action = na.exclude,
      method = method
    )
  },
  cl = n.cores
  )
  names(mod.gam) <- names(y)

  cat("GAM fitting complete.\n")

  if (save.mods) s.out$mod.gam <- mod.gam

  # Output ----
  # summary
  cat(">>> Getting GAM summary and statistics...\n")
  mod.summary <- lapply(mod.gam, summary)
  if (save.summary) s.out$summary <- mod.summary
  # R squared
  s.out$r.sq <- data.frame(matrix(sapply(mod.summary, function(mod) c(mod$r.sq)), nrow = 1))
  colnames(s.out$r.sq) <- colnames(y)
  # P values - parameters
  s.out$p.table <- lapply(mod.summary, function(mod) {
    return(mod$p.table)
  })
  # P values - smooths
  s.out$s.table <- lapply(mod.summary, function(mod) {
    return(mod$s.table)
  })
  # P values - first smooth
  s.out$s.pv <- data.frame(matrix(sapply(mod.summary, function(mod) {
    return(mod$s.pv[1])
  }), nrow = 1))
  colnames(s.out$s.pv) <- colnames(y)
  # Holm-corrected P values of first smooth
  s.out$s.pv.holm <- p.adjust(s.out$s.pv, method = "holm")
  # FDR-corrected P values of first smooth
  s.out$s.pv.fdr <- p.adjust(s.out$s.pv, method = "fdr")
  # Residuals - Note: $residuals will omit NAs; use resid()
  # s.out$residuals <- data.frame(sapply(mod.gam, function(mod) return(mod$residuals)))
  s.out$residuals <- data.frame(sapply(mod.gam, resid))
  colnames(s.out$residuals) <- colnames(y)
  # Predict
  mod.pred <- lapply(mod.gam, function(mod) predict(mod, se.fit = TRUE))
  # Fitted values
  s.out$fitted <- data.frame(sapply(mod.pred, function(mod) {
    return(mod$fit)
  }))
  colnames(s.out$fitted) <- colnames(y)
  # Standard error
  s.out$se.pred <- data.frame(sapply(mod.pred, function(mod) {
    return(mod$se.fit)
  }))
  colnames(s.out$se.pred) <- colnames(y)

  # RDS ----
  if (!is.null(outdir)) {
    outdir <- paste0(normalizePath(outdir, mustWork = FALSE), "/")
    dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
    outfile <- paste0(outdir, "massGAM.rds")
    saveRDS(s.out, outfile)
    # if (!is.null(labeledNifti)) {
    #   prprcss::labels2nii(s.out$s.pv.fdr, labeledNifti, paste0(outdir, "s.pv.fdr"))
    #   Rsq.fdr <- as.numeric(s.out$r.sq)
    #   Rsq.fdr[s.out$s.pv.fdr >= .05] <- 0
    #   prprcss::labels2nii(Rsq.fdr, labeledNifti, paste0(outdir, "r.sq.fdr"))
    # }
  }

  # Outro ----
  cat(date(), "::: massGAM version", scriptVersion, "completed.\n")
  elapsed <- proc.time() - ptm
  cat("Elapsed time:\n")
  print(elapsed)
  s.out
} # rtemis::massGAM