R/lmmSeq.R
In glmmSeq: General Linear Mixed Models for Gene-Level Differential Expression

Documented in lmmSeq

#' Linear mixed models for data matrix
#'
#' Fits many linear mixed effects models for analysis of gaussian data with
#' random effects, with parallelisation and optimisation for speed. It is
#' suitable for longitudinal analysis of high dimensional data. Wald type 2
#' Chi-squared test is used to calculate p-values.
#'
#' @param modelFormula the model formula. This must be of the form `"~ ..."`
#'   where the structure is assumed to be `"gene ~ ..."`. The formula must
#'   include a random effects term. See formula structure for random effects in
#'   \code{\link[lme4:lmer]{lme4::lmer()}}
#' @param maindata data matrix with genes in rows and samples in columns
#' @param metadata a dataframe of sample information with variables in columns
#'   and samples in rows
#' @param id Optional. Used to specify the column in metadata which contains the
#'   sample IDs to be used in repeated samples for random effects. If not
#'   specified, the function defaults to using the variable after the "|" in the
#'   random effects term in the formula.
#' @param offset Vector containing model offsets (default = NULL). If provided
#'   the `lmer()` offset is set to `offset`. See
#'   \code{\link[lme4:lmer]{lme4::lmer()}}
#' @param test.stat Character value specifying test statistic. Current options
#'   are "Wald" for type 2 Wald Chi square test using code derived and modified
#'   from [car::Anova] to improve speed for matrix tests. Or "F" for conditional
#'   F tests using Saiterthwaite's method of approximated Df. This uses
#'   [lmerTest::lmer] and is somewhat slower.
#' @param reduced Optional reduced model formula. If this is chosen, a
#'   likelihood ratio test is used to calculate p-values instead of the default
#'   Wald type 2 Chi-squared test.
#' @param modelData Optional dataframe. Default is generated by call to
#'   `expand.grid` using levels of variables in the formula. Used to calculate
#'   model predictions (estimated means & 95% CI) for plotting via [modelPlot].
#'   It can therefore be used to add/remove points in [modelPlot].
#' @param designMatrix Optional custom design matrix generated by call to
#'   `model.matrix` using `modelData` and `FEformula`. Used to calculate
#'   model predictions for plotting.
#' @param control the `lmer` optimizer control (default = `lmerControl()`). See
#'   \code{\link[lme4:lmerControl]{lme4::lmerControl()}}.
#' @param cores number of cores to use for parallelisation. Default = 1. 
#' @param removeSingles whether to remove individuals with no repeated measures
#'   (default = FALSE)
#' @param verbose Logical whether to display messaging (default = TRUE)
#' @param returnList Logical whether to return results as a list or lmmSeq 
#' object (default = FALSE). Helpful for debugging.
#' @param progress Logical whether to display a progress bar
#' @param ... Other parameters passed to
#'   \code{\link[lmerTest:lmer]{lmerTest::lmer()}}. Only available if 
#'   `test.stat = "F"`.
#' @return Returns an S4 class `lmmSeq` object with results for gene-wise linear
#'   mixed models; or a list of results if `returnList` is `TRUE`, which is
#'   useful for debugging. If all genes return errors from `lmer`, then an error
#'   message is shown and a character vector containing error messages for all
#'   genes is returned.
#'   
#' @details
#' By default, p-values for each model term are computed using Wald type 2
#' Chi-squared test as per [car::Anova()]. The underlying code for this has been
#' optimised for speed. However, if a reduced model formula is specified by
#' setting `reduced`, then a likelihood ratio test (LRT) is performed instead
#' using \code{\link[lme4:anova.merMod]{anova}}. This will double computation
#' time since two LMM have to be fitted for each gene. For LRT, models being
#' compared are optimised by maximum likelihood and not REML (`REML=FALSE`).
#' 
#' Two key methods are used to speed up computation above and beyond simple
#' parallelisation. The first is to speed up [lme4::lmer()] by calling
#' [lme4::lFormula()] once at the start and then updating the `lFormula` output
#' with new data. The 2nd speed up is through optimised code for repeated type 2
#' Wald Chi-squared tests (original code was derived from [car::Anova]). For
#' example, elements such as the hypothesis matrices are generated only once to
#' reduce unnecessarily repetitive computation, and the generation of p-values
#' from Chi-squared values is vectorised and performed at the end. F-tests using
#' the `lmerTest` package have not been optimised and are therefore slower.
#' 
#' Parallelisation is performed using [parallel::mclapply] on unix/mac and
#' [parallel::parLapply] on windows. Progress bars use [pbmcapply::pbmclapply]
#' on unix/mac and [pbapply::pblapply] on windows.
#' 
#' The `id` argument is usually optional. By default the `id` column in the
#' metadata is determined as the term after the bar in the random effects term
#' of the model. Note that `id` is not passed to `lmer`. It is only really used
#' to remove singletons from the `maindata` matrix and `metadata` dataframe. The
#' `id` is also stored in the output from `lmmSeq` and used by plotting function
#' [modelPlot()]. However, due to its flexible nature, in theory `lmmSeq` should
#' allow for more than one random effect term, although this has not been tested
#' formally. In this case, it is probably prudent to specify a value for `id`.
#' 
#' @importFrom lme4 subbars findbars fixef lmerControl nobars isSingular
#'   lFormula
#' @importFrom lmerTest lmer
#' @importFrom parallel mclapply detectCores parLapply makeCluster clusterEvalQ
#'   clusterExport stopCluster
#' @importFrom pbmcapply pbmclapply
#' @importFrom pbapply pblapply
#' @importFrom methods slot new
#' @importFrom stats AIC complete.cases logLik reshape terms vcov pchisq
#'   update.formula model.matrix predict setNames anova coef
#' @export
#' @examples
#' data(PEAC_minimal_load)
#' logtpm <- log2(tpm +1)
#' lmmtest <- lmmSeq(~ Timepoint * EULAR_6m + (1 | PATID),
#'                      maindata = logtpm[1:2, ],
#'                      metadata = metadata,
#'                      verbose = FALSE)
#' names(attributes(lmmtest))


lmmSeq <- function(modelFormula,
                   maindata,
                   metadata,
                   id = NULL,
                   offset = NULL,
                   test.stat = c("Wald", "F", "LRT"),
                   reduced = NULL,
                   modelData = NULL,
                   designMatrix = NULL,
                   control = lmerControl(),
                   cores = 1,
                   removeSingles = FALSE,
                   verbose = TRUE,
                   returnList = FALSE, 
                   progress = FALSE,
                   ...) {
  lmmcall <- match.call(expand.dots = TRUE)
  test.stat <- match.arg(test.stat)
  maindata <- as.matrix(maindata)
  # Catch errors
  if (length(findbars(modelFormula)) == 0) {
    stop("No random effects terms specified in formula")
  }
  if (ncol(maindata) != nrow(metadata)) {
    stop("maindata columns different size to metadata rows")
  }
  if (!is.null(offset) & ncol(maindata) != length(offset)) {
    stop("Different offset length")
  }
  
  # Manipulate formulae
  fullFormula <- update.formula(modelFormula, gene ~ ., simplify = FALSE)
  subFormula <- subbars(modelFormula)
  variables <- rownames(attr(terms(subFormula), "factors"))
  subsetMetadata <- metadata[, variables]
  if (is.null(id)) {
    fb <- findbars(modelFormula)
    id <- sub(".*[|]", "", fb)
    id <- gsub(" ", "", id)
  }
  ids <- as.character(metadata[, id])
  
  # Option to subset to remove unpaired samples
  if (removeSingles) {
    nonSingle <- names(table(ids))[table(ids) > 1]
    pairedIndex <- ids %in% nonSingle
    maindata <- maindata[, pairedIndex]
    subsetMetadata <- subsetMetadata[pairedIndex, ]
    ids <- ids[pairedIndex]
    offset <- offset[pairedIndex]
  }
  
  if (verbose) cat(paste0("\nn = ", length(ids), " samples, ",
                          length(unique(ids)), " individuals\n"))
  
  # setup model prediction
  FEformula <- nobars(modelFormula)
  if (is.null(modelData)) {
    reducedVars <- rownames(attr(terms(FEformula), "factors"))
    varLevels <- lapply(reducedVars, function(x) {
      if (is.factor(metadata[, x])) {
        return(levels(subsetMetadata[, x]))
      } else {sort(unique(subsetMetadata[, x]))}
    })
    modelData <- expand.grid(varLevels)
    colnames(modelData) <- reducedVars
  }
  
  if (is.null(designMatrix)){
    designMatrix <- model.matrix(FEformula, modelData)
  }
  
  start <- Sys.time()
  fullList <- lapply(rownames(maindata), function(i) maindata[i, ])
  
  if (!is.null(reduced)) {
    # LRT
    if (length(findbars(reduced)) == 0) {
      stop("No random effects terms specified in reduced formula")
    }
    subReduced <- subbars(reduced)
    redvars <- rownames(attr(terms(subReduced), "factors"))
    if (any(!redvars %in% variables)) {
      stop("Extra terms in reduced formula not found full formula")
    }
    reduced <- update.formula(reduced, gene ~ ., simplify = FALSE)
    test.stat <- "LRT"
  }
  
  if (test.stat == "Wald") {
    # Adapted from lme4::modular / lme4::lmer
    subsetMetadata$gene <- maindata[1, ]
    lmod <- lFormula(fullFormula, subsetMetadata,
                     offset = offset, control = control, ...)
    
    hyp.matrix <- hyp_matrix(fullFormula, metadata, "gene")
    
    # For each gene perform a fit
    # lmerFast
    if (Sys.info()["sysname"] == "Windows" & cores > 1) {
      cl <- makeCluster(cores)
      on.exit(stopCluster(cl))
      clusterExport(cl, varlist = c("lmerFast",
                                    "lmod", "control", "modelData",
                                    "designMatrix",
                                    "hyp.matrix"),
                    envir = environment())
      if (progress) {
        resultList <- pblapply(fullList, function(geneRow) {
          lmerFast(geneRow, lmod, control,
                   modelData, designMatrix, hyp.matrix)
        }, cl = cl)
      } else {
        resultList <- parLapply(cl = cl, fullList, function(geneRow) {
          lmerFast(geneRow, lmod, control,
                   modelData, designMatrix, hyp.matrix)
        })
      }
    } else{
      if (progress) {
        resultList <- pbmclapply(fullList, function(geneRow) {
          lmerFast(geneRow, lmod, control,
                   modelData, designMatrix, hyp.matrix)
        }, mc.cores = cores)
        if ("value" %in% names(resultList)) resultList <- resultList$value
      } else {
        resultList <- mclapply(fullList, function(geneRow) {
          lmerFast(geneRow, lmod, control,
                   modelData, designMatrix, hyp.matrix)
        }, mc.cores = cores)
      }
    }
    
  } else {
    # lmerTest or LRT
    if (Sys.info()["sysname"] == "Windows" & cores > 1) {
      cl <- makeCluster(cores)
      on.exit(stopCluster(cl))
      dots <- list(...)
      varlist <- c("lmerTestCore", "fullList", "fullFormula", "reduced",
                   "subsetMetadata", "control", "modelData", "offset",
                   "designMatrix", "dots")
      clusterExport(cl, varlist = varlist, envir = environment())
      if (progress) {
        resultList <- pblapply(fullList, function(geneRow) {
          args <- c(list(geneRow = geneRow, fullFormula = fullFormula,
                         reduced = reduced,
                         data = subsetMetadata, control = control,
                         modelData = modelData, offset = offset,
                         designMatrix = designMatrix), dots)
          do.call(lmerTestCore, args)
        }, cl = cl)
      } else {
        resultList <- parLapply(cl = cl, fullList, function(geneRow) {
          args <- c(list(geneRow = geneRow, fullFormula = fullFormula,
                         reduced = reduced,
                         data = subsetMetadata, control = control,
                         modelData = modelData, offset = offset,
                         designMatrix = designMatrix), dots)
          do.call(lmerTestCore, args)
        })
      }
      
    } else{
      if (progress) {
        resultList <- pbmclapply(fullList, function(geneRow) {
          lmerTestCore(geneRow, fullFormula = fullFormula, reduced = reduced, 
                       data = subsetMetadata, control = control,
                       modelData = modelData, offset = offset,
                       designMatrix = designMatrix, ...)
        }, mc.cores = cores)
        if ("value" %in% names(resultList)) resultList <- resultList$value
      } else {
        resultList <- mclapply(fullList, function(geneRow) {
          lmerTestCore(geneRow, fullFormula = fullFormula, reduced = reduced, 
                       data = subsetMetadata, control = control,
                       modelData = modelData, offset = offset,
                       designMatrix = designMatrix, ...)
        }, mc.cores = cores)
      }
    }
  }
  
  if(returnList) return(resultList)
  
  # Output
  names(resultList) <- rownames(maindata)
  noErr <- vapply(resultList, function(x) x$tryErrors == "", FUN.VALUE = TRUE)
  if (sum(noErr) == 0) { 
    message("All genes returned an error. Check call. Check sufficient data in each group")
    outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
                           FUN.VALUE = c("test"))
    print(outputErrors[1])
    return(outputErrors)
  }
  # Print timing if verbose
  end <- Sys.time()
  if (verbose) print(end - start)
  
  predList <- lapply(resultList[noErr], "[[", "predict")
  outputPredict <- do.call(rbind, predList)
  
  outLabels <- apply(modelData, 1, function(x) paste(x, collapse = "_"))
  colnames(outputPredict) <- c(paste0("y_", outLabels),
                               paste0("LCI_", outLabels),
                               paste0("UCI_", outLabels))
  
  if (sum(!noErr) != 0) {
    if (verbose) cat(paste0("Errors in ", sum(!noErr), " gene(s): ",
                            paste0(names(noErr)[! noErr], collapse = ", ")))
    outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
                           FUN.VALUE = character(1))
  } else {outputErrors <- c("No errors")}
  
  optInfo <- t(vapply(resultList[noErr], function(x) {
    setNames(x$optinfo, c("Singular", "Conv"))
  }, FUN.VALUE = c(1, 1)))
  
  s <- organiseStats(resultList[noErr], test.stat)
  meanExp <- rowMeans(maindata[noErr, , drop = FALSE])
  s$res <- cbind(s$res, meanExp)
  
  # Create lmmSeq object with results
  new("lmmSeq",
      info = list(call = lmmcall,
                  offset = offset,
                  designMatrix = designMatrix,
                  control = substitute(control),
                  test.stat = test.stat),
      formula = fullFormula,
      stats = s,
      predict = outputPredict,
      reduced = reduced,
      maindata = maindata,
      metadata = subsetMetadata,
      modelData = modelData,
      optInfo = optInfo,
      errors = outputErrors,
      vars = list(id = id,
                  removeSingles = removeSingles)
  )
}


## see lme4::modular
#' @importFrom lme4 mkLmerDevfun optimizeLmer checkConv mkMerMod
#' 
lmerFast <- function(geneRow,
                     lmod,
                     control,
                     modelData,
                     designMatrix,
                     hyp.matrix) {
  lmod$fr$gene <- geneRow
  devfun <- do.call(mkLmerDevfun, c(lmod, list(control=control)))
  opt <- optimizeLmer(devfun,
                      optimizer = control$optimizer,
                      restart_edge = control$restart_edge,
                      boundary.tol = control$boundary.tol,
                      control = control$optCtrl,
                      calc.derivs=control$calc.derivs,
                      use.last.params=control$use.last.params)
  cc <- try(suppressMessages(suppressWarnings(
    checkConv(attr(opt,"derivs"), opt$par,
              ctrl = control$checkConv,
              lbound = environment(devfun)$lower)
  )), silent = TRUE)
  fit <- mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr,
                  lme4conv=cc)
  
  if (!inherits(fit, "try-error")) {
    # intercept dropped genes
    if (length(attr(fit@pp$X, "msgRankdrop")) > 0)  {
      return( list(stats = NA, predict = NA, optinfo = NA,
                   tryErrors = attr(fit@pp$X, "msgRankdrop")) )
    }
    stats <- setNames(c(AIC(fit), as.numeric(logLik(fit))),
                      c("AIC", "logLik"))
    fixedEffects <- lme4::fixef(fit)
    stdErr <- coef(summary(fit))[, 2]
    vcov. <- suppressWarnings(vcov(fit, complete = FALSE))
    vcov. <- as.matrix(vcov.)
    waldtest <- lmer_wald(fixedEffects, hyp.matrix, vcov.)
    
    newY <- predict(fit, newdata = modelData, re.form = NA)
    a <- designMatrix %*% vcov.
    b <- as.matrix(a %*% t(designMatrix))
    predVar <- diag(b)
    newSE <- sqrt(predVar)
    newLCI <- newY - newSE * 1.96
    newUCI <- newY + newSE * 1.96
    predictdf <- c(newY, newLCI, newUCI)
    singular <- as.numeric(lme4::isSingular(fit))
    conv <- length(slot(fit, "optinfo")$conv$lme4$messages)
    ret <- list(stats = stats,
                coef = fixedEffects,
                stdErr = stdErr,
                chisq = waldtest$chisq,
                df = waldtest$df,
                predict = predictdf,
                optinfo = c(singular, conv),
                tryErrors = "")
    return(ret)
  } else {
    return(list(stats = NA, coef = NA, stdErr = NA, chisq = NA, df = NA, 
                predict = NA, optinfo = NA, tryErrors = fit[1]))
  }
}


lmerTestCore <- function(geneRow,
                         fullFormula,
                         reduced,
                         data,
                         control,
                         modelData,
                         designMatrix,
                         offset,
                         ...) {
  data[, "gene"] <- geneRow
  dots <- list(...)
  REML <- if (is.null(dots$REML)) TRUE else dots$REML
  if (!is.null(reduced)) REML <- FALSE
  fit <- try(suppressMessages(suppressWarnings(
    lmerTest::lmer(fullFormula, data = data, control = control, offset = offset,
                   REML = REML, ...))),
    silent = TRUE)
  if (!inherits(fit, "try-error")) {
    # intercept dropped genes
    if (length(attr(fit@pp$X, "msgRankdrop")) > 0)  {
      return( list(stats = NA, predict = NA, optinfo = NA,
                   tryErrors = attr(fit@pp$X, "msgRankdrop")) )
    }
    stats <- setNames(c(AIC(fit), as.numeric(logLik(fit))),
                      c("AIC", "logLik"))
    fixedEffects <- lme4::fixef(fit)
    stdErr <- coef(summary(fit))[, 2]
    vcov. <- suppressWarnings(vcov(fit, complete = FALSE))
    vcov. <- as.matrix(vcov.)
    
    if (is.null(reduced)) {
      # F test
      test <- as.matrix(anova(fit)[, -c(1,2)])
    } else {
      # LRT
      fit2 <- try(suppressMessages(suppressWarnings(
        lmerTest::lmer(reduced, data = data, control = control, offset = offset,
                       REML = FALSE, ...))),
        silent = TRUE)
      if (!inherits(fit2, "try-error")) {
        lrt <- suppressMessages(anova(fit, fit2))
        test <- unlist(lrt[2, c("Chisq", "Df", "Pr(>Chisq)")])
      } else {
        test <- setNames(c(NA, NA, NA), c("Chisq", "Df", "Pr(>Chisq)"))
      }
    }
    newY <- predict(fit, newdata = modelData, re.form = NA)
    a <- designMatrix %*% vcov.
    b <- as.matrix(a %*% t(designMatrix))
    predVar <- diag(b)
    newSE <- sqrt(predVar)
    newLCI <- newY - newSE * 1.96
    newUCI <- newY + newSE * 1.96
    predictdf <- c(newY, newLCI, newUCI)
    singular <- as.numeric(lme4::isSingular(fit))
    conv <- length(slot(fit, "optinfo")$conv$lme4$messages)
    rm(fit, data)
    ret <- list(stats = stats,
                coef = fixedEffects,
                stdErr = stdErr,
                test = test,
                predict = predictdf,
                optinfo = c(singular, conv),
                tryErrors = "")
    return(ret)
  } else {
    return(list(stats = NA, coef = NA, stdErr = NA, test = NA,
                predict = NA, optinfo = NA, tryErrors = fit[1]))
  }
}


organiseStats <- function(resultList, test.stat) {
  statsList <- lapply(resultList, "[[", "stats")
  s <- do.call(rbind, statsList)
  coefList <- lapply(resultList, "[[", "coef")
  cf <- do.call(rbind, coefList)
  SEList <- lapply(resultList, "[[", "stdErr")
  stdErr <- do.call(rbind, SEList)
  if (test.stat == "Wald") {
    chisqList <- lapply(resultList, "[[", "chisq")
    chisq <- do.call(rbind, chisqList)
    dfList <- lapply(resultList, "[[", "df")
    df <- do.call(rbind, dfList)
    pvals <- pchisq(chisq, df=df, lower.tail = FALSE)
    colnames(df) <- colnames(chisq)
    colnames(pvals) <- colnames(chisq)
    s <- list(res = s, coef = cf, stdErr = stdErr, Chisq = chisq, Df = df,
              pvals = pvals)
  } else if (test.stat == "F") {
    NumDF <- lapply(resultList, function(x) x$test[,1])
    NumDF <- do.call(rbind, NumDF)
    DenDF <- lapply(resultList, function(x) x$test[,2])
    DenDF <- do.call(rbind, DenDF)
    Fval <- lapply(resultList, function(x) x$test[,3])
    Fval <- do.call(rbind, Fval)
    pvals <- lapply(resultList, function(x) x$test[,4])
    pvals <- do.call(rbind, pvals)
    if (ncol(NumDF) == 1) {
      colnames(NumDF) <- colnames(DenDF) <- colnames(Fval) <- 
        colnames(pvals) <- rownames(resultList[[1]]$test)
    }
    s <- list(res = s, coef = cf, stdErr = stdErr, NumDF = NumDF, DenDF = DenDF, 
              Fval = Fval, pvals = pvals)
  } else {
    # LRT
    LRT <- lapply(resultList, "[[", "test")
    LRT <- do.call(rbind, LRT)
    chisq <- LRT[,1, drop = FALSE]
    df <- LRT[,2, drop = FALSE]
    pvals <- LRT[,3, drop = FALSE]
    colnames(chisq) <- colnames(df) <- colnames(pvals) <- "LRT"
    s <- list(res = s, coef = cf, stdErr = stdErr, Chisq = chisq, Df = df,
              pvals = pvals)
  }
}