Nothing
#' Linear mixed models for data matrix
#'
#' Fits many linear mixed effects models for analysis of gaussian data with
#' random effects, with parallelisation and optimisation for speed. It is
#' suitable for longitudinal analysis of high dimensional data. Wald type 2
#' Chi-squared test is used to calculate p-values.
#'
#' @param modelFormula the model formula. This must be of the form `"~ ..."`
#' where the structure is assumed to be `"gene ~ ..."`. The formula must
#' include a random effects term. See formula structure for random effects in
#' \code{\link[lme4:lmer]{lme4::lmer()}}
#' @param maindata data matrix with genes in rows and samples in columns
#' @param metadata a dataframe of sample information with variables in columns
#' and samples in rows
#' @param id Optional. Used to specify the column in metadata which contains the
#' sample IDs to be used in repeated samples for random effects. If not
#' specified, the function defaults to using the variable after the "|" in the
#' random effects term in the formula.
#' @param offset Vector containing model offsets (default = NULL). If provided
#' the `lmer()` offset is set to `offset`. See
#' \code{\link[lme4:lmer]{lme4::lmer()}}
#' @param test.stat Character value specifying test statistic. Current options
#' are "Wald" for type 2 Wald Chi square test using code derived and modified
#' from [car::Anova] to improve speed for matrix tests. Or "F" for conditional
#' F tests using Saiterthwaite's method of approximated Df. This uses
#' [lmerTest::lmer] and is somewhat slower.
#' @param reduced Optional reduced model formula. If this is chosen, a
#' likelihood ratio test is used to calculate p-values instead of the default
#' Wald type 2 Chi-squared test.
#' @param modelData Optional dataframe. Default is generated by call to
#' `expand.grid` using levels of variables in the formula. Used to calculate
#' model predictions (estimated means & 95% CI) for plotting via [modelPlot].
#' It can therefore be used to add/remove points in [modelPlot].
#' @param designMatrix Optional custom design matrix generated by call to
#' `model.matrix` using `modelData` and `FEformula`. Used to calculate
#' model predictions for plotting.
#' @param control the `lmer` optimizer control (default = `lmerControl()`). See
#' \code{\link[lme4:lmerControl]{lme4::lmerControl()}}.
#' @param cores number of cores to use for parallelisation. Default = 1.
#' @param removeSingles whether to remove individuals with no repeated measures
#' (default = FALSE)
#' @param verbose Logical whether to display messaging (default = TRUE)
#' @param returnList Logical whether to return results as a list or lmmSeq
#' object (default = FALSE). Helpful for debugging.
#' @param progress Logical whether to display a progress bar
#' @param ... Other parameters passed to
#' \code{\link[lmerTest:lmer]{lmerTest::lmer()}}. Only available if
#' `test.stat = "F"`.
#' @return Returns an S4 class `lmmSeq` object with results for gene-wise linear
#' mixed models; or a list of results if `returnList` is `TRUE`, which is
#' useful for debugging. If all genes return errors from `lmer`, then an error
#' message is shown and a character vector containing error messages for all
#' genes is returned.
#'
#' @details
#' By default, p-values for each model term are computed using Wald type 2
#' Chi-squared test as per [car::Anova()]. The underlying code for this has been
#' optimised for speed. However, if a reduced model formula is specified by
#' setting `reduced`, then a likelihood ratio test (LRT) is performed instead
#' using \code{\link[lme4:anova.merMod]{anova}}. This will double computation
#' time since two LMM have to be fitted for each gene. For LRT, models being
#' compared are optimised by maximum likelihood and not REML (`REML=FALSE`).
#'
#' Two key methods are used to speed up computation above and beyond simple
#' parallelisation. The first is to speed up [lme4::lmer()] by calling
#' [lme4::lFormula()] once at the start and then updating the `lFormula` output
#' with new data. The 2nd speed up is through optimised code for repeated type 2
#' Wald Chi-squared tests (original code was derived from [car::Anova]). For
#' example, elements such as the hypothesis matrices are generated only once to
#' reduce unnecessarily repetitive computation, and the generation of p-values
#' from Chi-squared values is vectorised and performed at the end. F-tests using
#' the `lmerTest` package have not been optimised and are therefore slower.
#'
#' Parallelisation is performed using [parallel::mclapply] on unix/mac and
#' [parallel::parLapply] on windows. Progress bars use [pbmcapply::pbmclapply]
#' on unix/mac and [pbapply::pblapply] on windows.
#'
#' The `id` argument is usually optional. By default the `id` column in the
#' metadata is determined as the term after the bar in the random effects term
#' of the model. Note that `id` is not passed to `lmer`. It is only really used
#' to remove singletons from the `maindata` matrix and `metadata` dataframe. The
#' `id` is also stored in the output from `lmmSeq` and used by plotting function
#' [modelPlot()]. However, due to its flexible nature, in theory `lmmSeq` should
#' allow for more than one random effect term, although this has not been tested
#' formally. In this case, it is probably prudent to specify a value for `id`.
#'
#' @importFrom lme4 subbars findbars fixef lmerControl nobars isSingular
#' lFormula
#' @importFrom lmerTest lmer
#' @importFrom parallel mclapply detectCores parLapply makeCluster clusterEvalQ
#' clusterExport stopCluster
#' @importFrom pbmcapply pbmclapply
#' @importFrom pbapply pblapply
#' @importFrom methods slot new
#' @importFrom stats AIC complete.cases logLik reshape terms vcov pchisq
#' update.formula model.matrix predict setNames anova coef
#' @export
#' @examples
#' data(PEAC_minimal_load)
#' logtpm <- log2(tpm +1)
#' lmmtest <- lmmSeq(~ Timepoint * EULAR_6m + (1 | PATID),
#' maindata = logtpm[1:2, ],
#' metadata = metadata,
#' verbose = FALSE)
#' names(attributes(lmmtest))
lmmSeq <- function(modelFormula,
maindata,
metadata,
id = NULL,
offset = NULL,
test.stat = c("Wald", "F", "LRT"),
reduced = NULL,
modelData = NULL,
designMatrix = NULL,
control = lmerControl(),
cores = 1,
removeSingles = FALSE,
verbose = TRUE,
returnList = FALSE,
progress = FALSE,
...) {
lmmcall <- match.call(expand.dots = TRUE)
test.stat <- match.arg(test.stat)
maindata <- as.matrix(maindata)
# Catch errors
if (length(findbars(modelFormula)) == 0) {
stop("No random effects terms specified in formula")
}
if (ncol(maindata) != nrow(metadata)) {
stop("maindata columns different size to metadata rows")
}
if (!is.null(offset) & ncol(maindata) != length(offset)) {
stop("Different offset length")
}
# Manipulate formulae
fullFormula <- update.formula(modelFormula, gene ~ ., simplify = FALSE)
subFormula <- subbars(modelFormula)
variables <- rownames(attr(terms(subFormula), "factors"))
subsetMetadata <- metadata[, variables]
if (is.null(id)) {
fb <- findbars(modelFormula)
id <- sub(".*[|]", "", fb)
id <- gsub(" ", "", id)
}
ids <- as.character(metadata[, id])
# Option to subset to remove unpaired samples
if (removeSingles) {
nonSingle <- names(table(ids))[table(ids) > 1]
pairedIndex <- ids %in% nonSingle
maindata <- maindata[, pairedIndex]
subsetMetadata <- subsetMetadata[pairedIndex, ]
ids <- ids[pairedIndex]
offset <- offset[pairedIndex]
}
if (verbose) cat(paste0("\nn = ", length(ids), " samples, ",
length(unique(ids)), " individuals\n"))
# setup model prediction
FEformula <- nobars(modelFormula)
if (is.null(modelData)) {
reducedVars <- rownames(attr(terms(FEformula), "factors"))
varLevels <- lapply(reducedVars, function(x) {
if (is.factor(metadata[, x])) {
return(levels(subsetMetadata[, x]))
} else {sort(unique(subsetMetadata[, x]))}
})
modelData <- expand.grid(varLevels)
colnames(modelData) <- reducedVars
}
if (is.null(designMatrix)){
designMatrix <- model.matrix(FEformula, modelData)
}
start <- Sys.time()
fullList <- lapply(rownames(maindata), function(i) maindata[i, ])
if (!is.null(reduced)) {
# LRT
if (length(findbars(reduced)) == 0) {
stop("No random effects terms specified in reduced formula")
}
subReduced <- subbars(reduced)
redvars <- rownames(attr(terms(subReduced), "factors"))
if (any(!redvars %in% variables)) {
stop("Extra terms in reduced formula not found full formula")
}
reduced <- update.formula(reduced, gene ~ ., simplify = FALSE)
test.stat <- "LRT"
}
if (test.stat == "Wald") {
# Adapted from lme4::modular / lme4::lmer
subsetMetadata$gene <- maindata[1, ]
lmod <- lFormula(fullFormula, subsetMetadata,
offset = offset, control = control, ...)
hyp.matrix <- hyp_matrix(fullFormula, metadata, "gene")
# For each gene perform a fit
# lmerFast
if (Sys.info()["sysname"] == "Windows" & cores > 1) {
cl <- makeCluster(cores)
on.exit(stopCluster(cl))
clusterExport(cl, varlist = c("lmerFast",
"lmod", "control", "modelData",
"designMatrix",
"hyp.matrix"),
envir = environment())
if (progress) {
resultList <- pblapply(fullList, function(geneRow) {
lmerFast(geneRow, lmod, control,
modelData, designMatrix, hyp.matrix)
}, cl = cl)
} else {
resultList <- parLapply(cl = cl, fullList, function(geneRow) {
lmerFast(geneRow, lmod, control,
modelData, designMatrix, hyp.matrix)
})
}
} else{
if (progress) {
resultList <- pbmclapply(fullList, function(geneRow) {
lmerFast(geneRow, lmod, control,
modelData, designMatrix, hyp.matrix)
}, mc.cores = cores)
if ("value" %in% names(resultList)) resultList <- resultList$value
} else {
resultList <- mclapply(fullList, function(geneRow) {
lmerFast(geneRow, lmod, control,
modelData, designMatrix, hyp.matrix)
}, mc.cores = cores)
}
}
} else {
# lmerTest or LRT
if (Sys.info()["sysname"] == "Windows" & cores > 1) {
cl <- makeCluster(cores)
on.exit(stopCluster(cl))
dots <- list(...)
varlist <- c("lmerTestCore", "fullList", "fullFormula", "reduced",
"subsetMetadata", "control", "modelData", "offset",
"designMatrix", "dots")
clusterExport(cl, varlist = varlist, envir = environment())
if (progress) {
resultList <- pblapply(fullList, function(geneRow) {
args <- c(list(geneRow = geneRow, fullFormula = fullFormula,
reduced = reduced,
data = subsetMetadata, control = control,
modelData = modelData, offset = offset,
designMatrix = designMatrix), dots)
do.call(lmerTestCore, args)
}, cl = cl)
} else {
resultList <- parLapply(cl = cl, fullList, function(geneRow) {
args <- c(list(geneRow = geneRow, fullFormula = fullFormula,
reduced = reduced,
data = subsetMetadata, control = control,
modelData = modelData, offset = offset,
designMatrix = designMatrix), dots)
do.call(lmerTestCore, args)
})
}
} else{
if (progress) {
resultList <- pbmclapply(fullList, function(geneRow) {
lmerTestCore(geneRow, fullFormula = fullFormula, reduced = reduced,
data = subsetMetadata, control = control,
modelData = modelData, offset = offset,
designMatrix = designMatrix, ...)
}, mc.cores = cores)
if ("value" %in% names(resultList)) resultList <- resultList$value
} else {
resultList <- mclapply(fullList, function(geneRow) {
lmerTestCore(geneRow, fullFormula = fullFormula, reduced = reduced,
data = subsetMetadata, control = control,
modelData = modelData, offset = offset,
designMatrix = designMatrix, ...)
}, mc.cores = cores)
}
}
}
if(returnList) return(resultList)
# Output
names(resultList) <- rownames(maindata)
noErr <- vapply(resultList, function(x) x$tryErrors == "", FUN.VALUE = TRUE)
if (sum(noErr) == 0) {
message("All genes returned an error. Check call. Check sufficient data in each group")
outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
FUN.VALUE = c("test"))
print(outputErrors[1])
return(outputErrors)
}
# Print timing if verbose
end <- Sys.time()
if (verbose) print(end - start)
predList <- lapply(resultList[noErr], "[[", "predict")
outputPredict <- do.call(rbind, predList)
outLabels <- apply(modelData, 1, function(x) paste(x, collapse = "_"))
colnames(outputPredict) <- c(paste0("y_", outLabels),
paste0("LCI_", outLabels),
paste0("UCI_", outLabels))
if (sum(!noErr) != 0) {
if (verbose) cat(paste0("Errors in ", sum(!noErr), " gene(s): ",
paste0(names(noErr)[! noErr], collapse = ", ")))
outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
FUN.VALUE = character(1))
} else {outputErrors <- c("No errors")}
optInfo <- t(vapply(resultList[noErr], function(x) {
setNames(x$optinfo, c("Singular", "Conv"))
}, FUN.VALUE = c(1, 1)))
s <- organiseStats(resultList[noErr], test.stat)
meanExp <- rowMeans(maindata[noErr, , drop = FALSE])
s$res <- cbind(s$res, meanExp)
# Create lmmSeq object with results
new("lmmSeq",
info = list(call = lmmcall,
offset = offset,
designMatrix = designMatrix,
control = substitute(control),
test.stat = test.stat),
formula = fullFormula,
stats = s,
predict = outputPredict,
reduced = reduced,
maindata = maindata,
metadata = subsetMetadata,
modelData = modelData,
optInfo = optInfo,
errors = outputErrors,
vars = list(id = id,
removeSingles = removeSingles)
)
}
## see lme4::modular
#' @importFrom lme4 mkLmerDevfun optimizeLmer checkConv mkMerMod
#'
lmerFast <- function(geneRow,
lmod,
control,
modelData,
designMatrix,
hyp.matrix) {
lmod$fr$gene <- geneRow
devfun <- do.call(mkLmerDevfun, c(lmod, list(control=control)))
opt <- optimizeLmer(devfun,
optimizer = control$optimizer,
restart_edge = control$restart_edge,
boundary.tol = control$boundary.tol,
control = control$optCtrl,
calc.derivs=control$calc.derivs,
use.last.params=control$use.last.params)
cc <- try(suppressMessages(suppressWarnings(
checkConv(attr(opt,"derivs"), opt$par,
ctrl = control$checkConv,
lbound = environment(devfun)$lower)
)), silent = TRUE)
fit <- mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr,
lme4conv=cc)
if (!inherits(fit, "try-error")) {
# intercept dropped genes
if (length(attr(fit@pp$X, "msgRankdrop")) > 0) {
return( list(stats = NA, predict = NA, optinfo = NA,
tryErrors = attr(fit@pp$X, "msgRankdrop")) )
}
stats <- setNames(c(AIC(fit), as.numeric(logLik(fit))),
c("AIC", "logLik"))
fixedEffects <- lme4::fixef(fit)
stdErr <- coef(summary(fit))[, 2]
vcov. <- suppressWarnings(vcov(fit, complete = FALSE))
vcov. <- as.matrix(vcov.)
waldtest <- lmer_wald(fixedEffects, hyp.matrix, vcov.)
newY <- predict(fit, newdata = modelData, re.form = NA)
a <- designMatrix %*% vcov.
b <- as.matrix(a %*% t(designMatrix))
predVar <- diag(b)
newSE <- sqrt(predVar)
newLCI <- newY - newSE * 1.96
newUCI <- newY + newSE * 1.96
predictdf <- c(newY, newLCI, newUCI)
singular <- as.numeric(lme4::isSingular(fit))
conv <- length(slot(fit, "optinfo")$conv$lme4$messages)
ret <- list(stats = stats,
coef = fixedEffects,
stdErr = stdErr,
chisq = waldtest$chisq,
df = waldtest$df,
predict = predictdf,
optinfo = c(singular, conv),
tryErrors = "")
return(ret)
} else {
return(list(stats = NA, coef = NA, stdErr = NA, chisq = NA, df = NA,
predict = NA, optinfo = NA, tryErrors = fit[1]))
}
}
lmerTestCore <- function(geneRow,
fullFormula,
reduced,
data,
control,
modelData,
designMatrix,
offset,
...) {
data[, "gene"] <- geneRow
dots <- list(...)
REML <- if (is.null(dots$REML)) TRUE else dots$REML
if (!is.null(reduced)) REML <- FALSE
fit <- try(suppressMessages(suppressWarnings(
lmerTest::lmer(fullFormula, data = data, control = control, offset = offset,
REML = REML, ...))),
silent = TRUE)
if (!inherits(fit, "try-error")) {
# intercept dropped genes
if (length(attr(fit@pp$X, "msgRankdrop")) > 0) {
return( list(stats = NA, predict = NA, optinfo = NA,
tryErrors = attr(fit@pp$X, "msgRankdrop")) )
}
stats <- setNames(c(AIC(fit), as.numeric(logLik(fit))),
c("AIC", "logLik"))
fixedEffects <- lme4::fixef(fit)
stdErr <- coef(summary(fit))[, 2]
vcov. <- suppressWarnings(vcov(fit, complete = FALSE))
vcov. <- as.matrix(vcov.)
if (is.null(reduced)) {
# F test
test <- as.matrix(anova(fit)[, -c(1,2)])
} else {
# LRT
fit2 <- try(suppressMessages(suppressWarnings(
lmerTest::lmer(reduced, data = data, control = control, offset = offset,
REML = FALSE, ...))),
silent = TRUE)
if (!inherits(fit2, "try-error")) {
lrt <- suppressMessages(anova(fit, fit2))
test <- unlist(lrt[2, c("Chisq", "Df", "Pr(>Chisq)")])
} else {
test <- setNames(c(NA, NA, NA), c("Chisq", "Df", "Pr(>Chisq)"))
}
}
newY <- predict(fit, newdata = modelData, re.form = NA)
a <- designMatrix %*% vcov.
b <- as.matrix(a %*% t(designMatrix))
predVar <- diag(b)
newSE <- sqrt(predVar)
newLCI <- newY - newSE * 1.96
newUCI <- newY + newSE * 1.96
predictdf <- c(newY, newLCI, newUCI)
singular <- as.numeric(lme4::isSingular(fit))
conv <- length(slot(fit, "optinfo")$conv$lme4$messages)
rm(fit, data)
ret <- list(stats = stats,
coef = fixedEffects,
stdErr = stdErr,
test = test,
predict = predictdf,
optinfo = c(singular, conv),
tryErrors = "")
return(ret)
} else {
return(list(stats = NA, coef = NA, stdErr = NA, test = NA,
predict = NA, optinfo = NA, tryErrors = fit[1]))
}
}
organiseStats <- function(resultList, test.stat) {
statsList <- lapply(resultList, "[[", "stats")
s <- do.call(rbind, statsList)
coefList <- lapply(resultList, "[[", "coef")
cf <- do.call(rbind, coefList)
SEList <- lapply(resultList, "[[", "stdErr")
stdErr <- do.call(rbind, SEList)
if (test.stat == "Wald") {
chisqList <- lapply(resultList, "[[", "chisq")
chisq <- do.call(rbind, chisqList)
dfList <- lapply(resultList, "[[", "df")
df <- do.call(rbind, dfList)
pvals <- pchisq(chisq, df=df, lower.tail = FALSE)
colnames(df) <- colnames(chisq)
colnames(pvals) <- colnames(chisq)
s <- list(res = s, coef = cf, stdErr = stdErr, Chisq = chisq, Df = df,
pvals = pvals)
} else if (test.stat == "F") {
NumDF <- lapply(resultList, function(x) x$test[,1])
NumDF <- do.call(rbind, NumDF)
DenDF <- lapply(resultList, function(x) x$test[,2])
DenDF <- do.call(rbind, DenDF)
Fval <- lapply(resultList, function(x) x$test[,3])
Fval <- do.call(rbind, Fval)
pvals <- lapply(resultList, function(x) x$test[,4])
pvals <- do.call(rbind, pvals)
if (ncol(NumDF) == 1) {
colnames(NumDF) <- colnames(DenDF) <- colnames(Fval) <-
colnames(pvals) <- rownames(resultList[[1]]$test)
}
s <- list(res = s, coef = cf, stdErr = stdErr, NumDF = NumDF, DenDF = DenDF,
Fval = Fval, pvals = pvals)
} else {
# LRT
LRT <- lapply(resultList, "[[", "test")
LRT <- do.call(rbind, LRT)
chisq <- LRT[,1, drop = FALSE]
df <- LRT[,2, drop = FALSE]
pvals <- LRT[,3, drop = FALSE]
colnames(chisq) <- colnames(df) <- colnames(pvals) <- "LRT"
s <- list(res = s, coef = cf, stdErr = stdErr, Chisq = chisq, Df = df,
pvals = pvals)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.