Nothing
#' GLMM with negative binomial distribution for sequencing count data
#'
#' Fits many generalised linear mixed effects models (GLMM) with negative
#' binomial distribution for analysis of overdispersed count data with random
#' effects. Designed for longitudinal analysis of RNA-Sequencing count data.
#'
#' @param modelFormula the model formula. This must be of the form `"~ ..."`
#' where the structure is assumed to be `"counts ~ ..."`. The formula must
#' include a random effects term. For more information on formula structure
#' for random effects see \code{\link[lme4:glmer]{lme4::glmer()}}
#' @param countdata the sequencing count data matrix with genes in rows and
#' samples in columns
#' @param metadata a dataframe of sample information with variables in columns
#' and samples in rows
#' @param id Optional. Used to specify the column in metadata which contains the
#' sample IDs to be used in repeated samples for random effects. If not
#' specified, the function defaults to using the variable after the "|" in the
#' random effects term in the formula.
#' @param dispersion a numeric vector of gene dispersion. Not required for
#' `glmmTMB` models, as these determine and fit dispersion for each gene.
#' @param sizeFactors size factors (default = NULL). If provided the `glmer`
#' offset is set to log(sizeFactors). For more information see``
#' \code{\link[lme4:glmer]{lme4::glmer()}}
#' @param reduced Optional reduced model formula. If this is chosen, a
#' likelihood ratio test is used to calculate p-values instead of the default
#' Wald type 2 Chi-squared test.
#' @param modelData Optional dataframe. Default is generated by call to
#' `expand.grid` using levels of variables in the formula. Used to calculate
#' model predictions (estimated means & 95% CI) for plotting via [modelPlot].
#' It can therefore be used to add/remove points in [modelPlot].
#' @param designMatrix custom design matrix, used only for prediction
#' @param method Specifies which package to use for fitting GLMM models. Either
#' "lme4" or "glmmTMB" depending on whether to use [lme4::glmer] or
#' [glmmTMB::glmmTMB] to fit GLMM models.
#' @param control the `glmer` optimizer control. If `method = "lme4"` default is
#' `glmerControl(optimizer = "bobyqa")`). If
#' `method = "glmmTMB"` default is `glmmTMBControl()`
#' @param family Only used with `glmmTMB` models. Default is `nbinom2`. See
#' [glmmTMB::nbinom2]
#' @param cores number of cores to use. Default = 1.
#' @param removeSingles whether to remove individuals without repeated measures
#' (default = FALSE)
#' @param zeroCount numerical value to offset zeroes for the purpose of log
#' (default = 0.125)
#' @param verbose Logical whether to display messaging (default = TRUE)
#' @param returnList Logical whether to return results as a list or `glmmSeq`
#' object (default = FALSE). Useful for debugging.
#' @param progress Logical whether to display a progress bar
#' @param ... Other parameters to pass to
#' \code{\link[lme4:glmer]{lme4::glmer()}}
#' @return Returns an S4 class `GlmmSeq` object with results for gene-wise
#' general linear mixed models. A list of results is returned if `returnList`
#' is `TRUE` which is useful for debugging. If all genes return errors from
#' `glmer`, then an error message is shown and a character vector containing
#' error messages for all genes is returned.
#' @details
#' This function is a wrapper for [lme4::glmer()]. By default, p-values for each
#' model term are computed using Wald type 2 Chi-squared test as per
#' [car::Anova()]. The underlying code for this has been optimised for speed.
#' However, if a reduced model formula is specified by setting `reduced`, then a
#' likelihood ratio test is performed instead using [stats::anova]. This will
#' double computation time since two GLMM have to be fitted.
#'
#' Parallelisation is provided using [parallel::mclapply] on Unix/Mac or
#' [parallel::parLapply] on PC.
#'
#' Setting `method = "glmmTMB"` enables an alternative method of fitting GLMM
#' using the `glmmTMB` package. This gives access to a variety of alternative
#' GLM family functions. Note, `glmmTMB` negative binomial models are
#' substantially slower to fit than `glmer` models with known dispersion due to
#' the extra time taken by `glmmTMB` to optimise the dispersion parameter.
#'
#' The `id` argument is usually optional. By default the `id` column in the
#' metadata is determined as the term after the bar in the random effects term
#' of the model. Note that `id` is not passed to `glmer` or `glmmTMB`. It is
#' only really used to remove singletons from the `countdata` matrix and
#' `metadata` dataframe. The `id` is also stored in the output from `glmmSeq`
#' and used by plotting function [modelPlot()]. However, due to its flexible
#' nature, in theory `glmmSeq` should allow for more than one random effect
#' term, although this has not been tested formally. In this case, it is
#' probably prudent to specify a value for `id`.
#'
#' @seealso [lme4::glmer] [lme4::glmerControl] [glmmTMB::glmmTMB]
#' [glmmTMB::nbinom2] [glmmTMB::glmmTMBControl] [car::Anova]
#'
#' @examples
#' data(PEAC_minimal_load)
#' disp <- apply(tpm, 1, function(x) {
#' (var(x, na.rm = TRUE)-mean(x, na.rm = TRUE))/(mean(x, na.rm = TRUE)**2)
#' })
#' MS4A1glmm <- glmmSeq(~ Timepoint * EULAR_6m + (1 | PATID),
#' countdata = tpm[1:2, ],
#' metadata = metadata,
#' dispersion = disp,
#' verbose = FALSE)
#' names(attributes(MS4A1glmm))
#'
#' @importFrom MASS negative.binomial
#' @importFrom lme4 subbars findbars glmer fixef glmerControl nobars isSingular
#' @importFrom parallel mclapply detectCores parLapply makeCluster clusterEvalQ
#' clusterExport stopCluster
#' @importFrom pbmcapply pbmclapply
#' @importFrom pbapply pblapply
#' @importFrom car Anova
#' @importFrom glmmTMB glmmTMB glmmTMBControl nbinom2 sigma
#' @importFrom methods slot new
#' @importFrom stats AIC complete.cases logLik reshape terms vcov pchisq
#' update.formula model.matrix predict setNames coef
#' @export
glmmSeq <- function(modelFormula,
countdata,
metadata,
id = NULL,
dispersion = NA,
sizeFactors = NULL,
reduced = NULL,
modelData = NULL,
designMatrix = NULL,
method = c("lme4", "glmmTMB"),
control = NULL,
family = nbinom2,
cores = 1,
removeSingles = FALSE,
zeroCount = 0.125,
verbose = TRUE,
returnList = FALSE,
progress = FALSE,
...) {
glmmcall <- match.call(expand.dots = TRUE)
method <- match.arg(method)
if (is.null(control)) {
control <- switch(method,
"lme4" = glmerControl(optimizer = "bobyqa"),
"glmmTMB" = glmmTMBControl())
}
countdata <- as.matrix(countdata)
# Catch errors
if (length(findbars(modelFormula)) == 0) {
stop("No random effects terms specified in formula")
}
if (ncol(countdata) != nrow(metadata)) {
stop("countdata columns different size to metadata rows")
}
if (!is.null(sizeFactors) & ncol(countdata) != length(sizeFactors)) {
stop("Different sizeFactors length")
}
if (! is.numeric(zeroCount)) stop("zeroCount must be numeric")
if (zeroCount < 0) stop("zeroCount must be >= 0")
if (zeroCount > 0) countdata[countdata == 0] <- zeroCount
# Manipulate formulae
fullFormula <- update.formula(modelFormula, count ~ ., simplify = FALSE)
subFormula <- subbars(modelFormula)
variables <- rownames(attr(terms(subFormula), "factors"))
subsetMetadata <- metadata[, variables]
if (is.null(id)) {
fb <- findbars(modelFormula)
id <- sub(".*[|]", "", fb)
id <- gsub(" ", "", id)
}
ids <- as.character(metadata[, id])
# Option to subset to remove unpaired samples
if (removeSingles) {
nonSingles <- names(table(ids))[table(ids) > 1]
nonSingleIDs <- ids %in% nonSingles
countdata <- countdata[, nonSingleIDs]
sizeFactors <- sizeFactors[nonSingleIDs]
subsetMetadata <- subsetMetadata[nonSingleIDs, ]
ids <- ids[nonSingleIDs]
}
if (!is.null(sizeFactors)) offset <- log(sizeFactors) else offset <- NULL
if (verbose) cat(paste0("\nn = ", length(ids), " samples, ",
length(unique(ids)), " individuals\n"))
# setup model prediction
FEformula <- nobars(modelFormula)
if (is.null(modelData)) {
reducedVars <- rownames(attr(terms(FEformula), "factors"))
varLevels <- lapply(reducedVars, function(x) {
if (is.factor(metadata[, x])) {
return(levels(subsetMetadata[, x]))
} else {sort(unique(subsetMetadata[, x]))}
})
modelData <- expand.grid(varLevels)
colnames(modelData) <- reducedVars
if (method == "glmmTMB") {
modelData <- cbind(modelData, .id = NA)
colnames(modelData)[which(colnames(modelData) == ".id")] <- id
}
}
if (is.null(designMatrix)){
designMatrix <- model.matrix(FEformula, modelData)
}
if (is.null(reduced)) {
test.stat <- "Wald"
hyp.matrix <- hyp_matrix(fullFormula, metadata, "count")
} else {
# LRT
if (length(findbars(reduced)) == 0) {
stop("No random effects terms specified in reduced formula")
}
subReduced <- subbars(reduced)
redvars <- rownames(attr(terms(subReduced), "factors"))
if (any(!redvars %in% variables)) {
stop("Extra terms in reduced formula not found full formula")
}
reduced <- update.formula(reduced, count ~ ., simplify = FALSE)
test.stat <- "LRT"
hyp.matrix <- NULL
}
start <- Sys.time()
if (method == "lme4") {
if (!all(rownames(countdata) %in% names(dispersion))) {
stop("Some dispersion values are missing")
}
fullList <- lapply(rownames(countdata), function(i) {
list(y = countdata[i, ], dispersion = dispersion[i])
})
# For each gene perform a fit
if (Sys.info()["sysname"] == "Windows" & cores > 1) {
cl <- makeCluster(cores)
on.exit(stopCluster(cl))
dots <- list(...)
varlist <- c("glmerCore", "fullList", "fullFormula", "reduced",
"subsetMetadata", "control", "modelData",
"offset", "designMatrix", "hyp.matrix", "dots")
clusterExport(cl, varlist = varlist, envir = environment())
if (progress) {
resultList <- pblapply(fullList, function(geneList) {
args <- c(list(geneList=geneList, fullFormula=fullFormula,
reduced=reduced,
data=subsetMetadata, control=control, offset=offset,
modelData=modelData, designMatrix=designMatrix,
hyp.matrix=hyp.matrix), dots)
do.call(glmerCore, args)
}, cl = cl)
} else {
resultList <- parLapply(cl = cl, fullList, function(geneList) {
args <- c(list(geneList=geneList, fullFormula=fullFormula,
reduced=reduced,
data=subsetMetadata, control=control, offset=offset,
modelData=modelData, designMatrix=designMatrix,
hyp.matrix=hyp.matrix), dots)
do.call(glmerCore, args)
})
}
} else{
if (progress) {
resultList <- pbmclapply(fullList, function(geneList) {
glmerCore(geneList, fullFormula, reduced, subsetMetadata,
control, offset, modelData, designMatrix, hyp.matrix, ...)
}, mc.cores = cores)
if ("value" %in% names(resultList)) resultList <- resultList$value
} else {
resultList <- mclapply(fullList, function(geneList) {
glmerCore(geneList, fullFormula, reduced, subsetMetadata,
control, offset, modelData, designMatrix, hyp.matrix, ...)
}, mc.cores = cores)
}
}
} else {
# glmmTMB
fullList <- lapply(rownames(countdata), function(i) {
countdata[i, ]
})
# For each gene perform a fit
if (Sys.info()["sysname"] == "Windows" & cores > 1) {
cl <- makeCluster(cores)
on.exit(stopCluster(cl))
dots <- list(...)
varlist <- c("glmmTMBcore", "fullList", "fullFormula", "reduced",
"subsetMetadata", "family", "control",
"modelData", "offset", "designMatrix", "hyp.matrix", "dots")
clusterExport(cl, varlist = varlist, envir = environment())
if (progress) {
resultList <- pblapply(fullList, function(geneList) {
args <- c(list(geneList=geneList, fullFormula=fullFormula,
reduced=reduced,
data=subsetMetadata, family=family, control=control,
offset=offset, modelData=modelData,
designMatrix=designMatrix, hyp.matrix=hyp.matrix), dots)
do.call(glmmTMBcore, args)
}, cl = cl)
} else {
resultList <- parLapply(cl = cl, fullList, function(geneList) {
args <- c(list(geneList=geneList, fullFormula=fullFormula,
reduced=reduced,
data=subsetMetadata, family=family, control=control,
offset=offset, modelData=modelData,
designMatrix=designMatrix, hyp.matrix=hyp.matrix), dots)
do.call(glmmTMBcore, args)
})
}
} else{
if (progress) {
resultList <- pbmclapply(fullList, function(geneList) {
glmmTMBcore(geneList, fullFormula, reduced, subsetMetadata, family,
control, offset, modelData, designMatrix, hyp.matrix, ...)
}, mc.cores = cores)
if ("value" %in% names(resultList)) resultList <- resultList$value
} else {
resultList <- mclapply(fullList, function(geneList) {
glmmTMBcore(geneList, fullFormula, reduced, subsetMetadata, family,
control, offset, modelData, designMatrix, hyp.matrix, ...)
}, mc.cores = cores)
}
}
}
if(returnList) return(resultList)
# Output
names(resultList) <- rownames(countdata)
noErr <- vapply(resultList, function(x) x$tryErrors == "", FUN.VALUE = TRUE)
if (sum(noErr) == 0) {
message("All genes returned an error. Check call. Check sufficient data in each group")
outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
FUN.VALUE = c("test"))
print(outputErrors[1])
return(outputErrors)
}
# Print timing if verbose
end <- Sys.time()
if (verbose) print(end - start)
predList <- lapply(resultList[noErr], "[[", "predict")
outputPredict <- do.call(rbind, predList)
outLabels <- apply(modelData, 1, function(x) paste(x, collapse = "_"))
colnames(outputPredict) <- c(paste0("y_", outLabels),
paste0("LCI_", outLabels),
paste0("UCI_", outLabels))
optInfo <- t(vapply(resultList[noErr], function(x) {
setNames(x$optinfo, c("Singular", "Conv"))
}, FUN.VALUE = c(1, 1)))
s <- organiseStats(resultList[noErr], "Wald")
meanExp <- rowMeans(log2(countdata[noErr, , drop = FALSE] +1))
s$res <- cbind(s$res, meanExp)
if (method == "lme4") {
if (sum(!noErr) != 0) {
if (verbose) cat(paste("Errors in", sum(!noErr), "gene(s):",
paste(names(noErr)[! noErr], collapse = ", ")))
outputErrors <- vapply(resultList[!noErr], function(x) {x$tryErrors},
FUN.VALUE = c("test"))
} else {outputErrors <- c("No errors")}
} else {
# glmmTMB
err <- is.na(s$res[, 'AIC'])
if (any(err)) {
if (verbose) cat(paste("Errors in", sum(err), "gene(s):",
paste(names(err)[err], collapse = ", ")))
outputErrors <- vapply(resultList[err], function(x) {x$message},
FUN.VALUE = character(1))
} else outputErrors <- c("No errors")
}
# Create GlmmSeq object with results
new("GlmmSeq",
info = list(call = glmmcall,
offset = offset,
designMatrix = designMatrix,
method = method,
control = control,
family = substitute(family),
test.stat = test.stat,
dispersion = dispersion),
formula = fullFormula,
stats = s,
predict = outputPredict,
reduced = reduced,
countdata = countdata,
metadata = subsetMetadata,
modelData = modelData,
optInfo = optInfo,
errors = outputErrors,
vars = list(id = id,
removeSingles = removeSingles)
)
}
glmerCore <- function(geneList, fullFormula, reduced, data, control, offset,
modelData, designMatrix, hyp.matrix, ...) {
data[, "count"] <- geneList$y
disp <- geneList$dispersion
fit <- try(suppressMessages(suppressWarnings(
lme4::glmer(fullFormula, data = data, control = control, offset = offset,
family = MASS::negative.binomial(theta = 1/disp), ...)
)), silent = TRUE)
if (!inherits(fit, "try-error")) {
# intercept dropped genes
if (length(attr(fit@pp$X, "msgRankdrop")) > 0) {
return( list(stats = NA, predict = NA, optinfo = NA,
tryErrors = attr(fit@pp$X, "msgRankdrop")) )
}
stdErr <- suppressWarnings(coef(summary(fit))[, 2])
singular <- as.numeric(lme4::isSingular(fit))
conv <- length(slot(fit, "optinfo")$conv$lme4$messages)
vcov. <- suppressWarnings(as.matrix(vcov(fit, complete = FALSE)))
fixedEffects <- fixef(fit)
stats <- setNames(c(disp, AIC(fit),
as.numeric(logLik(fit))),
c("Dispersion", "AIC", "logLik"))
if (is.null(reduced)) {
test <- lmer_wald(fixedEffects, hyp.matrix, vcov.)
} else {
# LRT
fit2 <- try(suppressMessages(suppressWarnings(
lme4::glmer(reduced, data = data, control = control, offset = offset,
family = MASS::negative.binomial(theta = 1/disp), ...)
)), silent = TRUE)
if (!inherits(fit2, "try-error")) {
lrt <- anova(fit, fit2)
test <- list(chisq = setNames(lrt$Chisq[2], "LRT"), df = lrt$Df[2])
} else {
test <- list(chisq = NA, df = NA)
}
}
newY <- predict(fit, newdata = modelData, re.form = NA)
a <- designMatrix %*% vcov.
b <- as.matrix(a %*% t(designMatrix))
predVar <- diag(b)
newSE <- sqrt(predVar)
newLCI <- exp(newY - newSE * 1.96)
newUCI <- exp(newY + newSE * 1.96)
predictdf <- c(exp(newY), newLCI, newUCI)
# rm(fit, data)
return(list(stats = stats,
coef = fixedEffects,
stdErr = stdErr,
chisq = test$chisq,
df = test$df,
predict = predictdf,
optinfo = c(singular, conv),
tryErrors = "") )
} else {
return(list(stats = NA, coef = NA, stdErr = NA, chisq = NA, df = NA,
predict = NA, optinfo = NA, tryErrors = fit[1]))
}
}
glmmTMBcore <- function(geneList, fullFormula, reduced, data, family, control,
offset, modelData, designMatrix, hyp.matrix, ...) {
data[, "count"] <- geneList
fit <- try(suppressMessages(suppressWarnings(
glmmTMB(fullFormula, data, family, control = control, offset = offset, ...)
)), silent = TRUE)
if (!inherits(fit, "try-error")) {
singular <- conv <- NA
stdErr <- suppressWarnings(coef(summary(fit))$cond[, 2])
vcov. <- vcov(fit)$cond
fixedEffects <- fixef(fit)$cond
disp <- sigma(fit)
msg <- fit$fit$message
stats <- setNames(c(disp, AIC(fit),
as.numeric(logLik(fit))),
c("Dispersion", "AIC", "logLik"))
if (is.null(reduced)) {
test <- lmer_wald(fixedEffects, hyp.matrix, vcov.)
} else {
# LRT
fit2 <- try(suppressMessages(suppressWarnings(
glmmTMB(reduced, data, family, control = control, offset = offset, ...)
)), silent = TRUE)
if (!inherits(fit2, "try-error")) {
lrt <- anova(fit, fit2)
test <- list(chisq = setNames(lrt$Chisq[2], "LRT"),
df = lrt[2, 'Chi Df'])
} else {
test <- list(chisq = NA, df = NA)
}
}
newY <- predict(fit, newdata = modelData, re.form = NA)
a <- designMatrix %*% vcov.
b <- as.matrix(a %*% t(designMatrix))
predVar <- diag(b)
newSE <- suppressWarnings(sqrt(predVar))
newLCI <- exp(newY - newSE * 1.96)
newUCI <- exp(newY + newSE * 1.96)
predictdf <- c(exp(newY), newLCI, newUCI)
# rm(fit, data)
return(list(stats = stats,
coef = fixedEffects,
stdErr = stdErr,
chisq = test$chisq,
df = test$df,
predict = predictdf,
optinfo = c(singular, conv),
message = msg,
tryErrors = "") )
} else {
return(list(stats = NA, coef = NA, stdErr = NA, chisq = NA, df = NA,
predict = NA, optinfo = NA, tryErrors = fit[1]))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.