### 1.1. ==== Run Generalised Linear Model in NIMBLE ====
#' @title Run Generalised Linear Model in NIMBLE
#'
#' @description A simple interface to run a generalised linear model using the NIMBLE
#' implementation of MCMC for Bayesian analysis using standard model notation familiar
#' to users of the \code{\link[base::glm]{glm}} function.
#'
#' @param modelFormula A \code{formula} project defining the relationship between the
#' response variable and the predictor variables in a similar manner to that used in
#' the \code{\link[base::glm]{glm}} function
#' @param inputData A \code{data.frame} with dataset containing the variables described in
#' the model formula
#' @param errorFamily A \code{family} object defining the error distribution of the
#' regression model and a possible use of a link function. This parameter can also be
#' a list with two elements: link and family. Each of these are a \code{character} scalar
#' assigning the link function and error family respectively
#' \code{\link[errorFamilies]{errorFamilies()}} will return a list of error distributions
#' supported by \code{glmNIMBLE} and the potential link functions associated with them.
#' @param regCoeffs A character scalar containing the text \code{"none"},
#' \code{"ridge"}, or \code{"lasso"}. This determines the type of regularisation to use for the
#' regression coefficients in the model. \code{"none"} (the default) results in wide normal priors
#' for the regression coefficients. \code{"ridge"} results in normal priors for each of the
#' regression coefficients regulated by a shared precision parameter which itself has a wide
#' gamma hyperprior. \code{"lasso"} results in Laplace priors for each of the regression
#' coefficients regulated by a shared rate parameter which itself has a wide gamma hyperprior
#' @param modelSuffix A \code{character} scalar to be used as a suffix to give to nodes in this
#' model. Defaults to emptry string (\code{""})
#' @param mcmcParams A list containing parameters regulating the Markov chain Monte Carlo
#' algorithm applied in NIMBLE. This list needs the following named elements: numRuns, numChains,
#' numBurnIn, thinDensity, predictThinDensity
#' @param numCores An \code{integer} scalar denoting the number of cores to use. MCMC chains
#' will be distributed across the cores. A value of \code{NA} or \code{0} results in the number
#' of cores being set equal to that returned by \code{\link[parallel::detectCores]{detectCores}}
#'
#' @details \code{mcmcParams} is a list containing the following elements:
#' \itemize{
#' \item{numSamples}{The number of iterations to sample from the posterior distribution in each chain}
#' \item{numChains}{The number of chains to simulate in the MCMC algorithm}
#' \item{numBurnIn}{The number of samples to remove at the beginning of each chain}
#' \item{thinDensity}{The thinning parameter to use for the model paramters}
#' \item{predictThinDensity}{The thinning parameter to use for the model predictions}
#' }
#' If any of the elements are missing from the \code{mcmcParams} object then sensible defaults are supplied.
#'
#' @return A \code{list} object containing the following elements:
#' \itemize{
#' \item{modelDefinition}{The NIMBLE code used in the model definition as would be the output
#' of \code{\link[nimble]{nimbleCode}}}
#' \item{compiledModel}{The compiled NIMBLE model object as would be produced by running
#' \code{\link[nimble]{nimbleModel}} followed by \code{\link[nimble]{compileNimble}}}
#' \item{compiledMCMC}{The compiled NIMBLE MCMC object as would be produced by running
#' \code{\link[nimble]{buildMCMC}} followed by \code{\link[nimble]{compileNimble}}}
#' \item{parameterSamples}{A \code{\link[coda]{mcmc.list}} object containing the samples from
#' the MCMC analysis}
#' \item{parameterSummary}{A \code{data.frame} containing summary statistics for each of the
#' sampled parameters}
#' \item{predictionSamples}{A \code{\link[coda]{mcmc.list}} object containing the samples of the
#' mean predictions from the MCMC analysis}
#' \item{predictionSummary}{A \code{data.frame} containing summary statistics for the mean
#' predictions}
#' \item{WAIC}{A scalar containing the Watanabe-Akaike information criterion for the model}
#' \item{DHARMaResiduals}{An object created by \code{\link[DHARMa]{createDHARMa}} that contains
#' an analysis of residuals of the model.}
#' \item{parameterFigure}{A graphical object (\code{\link[ggplot2::ggplot]{ggplot}}) containing violin plots
#' for each of the parameters}
#' \item{rSquared}{A \code{list} object containing the following elements: \code{samples}, a \link[coda]{mcmc.list}
#' object containing the sampled r squared values and a set of summary statistics for these samples across all chains.
#' The r squared is calculated according to [Gelman et al. 2019](https://doi.org/10.1080/00031305.2018.1549100)}
#' }
#'
#' @seealso \code{\link[DHARMa::createDHARMa]{createDHARMa}} \code{\link[nimble::nimbleCode]{nimbleCode}}
#' \code{\link[nimble::buildMCMC]{buildMCMC}} \code{\link[glmNIMBLE]{glmNIMBLE}} \code{\link[errorFamilies]{errorFamilies}}
#' \code{\link[base::glm]{glm}}
#' @author Joseph D. Chipperfield, \email{joechip90@@googlemail.com}
#'
glmNIMBLE <- function(modelFormula, inputData, errorFamily = gaussian, regCoeffs = "none", modelSuffix = "", mcmcParams = list(), numCores = 1) {
inMCMCParams <- sanityCheckMCMCParameters(mcmcParams)
# Create the linear model node specifications
modelNodeDefinitions <- linearModelToNodeDefinition(modelFormula, inputData, errorFamily, regCoeffs, modelSuffix)
# Create the model code to run in NIMBLE
modelCode <- nodeDefinitionToNIMBLECode(modelNodeDefinitions$stochasticNodeDef, modelNodeDefinitions$deterministicNodeDef)
# Retrieve the node names
nonDataNodeNames <- names(modelNodeDefinitions$stochasticNodeDef)[!(names(modelNodeDefinitions$stochasticNodeDef) %in% names(modelNodeDefinitions$inputData))]
predictionNodeNames <- names(modelNodeDefinitions$deterministicNodeDef)[grepl("^meanPred", names(modelNodeDefinitions$deterministicNodeDef), perl = TRUE)]
# Create a set of initial values for the stochastic non-data nodes
initValues <- setNames(lapply(X = nonDataNodeNames, FUN = function(curName) {
ifelse(grepl("Coeff$", curName, perl = TRUE), 0.0, 1.0)
}), nonDataNodeNames)
# Multiplier for the distributions that have a number of trials
numTrials <- 1.0
if(as.character(modelNodeDefinitions$family) == "binomial") {
numTrials <- modelNodeDefinitions$inputData[[which(grepl("NumTrials$", names(modelNodeDefinitions$inputData), perl = TRUE))[1]]]
}
# Set the intercept so that it is the mean value of the response variable
if(any(paste("intercept", as.character(modelSuffix), "Coeff", sep = "") == nonDataNodeNames)) {
initValues[[paste("intercept", as.character(modelSuffix), "Coeff", sep = "")]] <- applyLink(
mean(modelNodeDefinitions$inputData[[modelNodeDefinitions$responseDataNodeNames[1]]] / numTrials, na.rm = TRUE), modelNodeDefinitions$link)
}
# Compile and run the model
modelOutputs <- mcmcNIMBLERun(modelCode, modelNodeDefinitions$inputData, modelNodeDefinitions$inputConstants, nonDataNodeNames, predictionNodeNames, initValues, inMCMCParams, numCores, TRUE)
uncompiledModel <- modelOutputs$uncompiledModel
compiledModel <- modelOutputs$compiledModel
uncompiledMCMC <- modelOutputs$uncompiledMCMC
compiledMCMC <- modelOutputs$compiledMCMC
mcmcOutput <- modelOutputs$mcmcOutput
# Retrieve the offset values (if there are any)
offsetVals <- 0.0
if(!is.null(modelNodeDefinitions$inputData[[paste("offset", as.character(modelSuffix), sep = "")]])) {
offsetVals <- as.double(modelNodeDefinitions$inputData[[paste("offset", as.character(modelSuffix), sep = "")]])
}
sampledParams <- as.matrix(do.call(rbind, mcmcOutput$samples))
covMatrix <- as.matrix(as.data.frame(modelNodeDefinitions$inputData[
gsub("Coeff$", "", nonDataNodeNames[grepl("Coeff$", nonDataNodeNames, perl = TRUE) & !grepl("^intercept", nonDataNodeNames, perl = TRUE)], perl = TRUE)
]))
if(ncol(covMatrix) <= 0 || nrow(covMatrix) <= 0) {
covMatrix <- matrix(NA, nrow = max(sapply(X = modelNodeDefinitions$inputData, FUN = length)), ncol = 0)
}
# Simulate response values according to the sampled parameter values
simulatedValues <- apply(X = sampledParams, FUN = function(curRow, covMatrix, inFamily, inLink, inData, inOffset) {
interceptCoeffVal <- 0.0
if(any(grepl("^intercept.*Coeff$", names(curRow), perl = TRUE))) {
interceptCoeffVal <- curRow[grepl("^intercept.*Coeff$", names(curRow), perl = TRUE)]
}
if(ncol(covMatrix) <= 0) {
meanPredVals <- rep(0.0, nrow(covMatrix))
} else {
meanPredVals <- covMatrix %*% curRow[paste(colnames(covMatrix), "Coeff", sep = "")]
}
# Calculate the mean prediction values
meanPredVals <- applyInverseLink(
as.double(meanPredVals + interceptCoeffVal) + inOffset,
inLink)
# Retrieve the relevant scale parameter for the error distribution
scaleValues <- switch(inFamily,
gaussian = curRow[grepl("Prec$", names(curRow), perl = TRUE)],
gamma = curRow[grepl("SD$", names(curRow), perl = TRUE)],
beta = curRow[grepl("Scale$", names(curRow), perl = TRUE)],
poisson = c(),
binomial = inData[grepl("NumTrials$", names(inData), perl = TRUE)][[1]],
negbinomial = curRow[grepl("Scale$", names(curRow), perl = TRUE)])
# Sample from the relevant distribution
simulateFromErrorFamily(meanPredVals, scaleValues, inFamily)
}, covMatrix = covMatrix, inFamily = as.character(modelNodeDefinitions$family), inLink = as.character(modelNodeDefinitions$link),
inData = modelNodeDefinitions$inputData, inOffset = offsetVals, MARGIN = 1)
# Calculate the fitted median response
fittedPred <- mcmcOutput$summary$all.chains[grepl("^meanPred", rownames(mcmcOutput$summary$all.chains), perl = TRUE), "Median"] * numTrials
# Create a DHARMa object so that model checking can be done
residAnalysisOb <- createDHARMa(
simulatedResponse = simulatedValues,
observedResponse = modelNodeDefinitions$inputData[[modelNodeDefinitions$responseDataNodeNames]],
fittedPredictedResponse = fittedPred,
integerResponse = switch(as.character(modelNodeDefinitions$family),
gaussian = FALSE, gamma = FALSE, beta = FALSE,
poisson = TRUE, binomial = TRUE, negbinomial = TRUE
)
)
# Create violin plots of the regression coefficients
regCoeffNames <- nonDataNodeNames[grepl("Coeff$", nonDataNodeNames, perl = TRUE) & !grepl("^intercept", nonDataNodeNames, perl = TRUE)]
coeffPlot <- NULL
if(length(regCoeffNames) > 0) {
# Only plot the regression coefficients if there are any to plot
coeffPlot <- ggplot(do.call(rbind, lapply(X = regCoeffNames, FUN = function(curName, inData, inSuffix) {
data.frame(coeffVal = inData[, curName], covName = rep(gsub(paste(inSuffix, "Coeff$", sep = ""), "", curName, perl = TRUE), nrow(inData)))
}, inData = do.call(rbind, mcmcOutput$samples), inSuffix = as.character(modelSuffix))), aes(covName, coeffVal)) +
geom_violin(draw_quantiles = c(0.025, 0.5, 0.975)) + coord_flip() +
geom_hline(yintercept = 0.0, colour = "grey") + theme_classic() + ylab("Scaled coefficient value") +
theme(axis.title.y = element_blank(), axis.ticks.y = element_blank())
}
# Organise all the outputs into a list
list(
modelDefinition = modelCode,
compiledModel = compiledModel,
compiledMCMC = compiledMCMC,
parameterSamples = mcmcOutput$samples,
parameterSummary = mcmcOutput$summary$all.chains[nonDataNodeNames, ],
predictionSamples = mcmcOutput$samples2,
predictionSummary = mcmcOutput$summary$all.chains[!(rownames(mcmcOutput$summary$all.chains) %in% nonDataNodeNames), ],
WAIC = mcmcOutput$WAIC,
DHARMaResiduals = residAnalysisOb,
parameterFigure = coeffPlot,
stochasticNodeDef = modelNodeDefinitions$stochasticNodeDef,
deterministicNodeDef = modelNodeDefinitions$deterministicNodeDef,
modelFormula = as.formula(modelFormula),
covSummaryStats = modelNodeDefinitions$covSummaryStats,
modelSuffix = processSuffix(modelSuffix),
rSquared = bayesRSquared(mcmcOutput$samples2, modelNodeDefinitions$inputData[[modelNodeDefinitions$responseDataNodeNames]])
)
}
bayesRSquared <- function(predictionChains, observedResponse) {
# Sample variance calculation function
vzFunc <- function(zIn) {
sum((zIn - mean(zIn))^2) / (length(zIn) - 1.0)
}
# Sanity check the inputs
if("mcmc" %in% class(predictionChains)) {
predictionChains <- coda::mcmc.list(predictionChains)
} else if("mcmc.list" %in% class(predictionChains)) {
predictionChains <- tryCatch(as.mcmc.list(predictionChains), error = function(err) {
stop("error calculating R squared: ", err)
})
}
if(length(predictionChains) <= 0) {
stop("error calculating R squared: length of prediction chains is zero")
}
if(!is.numeric(observedResponse)) {
observedResponse <- tryCatch(as.numeric(observedResponse), error = function(err) {
stop("error calculating R squared: ", err)
})
}
if(length(observedResponse) <= 0) {
stop("error calculating R squared: length of the observed response is zero")
}
# Calculate the r squared values for each of the sampled predictors across the chains
sampledRSquared <- do.call(coda::mcmc.list, lapply(X = as.list(predictionChains), FUN = function(curChain, observedResponse) {
# Retrieve the samples from the current chain as a matrix
curMatrix <- as.matrix(curChain)
if(ncol(curMatrix) != length(observedResponse)) {
if(!is.null(names(observedResponse)) && !is.null(colnames(curMatrix))) {
if(!all(names(observedResponse) %in% colnames(curMatrix))) {
stop("error calculating R squared: response data has elements that are not present in the prediction chains")
} else {
curMatrix <- curMatrix[, names(observedResponse), drop = FALSE]
}
} else {
stop("error calculating R squared: response data length is different than column width in the prediction chains")
}
}
# Calculate the r squared for each sample of the MCMC chain
rSquaredVals <- apply(X = curMatrix, FUN = function(curPreds, observedResponse) {
# Remove any elements that have NA values in the observed response
valsToUse <- !is.na(observedResponse)
inCurPreds <- curPreds[valsToUse]
inObserved <- observedResponse[valsToUse]
# Calculate the variation in the vector of predicted values
varPred <- vzFunc(inCurPreds)
# Calculate the modelled residual variance
varRes <- vzFunc(inObserved - inCurPreds)
# Use these to calculate r squared
varPred / (varPred + varRes)
}, observedResponse = observedResponse, MARGIN = 1)
coda::mcmc(data = matrix(rSquaredVals, nrow = length(rSquaredVals), ncol = 1, dimnames = list(NULL, "rSquared")),
start = attr(curChain, "mcpar")[1], end = attr(curChain, "mcpar")[2], thin = attr(curChain, "mcpar")[3])
}, observedResponse = observedResponse))
sampledSummary <- summary(sampledRSquared)
# Calculate summary statistics of the r squared
summaryStats <- setNames(
c(sampledSummary$statistics, sampledSummary$quantiles),
gsub(" ", ".", tolower(c(names(sampledSummary$statistics), paste("quant", as.double(gsub("%", "", names(sampledSummary$quantiles), fixed = TRUE)) / 100.0, sep = ""))), fixed = TRUE)
)
# Arrange the summary statistics and the samples into a list
c(list(samples = sampledRSquared), as.list(summaryStats))
}
predict.glmNIMBLE <- function(modelOb, newData, type = "link") {
# Import the model object
inGLM <- tryCatch(as.list(modelOb), error = function(err) {
stop("error encountered processing model object: object is not correct type")
})
# Retrieve the model formula
inModForm <- tryCatch(as.formula(inGLM$modelFormula), error = function(err) {
stop("error encountered processing model object: ", err)
})
# Retrieve the centre and scaling information
cenScaleInfo <- tryCatch(as.matrix(inGLM$covSummaryStats), error = function(err) {
stop("error encountered processing model object: ", err)
})
if(!all(c("mean", "sd") %in% rownames(cenScaleInfo))) {
stop("error encountered processing model object: centre and scale information is not correctly formatted")
}
# Import the new data
inData <- tryCatch(as.data.frame(newData), error = function(err) {
stop("error encountered processing prediction data frame: ", err)
})
# Centre and scale the new data according to the moment of the training data (if required)
inData <- as.data.frame(setNames(lapply(X = names(inData), FUN = function(curCol, inData, cenScaleInfo) {
outVec <- inData[, curCol]
if(curCol %in% colnames(cenScaleInfo)) {
if(all(!is.na(cenScaleInfo[, curCol]))) {
outVec <- (inData[, curCol] - cenScaleInfo["mean", curCol]) / cenScaleInfo["sd", curCol]
}
}
outVec
}, inData = inData, cenScaleInfo = cenScaleInfo), names(inData)))
# Retrieve the model suffix (if it has one)
curSuffix <- ""
if(!is.null(inGLM$modelSuffix)){
curSuffix <- inGLM$modelSuffix
}
# Retrieve the model matrix
inModForm <- as.formula(gsub("^.*~", "~", Reduce(paste, deparse(inModForm)), perl = TRUE))
curModelMatrix <- model.matrix(inModForm, model.frame(inModForm, data = inData, na.action = na.pass))
# Check whether an intercept term is present in the model matrix
hasIntercept <- 0 %in% attr(curModelMatrix, "assign")
if(hasIntercept) {
# Remove the intercept term from the model matrix if it exists
curModelMatrix <- curModelMatrix[, attr(curModelMatrix, "assign") != 0, drop = FALSE]
}
if(ncol(curModelMatrix) > 0) {
# Rename the model matrix covariates to NIMBLE-friendly names
colnames(curModelMatrix) <- paste(makeBUGSFriendlyNames(colnames(curModelMatrix)), curSuffix, sep = "")
}
# Retrieve the prediction node of the model
predNode <- inGLM$deterministicNodeDef
if(is.null(predNode)) {
stop("error encountered processing model object: deterministic node specification is missing")
}
if(!(paste("meanPred", curSuffix, sep = "") %in% names(predNode))) {
stop("error encountered processing model object: prediction node is missing")
}
predNode <- predNode[[paste("meanPred", curSuffix, sep = "")]]
# Retrieve the prediction specification text
modSpecText <- tryCatch(gsub(paste("[1:meanPred", curSuffix, "N]", sep = ""), "", as.character(predNode), fixed = TRUE), error = function(err) {
stop("error encountered processing model object: ", err)
})
if(is.null(inGLM$parameterSamples)) {
stop("error encountered processing model object: unable to retrieve parameter samples")
}
# Retrieve the samples of the parameters as a matrix
paramSamples <- NULL
if(class(inGLM$parameterSamples) == "mcmc") {
paramSamples <- as.data.frame(inGLM$parameterSamples)
} else {
paramSamples <- tryCatch(do.call(rbind, lapply(X = inGLM$parameterSamples, FUN = as.data.frame)), error = function(err) {
stop("error encountered processing model object: parameter samples are in an incorrect format")
})
}
# Retrieve the type of prediction to make
inType <- tryCatch(as.character(type), error = function(err) {
stop("error encountered processing prediction: invalid prediction type")
})
if(is.null(inType) || length(inType) <= 0) {
inType <- "link"
} else if(length(inType) > 1) {
warning("prediction type specification has length greater than zero: only the first element will be used")
}
if(!(inType %in% c("link", "response"))) {
stop("error encountered processing prediction: invalid prediction type")
}
# Retrieve the link specification
linkFunc <- tryCatch(as.character(attr(predNode, "linkFunction")), error = function(err) {
stop("error encountered processing model object: ", err)
})
if(is.null(linkFunc) || length(linkFunc) <= 0) {
stop("error encountered processing model object: link function for prediction node not specified")
} else if(length(linkFunc) > 1) {
warning("link function specification has length greater than zero: only the first element will be used")
linkFunc <- linkFunc[1]
}
if(!(linkFunc %in% unique(unlist(errorFamilies())))) {
stop("error encountered processing model object: invalid link function detected")
}
# Go through each of the rows of the model frame and calculate the predictions for each
predOut <- t(apply(X = curModelMatrix, MARGIN = 1, FUN = function(curRow, paramSamples, modSpecText, inType, linkFunc) {
# Evaluate the model using the current parameter and covarite values
preds <- eval(parse(text = modSpecText), envir = c(paramSamples, as.list(curRow)))
if(inType == "response") {
preds <- applyInverseLink(preds, linkFunc)
}
preds
}, paramSamples = as.list(paramSamples), modSpecText = modSpecText, inType = inType, linkFunc = linkFunc))
# Return the prediction samples alongside the summary statistics
list(
predictionMatrix = predOut,
summary = t(apply(X = predOut, MARGIN = 1, FUN = function(curRow) {
setNames(
c(mean(curRow), median(curRow), sd(curRow), quantile(curRow, c(0.025, 0.975))),
c("Mean", "Median", "St.Dev.", "95%CI_low", "95%CI_upp")
)
}))
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.