# functions to set model info
# supported model types
model.type.list =
list(
linear = "Gaussian",
beta = "Beta",
binary = "Binomial",
categorical = "Categorical",
ordered = "Categorical (ordered)",
poisson = "Count",
neg.binomial = "Count (over-dispersed)",
survival = "Event History",
comp.risk = "Competing Risks"
)
#' Parse a formula into its component parts.
#'
#' This internal function takes a formula and data, parses the various components of the formula,
#' formats the data, and returns everything as a list.
#'
#' @family model setup functions
#' @param formula A formula in the format 'outcome ~ variables + specials' with all special components.
#' @param data The raw data that the formula will be run on.
#' @return A list with the transformed formula, the transformed data (based on functions in the formula),
#' original data, the originak formula, the response variables, the explanatory variables, the special
#' components to add to the formula, the random effects, and the fixed effects.
#'
parse_formula = function(formula, data) {
## setup a few variables
# select only relevant vars and complete cases
data = dplyr::select(data, all.vars(formula))
data = dplyr::filter(data, complete.cases(data))
# get the sides of the formula
formula.explanatory = formula(delete.response(terms(formula)))
formula.response = update(formula, . ~ 1)
formula.add = NULL
## find and deal with random effects
# get random effects terms
formula.random = lme4::findbars(formula.explanatory) # by hand: unlist(stringr::str_extract_all(as.character(formula[3]), "(?<=\\()[^\\)]*[\\|][^\\)|]*"))
# remove random effects if needed
if(!is.null(formula.random)) {
# remove bars
formula.explanatory = lme4::nobars(formula.explanatory)
# add to extra
formula.random = paste0("(", formula.random, ")")
}
## find and deal wit special terms
# special terms
specials = c("strata", "tt", "frailty", "cluster")
# get the special terms for event history models
time.terms = terms(formula.explanatory, specials = specials)
# list of items to drop
special.names = lapply(specials, survival:::untangle.specials, tt = time.terms)
formula.survival = unlist(purrr::map(special.names, "vars"))
to.drop = unlist(purrr::map(special.names, "terms"))
# modify main formula
if(!is.null(to.drop)) {
formula.explanatory = formula(drop.terms(time.terms, to.drop, keep.response = F), env = environment(formula))
}
## find and deal with fixed effects
# first get the model frame so we can identify fixed effect
data.frame = model.frame(formula.explanatory, data)
# loop through and identify factors or characters with more than five categories
fixed.effects = sapply(colnames(data.frame), function(x) (is.character(data.frame[[x]]) || is.factor(data.frame[[x]]) && dplyr::n_distinct(data.frame[[x]]) > 4))
## assemble the additional formulas
# add a formula additional?
if(!is.null(formula.random) || !is.null(formula.survival)) {
formula.additional = formula(paste("~", paste(c(formula.random, formula.survival), collapse = " + ")), env = environment(formula))
} else {
formula.additional = NULL
}
## assemble the full data and the full formula to test on
# get the model matrix and the test data -- this needs to go here so that we can pull the column names
data.full = as.matrix(model.matrix(update(formula.explanatory, ~ . + 0), data))
# make the full formula -- the full formula is always response ~ explanatory + added
formula.full = formula(paste(as.character(formula.response)[2], "~", paste(c(sapply(colnames(data.full), glue::backtick), formula.random, formula.survival), collapse = " + ")), env = environment(formula))
# formula.full = formula(paste(as.character(formula.response)[2], "~", paste(c(".", formula.random, formula.survival), collapse = " + ")), env = environment(formula))
# add response and additional to full data -- also converts to data_frame
data.full = dplyr::bind_cols(list(dplyr::select(data, all.vars(formula.response)), as.data.frame(data.full), dplyr::select(data, all.vars(formula.add))))
# the list that we will return
# return list
r = list(
# the formula and data for running the model
formula = formula,
data = data,
data.raw = data,
# formula components
formula.original = formula,
response = formula.response,
explanatory = formula.explanatory,
added = formula.additional,
# flags for random and fixed effects
random.effects = !is.null(formula.random),
fixed.effects = any(fixed.effects)
)
# return
return(r)
}
#' Determine the type of model to run.
#'
#' This internal function takes the parsed formula and raw data and determines the type of
#' model that should be run.
#'
#' @family model setup functions
#' @param formula.parsed A parsed formula object returned from the function 'parse_formula'.
#' @param data The raw data that the formula will be run on.
#' @return A character vector that identifies the type of model to be run based on the formula and data.
#'
determine_model = function(formula.parsed, data) {
# make character
response.char = as.character(formula.parsed$response[2])
# set model type
model.type = NULL
model.effects = NULL
# get the dependent variable
response.length = all.vars(formula.parsed$response)
# no response variable
if(length(response.length) == 1 & all(response.length == ".")) {
response.length = NULL
}
# start identifying what type of model to run
if(length(response.length) > 1) {
# time series
if(stringr::str_detect(response.char, "Surv")) {
# set model type
model.type = model.type.list$survival
# fix formula
if(!stringr::str_detect(response.char, "survival::")) {
formula.parsed$response = as.formula(paste(stringr::str_replace(as.character(formula.parsed$response[2]), stringr::fixed("Surv"), "Surv"), "~ 1"), env = .GlobalEnv) # add package reference?
}
}
else if(stringr::str_detect(response.char, "Event")) {
# set model type
model.type = model.type.list$comp.risk
# fix formula
if(!stringr::str_detect(response.char, "timereg::")) {
formula.parsed$response = as.formula(paste(stringr::str_replace(as.character(formula.parsed$response[2]), stringr::fixed("Event"), "Event"), "~ 1"), env = .GlobalEnv) # add package reference?
}
}
else {
warning("Unsuported response variable.")
}
}
else if(length(response.length) > 0) {
# check the dv
data.response = model.frame(formula.parsed$response, data)[, 1]
# check types
if(is.factor(data.response)) {
# we have a binomial, categorical, or ordered model
if(dplyr::n_distinct(data.response) > 2) {
if(is.ordered(data.response)) {
model.type = model.type.list$ordered
} else {
model.type = model.type.list$categorical
}
} else {
model.type = model.type.list$binary
}
}
else if(all(data.response >= 0) & all(data.response %% 1 == 0) & dplyr::n_distinct(data.response) > 4) {
# count data
# check for over dispersion
# first run a poisson to see if it fits well or not
m.t = suppressWarnings(glm(formula.parsed$formula, data, family = poisson))
# check if residual deviance/df is much bigger than one -- also: summary(m.t)$deviance / m.t$df.residual > 1.5
if(1 - pchisq(summary(m.t)$deviance, m.t$df.residual) < 0.05) {
# over-dispersed when accounting for covaraiates
model.type = model.type.list$neg.binomial
} else {
# not over-dispersed so run poisson
model.type = model.type.list$poisson
}
} else {
model.type = model.type.list$linear
}
}
else {
# set to null
model.type = NULL
# we have a problem!
warning("No response variables.")
}
# return the model type
return(model.type)
}
#' Determine whether a survival model needs to be run with time-varying hazards.
#'
#' This internal function takes a (survival) formula and data and determines if
#' any variables need to be run with time-varying hazards. Optionally focus only
#' on 'var' variables.
#'
#' The basic logic is to run the model conventionally and test the proportional hazards
#' assumption using 'cox.zph' in the 'survival' package. Test works with normal survival
#' models run using 'coxph' and mixed-effects models run using 'coxme'.
#'
#' If time-varying hazards are identified than the appropriate variables are modified in
#' the formula.
#'
#' @family model setup functions
#' @param formula A formula for the survival model to be run.
#' @param data The raw data that the formula will be run on.
#' @param var An optional vector indicating which variables to focus on (e.g., when running causal analysis).
#' If 'var' is not null, non-proportional hazards for remaining variables are ignored.
#' @param random.effects A logical variable indicating whether the formula has random effets. If so,
#' a mixed-effects Cox model is run.
#' @return A data frame with the list of variables (or a subset if 'var' is set) and whether the variable
#' has fails the proportional hazards assumption.
#'
identify_tve = function(formula, data, var = NULL, random.effects = F) {
# identify time varying covariates for survival model
# more info: https://cran.r-project.org/web/packages/survival/vignettes/timedep.pdf + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6015946/
# https://myweb.uiowa.edu/pbreheny/7210/f15/notes/12-1.pdf + https://arxiv.org/pdf/2002.09633.pdf
# a time varying covariate just means the beta varies with time B = B * X + B(t) * X
# when producing predictions this just means that the linear predictors are a function of time
# select data
data = dplyr::select(data, all.vars(formula))
dplyr::filter(data, complete.cases(data))
# run model -- could be useful: f.terms = terms(formula, specials = c("strata", "tt", "cluster"))
m.surv = NULL
withr::with_package("survival", try(m.surv <- if(random.effects) coxme::coxme(formula, data) else coxph(formula, data), T))
# check if we could run it
if(is.null(m.surv)) {
warning("Could not run model to check for time-varying hazards.")
return(NULL)
}
# people need to scope their functions better -- what a pain to debug this -- this might be because of our "try" but sheesh!
m.surv$call = call(if(random.effects) "coxme::coxme" else "coxph", formula = formula, data = data)
# identify time variable
time.var = as.matrix(m.surv$y)
time.var = if("time" %in% colnames(time.var)) time.var[, "time"] else time.var[, "stop"]
# check
if(is.null(time.var)) {
stop("Could not find time variable in formula.")
}
# set var to all variables in the formula if null
if(is.null(var)) {
var = all.vars(lme4:::nobars(rlang::f_rhs(formula)))
}
# identify relevant variables
t.mat = withr::with_package("survival", model.frame(formula, data))
var.name = colnames(t.mat)[colnames(t.mat) %in% var]
# check time varying
zph.surv = NULL
withr::with_package("survival", try(zph.surv <- cox.zph(m.surv, global = F), T))
# check if we could run it
if(is.null(zph.surv)) {
warning("Could not estimate relationship between residuals and time.")
return(NULL)
}
# plot(zph.surv[1], resid = F)
# # identify vars with time dependent hazards
# rownames(zph.surv$table)[zph.surv$table[, 3] < 0.05]
# identify which row our variable is
num.in.zph = which(rownames(zph.surv$table) %in% var.name)
# create a tibble with the vars and the p-values
var.tdh = tibble::tibble(var = var.name, var.formula = var.name, p.value = zph.surv$table[num.in.zph, 3])
# return
return(var.tdh)
}
#' Produce an object with model-specific functions and arguments.
#'
#' This internal function takes the model type, parsed formula, inference, and extra args and returns
#' a list with all of the specific information (e.g., extra functions) needed to run the model. The output
#' is used in the frequentist and Bayesian internal model run functions.
#'
#' @family model setup functions
#' @param model.type The type of model to run.
#' @param formula.parsed The parsed formula object.
#' @param inference Whether the model should be run using 'frequentist' or 'Bayesian' inference.
#' @param model.extra.args Extra arguments passed to the function used to run the model.
#' @param main.ivs A list of the main effects (e.g., a treatment) that the analysis is focused on. Used on
#' to identify if a survival model has time-varying hazards.
#' @return Based on the type of model to run and the statistical inference used, this function returns a list of
#' model-specific functions and flags that allow the model to be easily run.
#'
#' The functions model_run.frequentist' and 'model_run.bayesian' take the returns from this function and actually run the model.
#' For a frequentist approach the model is run once for each random sample of the main data. For the Bayesian approach
#' the model is run once for each chain.
#'
produce_model_function = function(model.type, formula.parsed, inference, model.extra.args = NULL, main.ivs = NULL) {
# function to assemble the model calls for running
# input: model type, parsed formula, extra arguments
# output: list to allow a call to model functions (run, predict, residual) and extra model libraries
# the return list -- could make it an s4 class
model.extra = list(
# main model calls
model.run = NULL,
model.args = NULL,
family = NULL,
class = NULL,
# access functions
coefs = NULL,
residuals = NULL,
# additional info
converged = NULL,
fitted = NULL,
performance = NULL,
special = NULL,
libraries = "stats",
has.tve = F,
# additional rstanarm stuff
cores = NULL,
chains = NULL,
iter = NULL,
seed = NULL
)
## set the model info list -- these are all just families for glm (negative binomial requires an extra estimation of theta)
# set base model calls -- defaults to gaussian generalized linear model
model.extra$model.run = stats::glm
model.extra$model.args = list(family = stats::gaussian(link = "identity"))
model.extra$family = model.extra$model.args$family
model.extra$class = c("frequentist", "glm")
# set functions to get data
model.extra$coefs = stats::coef
model.extra$residuals = stats::residuals
# set additional utility functions
model.extra$converged = function(x) x$boundary == F & x$converged == T
model.extra$fitted = function(x) c("mean_PPD" = mean(stats::fitted(x), na.rm = T))
model.extra$performance = function(x) c("log-posterior" = as.numeric(stats::logLik(x))) #c("log-posterior" = x$rank - (x$aic / 2))
model.extra$special = function(x) c("sigma" = stats::sigma(x))
# additions/changes for binomial
if(model.type == model.type.list$binary) {
model.extra$model.args$family = stats::binomial(link = "logit")
model.extra$family = model.extra$model.args$family
}
# additions/changes for poisson
if(model.type == model.type.list$poisson) {
model.extra$model.args$family = stats::poisson(link = "log")
model.extra$family = model.extra$model.args$family
model.extra$class = c("frequentist", "poisson", "glm")
}
# additions/changes for negative binomial
if(model.type == model.type.list$neg.binomial) {
if(inference == "frequentist") {
# get theta
theta = do.call(MASS::glm.nb, c(list(formula = formula.parsed$formula, data = formula.parsed$data, trace = 0), model.extra.args))
# set
model.extra$model.args$family = MASS::negative.binomial(theta = theta$theta, link = "log")
model.extra$family = model.extra$model.args$family
model.extra$class = c("frequentist", "negbin", "glm")
} else {
# baysian neg binomial family
model.extra$model.args$family = rstanarm::neg_binomial_2(link = "log")
model.extra$family = rstanarm::neg_binomial_2(link = "log")
}
}
# additions/changes for beta regression
if(model.type == model.type.list$beta) {
model.extra$model.run = betareg::betareg
model.extra$model.args = list(link = "logit", model = F, y = F)
model.extra$family = make.link(model.extra$model.args$link)
model.extra$class = c("frequentist", "betareg")
}
# additions/changes for ordered regression
if(model.type == model.type.list$ordered) {
model.extra$model.run = MASS::polr
model.extra$model.args = list(method = "logistic")
model.extra$family = make.link("logit")
model.extra$class = c("frequentist", "polr")
model.extra$converged = function(x) x$converged == 1
}
# additions/changes for categorical regression
if(model.type == model.type.list$categorical) {
model.extra$model.run = nnet::multinom
model.extra$model.args = list(model = F, trace = F)
model.extra$family = make.link("logit")
model.extra$class = c("frequentist", "nnet")
model.extra$converged = function(x) x$converged == 1
# Residuals for Multinomial Models: https://www.jstor.org/stable/2673571
# predictions for MNL https://cran.r-project.org/web/packages/MNLpred/vignettes/OVA_Predictions_For_MNL.html
}
# for survival
if(model.type == model.type.list$survival) {
model.extra$model.run = survival::coxph
model.extra$model.args = list(ties = "efron", y = T, x = T, model = T)
model.extra$class = c("frequentist", "survival")
model.extra$fitted = function(x) c("mean_PPD" = mean(x$y[, ncol(x$y)] - x$residuals, na.rm = T))
model.extra$performance = function(x) c("log-posterior" = x$loglik[2]) # could also use concordance: x$concordance["concordance"]
model.extra$residuals = function(x) x$residuals # martingale residuals = true events - predict(fit, type = 'expected') events
model.extra$converged = function(x) x$info["convergence"] == 0
# fast_bh -- this returns our full baseline hazard instead of a shape parameter as would occur in a stan model -- right now just assume an exponential baseline
model.extra$special = function(x) { bh = danalyze:::fast_bh(x); intercept = log(sum(bh$hazard) / sum(bh$time)); c("(Intercept)" = intercept) }
# set library
model.extra$libraries = "survival"
# we need to figure out if the main IVs have time-varying hazard
tve = identify_tve(formula = formula.parsed$formula, data = formula.parsed$data, var = main.ivs, random.effects = formula.parsed$random.effects)
# deal with tve
if(!is.null(tve)) {
# check if we have a tve problem
tve.probs = as.character(na.omit(tve$var.formula[tve$p.value < 0.05]))
# if we have a problem then modify our formula to wrap the affected variables -- works for both types of inference
if(length(tve.probs) > 0) {
# set flag
model.extra$has.tve = F
# modify formula
formula.modify = formula(paste(". ~", paste(if(inference == "bayesian") "tve(" else "tt(", tve.probs, ")", sep = "", collapse = " + "), "+ . -", paste(tve.probs, collapse = " - ")))
formula.parsed$formula = update(formula.parsed$formula, formula.modify)
}
}
}
# additions/changes for mixed effects
if(formula.parsed$random.effects & inference == "frequentist") {
# check convergence
model.extra$converged = function(x) x@optinfo$conv$opt == 0
# set class
model.extra$class = c(model.extra$class, "lmerMod")
# function changes
model.extra$coefs = lme4::fixef
# we cant do some models with random effects
if(model.type == model.type.list$linear) {
# model changes -- hassle to use both lmer and glmer since they accept different arguments
model.extra$model.run = lme4::lmer
model.extra$model.args$family = NULL
model.extra$family = stats::gaussian(link = "identity")
model.extra$model.args = c(model.extra$model.args, list(control = lme4::lmerControl(calc.derivs = F, optimizer = "nloptwrap")))
} else if(model.type == model.type.list$binary) {
# model changes
model.extra$model.run = lme4::glmer
model.extra$model.args = c(model.extra$model.args, list(control = lme4::glmerControl(calc.derivs = F, optimizer = "nloptwrap")))
} else if(model.type == model.type.list$survival) {
# model changes
model.extra$model.run = coxme::coxme
model.extra$model.args = list(ties = "efron", y = T, x = T)
# function changes -- this makes it run but you wont get anything usable -- need to find the intercept and the mean_PPD
model.extra$coefs = coxme::fixef
model.extra$converged = function(x) T
model.extra$fitted = function(x) c("mean_PPD" = NULL)
model.extra$special = function(x) { c("(Intercept)" = 0) }
# set class
model.extra$class = c("frequentist", "survival", "coxme")
} else {
stop(paste("Unable to run --", model.type, "-- with Random Effects"))
}
}
# additional stuff for bayesian models
if(inference == "bayesian") {
# set model
model.extra$model.run = rstanarm::stan_glm
# set mixed effects version
if(formula.parsed$random.effects) {
model.extra$model.run = rstanarm::stan_glmer
}
# set class
model.extra$class[1] = "bayesian"
# set priors
model.extra$model.args = NULL
model.extra$model.args$prior = rstanarm::normal(0, 2.5)
model.extra$model.args$prior_intercept = rstanarm::normal(0, 2.5)
model.extra$model.args$prior_aux = rstanarm::exponential(1)
# set multi-core stuff
model.extra$model.args$cores = as.integer(parallel::detectCores(T, T) * 0.5)
model.extra$model.args$chains = 1
model.extra$model.args$warmup = 250
model.extra$model.args$iter = 500
model.extra$model.args$seed = 24021985
# additional for neg binomial
if(model.type == model.type.list$binary) {
# baysian neg binomial family
model.extra$model.args$family = binomial(link = "logit")
model.extra$family = binomial(link = "logit")
}
# additions/changes for ordered regression
if(model.type == model.type.list$ordered) {
model.extra$model.run = rstanarm::stan_polr
model.extra$model.args$prior = rstanarm::R2(NULL)
model.extra$model.args$prior_counts = rstanarm::dirichlet(1)
}
# additions/changes for survival
if(model.type == model.type.list$survival) {
model.extra$model.run = rstanarm::stan_surv
model.extra$model.args$prior_aux = NULL
model.extra$model.args$basehaz = "ms"
}
# additional for neg binomial
if(model.type == model.type.list$neg.binomial) {
# baysian neg binomial family
model.extra$model.args$family = rstanarm::neg_binomial_2(link = "log")
model.extra$family = rstanarm::neg_binomial_2(link = "log")
}
## need to add in brms zero inflated
}
# if(model.type == model.type.list$survival && !formula.parsed$random.effects) {
# # from terry thernau
# # A good rule for Cox models is to have 10-20 events for each coefficient. When models get below 2 events/coef the
# # results can be unreliable, both numerically and biologically, and some version of a shrinkage model is called for.
#
# # set model info
# model.extra$model.run = survival::coxph
# model.extra$model.args = list(ties = "efron", y = T, x = T, model = T)
# model.extra$coefs = stats::coef
# model.extra$fitted = function(x) c("mean_PPD" = mean(x$y[, ncol(x$y)] - x$residuals, na.rm = T))
# model.extra$performance = function(x) c("log-posterior" = x$loglik[2]) # could also use concordance: x$concordance["concordance"]
# model.extra$residuals = function(x) x$residuals # martingale residuals = true events - predict(fit, type = 'expected') events
# model.extra$converged = function(x) x$info["convergence"] == 0
# model.extra$special = function(x) NULL # no shape parameter for the baseline hazard estimated
# # model.extra$libraries = "survival"
# }
# else if(model.type == model.type.list$survival && formula.parsed$random.effects) {
# # # get extras
# # .extra = as.list(as.call(model.info$response[[2]]))[-1]
# # .args = c("time", "time2", "event", "type", "origin", "cens.code")
# # .rep = if(is.null(names(.extra))) 1:length(.extra) else 1:sum(names(.extra) == "")
# # names(.extra)[.rep] = .args[!.args %in% names(.extra)][.rep]
# #
# # # set model info
# # model.extra$model.run = survival_wrap
# # model.extra$model.args = list(ties = "efron", y = T, x = T, refine.n = 0, .response.var = as.character(.extra$event), .cens.code = .extra$cens.code, .re = T)
# # model.extra$clean = clean_glm
# # model.extra$coefs = coef_survival
# # model.extra$predict = predict_survival
# # model.extra$predict.args = NULL
# # model.extra$residuals = NULL
# # model.extra$libraries = "coxme"
# }
# else {
# stop(paste("Incorrect model type --", model.info$model, "-- selected."))
# }
# add in extra model args
if(!is.null(model.extra.args)) {
if(is.null(model.extra$model.args)) {
model.extra$model.args = model.extra.args
} else {
model.extra$model.args[names(model.extra.args)] = model.extra.args
}
}
# return
return(model.extra)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.