Nothing
#' tdlmm
#' @title Treed Distributed Lag Mixture Models (Deprecated)
#' @description TDLMM is a method for estimating a Treed Distributed Lag
#' Mixture Model. It operates by building an ensemble of pairs of regression
#' trees. Each tree in a tree-pair partitions the time span of the exposure data
#' and estimates a piecewise constant distributed lag effect. The two trees
#' are then intersected to create an interaction surface for estimating the
#' interaction between two exposures. Exposures are selected for each tree
#' stochastically and each exposure or interaction has a unique shrinkage
#' variance component. This allows for exposure variable selection in addition
#' to the estimation of the distributed lag mixture model.
#'
#' @param formula object of class formula, a symbolic description of the fixed
#' effect model to be fitted, e.g. y ~ a + b
#' @param data data frame containing variables used in the formula
#' @param exposure.data named list containing equally sized numerical matrices
#' of exposure data with same, having same length as data
#' @param n.trees integer for number of trees in ensemble
#' @param n.burn integer for length of burn-in
#' @param n.iter integer for number of iterations to run model after burn-in
#' @param n.thin integer thinning factor, i.e. keep every tenth iteration
#' @param family 'gaussian' for continuous response, 'logit' for binomial
#' response with logit link, or 'zinb' for zero-inflated negative binomial with logit link
#' @param binomial.size integer type scalar (if all equal, default = 1) or
#' vector defining binomial size for 'logit' family
#' @param formula.zi object of class formula, a symbolic description of the ZI
#' model to be fitted, e.g. y ~ a + b. This only applies to ZINB where covariates for
#' ZI model is different from NB model. This is same as the main formula by default
#' @param keep_XZ FALSE (default) or TRUE: keep the model scale exposure and covariate data
#' @param mixture.interactions 'noself' (default) which estimates interactions
#' only between two different exposures, 'all' which also allows
#' interactions within the same exposure, or 'none' which eliminates all
#' interactions and estimates only main effects of each exposure
#' @param tree.params numerical vector of alpha and beta hyperparameters
#' controlling tree depth (see Bayesian CART, 1998), default: alpha = 0.95,
#' beta = 2
#' @param step.prob numerical vector for probability of 1) grow/prune,
#' 2) change, 3) switch exposure, defaults to (0.25, 0.25, 0.25) or equal
#' probability of each step for tree updates
#' @param mix.prior positive scalar hyperparameter for sparsity of exposures
#' @param shrinkage character "all" (default), "trees", "exposures", "none",
#' turns on horseshoe-like shrinkage priors for different parts of model
#' @param subset integer vector to analyze only a subset of data and exposures
#' @param verbose TRUE (default) or FALSE: print output
#' @param diagnostics TRUE or FALSE (default) keep model diagnostic such as
#' terminal nodes, acceptance details, etc.
#' @param initial.params initial parameters for fixed effects model, FALSE = none (default), "glm" = generate using GLM, or user defined, length must equal number of parameters in fixed effects model
#' @param ... NA
#'
#' @details Model is recommended to be run for at minimum 5000 burn-in
#' iterations followed by 15000 sampling iterations with a thinning factor of 5.
#' Convergence can be checked by re-running the model and validating consistency
#' of results.
#'
#' @returns object of class 'tdlmm'
#'
tdlmm <- function(formula,
data,
exposure.data,
n.trees = 20,
n.burn = 2000,
n.iter = 5000,
n.thin = 5,
family = "gaussian",
binomial.size = 1,
formula.zi = NULL,
keep_XZ = FALSE,
mixture.interactions = "noself",
tree.params = c(.95, 2),
step.prob = c(.25, .25, .25),
mix.prior = 1,
shrinkage = "exposures",
subset = NULL,
verbose = TRUE,
diagnostics = FALSE,
initial.params = NULL,
...)
{
message("Execution halt: 'tdlmm.R' has been deprecated and will be removed in the future. Please use 'dlmtree.fit' with arguments: dlm.type = `linear` and mixture = TRUE, instead. For an example code, use ?dlmtree.fit.")
# model <- list()
# options(stringsAsFactors = F)
# # ---- Check user inputs ----
# # number of iterations [Control for MCMC iteration vs thinning]
# if (n.iter < n.thin * 10)
# stop("after thinning you will be left with less than 10 MCMC samples,
# increase the number of iterations!")
# # data for formula [data frame check]
# if (!is.data.frame(data))
# stop("`data` must be a data.frame")
# # exposure data
# if (!is.list(exposure.data))
# stop("`exposure.data` must be a list of named exposures")
# model$nExp <- length(exposure.data)
# model$expNames <- names(exposure.data)
# # [Exposure name check]
# if (is.null(model$expNames) || length(unique(model$expNames)) != model$nExp || any(model$expNames == ""))
# stop("`exposure.data` must be a named list with unique, non-empty names")
# model$pExp <- ncol(exposure.data[[1]])
# # Sanity check for each exposure
# for (i in 1:length(exposure.data)) {
# # [Not numeric values for exposure data]
# if (!is.numeric(exposure.data[[i]]))
# stop("each exposure in list `exposure.data` must be a numeric matrix")
# # [Number of observations do not match]
# if (nrow(data) != nrow(exposure.data[[i]]))
# stop("`data` and `exposure.data` must have same number of rows")
# # [Exposure total time lag does not match with the others]
# if (ncol(exposure.data[[i]]) != model$pExp)
# stop("each exposure in `exposure.data` must have the same
# number of time observations")
# # [NA/Missing values in exposure data]
# if (any(is.na(exposure.data[[i]]))) # any(): Is there at least one which meets the condition?
# stop("missing values in exposure data")
# }
# # [iteration control] Check if all MCMC parameters are positive integers
# if (all(sapply(list(n.trees, n.burn, n.iter, n.thin),
# function(i) is.integer(i) & i > 0)))
# stop("n.* must be integer and > 0")
# # [response type] Model much be Gaussian / Logistic / Zero-inflated negative binomial
# if (!(family %in% c("gaussian", "logit", "zinb")))
# stop("`family` must be one of `gaussian`, `logit`, or 'zinb'")
# # [binomial size] Only for binomial
# model$binomial <- 0
# model$binomialSize <- rep(0, nrow(data))
# # If binomial is called, update the parameters(binomial, binomialSize) above.
# if (family == "logit") {
# # If binary, update binomialSize to a vector of ones.
# if (length(binomial.size) == 1)
# binomial.size <- rep(binomial.size, nrow(data))
# # binomialSize vector must be the same length as the sample size.
# if (length(binomial.size) != nrow(data))
# stop("`binomial.size` must be positive integer and same length as data")
# # Update the parameters
# model$binomialSize <- force(binomial.size)
# model$binomial <- 1
# }
# # ZINB flag
# model$zinb <- 0
# if (family == "zinb") {
# model$zinb <- 1
# model$sigma2 <- 1
# }
# # [Mixture interactions]
# # [Correct options for interaction parameter]
# if (!(mixture.interactions %in% c("noself", "all", "none")))
# stop("`mixture.interactions must be one of `noself`, `all`, `none`")
# if (mixture.interactions %in% c("marginal", "none")) {
# model$interaction <- 0
# model$nMix <- 0
# } else if (mixture.interactions == "noself") {
# model$interaction <- 1
# model$nMix <- model$nExp * (model$nExp - 1) / 2
# } else {
# model$interaction <- 2
# model$nMix <- model$nExp * (model$nExp + 1) / 2
# }
# # [tree parameters]
# # Tree param default: alpha, beta = c(.95, 2)
# if (length(tree.params) != 2) {
# stop("tree.params must have length 2")
# } else {
# # [Sanity check on alpha & beta range]
# if (tree.params[1] > 1 | tree.params[1] < 0)
# stop("tree.params[1] must be between 0 and 1")
# if (tree.params[2] < 0)
# stop("tree.params[2] must be greater than 0")
# }
# # step probabilities
# # 1) grow/prune, 2) change, 3) switch exposure, defaults to (0.25, 0.25, 0.25)
# if (any(step.prob < 0) || any(step.prob > 1) || length(step.prob) != 3)
# stop("must specify three step.prob components between zero and one")
# # ---- [Model control arguments] ----
# model$nTrees <- force(n.trees)
# model$nBurn <- force(n.burn)
# model$nIter <- force(n.iter)
# model$nThin <- force(n.thin)
# model$mcmcIter <- force(floor(n.iter / n.thin))
# model$family <- force(family)
# model$verbose <- force(verbose)
# model$diagnostics <- force(diagnostics)
# model$treePriorTDLM <- force(tree.params)
# # Step probabilities
# model$stepProb <- force(c(step.prob[1], step.prob[1], step.prob[2], step.prob[3]))
# model$stepProbTDLM <- force(model$stepProb / sum(model$stepProb))
# model$mixPrior <- ifelse(mix.prior == "auto", -1, mix.prior)
# model$shrinkage <- ifelse(shrinkage == "all", 3,
# ifelse(shrinkage == "trees", 2,
# ifelse(shrinkage == "exposures", 1, 0)))
# if (model$verbose)
# cat("Preparing data...\n")
# # ---- [Create data subset] ----
# if (!is.null(subset)) {
# if (length(subset) > 1 & is.integer(subset) &
# all(subset > 0) & all(subset <= nrow(data))) {
# data <- data[subset,]
# exposure.data <- lapply(exposure.data, function(i) i[subset,])
# if (model$family == "logit")
# model$binomialSize <- model$binomialSize[subset]
# } else {
# stop("invalid subset, must be integers within range of data length")
# }
# }
# # ---- [Setup control and response variables] ----
# if (is.null(formula.zi)) {
# formula.zi <- formula
# }
# model$formula <- force(as.formula(formula))
# model$formula.zi <- force(as.formula(formula.zi))
# tf <- terms.formula(model$formula, data = data)
# tf.zi <- terms.formula(model$formula.zi, data = data)
# # Sanity check for response
# if (!attr(tf, "response") & !attr(tf.zi, "response"))
# stop("no valid response in formula")
# # Save if intercept is included in the model
# model$intercept <- force(ifelse(attr(tf, "intercept"), TRUE, FALSE))
# model$intercept.zi <- force(ifelse(attr(tf.zi, "intercept"), TRUE, FALSE))
# # Sanity check for variables and the names
# if (length(which(attr(tf, "term.labels") %in% colnames(data))) == 0 &
# length(which(attr(tf.zi, "term.labels") %in% colnames(data))) == 0 &
# !model$intercept & !model$intercept.zi)
# stop("no valid variables in formula")
# # ---- [Exposure splits] ----
# model$splitProb <- as.double(c())
# model$timeProb <- force(rep(1 / (model$pExp - 1), model$pExp - 1))
# # Exposure data organize
# model$X <- list()
# for (i in 1:model$nExp) {
# model$X[[i]] <- force(list(Xscale = sd(exposure.data[[i]]),
# X = exposure.data[[i]]))
# model$X[[i]]$X <- force(model$X[[i]]$X / model$X[[i]]$Xscale)
# model$X[[i]]$Xrange <- force(range(model$X[[i]]$X))
# model$X[[i]]$Xquant <- force(quantile(model$X[[i]]$X, 0:100/100) * model$X[[i]]$Xscale)
# model$X[[i]]$intX <- force(mean(model$X[[i]]$X))
# model$X[[i]]$Tcalc <-
# force(sapply(1:ncol(model$X[[i]]$X), function(j) {
# rowSums(model$X[[i]]$X[, 1:j, drop = F]) }))
# }
# names(model$X) <- model$expNames
# model$expProb <- force(rep(1/length(model$X), length(model$X)))
# # ---- [Scale data and setup exposures] ----
# data <- droplevels(data)
# mf <- model.frame(model$formula, data = data,
# drop.unused.levels = TRUE,
# na.action = NULL)
# mf.zi <- model.frame(model$formula.zi, data = data,
# drop.unused.levels = TRUE,
# na.action = NULL)
# if (any(is.na(mf)) & any(is.na(mf.zi)))
# stop("missing values in model data, use `complete.cases()` to subset data")
# # Dan's ---
# # model$Y <- force(model.response(mf))
# # model$Z <- force(model.matrix(model$formula, data = mf))
# # # QR <- qr(crossprod(model$Z))
# # # model$Z <- model$Z[,sort(QR$pivot[seq_len(QR$rank)])]
# # model$Znames <- colnames(model$Z)#[sort(QR$pivot[seq_len(QR$rank)])]
# # model$droppedCovar <- c()#colnames(model$Z)[QR$pivot[-seq_len(QR$rank)]]
# # # model$Z <- matrix(model$Z[,sort(QR$pivot[seq_len(QR$rank)])], nrow(model$Z), QR$rank)
# # # model$Z <- force(scaleModelMatrix(model$Z))
# # # rm(QR)
# # Organize response variable & fixed effect variable
# # Main covariates
# model$Y <- force(model.response(mf))
# model$Z <- force(model.matrix(model$formula, data = mf))
# QR <- qr(crossprod(model$Z))
# model$Z <- model$Z[, sort(QR$pivot[seq_len(QR$rank)])]
# model$Znames <- colnames(model$Z)[sort(QR$pivot[seq_len(QR$rank)])]
# model$droppedCovar <- colnames(model$Z)[QR$pivot[-seq_len(QR$rank)]]
# model$Z <- force(scaleModelMatrix(model$Z))
# rm(QR)
# # covariates for ZI model
# model$Z.zi <- force(model.matrix(model$formula.zi, data = mf.zi))
# QR.zi <- qr(crossprod(model$Z.zi))
# model$Z.zi <- model$Z.zi[,sort(QR.zi$pivot[seq_len(QR.zi$rank)])]
# model$droppedCovar.zi <- colnames(model$Z.zi)[QR.zi$pivot[-seq_len(QR.zi$rank)]]
# model$Z.zi <- force(scaleModelMatrix(model$Z.zi))
# rm(QR.zi)
# # Organize for different models
# # Gaussian
# if (model$family == "gaussian") {
# model$Ymean <- sum(range(model$Y)) / 2
# model$Yscale <- diff(range(model$Y - model$Ymean))
# model$Y <- force((model$Y - model$Ymean) / model$Yscale)
# } else { # For both logistic and ZINB,
# model$Yscale <- 1
# model$Ymean <- 0
# model$Y <- force(scale(model$Y, center = 0, scale = 1))
# }
# # Store the processed values
# model$Y <- force(c(model$Y))
# model$Zscale <- attr(model$Z, "scaled:scale")
# model$Zmean <- attr(model$Z, "scaled:center")
# model$Znames <- colnames(model$Z)
# model$Z <- force(matrix(model$Z, nrow(model$Z), ncol(model$Z)))
# model$Zscale.zi <- attr(model$Z.zi, "scaled:scale")
# model$Zmean.zi <- attr(model$Z.zi, "scaled:center")
# model$Znames.zi <- colnames(model$Z.zi)
# model$Z.zi <- force(matrix(model$Z.zi, nrow(model$Z.zi), ncol(model$Z.zi)))
# model$initParams <- rep(0, ncol(model$Z))
# if (!is.null(initial.params)) {
# names(model$initParams) <- colnames(model$Z)
# if (sum(names(initial.params) %in% colnames(model$Z)) > 0) {
# na <- names(initial.params[ # get matching names
# which(names(initial.params) %in% colnames(model$Z))])
# model$initParams[na] <- initial.params[na]
# }
# }
# # ---- Run model ----
# out <- tdlmm_Cpp(model)
# if (verbose)
# cat("\nCompiling results...\n")
# for (n in names(out))
# model[[n]] <- out[[n]]
# # ---- Prepare output ----
# model$Y <- model$Y * model$Yscale + model$Ymean
# model$fhat <- model$fhat * model$Yscale
# model$sigma2 <- model$sigma2 * (model$Yscale^2)
# # rescale fixed effect estimates
# # Gaussian & Logistic fixed effect
# model$gamma <- sapply(1:ncol(model$gamma), function(i) {
# model$gamma[,i] * model$Yscale / model$Zscale[i] })
# # ZINB fixed effect (binary)
# model$b1 <- sapply(1:ncol(model$b1), function(i) {
# model$b1[,i] * model$Yscale / model$Zscale.zi[i] })
# # ZINB fixed effect (NegBin)
# model$b2 <- sapply(1:ncol(model$b2), function(i) {
# model$b2[,i] * model$Yscale / model$Zscale[i] })
# # If intercept:
# if (model$intercept) {
# model$gamma[,1] <- model$gamma[,1] + model$Ymean
# if (ncol(model$Z) > 1)
# model$gamma[,1] <- model$gamma[,1] - model$gamma[,-1,drop=FALSE] %*% model$Zmean[-1]
# model$b2[,1] <- model$b2[,1] + model$Ymean
# if (ncol(model$Z) > 1)
# model$b2[,1] <- model$b2[,1] - model$b2[,-1,drop=FALSE] %*% model$Zmean[-1]
# }
# if (model$intercept.zi) {
# model$b1[,1] <- model$b1[,1] + model$Ymean
# if (ncol(model$Z.zi) > 1)
# model$b1[,1] <- model$b1[,1] - model$b1[,-1,drop=FALSE] %*% model$Zmean.zi[-1]
# }
# colnames(model$gamma) <- model$Znames
# colnames(model$b1) <- model$Znames.zi
# colnames(model$b2) <- model$Znames
# # rescale DLM and Mixture estimates
# model$DLM <- as.data.frame(model$DLM)
# colnames(model$DLM) <- c("Iter", "Tree", "TreePair", "exp", "tmin", "tmax",
# "est", "kappa")
# model$MIX <- as.data.frame(model$MIX)
# colnames(model$MIX) <- c("Iter", "Tree", "exp1", "tmin1", "tmax1",
# "exp2", "tmin2", "tmax2", "est", "kappa")
# model$mixNames <- c()
# for (i in 1:length(model$X)) {
# idx <- which(model$DLM$exp == (i - 1)) # Converting indices from cpp to R
# if (length(idx) > 0)
# model$DLM$est[idx] <- model$DLM$est[idx] * model$Yscale / model$X[[i]]$Xscale
# for (j in i:length(model$X)) {
# if ((model$interaction > 1) || (j > i))
# model$mixNames <- c(model$mixNames,
# paste0(model$expNames[i], "-", model$expNames[j]))
# idx <- which(model$MIX$exp1 == (i - 1) & model$MIX$exp2 == (j - 1))
# if (length(idx) > 0) {
# model$MIX$est[idx] <- model$MIX$est[idx] * model$Yscale / (model$X[[i]]$Xscale * model$X[[j]]$Xscale)
# }
# }
# }
# # Exposure names
# colnames(model$expProb) <- colnames(model$expCount) <-
# colnames(model$expInf) <- colnames(model$muExp) <- model$expNames
# if (model$interaction > 0) {
# colnames(model$mixInf) <- colnames(model$muMix) <-
# colnames(model$mixCount) <- model$mixNames
# }
# if (model$diagnostics) {
# model$treeAccept <- as.data.frame(model$treeAccept)
# colnames(model$treeAccept) <- c("tree", "step", "success", "exp", "term", "treeMhr", "mhr")
# model$treeAccept$step <- factor(model$treeAccept$step,
# levels = 0:3,
# labels = c("Grow", "Prune", "Change",
# "SwitchExposure"))
# model$treeAccept$exp <- factor(model$treeAccept$exp + 1,
# levels = 1:model$nExp,
# labels = model$expNames)
# }
# # Remove model and exposure data
# if (!keep_XZ) {
# for (i in 1:length(model$X))
# model$X[[i]]$X <- model$X[[i]]$Tcalc <- NULL
# model$Z <- NULL
# model$Z.zi <- NULL
# }
# # Change env to list
# model.out <- lapply(names(model), function(i) model[[i]])
# names(model.out) <- names(model)
# rm(model)
# gc()
model.out <- NULL
#class(model.out) <- "tdlmm"
return(model.out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.