msm_fit <- function(f,
qdata,
intvals,
expnms,
rr=TRUE,
main=TRUE,
degree=1,
id=NULL,
weights,
bayes=FALSE,
MCsize=nrow(qdata), hasintercept=TRUE, ...){
#' @title Fitting marginal structural model (MSM) within quantile g-computation
#' @description This is an internal function called by \code{\link[qgcomp]{qgcomp}},
#' \code{\link[qgcomp]{qgcomp.glm.boot}}, and \code{\link[qgcomp]{qgcomp.glm.noboot}},
#' but is documented here for clarity. Generally, users will not need to call
#' this function directly.
#' @details This function first computes expected outcomes under hypothetical
#' interventions to simultaneously set all exposures to a specific quantile. These
#' predictions are based on g-computation, where the exposures are `quantized',
#' meaning that they take on ordered integer values according to their ranks,
#' and the integer values are determined by the number of quantile cutpoints used.
#' The function then takes these expected outcomes and fits an additional model
#' (a marginal structural model) with the expected outcomes as the outcome and
#' the intervention value of the exposures (the quantile integer) as the exposure.
#' Under causal identification assumptions and correct model specification,
#' the MSM yields a causal exposure-response representing the incremental
#' change in the expected outcome given a joint intervention on all exposures.
#' @param f an r formula representing the conditional model for the outcome, given all
#' exposures and covariates. Interaction terms that include exposure variables
#' should be represented via the \code{\link[base]{AsIs}} function
#' @param qdata a data frame with quantized exposures
#' @param expnms a character vector with the names of the columns in qdata that represent
#' the exposures of interest (main terms only!)
#' @param intvals sequence, the sequence of integer values that the joint exposure
#' is 'set' to for estimating the msm. For quantile g-computation, this is just
#' 0:(q-1), where q is the number of quantiles of exposure.
#' @param rr logical, estimate log(risk ratio) (family='binomial' only)
#' @param main logical, internal use: produce estimates of exposure effect (psi)
#' and expected outcomes under g-computation and the MSM
#' @param degree polynomial bases for marginal model (e.g. degree = 2
#' allows that the relationship between the whole exposure mixture and the outcome
#' is quadratic. Default=1)
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster)
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param MCsize integer: sample size for simulation to approximate marginal
#' zero inflated model parameters. This can be left small for testing, but should be as large
#' as needed to reduce simulation error to an acceptable magnitude (can compare psi coefficients for
#' linear fits with qgcomp.zi.noboot to gain some intuition for the level of expected simulation
#' error at a given value of MCsize)
#' @param hasintercept (logical) does the model have an intercept?
#' @param ... arguments to glm (e.g. family)
#' @seealso \code{\link[qgcomp]{qgcomp.glm.boot}}, and \code{\link[qgcomp]{qgcomp}}
#' @concept variance mixtures
#' @import stats arm
#' @export
#' @examples
#' set.seed(50)
#' dat <- data.frame(y=runif(200), x1=runif(200), x2=runif(200), z=runif(200))
#' X <- c('x1', 'x2')
#' qdat <- quantize(dat, X, q=4)$data
#' mod <- msm_fit(f=y ~ z + x1 + x2 + I(x1*x2),
#' expnms = c('x1', 'x2'), qdata=qdat, intvals=1:4, bayes=FALSE)
#' summary(mod$fit) # outcome regression model
#' summary(mod$msmfit) # msm fit (variance not valid - must be obtained via bootstrap)
# expand the formula against the data (resolves terms, records attributes)
newform <- terms(f, data = qdata)
nobs = nrow(qdata)
# Rebuild the current call as a stats::model.frame call (the same device used
# inside stats::glm) so that the 'weights' argument is evaluated in the
# caller's frame/data rather than inside this function's environment.
thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("qdata", "data", names(thecall))
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
# attach case weights as a column of qdata (unit weights when none supplied),
# so downstream glm calls can resolve weights=weights from the data
if(hasweights){
qdata$weights <- as.vector(model.weights(thecalle))
} else qdata$weights = rep(1, nobs)
# default id: each row is treated as its own cluster
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
# conditional outcome regression fit
nidx = which(!(names(qdata) %in% id))
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
# NOTE(review): this branch fits the raw formula 'f' (not 'newform') and
# drops the id column from the data, unlike the glm branch above and unlike
# the bayesglm call in qgcomp.glm.noboot -- confirm this is intentional.
fit <- bayesglm(f, data = qdata[,nidx,drop=FALSE],
weights=weights,
...)
}
# risk-ratio estimation is only meaningful for binomial-type outcomes
if(fit$family$family %in% c("gaussian", "poisson")) rr=FALSE
###
# get predictions (set exposure to 0,1,...,q-1)
if(is.null(intvals)){
# default intervention values: one per observed level of the first exposure
intvals <- (seq_len(length(table(qdata[expnms[1]])))) - 1
}
# g-computation step: predict the outcome with ALL exposures set to 'idx'
predit <- function(idx, newdata){
#newdata <- qdata
newdata[,expnms] <- idx
suppressWarnings(predict(fit, newdata=newdata, type='response'))
}
if(MCsize==nrow(qdata)){
newdata <- qdata
}else{
# Monte Carlo approximation: resample clusters with replacement up to MCsize rows
newids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), MCsize,
#probs=weights, #bootstrap sampling with weights works with fixed weights, but not time-varying weights
replace = TRUE
)))
names(newids) <- id
newdata <- merge(qdata,newids, by=id, all.x=FALSE, all.y=TRUE)[seq_len(MCsize),]
}
# list of predicted-outcome vectors, one element per intervention value
predmat = lapply(intvals, predit, newdata=newdata)
# fit MSM using g-computation estimates of expected outcomes under joint
# intervention
#nobs <- dim(qdata)[1]
# stack predictions into long format: one row per (individual, intervention value)
msmdat <- data.frame(
cbind(
Ya = unlist(predmat),
psi = rep(intvals, each=MCsize),
weights = rep(newdata$weights, times=length(intvals))
#times=length(table(qdata[expnms[1]])))
)
)
# to do: allow functional form variations for the MSM via specifying the model formula
# MSM formula: polynomial in the joint-intervention level, with or without intercept
msmforms = paste0("Ya ~ ",
ifelse(hasintercept, "1 +", "-1 +"),
"poly(psi, degree=",degree,", raw=TRUE)"
)
msmform = as.formula(msmforms)
# MSM fit. With rr=TRUE a log-link binomial model is used (log risk ratio);
# small negative starting values help the log-binomial model converge.
# NOTE(review): start=rep(-0.0001, degree+1) assumes an intercept is present;
# with hasintercept=FALSE the start vector may have the wrong length -- verify.
if(bayes){
if(!rr) suppressWarnings(msmfit <- bayesglm(msmform, data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- bayesglm(msmform, data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
if(!bayes){
if(!rr) suppressWarnings(msmfit <- glm(msmform, data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- glm(msmform, data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
# return both the conditional fit and the MSM fit (MSM variance is not valid
# on its own - callers obtain valid variance via bootstrap)
res <- list(fit=fit, msmfit=msmfit)
if(main) {
res$Ya <- msmdat$Ya # expected outcome under joint exposure, by gcomp
res$Yamsm <- predict(msmfit, type='response')
res$Yamsml <- predict(msmfit, type="link")
res$A <- msmdat$psi # joint exposure (0 = all exposures set category with
# upper cut-point as first quantile)
}
res
}
# msm.fit: dot-case alias of msm_fit (presumably retained for backward compatibility)
msm.fit <- msm_fit
qgcomp.glm.noboot <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
bayes=FALSE,
...){
#' @title Quantile g-computation for continuous, binary, and count outcomes under linearity/additivity
#' @aliases qgcomp.noboot
#' @description This function estimates a linear dose-response parameter representing a one quantile
#' increase in a set of exposures of interest. This function is limited to linear and additive
#' effects of individual components of the exposure. This model estimates the parameters of a marginal
#' structural model (MSM) based on g-computation with quantized exposures. Note: this function is
#' valid only under linear and additive effects of individual components of the exposure, but when
#' these hold the model can be fit with very little computational burden.
#' qgcomp.noboot is an equivalent function (slated for deprecation)
#'
#' @details For continuous outcomes, under a linear model with no
#' interaction terms, this is equivalent to g-computation of the effect of
#' increasing every exposure by 1 quantile. For binary/count outcomes
#' outcomes, this yields a conditional log odds/rate ratio(s) representing the
#' change in the expected conditional odds/rate (conditional on covariates)
#' from increasing every exposure by 1 quantile. In general, the latter
#' quantity is not equivalent to g-computation estimates. Hypothesis test
#' statistics and confidence intervals are based on using the delta
#' estimate variance of a linear combination of random variables.
#'
#' @param f R style formula
#' @param data data frame
#' @param expnms character vector of exposures of interest
#' @param q NULL or number of quantiles used to create quantile indicator variables
#' representing the exposure variables. If NULL, then gcomp proceeds with un-transformed
#' version of exposures in the input datasets (useful if data are already transformed,
#' or for performing standard g-computation)
#' @param breaks (optional) NULL, or a list of (equal length) numeric vectors that
#' characterize the minimum value of each category for which to
#' break up the variables named in expnms. This is an alternative to using 'q'
#' to define cutpoints.
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster). Note that qgcomp.glm.noboot will not produce cluster-appropriate
#' standard errors (this parameter is essentially ignored in qgcomp.glm.noboot).
#' qgcomp.glm.boot can be used for this, which will use bootstrap
#' sampling of clusters/individuals to estimate cluster-appropriate standard
#' errors via bootstrapping.
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param alpha alpha level for confidence limit calculation
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param ... arguments to glm (e.g. family)
#' @family qgcomp_methods
#' @return a qgcompfit object, which contains information about the effect
#' measure of interest (psi) and associated variance (var.psi), as well
#' as information on the model fit (fit) and information on the
#' weights/standardized coefficients in the positive (pos.weights) and
#' negative (neg.weights) directions.
#' @concept variance mixtures
#' @import stats arm
#' @export
#' @examples
#' set.seed(50)
#' # linear model
#' dat <- data.frame(y=runif(50,-1,1), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=gaussian())
#' # no-intercept model
#' qgcomp.glm.noboot(f=y ~-1+ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=gaussian())
#' # logistic model
#' dat2 <- data.frame(y=rbinom(50, 1,0.5), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat2, q=2, family=binomial())
#' # poisson model
#' dat3 <- data.frame(y=rpois(50, .5), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat3, q=2, family=poisson())
#' # weighted model
#' N=5000
#' dat4 <- data.frame(y=runif(N), x1=runif(N), x2=runif(N), z=runif(N))
#' dat4$w=runif(N)*2
#' qdata = quantize(dat4, expnms = c("x1", "x2"))$data
#' (qgcfit <- qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4,
#' family=gaussian(), weights=w))
#' qgcfit$fit
#' glm(y ~ z + x1 + x2, data = qdata, weights=w)
# expand formula against the data; record whether an intercept term is present
newform <- terms(f, data = data)
hasintercept = as.logical(attr(newform, "intercept"))
nobs = nrow(data)
# Rebuild the current call as a stats::model.frame call (as stats::glm does)
# so the 'weights' argument is evaluated in the caller's frame/data.
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
#m <- match(c("f", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame()) # a model frame pulled in from the environment in which the function was called
# attach case weights as a column of data (unit weights if none supplied)
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
# default: treat every model term as an exposure of interest
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
# this estimator is only valid under linear/additive exposure terms
lin = checknames(expnms)
if(!lin) stop("Model appears to be non-linear: use qgcomp.glm.boot instead")
# quantize exposures unless both q and breaks are NULL (pre-transformed data)
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
} else{
qdata <- data
br <- breaks
}
if(is.null(id)) {
# not yet implemented
id = "id__"
qdata$id__ = seq_len(dim(qdata)[1])
}
# conditional outcome regression on the quantized exposures
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(newform, data = qdata,
weights=weights,
...)
}
mod <- summary(fit)
# aliased (dropped) exposure coefficients would make the psi sum undefined
if(length(setdiff(expnms, rownames(mod$coefficients)))>0){
stop("Model aliasing occurred, likely due to perfectly correlated quantized exposures.
Try one of the following:
1) set 'bayes' to TRUE in the qgcomp function (recommended)
2) set 'q' to a higher value in the qgcomp function (recommended)
3) check correlation matrix of exposures, and drop all but one variable in each highly correlated set (not recommended)
")
}
# psi: sum of the exposure main-effect coefficients; its standard error is
# that of a linear combination of coefficients (delta method, via se_comb)
estb <- sum(mod$coefficients[expnms,1, drop=TRUE])
seb <- se_comb(expnms, covmat = mod$cov.scaled)
if(hasintercept){
estb <- c(fit$coefficients[1], estb)
seb <- c(sqrt(mod$cov.scaled[1,1]), seb)
}
# Wald-type test statistics, p-values (t and z versions), and confidence limits
tstat <- estb / seb
df <- mod$df.null - length(expnms)
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# 'weights'
# each exposure's proportional contribution to the positive (negative)
# partial effect sums
wcoef <- fit$coefficients[expnms]
names(wcoef) <- gsub("_q", "", names(wcoef))
pos.coef <- which(wcoef > 0)
neg.coef <- which(wcoef <= 0)
pos.weights <- abs(wcoef[pos.coef]) / sum(abs(wcoef[pos.coef]))
neg.weights <- abs(wcoef[neg.coef]) / sum(abs(wcoef[neg.coef]))
pos.psi <- sum(wcoef[pos.coef])
neg.psi <- sum(wcoef[neg.coef])
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
# label coefficients: psi1 follows the intercept (when one is present)
psiidx = 1+hasintercept
names(estb)[psiidx] <- "psi1"
cnms = "psi1"
if(hasintercept){
covmat.coef=vc_comb(aname="(Intercept)", expnms=expnms, covmat = mod$cov.scaled)
cnms = c("(intercept)", cnms)
}
if(!hasintercept)
covmat.coef=as.matrix(seb^2,nrow=1,ncol=1)
colnames(covmat.coef) <- rownames(covmat.coef) <- names(estb) <- cnms
# assemble the qgcompfit result object
res <- .qgcomp_object(
qx = qx, fit = fit,
psi = estb[psiidx],
var.psi = seb[psiidx] ^ 2,
covmat.psi=c('psi1' = seb[psiidx]^2),
ci = ci[psiidx,],
coef = estb, var.coef = seb ^ 2,
covmat.coef=covmat.coef,
ci.coef = ci[,],
expnms=expnms, q=q, breaks=br,
pos.psi = pos.psi, neg.psi = neg.psi,
pos.weights = sort(pos.weights, decreasing = TRUE),
neg.weights = sort(neg.weights, decreasing = TRUE),
pos.size = sum(abs(wcoef[pos.coef])),
neg.size = sum(abs(wcoef[neg.coef])),
alpha=alpha,
call=origcall,
hasintercept=hasintercept
)
# gaussian fits report t-statistics; binomial/poisson fits report z-statistics
if(fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
res
}
qgcomp.glm.boot <- function(
f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
B=200,
rr=TRUE,
degree=1,
seed=NULL,
bayes=FALSE,
MCsize=nrow(data),
parallel=FALSE,
parplan = FALSE,
...
){
#' @title Quantile g-computation for continuous and binary outcomes
#' @aliases qgcomp.boot
#'
#' @description This function estimates a dose-response parameter representing a one quantile
#' increase in a set of exposures of interest. This model estimates the parameters of a marginal
#' structural model (MSM) based on g-computation with quantized exposures. Note: this function
#' allows non-linear and non-additive effects of individual components of the exposure, as well as
#' non-linear joint effects of the mixture via polynomial basis functions, which increase the
#' computational burden due to the need for non-parametric bootstrapping.
#' qgcomp.boot is an equivalent function (slated for deprecation)
#'
#' @details Estimates correspond to the average expected change in the
#' (log) outcome per quantile increase in the joint exposure to all exposures
#' in `expnms'. Test statistics and confidence intervals are based on
#' a non-parametric bootstrap, using the standard deviation of the bootstrap
#' estimates to estimate the standard error. The bootstrap standard error is
#' then used to estimate Wald-type confidence intervals. Note that no bootstrapping
#' is done on estimated quantiles of exposure, so these are treated as fixed
#' quantities
#'
#' @param f R style formula
#' @param data data frame
#' @param expnms character vector of exposures of interest
#' @param q NULL or number of quantiles used to create quantile indicator variables
#' representing the exposure variables. If NULL, then gcomp proceeds with un-transformed
#' version of exposures in the input datasets (useful if data are already transformed,
#' or for performing standard g-computation)
#' @param breaks (optional) NULL, or a list of (equal length) numeric vectors that
#' characterize the minimum value of each category for which to
#' break up the variables named in expnms. This is an alternative to using 'q'
#' to define cutpoints.
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster). Note that qgcomp.glm.noboot will not produce cluster-appropriate
#' standard errors. qgcomp.glm.boot can be used for this, which will use bootstrap
#' sampling of clusters/individuals to estimate cluster-appropriate standard
#' errors via bootstrapping.
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param alpha alpha level for confidence limit calculation
#' @param B integer: number of bootstrap iterations (this should typically be >=200,
#' though it is set lower in examples to improve run-time).
#' @param rr logical: if using binary outcome and rr=TRUE, qgcomp.glm.boot will
#' estimate risk ratio rather than odds ratio
#' @param degree polynomial bases for marginal model (e.g. degree = 2
#' allows that the relationship between the whole exposure mixture and the outcome
#' is quadratic (default = 1).
#' @param seed integer or NULL: random number seed for replicable bootstrap results
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param MCsize integer: sample size for simulation to approximate marginal
#' zero inflated model parameters. This can be left small for testing, but should be as large
#' as needed to reduce simulation error to an acceptable magnitude (can compare psi coefficients for
#' linear fits with qgcomp.glm.noboot to gain some intuition for the level of expected simulation
#' error at a given value of MCsize). This likely won't matter much in linear models, but may
#' be important with binary or count outcomes.
#' @param parallel use (safe) parallel processing from the future and future.apply packages
#' @param parplan (logical, default=FALSE) automatically set future::plan to plan(multisession) (and set to existing plan, if any, after bootstrapping)
#' @param ... arguments to glm (e.g. family)
#' @family qgcomp_methods
#' @return a qgcompfit object, which contains information about the effect
#' measure of interest (psi) and associated variance (var.psi), as well
#' as information on the model fit (fit) and information on the
#' marginal structural model (msmfit) used to estimate the final effect
#' estimates.
#' @concept variance mixtures
#' @import stats
#' @export
#' @examples
#' set.seed(30)
#' # continuous outcome
#' dat <- data.frame(y=rnorm(100), x1=runif(100), x2=runif(100), z=runif(100))
#' # Conditional linear slope
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4, family=gaussian())
#' # Marginal linear slope (population average slope, for a purely linear,
#' # additive model this will equal the conditional)
#' \dontrun{
#' qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#' # no intercept model
#' qgcomp.glm.boot(f=y ~ -1+z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#'
#' # Note that these give different answers! In the first, the estimate is conditional on Z,
#' # but in the second, Z is marginalized over via standardization. The estimates
#' # can be made approximately the same by centering Z (for linear models), but
#' # the conditional estimate will typically have lower standard errors.
#' dat$z = dat$z - mean(dat$z)
#'
#' # Conditional linear slope
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4, family=gaussian())
#' # Marginal linear slope (population average slope, for a purely linear,
#' # additive model this will equal the conditional)
#'
#' qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#'
#' # Population average mixture slope which accounts for non-linearity and interactions
#' qgcomp.glm.boot(y ~ z + x1 + x2 + I(x1^2) + I(x2*x1), family="gaussian",
#' expnms = c('x1', 'x2'), data=dat, q=4, B=200)
#'
#' # generally non-linear/non-additive underlying models lead to non-linear mixture slopes
#' qgcomp.glm.boot(y ~ z + x1 + x2 + I(x1^2) + I(x2*x1), family="gaussian",
#' expnms = c('x1', 'x2'), data=dat, q=4, B=200, deg=2)
#'
#' # binary outcome
#' dat <- data.frame(y=rbinom(50,1,0.5), x1=runif(50), x2=runif(50), z=runif(50))
#'
#' # Conditional mixture OR
#' qgcomp.glm.noboot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2)
#'
#' #Marginal mixture OR (population average OR - in general, this will not equal the
#' # conditional mixture OR due to non-collapsibility of the OR)
#' qgcomp.glm.boot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2, B=3, rr=FALSE)
#'
#' # Population average mixture RR
#' qgcomp.glm.boot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2, rr=TRUE, B=3)
#'
#' # Population average mixture RR, indicator variable representation of x2
#' # note that I(x==...) operates on the quantile-based category of x,
#' # rather than the raw value
#' res = qgcomp.glm.boot(y ~ z + x1 + I(x2==1) + I(x2==2) + I(x2==3),
#' family="binomial", expnms = c('x1', 'x2'), data=dat, q=4, rr=TRUE, B=200)
#' res$fit
#' plot(res)
#'
#' # now add in a non-linear MSM
#' res2 = qgcomp.glm.boot(y ~ z + x1 + I(x2==1) + I(x2==2) + I(x2==3),
#' family="binomial", expnms = c('x1', 'x2'), data=dat, q=4, rr=TRUE, B=200,
#' degree=2)
#' res2$fit
#' res2$msmfit # correct point estimates, incorrect standard errors
#' res2 # correct point estimates, correct standard errors
#' plot(res2)
#' # Log risk ratio per one IQR change in all exposures (not on quantile basis)
#' dat$x1iqr <- dat$x1/with(dat, diff(quantile(x1, c(.25, .75))))
#' dat$x2iqr <- dat$x2/with(dat, diff(quantile(x2, c(.25, .75))))
#' # note that I(x>...) now operates on the untransformed value of x,
#' # rather than the quantized value
#' res2 = qgcomp.glm.boot(y ~ z + x1iqr + I(x2iqr>0.1) + I(x2>0.4) + I(x2>0.9),
#' family="binomial", expnms = c('x1iqr', 'x2iqr'), data=dat, q=NULL, rr=TRUE, B=200,
#' degree=2)
#' res2
#' # using parallel processing
#'
#' qgcomp.glm.boot(y ~ z + x1iqr + I(x2iqr>0.1) + I(x2>0.4) + I(x2>0.9),
#' family="binomial", expnms = c('x1iqr', 'x2iqr'), data=dat, q=NULL, rr=TRUE, B=200,
#' degree=2, parallel=TRUE, parplan=TRUE)
#'
#'
#' # weighted model
#' N=5000
#' dat4 <- data.frame(id=seq_len(N), x1=runif(N), x2=runif(N), z=runif(N))
#' dat4$y <- with(dat4, rnorm(N, x1*z + z, 1))
#' dat4$w=runif(N) + dat4$z*5
#' qdata = quantize(dat4, expnms = c("x1", "x2"), q=4)$data
#' # first equivalent models with no covariates
#' qgcomp.glm.noboot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian())
#' qgcomp.glm.noboot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w)
#'
#' set.seed(13)
#' qgcomp.glm.boot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w)
#' # using the correct model
#' set.seed(13)
#' qgcomp.glm.boot(f=y ~ x1*z + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w, id="id")
#' (qgcfit <- qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4,
#' family=gaussian(), weights=w))
#' qgcfit$fit
#' summary(glm(y ~ z + x1 + x2, data = qdata, weights=w))
#' }
# character names of exposure mixture components
oldq = NULL
if(is.null(seed)) seed = round(runif(1, min=0, max=1e8))
newform <- terms(f, data = data)
hasintercept = as.logical(attr(newform, "intercept"))
class(newform) <- "formula"
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = checknames(expnms)
if(!lin) stop("Model appears to be non-linear and I'm having trouble parsing it:
please use `expnms` parameter to define the variables making up the exposure")
if (!is.null(q) & !is.null(breaks)){
# if user specifies breaks, prioritize those
oldq = q
q <- NULL
}
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
if(is.null(q)){
# rare scenario with user specified breaks and q is left at NULL
nvals <- length(br[[1]])-1
} else{
nvals <- q
}
intvals <- (seq_len(nvals))-1
} else {
# if( is.null(breaks) & is.null(q)) # also includes NA
qdata <- data
# if no transformation is made (no quantiles, no breaks given)
# then draw distribution values from quantiles of all the exposures
# pooled together
# : allow user specification of this
nvals = length(table(unlist(data[,expnms])))
if(nvals < 10){
message("\nNote: using all possible values of exposure as the
intervention values\n")
p = length(expnms)
intvals <- as.numeric(names(table(unlist(data[,expnms]))))
br <- lapply(seq_len(p), function(x) c(-1e16, intvals[2:nvals]-1e-16, 1e16))
}else{
message("\nNote: using quantiles of all exposures combined in order to set
proposed intervention values for overall effect (25th, 50th, 75th %ile)
You can ensure this is valid by scaling all variables in expnms to have similar ranges.")
intvals = as.numeric(quantile(unlist(data[,expnms]), c(.25, .5, .75)))
br <- NULL
}
}
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
###
msmfit <- msm_fit(newform, qdata, intvals, expnms, rr, main=TRUE,degree=degree, id=id,
weights,
bayes,
MCsize=MCsize, hasintercept = hasintercept,
...)
# main estimate
#estb <- as.numeric(msmfit$msmfit$coefficients[-1])
estb <- as.numeric(msmfit$msmfit$coefficients)
#bootstrap to get std. error
nobs <- dim(qdata)[1]
nids <- length(unique(qdata[,id, drop=TRUE]))
starttime = Sys.time()
psi.only <- function(i=1, f=f, qdata=qdata, intvals=intvals, expnms=expnms, rr=rr, degree=degree,
nids=nids, id=id,
weights,MCsize=MCsize, hasintercept = hasintercept,
...){
if(i==2 & !parallel){
timeiter = as.numeric(Sys.time() - starttime)
if((timeiter*B/60)>0.5) message(paste0("Expected time to finish: ", round(B*timeiter/60, 2), " minutes \n"))
}
bootids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), nids,
replace = TRUE
)))
names(bootids) <- id
qdata_ <- merge(qdata,bootids, by=id, all.x=FALSE, all.y=TRUE)
ft = msm_fit(f, qdata_, intvals, expnms, rr, main=FALSE, degree, id, weights=weights, bayes, MCsize=MCsize,
hasintercept = hasintercept,
...)
yhatty = data.frame(yhat=predict(ft$msmfit), psi=ft$msmfit$data[,"psi"])
as.numeric(
# the yhat estimates will be identical across individuals due to this being a marginal model
c(ft$msmfit$coefficients, with(yhatty, tapply(yhat, psi, mean)))
)
}
set.seed(seed)
if(parallel){
#Sys.setenv(R_FUTURE_SUPPORTSMULTICORE_UNSTABLE="quiet")
if (parplan) {
oplan <- future::plan(strategy = future::multisession)
on.exit(future::plan(oplan), add = TRUE)
}
bootsamps <- future.apply::future_lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=qdata$weights,MCsize=MCsize, hasintercept = hasintercept,
future.seed=TRUE,
...)
}else{
bootsamps <- lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=weights, MCsize=MCsize, hasintercept = hasintercept,
...)
}
bootsamps = do.call("cbind", bootsamps)
# these are the linear predictors
hats = t(bootsamps[-c(seq_len(degree+ifelse(hasintercept,1,0))),,drop=FALSE])
# covariance of the linear predictors
cov.yhat = cov(hats)
bootsamps = bootsamps[seq_len(degree+ifelse(hasintercept,1,0)),,drop=FALSE]
seb <- apply(bootsamps, 1, sd)
covmat <- cov(t(bootsamps))
cnms = c(paste0("psi", seq_len(nrow(bootsamps)-1)))
if(hasintercept)
cnms = c("(intercept)", cnms)
colnames(covmat) <- rownames(covmat) <- names(estb) <- cnms
tstat <- estb / seb
df <- nobs - length(attr(terms(f, data = data), "term.labels")) - 1 - degree # df based on obs - gcomp terms - msm terms
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# outcome 'weights' not applicable in this setting, generally (i.e. if using this function for non-linearity,
# then weights will vary with level of exposure)
if (!is.null(oldq)){
q = oldq
}
psiidx = seq_len(degree)+ifelse(hasintercept,1,0)
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
res <- .qgcomp_object(
qx = qx, fit = msmfit$fit, msmfit = msmfit$msmfit,
psi = estb[psiidx], var.psi = seb[psiidx] ^ 2,
covmat.psi=covmat[psiidx,psiidx, drop=FALSE],
ci = ci[psiidx,],
coef = estb, var.coef = seb ^ 2, covmat.coef=covmat, ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=degree,
bootstrap=TRUE,
y.expected=msmfit$Ya, y.expectedmsm=msmfit$Yamsm, index=msmfit$A,
bootsamps = bootsamps,
cov.yhat=cov.yhat,
alpha=alpha,
call=origcall,
hasintercept=hasintercept
)
if(msmfit$fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(msmfit$fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
res
}
qgcomp <- function(f, data = data, family = gaussian(), rr = TRUE, ...) {
  #' @title Quantile g-computation for continuous, binary, count, and censored survival outcomes
  #'
  #' @description This function automatically selects between qgcomp.glm.noboot, qgcomp.glm.boot,
  #'  qgcomp.cox.noboot, and qgcomp.cox.boot
  #'  for the most efficient approach to estimate the average expected
  #'  change in the (log) outcome per quantile increase in the joint
  #'  exposure to all exposures in `expnms', given the underlying model. For example,
  #'  if the underlying model (specified by the formula `f`) is a linear model with all
  #'  linear terms for exposure, then `qgcomp.glm.noboot`` will be called to fit the model. Non-linear
  #'  terms or requesting the risk ratio for binomial outcomes will result in the `qgcomp.glm.boot`
  #'  function being called. For a given linear model, boot and noboot versions will give identical
  #'  inference, though when using survival outcomes, the `boot` version uses simulation based
  #'  inference, which can vary from the `noboot` version due to simulation error (which can
  #'  be minimized via setting the MCsize parameter very large - see
  #'  \code{\link[qgcomp]{qgcomp.cox.boot}} for details).
  #'
  #'
  #'
  #' @param f R style formula (may include survival outcome via \code{\link[survival]{Surv}})
  #' @param data data frame
  #' @param family \code{gaussian()}, \code{binomial()}, \code{cox()}, \code{poisson()} (works as
  #' argument to 'family' parameter in \code{\link[stats]{glm}}` or 'dist' parameter in
  #' \code{\link[pscl]{zeroinfl}})
  #' @param rr logical: if using binary outcome and rr=TRUE, qgcomp.glm.boot will
  #'   estimate risk ratio rather than odds ratio. Note, to get population average
  #'   effect estimates for a binary outcome, set rr=TRUE (default: ORs are generally not
  #'   of interest as population average effects, so if rr=FALSE then a conditional
  #'   OR will be estimated, which cannot be interpreted as a population average
  #'   effect)
  #' @param ... arguments to qgcomp.glm.noboot or qgcomp.glm.boot (e.g. q) or glm
  #' @seealso \code{\link[qgcomp]{qgcomp.glm.noboot}}, \code{\link[qgcomp]{qgcomp.glm.boot}},
  #'  \code{\link[qgcomp]{qgcomp.cox.noboot}}, \code{\link[qgcomp]{qgcomp.cox.boot}}
  #'  \code{\link[qgcomp]{qgcomp.zi.noboot}} and \code{\link[qgcomp]{qgcomp.zi.boot}}
  #'  (\code{\link[qgcomp]{qgcomp}} is just a wrapper for these functions)
  #' @return a qgcompfit object, which contains information about the effect
  #'  measure of interest (psi) and associated variance (var.psi), as well
  #'  as information on the model fit (fit) and possibly information on the
  #'  marginal structural model (msmfit) used to estimate the final effect
  #'  estimates (qgcomp.glm.boot, qgcomp.cox.boot only).
  #'  If appropriate, weights are also reported, which represent the proportion
  #'  of a directional (positive/negative) effect that is accounted for by
  #'  each exposure.
  #' @concept variance mixtures
  #' @import stats
  #' @export
  #' @examples
  #' set.seed(50)
  #' dat <- data.frame(y=runif(50), x1=runif(50), x2=runif(50), z=runif(50))
  #' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2)
  #' qgcomp.glm.boot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, B=10, seed=125)
  #' # automatically selects appropriate method
  #' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2)
  #' # note for binary outcome this will choose the risk ratio (and bootstrap methods) by default
  #' dat <- data.frame(y=rbinom(100, 1, 0.5), x1=runif(100), x2=runif(100), z=runif(100))
  #' \dontrun{
  #' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
  #' set.seed(1231)
  #' qgcomp.glm.boot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
  #' set.seed(1231)
  #' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
  #'
  #' # automatically selects appropriate method when specifying rr or degree explicitly
  #' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial(), rr=FALSE)
  #' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial(), rr=TRUE)
  #' qgcomp(y ~ z + factor(x1) + factor(x2), degree=2, expnms = c('x1', 'x2'), data=dat, q=4,
  #'   family=binomial())
  #
  #'
  #' #survival objects
  #' set.seed(50)
  #' N=200
  #' dat <- data.frame(time=(tmg <- pmin(.1,rweibull(N, 10, 0.1))),
  #'   d=1.0*(tmg<0.1), x1=runif(N), x2=runif(N), z=runif(N))
  #' expnms=paste0("x", 1:2)
  #' f = survival::Surv(time, d)~x1 + x2
  #' qgcomp(f, expnms = expnms, data = dat)
  #' # note if B or MCsize are set but the model is linear, an error will result
  #' try(qgcomp(f, expnms = expnms, data = dat, B=10, MCsize=100))
  #' # note that in the survival models, MCsize should be set to a large number
  #' #  such that results are repeatable (within an error tolerance such as 2 significant digits)
  #' # if you run them under different seed values
  #' f = survival::Surv(time, d)~x1 + x2 + x1:x2
  #' qgcomp(f, expnms = expnms, data = dat, B=10, MCsize=100)
  #' }
  requireNamespace("survival")
  # A '|' in the formula signals a two-part (zero-inflated) model specification,
  # which must be routed to the qgcomp.zi.* fitters.
  iszip <- length(grep("\\|", as.character(f))) > 0
  # Evaluate the formula's LHS within `data` to detect a survival outcome.
  issurv <- survival::is.Surv(eval(attr(terms(f, data = data), "variables")[[2]], envir = data))
  # Normalize `family` the same way stats::glm does: accept a string, a
  # family-generating function, or a family object.
  if (is.character(family))
    family <- get(family, mode = "function", envir = parent.frame())
  if (is.function(family))
    family <- family()
  if (is.null(family$family)) {
    if (issurv) {
      message("Survival model\n")
    } else {
      print(family)
      stop("'family' not recognized")
    }
  }
  # Risk-ratio estimation only applies to binomial outcomes; silently disable
  # it otherwise. The NULL check matters: for survival models family$family may
  # be NULL, and the previous scalar comparison (`!(family$family=="binomial") & rr`)
  # yielded a zero-length condition and an "argument is of length zero" error.
  if (rr && (is.null(family$family) || family$family != "binomial")) {
    # warning("'rr=TRUE' is for binomial family only, setting rr=FALSE")
    rr <- FALSE
  }
  # Renamed from `terms` to avoid shadowing stats::terms.
  termlabels <- attr(terms(f, data = data), "term.labels")
  # checknames() is package-internal; doboot is TRUE when the term labels
  # indicate a non-linear/non-additive exposure specification — confirm the
  # exact criterion against checknames().
  doboot <- !checknames(termlabels)
  # Dispatch to the most efficient estimator for the detected model type.
  if (issurv && doboot) {
    res <- qgcomp.cox.boot(f = f, data = data, ...)
  } else if (issurv && !doboot) {
    res <- qgcomp.cox.noboot(f = f, data = data, ...)
  } else if (!iszip && (rr || doboot)) {
    res <- qgcomp.glm.boot(f = f, data = data, family = family, rr = rr, ...)
  } else if (!iszip) {
    res <- qgcomp.glm.noboot(f = f, data = data, family = family, ...)
  } else if (iszip && doboot) {
    res <- qgcomp.zi.boot(f = f, data = data, dist = family$family, rr = rr, ...)
  } else if (iszip && !doboot) {
    res <- qgcomp.zi.noboot(f = f, data = data, dist = family$family, ...)
  } else {
    # Unreachable given the branches above; kept as a defensive fallback.
    stop('Unable to parse the model type: try one of the specific functions in the qgcomp package
 e.g. qgcomp.glm.noboot, qgcomp.glm.boot, qgcomp.cox.noboot, qgcomp.cox.boot, qgcomp.zi.noboot,
 qgcomp.zi.boot, ')
  }
  res
}
msm.predict <- function(object, newdata=NULL){
  #' @title Secondary prediction method for the (non-survival) qgcomp MSM.
  #'
  #' @description An internal helper called by
  #' \code{\link[qgcomp]{qgcomp.glm.boot}}; documented here for clarity.
  #' Users will generally have no need to call it directly.
  #' @description Obtain predicted values from a qgcompfit object produced by
  #' \code{\link[qgcomp]{qgcomp.glm.boot}}.
  #'
  #' @details (Not usually called by user) Generates predictions from the
  #' marginal structural model fit (rather than from the underlying
  #' conditional g-computation fit) stored in a "qgcompfit" object. The
  #' default \code{\link[qgcomp]{predict.qgcompfit}} function is generally
  #' preferable. This function only works after `qgcomp.glm.boot`; following
  #' `qgcomp.glm.noboot`, \code{\link[qgcomp]{predict.qgcompfit}} yields
  #' identical inference to predicting from an MSM.
  #'
  #'
  #' @param object "qgcompfit" object from `qgcomp.glm.boot` function
  #' @param newdata (optional) new set of data (data frame) with a variable
  #' called `psi` representing the joint exposure level of all exposures
  #' under consideration
  #' @export
  #' @examples
  #' set.seed(50)
  #' dat <- data.frame(y=runif(50), x1=runif(50), x2=runif(50), z=runif(50))
  #' obj <- qgcomp.glm.boot(y ~ z + x1 + x2 + I(z*x1), expnms = c('x1', 'x2'),
  #'   data=dat, q=4, B=10, seed=125)
  #' dat2 <- data.frame(psi=seq(1,4, by=0.1))
  #' summary(msm.predict(obj))
  #' summary(msm.predict(obj, newdata=dat2))
  # Only bootstrap-based fits carry an msmfit component to predict from.
  if (!object$bootstrap)
    stop("only valid for results from qgcomp.glm.boot function")
  # With no newdata, predict at the observed intervention levels; otherwise
  # predict at the user-supplied `psi` values. Both on the response scale.
  if (is.null(newdata)) {
    predict(object$msmfit, type='response')
  } else {
    predict(object$msmfit, newdata=newdata, type='response')
  }
}
# Alias: qgcomp.boot points at qgcomp.glm.boot and shares its documentation
# page via @rdname. Presumably retained so code written against the older
# (pre-"glm") name keeps working — confirm against the package changelog.
#' @rdname qgcomp.glm.boot
#' @export
qgcomp.boot <- qgcomp.glm.boot
# Alias: qgcomp.noboot points at qgcomp.glm.noboot and shares its documentation
# page via @rdname. Presumably retained for backward compatibility with the
# older (pre-"glm") name — confirm against the package changelog.
#' @rdname qgcomp.glm.noboot
#' @export
qgcomp.noboot <- qgcomp.glm.noboot
# NOTE(review): removed two lines of website-embed footer text ("Add the
# following code to your website..."), which were scraper residue — bare prose
# here is not valid R and would break parsing of this file.