# Nothing
msm_fit <- function(f,
qdata,
intvals,
expnms,
rr=TRUE,
main=TRUE,
degree=1,
id=NULL,
weights,
bayes=FALSE,
MCsize=nrow(qdata), hasintercept=TRUE, ...){
#' @title Fitting marginal structural model (MSM) within quantile g-computation
#' @description This is an internal function called by \code{\link[qgcomp]{qgcomp}},
#' \code{\link[qgcomp]{qgcomp.glm.boot}}, and \code{\link[qgcomp]{qgcomp.glm.noboot}},
#' but is documented here for clarity. Generally, users will not need to call
#' this function directly.
#' @details This function first computes expected outcomes under hypothetical
#' interventions to simultaneously set all exposures to a specific quantile. These
#' predictions are based on g-computation, where the exposures are `quantized',
#' meaning that they take on ordered integer values according to their ranks,
#' and the integer values are determined by the number of quantile cutpoints used.
#' The function then takes these expected outcomes and fits an additional model
#' (a marginal structural model) with the expected outcomes as the outcome and
#' the intervention value of the exposures (the quantile integer) as the exposure.
#' Under causal identification assumptions and correct model specification,
#' the MSM yields a causal exposure-response representing the incremental
#' change in the expected outcome given a joint intervention on all exposures.
#' @param f an r formula representing the conditional model for the outcome, given all
#' exposures and covariates. Interaction terms that include exposure variables
#' should be represented via the \code{\link[base]{AsIs}} function
#' @param qdata a data frame with quantized exposures
#' @param expnms a character vector with the names of the columns in qdata that represent
#' the exposures of interest (main terms only!)
#' @param intvals sequence, the sequence of integer values that the joint exposure
#' is 'set' to for estimating the msm. For quantile g-computation, this is just
#' 0:(q-1), where q is the number of quantiles of exposure.
#' @param rr logical, estimate log(risk ratio) (family='binomial' only)
#' @param main logical, internal use: produce estimates of exposure effect (psi)
#' and expected outcomes under g-computation and the MSM
#' @param degree polynomial bases for marginal model (e.g. degree = 2
#' allows that the relationship between the whole exposure mixture and the outcome
#' is quadratic. Default=1)
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster)
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param MCsize integer: sample size for simulation to approximate marginal
#' zero inflated model parameters. This can be left small for testing, but should be as large
#' as needed to reduce simulation error to an acceptable magnitude (can compare psi coefficients for
#' linear fits with qgcomp.zi.noboot to gain some intuition for the level of expected simulation
#' error at a given value of MCsize)
#' @param hasintercept (logical) does the model have an intercept?
#' @param ... arguments to glm (e.g. family)
#' @seealso \code{\link[qgcomp]{qgcomp.glm.boot}}, and \code{\link[qgcomp]{qgcomp}}
#' @concept variance mixtures
#' @import stats arm
#' @export
#' @examples
#' set.seed(50)
#' dat <- data.frame(y=runif(200), x1=runif(200), x2=runif(200), z=runif(200))
#' X <- c('x1', 'x2')
#' qdat <- quantize(dat, X, q=4)$data
#' mod <- msm_fit(f=y ~ z + x1 + x2 + I(x1*x2),
#' expnms = c('x1', 'x2'), qdata=qdat, intvals=1:4, bayes=FALSE)
#' summary(mod$fit) # outcome regression model
#' summary(mod$msmfit) # msm fit (variance not valid - must be obtained via bootstrap)
# Expand the formula terms against the data so that terms are resolved
# consistently with the supplied data frame.
newform <- terms(f, data = qdata)
nobs = nrow(qdata)
# Rebuild this call as a stats::model.frame() call and evaluate it in the
# caller's frame: standard non-standard-evaluation trick that lets the
# `weights` argument refer to a column of qdata or an object where the
# function was called.
thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("qdata", "data", names(thecall))
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
# Materialize case weights as a column of qdata (unit weights if none given)
# so the glm()/bayesglm() calls below can resolve `weights=weights` via
# their own non-standard evaluation of `data`.
if(hasweights){
qdata$weights <- as.vector(model.weights(thecalle))
} else qdata$weights = rep(1, nobs)
# Default id: every row is its own observational unit.
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
# conditional outcome regression fit
# nidx = column indices excluding the id column (used by the bayes branch only)
nidx = which(!(names(qdata) %in% id))
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
# NOTE(review): this branch fits the original formula `f` on qdata with the
# id column dropped, while the glm branch above fits `newform` on the full
# qdata -- confirm this asymmetry is intentional.
fit <- bayesglm(f, data = qdata[,nidx,drop=FALSE],
weights=weights,
...)
}
# A risk-ratio MSM is only meaningful for binomial-type outcome models;
# silently disable rr otherwise.
if(fit$family$family %in% c("gaussian", "poisson")) rr=FALSE
###
# get predictions (set exposure to 0,1,...,q-1)
# Fallback intervention values: one per observed level of the first exposure,
# starting at 0.
if(is.null(intvals)){
intvals <- (seq_len(length(table(qdata[expnms[1]])))) - 1
}
# g-computation step: predicted outcome with ALL exposures jointly set to idx
predit <- function(idx, newdata){
#newdata <- qdata
newdata[,expnms] <- idx
suppressWarnings(predict(fit, newdata=newdata, type='response'))
}
if(MCsize==nrow(qdata)){
newdata <- qdata
}else{
# Monte Carlo resample of ids (with replacement) to size MCsize, used to
# approximate the marginal predictions by simulation.
newids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), MCsize,
#probs=weights, #bootstrap sampling with weights works with fixed weights, but not time-varying weights
replace = TRUE
)))
names(newids) <- id
newdata <- merge(qdata,newids, by=id, all.x=FALSE, all.y=TRUE)[seq_len(MCsize),]
}
# One vector of predictions per intervention value
predmat = lapply(intvals, predit, newdata=newdata)
# fit MSM using g-computation estimates of expected outcomes under joint
# intervention
#nobs <- dim(qdata)[1]
# Stack predictions into long format: Ya = predicted outcome, psi = the joint
# exposure level it was predicted under, weights repeated per level.
msmdat <- data.frame(
cbind(
Ya = unlist(predmat),
psi = rep(intvals, each=MCsize),
weights = rep(newdata$weights, times=length(intvals))
#times=length(table(qdata[expnms[1]])))
)
)
# to do: allow functional form variations for the MSM via specifying the model formula
# MSM: polynomial in psi (raw basis), with or without an intercept.
msmforms = paste0("Ya ~ ",
ifelse(hasintercept, "1 +", "-1 +"),
"poly(psi, degree=",degree,", raw=TRUE)"
)
msmform = as.formula(msmforms)
# MSM fit. For rr=TRUE a log-link binomial model is used with small negative
# starting values to help convergence.
# NOTE(review): the rr branches do not forward `...`, and `start` has
# degree+1 values even when hasintercept=FALSE (degree coefficients) --
# confirm both are intentional.
if(bayes){
if(!rr) suppressWarnings(msmfit <- bayesglm(msmform, data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- bayesglm(msmform, data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
if(!bayes){
if(!rr) suppressWarnings(msmfit <- glm(msmform, data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- glm(msmform, data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
# Return both fits; optionally attach the g-computation predictions used to
# build the MSM (needed by callers that report marginal predictions).
res <- list(fit=fit, msmfit=msmfit)
if(main) {
res$Ya <- msmdat$Ya # expected outcome under joint exposure, by gcomp
res$Yamsm <- predict(msmfit, type='response')
res$Yamsml <- predict(msmfit, type="link")
res$A <- msmdat$psi # joint exposure (0 = all exposures set category with
# upper cut-point as first quantile)
}
res
}
# Legacy dot-name alias for msm_fit, kept for backward compatibility.
msm.fit <- msm_fit
qgcomp.glm.noboot <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
bayes=FALSE,
...){
#' @title Quantile g-computation for continuous, binary, and count outcomes under linearity/additivity
#' @aliases gcomp.noboot
#' @description This function estimates a linear dose-response parameter representing a one quantile
#' increase in a set of exposures of interest. This function is limited to linear and additive
#' effects of individual components of the exposure. This model estimates the parameters of a marginal
#' structural model (MSM) based on g-computation with quantized exposures. Note: this function is
#' valid only under linear and additive effects of individual components of the exposure, but when
#' these hold the model can be fit with very little computational burden.
#' qgcomp.noboot is an equivalent function (slated for deprecation)
#'
#' @details For continuous outcomes, under a linear model with no
#' interaction terms, this is equivalent to g-computation of the effect of
#' increasing every exposure by 1 quantile. For binary/count outcomes
#' outcomes, this yields a conditional log odds/rate ratio(s) representing the
#' change in the expected conditional odds/rate (conditional on covariates)
#' from increasing every exposure by 1 quantile. In general, the latter
#' quantity is not equivalent to g-computation estimates. Hypothesis test
#' statistics and confidence intervals are based on using the delta
#' estimate variance of a linear combination of random variables.
#'
#' @param f R style formula
#' @param data data frame
#' @param expnms character vector of exposures of interest
#' @param q NULL or number of quantiles used to create quantile indicator variables
#' representing the exposure variables. If NULL, then gcomp proceeds with un-transformed
#' version of exposures in the input datasets (useful if data are already transformed,
#' or for performing standard g-computation)
#' @param breaks (optional) NULL, or a list of (equal length) numeric vectors that
#' characterize the minimum value of each category for which to
#' break up the variables named in expnms. This is an alternative to using 'q'
#' to define cutpoints.
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster). Note that qgcomp.glm.noboot will not produce cluster-appropriate
#' standard errors (this parameter is essentially ignored in qgcomp.glm.noboot).
#' qgcomp.glm.boot can be used for this, which will use bootstrap
#' sampling of clusters/individuals to estimate cluster-appropriate standard
#' errors via bootstrapping.
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param alpha alpha level for confidence limit calculation
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param ... arguments to glm (e.g. family)
#' @family qgcomp_methods
#' @return a qgcompfit object, which contains information about the effect
#' measure of interest (psi) and associated variance (var.psi), as well
#' as information on the model fit (fit) and information on the
#' weights/standardized coefficients in the positive (pos.weights) and
#' negative (neg.weights) directions.
#' @concept variance mixtures
#' @import stats arm
#' @export
#' @examples
#' set.seed(50)
#' # linear model
#' dat <- data.frame(y=runif(50,-1,1), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=gaussian())
#' # not intercept model
#' qgcomp.glm.noboot(f=y ~-1+ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=gaussian())
#' # logistic model
#' dat2 <- data.frame(y=rbinom(50, 1,0.5), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat2, q=2, family=binomial())
#' # poisson model
#' dat3 <- data.frame(y=rpois(50, .5), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat3, q=2, family=poisson())
#' # weighted model
#' N=5000
#' dat4 <- data.frame(y=runif(N), x1=runif(N), x2=runif(N), z=runif(N))
#' dat4$w=runif(N)*2
#' qdata = quantize(dat4, expnms = c("x1", "x2"))$data
#' (qgcfit <- qgcomp.glm.noboot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4,
#' family=gaussian(), weights=w))
#' qgcfit$fit
#' glm(y ~ z + x1 + x2, data = qdata, weights=w)
# Expand formula terms against the data; record whether the model includes an
# intercept (this shifts the index of psi in the coefficient vector below).
newform <- terms(f, data = data)
hasintercept = as.logical(attr(newform, "intercept"))
nobs = nrow(data)
# Rebuild this call as a stats::model.frame() call and evaluate it in the
# caller's frame so that `weights` may refer to a column of `data` or an
# object in the calling environment. origcall is kept for the return object.
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
#m <- match(c("f", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame()) # a model frame pulled in from the environment in which the function was called
# Materialize case weights as a column (unit weights when none supplied) so
# glm()/bayesglm() below can resolve `weights=weights` against `data`.
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
# If exposures are not named, treat every model term as an exposure.
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
# This estimator is valid only for linear/additive exposure terms.
lin = checknames(expnms)
if(!lin) stop("Model appears to be non-linear: use qgcomp.glm.boot instead")
# Quantize exposures unless both q and breaks are NULL (data already
# transformed by the user).
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
} else{
qdata <- data
br <- breaks
}
if(is.null(id)) {
# not yet implemented
id = "id__"
qdata$id__ = seq_len(dim(qdata)[1])
}
# Conditional outcome regression on the quantized exposures.
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(newform, data = qdata,
weights=weights,
...)
}
mod <- summary(fit)
# If any exposure coefficient was dropped due to aliasing, psi is undefined:
# fail loudly with remediation advice.
if(length(setdiff(expnms, rownames(mod$coefficients)))>0){
stop("Model aliasing occurred, likely due to perfectly correlated quantized exposures.
Try one of the following:
1) set 'bayes' to TRUE in the qgcomp function (recommended)
2) set 'q' to a higher value in the qgcomp function (recommended)
3) check correlation matrix of exposures, and drop all but one variable in each highly correlated set (not recommended)
")
}
# psi = sum of the exposure coefficients; its SE comes from the delta method
# applied to the linear combination (se_comb uses the scaled covariance).
estb <- sum(mod$coefficients[expnms,1, drop=TRUE])
seb <- se_comb(expnms, covmat = mod$cov.scaled)
if(hasintercept){
estb <- c(fit$coefficients[1], estb)
seb <- c(sqrt(mod$cov.scaled[1,1]), seb)
}
# Two-sided p-values: t-based (pval) reported for gaussian families,
# z-based (pvalz) for binomial/poisson -- selection happens at the bottom.
tstat <- estb / seb
df <- mod$df.null - length(expnms)
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# 'weights'
# Direction-specific scaled coefficients ("weights"): note that exactly-zero
# coefficients are grouped with the negative direction (<= 0).
wcoef <- fit$coefficients[expnms]
names(wcoef) <- gsub("_q", "", names(wcoef))
pos.coef <- which(wcoef > 0)
neg.coef <- which(wcoef <= 0)
pos.weights <- abs(wcoef[pos.coef]) / sum(abs(wcoef[pos.coef]))
neg.weights <- abs(wcoef[neg.coef]) / sum(abs(wcoef[neg.coef]))
pos.psi <- sum(wcoef[pos.coef])
neg.psi <- sum(wcoef[neg.coef])
# Keep the quantized exposure data for the return object, suffixed with "_q".
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
psiidx = 1+hasintercept
names(estb)[psiidx] <- c("psi1")
cnms = "psi1"
if(hasintercept){
covmat.coef=vc_comb(aname="(Intercept)", expnms=expnms, covmat = mod$cov.scaled)
cnms = c("(intercept)", cnms)
}
# NOTE(review): as.matrix() silently ignores the nrow/ncol arguments; the
# result is 1x1 here only because seb is a scalar when there is no intercept.
if(!hasintercept)
covmat.coef=as.matrix(seb^2,nrow=1,ncol=1)
colnames(covmat.coef) <- rownames(covmat.coef) <- names(estb) <- cnms
res <- .qgcomp_object(
qx = qx, fit = fit,
psi = estb[psiidx],
var.psi = seb[psiidx] ^ 2,
covmat.psi=c('psi1' = seb[psiidx]^2),
ci = ci[psiidx,],
coef = estb, var.coef = seb ^ 2,
covmat.coef=covmat.coef,
ci.coef = ci[,],
expnms=expnms, q=q, breaks=br,
pos.psi = pos.psi, neg.psi = neg.psi,
pos.weights = sort(pos.weights, decreasing = TRUE),
neg.weights = sort(neg.weights, decreasing = TRUE),
pos.size = sum(abs(wcoef[pos.coef])),
neg.size = sum(abs(wcoef[neg.coef])),
alpha=alpha,
call=origcall,
hasintercept=hasintercept
)
# Attach the family-appropriate test statistics and p-values.
if(fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
res
}
qgcomp.glm.boot <- function(
f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
B=200,
rr=TRUE,
degree=1,
seed=NULL,
bayes=FALSE,
MCsize=nrow(data),
parallel=FALSE,
parplan = FALSE,
...
){
#' @title Quantile g-computation for continuous and binary outcomes
#' @aliases gcomp.boot
#'
#' @description This function estimates a dose-response parameter representing a one quantile
#' increase in a set of exposures of interest. This model estimates the parameters of a marginal
#' structural model (MSM) based on g-computation with quantized exposures. Note: this function
#' allows non-linear and non-additive effects of individual components of the exposure, as well as
#' non-linear joint effects of the mixture via polynomial basis functions, which increase the
#' computational computational burden due to the need for non-parametric bootstrapping.
#' qgcomp.boot is an equivalent function (slated for deprecation)
#'
#' @details Estimates correspond to the average expected change in the
#' (log) outcome per quantile increase in the joint exposure to all exposures
#' in `expnms'. Test statistics and confidence intervals are based on
#' a non-parametric bootstrap, using the standard deviation of the bootstrap
#' estimates to estimate the standard error. The bootstrap standard error is
#' then used to estimate Wald-type confidence intervals. Note that no bootstrapping
#' is done on estimated quantiles of exposure, so these are treated as fixed
#' quantities
#'
#' @param f R style formula
#' @param data data frame
#' @param expnms character vector of exposures of interest
#' @param q NULL or number of quantiles used to create quantile indicator variables
#' representing the exposure variables. If NULL, then gcomp proceeds with un-transformed
#' version of exposures in the input datasets (useful if data are already transformed,
#' or for performing standard g-computation)
#' @param breaks (optional) NULL, or a list of (equal length) numeric vectors that
#' characterize the minimum value of each category for which to
#' break up the variables named in expnms. This is an alternative to using 'q'
#' to define cutpoints.
#' @param id (optional) NULL, or variable name indexing individual units of
#' observation (only needed if analyzing data with multiple observations per
#' id/cluster). Note that qgcomp.glm.noboot will not produce cluster-appropriate
#' standard errors. qgcomp.glm.boot can be used for this, which will use bootstrap
#' sampling of clusters/individuals to estimate cluster-appropriate standard
#' errors via bootstrapping.
#' @param weights "case weights" - passed to the "weight" argument of
#' \code{\link[stats]{glm}} or \code{\link[arm]{bayesglm}}
#' @param alpha alpha level for confidence limit calculation
#' @param B integer: number of bootstrap iterations (this should typically be >=200,
#' though it is set lower in examples to improve run-time).
#' @param rr logical: if using binary outcome and rr=TRUE, qgcomp.glm.boot will
#' estimate risk ratio rather than odds ratio
#' @param degree polynomial bases for marginal model (e.g. degree = 2
#' allows that the relationship between the whole exposure mixture and the outcome
#' is quadratic (default = 1).
#' @param seed integer or NULL: random number seed for replicable bootstrap results
#' @param bayes use underlying Bayesian model (`arm` package defaults). Results
#' in penalized parameter estimation that can help with very highly correlated
#' exposures. Note: this does not lead to fully Bayesian inference in general,
#' so results should be interpreted as frequentist.
#' @param MCsize integer: sample size for simulation to approximate marginal
#' zero inflated model parameters. This can be left small for testing, but should be as large
#' as needed to reduce simulation error to an acceptable magnitude (can compare psi coefficients for
#' linear fits with qgcomp.glm.noboot to gain some intuition for the level of expected simulation
#' error at a given value of MCsize). This likely won't matter much in linear models, but may
#' be important with binary or count outcomes.
#' @param parallel use (safe) parallel processing from the future and future.apply packages
#' @param parplan (logical, default=FALSE) automatically set future::plan to plan(multisession) (and set to existing plan, if any, after bootstrapping)
#' @param ... arguments to glm (e.g. family)
#' @family qgcomp_methods
#' @return a qgcompfit object, which contains information about the effect
#' measure of interest (psi) and associated variance (var.psi), as well
#' as information on the model fit (fit) and information on the
#' marginal structural model (msmfit) used to estimate the final effect
#' estimates.
#' @concept variance mixtures
#' @import stats
#' @export
#' @examples
#' set.seed(30)
#' # continuous outcome
#' dat <- data.frame(y=rnorm(100), x1=runif(100), x2=runif(100), z=runif(100))
#' # Conditional linear slope
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4, family=gaussian())
#' # Marginal linear slope (population average slope, for a purely linear,
#' # additive model this will equal the conditional)
#' \dontrun{
#' qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#' # no intercept model
#' qgcomp.glm.boot(f=y ~ -1+z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#'
#' # Note that these give different answers! In the first, the estimate is conditional on Z,
#' # but in the second, Z is marginalized over via standardization. The estimates
#' # can be made approximately the same by centering Z (for linear models), but
#' # the conditional estimate will typically have lower standard errors.
#' dat$z = dat$z - mean(dat$z)
#'
#' # Conditional linear slope
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4, family=gaussian())
#' # Marginal linear slope (population average slope, for a purely linear,
#' # additive model this will equal the conditional)
#'
#' qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=gaussian(), B=200) # B should be at least 200 in actual examples
#'
#' # Population average mixture slope which accounts for non-linearity and interactions
#' qgcomp.glm.boot(y ~ z + x1 + x2 + I(x1^2) + I(x2*x1), family="gaussian",
#' expnms = c('x1', 'x2'), data=dat, q=4, B=200)
#'
#' # generally non-linear/non-addiive underlying models lead to non-linear mixture slopes
#' qgcomp.glm.boot(y ~ z + x1 + x2 + I(x1^2) + I(x2*x1), family="gaussian",
#' expnms = c('x1', 'x2'), data=dat, q=4, B=200, deg=2)
#'
#' # binary outcome
#' dat <- data.frame(y=rbinom(50,1,0.5), x1=runif(50), x2=runif(50), z=runif(50))
#'
#' # Conditional mixture OR
#' qgcomp.glm.noboot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2)
#'
#' #Marginal mixture OR (population average OR - in general, this will not equal the
#' # conditional mixture OR due to non-collapsibility of the OR)
#' qgcomp.glm.boot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2, B=3, rr=FALSE)
#'
#' # Population average mixture RR
#' qgcomp.glm.boot(y ~ z + x1 + x2, family="binomial", expnms = c('x1', 'x2'),
#' data=dat, q=2, rr=TRUE, B=3)
#'
#' # Population average mixture RR, indicator variable representation of x2
#' # note that I(x==...) operates on the quantile-based category of x,
#' # rather than the raw value
#' res = qgcomp.glm.boot(y ~ z + x1 + I(x2==1) + I(x2==2) + I(x2==3),
#' family="binomial", expnms = c('x1', 'x2'), data=dat, q=4, rr=TRUE, B=200)
#' res$fit
#' plot(res)
#'
#' # now add in a non-linear MSM
#' res2 = qgcomp.glm.boot(y ~ z + x1 + I(x2==1) + I(x2==2) + I(x2==3),
#' family="binomial", expnms = c('x1', 'x2'), data=dat, q=4, rr=TRUE, B=200,
#' degree=2)
#' res2$fit
#' res2$msmfit # correct point estimates, incorrect standard errors
#' res2 # correct point estimates, correct standard errors
#' plot(res2)
#' # Log risk ratio per one IQR change in all exposures (not on quantile basis)
#' dat$x1iqr <- dat$x1/with(dat, diff(quantile(x1, c(.25, .75))))
#' dat$x2iqr <- dat$x2/with(dat, diff(quantile(x2, c(.25, .75))))
#' # note that I(x>...) now operates on the untransformed value of x,
#' # rather than the quantized value
#' res2 = qgcomp.glm.boot(y ~ z + x1iqr + I(x2iqr>0.1) + I(x2>0.4) + I(x2>0.9),
#' family="binomial", expnms = c('x1iqr', 'x2iqr'), data=dat, q=NULL, rr=TRUE, B=200,
#' degree=2)
#' res2
#' # using parallel processing
#'
#' qgcomp.glm.boot(y ~ z + x1iqr + I(x2iqr>0.1) + I(x2>0.4) + I(x2>0.9),
#' family="binomial", expnms = c('x1iqr', 'x2iqr'), data=dat, q=NULL, rr=TRUE, B=200,
#' degree=2, parallel=TRUE, parplan=TRUE)
#'
#'
#' # weighted model
#' N=5000
#' dat4 <- data.frame(id=seq_len(N), x1=runif(N), x2=runif(N), z=runif(N))
#' dat4$y <- with(dat4, rnorm(N, x1*z + z, 1))
#' dat4$w=runif(N) + dat4$z*5
#' qdata = quantize(dat4, expnms = c("x1", "x2"), q=4)$data
#' # first equivalent models with no covariates
#' qgcomp.glm.noboot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian())
#' qgcomp.glm.noboot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w)
#'
#' set.seed(13)
#' qgcomp.glm.boot(f=y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w)
#' # using the correct model
#' set.seed(13)
#' qgcomp.glm.boot(f=y ~ x1*z + x2, expnms = c('x1', 'x2'), data=dat4, q=4, family=gaussian(),
#' weights=w, id="id")
#' (qgcfit <- qgcomp.glm.boot(f=y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat4, q=4,
#' family=gaussian(), weights=w))
#' qgcfit$fit
#' summary(glm(y ~ z + x1 + x2, data = qdata, weights=w))
#' }
# character names of exposure mixture components
oldq = NULL
if(is.null(seed)) seed = round(runif(1, min=0, max=1e8))
newform <- terms(f, data = data)
hasintercept = as.logical(attr(newform, "intercept"))
class(newform) <- "formula"
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = checknames(expnms)
if(!lin) stop("Model appears to be non-linear and I'm having trouble parsing it:
please use `expnms` parameter to define the variables making up the exposure")
if (!is.null(q) & !is.null(breaks)){
# if user specifies breaks, prioritize those
oldq = q
q <- NULL
}
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
if(is.null(q)){
# rare scenario with user specified breaks and q is left at NULL
nvals <- length(br[[1]])-1
} else{
nvals <- q
}
intvals <- (seq_len(nvals))-1
} else {
# if( is.null(breaks) & is.null(q)) # also includes NA
qdata <- data
# if no transformation is made (no quantiles, no breaks given)
# then draw distribution values from quantiles of all the exposures
# pooled together
# : allow user specification of this
nvals = length(table(unlist(data[,expnms])))
if(nvals < 10){
message("\nNote: using all possible values of exposure as the
intervention values\n")
p = length(expnms)
intvals <- as.numeric(names(table(unlist(data[,expnms]))))
br <- lapply(seq_len(p), function(x) c(-1e16, intvals[2:nvals]-1e-16, 1e16))
}else{
message("\nNote: using quantiles of all exposures combined in order to set
proposed intervention values for overall effect (25th, 50th, 75th %ile)
You can ensure this is valid by scaling all variables in expnms to have similar ranges.")
intvals = as.numeric(quantile(unlist(data[,expnms]), c(.25, .5, .75)))
br <- NULL
}
}
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
###
msmfit <- msm_fit(newform, qdata, intvals, expnms, rr, main=TRUE,degree=degree, id=id,
weights,
bayes,
MCsize=MCsize, hasintercept = hasintercept,
...)
# main estimate
#estb <- as.numeric(msmfit$msmfit$coefficients[-1])
estb <- as.numeric(msmfit$msmfit$coefficients)
#bootstrap to get std. error
nobs <- dim(qdata)[1]
nids <- length(unique(qdata[,id, drop=TRUE]))
starttime = Sys.time()
psi.only <- function(i=1, f=f, qdata=qdata, intvals=intvals, expnms=expnms, rr=rr, degree=degree,
nids=nids, id=id,
weights,MCsize=MCsize, hasintercept = hasintercept,
...){
if(i==2 & !parallel){
timeiter = as.numeric(Sys.time() - starttime)
if((timeiter*B/60)>0.5) message(paste0("Expected time to finish: ", round(B*timeiter/60, 2), " minutes \n"))
}
bootids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), nids,
replace = TRUE
)))
names(bootids) <- id
qdata_ <- merge(qdata,bootids, by=id, all.x=FALSE, all.y=TRUE)
ft = msm_fit(f, qdata_, intvals, expnms, rr, main=FALSE, degree, id, weights=weights, bayes, MCsize=MCsize,
hasintercept = hasintercept,
...)
yhatty = data.frame(yhat=predict(ft$msmfit), psi=ft$msmfit$data[,"psi"])
as.numeric(
# the yhat estimates will be identical across individuals due to this being a marginal model
c(ft$msmfit$coefficients, with(yhatty, tapply(yhat, psi, mean)))
)
}
set.seed(seed)
if(parallel){
#Sys.setenv(R_FUTURE_SUPPORTSMULTICORE_UNSTABLE="quiet")
if (parplan) {
oplan <- future::plan(strategy = future::multisession)
on.exit(future::plan(oplan), add = TRUE)
}
bootsamps <- future.apply::future_lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=qdata$weights,MCsize=MCsize, hasintercept = hasintercept,
future.seed=TRUE,
...)
}else{
bootsamps <- lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=weights, MCsize=MCsize, hasintercept = hasintercept,
...)
}
bootsamps = do.call("cbind", bootsamps)
# these are the linear predictors
hats = t(bootsamps[-c(seq_len(degree+ifelse(hasintercept,1,0))),,drop=FALSE])
# covariance of the linear predictors
cov.yhat = cov(hats)
bootsamps = bootsamps[seq_len(degree+ifelse(hasintercept,1,0)),,drop=FALSE]
seb <- apply(bootsamps, 1, sd)
covmat <- cov(t(bootsamps))
cnms = c(paste0("psi", seq_len(nrow(bootsamps)-1)))
if(hasintercept)
cnms = c("(intercept)", cnms)
colnames(covmat) <- rownames(covmat) <- names(estb) <- cnms
tstat <- estb / seb
df <- nobs - length(attr(terms(f, data = data), "term.labels")) - 1 - degree # df based on obs - gcomp terms - msm terms
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# outcome 'weights' not applicable in this setting, generally (i.e. if using this function for non-linearity,
# then weights will vary with level of exposure)
if (!is.null(oldq)){
q = oldq
}
psiidx = seq_len(degree)+ifelse(hasintercept,1,0)
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
res <- .qgcomp_object(
qx = qx, fit = msmfit$fit, msmfit = msmfit$msmfit,
psi = estb[psiidx], var.psi = seb[psiidx] ^ 2,
covmat.psi=covmat[psiidx,psiidx, drop=FALSE],
ci = ci[psiidx,],
coef = estb, var.coef = seb ^ 2, covmat.coef=covmat, ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=degree,
bootstrap=TRUE,
y.expected=msmfit$Ya, y.expectedmsm=msmfit$Yamsm, index=msmfit$A,
bootsamps = bootsamps,
cov.yhat=cov.yhat,
alpha=alpha,
call=origcall,
hasintercept=hasintercept
)
if(msmfit$fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(msmfit$fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
res
}
qgcomp <- function(f,data=data,family=gaussian(),rr=TRUE,...){
#' @title Quantile g-computation for continuous, binary, count, and censored survival outcomes
#'
#' @description This function automatically selects between qgcomp.glm.noboot, qgcomp.glm.boot,
#' qgcomp.cox.noboot, and qgcomp.cox.boot
#' for the most efficient approach to estimate the average expected
#' change in the (log) outcome per quantile increase in the joint
#' exposure to all exposures in `expnms', given the underlying model. For example,
#' if the underlying model (specified by the formula `f`) is a linear model with all
#' linear terms for exposure, then `qgcomp.glm.noboot`` will be called to fit the model. Non-linear
#' terms or requesting the risk ratio for binomial outcomes will result in the `qgcomp.glm.boot`
#' function being called. For a given linear model, boot and noboot versions will give identical
#' inference, though when using survival outcomes, the `boot` version uses simulation based
#' inference, which can vary from the `noboot` version due to simulation error (which can
#' be minimized via setting the MCsize parameter very large - see
#' \code{\link[qgcomp]{qgcomp.cox.boot}} for details).
#'
#'
#'
#' @param f R style formula (may include survival outcome via \code{\link[survival]{Surv}})
#' @param data data frame
#' @param family \code{gaussian()}, \code{binomial()}, \code{cox()}, \code{poisson()} (works as
#' argument to 'family' parameter in \code{\link[stats]{glm}}` or 'dist' parameter in
#' \code{\link[pscl]{zeroinfl}})
#' @param rr logical: if using binary outcome and rr=TRUE, qgcomp.glm.boot will
#' estimate risk ratio rather than odds ratio. Note, to get population average
#' effect estimates for a binary outcome, set rr=TRUE (default: ORs are generally not
#' of interest as population average effects, so if rr=FALSE then a conditional
#' OR will be estimated, which cannot be interpreted as a population average
#' effect)
#' @param ... arguments to qgcomp.glm.noboot or qgcomp.glm.boot (e.g. q) or glm
#' @seealso \code{\link[qgcomp]{qgcomp.glm.noboot}}, \code{\link[qgcomp]{qgcomp.glm.boot}},
#' \code{\link[qgcomp]{qgcomp.cox.noboot}}, \code{\link[qgcomp]{qgcomp.cox.boot}}
#' \code{\link[qgcomp]{qgcomp.zi.noboot}} and \code{\link[qgcomp]{qgcomp.zi.boot}}
#' (\code{\link[qgcomp]{qgcomp}} is just a wrapper for these functions)
#' @return a qgcompfit object, which contains information about the effect
#' measure of interest (psi) and associated variance (var.psi), as well
#' as information on the model fit (fit) and possibly information on the
#' marginal structural model (msmfit) used to estimate the final effect
#' estimates (qgcomp.glm.boot, qgcomp.cox.boot only).
#' If appropriate, weights are also reported, which represent the proportion
#' of a directional (positive/negative) effect that is accounted for by
#' each exposure.
#' @concept variance mixtures
#' @import stats
#' @export
#' @examples
#' set.seed(50)
#' dat <- data.frame(y=runif(50), x1=runif(50), x2=runif(50), z=runif(50))
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2)
#' qgcomp.glm.boot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, B=10, seed=125)
#' # automatically selects appropriate method
#' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2)
#' # note for binary outcome this will choose the risk ratio (and bootstrap methods) by default
#' dat <- data.frame(y=rbinom(100, 1, 0.5), x1=runif(100), x2=runif(100), z=runif(100))
#' \dontrun{
#' qgcomp.glm.noboot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
#' set.seed(1231)
#' qgcomp.glm.boot(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
#' set.seed(1231)
#' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial())
#'
#' # automatically selects appropriate method when specifying rr or degree explicitly
#' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial(), rr=FALSE)
#' qgcomp(y ~ z + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=2, family=binomial(), rr=TRUE)
#' qgcomp(y ~ z + factor(x1) + factor(x2), degree=2, expnms = c('x1', 'x2'), data=dat, q=4,
#' family=binomial())
#'
#'
#' #survival objects
#' set.seed(50)
#' N=200
#' dat <- data.frame(time=(tmg <- pmin(.1,rweibull(N, 10, 0.1))),
#' d=1.0*(tmg<0.1), x1=runif(N), x2=runif(N), z=runif(N))
#' expnms=paste0("x", 1:2)
#' f = survival::Surv(time, d)~x1 + x2
#' qgcomp(f, expnms = expnms, data = dat)
#' # note if B or MCsize are set but the model is linear, an error will result
#' try(qgcomp(f, expnms = expnms, data = dat, B=10, MCsize=100))
#' # note that in the survival models, MCsize should be set to a large number
#' # such that results are repeatable (within an error tolerance such as 2 significant digits)
#' # if you run them under different seed values
#' f = survival::Surv(time, d)~x1 + x2 + x1:x2
#' qgcomp(f, expnms = expnms, data = dat, B=10, MCsize=100)
#' }
requireNamespace("survival")
# zero-inflated models are written with a `|` separating the count and
# zero-inflation parts of the formula (pscl::zeroinfl syntax)
iszip <- (length(grep("\\|", as.character(f))) > 0)
# survival outcome is detected from the LHS of the formula being a Surv object
issurv <- survival::is.Surv(eval(attr(terms(f, data = data), "variables")[[2]], envir = data))
# normalize `family` the same way stats::glm does: accept a name, a generator
# function, or a family object
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (is.null(family$family)) {
if (issurv) {
message("Survival model\n")
} else {
print(family)
stop("'family' not recognized")
}
}
# risk-ratio estimation only makes sense for binomial outcomes
if (!(family$family == "binomial") && rr) {
#warning("'rr=TRUE' is for binomial family only, setting rr=FALSE")
rr <- FALSE
}
# checknames() is TRUE when every term is a main-effect linear term, in which
# case the closed-form (noboot) estimators are valid; otherwise bootstrap
termlabels <- attr(terms(f, data = data), 'term.labels')
doboot <- !checknames(termlabels)
# dispatch to the most efficient estimator for the outcome/model type
if (issurv && doboot) {
res <- qgcomp.cox.boot(f=f, data=data, ...)
} else if (issurv && !doboot) {
res <- qgcomp.cox.noboot(f=f, data=data, ...)
} else if (!iszip && (rr || doboot)) {
res <- qgcomp.glm.boot(f=f, data=data, family=family, rr=rr, ...)
} else if (!iszip) {
res <- qgcomp.glm.noboot(f=f, data=data, family=family, ...)
} else if (iszip && doboot) {
res <- qgcomp.zi.boot(f=f, data=data, dist=family$family, rr=rr, ...)
} else if (iszip && !doboot) {
res <- qgcomp.zi.noboot(f=f, data=data, dist=family$family, ...)
} else {
stop('Unable to parse the model type: try one of the specific functions in the qgcomp package
e.g. qgcomp.glm.noboot, qgcomp.glm.boot, qgcomp.cox.noboot, qgcomp.cox.boot, qgcomp.zi.noboot,
qgcomp.zi.boot, ')
}
res
}
msm.predict <- function(object, newdata=NULL){
#' @title Secondary prediction method for the (non-survival) qgcomp MSM.
#'
#' @description this is an internal function called by
#' \code{\link[qgcomp]{qgcomp.glm.boot}},
#' but is documented here for clarity. Generally, users will not need to call
#' this function directly.
#' @description Get predicted values from a qgcompfit object from
#' \code{\link[qgcomp]{qgcomp.glm.boot}}.
#'
#' @details (Not usually called by user) Makes predictions from the MSM
#' (rather than the conditional g-computation
#' fit) from a "qgcompfit" object. Generally, this should not be used in
#' favor of the default \code{\link[qgcomp]{predict.qgcompfit}} function. This
#' function can only be used following the `qgcomp.glm.boot` function. For the
#' `qgcomp.glm.noboot` function, \code{\link[qgcomp]{predict.qgcompfit}} gives
#' identical inference to predicting from an MSM.
#'
#'
#' @param object "qgcompfit" object from `qgcomp.glm.boot` function
#' @param newdata (optional) new set of data (data frame) with a variable
#' called `psi` representing the joint exposure level of all exposures
#' under consideration
#' @export
#' @examples
#' set.seed(50)
#' dat <- data.frame(y=runif(50), x1=runif(50), x2=runif(50), z=runif(50))
#' obj <- qgcomp.glm.boot(y ~ z + x1 + x2 + I(z*x1), expnms = c('x1', 'x2'),
#' data=dat, q=4, B=10, seed=125)
#' dat2 <- data.frame(psi=seq(1,4, by=0.1))
#' summary(msm.predict(obj))
#' summary(msm.predict(obj, newdata=dat2))
# An MSM is only fitted by the bootstrap estimators, so guard against
# objects produced by the noboot functions.
if (!object$bootstrap) stop("only valid for results from qgcomp.glm.boot function")
# Predict on the response scale, either at the observed psi values or at
# user-supplied ones in `newdata`.
if (is.null(newdata)) {
predict(object$msmfit, type = 'response')
} else {
predict(object$msmfit, newdata = newdata, type = 'response')
}
}
# Backward-compatible alias: retained so older code calling qgcomp.boot() keeps working.
#' @rdname qgcomp.glm.boot
#' @export
qgcomp.boot <- qgcomp.glm.boot
# Backward-compatible alias: retained so older code calling qgcomp.noboot() keeps working.
#' @rdname qgcomp.glm.noboot
#' @export
qgcomp.noboot <- qgcomp.glm.noboot
# (Web-scraping artifact from the hosting site -- not part of the package source:)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.