qgcomp-basic-vignette.R
In qgcomp: Quantile G-Computation

## ----invisibles, echo=FALSE, results='markup', message=FALSE------------------
library("knitr")
#library("gWQS")

## ----first step, echo=TRUE, results='markup', message=FALSE-------------------
library("qgcomp")
set.seed(543210)
qdat = simdata_quantized(n=5000, outcomtype="continuous", cor=c(.95, -0.3), b0=0, coef=c(0.25, -0.1, 0.05), q=4)
head(qdat)
cor(qdat[,c("x1", "x2", "x3")])
qgcomp(y~x1+x2+x3, expnms=c("x1", "x2", "x3"), data=qdat)

## ----metals data, echo=TRUE, results='markup', message=FALSE------------------
library("ggplot2")
data("metals", package="qgcomp")
head(metals)

## ----linear model and runtime, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
# we save the names of the mixture variables in the variable "Xnm"
Xnm <- c(
    'arsenic','barium','cadmium','calcium','chromium','copper',
    'iron','lead','magnesium','manganese','mercury','selenium','silver',
    'sodium','zinc'
)
covars = c('nitrate','nitrite','sulfate','ph', 'total_alkalinity','total_hardness')



# Example 1: linear model
# Run the model and save the results "qc.fit"
system.time(qc.fit <- qgcomp.glm.noboot(y~.,dat=metals[,c(Xnm, 'y')], family=gaussian()))
#   user  system elapsed 
#  0.011   0.002   0.018 

# contrasting other methods with computational speed
# WQS regression (v3.0.1 of gWQS package)
#system.time(wqs.fit <- gWQS::gwqs(y~wqs,mix_name=Xnm, data=metals[,c(Xnm, 'y')], family="gaussian"))
#   user  system elapsed 
# 35.775   0.124  36.114 

# Bayesian kernel machine regression (note that the number of iterations here would 
#  need to be >5,000, at minimum, so this underestimates the run time by a factor
#  of 50+
#system.time(bkmr.fit <- kmbayes(y=metals$y, Z=metals[,Xnm], family="gaussian", iter=100))
#   user  system elapsed 
# 81.644   4.194  86.520 

## ----linear model and runtimeb, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
# View results: scaled coefficients/weights and statistical inference about
# mixture effect
qc.fit

## ----linear model and runtime c, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
# quantized data
head(qc.fit$qx)

## ----linear model and runtime d, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
# regression with quantized data
qc.fit$qx$y = qc.fit$fit$data$y # first bring outcome back into the quantized data
newfit <- lm(y ~ arsenic_q + barium_q + cadmium_q + calcium_q + chromium_q + copper_q + 
    iron_q + lead_q + magnesium_q + manganese_q + mercury_q + selenium_q + 
    silver_q + sodium_q + zinc_q, data=qc.fit$qx)
newfit

## ----linear model and runtime e, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
sum(newfit$coefficients[-1]) # sum of all coefficients excluding intercept and confounders, if any
coef(qc.fit) # overall effect and intercept from qgcomp fit

## ----logistic qgcomp, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----

qc.fit2 <- qgcomp.glm.noboot(disease_state~., expnms=Xnm, 
          data = metals[,c(Xnm, 'disease_state')], family=binomial(), 
          q=4)
qcboot.fit2 <- qgcomp.glm.boot(disease_state~., expnms=Xnm, 
          data = metals[,c(Xnm, 'disease_state')], family=binomial(), 
          q=4, B=10,# B should be 200-500+ in practice
          seed=125, rr=FALSE)
qcboot.fit2b <- qgcomp.glm.boot(disease_state~., expnms=Xnm, 
          data = metals[,c(Xnm, 'disease_state')], family=binomial(), 
          q=4, B=10,# B should be 200-500+ in practice
          seed=125, rr=TRUE)

## ----logistic qgcompb, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qc.fit2

## ----logistic qgcompc, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qcboot.fit2

## ----logistic qgcompd, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qcboot.fit2b

## ----adj4cov-a, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----

qc.fit3 <- qgcomp.glm.noboot(y ~ mage35 + arsenic + barium + cadmium + calcium + chloride + 
                           chromium + copper + iron + lead + magnesium + manganese + 
                           mercury + selenium + silver + sodium + zinc,
                         expnms=Xnm,
                         metals, family=gaussian(), q=4)
qc.fit3

plot(qc.fit3)


## ----adj4cov-b, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qcboot.fit3 <- qgcomp.glm.boot(y ~ mage35 + arsenic + barium + cadmium + calcium + chloride + 
                           chromium + copper + iron + lead + magnesium + manganese + 
                           mercury + selenium + silver + sodium + zinc,
                         expnms=Xnm,
                         metals, family=gaussian(), q=4, B=10,# B should be 200-500+ in practice
                         seed=125)
qcboot.fit3
qcee.fit3 <- qgcomp.glm.ee(y ~ mage35 + arsenic + barium + cadmium + calcium + chloride + 
                           chromium + copper + iron + lead + magnesium + manganese + 
                           mercury + selenium + silver + sodium + zinc,
                         expnms=Xnm,
                         metals, family=gaussian(), q=4)
qcee.fit3

## ----adj4cov-c, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qgcomp:::modelbound.ee(qcee.fit3)
plot(qcee.fit3, pointwiseref = 3, flexfit = FALSE, modelbound=TRUE)
plot(qcboot.fit3, pointwiseref = 3, flexfit = FALSE)

## ----adj4cov-d, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
pointwisebound.boot(qcboot.fit3, pointwiseref=3)
qgcomp:::modelbound.boot(qcboot.fit3)

## ----n-lin-non-hom-intro, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----

qcboot.fit4 <- qgcomp(y~. + .^2,
                         expnms=Xnm,
                         metals[,c(Xnm, 'y')], family=gaussian(), q=4, B=10, seed=125)
plot(qcboot.fit4)

## ----ovrl-n-lin, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----

qcboot.fit5 <- qgcomp(y~. + .^2,
                         expnms=Xnm,
                         metals[,c(Xnm, 'y')], family=gaussian(), q=4, degree=2, 
                      B=10, rr=FALSE, seed=125)
plot(qcboot.fit5)
qcee.fit5b <- qgcomp.glm.ee(y~. + .^2,
                         expnms=Xnm,
                         metals[,c(Xnm, 'y')], family=gaussian(), q=4, degree=2, 
                         rr=FALSE)
plot(qcee.fit5b)

## ----ovrl-n-linb, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
modelbound.boot(qcboot.fit5)
pointwisebound.boot(qcboot.fit5)
pointwisebound.noboot(qcee.fit5b)

## ----ovrl-n-lin-psi interp, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
qcboot.fit5

## ----graf-n-lin-1, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
library(splines)
# find all correlations > 0.6 (this is an arbitrary choice)
cormat = cor(metals[,Xnm])
idx = which(cormat>0.6 & cormat <1.0, arr.ind = TRUE)
newXnm = unique(rownames(idx)) # iron, lead, and cadmium


qc.fit6lin <- qgcomp.glm.boot(y ~ iron + lead + cadmium + 
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=10)

qc.fit6nonlin <- qgcomp.glm.boot(y ~ bs(iron) + bs(cadmium) + bs(lead) +
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=10, degree=2)

qc.fit6nonhom <- qgcomp.glm.boot(y ~ bs(iron)*bs(lead) + bs(iron)*bs(cadmium) + bs(lead)*bs(cadmium) +
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=10, degree=3)

## ----graf-n-lin-1b, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
pl_fit6lin <- plot(qc.fit6lin, suppressprint = TRUE, pointwiseref = 4)
pl_fit6lin + coord_cartesian(ylim=c(-0.75, .75)) + 
  ggtitle("Linear fit: mixture of iron, lead, and cadmium")

## ----graf-n-lin-2, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
pl_fit6nonlin <- plot(qc.fit6nonlin, suppressprint = TRUE, pointwiseref = 4)
pl_fit6nonlin + coord_cartesian(ylim=c(-0.75, .75)) + 
  ggtitle("Non-linear fit: mixture of iron, lead, and cadmium")

## ----graf-n-lin 3, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
pl_fit6nonhom <- plot(qc.fit6nonhom, suppressprint = TRUE, pointwiseref = 4)
pl_fit6nonhom + coord_cartesian(ylim=c(-0.75, .75)) + 
  ggtitle("Non-linear, non-homogeneous fit: mixture of iron, lead, and cadmium")

## ----grafwarn, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
qc.overfit <- qgcomp.glm.boot(y ~ bs(iron) + bs(cadmium) + bs(lead) +
                         mage35 + bs(arsenic) + bs(magnesium) + bs(manganese) + bs(mercury) + 
                         bs(selenium) + bs(silver) + bs(sodium) + bs(zinc),
                         expnms=Xnm,
                         metals, family=gaussian(), q=8, B=10, degree=1)
qc.overfit
plot(qc.overfit, pointwiseref = 5)

## ----grafwarn-2, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
plot(qc.overfit, flexfit = FALSE, pointwiseref = 5)

## ----n-lin-exs, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
qc.fit7a <- qgcomp.glm.boot(y ~ factor(iron) + lead + cadmium + 
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=20, deg=2)
# underlying fit
summary(qc.fit7a$fit)$coefficients
plot(qc.fit7a)

## ----n-lin-exs-2, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
qc.fit7b <- qgcomp.glm.boot(y ~ factor(iron)*factor(lead) + cadmium + 
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=10, deg=3)
# underlying fit
#summary(qc.fit7b$fit)$coefficients
plot(qc.fit7b)

## ----n-lin-exs-3, results='markup', fig.show='hold', fig.height=3, fig.width=7.5, cache=FALSE----
qc.fit7c <- qgcomp.glm.boot(y ~ I(iron>4)*I(lead>4) + cadmium + 
                         mage35 + arsenic + magnesium + manganese + mercury + 
                         selenium + silver + sodium + zinc,
                         expnms=newXnm,
                         metals, family=gaussian(), q=8, B=10, deg=2)
# underlying fit
summary(qc.fit7c$fit)$coefficients
plot(qc.fit7c)

## ----splines, results='markup', fig.show='hold', fig.height=5, fig.width=7.5, cache=FALSE----
AIC(qc.fit6lin$fit)
AIC(qc.fit6nonlin$fit)
AIC(qc.fit6nonhom$fit)

BIC(qc.fit6lin$fit)
BIC(qc.fit6nonlin$fit)
BIC(qc.fit6nonhom$fit)