knitr::opts_chunk$set(echo = TRUE)
# Set the working directory for all chunks. (setwd() inside a knitr chunk only
# lasts for that chunk, so use opts_knit$set(root.dir) instead.)
knitr::opts_knit$set(root.dir = here::here("tests/analyses"))

# Enable to generate high-res plots for the paper
#knitr::opts_chunk$set(dpi = 300, fig.width = 4, fig.height = 4)

library(magrittr)

Data source: the 2015 NIEHS workshop on statistical approaches for assessing health effects of environmental chemical mixtures. See https://www.niehs.nih.gov/news/events/pastmtg/2015/statistical/index.cfm

Simulated Dataset 1

Import data

data = rio::import("../../inst/extdata/niehs-2015-dataset1.xls")
str(data)

task = list(
  id = "obs",
  exposures = paste0("X", 1:7),
  outcome = "Y")

task$covariates = setdiff(names(data), c(task$id, task$exposures, task$outcome))
task

Exploratory data analysis

TODO: determine if we need to log-transform the data.

library(ggplot2)

for (exposure in task$exposures) {
  print(qplot(data[[exposure]]) + ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}


# Log-transform the exposures.
for (exposure in task$exposures) {
  data[[exposure]] = log(data[[exposure]])
}

summary(data)

for (exposure in task$exposures) {
  print(qplot(data[[exposure]]) + ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}

# Also log-transform Y?
# No, doesn't seem log-normally distributed.
qplot(data[[task$outcome]])
qplot(log(data[[task$outcome]]))
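As a rough numeric check, we can also compare skewness on the raw and log scales; a minimal sketch, using a simple moment-based skewness helper that is not part of the original analysis:

skewness = function(x) mean((x - mean(x))^3) / sd(x)^3
c(raw = skewness(data[[task$outcome]]),
  logged = skewness(log(data[[task$outcome]])))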

Prep to run

library(tlmixture)
#folds_cvtmle = 2L
folds_cvtmle = 5L
estimators = c("SL.mean", "SL.glmnet", "SL.ranger")
cluster_exposures = FALSE
verbose = TRUE
quantiles_exposures = 4L
quantiles_mixtures = 3L

names(data)
class(data)

#(exposures = paste0("X", 1:3))

df = data[, !names(data) %in% task$id]
str(df)

# Normalize the exposures.
for (exposure in task$exposures) {
  df[[exposure]] = (df[[exposure]] - mean(df[[exposure]])) / sd(df[[exposure]])
}

# Review exposure distributions.
summary(df[, task$exposures])
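A quick sanity check (sketch) that the standardization worked, i.e. means near 0 and SDs of 1:

round(sapply(df[, task$exposures], mean), 6)
sapply(df[, task$exposures], sd)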


# Compare to OLS.
reg = lm(as.formula(paste(task$outcome, "~ .")), data = df)
summary(reg)

# Create a noise variable z2 - temporary workaround.
# TODO: fix tlmixture to work with 1 or 0 adjustment variables.
# This may be a bug in SuperLearner that comes up when doing propensity score estimation as part of CV-TMLE.
df$z2 = rnorm(nrow(df))
#df$z2 = NULL

task$exposures

library(tmle)

get_cores = function() getOption("sl.cores", RhpcBLASctl::get_num_cores())

dbarts2_50 = function(...) {
  tmle::tmle.SL.dbarts2(..., ntree = 50, nthread = get_cores()) }

ranger_fast = function(...) SL.ranger(..., num.trees = 150, num.threads = get_cores())

#mix_lib = c("SL.mean", "SL.lm")#, "ranger_fast", "dbarts2_50") 
#mix_lib = c("SL.mean", "SL.lm", "SL.glmnet", "ranger_fast", "SL.rpart")#, "dbarts2_50") 
#mix_lib = c("SL.mean", "lm2")
#mix_lib = c("SL.mean", "lm2", "ranger_fast")
mix_lib = c("lm2", "ranger_fast")
mix_sl_1 = function(...) mixture_backfit_sl(..., estimator_mixture = mix_lib) 


lm2 = function(...) {
  # Hide all the damn "prediction from rank-deficient fit may be misleading" warnings.
  suppressWarnings(SL.lm(...))
}

library(sl3)

lrnr_glm <- make_learner(Lrnr_glm)
#lrnr_glm_fast <- make_learner(Lrnr_glm_fast)
lrnr_mean <- make_learner(Lrnr_mean)
# Can't use ranger - doesn't support offsets (presumably).
# lrnr_ranger100 <- make_learner(Lrnr_ranger, num.trees = 100)
# TODO: add offset support to Lrnr_glmnet.
lrnr_glmnet <- make_learner(Lrnr_glmnet)
lrnr_xgb <- make_learner(Lrnr_xgboost, nrounds = 100, nthread = get_cores(), eta = 0.1)
lrnr_xgb2 <- make_learner(Lrnr_xgboost, nrounds = 40, nthread = get_cores(), eta = 0.3, max_depth = 3L)
lrnr_hal_d3 = make_learner(Lrnr_hal9001)
lrnr_hal_d2 = make_learner(Lrnr_hal9001, max_degree = 2)
lrnr_hal_d1 = make_learner(Lrnr_hal9001, max_degree = 1)

# stack <- make_learner(Stack, lrnr_glm, lrnr_mean)
sl3_screen_cor <- make_learner(Lrnr_screener_corP)
sl3_cor_glm <- make_learner(Pipeline, sl3_screen_cor, lrnr_glm)

stack <- make_learner(Stack,
                      lrnr_glm,
                      sl3_cor_glm,
                      #lrnr_mean,
                      lrnr_hal_d2,
                      lrnr_hal_d1,
                      #lrnr_glm_fast,
                      lrnr_xgb,
                      lrnr_xgb2)#, lrnr_glmnet)
sl <- Lrnr_sl$new(learners = stack,
                  metalearner = make_learner(Lrnr_nnls, convex = TRUE))

estimator_sl3 = function(...) {
  #tlmixture::mixture_backfit_sl3(...,
  mixture_score_sl3(...,
                    debug = TRUE,
                    #max_iterations = 3L,
                    estimator_mixture = sl$clone(),
                    estimator_confounders = sl$clone())
}

Orthogonalize exposures

(covariates = setdiff(names(df), c(task$exposures, task$outcome)))

# Orthogonalize exposures per the manuscript algorithm.
df_ortho = df

for (exposure in task$exposures) {
  # TODO: use SuperLearner here.
  reg = glm(as.formula(paste0(exposure, " ~ .")), data = df_ortho[, c(covariates, exposure)])
  # A' = A - E[A | W]
  df_ortho[[exposure]] = df_ortho[[exposure]] - reg$fitted.values
}
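A minimal sketch of the TODO above: the same residualization with SuperLearner in place of the GLM. The library choice here is illustrative, so the block is disabled with if (FALSE).

if (FALSE) {
  library(SuperLearner)
  for (exposure in task$exposures) {
    sl_fit = SuperLearner(Y = df[[exposure]],
                          X = df[, covariates, drop = FALSE],
                          family = gaussian(),
                          SL.library = c("SL.mean", "SL.glm", "SL.glmnet"))
    # A' = A - E[A | W], with E[A | W] estimated flexibly.
    df_ortho[[exposure]] = df[[exposure]] - as.vector(sl_fit$SL.predict)
  }
}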

Run TLMixture

set.seed(1, "L'Ecuyer-CMRG")
result =
  tlmixture(df_ortho, # previously ran on df
            outcome = task$outcome,
            exposures = task$exposures,
            #quantiles_mixtures = quantiles_mixtures,
            quantiles_mixtures = 4,
            #estimator_outcome = estimators,
            estimator_outcome = c("SL.mean", "lm2", "ranger_fast"),
            #mixture_fn = mixture_backfit_glm,
            #mixture_fn = mixture_backfit_sl3,
            mixture_fn = estimator_sl3,
            #folds_cvtmle = folds_cvtmle,
            folds_cvtmle = 2,
            verbose = TRUE)

save(result, file = "data/niehs-sim1-result.RData")

result

result$combined$groups_df
qplot(result$groups$mixture_df$all) + theme_minimal()

Test backfit lm (old)

df2 = df
df2$z2 = NULL
#res = mixture_backfit_lm(df2, ...)
res = mixture_backfit_sl(df2,
                         task$outcome, task$exposures,
                         max_iterations = 5L,
                         verbose = TRUE)

res
res$coefs_mixture[nrow(res$coefs_mixture), ] # coefficients at the final iteration
summary(res$reg_mixture)
summary(res$reg_adjust)

# Check prediction, using df since the model was fit on the standardized exposures.
preds = predict(res, df[, res$exposures])
dim(preds)
qplot(preds)
preds = predict(res$reg_mixture, df[, res$exposures], onlySL = TRUE)
str(preds)
result$groups$groups_df

Run mixture_sl3 on full data (old)

res = mixture_backfit_sl3(df,
                          task$outcome, task$exposures,
                          estimator_mixture = sl$clone(),
                          estimator_confounders = sl$clone(),
                          #max_iterations = 10L,
                          max_iterations = 5L,
                          debug = TRUE,
                          verbose = TRUE)

# Review sl3 learner for mixture
res$reg_mixture
# Review sl3 learner for confounder adjustment
res$reg_adjust

# Check prediction (df holds the standardized exposures the model was fit on).
preds = predict(res, df[, res$exposures])
qplot(preds) + theme_minimal()

# Try to predict the confounder regression.
preds2 = predict(res, df, type = "confounder")
qplot(preds2) + theme_minimal()

# Examine predictors of the mixture.
# One error so far: X2 should be significant and positive.
reg = glm(preds ~ ., data = df[, res$exposures])
summary(reg)

# Examine predictors of the confounding adjustment.
reg2 = glm(preds2 ~ ., data = df[, res$confounders])
# NOTE: z2 is a noise variable, so we shouldn't find it to be significant.
summary(reg2)

# Try including all data, but exclude the outcome.
reg3 = glm(preds2 ~ ., data = df[, !names(df) %in% c("Y")])
# Looks like the direction on exposures is generally swapped.
summary(reg3)

# TODO: need to make an sl3 task for prediction.
if (FALSE) {
  preds3 = res$reg_mixture$predict(data[, res$exposures])
}

Review results

summary(result$outcome_rescaled)
df[[result$outcome]] = result$outcome_rescaled

# Analyze mixture distribution.
mix_result = analyze_mixture(df, result,
                             name = "Dataset 1",
                             reg_vars = task$covariates,
                             vars_desc = task$covariates)

# TODO: consider integrating plot_analysis() into analyze_mixture()
plot_analysis(mix_result)

weight_df = do.call(rbind, lapply(result$folds, `[[`, "weights"))
colMeans(weight_df)
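As a sketch (assuming weight_df columns are named by learner), the averaged weights can also be plotted:

weight_means = sort(colMeans(weight_df), decreasing = TRUE)
ggplot(data.frame(learner = factor(names(weight_means), levels = names(weight_means)),
                  weight = weight_means),
       aes(x = learner, y = weight)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Average learner weights across CV-TMLE folds",
       x = NULL, y = "Mean weight")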
display_mixture(mix_result)
# Run again, this time as latex.
options(knitr.table.format = "latex")
kab_tab = display_mixture(mix_result,
                          label = "mixture-data1",
                          caption = "Summary of mixture analysis for dataset 1")
cat(kab_tab, file = "data1-summary-table.tex")
# Restore the previous table format setting.
# Unfortunately we can't just assign NULL; we have to pick a specific value.
options(knitr.table.format = "html")
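A sketch of an alternative that avoids the manual save/restore, assuming the withr package is available: withr::with_options() scopes the option change and restores the previous value (even NULL) on exit.

if (FALSE) {
  withr::with_options(
    list(knitr.table.format = "latex"),
    cat(display_mixture(mix_result,
                        label = "mixture-data1",
                        caption = "Summary of mixture analysis for dataset 1"),
        file = "data1-summary-table.tex"))
}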

Explore mixture

# Loop over each exposure and plot a smooth (geom_smooth's loess/GAM) of that exposure vs. the mixture.
task$exposures

for (exposure_i in task$exposures) {

  gg_df = cbind.data.frame(df[, exposure_i, drop = FALSE],
                           result$groups$mixture_df)
  g = ggplot(data = gg_df, aes(x = .data[[exposure_i]], y = all)) +
    geom_smooth() +
    labs(y = "Mixture",
         title = paste("Mixture relationship of exposure", exposure_i)) +
    theme_minimal()
  print(g)
}

reg = lm(result$groups$mixture_df$all ~ .,
         data = df[, c(task$exposures, task$covariates)])
summary(reg)

reg_v2 = lm(result$groups$mixture_df$all ~ .,
         data = df[, task$exposures])
summary(reg_v2)

Simulated Dataset 2

Import data

data2 = rio::import("../../inst/extdata/niehs-2015-dataset2.xls")
str(data2)

names(data2) = tolower(names(data2))

task2 = list(
  id = "obs",
  exposures = paste0("x", 1:14),
  outcome = "y")

task2$covariates = setdiff(names(data2), c(task2$id, task2$exposures, task2$outcome))
task2

Exploratory data analysis

TODO: determine if we need to log-transform the data.

library(ggplot2)

for (exposure in task2$exposures) {
  print(qplot(data2[[exposure]]) + ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}


# Skip log-transforming the data.
if (FALSE) {
  for (exposure in task2$exposures) {
    data2[[exposure]] = log(data2[[exposure]])
  }

  print(summary(data2))

  for (exposure in task2$exposures) {
    print(qplot(data2[[exposure]]) + ggtitle(paste("Exposure:", exposure)) + theme_minimal())
  }
}

Prep to run

library(tlmixture)
library(tmle)
#folds_cvtmle = 2L
folds_cvtmle = 5L
#estimators = c("SL.mean", "SL.glmnet", "SL.ranger", "tmle.SL.dbarts2")
#estimators = c("SL.mean", "SL.glmnet", "SL.ranger")
estimators = c("SL.mean", "lm2", "ranger_fast")
cluster_exposures = FALSE
verbose = TRUE
quantiles_exposures = 4L
quantiles_mixtures = 3L

names(data2)
(exposures = task2$exposures)
class(data2)

task2$exposures

# Simpler version.
#(exposures = paste0("x", 1:3))

df2 = data2[, !names(data2) %in% task2$id]
str(df2)

# The noise-z2 workaround from dataset 1 is not needed here:
# dataset 2 already has multiple adjustment variables.
#df2$z2 = rnorm(nrow(df2))


lrnr_hal_d1_lassi = make_learner(Lrnr_hal9001, max_degree = 1, fit_type = "lassi")


stack2 <- make_learner(Stack,
                       lrnr_glm,
                       sl3_cor_glm,
                       #lrnr_mean,
                       lrnr_hal_d1_lassi,
                       lrnr_hal_d2,
                       lrnr_xgb,
                       lrnr_xgb2) #, lrnr_glmnet
sl2 <- Lrnr_sl$new(learners = stack2,
                   metalearner = make_learner(Lrnr_nnls, convex = TRUE))

estimator_sl3_v2 = function(...) {
  #tlmixture::mixture_backfit_sl3(...,
  mixture_score_sl3(...,
                    debug = TRUE,
                    max_iterations = 3L,
                    #max_iterations = 10L,
                    estimator_mixture = sl2$clone(),
                    estimator_confounders = sl2$clone())
}

Orthogonalize exposures

(covariates = setdiff(names(df2), c(task2$exposures, task2$outcome)))

# Orthogonalize exposures per the manuscript algorithm.
df2_ortho = df2

for (exposure in task2$exposures) {
  # TODO: use SuperLearner here.
  reg = glm(as.formula(paste0(exposure, " ~ .")), data = df2_ortho[, c(covariates, exposure)])
  # A' = A - E[A | W]
  df2_ortho[[exposure]] = df2_ortho[[exposure]] - reg$fitted.values
}
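A quick check (sketch) that the residualized exposures are now linearly uncorrelated with the covariates:

round(cor(df2_ortho[, task2$exposures], df2_ortho[, covariates]), 3)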

Run TLMixture

set.seed(1, "L'Ecuyer-CMRG")
result2 = tlmixture(df2_ortho, # previously ran on df2
                    outcome = task2$outcome,
                    exposures = exposures,
                    quantiles_exposures = quantiles_exposures,
                    #quantiles_mixtures = quantiles_mixtures,
                    quantiles_mixtures = 4,
                    estimator_outcome = estimators,
                    cluster_exposures = cluster_exposures,
                    #mixture_fn = mixture_backfit_sl3,
                    mixture_fn = estimator_sl3_v2,
                    #folds_cvtmle = folds_cvtmle,
                    folds_cvtmle = 5,
                    verbose = TRUE)

# TODO: compile mixture predictions from test folds so that we can analyze post-hoc.
# result$folds[[1]]$test_results
result2

Run mixture_sl3 on full data

res2 = mixture_backfit_sl3(df2,
                           task2$outcome, task2$exposures,
                           estimator_mixture = sl2$clone(),
                           estimator_confounders = sl2$clone(),
                           max_iterations = 3L,
                           debug = TRUE,
                           verbose = TRUE)

# Review sl3 learner for mixture
res2$reg_mixture
# Review sl3 learner for confounder adjustment
res2$reg_adjust

# Check prediction.
preds_d2 = predict(res2, df2[, res2$exposures])
qplot(preds_d2) + theme_minimal()

# Try to predict the confounder regression.
preds2_d2 = predict(res2, df2, type = "confounder")
qplot(preds2_d2) + theme_minimal()

# Examine predictors of the mixture.
# One error so far: x2 should be significant and positive.
reg_d2 = glm(preds_d2 ~ ., data = df2[, res2$exposures])
summary(reg_d2)

# Examine predictors of the confounding adjustment.
reg2_d2 = glm(preds2_d2 ~ ., data = df2[, res2$confounders])
# NOTE: unlike dataset 1, no noise variable was added here - the z covariates are real confounders.
summary(reg2_d2)

# Try including all data, but exclude the outcome.
# (Names were lowercased, so "Y" would not match - use task2$outcome.)
reg3_d2 = glm(preds2_d2 ~ ., data = df2[, !names(df2) %in% task2$outcome])
# Looks like the direction on exposures is generally swapped.
summary(reg3_d2)


# Mixture preds: try including all data, but exclude the outcome.
reg4_d2 = glm(preds_d2 ~ ., data = df2[, !names(df2) %in% task2$outcome])
# Looks like the direction on exposures is generally swapped.
summary(reg4_d2)
# TODO: need to make an sl3 task for prediction.
if (FALSE) {
  preds3_d2 = res2$reg_mixture$predict(data2[, res2$exposures])
}

Review results

summary(result2$outcome_rescaled)
data2[[result2$outcome]] = result2$outcome_rescaled
# Analyze mixture distribution.
# TODO: fix "unknown column" error due to rescaling of outcome variable.
mix_result2 = analyze_mixture(data2, result2,
                             name = "Dataset 2",
                             reg_vars = task2$covariates,
                             vars_desc = task2$covariates)

# TODO: get plot_analysis() working.
plot_analysis(mix_result2)
# Run once and display as html.
display_mixture(mix_result2)
# Run again, this time as latex.
options(knitr.table.format = "latex")
kab_tab = display_mixture(mix_result2,
                          label = "mixture-data2",
                          caption = "Summary of mixture analysis for dataset 2")
cat(kab_tab, file = "data2-summary-table.tex")
# Restore previous table format setting.
options(knitr.table.format = "html")
#getOption("knitr.table.format")

