knitr::opts_chunk$set(echo = TRUE)
setwd(here::here("tests/analyses"))

# Enable to generate high-res plots for the paper.
#knitr::opts_chunk$set(dpi = 300, fig.width = 4, fig.height = 4)

library(magrittr)
Data source: the 2015 NIEHS workshop on statistical approaches for assessing health effects of environmental chemical mixtures. See https://www.niehs.nih.gov/news/events/pastmtg/2015/statistical/index.cfm
data = rio::import("../../inst/extdata/niehs-2015-dataset1.xls")
str(data)

task = list(
  id = "obs",
  exposures = paste0("X", 1:7),
  outcome = "Y")
task$covariates = setdiff(names(data), c(task$id, task$exposures, task$outcome))
task
TODO: determine if we need to log-transform the data.
library(ggplot2)

for (exposure in task$exposures) {
  print(qplot(data[[exposure]]) +
          ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}

# Log-transform the exposures.
for (exposure in task$exposures) {
  data[[exposure]] = log(data[[exposure]])
}

summary(data)

for (exposure in task$exposures) {
  print(qplot(data[[exposure]]) +
          ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}

# Also log-transform Y? No: it doesn't appear log-normally distributed.
qplot(data[[task$outcome]])
qplot(log(data[[task$outcome]]))
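As a quick supplementary check (a minimal base-R sketch, not part of the original pipeline), normal Q-Q plots can make the Y vs. log(Y) comparison more concrete than the histograms above:

# Compare Y and log(Y) against a normal reference distribution.
op = par(mfrow = c(1, 2))
qqnorm(data[[task$outcome]], main = "Y")
qqline(data[[task$outcome]])
qqnorm(log(data[[task$outcome]]), main = "log(Y)")
qqline(log(data[[task$outcome]]))
par(op)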
library(tlmixture)

folds_cvtmle = 5L
estimators = c("SL.mean", "SL.glmnet", "SL.ranger")
cluster_exposures = FALSE
verbose = TRUE
quantiles_exposures = 4L
quantiles_mixtures = 3L

names(data)
class(data)

df = data[, !names(data) %in% task$id]
str(df)

# Normalize the exposures.
for (exposure in task$exposures) {
  df[[exposure]] = (df[[exposure]] - mean(df[[exposure]])) / sd(df[[exposure]])
}

# Review exposure distributions.
summary(df[, task$exposures])

# Compare to OLS.
reg = lm(as.formula(paste(task$outcome, "~ .")), data = df)
summary(reg)

# Create a noise variable z2 - temporary.
# TODO: fix tlmixture to work with 1 or 0 adjustment variables.
# This may be a bug in SuperLearner that comes up when doing propensity score
# estimation as part of CV-TMLE.
df$z2 = rnorm(nrow(df))

task$exposures

library(tmle)

# Thread count for the SuperLearner wrappers; override via options(sl.cores = n).
get_cores = function() getOption("sl.cores", RhpcBLASctl::get_num_cores())

dbarts2_50 = function(...) {
  tmle::tmle.SL.dbarts2(..., ntree = 50, nthread = get_cores())
}

ranger_fast = function(...) {
  SL.ranger(..., num.trees = 150, num.threads = get_cores())
}

lm2 = function(...) {
  # Suppress the noisy "prediction from rank-deficient fit may be misleading"
  # warnings from SL.lm.
  suppressWarnings(SL.lm(...))
}

mix_lib = c("lm2", "ranger_fast")

mix_sl_1 = function(...) mixture_backfit_sl(..., estimator_mixture = mix_lib)

library(sl3)

lrnr_glm <- make_learner(Lrnr_glm)
lrnr_mean <- make_learner(Lrnr_mean)

# Can't use ranger - it doesn't support offsets (presumably).
# lrnr_ranger100 <- make_learner(Lrnr_ranger, num.trees = 100)

# TODO: add offset support to Lrnr_glmnet.
lrnr_glmnet <- make_learner(Lrnr_glmnet)

lrnr_xgb <- make_learner(Lrnr_xgboost, nrounds = 100, nthread = get_cores(),
                         eta = 0.1)
lrnr_xgb2 <- make_learner(Lrnr_xgboost, nrounds = 40, nthread = get_cores(),
                          eta = 0.3, max_depth = 3L)

lrnr_hal_d3 = make_learner(Lrnr_hal9001)
lrnr_hal_d2 = make_learner(Lrnr_hal9001, max_degree = 2)
lrnr_hal_d1 = make_learner(Lrnr_hal9001, max_degree = 1)

sl3_screen_cor <- make_learner(Lrnr_screener_corP)
sl3_cor_glm <- make_learner(Pipeline, sl3_screen_cor, lrnr_glm)

stack <- make_learner(Stack,
                      lrnr_glm,
                      sl3_cor_glm,
                      lrnr_hal_d2,
                      lrnr_hal_d1,
                      lrnr_xgb,
                      lrnr_xgb2)

sl <- Lrnr_sl$new(learners = stack,
                  metalearner = make_learner(Lrnr_nnls, convex = TRUE))

estimator_sl3 = function(...) {
  mixture_score_sl3(...,
                    debug = TRUE,
                    estimator_mixture = sl$clone(),
                    estimator_confounders = sl$clone())
}
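Before the CV-TMLE run below, it can be worth a one-off check that the sl3 stack trains and predicts on this data at all. A minimal sketch (assuming `df` as constructed above; note that the HAL learners make even a single fit slow):

# Smoke test of the Super Learner stack (a sketch, not part of the pipeline).
check_task = sl3_Task$new(data = df,
                          covariates = setdiff(names(df), task$outcome),
                          outcome = task$outcome)
check_fit = sl$clone()$train(check_task)
# Residuals on the training data - just to confirm the fit is sane.
summary(df[[task$outcome]] - check_fit$predict(check_task))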
(covariates = setdiff(names(df), c(task$exposures, task$outcome)))

# Orthogonalize exposures per the manuscript algorithm.
df_ortho = df
for (exposure in task$exposures) {
  # TODO: use SuperLearner here (see the sketch below).
  reg = glm(as.formula(paste0(exposure, " ~ .")),
            data = df_ortho[, c(covariates, exposure)])
  # A' = A - E[A | W]
  df_ortho[[exposure]] = df_ortho[[exposure]] - reg$fitted.values
}
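A sketch of the TODO above: swap glm() for the SuperLearner package when estimating E[A | W], so the orthogonalization is not restricted to a linear fit. The library choice here is illustrative, and the result is written to a separate data frame so it doesn't clobber df_ortho:

# Sketch: flexible estimation of E[A | W] for the orthogonalization step.
library(SuperLearner)
df_ortho_sl = df
for (exposure in task$exposures) {
  sl_fit = SuperLearner(Y = df[[exposure]],
                        X = df[, covariates, drop = FALSE],
                        family = gaussian(),
                        SL.library = c("SL.mean", "SL.glm", "SL.glmnet"))
  # A' = A - E[A | W], using the ensemble's fitted values.
  df_ortho_sl[[exposure]] = df[[exposure]] - as.vector(sl_fit$SL.predict)
}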
set.seed(1, "L'Ecuyer-CMRG")

result = tlmixture(df_ortho,
                   outcome = task$outcome,
                   exposures = task$exposures,
                   quantiles_mixtures = 4,
                   estimator_outcome = c("SL.mean", "lm2", "ranger_fast"),
                   mixture_fn = estimator_sl3,
                   folds_cvtmle = 2,
                   verbose = TRUE)

save(result, file = "data/niehs-sim1-result.RData")

result
result$combined$groups_df

qplot(result$groups$mixture_df$all) + theme_minimal()
df2 = df
df2$z2 = NULL

res = mixture_backfit_sl(df2, task$outcome, task$exposures,
                         max_iterations = 5L, verbose = TRUE)
res

# Final iteration's mixture coefficients.
res$coefs_mixture[nrow(res$coefs_mixture), ]

summary(res$reg_mixture)
summary(res$reg_adjust)

# Check prediction.
preds = predict(res, data[, res$exposures])
dim(preds)
qplot(preds)

preds = predict(res$reg_mixture, data[, res$exposures], onlySL = TRUE)
str(preds)

result$groups$groups_df
res = mixture_backfit_sl3(df, task$outcome, task$exposures,
                          estimator_mixture = sl$clone(),
                          estimator_confounders = sl$clone(),
                          max_iterations = 5L,
                          debug = TRUE,
                          verbose = TRUE)

# Review the sl3 learner for the mixture.
res$reg_mixture

# Review the sl3 learner for confounder adjustment.
res$reg_adjust

# Check prediction.
preds = predict(res, data[, res$exposures])
qplot(preds) + theme_minimal()

# Try to predict the confounder regression.
preds2 = predict(res, df, type = "confounder")
qplot(preds2) + theme_minimal()

# Examine predictors of the mixture.
# One error here: X2 should be significant and positive.
reg = glm(preds ~ ., data = data[, res$exposures])
summary(reg)

# Examine predictors of the confounding adjustment.
# NOTE: z2 is a noise variable, so we shouldn't find it to be significant.
reg2 = glm(preds2 ~ ., data = df[, res$confounders])
summary(reg2)

# Try including all variables except the outcome.
reg3 = glm(preds2 ~ ., data = df[, !names(df) %in% c("Y")])
# The direction on the exposures looks generally swapped.
summary(reg3)

# TODO: need to make an sl3 task for prediction.
if (FALSE) {
  preds3 = res$reg_mixture$predict(data[, res$exposures])
}
summary(result$outcome_rescaled)
df[[result$outcome]] = result$outcome_rescaled

# Analyze the mixture distribution.
mix_result = analyze_mixture(df, result,
                             name = "Dataset 1",
                             reg_vars = task$covariates,
                             vars_desc = task$covariates)

# TODO: consider integrating plot_analysis() into analyze_mixture().
plot_analysis(mix_result)

# Average the Super Learner weights across CV-TMLE folds.
# (lapply avoids sapply's simplification, so rbind sees one vector per fold.)
weight_df = do.call(rbind, lapply(result$folds, `[[`, "weights"))
colMeans(weight_df)
# Run once and display as html.
display_mixture(mix_result)

# Run again, this time as latex.
options(knitr.table.format = "latex")
kab_tab = display_mixture(mix_result,
                          label = "mixture-data1",
                          caption = "Summary of mixture analysis for dataset 1")
cat(kab_tab, file = "data1-summary-table.tex")

# Restore the previous table format setting. Unfortunately we can't just
# assign NULL - we have to pick a specific value.
options(knitr.table.format = "html")
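If some other table format might have been set earlier in the session, a small variant (a sketch; the "html" fallback mirrors the restore value used above) saves and restores whatever was there:

# Save and restore the prior knitr table format around the latex export.
old_fmt = getOption("knitr.table.format", default = "html")
options(knitr.table.format = "latex")
cat(display_mixture(mix_result,
                    label = "mixture-data1",
                    caption = "Summary of mixture analysis for dataset 1"),
    file = "data1-summary-table.tex")
options(knitr.table.format = old_fmt)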
# Loop over each exposure and plot a smooth of that exposure vs. the mixture.
for (exposure_i in task$exposures) {
  gg_df = cbind.data.frame(df[, exposure_i, drop = FALSE],
                           result$groups$mixture_df)
  g = ggplot(data = gg_df, aes(x = .data[[exposure_i]], y = all)) +
    geom_smooth() +
    labs(x = exposure_i, y = "Mixture",
         title = paste("Mixture relationship of exposure", exposure_i)) +
    theme_minimal()
  print(g)
}

# Regress the mixture on exposures plus covariates.
reg = lm(result$groups$mixture_df$all ~ .,
         data = df[, c(task$exposures, task$covariates)])
summary(reg)

# Exposures only.
reg_v2 = lm(result$groups$mixture_df$all ~ ., data = df[, task$exposures])
summary(reg_v2)
data2 = rio::import("../../inst/extdata/niehs-2015-dataset2.xls")
str(data2)

names(data2) = tolower(names(data2))

task2 = list(
  id = "obs",
  exposures = paste0("x", 1:14),
  outcome = "y")
task2$covariates = setdiff(names(data2), c(task2$id, task2$exposures, task2$outcome))
task2
TODO: determine if we need to log-transform the data.
library(ggplot2)

for (exposure in task2$exposures) {
  print(qplot(data2[[exposure]]) +
          ggtitle(paste("Exposure:", exposure)) + theme_minimal())
}

# Skip log-transforming the data.
if (FALSE) {
  for (exposure in task2$exposures) {
    data2[[exposure]] = log(data2[[exposure]])
  }
  print(summary(data2))
  for (exposure in task2$exposures) {
    print(qplot(data2[[exposure]]) +
            ggtitle(paste("Exposure:", exposure)) + theme_minimal())
  }
}
library(tlmixture)
library(tmle)

folds_cvtmle = 5L
estimators = c("SL.mean", "lm2", "ranger_fast")
cluster_exposures = FALSE
verbose = TRUE
quantiles_exposures = 4L
quantiles_mixtures = 3L

names(data2)
class(data2)
(exposures = task2$exposures)

df2 = data2[, !names(data2) %in% task2$id]
str(df2)

lrnr_hal_d1_lassi = make_learner(Lrnr_hal9001, max_degree = 1,
                                 fit_type = "lassi")

stack2 <- make_learner(Stack,
                       lrnr_glm,
                       sl3_cor_glm,
                       lrnr_hal_d1_lassi,
                       lrnr_hal_d2,
                       lrnr_xgb,
                       lrnr_xgb2)

sl2 <- Lrnr_sl$new(learners = stack2,
                   metalearner = make_learner(Lrnr_nnls, convex = TRUE))

estimator_sl3_v2 = function(...) {
  mixture_score_sl3(...,
                    debug = TRUE,
                    max_iterations = 3L,
                    estimator_mixture = sl2$clone(),
                    estimator_confounders = sl2$clone())
}
(covariates = setdiff(names(df2), c(task2$exposures, task2$outcome)))

# Orthogonalize exposures per the manuscript algorithm.
df2_ortho = df2
for (exposure in task2$exposures) {
  # TODO: use SuperLearner here.
  reg = glm(as.formula(paste0(exposure, " ~ .")),
            data = df2_ortho[, c(covariates, exposure)])
  # A' = A - E[A | W]
  df2_ortho[[exposure]] = df2_ortho[[exposure]] - reg$fitted.values
}
set.seed(1, "L'Ecuyer-CMRG")

result2 = tlmixture(df2_ortho,
                    outcome = task2$outcome,
                    exposures = exposures,
                    quantiles_exposures = quantiles_exposures,
                    quantiles_mixtures = 4,
                    estimator_outcome = estimators,
                    cluster_exposures = cluster_exposures,
                    mixture_fn = estimator_sl3_v2,
                    folds_cvtmle = 5,
                    verbose = TRUE)

# TODO: compile mixture predictions from the test folds so that we can analyze
# them post-hoc (see the sketch below), e.g. result2$folds[[1]]$test_results.

result2
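A sketch for the TODO above, assuming each element of result2$folds carries a test_results data frame (the field name comes from the commented hint above and is otherwise an assumption):

# Pool per-fold test-set results for post-hoc analysis (sketch).
# Assumes result2$folds[[i]]$test_results exists with a common structure.
test_results_all = do.call(rbind, lapply(result2$folds, `[[`, "test_results"))
str(test_results_all)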
res2 = mixture_backfit_sl3(df2, task2$outcome, task2$exposures,
                           estimator_mixture = sl2$clone(),
                           estimator_confounders = sl2$clone(),
                           max_iterations = 3L,
                           debug = TRUE,
                           verbose = TRUE)

# Review the sl3 learner for the mixture.
res2$reg_mixture

# Review the sl3 learner for confounder adjustment.
res2$reg_adjust

# Check prediction.
preds_d2 = predict(res2, df2[, res2$exposures])
qplot(preds_d2) + theme_minimal()

# Try to predict the confounder regression.
preds2_d2 = predict(res2, df2, type = "confounder")
qplot(preds2_d2) + theme_minimal()

# Examine predictors of the mixture.
reg_d2 = glm(preds_d2 ~ ., data = data2[, res2$exposures])
summary(reg_d2)

# Examine predictors of the confounding adjustment.
reg2_d2 = glm(preds2_d2 ~ ., data = df2[, res2$confounders])
summary(reg2_d2)

# Confounder preds: include all variables except the outcome.
# (Note: the outcome is "y" after lowercasing the names, not "Y".)
reg3_d2 = glm(preds2_d2 ~ ., data = df2[, !names(df2) %in% task2$outcome])
# The direction on the exposures looks generally swapped.
summary(reg3_d2)

# Mixture preds: include all variables except the outcome.
reg4_d2 = glm(preds_d2 ~ ., data = df2[, !names(df2) %in% task2$outcome])
# The direction on the exposures again looks generally swapped.
summary(reg4_d2)

# TODO: need to make an sl3 task for prediction.
if (FALSE) {
  preds3_d2 = res2$reg_mixture$predict(data2[, res2$exposures])
}
summary(result2$outcome_rescaled)
data2[[result2$outcome]] = result2$outcome_rescaled

# Analyze the mixture distribution.
# TODO: fix "unknown column" error due to rescaling of the outcome variable.
mix_result2 = analyze_mixture(data2, result2,
                              name = "Dataset 2",
                              reg_vars = task2$covariates,
                              vars_desc = task2$covariates)

# TODO: get plot_analysis() working.
plot_analysis(mix_result2)
# Run once and display as html.
display_mixture(mix_result2)

# Run again, this time as latex.
options(knitr.table.format = "latex")
kab_tab = display_mixture(mix_result2,
                          label = "mixture-data2",
                          caption = "Summary of mixture analysis for dataset 2")
cat(kab_tab, file = "data2-summary-table.tex")

# Restore the previous table format setting.
options(knitr.table.format = "html")