knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(CausalGPS) library(ggplot2)
In this vignette, we present application of CausalGPS package on the Synthetic Medicare Data. In the dataset, we link the 2010 synthetic Medicare claims data to environmental exposures and potential confounders. The dataset is hosted on Harvard Dataverse [@khoshnevis_2022].
data("synthetic_us_2010") data <- synthetic_us_2010 knitr::kable(head((data)))
# transformers pow2 <- function(x) {x^2} pow3 <- function(x) {x^3} clog <- function(x) log(x+0.001) confounders <- names(data) confounders <- confounders[!(confounders %in% c("FIPS","Name","STATE", "STATE_CODE","cms_mortality_pct", "qd_mean_pm25"))]
confounders_s1 <- c("cs_poverty","cs_hispanic", "cs_black", "cs_ed_below_highschool", "cs_median_house_value", "cs_population_density", "cdc_mean_bmi","cdc_pct_nvsmoker", "gmet_mean_summer_tmmx", "gmet_mean_summer_rmx", "gmet_mean_summer_sph", "cms_female_pct", "region" ) study_data <- data[, c("qd_mean_pm25", confounders, "cms_mortality_pct")] study_data$region <- as.factor(study_data$region) study_data$cs_PIR <- study_data$cs_median_house_value/study_data$cs_household_income # Choose subset of data q1 <- stats::quantile(study_data$qd_mean_pm25,0.25) q2 <- stats::quantile(study_data$qd_mean_pm25,0.99) trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,], qd_mean_pm25 <= q2 & qd_mean_pm25 >= q1) trimmed_data$gmet_mean_summer_sph <- pow2(trimmed_data$gmet_mean_summer_sph) set.seed(172) pseudo_pop_1 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct, trimmed_data$qd_mean_pm25, data.frame(trimmed_data[, confounders_s1, drop=FALSE]), ci_appr = "matching", pred_model = "sl", gps_model = "parametric", bin_seq = NULL, trim_quantiles = c(0.0 , 1.0), optimized_compile = TRUE, use_cov_transform = TRUE, transformers = list("pow2","pow3","clog"), sl_lib = c("m_xgboost"), params = list(xgb_nrounds=c(17), xgb_eta=c(0.28)), nthread = 1, covar_bl_method = "absolute", covar_bl_trs = 0.1, covar_bl_trs_type = "mean", max_attempt = 1, matching_fun = "matching_l1", delta_n = 0.1, scale = 1) plot(pseudo_pop_1)
set.seed(172) pseudo_pop_2 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct, trimmed_data$qd_mean_pm25, data.frame(trimmed_data[, confounders_s1, drop=FALSE]), ci_appr = "matching", pred_model = "sl", gps_model = "parametric", bin_seq = NULL, trim_quantiles = c(0.0 , 1.0), optimized_compile = FALSE, use_cov_transform = TRUE, transformers = list("pow2","pow3","clog"), sl_lib = c("m_xgboost"), params = list(xgb_nrounds=c(17), xgb_eta=c(0.28)), nthread = 1, covar_bl_method = "absolute", covar_bl_trs = 0.1, covar_bl_trs_type = "mean", max_attempt = 1, matching_fun = "matching_l1", delta_n = 0.1, scale = 1) plot(pseudo_pop_2)
By activating optimized_compile
flag, we keep track of number of data samples, instead of aggregating them. Both approach should result in the same values, however, optimized_compile
version will consume less memory.
optimized_data_1 <- pseudo_pop_1$pseudo_pop[,c("w","gps","counter_weight")] nonoptimized_data_2 <- pseudo_pop_2$pseudo_pop[,c("w","gps","counter_weight")] print(paste("Number of rows of data in the optimized approach: ", nrow(optimized_data_1))) print(paste("Number of rows of data in the non-optimized approach: ", nrow(nonoptimized_data_2))) print(paste("Sum of data samples in the optimized approach: ", sum(optimized_data_1$counter_weight))) print(paste("Number of data in the non-optimized approach: ", length(nonoptimized_data_2$w))) # Replicate gps values of optimized approach expanded_opt_data_1 <- optimized_data_1[rep(seq_len(nrow(optimized_data_1)), optimized_data_1$counter_weight), 1:3] exp_gps_a_1 <- expanded_opt_data_1$gps gps_b_1 <- nonoptimized_data_2$gps differences <- sort(gps_b_1) - sort(exp_gps_a_1) print(paste("Sum of differences in gps values between optimized and ", "non-optimized approaches is: ", sum(differences)))
trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,], qd_mean_pm25 <= q2 & qd_mean_pm25 >= q1) set.seed(8967) pseudo_pop_3 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct, trimmed_data$qd_mean_pm25, data.frame(trimmed_data[, confounders_s1, drop=FALSE]), ci_appr = "matching", pred_model = "sl", gps_model = "non-parametric", bin_seq = NULL, trim_quantiles = c(0.0 , 1.0), optimized_compile = TRUE, use_cov_transform = TRUE, transformers = list("pow2","pow3","clog"), sl_lib = c("m_xgboost"), params = list(xgb_nrounds=c(12), xgb_eta=c(0.1)), nthread = 1, covar_bl_method = "absolute", covar_bl_trs = 0.1, covar_bl_trs_type = "mean", max_attempt = 1, matching_fun = "matching_l1", delta_n = 0.1, scale = 1) plot(pseudo_pop_3)
trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,], qd_mean_pm25 <= q2 & qd_mean_pm25 >= q1) trimmed_data$cs_poverty <- pow2(trimmed_data$cs_poverty) set.seed(672) pseudo_pop_4 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct, trimmed_data$qd_mean_pm25, data.frame(trimmed_data[, confounders_s1, drop=FALSE]), ci_appr = "weighting", pred_model = "sl", gps_model = "parametric", bin_seq = NULL, trim_quantiles = c(0.0 , 1.0), optimized_compile = TRUE, use_cov_transform = TRUE, transformers = list("pow2","pow3","clog"), sl_lib = c("m_xgboost"), params = list(xgb_nrounds=c(35), xgb_eta=c(0.14)), nthread = 1, covar_bl_method = "absolute", covar_bl_trs = 0.1, covar_bl_trs_type = "mean", max_attempt = 1, matching_fun = "matching_l1", delta_n = 0.1, scale = 1) plot(pseudo_pop_4)
In the previous examples, we passed specific parameters for estimating GPS values. Achieving acceptable covariate balance can be computed by searching for the most appropriate parameters and might not be a simple task. This package uses transformers on the features to get an acceptable covariate balance. The following parameters are directly related to searching for an acceptable covariate balance.
mean
, median
, or maximal
value of features correlation, which is defined in covar_bl_trs_type.xgb_nrounds=seq(1,100)
in the parameters, nround
parameter for xgboost
trainer will be selected a number between 1 and 100 at random, at each iteration.Optimized_compile: True
Search domain:
transformers
: pow2
, pow3
, clog
.nround
for xgboost
: 10-100.eta
for xgboost
: 0.1-0.5.max_attempt
: 5.covar_bl_trs
: 0.08.covar_bl_trs_type
: mean
trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,], qd_mean_pm25 <= q2 & qd_mean_pm25 >= q1) set.seed(328) pseudo_pop_5 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct, trimmed_data$qd_mean_pm25, data.frame(trimmed_data[, confounders_s1, drop=FALSE]), ci_appr = "matching", pred_model = "sl", gps_model = "non-parametric", bin_seq = NULL, trim_quantiles = c(0.0 , 1.0), optimized_compile = TRUE, use_cov_transform = TRUE, transformers = list("pow2","pow3","clog"), sl_lib = c("m_xgboost"), params = list(xgb_nrounds=seq(10, 100, 1), xgb_eta=seq(0.1,0.5,0.01)), nthread = 1, covar_bl_method = "absolute", covar_bl_trs = 0.08, covar_bl_trs_type = "mean", max_attempt = 5, matching_fun = "matching_l1", delta_n = 0.1, scale = 1) plot(pseudo_pop_5)
In this example, after 5 attempts, we could not find a pseudo population that can satisfy the covariate balance test.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.