In wxwx1993/GPSmatching: Matching on Generalized Propensity Scores with Continuous Exposures

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(CausalGPS)
library(ggplot2)

In this vignette, we present application of CausalGPS package on the Synthetic Medicare Data. In the dataset, we link the 2010 synthetic Medicare claims data to environmental exposures and potential confounders. The dataset is hosted on Harvard Dataverse [@khoshnevis_2022].

Load Data and Preprocessing

data("synthetic_us_2010")
data <- synthetic_us_2010
knitr::kable(head((data)))

# transformers
pow2 <- function(x) {x^2}
pow3 <- function(x) {x^3}
clog <- function(x) log(x+0.001)

confounders <- names(data)
confounders <- confounders[!(confounders %in% c("FIPS","Name","STATE", 
                                                "STATE_CODE","cms_mortality_pct",
                                                "qd_mean_pm25"))]

Examples of Generating Pseudo Population

Scenario 1

Causal Inference: Matching
GPS model: Parametric
Optimized_compile: True

confounders_s1 <- c("cs_poverty","cs_hispanic",
                   "cs_black",
                   "cs_ed_below_highschool",
                   "cs_median_house_value",
                   "cs_population_density",
                   "cdc_mean_bmi","cdc_pct_nvsmoker",
                   "gmet_mean_summer_tmmx",
                   "gmet_mean_summer_rmx",
                   "gmet_mean_summer_sph",
                   "cms_female_pct", "region"
)

study_data <- data[, c("qd_mean_pm25", confounders, "cms_mortality_pct")]
study_data$region <- as.factor(study_data$region)
study_data$cs_PIR <- study_data$cs_median_house_value/study_data$cs_household_income

# Choose subset of data
q1 <- stats::quantile(study_data$qd_mean_pm25,0.25)
q2 <- stats::quantile(study_data$qd_mean_pm25,0.99)
trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,],
                       qd_mean_pm25 <= q2  & qd_mean_pm25 >= q1)

trimmed_data$gmet_mean_summer_sph <- pow2(trimmed_data$gmet_mean_summer_sph)

set.seed(172)
pseudo_pop_1 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct,
                                    trimmed_data$qd_mean_pm25,
                                    data.frame(trimmed_data[, confounders_s1,
                                                            drop=FALSE]),
                                    ci_appr = "matching",
                                    pred_model = "sl",
                                    gps_model = "parametric",
                                    bin_seq = NULL,
                                    trim_quantiles = c(0.0 ,
                                                       1.0),
                                    optimized_compile = TRUE,
                                    use_cov_transform = TRUE,
                                    transformers = list("pow2","pow3","clog"),
                                    sl_lib = c("m_xgboost"),
                                    params = list(xgb_nrounds=c(17),
                                                  xgb_eta=c(0.28)),
                                    nthread = 1,
                                    covar_bl_method = "absolute",
                                    covar_bl_trs = 0.1,
                                    covar_bl_trs_type = "mean",
                                    max_attempt = 1,
                                    matching_fun = "matching_l1",
                                    delta_n = 0.1,
                                    scale = 1)

plot(pseudo_pop_1)

Scenario 2

Causal Inference: Matching
GPS model: Parametric
Optimized_compile: False

set.seed(172)
pseudo_pop_2 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct,
                                    trimmed_data$qd_mean_pm25,
                                    data.frame(trimmed_data[, confounders_s1,
                                                            drop=FALSE]),
                                    ci_appr = "matching",
                                    pred_model = "sl",
                                    gps_model = "parametric",
                                    bin_seq = NULL,
                                    trim_quantiles = c(0.0 ,
                                                       1.0),
                                    optimized_compile = FALSE,
                                    use_cov_transform = TRUE,
                                    transformers = list("pow2","pow3","clog"),
                                    sl_lib = c("m_xgboost"),
                                    params = list(xgb_nrounds=c(17),
                                                  xgb_eta=c(0.28)),
                                    nthread = 1,
                                    covar_bl_method = "absolute",
                                    covar_bl_trs = 0.1,
                                    covar_bl_trs_type = "mean",
                                    max_attempt = 1,
                                    matching_fun = "matching_l1",
                                    delta_n = 0.1,
                                    scale = 1)

plot(pseudo_pop_2)

By activating optimized_compile flag, we keep track of number of data samples, instead of aggregating them. Both approach should result in the same values, however, optimized_compile version will consume less memory.

optimized_data_1 <- pseudo_pop_1$pseudo_pop[,c("w","gps","counter_weight")]
nonoptimized_data_2 <- pseudo_pop_2$pseudo_pop[,c("w","gps","counter_weight")]


print(paste("Number of rows of data in the optimized approach: ",
            nrow(optimized_data_1)))

print(paste("Number of rows of data in the non-optimized approach: ",
            nrow(nonoptimized_data_2)))

print(paste("Sum of data samples in the optimized approach: ",
            sum(optimized_data_1$counter_weight)))
print(paste("Number of data in the non-optimized approach: ",
            length(nonoptimized_data_2$w)))

# Replicate gps values of optimized approach
expanded_opt_data_1 <- optimized_data_1[rep(seq_len(nrow(optimized_data_1)),
                                            optimized_data_1$counter_weight), 1:3]

exp_gps_a_1 <- expanded_opt_data_1$gps
gps_b_1 <- nonoptimized_data_2$gps

differences <- sort(gps_b_1) - sort(exp_gps_a_1)

print(paste("Sum of differences in gps values between optimized and ", 
            "non-optimized approaches is: ",
            sum(differences)))

Scenario 3

Causal Inference: Matching
GPS model: Non-Parametric
Optimized_compile: True

trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,],
                       qd_mean_pm25 <= q2  & qd_mean_pm25 >= q1)

set.seed(8967)
pseudo_pop_3 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct,
                                      trimmed_data$qd_mean_pm25,
                                      data.frame(trimmed_data[, confounders_s1,
                                                              drop=FALSE]),
                                      ci_appr = "matching",
                                      pred_model = "sl",
                                      gps_model = "non-parametric",
                                      bin_seq = NULL,
                                      trim_quantiles = c(0.0 ,
                                                         1.0),
                                      optimized_compile = TRUE,
                                      use_cov_transform = TRUE,
                                      transformers = list("pow2","pow3","clog"),
                                      sl_lib = c("m_xgboost"),
                                      params = list(xgb_nrounds=c(12),
                                                    xgb_eta=c(0.1)),
                                      nthread = 1,
                                      covar_bl_method = "absolute",
                                      covar_bl_trs = 0.1,
                                      covar_bl_trs_type = "mean",
                                      max_attempt = 1,
                                      matching_fun = "matching_l1",
                                      delta_n = 0.1,
                                      scale = 1)

plot(pseudo_pop_3)

Scenario 4

Causal Inference: Weighting
GPS model: Parametric
Optimized_compile: N/A

trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,],
                       qd_mean_pm25 <= q2  & qd_mean_pm25 >= q1)

trimmed_data$cs_poverty <- pow2(trimmed_data$cs_poverty)

set.seed(672)
pseudo_pop_4 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct,
                                      trimmed_data$qd_mean_pm25,
                                      data.frame(trimmed_data[, confounders_s1,
                                                              drop=FALSE]),
                                      ci_appr = "weighting",
                                      pred_model = "sl",
                                      gps_model = "parametric",
                                      bin_seq = NULL,
                                      trim_quantiles = c(0.0 ,
                                                         1.0),
                                      optimized_compile = TRUE,
                                      use_cov_transform = TRUE,
                                      transformers = list("pow2","pow3","clog"),
                                      sl_lib = c("m_xgboost"),
                                      params = list(xgb_nrounds=c(35),
                                                    xgb_eta=c(0.14)),
                                      nthread = 1,
                                      covar_bl_method = "absolute",
                                      covar_bl_trs = 0.1,
                                      covar_bl_trs_type = "mean",
                                      max_attempt = 1,
                                      matching_fun = "matching_l1",
                                      delta_n = 0.1,
                                      scale = 1)

plot(pseudo_pop_4)

Covariate Balance

In the previous examples, we passed specific parameters for estimating GPS values. Achieving acceptable covariate balance can be computed by searching for the most appropriate parameters and might not be a simple task. This package uses transformers on the features to get an acceptable covariate balance. The following parameters are directly related to searching for an acceptable covariate balance.

covar_bl_trs: Is the acceptable threshold to stop searching. It can be computed either by mean, median, or maximal value of features correlation, which is defined in covar_bl_trs_type.
params: In different iterations, we choose a parameter at random from the provided list. For example, by xgb_nrounds=seq(1,100) in the parameters, nround parameter for xgboost trainer will be selected a number between 1 and 100 at random, at each iteration.
transformers: After each iteration, we choose a feature with the highest correlation and apply a transformer from the provided list. All transformers should be applied to a feature before reapplying the same transformer on the same feature.
max_attempt: Number of test iteration. If the covar_bl_trs is not met, the search will stop after max_attempt iteration and will return the best found population.

Scenario 5

Causal Inference: Matching + searching for acceptable covariate balance
GPS model: Non-Parametric
Optimized_compile: True
Search domain:
transformers: pow2, pow3, clog.
nround for xgboost: 10-100.
eta for xgboost: 0.1-0.5.
max_attempt: 5.
covar_bl_trs: 0.08.
covar_bl_trs_type: mean

trimmed_data <- subset(study_data[stats::complete.cases(study_data) ,],
                       qd_mean_pm25 <= q2  & qd_mean_pm25 >= q1)

set.seed(328)
pseudo_pop_5 <- generate_pseudo_pop(trimmed_data$cms_mortality_pct,
                                      trimmed_data$qd_mean_pm25,
                                      data.frame(trimmed_data[, confounders_s1,
                                                              drop=FALSE]),
                                      ci_appr = "matching",
                                      pred_model = "sl",
                                      gps_model = "non-parametric",
                                      bin_seq = NULL,
                                      trim_quantiles = c(0.0 ,
                                                         1.0),
                                      optimized_compile = TRUE,
                                      use_cov_transform = TRUE,
                                      transformers = list("pow2","pow3","clog"),
                                      sl_lib = c("m_xgboost"),
                                      params = list(xgb_nrounds=seq(10, 100, 1),
                                                    xgb_eta=seq(0.1,0.5,0.01)),
                                      nthread = 1,
                                      covar_bl_method = "absolute",
                                      covar_bl_trs = 0.08,
                                      covar_bl_trs_type = "mean",
                                      max_attempt = 5,
                                      matching_fun = "matching_l1",
                                      delta_n = 0.1,
                                      scale = 1)

plot(pseudo_pop_5)

In this example, after 5 attempts, we could not find a pseudo population that can satisfy the covariate balance test.

References

wxwx1993/GPSmatching documentation built on March 1, 2023, 9:32 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

wxwx1993/GPSmatching
Matching on Generalized Propensity Scores with Continuous Exposures

In wxwx1993/GPSmatching: Matching on Generalized Propensity Scores with Continuous Exposures

Load Data and Preprocessing

Examples of Generating Pseudo Population

Scenario 1

Scenario 2

Scenario 3

Scenario 4

Covariate Balance

Scenario 5

References

R Package Documentation

Browse R Packages

We want your feedback!

wxwx1993/GPSmatching Matching on Generalized Propensity Scores with Continuous Exposures

In wxwx1993/GPSmatching: Matching on Generalized Propensity Scores with Continuous Exposures

Load Data and Preprocessing

Examples of Generating Pseudo Population

Scenario 1

Scenario 2

Scenario 3

Scenario 4

Covariate Balance

Scenario 5

References

R Package Documentation

Browse R Packages

We want your feedback!

wxwx1993/GPSmatching
Matching on Generalized Propensity Scores with Continuous Exposures