In audreyrenson/didgformula: The difference-in-differences g-formula

library(tidyverse)
library(parallel)
library(didgformula)
library(kableExtra)
library(glue)

cores = detectCores()/2


set.seed(5)
nsims=5
Tt=5
Beta = generate_parameters(Tt, mu_Beta_A = c(.2,.2,.2,.2,-.4))
truth = estimate_truth(df_po = generate_data(1e7, Tt, Beta, TRUE), Tt, type = 'levels')
gc()
#>           used  (Mb) gc trigger   (Mb)  max used   (Mb)
#> Ncells 2424328 129.5    4536298  242.3   3412841  182.3
#> Vcells 4989007  38.1  530150331 4044.8 549990717 4196.1


onesim = function(n) {
  df = generate_data(n, Tt, Beta) 


  #num_mods = fit_treatment_models(df, rhs_formula = '~1', Tt=Tt, freq_w=1)
  den_mods_true = fit_treatment_models(df, rhs_formula = '~W1{t}+W2{t}+I(W2{t}^2)', Tt=Tt, freq_w =1)
  den_mods_false = fit_treatment_models(df, rhs_formula = '~W1{t}+W2{t}', Tt=Tt, freq_w =1)
  weights_true = cbind(1, calc_weights(denominator = pred_treatment_models(df, den_mods_true),
                                       numerator   = 1))
  weights_false = cbind(1, calc_weights(denominator = pred_treatment_models(df, den_mods_false),
                                        numerator   = 1))


  results = list(
    iptw_true = iptw_pipeline(df, Tt, '~W1{t}+W2{t}+I(W2{t}^2)', '~1',models = FALSE),
    iptw_gfal = iptw_pipeline(df, Tt, '~W1{t}+W2{t}', '~1',models = FALSE),
    ice_true = ice_pipeline(df, '~W1{t}+W2{t}+I(W2{t}^2)', '~W1{t-1}+W2{t-1}+I(W2{t-1}^2)','~W1{k}+W2{k}+I(W2{k}^2)',Tt,models = FALSE),
    ice_qfal = ice_pipeline(df, '~W1{t}+W2{t}', '~W1{t-1}+W2{t-1}','~W1{k}+W2{k}',Tt,models = FALSE),
    tmle_true = ice_pipeline(df, '~W1{t}+W2{t}+I(W2{t}^2)', '~W1{t-1}+W2{t-1}+I(W2{t-1}^2)','~W1{k}+W2{k}+I(W2{k}^2)',Tt,models = FALSE, tmle=TRUE, weights=weights_true),
    tmle_gfal = ice_pipeline(df, '~W1{t}+W2{t}+I(W2{t}^2)', '~W1{t-1}+W2{t-1}+I(W2{t-1}^2)','~W1{k}+W2{k}+I(W2{k}^2)',Tt,models = FALSE, tmle=TRUE, weights=weights_false),
    tmle_qfal = ice_pipeline(df, '~W1{t}+W2{t}          ',  '~W1{t-1}+W2{t-1}            ', '~W1{k}+W2{k}          ',Tt,models = FALSE, tmle=TRUE, weights=weights_true),
    tmle_bfal = ice_pipeline(df, '~W1{t}+W2{t}          ',  '~W1{t-1}+W2{t-1}            ', '~W1{k}+W2{k}          ',Tt,models = FALSE, tmle=TRUE, weights=weights_false)
  )


  #return estimate and bias for mean outcome at time 5
  y0 = mean(df$Y0)
  results %>%
    map(mutate, estimate = cumsum(estimate) + y0) %>%
    bind_rows(.id='estimator') %>%
    left_join(truth %>% rename(truth=estimate), by='t') %>%
    mutate(bias = estimate - truth) %>%
    filter(t==5) %>%
    select(estimator, estimate, bias)
}

fmt = function(x, dig =3, clip=TRUE, format='g') {
  out = formatC(x, digits=dig, format=format, flag='#')
  clipped_out = paste0(c('<0.', rep(0, dig-1), '1'), collapse = '')
  if(clip) {
    out = ifelse(x < 10^(-dig), clipped_out, out)
  }
  return (out)
}

sims1e3 = mclapply(1:nsims, function(x) onesim(1e2), mc.cores = cores, mc.set.seed = TRUE)
gc()
#>           used  (Mb) gc trigger   (Mb)  max used   (Mb)
#> Ncells 2424458 129.5    4536298  242.3   3412841  182.3
#> Vcells 4990716  38.1  424120265 3235.8 549990717 4196.1

sims1e4 = mclapply(1:nsims, function(x) onesim(1e3), mc.cores = cores, mc.set.seed = TRUE)
gc()
#>           used  (Mb) gc trigger   (Mb)  max used   (Mb)
#> Ncells 2424804 129.5    4536298  242.3   3412841  182.3
#> Vcells 4991628  38.1  339296212 2588.7 549990717 4196.1

sims1e5 = mclapply(1:nsims, function(x) onesim(1e4), mc.cores = cores, mc.set.seed = TRUE)
gc()
#>           used  (Mb) gc trigger   (Mb)  max used   (Mb)
#> Ncells 2424806 129.5    4536298  242.3   3412841  182.3
#> Vcells 4991697  38.1  271436970 2070.9 549990717 4196.1

#reshape
result = list('1e3'=sims1e3,'1e4'=sims1e4, '1e5'=sims1e5) %>%
  map(bind_rows, .id='rep') %>%
  bind_rows(.id='n') 

result_table = result %>%
  group_by(n, estimator) %>%
  summarise(variance = var(estimate), bias =mean(bias),
            p = nortest::lillie.test(estimate)$p.value) %>%
  mutate(p = fmt(p, dig=2),
         variance = fmt(variance, dig=2, clip=FALSE),
         bias = fmt(bias, dig=4, clip=FALSE, format='f')) %>%
  ungroup() %>%
  split(.$n) %>%
  {cbind(estimator = .[[1]]$estimator,  Reduce(cbind, map(., select, -n:-estimator))) }


result_table %>%
  as.matrix() %>%
  kable() %>%
  kable_styling() %>%
  add_header_above(c(" " = 1, "n=1,000" = 3, "n=10,000" =3, "n=100,000" = 3)) %>%
  footnote('p=p-value from Kolmogorov-Smirnoff test for normality.
Abbreviations: ice=iterated conditional expectation, iptw=inverse probability of treatment weighted, tmle=targeted
maximum likelihood, qfal=outcome models misspecified, gfal=treatment models misspecified, bfal=both sets of models
misspecified, true=all models correctly specified.
8
')

	n=1,000			n=10,000			n=100,000
estimator	variance	bias	p	variance	bias	p	variance	bias	p
ice_qfal	0.0043	0.0115	0.68	0.00046	0.0110	0.81	4.5e-05	0.0116	0.76
ice_true	0.0042	-0.0004	0.82	0.00045	-0.0002	0.23	4.2e-05	0.0002	0.45
iptw_gfal	0.0045	0.0117	0.86	0.00047	0.0117	0.49	4.6e-05	0.0125	0.69
iptw_true	0.0060	-0.0017	0.37	0.00063	-0.0008	0.87	7.4e-05	-0.0001	0.57
tmle_bfal	0.0044	0.0119	0.51	0.00046	0.0115	0.65	4.5e-05	0.0122	0.62
tmle_gfal	0.0042	-0.0008	0.86	0.00045	-0.0004	0.28	4.3e-05	0.0002	0.21
tmle_qfal	0.0059	-0.0009	0.15	0.00064	-0.0007	0.78	7.6e-05	-0.0002	0.41
tmle_true	0.0054	-0.0003	0.82	0.00056	0.0001	0.11	5.7e-05	0.0001	0.69
Note:
p=p-value from Kolmogorov-Smirnoff test for normality. Abbreviations: ice=iterated conditional expectation, iptw=inverse probability of treatment weighted, tmle=targeted maximum likelihood, qfal=outcome models misspecified, gfal=treatment models misspecified, bfal=both sets of models misspecified, true=all models correctly specified.