knitr::opts_chunk$set(echo = TRUE, dev = "ragg_png")
library(foreach) library(doSNOW) library(gsubfn) library(CausalGrid) library(causalTree) library(rprojroot) root_dir <- rprojroot::find_package_root_file() source(paste0(root_dir,"/project/ct_utils.R")) source(paste0(root_dir,"/tests/dgps.R"))
Some common parameters
my_seed=1337 S=200 cores_to_use = getOption("cl.cores", default=5) N=500 set.seed(1337) #data differs across sims, but the bump samples indexes are the same across sims n_bump = 20
Let's get some fake data
#gen data. Do serially so randomness is the same across par and non-par runs datas = list() for(s in 1:S) { datas[[s]] = XOR_sim(n=N) }
`%doChange%` = if(cores_to_use>1) `%dopar%` else `%do%` #so I can just change variable without changing other code for single-threaded if(cores_to_use>1){ pr_cl = makeCluster(cores_to_use) registerDoSNOW(pr_cl) } # using doSNOW instead of generic doparallel as it has a nice progress bar option. t1 = Sys.time() cat(paste("Start time: ",t1,"\n")) pb <- txtProgressBar(max = S, style = 1) progress_fn <- function(n) setTxtProgressBar(pb, n) n_methods = 1 + 1 + 1 + 4 fixed_names = c("No bumping", "Bumping") cv_names = c("No bumping", "Bumping (doCV=F, incl_comp_in_pick=F)", "Bumping (doCV=F, incl_comp_in_pick=T)", "Bumping (doCV=T, incl_comp_in_pick=F)", "Bumping (doCV=T, incl_comp_in_pick=T)") #sim_rets = list() #for(s in 1:S){ sim_rets = foreach(s=1:S, .packages=c("CausalGrid"), .options.snow = list(progress = progress_fn)) %doChange% { #.errorhandling = "pass" #setTxtProgressBar(pb, s) # data_xor = datas[[s]] index_tr = 1:250 set.seed(1337) base_args = list(y=data_xor$y, X=data_xor$X, d=data_xor$w, verbosity=0, tr_split=index_tr) #Show how bumping helps with fixed depth fixed_args = c(base_args, list(partition_i=3)) ret_nobmp_fixed <- do.call(fit_estimate_partition, fixed_args) fixed_b_args = c(fixed_args, list(bump_samples=n_bump)) ret_bmp_fixed <- do.call(fit_estimate_partition, c(fixed_b_args, list(bump_complexity=list(doCV=FALSE, incl_comp_in_pick=FALSE)))) is_objs = c(ret_nobmp_fixed$is_obj_val_seq[3], ret_bmp_fixed$is_obj_val_seq[3]) min_dev = c(min(abs(unlist(ret_nobmp_fixed$partition$s_by_dim))), min(abs(unlist(ret_bmp_fixed$partition$s_by_dim)))) max_dev = c(max(abs(unlist(ret_nobmp_fixed$partition$s_by_dim))), max(abs(unlist(ret_bmp_fixed$partition$s_by_dim)))) #Then add in CV complexity #cv_folds = 2 cv_folds = c(rep(1, 125), rep(2, 125)) cv_args = c(base_args, list(cv_folds=cv_folds)) ret_nobmp_cv <- do.call(fit_estimate_partition, cv_args) cv_b_args = c(cv_args, bump_samples=n_bump) ret_bmpa_cv <- do.call(fit_estimate_partition, c(cv_b_args, list(bump_complexity=list(doCV=FALSE, incl_comp_in_pick=FALSE)))) ret_bmpb_cv <- do.call(fit_estimate_partition, c(cv_b_args, list(bump_complexity=list(doCV=FALSE, incl_comp_in_pick=TRUE)))) ret_bmpc_cv <- do.call(fit_estimate_partition, c(cv_b_args, list(bump_complexity=list(doCV=TRUE, incl_comp_in_pick=FALSE)))) ret_bmpd_cv <- do.call(fit_estimate_partition, c(cv_b_args, list(bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE)))) n_cells = c(sapply(list(ret_nobmp_fixed, ret_bmp_fixed, ret_nobmp_cv, ret_bmpa_cv, ret_bmpb_cv, ret_bmpc_cv, ret_bmpd_cv), num_cells)) list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells) #sim_rets[[s]] = list(is_objs=is_objs, max_dev=max_dev, min_dev=min_dev, n_cells=n_cells) } t2 = Sys.time() #can us as.numeric(t1) to convert to seconds td = t2-t1 close(pb) cat(paste("Total time: ",format(as.numeric(td))," ", attr(td,"units"),"\n")) if(cores_to_use>1) stopCluster(pr_cl)
# reshape sim data is_obj_mat = matrix(NA, nrow=S, ncol=2) max_dev_mat = matrix(NA, nrow=S, ncol=2) min_dev_mat = matrix(NA, nrow=S, ncol=2) n_cells_mat = matrix(NA, nrow=S, ncol=n_methods) for(s in 1:S) { is_obj_mat[s,] = sim_rets[[s]]$is_objs max_dev_mat[s,] = sim_rets[[s]]$max_dev min_dev_mat[s,] = sim_rets[[s]]$min_dev n_cells_mat[s,] = sim_rets[[s]]$n_cells }
print("Fixed_depth (partition_i=3)") print(paste("is_obj mean:", paste(fixed_names, collapse=", "))) print(colMeans(is_obj_mat)) print(paste("max_dev mean:", paste(fixed_names, collapse=", "))) print(colMeans(max_dev_mat)) print(paste("min_dev mean:", paste(fixed_names, collapse=", "))) print(colMeans(min_dev_mat)) print(paste("n_cells mean (4 if diff dim splits):", paste(fixed_names, collapse=", "))) print(colMeans(n_cells_mat[,1:2])) print("CV Depth") print(paste("n_cells mean (4 is perfect):", paste(cv_names, collapse=", "))) print(colMeans(n_cells_mat[,3:n_methods])) #including complexity limits the #. Using Bumping in CV means we can tolearte more complexity when picking optimal lambda. And moving from no bumping to bumping the benefit of complexity is more so we have more cells.
yX_data <- function(y, X) { yX = cbind(y, X) colnames(yX) = c("Y", paste("X", 1:ncol(X), sep="")) yX = as.data.frame(yX) } sim_ct_predict_te <- function(obj, X_te) { #yX_te = yX_data(y_te, X_te) colnames(X_te) = paste("X", 1:ncol(X_te), sep="") return(predict(obj, newdata=as.data.frame(X_te), type="vector")) } sim_ct_fit <- function(y, X, w, tr_sample, honest=TRUE, nfolds=5, minsize=25, b=4) { yX = yX_data(y, X) set.seed(my_seed) fit = ct_cv_tree("Y~.", data=yX, treatment=w, index_tr=tr_sample, minsize=minsize, split.Bucket=TRUE, bucketNum=b, xval=nfolds, split.Honest=honest, cv.Honest=honest) attr(fit$terms, ".Environment") <- NULL #save space other captures environment return(fit) }
kappa3_ratio = .1 data2 = XOR2_sim(n=N, kappa3_ratio=kappa3_ratio) te_mult = 16 data2_te = XOR2_sim(n=te_mult*N, kappa3_ratio=kappa3_ratio) tr_sample = c(rep(TRUE, N/2), rep(FALSE, N-N/2))
ct_fit = sim_ct_fit(data2$y, data2$X, data2$w, tr_sample, nfolds=2) ct_fit
mse_ct = mean((data2_te$kappa - sim_ct_predict_te(ct_fit, data2_te$X))^2) print(mse_ct)
cg_fit = fit_estimate_partition(data2$y, data2$X, data2$w, tr_sample, cv_folds=2) cg_fit
mse_cg = mean((data2_te$kappa - predict(cg_fit, data2_te$X))^2) print(mse_cg)
cg_fit_b = fit_estimate_partition(data2$y, data2$X, data2$w, tr_sample, cv_folds=2, bump_samples = n_bump, bump_complexity=list(doCV=TRUE, incl_comp_in_pick=TRUE)) cg_fit_b
mse_cg_b = mean((data2_te$kappa - predict(cg_fit_b, data2_te$X))^2) print(mse_cg_b)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.