#' setup for prediction
library(vbr)
library(timeNet)
library(glmnet)
library(boot)
library(parallel)
setwd("")
df.file1 = ""
df.file2 = ""
pop = read.csv(df.file1)
comm = read.csv(df.file2)
outcome1 = 'pop_c'
outcome2 = 'healthy_foods_c'
outcome3 = 'phys_act_c'
id = 'trtid10'
exclude = 'X'
t = 'year'
yp.2 = var_direction(pop, outcome1, pos = TRUE) # transform outcomes to binary direction indicators
yc.hf.2 = var_direction(comm, outcome2, pos = TRUE)
yc.pa.2 = var_direction(comm, outcome3, pos = TRUE)
# population change predictor matrix
exclude_pop = colnames(pop)[grep('Flagged', colnames(pop))]
Xp.n = norm_preds(df = pop, excl = c(outcome1, id, exclude, exclude_pop))
Xpt = t_inters(df = Xp.n, t = t) # interactions with time variable
Xpt.n = norm_preds(df = Xpt)
Xp = cbind(Xp.n, Xpt.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size") # identify time invariant covariate -- without additional transformations
w_p = c("pop","nBusinesses") # variables on which surrounding tract characteristics are weighted
maplist = lapply(X_p, function(x) colnames(pop)[grep(x,colnames(pop))]) # get all derived variables with same root as base variables listed
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p]) # reduce to list of lists of derived variables relevant to each base variable, excluding the base variables
pop.inters = ll_inters(pop, X_p, maplist.nc)
maplist.w = get_wt_vars(df = pop, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
pop.w.inters = ll_inters(pop, w_p, maplist.w.nc)
Xp.all = cbind(Xp, pop.inters, pop.w.inters)
Xp.all = norm_preds(df = Xp.all)
rm(list = c('maplist','maplist.nc','maplist.w','maplist.w.nc'))
# physial activity commcercial change predictor matrix
exclude_pa = colnames(comm)[grep('hf|Flagged', colnames(comm))]
Xc1.n = norm_preds(df = comm, excl = c(outcome2, outcome3, id, exclude, exclude_pa))
Xct1 = t_inters(df = Xc1.n, t = t)
Xct1.n = norm_preds(df = Xct1)
Xc.pa = cbind(Xc1.n, Xct1.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size")
w_p = c("pop","nBusinesses_pa")
maplist = lapply(X_p, function(x) colnames(comm)[grep(x,colnames(comm))])
for(i in 1:length(maplist)){ # given repeats of 'nBusinesses' variable, where each excludes the counts of the outcome
vnames = unlist(maplist[[i]])
maplist[[i]] = vnames[grep("nBusinesses_hf", vnames, invert = TRUE)]
}
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p])
comm_pa.inters = ll_inters(comm, X_p, maplist.nc)
maplist.w = get_wt_vars(df = comm, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
comm_pa.w.inters = ll_inters(comm, w_p, maplist.w.nc)
Xc.pa.all = cbind(Xc.pa, comm_pa.inters, comm_pa.w.inters)
Xc.pa.all = norm_preds(df = Xc.pa.all)
rm(list = c('maplist','maplist.nc','maplist.w','maplist.w.nc'))
# healthy foods commcercial change predictor matrix
exclude_hf = colnames(comm)[grep('pa|Flagged', colnames(comm))]
Xc2.n = norm_preds(df = comm, excl = c(outcome2, outcome3, id, exclude, exclude_hf))
Xct2 = t_inters(df = Xc2.n, t = t)
Xct2.n = norm_preds(df = Xct2)
Xc.hf = cbind(Xc2.n, Xct2.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size")
w_p = c("pop","nBusinesses_hf")
maplist = lapply(X_p, function(x) colnames(comm)[grep(x,colnames(comm))])
for(i in 1:length(maplist)){
vnames = unlist(maplist[[i]])
maplist[[i]] = vnames[grep("nBusinesses_pa", vnames, invert = TRUE)]
}
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p])
comm_hf.inters = ll_inters(comm, X_p, maplist.nc)
maplist.w = get_wt_vars(df = comm, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
comm_hf.w.inters = ll_inters(comm, w_p, maplist.w.nc)
Xc.hf.all = cbind(Xc.hf, comm_hf.inters, comm_hf.w.inters)
Xc.hf.all = norm_preds(df = Xc.hf.all)
rm(list = c('maplist', 'maplist.nc', 'maplist.w', 'maplist.w.nc'))
data.pop = cbind(yp.2, Xp.all)
data.hf = cbind(yc.hf.2, Xc.hf.all)
data.pa = cbind(yc.pa.2, Xc.pa.all)
df = data.pa
K = 10
n = dim(df)[1]
ind = sample(x = n, size = n, replace = FALSE)
k.edges = c(round(seq(1, n, n/K)), n)
ci = FALSE
cores = detectCores()
boot.coefs.pa = list()
boot.models.pa = list()
boot.validation.pa = list()
for (i in 1:K){
train = df[!ind %in% seq(k.edges[i], k.edges[i+1], 1), ]
test = df[ind %in% seq(k.edges[i], k.edges[i+1], 1), ]
cl = makeCluster(cores)
sub_funcs = list('ndev.binom', 'rdev.binom', 'aic.binom', 'r2', 'cv.m.dev.binom', 'cv.m.classerr', 'cv.m.dev.binom.alt', 'cv.m.classerr.alt')
clusterExport(cl, c('train', 'boot.glmnet', 'cores', sub_funcs)) #'seeds'
clusterEvalQ(cl, c(library(glmnet),library(boot)))
system.time(nets.boot <- boot(train, statistic = boot.glmnet, R = 50, parallel = "snow",ncpus = cores, cl = cl))
stopCluster(cl)
boot.coefs.pa[[i]] = nets.boot
bag.boot.glmnet = apply(nets.boot$t,2,mean) #bootstrap model averaging
if (ci == TRUE){
diagn_ci_vals_l = unlist(lapply(1:length(nets.boot$t0), function(x) boot.ci(nets.boot,index=x, type=c("perc"))$percent[c(4)]))
diagn_ci_vals_u = unlist(lapply(1:length(nets.boot$t0), function(x) boot.ci(nets.boot,index=x, type=c("perc"))$percent[c(5)]))
glmnet.boot.bag = cbind(nets.boot$t0,bag.boot.glmnet,diagn_ci_vals_l,diagn_ci_vals_u )
} else {
glmnet.boot.bag = cbind(nets.boot$t0,bag.boot.glmnet)
}
msummary_outs = 6
ll_ind = which(nets.boot$t0 > 100*abs(nets.boot$t0[1]), arr.ind=TRUE)
models = length(ll_ind)/3
n.coefs = min(ll_ind)-1
ndev.boots = nets.boot$t[,n.coefs + 1]
rdev.min1 = which.min(nets.boot$t[,n.coefs+2]) # resid dev minimum
rdev.min2 = which.min(nets.boot$t[,2*n.coefs + msummary_outs + 2])
rdnd.min1 = which.min(nets.boot$t[,n.coefs+2] -ndev.boots) # residual dev to null dev ratio min
rdnd.min2 = which.min(nets.boot$t[,2*n.coefs + msummary_outs + 2] -ndev.boots)
ndev.max = which.max(ndev.boots) # null dev max
bf.mod1 = nets.boot$t[rdev.min1, 1:(n.coefs+msummary_outs)]
bf.mod2 = nets.boot$t[rdev.min2, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
bf.mod1.rdndratio = nets.boot$t[rdnd.min1, 1:(n.coefs+msummary_outs)]
bf.mod2.rdndratio = nets.boot$t[rdnd.min2, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
bf.mod1.ndev = nets.boot$t[ndev.max, 1:(n.coefs+msummary_outs)]
bf.mod2.ndev = nets.boot$t[ndev.max, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
bf.mods = c(bf.mod1,bf.mod2)
bf.mods.rdndratio = c(bf.mod1.rdndratio, bf.mod2.rdndratio)
bf.mods.ndevmax = c(bf.mod1.ndev,bf.mod2.ndev)
validation.mods = cbind(glmnet.boot.bag, bf.mods,bf.mods.rdndratio, bf.mods.ndevmax)
modselect_criteria = ncol(validation.mods)
validation_params = c('models', 'n.coefs', 'msummary_outs', 'modselect_criteria')
# evaluate models on out of sample observed distribution
cl = makeCluster(cores)
clusterExport(cl, c('test', 'boot.test.compare', 'validation.mods', 'cores', validation_params, sub_funcs, 'sigmoid'))
clusterEvalQ(cl, c(library(glmnet),library(boot)))
system.time(nets.boot.test <- boot(data = test, mod_object = round(validation.mods,3), statistic = boot.test.compare,
mod_fit_type = models, mod_select_types = modselect_criteria, coefs = n.coefs, summary_vals = msummary_outs, R = 50, parallel="snow",ncpus=cores,cl=cl))
stopCluster(cl)
boot.models.pa[[i]] = validation.mods
boot.validation.pa[[i]] = nets.boot.test
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.