R/predictive_model_sim_analysis.R

#' setup for prediction
library(vbr)
library(timeNet)
library(glmnet)
library(boot)
library(parallel)

setwd("")
df.file1 = ""
df.file2 = ""
pop = read.csv(df.file1)
comm = read.csv(df.file2)
outcome1 = 'pop_c'
outcome2 = 'healthy_foods_c'
outcome3 = 'phys_act_c'
id = 'trtid10'
exclude = 'X'
t = 'year'
yp.2 = var_direction(pop, outcome1, pos = TRUE) # transform outcomes to binary direction indicators
yc.hf.2 = var_direction(comm, outcome2, pos = TRUE)
yc.pa.2 = var_direction(comm, outcome3, pos = TRUE)

# population change predictor matrix
exclude_pop = colnames(pop)[grep('Flagged', colnames(pop))]
Xp.n = norm_preds(df = pop, excl = c(outcome1, id, exclude, exclude_pop))
Xpt = t_inters(df = Xp.n, t = t) # interactions with time variable
Xpt.n = norm_preds(df = Xpt)
Xp = cbind(Xp.n, Xpt.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size") # identify time invariant covariate -- without additional transformations
w_p = c("pop","nBusinesses") # variables on which surrounding tract characteristics are weighted
maplist = lapply(X_p, function(x) colnames(pop)[grep(x,colnames(pop))]) # get all derived variables with same root as base variables listed
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p]) # reduce to list of lists of derived variables relevant to each base variable, excluding the base variables
pop.inters = ll_inters(pop, X_p, maplist.nc)
maplist.w = get_wt_vars(df = pop, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
pop.w.inters = ll_inters(pop, w_p, maplist.w.nc)
Xp.all = cbind(Xp, pop.inters, pop.w.inters)
Xp.all = norm_preds(df = Xp.all)
rm(list = c('maplist','maplist.nc','maplist.w','maplist.w.nc'))

# physial activity commcercial change predictor matrix
exclude_pa = colnames(comm)[grep('hf|Flagged', colnames(comm))]
Xc1.n = norm_preds(df = comm, excl = c(outcome2, outcome3, id, exclude, exclude_pa))
Xct1 = t_inters(df = Xc1.n, t = t)
Xct1.n = norm_preds(df = Xct1) 
Xc.pa = cbind(Xc1.n, Xct1.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size")
w_p = c("pop","nBusinesses_pa")
maplist = lapply(X_p, function(x) colnames(comm)[grep(x,colnames(comm))])
for(i in 1:length(maplist)){ # given repeats of 'nBusinesses' variable, where each excludes the counts of the outcome 
  vnames = unlist(maplist[[i]])
  maplist[[i]] = vnames[grep("nBusinesses_hf", vnames, invert = TRUE)]
}
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p])
comm_pa.inters = ll_inters(comm, X_p, maplist.nc)
maplist.w = get_wt_vars(df = comm, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
comm_pa.w.inters = ll_inters(comm, w_p, maplist.w.nc)
Xc.pa.all = cbind(Xc.pa, comm_pa.inters, comm_pa.w.inters)
Xc.pa.all = norm_preds(df = Xc.pa.all)
rm(list = c('maplist','maplist.nc','maplist.w','maplist.w.nc'))

# healthy foods commcercial change predictor matrix
exclude_hf = colnames(comm)[grep('pa|Flagged', colnames(comm))]
Xc2.n = norm_preds(df = comm, excl = c(outcome2, outcome3, id, exclude, exclude_hf))
Xct2 = t_inters(df = Xc2.n, t = t)
Xct2.n = norm_preds(df = Xct2) 
Xc.hf = cbind(Xc2.n, Xct2.n)
X_p = c("hinc", "ppov", "pfb", "punemp", "pnhblk", "phisp", "pasian", "phys_act", "healthy_foods")
X_p.invariant = c("tr_size")
w_p = c("pop","nBusinesses_hf")
maplist = lapply(X_p, function(x) colnames(comm)[grep(x,colnames(comm))])
for(i in 1:length(maplist)){
  vnames = unlist(maplist[[i]])
  maplist[[i]] = vnames[grep("nBusinesses_pa", vnames, invert = TRUE)]
}
maplist.nc = lapply(maplist, function(x) x[! x %in% X_p])
comm_hf.inters = ll_inters(comm, X_p, maplist.nc)
maplist.w = get_wt_vars(df = comm, wt = w_p)
maplist.w.nc = lapply(maplist.w, function(x) x[! x %in% w_p])
comm_hf.w.inters = ll_inters(comm, w_p, maplist.w.nc)
Xc.hf.all = cbind(Xc.hf, comm_hf.inters, comm_hf.w.inters)
Xc.hf.all = norm_preds(df = Xc.hf.all)
rm(list = c('maplist', 'maplist.nc', 'maplist.w', 'maplist.w.nc'))

data.pop = cbind(yp.2, Xp.all)
data.hf = cbind(yc.hf.2, Xc.hf.all)
data.pa = cbind(yc.pa.2, Xc.pa.all)

df = data.pa
K = 10
n = dim(df)[1]
ind = sample(x = n, size = n, replace = FALSE)
k.edges = c(round(seq(1, n, n/K)), n)
ci = FALSE
cores = detectCores()
boot.coefs.pa = list()
boot.models.pa = list()
boot.validation.pa = list()
for (i in 1:K){
  train = df[!ind %in% seq(k.edges[i], k.edges[i+1], 1), ]
  test = df[ind %in% seq(k.edges[i], k.edges[i+1], 1), ]
  cl = makeCluster(cores)
  sub_funcs = list('ndev.binom', 'rdev.binom', 'aic.binom', 'r2', 'cv.m.dev.binom', 'cv.m.classerr', 'cv.m.dev.binom.alt', 'cv.m.classerr.alt')  
  clusterExport(cl, c('train', 'boot.glmnet', 'cores', sub_funcs)) #'seeds'
  clusterEvalQ(cl, c(library(glmnet),library(boot)))
  system.time(nets.boot <- boot(train, statistic = boot.glmnet, R = 50, parallel = "snow",ncpus = cores, cl = cl))
  stopCluster(cl)
  boot.coefs.pa[[i]] = nets.boot
  bag.boot.glmnet = apply(nets.boot$t,2,mean)   #bootstrap model averaging
  if (ci == TRUE){
    diagn_ci_vals_l = unlist(lapply(1:length(nets.boot$t0), function(x) boot.ci(nets.boot,index=x, type=c("perc"))$percent[c(4)]))
    diagn_ci_vals_u = unlist(lapply(1:length(nets.boot$t0), function(x) boot.ci(nets.boot,index=x, type=c("perc"))$percent[c(5)]))
    glmnet.boot.bag = cbind(nets.boot$t0,bag.boot.glmnet,diagn_ci_vals_l,diagn_ci_vals_u )
  } else {
    glmnet.boot.bag = cbind(nets.boot$t0,bag.boot.glmnet)
  }
  msummary_outs = 6
  ll_ind = which(nets.boot$t0 > 100*abs(nets.boot$t0[1]), arr.ind=TRUE)
  models = length(ll_ind)/3
  n.coefs = min(ll_ind)-1
  ndev.boots = nets.boot$t[,n.coefs + 1]
  rdev.min1 = which.min(nets.boot$t[,n.coefs+2]) # resid dev minimum
  rdev.min2 = which.min(nets.boot$t[,2*n.coefs + msummary_outs + 2])
  rdnd.min1 = which.min(nets.boot$t[,n.coefs+2] -ndev.boots) # residual dev to null dev ratio min
  rdnd.min2 = which.min(nets.boot$t[,2*n.coefs + msummary_outs + 2] -ndev.boots)
  ndev.max = which.max(ndev.boots)  # null dev max
  bf.mod1 = nets.boot$t[rdev.min1, 1:(n.coefs+msummary_outs)]
  bf.mod2 = nets.boot$t[rdev.min2, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
  bf.mod1.rdndratio = nets.boot$t[rdnd.min1, 1:(n.coefs+msummary_outs)]
  bf.mod2.rdndratio = nets.boot$t[rdnd.min2, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
  bf.mod1.ndev = nets.boot$t[ndev.max, 1:(n.coefs+msummary_outs)]
  bf.mod2.ndev = nets.boot$t[ndev.max, (n.coefs+msummary_outs+1):length(nets.boot$t0)]
  bf.mods = c(bf.mod1,bf.mod2)
  bf.mods.rdndratio = c(bf.mod1.rdndratio, bf.mod2.rdndratio)
  bf.mods.ndevmax = c(bf.mod1.ndev,bf.mod2.ndev)
  validation.mods = cbind(glmnet.boot.bag, bf.mods,bf.mods.rdndratio, bf.mods.ndevmax)
  modselect_criteria = ncol(validation.mods)
  validation_params = c('models', 'n.coefs', 'msummary_outs', 'modselect_criteria') 
  
  # evaluate models on out of sample observed distribution
  cl = makeCluster(cores)
  clusterExport(cl, c('test', 'boot.test.compare', 'validation.mods', 'cores', validation_params, sub_funcs, 'sigmoid'))
  clusterEvalQ(cl, c(library(glmnet),library(boot)))
  system.time(nets.boot.test <- boot(data = test, mod_object = round(validation.mods,3), statistic = boot.test.compare,
    mod_fit_type = models, mod_select_types = modselect_criteria, coefs = n.coefs, summary_vals = msummary_outs, R = 50, parallel="snow",ncpus=cores,cl=cl))
  stopCluster(cl)
  boot.models.pa[[i]] = validation.mods
  boot.validation.pa[[i]] = nets.boot.test
}
davewutchiett/nets_demo_spatial_dynamics documentation built on May 25, 2019, 4:22 p.m.