# load, process and transform data, including missing value imputation
library(missing) # packages
# ---- Run configuration ----
# The two paths are empty placeholders; fill them in before running the script.
working_directory <- ""
data_file <- ""
t_var <- "year" # name of the time column
t_values <- c(1990, 2000, 2010) # time points to keep
id <- "trtid10" # name of the tract identifier column

# ---- Load the raw data ----
setwd(working_directory)
df <- read.csv(data_file)
# Retain only the rows whose time value is one of `t_values`.
df <- df[df[, t_var] %in% t_values, ]
# Recode the missing-value sentinel (-999) to NA.
# This has to happen before the long-format demographic frame is built: the
# commercial subset taken from `df` later on does not carry these columns, so a
# recode performed after `demog_long()` would never reach the merged data.
# NOTE(review): assumes `demog_long()` (called without arguments, defined
# elsewhere) reads `df` and `demographics` from the calling environment --
# confirm against its definition.
na_ind <- -999 # missing indicator as '-999', set to 'NA'
vars_na_ind <- c('hinc0a')
# `%in%` never yields NA, so values that are already missing are left untouched,
# and each listed column is handled on its own.
df[vars_na_ind] <- lapply(
  df[vars_na_ind],
  function(x) replace(x, x %in% na_ind, NA)
)
# Demographic variables to be reshaped by `demog_long()`.
demographics <- c('hinc90','hinc00','hinc0a','ppov90','ppov00','ppov0a',
'pfb90','pfb00','pfb0a','punemp90','punemp00','punemp0a',
'pnhblk90','pnhblk00','pnhblk0a','phisp90','phisp00','phisp0a',
'pasian90','pasian00','pasian0a','pop90','pop00','pop10')
df.demo <- demog_long() # generate separate cases by year
# Commercial variables
# Category totals: each one is the element-wise sum of its component columns.
df[["phys_act"]] <- Reduce(`+`, df[c("nLMPA_OV_col", "nVPA_OV_col")])
df[["healthy_foods"]] <- Reduce(
  `+`,
  df[c("nSMK_OV_col", "nFVM_OV_col", "nNAT_OV_col", "nFSH_OV_col")]
)
# Business totals with the respective category count taken out.
df[["nBusinesses_pa"]] <- df[["nBusinesses"]] - df[["phys_act"]]
df[["nBusinesses_hf"]] <- df[["nBusinesses"]] - df[["healthy_foods"]]
# Columns that make up the commercial data set.
commercial <- c(
  "phys_act", "healthy_foods", "nFlagged", "nDES_OV",
  "nLMPA_OV_col", "nVPA_OV_col", "nSMK_OV_col", "nFVM_OV_col",
  "nNAT_OV_col", "nFSH_OV_col",
  "nBusinesses", "nBusinesses_pa", "nBusinesses_hf"
)
# Keep only the commercial columns plus the time and identifier keys.
df.comm <- df[, c(commercial, t_var, id)]
# ---- Geographic area ----
# File and column names of the tract land-area table.
tract_area_file <- "ct2010landarea.csv"
original_area_var <- "ct2010landareasqkilometers"
id.area <- "GEOID10"
df.area <- read.csv(tract_area_file)
# Copy the area and identifier columns to the names used by the rest of the
# script.
df.area[["tr_size"]] <- df.area[, original_area_var]
df.area[[id]] <- df.area[, id.area]
# Restrict to identifiers present in the main data; keep id and size only.
in_sample <- df.area[, id] %in% df[, id]
df.area <- df.area[in_sample, c(id, "tr_size")]
# Join demographic and commercial rows on identifier and time, then attach the
# area by identifier alone.
df.all <- merge(df.demo, df.comm, by = c(id, t_var))
df.all <- merge(df.all, df.area, by = id)
# Per-column summaries of the merged data, for inspection.
lapply(df.all, summary)
# Outcome columns of the commercial analysis (left out of normalisation below).
outcomes.comm <- c("healthy_foods", "phys_act")
# Columns not wanted in the commercial analysis frame.
extrav.comm <- c(
  "nDES_OV", "nLMPA_OV_col", "nVPA_OV_col", "nSMK_OV_col",
  "nFVM_OV_col", "nNAT_OV_col", "nFSH_OV_col", "nBusinesses"
)
# Commercial frame = merged data minus the excess columns.
df.comm <- df.all[!(colnames(df.all) %in% extrav.comm)]
# ---- Population ----
outcomes.pop <- c('pop') # define outcomes to remove from normalization
# Identify tracts with a missing value in any outcome column, in any row.
# `complete.cases()` works row by row, so this stays correct when more than one
# outcome column is listed (a matrix from `is.na()` on several columns would be
# used as a flat index and run past the end of the id vector).
na_pop.tracts <- unique(df.all[!complete.cases(df.all[outcomes.pop]), id])
# Remove every row belonging to such a tract.
df.pop <- df.all[!(df.all[, id] %in% na_pop.tracts), ]
# excess variables that are included in aggregate measures
extrav.pop <- c("nDES_OV", "nLMPA_OV_col", "nVPA_OV_col", "nSMK_OV_col","nFVM_OV_col","nNAT_OV_col","nFSH_OV_col","nBusinesses_pa","nBusinesses_hf")
df.pop <- df.pop[!(colnames(df.pop) %in% extrav.pop)]
# Standardise variables to mean zero and standard deviation one.
# Only the identifier and the outcome columns are left unscaled.
# NOTE(review): the time column (`t_var`) is not excluded here, so it is
# standardised as well -- confirm that this is intended.
comm.n <- !(colnames(df.comm) %in% c(id, outcomes.comm))
df.comm[, comm.n] <- scale(df.comm[, comm.n], center = TRUE, scale = TRUE)
pop.n <- !(colnames(df.pop) %in% c(id, outcomes.pop))
df.pop[, pop.n] <- scale(df.pop[, pop.n], center = TRUE, scale = TRUE)
# ---- Imputation: population data set ----
# The imputation helpers used here are not defined in this file.
# NOTE(review): `levels()` is called with a `DF` argument, which base R's
# `levels(x)` does not accept -- presumably it is masked by a function from the
# package loaded at the top of the script; confirm.
df.norm <- df.pop
levs <- levels(DF = df.norm)
imp_mthds <- imp_methods_select(DF = df.norm, levs_obj = levs)
imp_cluster <- init_cluster(DF = df.norm, imp_methods = imp_mthds)
# Stop the cluster whether or not imputation and merging succeed, so that a
# failure does not leave it open. The order on success is unchanged: impute,
# merge, then stop.
combined_imps <- tryCatch(
  {
    output <- par_imputation(DF = df.norm, imp_methods = imp_mthds,
                             cluster = imp_cluster, id = id)
    merge_imputation(output)
  },
  finally = stopCluster(imp_cluster)
)
df.complete <- imp_reduce(combined_imps, id, t_var)
df.all.pop <- df.complete
# Remove the intermediates; only `df.all.pop` is kept.
rm(list = c('df.norm','levs','imp_mthds','imp_cluster','output','combined_imps','df.complete'))
# ---- Imputation: commercial data set ----
# The imputation helpers used here are not defined in this file.
# NOTE(review): `levels()` is called with a `DF` argument, which base R's
# `levels(x)` does not accept -- presumably it is masked by a function from the
# package loaded at the top of the script; confirm.
df.norm <- df.comm
levs <- levels(DF = df.norm)
imp_mthds <- imp_methods_select(DF = df.norm, levs_obj = levs)
imp_cluster <- init_cluster(DF = df.norm, imp_methods = imp_mthds)
# Stop the cluster whether or not imputation and merging succeed, so that a
# failure does not leave it open. The order on success is unchanged: impute,
# merge, then stop.
combined_imps <- tryCatch(
  {
    output <- par_imputation(DF = df.norm, imp_methods = imp_mthds,
                             cluster = imp_cluster, id = id)
    merge_imputation(output)
  },
  finally = stopCluster(imp_cluster)
)
df.complete <- imp_reduce(combined_imps, id, t_var)
df.all.comm <- df.complete
# Remove the intermediates; only `df.all.comm` is kept.
# (The original call ended with an unbalanced `)`, which is a syntax error.)
rm(list = c('df.norm','levs','imp_mthds','imp_cluster','output','combined_imps','df.complete'))
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.