knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
## load libraries # library(corclus) devtools::load_all() ## set paths path <- system.file("inst", package = "corclus", mustWork = TRUE) path_results <- here::here(file.path("../", "SimData/Results/")) ## get subdirectories for path_results (the first is just path_results) sub_path_results <- list.dirs(path_results)[-1] ## load simulated data load( file.path( path, "extdata", list.files(file.path(path, "extdata"), pattern = ".rda") ) )
# use a "large" number of schools to demonstrate # variance is calculated correctly (5 students per school because # this many schools creates a very wide dataset - more than this # takes up too much R memory) dat <- generate_data( .n_sch = 100, .n_stu = 5, .u_resid_var = 0.2, .clust_cov = c(0.8, 0.4), .wt_vec = c(0.5, 0.5), .pct_mobile = 1, .mean_x = 0, .var_x = 0, .mean_r = 0, .var_r = 0, .gamma_z = 1, .gamma_x = c(0, 0)#, #.seed_start = 1 ) # with equal weights and 2 schools max, level-2 variance should be # (1-m) * (0.5 + 0.5 * 1) + m * (0.5 + 0.5 * r), # where r is the covariance among adjacent schools and m is the pct mobility. # # (we're only allowing students to go to adjacent schools - we could expand # that as a future factor by increasing the .max_dist argument - e.g., # to be able to go to a school two schools away, set .max_dist = 2) temp <- dat %>% dplyr::group_by(., sch_id_1) %>% dplyr::summarise( sch_avg_1 = mean(u_residual_1), sch_avg_2 = mean(u_residual_2) ) %>% dplyr::select(., -sch_id_1) %>% dplyr::mutate(., z1z2 = 0.5 * sch_avg_1 + 0.5 * sch_avg_2) var(temp)
hlm <- list(mod_list$hlm, mod_list$hlm, mod_list$hlm) mm <- list(mod_list$mm, mod_list$mm, mod_list$mm) # set condition number cond_num = 1 mod_results <- mm %>% purrr::imap(., ~{ # get number of chains n_chains <- .x[["nchains"]] # extract the parameter estimates par_est <- extract_estimates(.x, n_chains) # extract the global fit estimates (keep only DIC) sum_stat <- extract_sumstat(hlm[[1]]) %>% dplyr::filter(., stringr::str_detect(Parameter, "DIC|pD")) # merge the parameter estimates with global fit & # add a column identifiying the simulation number par_est %>% dplyr::bind_rows(., sum_stat) %>% dplyr::mutate(sim_num = .y, .before = 1) }) %>% purrr::reduce(., dplyr::bind_rows) %>% dplyr::mutate(., cond_num = cond_num, .before = 1) %>% dplyr::group_by(., cond_num, Parameter) %>% dplyr::summarise( .data = ., dplyr::across(Estimate:ESS, mean) )
## function to get data from large results files (~180 GB) # saved in external folders get_data <- function(x) { ## here's where the second purrr starts # enter a new purrr::map to read those data x %>% purrr::map(., ~{ # explicitly state col_types to avoid printing the message 81*5 times if (stringr::str_detect(x, "results_sumstat")) { .col_types <- readr::cols( nsim = readr::col_double(), mod = readr::col_double(), form = readr::col_double(), nsch = readr::col_double(), nstu = readr::col_double(), pctmob = readr::col_double(), cor = readr::col_double(), icc = readr::col_double(), Dbar = readr::col_double(), Dthetabar = readr::col_double(), pD = readr::col_double(), DIC = readr::col_double(), N = readr::col_double() ) } else { .col_types <- readr::cols( nsim = readr::col_double(), mod = readr::col_double(), form = readr::col_double(), nsch = readr::col_double(), nstu = readr::col_double(), pctmob = readr::col_double(), cor = readr::col_double(), icc = readr::col_double(), est = readr::col_double(), se = readr::col_double(), stat = readr::col_double(), p = readr::col_double(), lwr = readr::col_double(), upr = readr::col_double(), ess = readr::col_double() ) } # read the space-delimited data files and add a column indicating the # parameter name (based on the file name) x %>% readr::read_delim( file = ., delim = " ", col_types = .col_types ) %>% dplyr::mutate( .data = ., Parameter = x %>% stringr::str_replace(., ".*/", "") %>% stringr::str_replace_all(., "results_coef_|.txt", ""), .after = "icc" ) }) } ## Requires External Drive to be Connected # select only the files needed raw_results <- sub_path_results %>% .[stringr::str_detect(., "nsch50")] # set progress bar purrr_bar <- progress::progress_bar$new(total = length(raw_results)) # run purrr dat_list <- raw_results %>% purrr::map(., ~{ # progress bar purrr_bar$tick() # get the list of files from each sub directory .paths <- file.path(.x, list.files(.x, pattern = ".txt")) # start the progress bar purrr::map(.paths, get_data) })
# remove extraneous stuff dat_reduce <- dat_list %>% purrr::map(., ~{ .x[1:4] }) %>% unlist(., recursive = FALSE) %>% purrr::reduce(., dplyr::bind_rows) %>% dplyr::select(., -tidyselect::any_of(c("stat", "p", "lwr", "upr"))) %>% dplyr::arrange(., nsim, form, icc, mod) # set the values of x1beta used in the study x1beta <- list( icc_05 = sqrt(17)/2, icc_15 = sqrt(11/3)/2, icc_30 = 1/(2*sqrt(3)) ) # include true value in dataset results <- dat_reduce %>% dplyr::filter(., !is.na(est), cor != 0.50) %>% dplyr::mutate( true_par = dplyr::case_when( ## fixed effect - intercept Parameter == "fp_int" & form == 3 ~ 10, Parameter == "fp_int" & icc == 0.05 ~ 10 + 5 * x1beta[["icc_05"]], Parameter == "fp_int" & icc == 0.15 ~ 10 + 5 * x1beta[["icc_15"]], Parameter == "fp_int" & icc == 0.30 ~ 10 + 5 * x1beta[["icc_30"]], ## fixed effect - x1 coefficient Parameter == "fp_x1" & icc == 0.05 ~ x1beta[["icc_05"]], Parameter == "fp_x1" & icc == 0.15 ~ x1beta[["icc_15"]], Parameter == "fp_x1" & icc == 0.30 ~ x1beta[["icc_30"]], ## level-1 random effect # models without controlling for x Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.05 ~ calc_exp_icc(.gamma_x = c(0, x1beta[["icc_05"]]))[["l1_var"]], Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.15 ~ calc_exp_icc(.gamma_x = c(0, x1beta[["icc_15"]]))[["l1_var"]], Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.30 ~ calc_exp_icc(.gamma_x = c(0, x1beta[["icc_30"]]))[["l1_var"]], # models that control for x Parameter == "rp_int_v1" & form == 3 ~ 2, ## level-2 random effect # pctmob = 0 Parameter == "rp_int_v2" & pctmob == 0 ~ calc_exp_l2var( .pct_mobile = 0, .clust_cov = c(0.8, 1) ), # pctmob = 0.25, cor varies Parameter == "rp_int_v2" & cor == 0.00 & pctmob == 0.25 ~ calc_exp_l2var( .pct_mobile = 0.25, .clust_cov = c(0.8, 0.00) ), Parameter == "rp_int_v2" & cor == 0.25 & pctmob == 0.25 ~ calc_exp_l2var( .pct_mobile = 0.25, .clust_cov = c(0.8, 0.25) ), # pctmob = 0.50, cor varies Parameter == "rp_int_v2" & cor == 0.00 & pctmob == 0.50 ~ calc_exp_l2var( .pct_mobile = 0.50, .clust_cov = c(0.8, 0.00) ), Parameter == "rp_int_v2" & cor == 0.25 & pctmob == 0.50 ~ calc_exp_l2var( .pct_mobile = 0.50, .clust_cov = c(0.8, 0.25) ), # no matches TRUE ~ NA_real_ ), .after = "icc" )
# get aggregated values grouped by condition agg_results <- results %>% dplyr::group_by(., Parameter, mod, form, nsch, pctmob, cor, icc) %>% dplyr::summarise( avg_est = mean(est, na.rm = TRUE), sd_est = sd(est, na.rm = TRUE), avg_se = mean(se, na.rm = TRUE), avg_ess = mean(ess, na.rm = TRUE) ) full_results <- results %>% dplyr::left_join( x = ., agg_results, by = c("Parameter", "mod", "form", "nsch", "pctmob", "cor", "icc") ) %>% dplyr::mutate( mse = (sum((est - avg_est)^2)/1000) + ((avg_est - true_par)^2), cov_95 = dplyr::if_else( (true_par > (est + 1.96 * se)) | (true_par < (est - 1.96 * se)), true = 0, false = 1 ), sig_05 = dplyr::if_else( abs((est - 0) / se) > 1.96, true = 1, false = 0 ) ) final_results <- full_results %>% dplyr::group_by(., Parameter, mod, form, nsch, pctmob, cor, icc) %>% dplyr::summarise( avg_true_par = mean(true_par, na.rm = TRUE), avg_est = mean(avg_est, na.rm = TRUE), sd_est = mean(sd_est, na.rm = TRUE), avg_se = mean(avg_se, na.rm = TRUE), avg_mse = mean(mse, na.rm = TRUE), avg_cov_95 = mean(cov_95, na.rm = TRUE), avg_sig_05 = mean(sig_05, na.rm = TRUE), avg_ess = mean(avg_ess, na.rm = TRUE) )
usethis::use_data(final_results) usethis::use_data(full_results) data("final_results", package = "corclus") data("full_results", package = "corclus")
final_results %>% dplyr::mutate( dplyr::across(where(is.numeric), round, digits = 2) )
lm_list <- c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>% purrr::map(., ~{ final_results %>% dplyr::filter(Parameter == .x, pctmob != 0) %>% dplyr::mutate( mod = dplyr::if_else(mod == 1, "HLM", "MMREM"), form = dplyr::if_else(form == 1, "Int-Only", "L2") ) %>% dplyr::mutate( avg_mse = avg_mse * 1000, mod = forcats::as_factor(mod), form = forcats::as_factor(form), pctmob = forcats::as_factor(pctmob), cor = forcats::as_factor(cor), icc = forcats::as_factor(icc), ) }) %>% rlang::set_names(., c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2")) print("fp_int: avg_mse") lm_list$fp_int %>% lm( avg_mse ~ mod + pctmob + form*cor, data = . ) %>% summary(.) print("rp_int_v1: avg_mse") lm_list$rp_int_v1 %>% lm( avg_mse ~ mod + pctmob + form*cor, data = . ) %>% summary(.) print("rp_int_v2: avg_mse") lm_list$rp_int_v2 %>% lm( avg_mse ~ mod + pctmob + form*cor, data = . ) %>% summary(.)
# mse fct_dat <- final_results %>% dplyr::mutate( .data = ., avg_mse = avg_mse * 1000, mod = forcats::as_factor(mod), form = forcats::as_factor(form), pctmob = forcats::as_factor(pctmob), cor = forcats::as_factor(cor), icc = forcats::as_factor(icc) ) %>% dplyr::mutate( mod = forcats::fct_recode(mod, HLM = "1", MMREM = "2"), form = forcats::fct_recode(form, Null = "1", L2 = "2", `L1 & L2` = "3") ) c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>% purrr::map2(., .y = c("Fixed Intercept", "Fixed X Coef", "Random Intercept (L1)", "Random Intercept (L2)"), .f = ~{ temp <- fct_dat %>% dplyr::filter(., Parameter == .x) temp %>% ggplot2::ggplot( data = ., ggplot2::aes( y = avg_mse, x = form, color = pctmob, shape = cor ) ) + ggbeeswarm::geom_beeswarm() + ggplot2::scale_color_viridis_d( name = "Percent Mobility", option = "D", end = 0.85 ) + ggplot2::scale_x_discrete( name = "Intraclass Correlation" ) + ggplot2::scale_shape_discrete( name = "Cluster Correlation" ) + ggplot2::theme( legend.position = "top" ) + ggplot2::ylab("Mean Square Error of the Estimate") + ggplot2::ggtitle(.y) + ggplot2::facet_wrap(ggplot2::vars(mod)) # save plot ggplot2::ggsave( file.path(path, "figures", paste0("avgse_wrap_", .y, ".png")) ) })
#tab_results <- c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>% purrr::map(., ~{ final_results %>% dplyr::ungroup(.) %>% dplyr::select(., -nsch, -avg_sig_05) %>% dplyr::filter(., Parameter == .x) %>% dplyr::mutate( .data = ., mod = dplyr::if_else(mod == 1, "HLM", "MMREM"), form = dplyr::case_when( form == 1 ~ "Null", form == 2 ~ "L2", form == 3 ~ "L1 & L2" ) ) %>% dplyr::rename( .data = ., `Estimation Model` = mod, `Model Specification` = form, `Percent Mobility` = pctmob, `Cluster Correlation` = cor, ICC = icc, `Population Parameter` = avg_true_par, `Mean(Est)` = avg_est, `SD(EST)` = sd_est, `Mean(SE)` = avg_se, MSE = avg_mse, Coverage = avg_cov_95, `Mean(ESS)` = avg_ess ) %>% readr::write_csv( x = ., file = file.path(path, "tables", paste0(.x, "_res.csv")) ) })
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.