In tessaleejohnson/corclus: Simulate Data from the Correlated Cluster Multiple Membership Model

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## load libraries

# library(corclus)
devtools::load_all()

## set paths
path <- system.file("inst", package = "corclus", mustWork = TRUE)
path_results <- here::here(file.path("../", "SimData/Results/"))

## get subdirectories for path_results (the first is just path_results)
sub_path_results <- list.dirs(path_results)[-1]

## load simulated data
load(
  file.path(
    path, 
    "extdata", 
    list.files(file.path(path, "extdata"), pattern = ".rda")
  )
)

Check Data Generation

# use a "large" number of schools to demonstrate
# variance is calculated correctly (5 students per school because
# this many schools creates a very wide dataset - more than this
# takes up too much R memory)
dat <-
  generate_data(
    .n_sch = 100,
    .n_stu = 5,
    .u_resid_var = 0.2,
    .clust_cov = c(0.8, 0.4),
    .wt_vec = c(0.5, 0.5),
    .pct_mobile = 1,
    .mean_x = 0,
    .var_x = 0,
    .mean_r = 0,
    .var_r = 0,
    .gamma_z = 1,
    .gamma_x = c(0, 0)#,
    #.seed_start = 1
  )

# with equal weights and 2 schools max, level-2 variance should be
# (1-m) * (0.5 + 0.5 * 1) + m * (0.5 + 0.5 * r), 
# where r is the covariance among adjacent schools and m is the pct mobility.
#
# (we're only allowing students to go to adjacent schools - we could expand
# that as a future factor by increasing the .max_dist argument - e.g.,
# to be able to go to a school two schools away, set .max_dist = 2)

temp <- dat %>%
  dplyr::group_by(., sch_id_1) %>%
  dplyr::summarise(
    sch_avg_1 = mean(u_residual_1),
    sch_avg_2 = mean(u_residual_2)
  ) %>%
  dplyr::select(., -sch_id_1) %>%
  dplyr::mutate(., z1z2 = 0.5 * sch_avg_1 + 0.5 * sch_avg_2)

var(temp)

Produce Results Table for One Replication

hlm <- list(mod_list$hlm, mod_list$hlm, mod_list$hlm)
mm <- list(mod_list$mm, mod_list$mm, mod_list$mm)

# set condition number
cond_num = 1

mod_results <-
  mm %>%
  purrr::imap(., ~{

    # get number of chains
    n_chains <- .x[["nchains"]]

    # extract the parameter estimates
    par_est <- 
      extract_estimates(.x, n_chains)

    # extract the global fit estimates (keep only DIC)
    sum_stat <- 
      extract_sumstat(hlm[[1]]) %>%
      dplyr::filter(., stringr::str_detect(Parameter, "DIC|pD"))

    # merge the parameter estimates with global fit &
    # add a column identifiying the simulation number
    par_est %>%
      dplyr::bind_rows(., sum_stat) %>%
      dplyr::mutate(sim_num = .y, .before = 1)


  }) %>%
  purrr::reduce(., dplyr::bind_rows) %>%
  dplyr::mutate(., cond_num = cond_num, .before = 1) %>%
  dplyr::group_by(., cond_num, Parameter) %>%
  dplyr::summarise(
    .data = ., 
    dplyr::across(Estimate:ESS, mean)
  )

Compile Saved Data

## function to get data from large results files (~180 GB)
# saved in external folders

get_data <- function(x) {

  ## here's where the second purrr starts

  # enter a new purrr::map to read those data
  x %>%
    purrr::map(., ~{

      # explicitly state col_types to avoid printing the message 81*5 times
      if (stringr::str_detect(x, "results_sumstat")) {
        .col_types <-
          readr::cols(
            nsim = readr::col_double(),
            mod = readr::col_double(),
            form = readr::col_double(),
            nsch = readr::col_double(),
            nstu = readr::col_double(),
            pctmob = readr::col_double(),
            cor = readr::col_double(),
            icc = readr::col_double(),
            Dbar = readr::col_double(),
            Dthetabar = readr::col_double(),
            pD = readr::col_double(),
            DIC = readr::col_double(),
            N = readr::col_double()
          )
      } else {
        .col_types <-
          readr::cols(
            nsim = readr::col_double(),
            mod = readr::col_double(),
            form = readr::col_double(),
            nsch = readr::col_double(),
            nstu = readr::col_double(),
            pctmob = readr::col_double(),
            cor = readr::col_double(),
            icc = readr::col_double(),
            est = readr::col_double(),
            se = readr::col_double(),
            stat = readr::col_double(),
            p = readr::col_double(),
            lwr = readr::col_double(),
            upr = readr::col_double(),
            ess = readr::col_double()
          )
      }

      # read the space-delimited data files and add a column indicating the
      # parameter name (based on the file name)
      x %>%
        readr::read_delim(
          file = .,
          delim = " ",
          col_types = .col_types
        ) %>%
        dplyr::mutate(
          .data = .,
          Parameter = x %>%
            stringr::str_replace(., ".*/", "") %>%
            stringr::str_replace_all(., "results_coef_|.txt", ""),
          .after = "icc" 
        )

    })

}


## Requires External Drive to be Connected

# select only the files needed
raw_results <- sub_path_results %>%
  .[stringr::str_detect(., "nsch50")]

# set progress bar
purrr_bar <- progress::progress_bar$new(total = length(raw_results))

# run purrr

dat_list <-
  raw_results %>%
  purrr::map(., ~{

    # progress bar
    purrr_bar$tick()

    # get the list of files from each sub directory
    .paths <- file.path(.x, list.files(.x, pattern = ".txt"))

    # start the progress bar
    purrr::map(.paths, get_data)
    })

Recode Compiled Data

# remove extraneous stuff
dat_reduce <-
  dat_list %>%
  purrr::map(., ~{
    .x[1:4]
  }) %>%
  unlist(., recursive = FALSE) %>%
  purrr::reduce(., dplyr::bind_rows) %>%
  dplyr::select(., -tidyselect::any_of(c("stat", "p", "lwr", "upr"))) %>%
  dplyr::arrange(., nsim, form, icc, mod)


# set the values of x1beta used in the study
x1beta <- list(
  icc_05 = sqrt(17)/2,
  icc_15 = sqrt(11/3)/2,
  icc_30 = 1/(2*sqrt(3))
)

# include true value in dataset
results <-
  dat_reduce %>%
  dplyr::filter(., !is.na(est), cor != 0.50) %>%
  dplyr::mutate(
    true_par = dplyr::case_when(

      ## fixed effect - intercept
      Parameter == "fp_int" & form == 3 ~ 10,
      Parameter == "fp_int" & icc == 0.05 ~ 10 + 5 * x1beta[["icc_05"]],
      Parameter == "fp_int" & icc == 0.15 ~ 10 + 5 * x1beta[["icc_15"]],
      Parameter == "fp_int" & icc == 0.30 ~ 10 + 5 * x1beta[["icc_30"]],

      ## fixed effect - x1 coefficient
      Parameter == "fp_x1" & icc == 0.05 ~ x1beta[["icc_05"]],
      Parameter == "fp_x1" & icc == 0.15 ~ x1beta[["icc_15"]],
      Parameter == "fp_x1" & icc == 0.30 ~ x1beta[["icc_30"]],

      ## level-1 random effect

      # models without controlling for x
      Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.05 ~
        calc_exp_icc(.gamma_x = c(0, x1beta[["icc_05"]]))[["l1_var"]],
      Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.15 ~
        calc_exp_icc(.gamma_x = c(0, x1beta[["icc_15"]]))[["l1_var"]],
      Parameter == "rp_int_v1" & form %in% 1:2 & icc == 0.30 ~
        calc_exp_icc(.gamma_x = c(0, x1beta[["icc_30"]]))[["l1_var"]],

      # models that control for x
      Parameter == "rp_int_v1" & form == 3 ~ 2,

      ## level-2 random effect

      # pctmob = 0
      Parameter == "rp_int_v2" & pctmob == 0 ~
        calc_exp_l2var(
          .pct_mobile = 0,
          .clust_cov = c(0.8, 1)
        ),

      # pctmob = 0.25, cor varies 
      Parameter == "rp_int_v2" & cor == 0.00 & pctmob == 0.25 ~
        calc_exp_l2var(
          .pct_mobile = 0.25,
          .clust_cov = c(0.8, 0.00)
        ),
      Parameter == "rp_int_v2" & cor == 0.25 & pctmob == 0.25 ~
        calc_exp_l2var(
          .pct_mobile = 0.25,
          .clust_cov = c(0.8, 0.25)
        ),

      # pctmob = 0.50, cor varies 
      Parameter == "rp_int_v2" & cor == 0.00 & pctmob == 0.50 ~
        calc_exp_l2var(
          .pct_mobile = 0.50,
          .clust_cov = c(0.8, 0.00)
        ),
      Parameter == "rp_int_v2" & cor == 0.25 & pctmob == 0.50 ~
        calc_exp_l2var(
          .pct_mobile = 0.50,
          .clust_cov = c(0.8, 0.25)
        ),

      # no matches
      TRUE ~ NA_real_
    ),
    .after = "icc"
  )

Compute Bias, Etc.

# get aggregated values grouped by condition
agg_results <- 
  results %>%
  dplyr::group_by(., Parameter, mod, form, nsch, pctmob, cor, icc) %>%
  dplyr::summarise(
    avg_est = mean(est, na.rm = TRUE),
    sd_est = sd(est, na.rm = TRUE),
    avg_se = mean(se, na.rm = TRUE),
    avg_ess = mean(ess, na.rm = TRUE)
  )

full_results <-
  results %>%
  dplyr::left_join(
    x = ., 
    agg_results, 
    by = c("Parameter", "mod", "form", "nsch", "pctmob", "cor", "icc")
  ) %>%
  dplyr::mutate(
    mse = (sum((est - avg_est)^2)/1000) + ((avg_est - true_par)^2),
    cov_95 = dplyr::if_else(
      (true_par > (est + 1.96 * se)) | (true_par < (est - 1.96 * se)),
      true = 0,
      false = 1
    ),
    sig_05 = dplyr::if_else(
      abs((est - 0) / se) > 1.96,
      true = 1,
      false = 0
    )
  )


final_results <-
  full_results %>%
  dplyr::group_by(., Parameter, mod, form, nsch, pctmob, cor, icc) %>%
  dplyr::summarise(
    avg_true_par = mean(true_par, na.rm = TRUE),
    avg_est = mean(avg_est, na.rm = TRUE),
    sd_est = mean(sd_est, na.rm = TRUE),
    avg_se = mean(avg_se, na.rm = TRUE),
    avg_mse = mean(mse, na.rm = TRUE),
    avg_cov_95 = mean(cov_95, na.rm = TRUE),
    avg_sig_05 = mean(sig_05, na.rm = TRUE),
    avg_ess = mean(avg_ess, na.rm = TRUE)
  )

Load/Save Results

usethis::use_data(final_results)
usethis::use_data(full_results)

data("final_results", package = "corclus")
data("full_results", package = "corclus")

Examine Final Results

final_results %>%
  dplyr::mutate(
    dplyr::across(where(is.numeric), round, digits = 2)
  )

Test Factors

lm_list <- 
  c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>% 
  purrr::map(., ~{
    final_results %>%
      dplyr::filter(Parameter == .x, pctmob != 0) %>%
      dplyr::mutate(
        mod = dplyr::if_else(mod == 1, "HLM", "MMREM"),
        form = dplyr::if_else(form == 1, "Int-Only", "L2")
      ) %>%
      dplyr::mutate(
        avg_mse = avg_mse * 1000,
        mod = forcats::as_factor(mod),
        form = forcats::as_factor(form),
        pctmob = forcats::as_factor(pctmob),
        cor = forcats::as_factor(cor),
        icc = forcats::as_factor(icc),
      )
    }) %>%
  rlang::set_names(., c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2"))



print("fp_int: avg_mse")
lm_list$fp_int %>%
  lm(
    avg_mse ~ mod + pctmob + form*cor,
    data = .
  ) %>%
  summary(.)

print("rp_int_v1: avg_mse")
lm_list$rp_int_v1 %>%
  lm(
    avg_mse ~ mod + pctmob + form*cor,
    data = .
  ) %>%
  summary(.)

print("rp_int_v2: avg_mse")
lm_list$rp_int_v2 %>%
  lm(
    avg_mse ~ mod + pctmob + form*cor, 
    data = .
  ) %>%
  summary(.)

Plot Results

Mean Standard Error

# mse

fct_dat <- final_results %>%
  dplyr::mutate(
    .data = .,
    avg_mse = avg_mse * 1000,
    mod = forcats::as_factor(mod),
    form = forcats::as_factor(form),
    pctmob = forcats::as_factor(pctmob),
    cor = forcats::as_factor(cor),
    icc = forcats::as_factor(icc)
  ) %>%
  dplyr::mutate(
    mod = forcats::fct_recode(mod, HLM = "1", MMREM = "2"),
    form = forcats::fct_recode(form, Null = "1", L2 = "2", `L1 & L2` = "3")
  )

c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>%
  purrr::map2(.,
    .y = c("Fixed Intercept", "Fixed X Coef",
           "Random Intercept (L1)", "Random Intercept (L2)"),
    .f = ~{

      temp <-
        fct_dat %>%
        dplyr::filter(., Parameter == .x)

      temp %>%
        ggplot2::ggplot(
          data = .,
          ggplot2::aes(
            y = avg_mse,
            x = form,
            color = pctmob,
            shape = cor
          )
        ) +
        ggbeeswarm::geom_beeswarm() +
        ggplot2::scale_color_viridis_d(
          name = "Percent Mobility",
          option = "D", end = 0.85
        ) +
        ggplot2::scale_x_discrete(
          name = "Intraclass Correlation"
        ) +
        ggplot2::scale_shape_discrete(
          name = "Cluster Correlation"
        ) +
        ggplot2::theme(
          legend.position = "top"
        ) +
        ggplot2::ylab("Mean Square Error of the Estimate") +
        ggplot2::ggtitle(.y) +
        ggplot2::facet_wrap(ggplot2::vars(mod))

      # save plot
      ggplot2::ggsave(
        file.path(path, "figures", paste0("avgse_wrap_", .y, ".png"))
      )
    })

Write Tables

#tab_results <- 
  c("fp_int", "fp_x1", "rp_int_v1", "rp_int_v2") %>%
  purrr::map(., ~{
    final_results %>%
      dplyr::ungroup(.) %>%
      dplyr::select(., -nsch, -avg_sig_05) %>%
      dplyr::filter(., Parameter == .x) %>%
      dplyr::mutate(
        .data = .,
        mod = dplyr::if_else(mod == 1, "HLM", "MMREM"),
        form = dplyr::case_when(
          form == 1 ~ "Null",
          form == 2 ~ "L2",
          form == 3 ~ "L1 & L2"
        )
      ) %>%
      dplyr::rename(
        .data = .,
        `Estimation Model` = mod,
        `Model Specification` = form,
        `Percent Mobility` = pctmob,
        `Cluster Correlation` = cor,
        ICC = icc,
        `Population Parameter` = avg_true_par,
        `Mean(Est)` = avg_est,
        `SD(EST)` = sd_est,
        `Mean(SE)` = avg_se,
        MSE = avg_mse,
        Coverage = avg_cov_95,
        `Mean(ESS)` = avg_ess
      ) %>%
      readr::write_csv(
        x = ., 
        file = file.path(path, "tables", paste0(.x, "_res.csv"))
      )
  })