R/format_dcpo.R

Defines functions format_dcpo

Documented in format_dcpo

#' Format DCPO Data for Estimation Using DCPO Stan Files
#'
#'  \code{format_dcpo} formats DCPO data output by \code{dcpo_setup} for use with the \code{DCPO}
#'  package's \code{dcpo} Stan files.
#'
#' @param dcpo_data a data frame of survey responses generated by \code{dcpo_setup}
#' @param scale_q a string indicating a survey question in dcpo_data to assign difficulty .5 at its cutpoint scale_cp
#' @param scale_cp an integer indicating which cutpoint for scale_q should be assigned difficulty .5
#' @param delta logical: should item difficulty be estimated to vary by question-country to account for potential lack of equivalence?
#'
#' @return a list of Stan data
#'
#' @import dplyr
#' @importFrom reshape2 dcast
#' @importFrom tibble rownames_to_column
#' @importFrom janitor clean_names
#'
#' @export

format_dcpo <- function(dcpo_data, scale_q, scale_cp, delta = TRUE) {
    # satisfy R CMD check
    country <- year <- item <- r <- n <- NULL

    dcpo_data_original <- dcpo_data
    if ("data.frame" %in% class(dcpo_data)) {
        dcpo_data <- list(dcpo_data)
    }

    # generate cumulative number of respondents with answers above each cutpoint
    dcpo_stan <- map(dcpo_data, function(df) {
        dat <- df %>%
            group_by(country, year, item) %>%
                       mutate(r = case_when(r == -1   ~ 1,
                                             r == 999 ~ max(setdiff(r, 999)),
                                             TRUE     ~ r)) %>%
            ungroup() %>%
            group_by(country, year, item, r, survey, cc_rank) %>%
            summarize(n = sum(n), .groups = "keep") %>%
            ungroup() %>%
            arrange(desc(cc_rank), country, year) %>%
            mutate(kk = as_factor(country),
                   tt = year - min(year) + 1,
                   qq = as_factor(item),
                   rr = r - 1,
                   question = item,
                   item = paste(question, r, "or higher")) %>%
            group_by(country, year, question) %>%
            arrange(desc(r), .by_group = TRUE) %>%
            mutate(y_r = round(cumsum(n)),
                   n_r = round(sum(n))) %>%
            arrange(r, .by_group = TRUE) %>%
            ungroup() %>%
            arrange(kk, tt) %>%
            filter(y_r > 0 & rr > 0)

        use_delta <- dat %>%
            group_by(qq, kk) %>%
            summarize(years = n_distinct(year)) %>%
            ungroup() %>%
            spread(key = kk, value = years, fill = 0) %>%
            mutate(countries = rowSums(.[, -1] > 1)) %>%
            mutate_at(vars(-qq, -countries),
                      ~ if_else(. > 1 & countries > 2 & qq != scale_q, 1, 0)) %>%
            select(-qq, -countries) %>%
            {if (!delta) mutate_all(., ~ 0) else .}

        scale_item_matrix <- dat %>%
            group_by(qq, rr) %>%
            summarize(n = sum(n_r)) %>%
            ungroup() %>%
            spread(key = rr, value = n, fill = 0) %>%
            janitor::clean_names() %>%
            mutate_at(vars(matches(paste0("x\\d+$"))), ~if_else(. > 0, 10, 0)) %>%
            mutate_at(vars(matches(paste0("x", scale_cp, "$"))), ~if_else(qq == scale_q, . + 1, 0)) %>%
            mutate_at(vars(matches(paste0("x\\d+$"))), ~if_else(. > 0, . - 10, 0)) %>%
            select(-qq) %>%
            as.matrix()
        stopifnot(sum(scale_item_matrix) == 1)

        one_dcpo_stan <- list( K          = max(as.numeric(dat$kk)),
                               T          = max(dat$tt),
                               Q          = max(as.numeric(dat$qq)),
                               R          = max(dat$rr),
                               N          = nrow(dat),
                               kk         = as.numeric(dat$kk),
                               tt         = as.numeric(dat$tt),
                               qq         = as.numeric(dat$qq),
                               rr         = dat$rr,
                               y_r        = dat$y_r,
                               n_r        = dat$n_r,
                               fixed_cutp = scale_item_matrix,
                               use_delta  = use_delta,
                               data       = dat,
                               data_args  = list(scale_q = scale_q, scale_cp = scale_cp, delta = delta))
    })

    if ("data.frame" %in% class(dcpo_data_original)) {
        dcpo_stan <- dcpo_stan[[1]]
    }

    return(dcpo_stan)
}
fsolt/DCPOtools documentation built on June 9, 2025, 4:10 p.m.