R/aggregate_item_responses.r

Defines functions make_group_grid make_group_grid_t make_group_counts count_items calc_design_effects

make_group_grid <- function(item_data, aggregate_data, ctrl) {
  # Make a table giving combinations of the grouping variables
  group_grid <- expand.grid(c(
    setNames(list(ctrl@time_filter), ctrl@time_name),
    lapply(rbind(item_data[, c(ctrl@geo_name, ctrl@group_names), with = FALSE],
                 aggregate_data[, c(ctrl@group_names, ctrl@geo_name), with = FALSE]),
           function(x) sort(unique(x)))), stringsAsFactors = FALSE)
  data.table::setDT(group_grid, key = c(ctrl@group_names, ctrl@time_name, ctrl@geo_name))
  invisible(group_grid)
}

make_group_grid_t <- function(group_grid, ctrl) {
  # Make a table giving combinations of grouping variables, excluding time
  group_grid_t <- data.table::copy(group_grid)[, ctrl@time_name := NULL]
  group_grid_t <- group_grid_t[!duplicated(group_grid_t)]
  data.table::setkeyv(group_grid_t, c(ctrl@group_names, ctrl@geo_name))
  group_grid_t
}

make_group_counts <- function(item_data, aggregate_data, ctrl) {
  # Make a table giving success and trial counts by group and item.
  #
  # Because of how DGIRT Stan code iterates over the data, the result must be
  # ordered by time, item, and then group. The order of the grouping variables
  # doesn't matter.
  if (length(item_data)) {
    gt_names <- attr(item_data, "gt_items")
    item_data[, c("n_responses") := list(rowSums(!is.na(.SD))),
              .SDcols = gt_names]
    if (!length(ctrl@weight_name)) {
      item_data[, weight := 1L]
      ctrl@weight_name <- "weight"
    }
    item_data[, c("def") := lapply(.SD, calc_design_effects),
              .SDcols = ctrl@weight_name,
              by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]

    # get design-effect-adjusted nonmissing response counts by group and item
    item_n <- item_data[, lapply(.SD, count_items, get("n_responses"), get("def")),
                        .SDcols = c(gt_names),
                        by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
    # append _n_grp to the response count columns
    item_n_vars <- paste0(gt_names, "_n_grp")
    names(item_n) <- replace(names(item_n), match(gt_names, names(item_n)), item_n_vars)
    data.table::setkeyv(item_n, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
    drop_cols <- setdiff(names(item_n), c(key(item_n), item_n_vars))
    if (length(drop_cols)) {
      item_n[, c(drop_cols) := NULL]
    }

    # get mean ystar
    item_data[, c("adj_weight") := get(ctrl@weight_name) / get("n_responses")]
    item_means <- item_data[, lapply(.SD, function(x) weighted.mean(x, .SD$adj_weight, na.rm = TRUE)),
                            .SDcols = c(gt_names, "adj_weight"),
                            by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
    # append _mean to the mean response columns
    item_mean_vars <- paste0(gt_names, "_mean")
    names(item_means) <- replace(names(item_means), match(gt_names, names(item_means)), item_mean_vars)
    data.table::setkeyv(item_means, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
    drop_cols <- setdiff(names(item_means), c(key(item_means), item_mean_vars))
    if (length(drop_cols)) {
      item_means[, c(drop_cols) := NULL]
    }

    # join response counts with means
    count_means <- item_n[item_means]
    count_means <- count_means[, c(ctrl@time_name, ctrl@geo_name,
                                     ctrl@group_names, item_mean_vars,
                                     item_n_vars), with = FALSE]

    # the group success count for an item is the product of its count and mean
    item_s_vars <- paste0(gt_names, "_s_grp")
    count_means[, c(item_s_vars) := round(count_means[, (item_mean_vars), with = FALSE] *
                                           count_means[, (item_n_vars), with = FALSE], 0)]
    count_means <- count_means[, -grep("_mean$", names(count_means)), with = FALSE]


    # we want a long table of successes (s_grp) and trials (n_grp) by group and
    # item; items need to move from columns to rows
    melted <- melt(count_means, id.vars = c(ctrl@time_name, ctrl@geo_name,
                                             ctrl@group_names),
                   variable.name = "item")
    melted[, c("variable") := list(gsub(".*([sn]_grp)$", "\\1", get("item")))]
    melted[, c("item") := list(gsub("(.*)_[sn]_grp$", "\\1", get("item")))]
    f <- as.formula(paste0(paste(ctrl@time_name, ctrl@geo_name,
                                 paste(ctrl@group_names, collapse = " + "),
                                 "item", sep = "+"), " ~ variable"))
    counts <- data.table::dcast.data.table(melted, f, drop = FALSE, fill = 0L)
  }

  # include aggregates, if any
  if (length(item_data) && length(aggregate_data) && nrow(aggregate_data) > 0) {
    # invariant: we have both individual- and aggregate-level item responses
    counts <- data.table::rbindlist(list(counts, aggregate_data), use.names =
                                    TRUE)
    message("Added ", length(ctrl@aggregate_item_names), " items from aggregate data.")
  } else if (length(aggregate_data) && nrow(aggregate_data) > 0) {
    # invariant: we have only aggregate-level item responses
    # aggregate_data is already in the expected format
    counts <- aggregate_data
    message("Using ", length(ctrl@aggregate_item_names), " items from aggregate data.")
  } else if (!length(item_data)) {
    # invariant: we unexpectedly have neither individual- nor aggregate-level data
    stop("can't proceed with neither item_data nor aggregate_data")
  }

  data.table::setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names,
      ctrl@geo_name))

  # include unobserved cells
  all_groups = expand.grid(c(setNames(list(unique(counts[[ctrl@geo_name]])), ctrl@geo_name),
                             setNames(list(ctrl@time_filter), ctrl@time_name),
                             lapply(counts[, c(ctrl@group_names,
                                                     "item"), with = FALSE],
                                    function(x) sort(unique(x)))),
                           stringsAsFactors = FALSE)
  counts <- merge(counts, all_groups, all = TRUE)

  # unobserved cells should be zeroed
  counts[is.na(get("s_grp")), c("s_grp") := 0]
  counts[is.na(get("n_grp")), c("n_grp") := 0]

  # create an identifier for use in n_vec and s_vec
  counts[, c("name") := do.call(paste, c(.SD, sep = "__")), .SDcols =
               c(ctrl@time_name, ctrl@geo_name, ctrl@group_names, "item")]

  setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names, ctrl@geo_name))
  counts
}

count_items <- function(x, n_responses, def) {
  ceiling(sum(as.integer(!is.na(x)) / n_responses / def, na.rm = TRUE))
}

calc_design_effects <- function(x) {
  y <- 1 + (sd(x, na.rm = T) / mean(x, na.rm = T)) ^ 2
  ifelse(is.na(y), 1, y)
}
jamesdunham/dgirt documentation built on May 18, 2019, 11:19 a.m.