R/shape.r
In dgo: Dynamic Estimation of Group-Level Opinion

#' Prepare data for modeling
#'
#' This function shapes data for use in a dgirt or dgmrp model. Most
#' arguments give the name or names of key variables in the data. These
#' arguments end in \code{_name} or \code{_names} and should be character
#' vectors.
#'
#' @section Item Response Data:
#' Individual-level data giving item responses is expected as argument
#' \code{item_data}. Required arguments \code{time_name} and \code{geo_name}
#' give the names of variables in \code{item_data} that indicate time period and
#' local geographic area. Optional argument \code{group_names} gives other
#' respondent characteristics to be modeled. \code{item_data} is optional if
#' argument \code{aggregate_data} is used. Note that the \code{dgirt()} model
#' assumes consistent coding of the polarity of item responses for
#' identification.
#'
#' @section Modifier Data:
#' Data for modeling geographic hierarchical parameters can be given with
#' argument \code{modifier_data}, in which case argument \code{modifier_names}
#' is required and arguments \code{t1_modifier_names} and \code{standardize} are
#' optional.
#'
#' @section Aggregate Item Response Data:
#' \code{shape()} aggregates the individual-level item response data given as
#' \code{item_data} for modeling. Data already aggregated to the group level can
#' be provided with argument \code{aggregate_data}.
#'
#' The data given by \code{aggregate_data} must be in a long table of trial and
#' success counts indexed by item, group, and time period. The variable names
#' given by arguments \code{group_names}, \code{geo_name}, and\code{time_name}
#' should exist in \code{aggregate_data}. Three fixed variable names must also
#' appear in \code{aggregate_data}: \code{item} giving item identifiers,
#' \code{n_grp} giving counts of item-response trials, and \code{s_grp} giving
#' counts of item-response successes. These counts should be adjusted
#' consistently with the transformations applied during the aggregation by
#' \code{shape()} of the individual \code{item_data}.
#'
#' @section Reweighting:
#' Use argument \code{target_data} to adjust the weighting of groups toward
#' population targets via raking, using an adaptation of
#' \code{\link[survey]{rake}}. To adjust existing survey weights in
#' \code{item_data}, provide argument \code{weight_name}. Otherwise,
#' observations in \code{item_data} will be assigned equal starting weights.
#' Argument \code{raking} defines strata. If you pass it a list of formulas like
#' \code{list(~ x, ~ y)}, raking is first over \code{x}, then over \code{y}.
#' Given an additive formula like \code{~ x + y}, raking is over the
#' combinations of \code{x} and \code{y}. So, \code{list(~ x, ~ y + z)} is first
#' over \code{x}, then over \code{y}-\code{z} pairs. Argument
#' \code{proportion_name} is optional.
#'
#' @section Restrictions:
#' For convenience, data in \code{item_data}, \code{modifier_data},
#' \code{aggregate_data}, and \code{target_data} can be restricted (subsetted)
#' row-wise to the time periods given by argument \code{time_filter} and the
#' local geographic areas given by argument \code{geo_filter}.
#'
#' Data can also be filtered column-wise to retain item variables that appear in
#' a minimum of time periods, using argument \code{min_t_filter}, or a minimum
#' of surveys, with argument \code{min_survey_filter}. Argument
#' \code{survey_name} is required when filtering by survey.
#'
#' If both row-wise and column-wise restrictions are specified, \code{shape}
#' iterates over them until they leave the data unchanged.
#'
#' @param item_data A table in which items appear in columns and each row
#' represents an individual's responses in some time period and local geographic
#' area.
#'
#' @param item_names Item response variables.
#'
#' @param group_names Discrete grouping variables, usually demographic. Using
#'   numeric variables is allowed but not recommended.
#'
#' @param geo_name A geographic variable representing local areas.
#'
#' @param time_name A time variable with numeric values.
#'
#' @param survey_name A survey identifier.
#'
#' @param weight_name A variable giving survey weights.
#'
#' @param modifier_data Table giving characteristics of local geographic areas
#' in time periods. See details below.
#'
#' @param target_data A table giving population proportions for groups by local
#' geographic area and time period. See details below.
#'
#' @param raking A formula or list of formulas specifying the variables on which
#' to rake survey weights.
#'
#' @param max_raked_weight A maximum over which raked weights will be trimmed.
#' Only applied after raking. To trim unraked weights, manipulate the input data
#' directly.
#'
#' @param aggregate_data A table of trial and success counts by group and item.
#' See details below.
#'
#' @param aggregate_item_names A subset of values of the \code{item} variable in
#' \code{aggregate_data}, for restricting the aggregate data.
#'
#' @param id_vars Additional variables that should be included in the result,
#' other than those specified elsewhere.
#'
#' @param modifier_names Variables giving modifiers of geographic hierarchical
#' parameters in \code{modifier_data}.
#' 
#' @param t1_modifier_names Variables to be used instead of those in
#' \code{modifier_names}, only in the first period.
#' 
#' @param standardize Whether to standardize the variables given by
#' \code{modifier_names} and \code{t1_modifier_names} to be zero-mean and
#' unit-variance for performance gains. (For discussion see the Stan Language
#' Reference section "Standardizing Predictors and Outputs.")
#' 
#' @param proportion_name The variable giving population proportions
#'   for strata in \code{target_data}.
#'
#' @param geo_filter A character vector giving values of the geographic
#'   variable. Defaults to observed values.
#'
#' @param time_filter A numeric vector giving possible values of the time
#'   variable. Observed and unobserved time periods can be given. Defaults to
#'   observed values.
#' 
#' @param min_survey_filter An integer minimum of survey appearances for
#' included items.
#' 
#' @param min_t_filter An integer minimum of time period appearances for
#' included items.
#'
#' @param constant_item Whether item difficulty parameters should be constant
#'   over time.
#'
#' @param ... Further arguments.
#' 
#' @return An object of class \code{dgirtIn} expected by \code{\link{dgirt}} and
#' \code{\link{dgmrp}}.
#'
#' @examples
#' # model individual item responses
#' shaped_responses <- shape(opinion, item_names = "abortion", time_name =
#'   "year", geo_name = "state", group_names = "race3")
#'
#' # summarize result)
#' summary(shaped_responses)
#'
#' # check sparseness of data to be modeled
#' get_item_n(shaped_responses, by = "year")
#'
#' @export
shape <- function(item_data = NULL,
                  item_names = NULL,
                  time_name,
                  geo_name,
                  group_names = NULL,
                  id_vars = NULL,
                  time_filter = NULL,
                  geo_filter = NULL,
                  min_t_filter = 1L,
                  min_survey_filter = 1L,
                  survey_name = NULL,
                  modifier_data = NULL,
                  modifier_names = NULL,
                  t1_modifier_names = NULL,
                  standardize = TRUE,
                  target_data = NULL,
                  raking = NULL,
                  max_raked_weight = NULL,
                  weight_name = NULL,
                  proportion_name = "proportion",
                  aggregate_data = NULL,
                  aggregate_item_names = NULL,
                  constant_item = TRUE,
                  ...) {

  # so long as init_control requires these arguments, pass them explicitly
  ctrl <- init_control(item_data = item_data,
                       item_names = item_names,
                       time_name = time_name,
                       geo_name = geo_name,
                       group_names = group_names,
                       id_vars = id_vars,
                       time_filter = time_filter,
                       geo_filter = geo_filter,
                       min_t_filter = min_t_filter,
                       min_survey_filter = min_survey_filter,
                       survey_name = survey_name,
                       aggregate_data = aggregate_data,
                       aggregate_item_names = aggregate_item_names,
                       modifier_data = modifier_data,
                       modifier_names = modifier_names,
                       t1_modifier_names = t1_modifier_names,
                       standardize = standardize,
                       target_data = target_data,
                       raking = raking,
                       weight_name = weight_name,
                       proportion_name = proportion_name,
                       max_raked_weight = max_raked_weight,
                       constant_item = constant_item,
                       ...)

  # setDT copies of *_data 
  item_data <- set_copy_dt(item_data)
  aggregate_data <- set_copy_dt(aggregate_data)
  modifier_data <- set_copy_dt(modifier_data)
  target_data <- set_copy_dt(target_data)

  # validate inputs #
  check_targets(target_data, ctrl)
  check_modifiers(modifier_data, ctrl)
  check_aggregates(aggregate_data, ctrl)
  check_item(item_data, ctrl)

  # restrict data #
  item_data <- restrict_items(item_data, ctrl)
  ctrl@item_names <- intersect(ctrl@item_names, names(item_data))
  aggregate_data <- restrict_aggregates(aggregate_data, ctrl)
  ctrl@aggregate_item_names <- intersect(ctrl@aggregate_item_names,
    aggregate_data$item)

  # rake survey weights #
  if (length(target_data) & length(item_data)) {
    item_data <- weight(item_data, target_data, ctrl)
    ctrl@weight_name <- "raked_weight"
  }

  d_in <- init_dgirt_in(item_data, aggregate_data, modifier_data, target_data,
    ctrl)
  d_in$call <- match.call()
  d_in$pkg_version <- packageVersion("dgo")

  # validate input to model #
  check_dimensions(d_in)
  check_values(d_in)
  check_names(d_in)

  d_in
}

set_copy_dt <- function(input_data) {
  if (length(input_data)) {
    return(data.table::setDT(data.table::copy(input_data)))
  } else {
    return(NULL)
  }
}

init_dgirt_in <- function(item_data, aggregate_data, modifier_data, target_data,
  ctrl) {
  d_in <- dgirtIn$new(ctrl)

  # aggregate individual item response data to group level #
  if (length(item_data)) {
    item_data <- dichotomize(item_data, ctrl)
  }
  d_in$group_grid <- make_group_grid(item_data, aggregate_data, ctrl)
  d_in$group_grid_t <- make_group_grid_t(d_in$group_grid, ctrl)
  d_in$group_counts <- make_group_counts(item_data, aggregate_data, ctrl)
  d_in$gt_items <- unique(d_in$group_counts$item)

  # restrict modifier data given final item_data #
  modifier_data <- restrict_modifier(modifier_data, d_in$group_grid, ctrl)

  # make objects used by dgirt.stan #
  d_in$observed <- which(d_in$group_counts[["n_grp"]] > 0)
  d_in$N_observed <- length(d_in$observed)
  d_in$n_vec <- setNames(d_in$group_counts$n_grp, d_in$group_counts$name)
  d_in$s_vec <- setNames(d_in$group_counts$s_grp, d_in$group_counts$name)

  d_in$G <- nrow(unique(d_in$group_counts[, c(ctrl@geo_name, ctrl@group_names),
      with = FALSE]))
  d_in$G_hier <- ifelse(!length(modifier_data), nlevels(gl(1L, d_in$G)),
    max(unlist(length(ctrl@modifier_names)), 1L))
  d_in$T <- length(ctrl@time_filter)
  d_in$Q <- length(d_in$gt_items)

  # not yet implemented
  d_in$WT <- array(1, dim = c(d_in$T, d_in$G_hier, d_in$G))
  d_in$l2_only <- matrix(0L, nrow = length(ctrl@time_filter), ncol = d_in$Q)
  d_in$NNl2 <- array(0L, dim = c(d_in$T, d_in$Q, d_in$G_hier))
  d_in$SSl2 <- d_in$NNl2

  d_in$XX <- make_design_matrix(d_in, ctrl)
  d_in$ZZ <- shape_hierarchical_data(modifier_data, ctrl@modifier_names,
    d_in$group_grid_t, d_in$XX, ctrl)
  d_in$ZZ_prior <- shape_hierarchical_data(modifier_data,
    ctrl@t1_modifier_names, d_in$group_grid_t, d_in$XX, ctrl)
  d_in$hier_names <- dimnames(d_in$ZZ)[[2]]

  d_in$D <- ifelse(ctrl@constant_item, 1L, d_in$T)
  d_in$N <- nrow(d_in$group_counts)
  d_in$P <- ncol(d_in$ZZ)
  d_in$S <- length(unique(d_in$group_grid[[ctrl@geo_name]])) - 1
  d_in$H <- dim(d_in$ZZ)[[3]]
  d_in$Hprior <- dim(d_in$ZZ_prior)[[3]]

  # include subset data and other objects that may be useful later #
  d_in$item_data <- item_data
  d_in$modifier_data <- modifier_data
  d_in$aggregate_data <- aggregate_data
  d_in$target_data <- target_data
  d_in$control <- ctrl
  return(d_in)
}

make_design_matrix <- function(d_in, ctrl) {
  design_formula <- paste("~ 0", ctrl@geo_name, sep = " + ")
  if (length(ctrl@group_names)) {
    design_formula <- paste(design_formula, ctrl@group_names, collapse = " + ",
                            sep = " + ")
  }
  design_formula = as.formula(design_formula)
  design_matrix <- with_contr.treatment(model.matrix(design_formula,
                                                     d_in$group_grid_t))

  rn <- do.call(function(...) paste(..., sep = "__"),
                d_in$group_grid_t[, c(ctrl@geo_name, ctrl@group_names),
                                  with = FALSE])
  rownames(design_matrix) <- rn
  colnames(design_matrix) <- sub(paste0("^", ctrl@geo_name), "",
                                 colnames(design_matrix))

  design_matrix <- subset(design_matrix, select = -1)
  invalid_values <- setdiff(as.vector(design_matrix), c(0, 1))
  if (length(invalid_values)) {
    stop("design matrix values should be in (0, 1); found ",
         paste(sort(invalid_values), collapse = ", "))
  }
  design_matrix
}

with_contr.treatment <- function(...) {
  contrast_options = getOption("contrasts")
  options("contrasts"= c(unordered = "contr.treatment",
                         ordered = "contr.treatment"))
  res <- eval(...)
  options("contrasts"= contrast_options)
  res
}