R/SurveyData.R

Defines functions print_questions_and_responses print_survey_design

#' SurveyData objects
#'
#' @name SurveyData
#' @export
#' @description
#' An [R6][R6::R6Class] `SurveyData` object represents a survey and its
#' metadata. The survey itself is a data frame containing all data from the
#' survey. The `SurveyData` object also includes the survey questions and
#' responses (if left empty these will just be the column and factor level
#' names). To enable weighted comparisons, survey weights and a survey design
#' can be specified, with the survey design specified using \pkg{survey} package
#' notation.
#'
SurveyData <- R6::R6Class(
  classname = "SurveyData",
  private = list(
    survey_data_ = data.frame(NULL),
    questions_ = list(),
    responses_ = list(),
    weights_ = numeric(),
    design_ = list()
  ),
  public = list(
    #' @description Create a new `SurveyData` object using an existing data
    #'   frame and other survey information. This method is used to create the
    #'   objects for both the sample and the population data. If a population is
    #'   approximated from a large survey (like the ACS or DHS), then the
    #'   package will enable the creation of a weighted poststratification
    #'   matrix. If the population is summarized as a poststratification matrix
    #'   already, then set the weights as the size in each cell $N_j$. If the
    #'   entire individual level population data is given, then weights should
    #'   be omitted and will be automatically set to 1.
    #' @param data A data frame containing the survey data.
    #' @param questions,responses Named lists containing the text of the survey
    #'   questions and the allowed responses, respectively. The names must
    #'   correspond to the names of variables in `data`. If these aren't
    #'   provided then they will be created internally using all factor,
    #'   character, and binary variables in `data`. Responses can also be
    #'   provided as a data frame with column names `"data"` and `"asked"`,
    #'   which reflect the responses as coded in the data (`"data"`) and their
    #'   corresponding actual survey responses (`"asked"`). See **Examples**.
    #' @param weights Optionally, the name of a variable in `data` containing
    #'   survey weights.
    #' @param design Optionally, a named list of arguments (except `weights` and
    #'   `data`) to pass to `survey::svydesign()` to specify the survey design.
    #'
    #' @return A `SurveyData` object that can be used in the creation of a
    #'   [`SurveyMap`] object.
    #'
    #' @examples
    #' # Example sample data
    #' head(shape_survey)
    #'
    #' # SurveyData object for sample data
    #' box_prefs <- SurveyData$new(
    #'   data = shape_survey,
    #'   questions = list(
    #'     age = "Please identify your age group",
    #'     gender = "Please select your gender",
    #'     vote_for = "Which party did you vote for in the 2018 election?",
    #'     y = "If today is the election day, would you vote for the Box Party?"
    #'   ),
    #'   responses = list(
    #'     age = levels(shape_survey$age),
    #'     gender = levels(shape_survey$gender),
    #'     # Here we use a data frame for the responses because the levels
    #'     # in the data are abridged versions of the actual responses.
    #'     # This can be useful when surveys have brief/non descriptive responses.
    #'     vote_for = data.frame(
    #'       data = levels(shape_survey$vote_for),
    #'       asked = c("Box Party Faction A", "Box Party Faction B",
    #'                 "Circle Party Coalition", "Circle Party")
    #'     ),
    #'     y = c("no","yes")
    #'   ),
    #'   weights = "wt",
    #'   design = list(ids =~1)
    #' )
    #' box_prefs$print()
    #' box_prefs$n_questions()
    #'
    #'
    #' # Example population data
    #' head(approx_voters_popn)
    #'
    #' # SurveyData object for population if estimated from large survey
    #' popn_obj1 <- SurveyData$new(
    #'   data = approx_voters_popn,
    #'   questions = list(
    #'     age_group = "Which age group are you?",
    #'     gender = "Gender?"
    #'   ),
    #'   # order doesn't matter (gender before age here) because
    #'   # the list has the names of the variables
    #'   responses = list(
    #'     gender = levels(approx_voters_popn$gender),
    #'     age_group = levels(approx_voters_popn$age_group)
    #'   ),
    #'   weights = "wt" # use the wt column from approx_voters_popn data
    #' )
    #'
    #' # SurveyData object for population if poststratification matrix already known
    #' library(dplyr)
    #' popn_ps <- approx_voters_popn %>%
    #'   group_by(age_group,gender) %>%
    #'   summarise(N_j = sum(wt))
    #'
    #' popn_obj2 <- SurveyData$new(
    #'   data = popn_ps,
    #'   questions = list(
    #'     age_group = "Which age group are you?",
    #'     gender = "Gender?"
    #'   ),
    #'   responses = list(
    #'     gender = levels(popn_ps$gender),
    #'     age_group = levels(popn_ps$age_group)
    #'   ),
    #'   weights = "N_j"# use N_j column from popn_ps data
    #' )
    #'
    #' # SurveyData object for population if individual population data known:
    #' # (pretend that approx_voters_popn is the full population)
    #' popn_obj3 <- SurveyData$new(
    #'   data = approx_voters_popn,
    #'   questions = list(
    #'     age_group = "Which age group are you?",
    #'     gender = "Gender?"
    #'   ),
    #'   responses = list(
    #'     gender = levels(approx_voters_popn$gender),
    #'     age_group = levels(approx_voters_popn$age_group)
    #'   )
    #' )
    #' popn_obj1
    #' popn_obj2
    #' popn_obj3
    #'
    initialize = function(data,
                          questions = list(),
                          responses = list(),
                          weights = numeric(),
                          design = list(ids = ~1)) {
      if (ncol(data) == 0 || nrow(data) == 0) {
        stop("'data' cannot be empty.", call. = FALSE)
      }

      if (length(questions) == 0 && length(responses) == 0) {
        keep <- function(x) is.factor(x) || is.character(x) || length(unique(stats::na.omit(x))) == 2
        data_use <- data[, sapply(data, keep), drop = FALSE]
        questions <- setNames(as.list(colnames(data_use)), colnames(data_use))
        responses <- lapply(data_use, function(x) if (is.factor(x)) levels(x) else unique(stats::na.omit(x)))
        warning(
          "No 'questions' and 'responses' provided. ",
          "Using all factor, character, and binary variables in 'data' by default.",
          call. = FALSE
        )
      }
      if (length(responses) != length(questions)) {
        stop("Mismatch between number of survey questions and responses.",
             call. = FALSE)
      }
      if (length(questions) != length(unique(questions))) {
        stop("All elements of 'questions' must be unique.")
      }

      nms_q <- sort(names(questions))
      nms_r <- sort(names(responses))
      if (is.null(nms_q) || sum(nzchar(nms_q)) != length(nms_q)) {
        stop("All elements of 'questions' and 'responses' list must have names.", call. = FALSE)
      }
      if (length(unique(nms_q)) != length(nms_q)) {
        stop("Names in 'questions' must be unique.", call = FALSE)
      }
      if (!identical(nms_q, nms_r)) {
        stop("Names in 'questions' and 'responses' lists must be the same.")
      }
      if (!all(nms_q %in% colnames(data))) {
        stop("Names of 'questions' must match column names in 'data'.", call. = FALSE)
      }
      questions <- questions[nms_q]
      responses <- responses[nms_q]

      for (j in seq_along(questions)) {
        if(is.data.frame(responses[[j]])){
          if(!identical(colnames(responses[[j]]),c("data","asked"))){
            stop("If providing responses as data and asked questions, must be in a single two dataframe column with column names `data` and `asked`.",
                 call. = FALSE)
          }
          responses_provided <- sort(responses[[j]][["data"]])
        }else{
          responses_provided <- sort(responses[[j]])
        }
        if (is.factor(data[[nms_q[j]]])) {
          responses_in_data <- sort(levels(data[[nms_q[j]]]))
        } else {
          responses_in_data <- sort(unique(data[[nms_q[j]]]))
        }
        if (!identical(responses_provided, responses_in_data)) {
          stop(
            "Values in data do not match specified responses for variable '", nms_q[j], "'. ",
            "\nValues in 'data': ", paste(responses_in_data, collapse = ", "),
            "\nValues in 'responses': ", paste(responses_provided, collapse = ", "),
            call. = FALSE
          )
        }
      }

      if (length(weights) == 0) {
        wts <- rep(1, nrow(data))
        warning("'Weights have not been provided, assume all data weighted with weight 1.", call. = FALSE)
      } else {
        if (!is.character(weights) || !weights %in% colnames(data)) {
          stop("'weights' must be a string naming a column in 'data'.", call. = FALSE)
        }
        wts <- data[[weights]]
        data[[weights]] <- NULL
        if (anyNA(wts)) {
          stop("NAs not allowed in weights.", call. = FALSE)
        }
      }

      if (!is.list(design) ||
          (is.null(names(design)) || any(!nzchar(names(design))))) {
        stop("'design' must be a named list.", call. = FALSE)
      }
      if (!"ids" %in% names(design)) {
        stop("'design' must contain an element 'ids'.", call. = FALSE)
      }
      if ("weights" %in% names(design)) {
        stop("'design' should not include element 'weights'.")
      }
      if ("data" %in% names(design)) {
        stop("'design' should not include element 'data'.")
      }

      private$questions_ <- questions
      private$responses_ <- responses
      private$weights_ <- wts
      private$design_ <- design
      private$survey_data_ <- data.frame(.key = 1:nrow(data), data)
      invisible(self)
    },

    #' @description Number of observations in the survey data
    #' @return An integer.
    n_obs = function() nrow(private$survey_data_),

    #' @description Number of survey questions
    #' @return An integer.
    n_questions = function() length(private$questions_),

    #' @description Print a summary of the survey data
    #' @param ... Currently ignored.
    #' @return The `SurveyData` object, invisibly.
    print = function(...) {
      cat("Survey with",
          self$n_obs(), "observations,",
          self$n_questions(), "questions",
          "\n")

      print_survey_design(private$design_, private$weights_, private$survey_data_)
      print_questions_and_responses(private$questions_, private$responses_)
      invisible(self)
    },

    #' @description Add a column to the sample data. This is primarily
    #'   intended for internal use but may occasionally be useful.
    #' @param name,value The name of the new variable (a string) and the
    #' vector of values to add to the data frame.
    #' @return The `SurveyData` object, invisibly.
    add_survey_data_column = function(name, value) {
      if (length(value) != nrow(private$survey_data_)) {
        stop("New variable must have same number of observations as the survey data.",
             call. = FALSE)
      }
      private$survey_data_[[name]] <- value
      invisible(self)
    },

    #' @description Access the data frame containing the sample data.
    #' @param key Should the `.key` column be included? This column just
    #'   indicates the original order of the rows and is primarily intended
    #'   for internal use.
    #' @return A data frame.
    survey_data = function(key = TRUE) {
      if (key) {
        private$survey_data_
      } else {
        private$survey_data_[, colnames(private$survey_data_) != ".key", drop = FALSE]
      }
    },

    #' @description Access the list of survey questions
    #' @return A named list.
    questions = function() private$questions_,

    #' @description Access the list of allowed survey responses
    #' @return A named list.
    responses = function() private$responses_,

    #' @description Access the survey weights
    #' @return A numeric vector.
    weights = function() private$weights_,

    #' @description Access the survey design
    design = function() private$design_
  )
)


# internal ----------------------------------------------------------------

# print 1-line summary of survey design
print_survey_design <- function(design, weights, data) {
  svy_design <- do.call(survey::svydesign, c(design, list(weights = weights, data = data)))
  svy_design$call <- NULL
  cat(utils::capture.output(print(svy_design))[1], "\n")
}

print_questions_and_responses <- function(questions, responses) {
  for (i in seq_along(questions)) {
    cat("\nColumn label:", names(questions)[i], "\n")
    cat("Question:", questions[[i]], "\n")
    if(is.data.frame(responses[[i]])){
      responses_provided <- responses[[i]][["data"]]
      responses_asked <- responses[[i]][["asked"]]
      cat("Allowed answers:", paste(responses_provided, "(", responses_asked , ")",collapse = ", "), "\n")
    }else{
      responses_provided <- responses[[i]]
      cat("Allowed answers:", paste(responses_provided, collapse = ", "), "\n")
    }
  }
}
lauken13/mrpkit documentation built on Aug. 6, 2023, 3:42 a.m.