ccesMRPprep: Functions and Data to Prepare CCES data for MRP

Documented in get_cces_question

#' Get one column of outcome questions for each model
#'
#' @param qcode A string, the variable name of the outcome variable of interest,
#'  exactly as it appears in the CCES dataset. Together with \code{year}, uniquely identifies
#'  the outcome question data
#' @param year A string for the year the question was asked, which defines the
#' the _dataset_. If using a dataset from the cumulative, set to \code{"cumulative"}.
#' Together with \code{qcode}, uniquely identifies the outcome question data
#' @param qID A string, the user's unique name for the question. For example, we use the
#'  syntax in our question table.
#' @param y_named_as What to name the response variable. Defaults to "response", which is also
#'  the default assumption for \link{cces_join_slim}.
#' @param dataframe An optional argument to pass the entire CCES dataframe in-environment.
#' If left empty, this will supersede the arguments \code{year} and \code{dataframe} and
#' use the provided datasets instead.
#' @param data_dir A path for the directory where flat files for the CCES common content is
#'  stored.  Currently the data must be of the form `cces_{year}.rds`  (e.g. `"cces_2016.rds"`)
#'  and it must exist wherever \code{data_dir} is. We recommend looping over \link{get_cces_dataverse}
#'  to download data first.
#' @param verbose whether to print a message; defaults to TRUE
#'
#' @details This transformation currently only supports Yes/No questions. For some common
#'  ordinal questions that can, with manual recodes, be set to a Yes/No question,
#'  there is some hard-coding of the recode in the question. In the future, this should be
#'  defined outside of the function in a taxonomy.
#'
#' @return A three-column dataframe with the columns
#' \describe{
#' \item{year}{The year}
#' \item{case_id}{The respondent identifier within year}
#' \item{qID}{The question ID}
#' \item{response}{The outcome question, of class factor}
#' }
#' The object will also have an attribute called \code{question}, which will save
#' the question identifier \code{qID}
#'
#' @import dplyr
#' @importFrom magrittr `%>%`
#' @importFrom rlang .data
#' @importFrom readr read_rds
#' @importFrom glue glue
#' @importFrom fs path
#'
#'
#' @examples
#'
#'  # with sample data
#'  get_cces_question("pid3", dataframe = cc18_samp, year = 2018, qID = "pid3")
#'
#'  # need data/input/cces/cces_2018.rds to run this
#'  \dontrun{
#'   get_cces_question(qcode = "CC18_326", year = "2018", qID = "TCJA")
#'
#'   # A tibble: 60,000 x 4
#'   # year   case_id qID   response
#'   # <int>     <int> <chr> <fct>
#'   # 1  2018 123464282 TCJA  Support
#'   # 2  2018 170169205 TCJA  Support
#'   # 3  2018 175996005 TCJA  Support
#'   # 4  2018 176818556 TCJA  Oppose
#'   # 5  2018 202120533 TCJA  Oppose
#'   # 6  2018 226449148 TCJA  Oppose
#'   # 7  2018 238205342 TCJA  Oppose
#'   # 8  2018 238806466 TCJA  Support
#'   # 9  2018 267564481 TCJA  Support
#'   # … with 59,991 more rows
#'  }
#'
#' @export
get_cces_question <- function(qcode,
                              year,
                              qID,
                              dataframe = NULL,
                              y_named_as = "response",
                              data_dir = "data/input/cces",
                              verbose = TRUE) {

  # data
  if (is.null(dataframe)) {
    if (verbose)
      cat(glue("Attempting to read in data from local flat file in the directory {data_dir}") , "\n")
    cces_year <- read_rds(path(data_dir, glue("cces_{year}.rds")))
  }

  if (!is.null(dataframe)) {
    if (verbose)
      cat("Using the dataframe provided", "\n")
    cces_year <- dataframe
  }

  # check
  stopifnot(qcode %in% colnames(cces_year))

  # rename outcome to "response"
  cces_resp <- cces_year %>%
    select(matches("case_id$"), response = !!qcode)

  # turnout is special. If missing, means they didn't turn out.
  if (qcode == "CL_2018gvm") {
    cces_resp$response <- recode(as.integer(cces_resp$response), .missing = "No", .default = "Yes")
  }

  # we will code Rs as 1 for CL_party
  if (qcode == "CL_party") {
    cces_resp$response <- recode(as.integer(cces_resp$response), `11` = "Yes", .missing = "No", .default = "No")
  }

  if (qcode == "pid7") {
    cces_resp$response <- recode(as.integer(cces_resp$response), `4` = "Yes", .missing = "No", .default = "No")
  }

  if (qcode == "ideo5") {
    cces_resp$response <- recode(as.integer(cces_resp$response), `3` = "Yes", .missing = "No", .default = "No")
  }

  cces_df <- cces_resp %>%
    transmute(year = as.integer(year),
              case_id = as.character(.data$case_id),
              qID = qID,
              !!sym(y_named_as) := as_factor(.data$response))

  attr(cces_df, "question") <- qID

  return(cces_df)
}