R/read_survey.R
In qualtRics: Download 'Qualtrics' Survey Data

Documented in read_survey

#' Read a CSV file exported from Qualtrics
#'
#' Reads comma separated CSV files generated by Qualtrics software. The second
#' line containing the variable labels is imported. Repetitive introductions to
#' matrix questions are automatically removed. Variable labels are stored as
#' attributes.
#'
#' @param file_name String. A CSV data file.
#' @param import_id Logical. If `TRUE`, use Qualtrics import IDs instead of
#'   question IDs as column names. Defaults to `FALSE`.
#' @param strip_html Logical. If `TRUE`, then remove HTML tags from variable
#'   descriptions. Defaults to `TRUE`.
#' @param time_zone String. A local timezone to determine response date values.
#'   Defaults to `NULL` which corresponds to UTC time. See ["Dates and
#'   Times"](https://api.qualtrics.com/) from Qualtrics for more information on
#'   format.
#' @param legacy Logical. If `TRUE`, then import "legacy" format CSV files (as
#'   of 2017). Defaults to `FALSE`.
#' @param add_column_map Logical. If `TRUE`, then a column map data frame will
#'   be added as an attribute to the main response data frame. This column map
#'   captures Qualtrics-provided metadata associated with the response download,
#'   such as an item description and internal ID's. Defaults to `TRUE`.
#' @param add_var_labels Logical. If `TRUE`, then the item description from each
#'   variable (equivalent to the one in the column map) will be added as a
#'   "label" attribute using [sjlabelled::set_label()]. Useful for reference as
#'   well as cross-compatibility with other stats packages (e.g., Stata, see
#'   documentation in `sjlabelled`). Defaults to `TRUE`.
#' @param col_types Optional. This argument provides a way to manually overwrite
#'   column types that may be incorrectly guessed. Takes a [readr::cols()]
#'   specification. See example below and [readr::cols()] for formatting
#'   details. Defaults to `NULL`.
#'
#' @importFrom sjlabelled set_label
#' @importFrom jsonlite fromJSON
#' @importFrom purrr map
#' @importFrom purrr imap
#' @importFrom purrr map_dfr
#' @importFrom tidyr unite
#' @importFrom tidyr everything
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr pivot_wider
#' @importFrom stringr str_split_fixed
#' @importFrom readr read_csv
#' @importFrom readr locale
#' @importFrom readr type_convert
#' @importFrom dplyr select
#' @importFrom dplyr slice
#' @importFrom rlang set_names
#'
#' @return A data frame. Variable labels are stored as attributes. They are not
#'   printed on the console but are visibile in the RStudio viewer.
#' @export
#' @examples
#' \dontrun{
#' # Generic use of read_survey()
#' df <- read_survey("<YOUR-PATH-TO-CSV-FILE>")
#' }
#' # Example using current data format
#' file <- system.file("extdata", "sample.csv", package = "qualtRics")
#' df <- read_survey(file)
#'
#' # Example using legacy data format
#' file <- system.file("extdata", "sample_legacy.csv", package = "qualtRics")
#' df <- read_survey(file, legacy = TRUE)
#'
#' # Example changing column type
#' file <- system.file("extdata", "sample.csv", package = "qualtRics")
#' # Force EndDate to be a string
#' df <- read_survey(file, col_types = readr::cols(EndDate = readr::col_character()))
#'
read_survey <-
  function(file_name,
           strip_html = TRUE,
           import_id = FALSE,
           time_zone = NULL,
           legacy = FALSE,
           add_column_map = TRUE,
           add_var_labels = TRUE,
           col_types = NULL
  ) {


    # START UP: CHECK ARGUMENTS PASSED BY USER ----

    # Ignore import_id if legacy = TRUE
    if (import_id & legacy) {
      rlang::warn(c("Using import IDs as column names is not supported for legacy response files.",
                    "Defaulting to user-defined variable names",
                    "Set import_id = FALSE in future."))
      import_id = FALSE
    }


    # Check time_zone and set to system timezone if left unspecified
    time_zone <- checkarg_time_zone(time_zone)

    # check if file at file_name exists:
    checkarg_file_name(file_name)

    # Check col_types argument is the right type:
    checkarg_col_types(col_types)

    # Check other arguments:
    checkarg_isboolean(strip_html)
    checkarg_isboolean(import_id)
    checkarg_isboolean(legacy)
    checkarg_isboolean(add_column_map)
    checkarg_isboolean(add_var_labels)


    # READ RAW DATA ----

    # import raw data excluding variable names (row 1)
    # variable JSON (row 2, v3 only)
    # and descriptions (row 3, or 2 if legacy)

    rawdata <-
      suppressMessages(
        readr::read_csv(
          file = file_name,
          col_types = readr::cols(.default = readr::col_character()),
          na = c("")
        ))

    # If Qualtrics adds an empty column at the end, remove it
    if (grepl(",$", readLines(file_name, n = 1))) {
      rawdata <- rawdata[, 1:(ncol(rawdata) - 1)]
    }

    data <-
      process_raw_survey(
        rawdata = rawdata,
        import_id = import_id,
        time_zone = time_zone,
        col_types = col_types,
        add_column_map = add_column_map,
        add_var_labels = add_var_labels,
        strip_html = strip_html,
        legacy = legacy
      )


  }

process_raw_survey <-
  function(
    rawdata,
    import_id,
    time_zone,
    col_types,
    add_column_map,
    add_var_labels,
    strip_html,
    legacy = FALSE
  ){

    # Identify metadata rows - first row for legacy, first 2 rows for modern:
    if(legacy){
      header_rows <- 1
    } else {
      header_rows <- 1:2
    }

    # CREATE RESPONSE DATA FRAME ----

    # Remove metadata rows:
    responsedata <-
      dplyr::slice(rawdata, -header_rows)

    # Infer data types from data:
    responsedata <-
      readr::type_convert(
        responsedata,
        locale = readr::locale(tz = time_zone),
        col_types = col_types,
        na = character()
      )


    # GENERATE COLUMN MAP ----

    # Take the first two rows (or just the first if legacy)
    colmapdata <-
      dplyr::slice(rawdata, header_rows)

    # Create the column map:
    if(!legacy){

      # Add a reference column:
      colmapdata <-
        dplyr::mutate(colmapdata,
                      metadata_type = c("description", "JSON"))

      # Pivot twice to create the column:
      col_map <-
        tidyr::pivot_longer(colmapdata,
                            -metadata_type,
                            names_to = "qname")
      col_map <-
        tidyr::pivot_wider(col_map,
                           names_from = "metadata_type",
                           values_from = "value")

      # Process the JSON column into other columns, dropping the raw JSON:
      col_map <-
        dplyr::mutate(col_map,
                      purrr::map_dfr(JSON, jsonlite::fromJSON),
                      .keep = "unused")

      # If choiceId does not exist, create it for consistency:
      if(!rlang::has_name(col_map, "choiceId")){
        col_map$choiceId <- NA
      }

    } else {
      # If legacy, just create a simple column map with name and description:
      col_map <-
        pivot_longer(colmapdata,
                     tidyr::everything(),
                     names_to = "qname",
                     values_to = "description")
    }

    # If desired, clean variable labels in column map

    if (strip_html) {
      col_map$description <-
        remove_html(col_map$description)
    }

    # New columns in column map for main and sub questions from description:

    col_map <-
      dplyr::mutate(col_map,
                    tibble::as_tibble(
                      # Separate out descriptions based on whether there's a " - " separator
                      # Only separates a single time
                      stringr::str_split_fixed(description, "\\s-\\s", n = 2),
                      # Add names
                      .name_repair = ~c("main", "sub")
                    ),
                    # Store after variable description:
                    .after = description
      )

    # ASSIGNING QID'S IF import_id = TRUE ----
    if (import_id) {

      # Rename variables to be "ImportId_ChoiceId" rather than user-defined variable names:
      qid_names <-
        tidyr::unite(col_map,
                     col = qidnames,
                     c(ImportId, choiceId),
                     sep = "_",
                     na.rm = TRUE)[["qidnames"]]

      # Change the response data and column map to use these QID names:
      names(responsedata) <- qid_names
      col_map$qname <- qid_names

    }


    # FINAL CLEANUP ----

    # Add descriptions to data as attribute "label"
    if(add_var_labels){
      responsedata <-
        sjlabelled::set_label(responsedata, col_map$description)
    }

    # Add column map:
    if(add_column_map){
      attr(responsedata, "column_map") <- col_map
    }

    # RETURN ----

    return(responsedata)


  }