R/redcap_data.R
In REDCapDM: 'REDCap' Data Management

Documented in redcap_data

#' Read REDCap data
#'
#' @description
#' This function allows users to read datasets from a REDCap project into R for analysis, either via export of the data or via an API connection.
#'
#' The REDCap API is an interface that allows communication with REDCap and server without going through the interactive REDCap interface.
#'
#' @note If you will give further use to the package, we advise you to use the argument 'dic_path' to read your dictionary, as all other functions need it in order to run properly.
#'
#' @param data_path Character string with the pathname of the R file to read the dataset from.
#' @param dic_path Character string with the pathname of the dictionary.
#' @param event_path Character string with the pathname of the file containing the correspondence between each event and each form (it can be downloaded through the `Designate Instruments for My Events` tab inside the `Project Setup` section of REDCap)
#' @param uri The URI (Uniform Resource Identification) of the REDCap project.
#' @param token Character vector with the generated token.
#' @param filter_field Character vector with the fields of the REDCap project desired to import into R (API connection only)<.
#' @return List containing the dataset and the dictionary of the REDCap project. If the event_path is specified, it will also contain a third element with the correspondence of the events & forms of the project.
#'
#' @note To read exported data, you must first use REDCap's 'Export Data' function and select the 'R Statistical Software' format. It will then generate a CSV file with all the observations and an R file with the necessary code to complete each variable's information.
#'
#' @examples
#' \dontrun{
#' # Exported files from REDCap
#'
#' dataset <- redcap_data(data_path = "C:/Users/username/example.r",
#'                        dic_path = "C:/Users/username/example_dictionary.csv",
#'                        event_path = "C:/Users/username/events.csv")
#'
#' # API connection
#'
#' dataset_api <- redcap_data(uri = "https://redcap.idibell.cat/api/",
#'                            token = "55E5C3D1E83213ADA2182A4BFDEA") # This token is fictitious
#'
#' }
#' @export

redcap_data<-function(data_path = NA, dic_path = NA, event_path = NA, uri = NA, token = NA, filter_field = NA)
  {
  oldwd <- getwd()
  on.exit(setwd(oldwd))

  # Warning: data_path, dic_path and another argument are specified.
  if(all(!c(data_path, dic_path) %in% NA) & any(!c(token, uri) %in% NA)){
    stop("Too many arguments, if you want to read exported data from REDCap use only the arguments data_path and dic_path", call. = FALSE)
  }

  # Warning: token, uri and another argument are specified.
  if(all(!c(token, uri) %in% NA) & any(!c(data_path, dic_path) %in% NA)){
    stop("Too many arguments, if you want to read data from REDCap through an API connection use only the arguments uri and token.", call. = FALSE)
  }

  # Warning: either uri or token is specified alone
  if ((!is.na(uri) & is.na(token)) | (is.na(uri) & !is.na(token))) {
    stop("If you want to read data from REDCap through an API connection, both 'uri' and 'token' arguments must be provided.", call. = FALSE)
  }

  # Read data, dictionary and event-form mapping in case of exported data.
  if(all(!c(data_path, dic_path) %in% NA) & all(c(token, uri) %in% NA)){

    # Evaluate the extension of the data_path
    extension_data <- tools::file_ext(data_path)

    if (extension_data != "R") {
      stop("Unsupported file format. Only R files are supported. Please specify the downloaded R file from REDCap within this argument.")
    }

    # Read data
    tmp_env <- new.env()
    file.lines <- scan(data_path, what = character(), skip = 2, sep = '\n', quiet = TRUE)
    file.lines.collapsed <- paste(file.lines, collapse = '\n')
    command <- paste0("dirname(parent.frame(2)$", "data_path", ")")
    setwd(eval(parse(text = command)))
    source(textConnection(file.lines.collapsed), local = tmp_env, encoding = "UTF-8")
    data <- get("data", envir = tmp_env)
    if (names(data)[1]!="record_id") {
      names(data)[1] <- "record_id"
    }

    # Read dictionary
    setwd(oldwd)

    # Evaluate the extension of the dictionary_path
    extension_dic <- tools::file_ext(dic_path)

    if (extension_dic == "xlsx") {

      # Read XLSX file
      dic <- openxlsx::read.xlsx(dic_path, colNames = F, detectDates = T)

    } else if (extension_dic == "csv") {

      # Read CSV file
      dic <- utils::read.csv(dic_path, encoding = "UTF-8", header = FALSE)

    } else {

      stop("Unsupported file format. Only XLSX and CSV are supported.")

    }

    # Changing names of the first column and first observation
    names(dic) <- dic[1,]
    dic <- dic[-1,]
    names(dic) <- janitor::make_clean_names(names(dic))
    names(dic)[1] <- "field_name"
    if (dic[1,1]!="record_id") {
      dic[1,1] <- "record_id"
    }

    # Remove descriptive variables from dictionary
    if ("descriptive" %in% dic$field_type) {
      dic <- dic %>% dplyr::filter(!.data$field_type %in% "descriptive")
    }


    # Indicator of longitudinal projects
    longitudinal <- ifelse("redcap_event_name" %in% names(data), TRUE, FALSE)

    #Read event file
    if(!is.na(event_path)){

      setwd(oldwd)

      # Evaluate the extension
      extension <- tools::file_ext(event_path)

      if (extension == "xlsx") {

        # Read XLSX file
        event_form <- openxlsx::read.xlsx(event_path, detectDates = T)

      } else if (extension == "csv") {

        # Read CSV file
        event_form <- utils::read.csv(event_path, encoding = "UTF-8")

      } else {

        stop("Unsupported file format. Only XLSX and CSV are supported.")

      }

      data_def <- list(data = data, dictionary = dic, event_form = event_form)

    }else{

      #If no event is specified and the project is longitudinal
      if(longitudinal){
        warning("The project contains more than one event. You might want to load the event-form correspondence using the argument event_path.")
      }

      data_def <- list(data = data, dictionary = dic)
    }

  }

  # Read data, dictionary and event-form mapping in case of an API connection.
  if(all(!c(token, uri) %in% NA) & all(c(data_path, dic_path) %in% NA)){

    # Message
    message("Importing in progress...")

    # First read the labels

    ## Error SSL peer certificate (Github issue #6)

    tryCatch({
      if (all(filter_field %in% NA)) {

        labels <- suppressMessages(REDCapR::redcap_read(redcap_uri = uri, token = token, verbose = FALSE, raw_or_label = "label", raw_or_label_headers = "label", export_data_access_groups = TRUE)$data)

      } else {

        labels <- suppressMessages(REDCapR::redcap_read(redcap_uri = uri, token = token, verbose = F, raw_or_label = "label", raw_or_label_headers = "label", export_data_access_groups = TRUE, fields = filter_field)$data)

      }
    },
    error = function(e) {
      if (grepl("SSL peer certificate", e$message)) {
        stop("Unable to establish a secure connection due to an SSL certificate problem.\nConsider adding the following line of code to bypass SSL certificate verification: httr::with_config(httr::config(ssl_verifypeer = FALSE), ... <- readcap_data(...)).\n", call. = F)
      } else {
        stop(e)
      }
    })

    if (nrow(labels) > 0) {
      names(labels)[1] <- "record_id"
    } else {
      stop("Observational data retrieval is currently unavailable. Please verify the status of the REDCap server or confirm the existence of records within the project.", call. = F)
    }

    # Save the factor version of the default variables of redcap
    redcap_names <- names(labels %>%
                            dplyr::select(dplyr::any_of(c("Event Name", "Repeat Instrument", "Data Access Group"))))

    default_names <- data.frame(fac = redcap_names) %>%
      dplyr::mutate(corres = dplyr::case_when(fac %in% "Event Name" ~ "redcap_event_name.factor",
                                              fac %in% "Repeat Instrument" ~ "redcap_repeat_instrument.factor",
                                              fac %in% "Data Access Group" ~ "redcap_data_access_group.factor"))

    rename_redcap <- default_names$fac
    names(rename_redcap) <- default_names$corres

    main_vars <- labels %>%
      dplyr::mutate_at(redcap_names[!redcap_names %in% "Repeat Instrument"], ~forcats::fct_inorder(.)) %>%
      dplyr::rename(dplyr::all_of(rename_redcap)) %>%
      dplyr::select("record_id", default_names$corres)

    # Remove the "...number" suffixes from the labels
    labels <- gsub("\\.{3}\\d+$", "", names(labels))


    # Message
    message("Almost done...")

    # Read data using the API connection

    if (all(filter_field %in% NA)) {
      data_api <- REDCapR::redcap_read_oneshot(redcap_uri = uri, token = token, verbose = FALSE, raw_or_label = "raw", export_data_access_groups = TRUE)$data
    } else {
      data_api <- REDCapR::redcap_read_oneshot(redcap_uri = uri, token = token, verbose = FALSE, raw_or_label = "raw", export_data_access_groups = TRUE, fields = filter_field)$data
    }

    if (nrow(data_api) > 0) {
      names(data_api)[1] <- "record_id"
    } else {
      stop("Observational data retrieval is currently unavailable. Please verify the status of the REDCap server or confirm the existence of records within the project.", call. = F)
    }

    # Read dictionary using the API connection
    dic_api <- REDCapR::redcap_metadata_read(redcap_uri = uri, token = token, verbose = FALSE)$data

    names(dic_api)[1] <- "field_name"

    ## Making sure the names of both dictionaries(exported data and API connection) match
    names(dic_api)[names(dic_api) %in% c("select_choices_or_calculations", "branching_logic", "question_number")] <- c("choices_calculations_or_slider_labels", "branching_logic_show_field_only_if", "question_number_surveys_only")

    # Apply labels
    data_api <- as.data.frame(purrr::map2(data_api, labels, ~labelled::set_variable_labels(.x, .y, .strict = FALSE)))

    # Remove descriptive variables from dictionary
    if ("descriptive" %in% dic_api$field_type) {
      dic_api <- dic_api %>% dplyr::filter(!.data$field_type %in% "descriptive")
    }

    # If filter_field is described, filter the variables in the dictionary
    if (!all(filter_field %in% NA)) {
      dic_api <- dic_api %>% dplyr::filter(.data$field_name %in% filter_field)
    }

    # Identify checkboxes fields and convert them to factor using the dictionary as guide

    if (sum(dic_api$field_type %in% "checkbox") > 0) {

      var_check <- names(data_api)[grep("___", names(data_api))]

      data_api <- data_api %>%
        dplyr::mutate(dplyr::across(dplyr::all_of(var_check), ~ factor(., levels = c("0", "1"), labels = c("Unchecked", "Checked")), .names = "{col}.factor"))

    }

    # Identify radio buttons and dropdown fields and convert them to factor using the dictionary as guide

    if (sum(dic_api$field_type %in% c("radio", "dropdown")) > 0) {

      var_radio <- dic_api %>%
        dplyr::filter(.data$field_type %in% c("radio", "dropdown")) %>%
        dplyr::select("field_name", "field_type", "choices_calculations_or_slider_labels")%>%
        dplyr::mutate(factor = purrr::map(.data$choices_calculations_or_slider_labels, ~stringr::str_split(.x, "\\|") %>% unlist %>% trimws),
                      levels = purrr::map(factor, ~gsub(",.*", "", .x)),
                      labels = purrr::map(factor, ~gsub("^[^,]*,\\s*", "", .x)))

      for (i in var_radio$field_name) {
        tryCatch(
          {
            data_api[[stringr::str_glue("{i}.factor")]] <- factor(data_api[[i]],
                                                                  levels = c(var_radio$levels[[which(var_radio$field_name %in% i)]]),
                                                                  labels = c(var_radio$labels[[which(var_radio$field_name %in% i)]]))
          },
          error = function(e) {
            warning(stringr::str_glue("The following variable could not be replicated in its factor version: {i}. Please manually create a factor version of this variable named '{i}.factor' to properly execute the rd_transform() function."), call. = F)
          }
        )
      }
    }

    # Join the main_vars to the imported data

    data_api <- data_api %>%
      dplyr::bind_cols(main_vars %>% dplyr::select(-"record_id"))

    # Indicator of longitudinal projects
    longitudinal <- ifelse("redcap_event_name" %in% names(data_api), TRUE, FALSE)

    # Read event file
    if(!is.na(event_path)){

      # Warning: event_path not necessary while using API connection
      warning("The event_path argument is not necessary as the event-form correspondence can be automatically read with the API connection")

      setwd(oldwd)

      # Evaluate the extension
      extension <- tools::file_ext(event_path)

      if (extension == "xlsx") {

        # Read XLSX file
        event_form <- openxlsx::read.xlsx(event_path, detectDates = T)

      } else if (extension == "csv") {

        # Read CSV file
        event_form <- utils::read.csv(event_path, encoding = "UTF-8")

      } else {

        stop("Unsupported file format. Only XLSX and CSV are supported.")

      }

      data_def <- list(data = data_api,
                       dictionary = dic_api,
                       event_form = event_form)
    } else {

      # If the event file is not specified, the function reads it using the API connection (in case of longitudinal projects)
      if(longitudinal){

        event_form <- as.data.frame(REDCapR::redcap_event_instruments(redcap_uri = uri, token = token, verbose = FALSE)$data)

        data_def <- list(data = data_api[, !(grepl("_complete", names(data_api)))],
                         dictionary = dic_api,
                         event_form = event_form)

      } else {

        data_def <- list(data = data_api,
                         dictionary = dic_api)

      }

    }

    # Message
    message("Done!")
  }

  # Specifying the "UTF-8" encoding to each character column of the data
  for (i in 1:length(data_def$data)) {
    if(is.character(data_def$data[, i])){
      suppressWarnings(data_def$data[, i] <- stringr::str_conv(data_def$data[, i], "UTF-8"))
    }
  }

  # Output
  return(data_def)

}