R/tidy_redcap.R

Defines functions rc_tidy rc_read_api rc_read_csv rc_format_api rc_format_variables

Documented in rc_format_api rc_format_variables rc_read_api rc_read_csv rc_tidy

#' Clean variable types of raw data export via CSV
#'
#' @param data Data exported from REDCap via csv.
#' @param dictionary Data dictionary exported from REDCap via csv.
#' @param yesno Determine how to return REDCap 'Yes - No' fields; options include 'factor' (default), 'numeric', or 'logical'.
#'
#' @return The REDCap dataset with variable types cleaned.
#' @importFrom redcapAPI redcapFactorFlip
#' @export
#'

rc_format_variables <- function(data, dictionary, yesno = "logical") {

  ## Tidy data dictionary names
  dictionary_tidy_names <-  c(
    "field_name", "form_name", "section_header", "field_type", "field_label",
    "select_choices_or_calculations", "field_note",
    "text_validation_type_or_show_slider_number", "text_validation_min",
    "text_validation_max", "identifier", "branching_logic",
    "required_field", "custom_alignment", "question_number",
    "matrix_group_name", "matrix_ranking", "field_annotation")

  if(!identical(names(dictionary), dictionary_tidy_names))
    names(dictionary) <- dictionary_tidy_names

  ## Fields actually present in downloaded dataset
  cols_data <- names(data)


## Character fields (replace blanks with NA)
  cols_chr <- names(data)[sapply(data, is.character)]
  for (x in cols_chr)
    data[, x][data[, x] == ""] <- NA


## Dates and date-time variables to Dates or POSIXct variables, correcting empty date fields
   cols_dttm <- intersect(
    cols_data, c(
      dictionary$field_name[grepl("datetime_", dictionary$text_validation_type_or_show_slider_number)],
      grep("timestamp", names(data), value = TRUE))
  )

  for(col in cols_dttm) {
    data[, col] <- as.character(data[, col]) |> as.POSIXct(format = "%Y-%m-%d %H:%M")
  }

  ## Logical variables (yesno radios and checkboxs)
  if(yesno != "factor") {

    cols_yn <- intersect(
      ## Ensure returned names are in the extracted dataset
      cols_data,

      unique(
        ## Retrieve names of all binary fields coded as 0, 1
        dictionary$field_name[

            dictionary$select_choices_or_calculations %in% c(
              "0, Incorrect | 1, Correct", "1, Correct | 0, Incorrect",
              "0, No | 1, Yes", "1, Yes | 0, No",
              "1, True | 0, False", "0, False | 1, True")])

    )

    ## To logical
    data[, cols_yn] <- lapply(data[, cols_yn], \(x) tolower(x) == "yes" | tolower(x) == "correct" | tolower(x) == "true")

    ## To numeric (0 or 1)
    if(yesno == "numeric")
      data[, cols_yn] <- lapply(data[, cols_yn], as.logical(x) |> as.integer())

  }

  return(data)

}

#' Clean variable types following API call
#'
#' @param data Data exported from REDCap via API call.
#' @param dictionary Data dictionary exported from REDCap API call.
#' @param yesno Determine how to return REDCap 'Yes - No' fields; options include 'factor' (default), 'numeric', or 'logical'.
#'
#' @return The REDCap dataset with variable types cleaned.
#' @importFrom redcapAPI redcapFactorFlip
#' @export
#'

rc_format_api <- function(data, dictionary, yesno = "logical") {

  ## Tidy data dictionary names
  dictionary_tidy_names <-  c(
    "field_name", "form_name", "section_header", "field_type", "field_label",
    "select_choices_or_calculations", "field_note",
    "text_validation_type_or_show_slider_number", "text_validation_min",
    "text_validation_max", "identifier", "branching_logic",
    "required_field", "custom_alignment", "question_number",
    "matrix_group_name", "matrix_ranking", "field_annotation")

  if(!identical(names(dictionary), dictionary_tidy_names))
    names(dictionary) <- dictionary_tidy_names

  ## Fields actually present in downloaded dataset
  cols_data <- names(data)


  ## Character fields (replace blanks with NA)
  cols_chr <- names(data)[sapply(data, is.character)]
  for (x in cols_chr)
    data[, x][data[, x] == ""] <- NA


  ## Dates and date-time variables to Dates or POSIXct variables, correcting empty date fields
  cols_dttm <- intersect(
    cols_data, c(
      dictionary$field_name[grepl("datetime_", dictionary$text_validation_type_or_show_slider_number)],
      grep("timestamp", names(data), value = TRUE))
  )

  for(col in cols_dttm) {
    data[, col] <- as.character(data[, col]) |> as.POSIXct(format = "%Y-%m-%d %H:%M")
  }

  ## Logical variables (yesno radios and checkboxs)
  if(yesno != "factor") {

    cols_yn <- intersect(
      ## Ensure returned names are in the extracted dataset
      cols_data,

      unique(
        ## Retrieve names of all binary fields coded as 0, 1
        dictionary$field_name[

          dictionary$select_choices_or_calculations %in% c(
            "0, Incorrect | 1, Correct", "1, Correct | 0, Incorrect",
            "0, No | 1, Yes", "1, Yes | 0, No",
            "1, True | 0, False", "0, False | 1, True")])

    )

    ## To logical
    data[, cols_yn] <- lapply(data[, cols_yn], \(x) tolower(x) == "yes" | tolower(x) == "correct" | tolower(x) == "true")

    ## To numeric (0 or 1)
    if(yesno == "numeric")
      data[, cols_yn] <- lapply(data[, cols_yn], as.logical(x) |> as.integer())

  }

  return(data)

}



#' Imported REDCap csv files and combine into single list.
#'
#' Clean raw .csv data exported from a REDCap database, and export a list of data.frames
#'
#' @param folder The folder containing the files downloaded from the REDCap
#'   database. The four files required include the Data Dictionary, Events
#'   (under the 'Define my events' tab), Instrument Mappings (under the
#'   'Designate Instruments for My Events' tab), and the raw Record data (under
#'   the 'My Reports & Exports' tab). All files should be saved as .csv.
#' @param yesno Determine how to return REDCap 'Yes - No' fields; options include 'factor' (default), 'numeric', or 'logical'.
#' @param longitudinal Is the study longitudinal or not. If longitudinal, then event and instrument mapping files should be provided as well.
#' @param tidy_format Tidy variable formats or not?
#'
#' @return A named list containing four data frames: dd = metadata, evnt = Events, inst = Instrument mappings, rcrd = Records.
#' @importFrom utils read.csv
#' @importFrom checkmate assert_true test_true
#' @export
#'

rc_read_csv <- function(folder, yesno = "logical", longitudinal = NULL, tidy_formats = TRUE) {

  files_csv <- list.files(folder, ".csv")

  ## Check that the required files are present
  checkmate::assert_true(any(grepl("_DATA_", files_csv)), .var.name = "REDCap Data file")
  checkmate::assert_true(any(grepl("_DataDictionary_", files_csv)), .var.name = "REDCap Data dictionary file")


  ## Check if study is longitudinal and if required files exist
  if(is.null(longitudinal)) {

    files_long <- checkmate::test_true(
      any(grepl("Events", files_csv)) &
        any(grepl("Instrument", files_csv)))


    if(files_long)
      longitudinal = TRUE
    else
      longitudinal = FALSE

  } else {

    if(longitudinal) {

      checkmate::assert_true(any(grepl("_Events_", files_csv)), .var.name = "REDCap Events file")
      checkmate::assert_true(any(grepl("_Instruments_", files_csv)), .var.name = "REDCap Instrument file")

    }
  }


  ## Collect names of input data files ----------------------------------------
  Inputs <- list(
    # 1 Import Data dictionary
    dd = sort(list.files(folder, "Dictionary"), decreasing = TRUE)[1],

    # 2 Import events
    evnt = sort(list.files(folder, "Events"), decreasing = TRUE)[1],

    # 3 Import instrument mappings
    inst = sort(list.files(folder, "Instrument"), decreasing = TRUE)[1],

    # 4 Import REDCap record data
    rcrd = sort(list.files(folder, "DATA"), decreasing = TRUE)[1]
  )

  Inputs <- Inputs[!sapply(Inputs, is.na)]

  ## Import data
  object <- lapply(file.path(folder, Inputs), utils::read.csv, stringsAsFactors = FALSE)
  names(object) <- names(Inputs)

  ## Tidy data dictionary names
  names(object$dd) <-  c(
    "field_name", "form_name", "section_header", "field_type", "field_label",
    "select_choices_or_calculations", "field_note",
    "text_validation_type_or_show_slider_number", "text_validation_min",
    "text_validation_max", "identifier", "branching_logic",
    "required_field", "custom_alignment", "question_number",
    "matrix_group_name", "matrix_ranking", "field_annotation")

  ## Tidy formatting of variables in dataset
  if(tidy_formats)
  {
    object$rcrd <- rc_format_variables(data = object$rcrd, dictionary = object$dd, yesno = yesno)
  }

  ## Return object
  return(object)

}



#' Read in record and metadata from REDCap using redcapAPI
#'
#' A wrapper for importing data Records, data dictionary, events, and instrument
#' designation mappings from REDCap. Uses the redcapAPI package.
#'
#' @param url URL for a REDCap database API. Check your institution's REDCap
#'   documentation for this address.
#' @param token REDCap API token
#' @param cast Passed to `redcapAPI::exportRecordsTyped`, determines how different types of fields are returned. By default, all checkbox and yesno fields are returned as logical.
#' @param labels Passed to `redcapAPI::exportRecordsTyped`, determines if the variable labels are applied to the data frame. `FALSE` by default.
#'
#' @return A named list containing four dataframes: dd = metadata, evnt =
#'   Events, inst = Instrument mappings, rcrd = Records.
#' @importFrom redcapAPI redcapConnection exportMetaData exportEvents exportMappings exportRecords
#' @export
#'

rc_read_api <- function(url, token, labels = FALSE, cast = list(
  system   = redcapAPI::castRaw,
  date_    = as.Date,
  yesno    = function(x, ...) x == '1' | tolower(x) == 'yes',
  checkbox = function(x, ...) redcapAPI::castRaw(x, ...) |> as.logical()
)) {

  rcon <- redcapAPI::redcapConnection(url=url, token=token)
  #redcapAPI::exportBundle(rcon)

  ## Read in data dictionary, event and instrument tables, and raw data from REDCap
  object <- list(

    dd   = redcapAPI::exportMetaData(rcon),
    evnt = redcapAPI::exportEvents(rcon),
    inst = redcapAPI::exportMappings(rcon),
    rcrd = redcapAPI::exportRecordsTyped(
      rcon, labels = labels, dag = TRUE, cast = cast
  ))

  ## Tidy formatting of variables in dataset
  object$rcrd <- rc_format_api(data = object$rcrd, dictionary = object$dd)

  return(object)

}



#' Tidy list of datasets from REDCap.
#'
#'
#' @param object An named list containing the following data.frames; metadata (dd),
#' events (evnt), instruments (inst) and records (rcrd).
#' @param ids Names of identifiers, for inclusion on all output datasets.
#' @param label Add labels to variables. Supply name of labelling package,
#'   \code{Hmisc} or \code{sjlabelled}.
#' @param label_checkbox If labels are applied, how should checkbox items be labelled?
#' If set to \code{TRUE} then choice labels are returned, if \code{FALSE} then the field label
#' is returned for all items. If a character value is provided, the both the field label and choick label
#' are return separated by the character value.
#' @param repeated How shall repeated forms be treated in datasets assembled by
#'   event. Options are: exclude, include, or nest. Nest uses `tidyr` package to
#'   collapse by row id.
#'
#' @return A list of dataframes, with variables grouped by event or by data collection form.
#' @importFrom stats setNames na.omit
#' @importFrom Hmisc label
#' @importFrom sjlabelled set_label
#' @importFrom labelled var_label
#' @importFrom tidyr nest
#' @export
#'

rc_tidy <- function(object, ids = NULL, label = FALSE, label_checkbox = TRUE, repeated = "exclude") {

  ## If not a longitudinal project, create single event containing all forms
  if(
    (is.null(object$evnt) | !"evnt" %in% names(object)) &
    (is.null(object$inst) | !"inst" %in% names(object)) &
    !"redcap_event_name" %in% names(object$rcrd)
  ) {

    ### Create single event
    object$evnt <- data.frame(
      event_name = "All instruments",
      arm_num    = 1,
      day_offset = 0, offset_min = 0, offset_max = 0,
      unique_event_name = "all_instruments_arm_1",
      custom_event_label = "all_instruments"
    )

    ### Allocate all forms to single event
    object$inst = data.frame(
      arm_num = 1,
      unique_event_name = "all_instruments_arm_1",
      form = unique(object$dd$form_name)
    )

    ### Add event name as second column in record data.frame
    object$rcrd <- cbind(
      object$rcrd[, 1, drop = FALSE],
      redcap_event_name = "all_instruments_arm_1",
      object$rcrd[,-1, drop = FALSE]
    )

  }


  ## If ID columns not specified take first column and any beginning with 'redcap'.
  if(is.null(ids))
    ids <- names(object$rcrd)[[1]]
  ids_rc <- c(ids, grep("^redcap", names(object$rcrd), value = TRUE, ignore.case = TRUE))

  # Create list of datasets ----------------------------------------------------
  dat_ed <- vector("list", 2)
  names(dat_ed) <- c("form", "event")

  ## By form (data collection instrument) --------------------------------------
  ### All forms
  forms <- unique(object$dd$form_name)
  ### Repeating forms
  forms_rpt <- unique(object$rcrd$redcap_repeat_instrument[!is.na(
    object$rcrd$redcap_repeat_instrument
  )])

  if(is.null(forms_rpt))
    repeated = "exclude"

  ## Form completion
  cols_cmp <- intersect(
    ## All possible form names
    paste0(forms, "_complete"),
    ## Forms actually used
    grep("_complete$", names(object$rcrd), value = TRUE))


  if(any(sapply(object$rcrd[, cols_cmp], is.numeric))) {

    object$rcrd[, cols_cmp] <- lapply(object$rcrd[, cols_cmp], function(x)
      factor(x, levels = 0:2, labels = c("Incomplete", "Unverified", "Complete")))

  } else {

    object$rcrd[, cols_cmp] <- lapply(
      object$rcrd[, cols_cmp], factor, levels = c("Incomplete", "Unverified", "Complete"))

  }

  ## Form timestamps
  cols_tmstmp <- intersect(paste0(forms, "_timestamp"), names(object$rcrd))
  ### Drop timestamp values recorded as 'not completed'. (Done already during previous step)
  # object$rcrd[, cols_tmstmp][object$rcrd[, cols_tmstmp] == "[not completed]"] <- NA
  # object$rcrd[, cols_tmstmp] <- lapply(object$rcrd[, cols_tmstmp], as.POSIXct)

  ## Create list with name, columns, events, and repeating status by form
  form_data <- sapply(forms, function(form) {

    vars  <- object$dd$field_name[object$dd$form_name %in% form]
    cols  <- unlist(sapply(vars, function(x)
      grep(
        paste(paste0("^", x, c("$", "___")), collapse = "|")
        , names(object$rcrd), value = TRUE
      )))
    ## Add form timestamp and complete columns. Ensure names are in dataset
    cols <- c(cols, paste(form, c("timestamp", "complete"), sep = "_"))
    cols <- intersect(cols, names(object$rcrd))

    if("evnt" %in% names(object))
      events <- object$inst$unique_event_name[object$inst$form == form]
      # events <- unique(object$rcrd$redcap_event_name)

    else
      events <- NA
    repeating <- form %in% forms_rpt
    return(list(name = form, vars = vars, cols = cols, events = events, repeating = repeating))

  }, simplify = FALSE)

  ## Create list of data by form
  dat_ed$form <-  lapply(form_data, function(form){

    if(form$repeating) {
      data  <- object$rcrd[
        object$rcrd$redcap_repeat_instrument %in% form$name
        , unique(c(ids_rc, form$cols))]
    } else if (!is.null(forms_rpt)) {
      data  <- object$rcrd[
        is.na(object$rcrd$redcap_repeat_instrument)
        , unique(c(ids_rc, form$cols))]
    } else {
      data  <- object$rcrd[, unique(c(ids_rc, form$cols))]
    }

    ## Only run for longitudinal project
    if("redcap_event_name" %in% names(object$rcrd)) {
      data <- data[data$redcap_event_name %in% form$events,]

      ## Add in days offset column for each event (if exists)
      if(!is.null(object$evnt$day_offset)) {
        col_rc_evnt <- grep("redcap_event_name", names(data))

        redcap_event_day_offset <- as.numeric(as.character(
          factor(
            data$redcap_event_name, levels = object$evnt$unique_event_name,
            labels = object$evnt$day_offset)
        ))

        data <- cbind(
          data[, 1:col_rc_evnt],
          redcap_event_day_offset,
          data[, (col_rc_evnt + 1):ncol(data)])
      }
    }

    ## Add labels if requested
    if(label != FALSE) {

      labs <- sapply(names(data), function(x) {

        ## Find original name of checkboxes
        x_ = gsub("___[0-9]+$", "", x)

        if(x_ %in% object$dd$field_name) {

          x_dd <- object$dd[object$dd$field_name %in% x_,]

          ## Labelling options for checkboxes.
          if(!is.null(x_dd) & x_dd$field_type %in% "checkbox") {

            cb_opts <- trimws(strsplit(x_dd$select_choices_or_calculations, "|", fixed = TRUE)[[1]])
            cb_codes <- sapply(
              strsplit(cb_opts, ", ", fixed = TRUE), function(x)
                setNames(x[[2]], x[[1]]))
            cb_label <- cb_codes[sub(".*___([0-9]+)$", "\\1", x)]

            ## Combine checkbox field label and choice value
            if(is.character(label_checkbox))
              return(paste(x_dd$field_label, sep = label_checkbox, cb_label))
            ## Just return choice value
            else if(is.logical(label_checkbox) & label_checkbox)
              return(unname(cb_label))
            ## Just return field labbel
            else
              return(x_dd$field_label)
          } else {
            return(object$dd$field_label[object$dd$field_name %in% x])
          }} else {

            return(NA_character_)
          }})

      if(label == "Hmisc") {
        Hmisc::label(data[!is.na(unlist(labs))], self = FALSE) <- stats::na.omit(unlist(labs))
      } else if(grepl("^sj", label)) {
        data <- sjlabelled::set_label(data, label = unlist(labs))
      } else if(label == "labelled") {
        labelled::var_label(data) <- labs[!is.na(labs)]
      }
    }

    return(data)

  })



  ## Create list of datasets by event name -------------------------------------
  if("evnt" %in% names(object)) {

    # events <- object$evnt$unique_event_name Not all defined events have instuments attached to them.
    events <- unique(object$inst$unique_event_name)
    # events <- setNames(unique(object$rcrd$redcap_event_name), unique(object$rcrd$redcap_event_name))

    dat_ed$event <- sapply(events, function(event) {

      ## Approach 2: Identify required forms and columns
      forms  <- form_data[sapply(form_data, function(form) event %in% form$events)]
      cols   <- sapply(forms, "[[", "cols")

      ### Separate by whether repeating forms are included and how they should be treated
      if(is.null(forms_rpt) | repeated == "include") {

        ## Keep all columns
        data <- object$rcrd[
          object$rcrd$redcap_event_name == event
          , unique(c(ids_rc, unlist(cols)))]

      } else {

        cols_rpt   <- sapply(forms[ sapply(forms, "[[", "repeating")], "[[", "cols")
        cols_norpt <- sapply(forms[!sapply(forms, "[[", "repeating")], "[[", "cols")

        ## Keep only non-repeated columns. (Repeated == "exclude". Default.)
        data <- object$rcrd[
          object$rcrd$redcap_event_name == event &
            is.na(object$rcrd$redcap_repeat_instrument)
          , unique(c(ids, unlist(cols_norpt)))]

        if(repeated == "nest") {

          ## Keep and nest repeated columns
          dats_rpt <- sapply(names(cols_rpt), function(x) {

            dat_t1 <- object$rcrd[
              object$rcrd$redcap_event_name == event &
                object$rcrd$redcap_repeat_instrument %in% x
              , unique(c(ids, "redcap_repeat_instance", unlist(cols_rpt[x])))]

            tidyr::nest(dat_t1, !! x := -ids)

          }, simplify = FALSE)


          data <- Reduce(function(...) merge(..., all = TRUE, by = ids, sort = FALSE)
                         , c(norpt = list(data), dats_rpt))

        }

      }

      if(label != FALSE) {

        labs <- sapply(names(data), function(x) {

          ## Find original name of checkboxes
          x_ = gsub("___[0-9]+$", "", x)

          if(x_ %in% object$dd$field_name) {

            x_dd <- object$dd[object$dd$field_name %in% x_,]

            ## Labelling options for checkboxes.
            if(!is.null(x_dd) & x_dd$field_type %in% "checkbox") {

              cb_opts <- trimws(strsplit(x_dd$select_choices_or_calculations, "|", fixed = TRUE)[[1]])
              cb_codes <- sapply(
                strsplit(cb_opts, ", ", fixed = TRUE), function(x)
                  setNames(x[[2]], x[[1]]))
              cb_label <- cb_codes[sub(".*___([0-9]+)$", "\\1", x)]

              ## Combine checkbox field label and choice value
              if(is.character(label_checkbox))
                return(paste(x_dd$field_label, sep = label_checkbox, cb_label))
              ## Just return choice value
              else if(is.logical(label_checkbox) & label_checkbox)
                return(unname(cb_label))
              ## Just return field label
              else
                return(x_dd$field_label)
            } else {
              return(object$dd$field_label[object$dd$field_name %in% x])
            }} else {

              return(NA_character_)
            }})

        if(label == "Hmisc") {
          Hmisc::label(data[!is.na(unlist(labs))], self = FALSE) <- stats::na.omit(unlist(labs))
        } else if(grepl("^sj", label)) {
          data <- sjlabelled::set_label(data, label = unlist(labs))
        } else if(label == "labelled") {
          labelled::var_label(data) <- labs[!is.na(labs)]
        }
      }

      return(data)

    }, simplify = FALSE)

  }

  return(dat_ed)

}
jwilliman/redcapTidy documentation built on Nov. 22, 2023, 7:07 a.m.