R/R03-REDCap.R

Defines functions camr_check_name_conventions rename_redcap_vars validate_var_name camr_redcap_download camr_download_redcap camr_redcap_read

Documented in camr_check_name_conventions camr_download_redcap camr_redcap_download camr_redcap_read rename_redcap_vars validate_var_name

# Functions for REDCap
# Written by...
#   Michael Pascale
#   William Schmitt
# Maintained by...
#   Kevin Potter
# email:
#   kpotter5@mgh.harvard.edu
# Please email us directly if you
# have any questions or comments
# Last updated 2023-03-12

# Table of contents
# 1) Functions to read from REDCap
#   1.1) camr_redcap_read
#   1.2) camr_download_redcap
#     1.2.1) Download Project Information
#     1.2.2) Download Raw Data
#     1.2.3) Download Metadata
#     1.2.4) Download Form-Event Map
#     1.2.5) Generate README
# 2) Functions for Naming Conventions
#   2.1) validate_var_name
#   2.2) rename_redcap_vars
#   2.3) camr_ckdict
#     2.3.1) Validate Field Name
#     2.3.2) Validate VARNAME
#     2.3.3) Check VARNAME Format
#     2.3.4) Check Field Type Against VARNAME
#     2.3.5) Check Quality Control Fields
#     2.3.6) Construct New VARNAME
#   2.4) camr_check_name_conventions
# 3) Functions to Process REDCap Data
#  3.1) camr_pivot_redcap_eav

#### 1) Functions to read from REDCap ####

##### 1.1) camr_redcap_read #####
#' Read All Records From a REDCap Project
#'
#' This function seeks to mimic the `redcap_read()`
#' function for the (now defunct) `REDCapR` package,but
#' with a more robust output that can handle bizarre characters
#' and calculated fields. This function handles all REST API calls
#' using httr. This function also uses calls to [furrr] and [future]
#' for downloading batches from REDCap. The [plan][future::plan()]
#' is not specified. Instead, the user should use the plan that
#' works best for their specific use.
#'
#' @param redcap_uri The URI (uniform resource identifier) of the
#'   REDCap project. Required.
#'
#' @param token The user-specific string that serves to authenticate
#'   the call to the REST API. Required.
#'
#' @param raw_or_label A string (either `'raw'` or `'label'`) which
#'   specifies whether the exported data should be the raw
#'   coded/numerical values or the human-readable labels.
#'   Default is label.
#'
#' @param batch_size An integer indicating the number of records to
#'   download at once.
#'
#' @return A list which contains the following elements:
#' * `data`: A data frame containing all records and all fields.
#' * `success`:A logical indicating the success of the download.
#' * `metaData`: A data frame containing the REDCap metadata for all fields.
#'
#' @author William Schmitt
#'
#' @export
#' @md

camr_redcap_read = function(
    redcap_uri,
    token,
    raw_or_label = 'label',
    batch_size = 100) {

  # Download REDCap Metadata (e.g. data dictionary) to determine
  # unique identifier field.
  metaDtfResp <- camr_post_form_JSON_to_dtf(
    url = redcap_uri,
    params = list(
      token = token,
      content = 'metadata',
      format = 'json',
      returnFormat = 'json'
    )
  )
  metaDtf <- metaDtfResp$data

  # Extract unique identifier field
  uniId <- metaDtf[1, 1]

  # Notify user
  message(paste(
    'REDCap Metadata Download.',
    'Unique field identified:',
    uniId
  ))

  # Use unique id to get list of participants
  ptListResp <- camr_post_form_JSON_to_dtf(
    url = redcap_uri,
    params = list(
      token = token,
      content = 'record',
      format = 'json',
      returnFormat = 'json',
      type = 'flat',
      fields = uniId
    )
  )
  ptList <- ptListResp$data

  # Get list of pts in batches
  ptList <- unlist(unique(ptList[, 1]))
  ptList <- split(ptList, ceiling(seq_along(ptList)/batch_size))

  # Notify user
  message(paste(
    purrr::reduce(purrr::map_int(ptList, length), sum),
    'records identified.',
    'Downloading',
    length(ptList),
    'batches of',
    batch_size,
    'records.'
  ))

  # Initialize parameter list for each download
  baseParams <- list(
    token = token,
    content = 'record',
    format = 'json',
    returnFormat = 'json',
    type = 'flat',
    rawOrLabel = raw_or_label
  )

  # Function to download a batch
  downloadBatch <- function(ids, baseParams, pb) {

    # Build list of record #s to filter data
    curIds <- as.list(ids)
    curIdNames <- paste0(
      'records[',
      0:(length(curIds) - 1),
      ']'
    )
    names(curIds) <- curIdNames

    # Create parameter vector
    paramList <- append(baseParams, curIds)
    datResp <- camr_post_form_JSON_to_dtf(
      url = redcap_uri,
      params = paramList
    )

    # Update progress bar
    pb()

    # Return
    return(datResp$data)
  }

  # Download each batch with a progress bar indicator
  progressr::with_progress({
    # Initialize pb
    progressr::handlers('progress')
    pb <- progressr::progressor(length(ptList))
    pb(amount = 0)
    # Get data
    dat <- furrr::future_map_dfr(ptList, downloadBatch, baseParams, pb)
  })

  # Get numeric fields
  intFields <- dplyr::filter(
    metaDtf,
    stringr::str_detect(text_validation_type_or_show_slider_number, 'integer')
  )
  numFields <- dplyr::filter(
    metaDtf,
    stringr::str_detect(text_validation_type_or_show_slider_number, 'number')
  )
  calcFields <- dplyr::filter(
    metaDtf,
    field_type == 'calc'
  )

  # Typecast fields

  # Int validation --> integers
  dat <- dplyr::mutate(
    dat,
    dplyr::across(
      intFields$field_name,
      as.integer
    )
  )

  # Num validation --> numeric
  dat <- dplyr::mutate(
    dat,
    dplyr::across(
      numFields$field_name,
      as.numeric
    )
  )

  # Calculated fields --> numeric
  dat <- dplyr::mutate(
    dat,
    dplyr::across(
      calcFields$field_name,
      as.numeric
    )
  )
  # Package output
  message('Done.')
  out <- list(
    data = dat,
    success = 1,
    metaData = metaDtf
  )

  return(out)
}

##### 1.2) camr_download_redcap #####
#' Download Data From a REDCap Project
#'
#' Function to download data from a REDCap
#' Project and create a list of outputs.
#'
#' @param chr_rc_uri A character string, the URL
#'   to use for the \code{httr::POST} function.
#' @param chr_rc_token A character string, the
#'   user's API token for the specified REDCap
#'   project.
#' @param lgl_raw Optional. Download data in raw format.
#' Defaults to TRUE.
#'
#' @return A list consisting of...
#' \itemize{
#'   \item{'description'} {String. Explains the project from
#'     which the data was downloaded}.
#'   \item{'database'}{List. Information regarding the REDCap
#'     project.}
#'   \item{ 'data' }{Dataframe. All REDCap data with one
#'     datapoint per row.}
#'   \item{ 'metadata' }{Dataframe. REDCap data dictionary.}
#'   \item{ 'form_event_map' }{Dataframe. Mappings between
#'     REDCap events and instruments administered at each.
#'     This will be returned for longitudinal databases only.}
#' }
#'
#' @export

camr_download_redcap <- function(
    chr_rc_uri = "",
    chr_rc_token = "",
    lgl_raw=TRUE) {

  if ( chr_rc_uri == "" ) {

    chr_rc_uri <- Sys.getenv('API_REDCAP_URI')

  }

  if ( chr_rc_token == "" ) {

    chr_rc_token <- Sys.getenv('API_REDCAP_TOKEN')

  }

  checkmate::assert_string(chr_rc_uri, pattern = '^https://')
  checkmate::assert_string(
    chr_rc_token, n.chars = 32, pattern = '[0-9A-F]{32}'
  )

  #### 1.2.1) Download Project Information ####

  message('Downloading project information...')
  lst_formdata <- list(
    token = chr_rc_token,
    content = 'project',
    format = 'json'
  )

  rsp_response <- httr::POST(
    chr_rc_uri,
    body = lst_formdata, encode = "form"
  )
  lst_database <- httr::content(rsp_response)


  #### 1.2.2) Download Raw Data ####

  message('Downloading data...')

  lst_formdata <- list(
    token = chr_rc_token,
    content = 'record',
    action = 'export',
    format = 'csv',             # Download in CSV Format
    type = 'eav',               # Entity-Attribute-Value/Long Form
    csvDelimiter = 'tab',
    rawOrLabel = ifelse(lgl_raw, 'raw', 'label'),
    rawOrLabelHeaders = 'raw',
    exportCheckboxLabel = 'false',
    exportSurveyFields = 'false',
    exportDataAccessGroups = 'false'
  )

  dtm_init <- lubridate::now()

  rsp_response <- httr::POST(
    chr_rc_uri, body = lst_formdata, encode = "form"
  )

  # Read all columns as character types. We will use metadata
  # to infer correct types later on.
  # Trim whitespace and convert empty strings to NA.
  df_data_eav <- httr::content(
    rsp_response, as = 'parsed',
    type = 'text/tab-separated-values',
    encoding = 'UTF-8',
    trim_ws = TRUE,
    col_types = 'c', na = ''
  ) |> dplyr::filter(
    !is.na(value)
  )

  #### 1.2.3) Download Metadata ####

  message('Downloading metadata...')

  lst_formdata <- list(
    token = chr_rc_token,
    content = 'metadata',
    format = 'csv'
  )

  rsp_response <- httr::POST(
    chr_rc_uri, body = lst_formdata, encode = "form"
  )

  # UTF-8 encoding must be specified or httr::content
  # may segfault on "multibyte character".
  df_metadata <- httr::content(
    rsp_response, encoding = 'UTF-8', col_types='c'
  )


  #### 1.2.4) Download Form-Event Map ####
  df_form_event_map <- NULL
  if (as.logical(lst_database$is_longitudinal)) {
    message('Downloading form-event map...')

    lst_formdata <- list(
      token = chr_rc_token,
      content = 'formEventMapping',
      format = 'csv'
    )

    rsp_response <- httr::POST(
      chr_rc_uri, body = lst_formdata, encode = "form"
    )
    df_form_event_map <- httr::content(
      rsp_response,
      col_types =
        readr::cols(unique_event_name = 'c', form = 'c', arm_num = 'i')
    )
  }

  #### 1.2.5) Generate README ####

  chr_description <- stringr::str_glue(
    paste0(
      'MGH Center for Addiction Medicine\n',
      '{ lst_database$project_title }\n',
      'IRB No. { lst_database$project_irb_number }\n',
      '\n',
      'Unprocessed REDCap dataset.\n',
      '\n',
      'PID:        { lst_database$project_id }\n',
      'Timestamp:  { dtm_init |> lubridate::with_tz(\'UTC\') ',
      '|> lubridate::format_ISO8601(precision=\'ymdhms\', usetz=TRUE) }\n',
      'Username:   { Sys.getenv(\'USER\', Sys.getenv(\'USERNAME\')) }\n',
      'Contents:   { nrow(df_data_eav) } datapoints\n'
    ) )

  out <- list(
    description=chr_description,
    database=lst_database,
    data=df_data_eav |>
      dplyr::left_join(df_metadata |> dplyr::select(
        field_name, form_name), by='field_name'),
    metadata=df_metadata,
    form_event_map=df_form_event_map
  )

  return( out )
}

#' @rdname camr_download_redcap
#' @export
camr_redcap_download <- function(...) {
  camr_download_redcap(...)
}

#### 2) Functions for Naming Conventions ####

##### 2.1) validate_var_name #####
#' Validate That a String Conforms to Naming Standards
#'
#' Validate a string conforms to the Center for Addiction Medicine (CAM) R
#' data-frame column naming standard. The CAM name standard is
#' GGG.TTT.Optional.Var_name. This function maintains the ability to check if
#' `TTT` conforms to the standard for R types or for REDCap field types and is
#' vectorized over the input.
#'
#' The CAM Name Standard: GGG.TTT.Optional.Var_name.
#' \itemize{
#'   \item `GGG` is the variable's group, which roughly corresponds to data
#'     of a similar nature or type. Examples include: IDS for identification
#'    variables, SBJ for subject demographic variables, SSS for important
#'    study details, INV for inventory measures, etc.
#'   \item`TTT` is the variable's type, which corresponds to either a
#'     specific R type or a specific REDCap field type. Find permissible
#'     types below:
#'     \itemize{
#'       \item Possible R Types:
#'         \itemize{
#'           \item `INT`: An integer value.
#'           \item `DBL`: A double or non-integer numeric value.
#'           \item `CHR`: A character string.
#'           \item `DAT`: A date-time value. TODO: Decide on exact date-time
#'           object.
#'           \item `LGC`: A boolean value.
#'         }
#'       \item Possible REDCap Types:
#'         \itemize{
#'           \item `DAT`: Date-time fields.
#'           \item `INT`: Integer fields.
#'           \item `DBL`: Decimal/non-integer numeric fields.
#'           \item `CHR`: All other text or notes box fields.
#'           \item `MCQ`: Multiple choice drop-down lists and radio buttons.
#'           \item `CKB`: Checkbox fields.
#'           \item `YNQ`: Yes/No Questions.
#'           \item `FLE`: Signature and file-upload fields.
#'           \item `VAS`: Slider or visual analog scale (VAS) fields.
#'           \item `DSC`: Descriptive text fields.
#'         }
#'     }
#'   \item `Optional`: An optional, short, additional grouping identifier. This
#'   is most often indicated for inventories, to allow for additional grouping
#'   by the questionnaire/inventory itself. Multiple additional group identifier
#'   are allowed, so long as they maintain a period separator and contain only
#'   an alphanumeric sequence (i.e. no `_`). Case should be sensible; this
#'   section should either be all caps (e.g. abbreviations) or only the first
#'   letter should be capitalized.
#'   \item `Var_name`: A short, descriptive name for the field that would be
#'   appropriate for figure/table labeling. If the name consists of multiple
#'   words, the first word should be capitalized and subsequent words should be
#'   separated by `_` and appear in all lower-case. The one exception is
#'   if the entire word (e.g. an abbreviation) is capitalized.
#' }
#'
#' @param str A string vector of names to validate.
#' @param type A string of either "REDCap" or "R" to indicate how the type
#' section of the name should be validated.
#'
#' @author William Schmitt
#'
#' @return A boolean vector of equal length that specifies whether each element
#' passed validation.
#'
#' @export

validate_var_name <- function(str, type = "R") {

  ## CONSTANTS

  # Define R and REDCap allowed types
  rTypes <- c("INT", "DBL", "CHR", "DAT", "LGC")
  rcTypes <- c(
    "DAT",
    "INT",
    "DBL",
    "CHR",
    "MCQ",
    "CKB",
    "YNQ",
    "FLE",
    "VAS",
    "DSC"
  )

  # Assert that str is a string vector
  if (typeof(str) != "character") {
    stop("validate_R_var_name: str not a character vector.")
  }

  # Assert that type is R or REDCap
  if (!type %in% c("R", "REDCap")) {
    stop("validate_R_var_name: type must be either R or REDCap.")
  }

  ## Split strings into name parts

  # Get number of name parts
  maxSplits <- max(stringr::str_count(str, "\\.") + 1, na.rm = T)
  if (maxSplits < 3) {
    return(rep(FALSE, length(str)))
  }

  # Split and coerce to data frame
  strSplit <- stringr::str_split_fixed(str, "\\.", maxSplits)
  strSplit <- as.data.frame(strSplit)

  ## Check 1st part: 3 letters, all capitals
  part1 <- strSplit[, 1]
  out1 <- stringr::str_detect(part1, "^[:upper:]{3}$")

  ## Check 2nd part: Must be in type library
  part2 <- strSplit[, 2]
  if (type == "R") {
    out2 <- part2 %in% rTypes
  } else if (type == "REDCap") {
    out2 <- part2 %in% rcTypes
  }

  ## Check last part: First word is title case and others are lowercase

  # Transpose strSplit so it's a list for each variable (split into parts)
  strSplitList <- purrr::pmap(strSplit, list)

  # Get max index of non "" element for each variable
  lastPartIdx <- suppressWarnings(purrr::map_dbl(
    strSplitList,
    ~ max(which(purrr::map_lgl(., ~ . != '')))
  ))

  # Use index to return last part, else return blank string
  lastPart <- purrr::map2_chr(
    lastPartIdx,
    1:length(lastPartIdx),
    ~ ifelse(.x > 2, strSplit[.y, .x], '')
  )

  # Break last part by _ separator
  lastPart <- stringr::str_split(lastPart, '_')

  # Does first word begin with capital and then is all lower or all upper?
  lastPartFirstCheck <- purrr::map_lgl(
    lastPart,
    ~ stringr::str_detect(
      .[1],
      '^[:upper:](([[:upper:]\\d]+$)|([[:lower:]\\d]+$))'
    )
  )

  # Remove first word
  lastPartOthers <- purrr::map(
    lastPart,
    ~ .[-1]
  )

  # Check all other words
  lastPartOtherCheck <- purrr::map_lgl(
    lastPartOthers,
    ~ all( stringr::str_detect(
      ., '(^[[:upper:]\\d]+$)|(^[[:lower:]\\d]+$)') | . == '')
  )
  outLast <- lastPartFirstCheck & lastPartOtherCheck

  ## Check mid parts: alphanumeric (with first one capitalized)

  # Extract list of midParts
  midParts <- purrr::map2(
    strSplitList,
    lastPartIdx,
    ~ ifelse(.y <= 3, '', .x[3:.y])
  )

  # Check that everything passes
  outMid <- purrr::map_lgl(
    midParts,
    ~ all( stringr::str_detect(
      ., '^[:upper:](([[:upper:]\\d]+$)|([[:lower:]\\d]+$))'))
  ) | purrr::map_lgl(midParts, ~ . == '')

  # Combine all checks and return
  return(out1 & out2 & outMid & outLast)
}

##### 2.2) rename_redcap_vars #####
#' Rename a REDCap Data Frame to Follow CAM Naming Standards
#'
#' Rename a data frame downloaded from REDCap using the field annotation
#' located in the metadata data frame downloaded with the data. See
#' [valiate_var_name()] for more information on the naming standard.
#' Warns the user for any field that does not contain a new name that
#' conforms to the standard in the field annotation.
#'
#' @param rcDtf A data frame containing the data downloaded from REDCap.
#' @param metaDtf A data frame containing the meta data (data dictionary)
#' downloaded from REDCap.
#'
#' @author William Schmitt
#'
#' @return A data frame where all columns have been renamed to conform to
#' the CAM standard for R names.
#'
#' @export

rename_redcap_vars <- function(rcDtf, metaDtf) {

  ## Extract new names
  metaDtf <- dplyr::mutate(
    metaDtf,
    newName = stringr::str_extract(
      field_annotation,
      "(?<=VARNAME=)[[:alnum:]\\._]+(?=(\\r\\n)?)"
    )
  )

  # Validate new names
  metaDtf <- dplyr::mutate(
    metaDtf,
    validName = validate_var_name(newName, "REDCap")
  )

  # Warn about bad names
  badNames <- dplyr::filter(metaDtf, !validName)
  purrr::walk(
    badNames$field_name,
    ~ message(paste0("WARNING: Bad name for ", ., "."))
  )

  # Build data frame for old <> new mapping
  nameMap <- data.frame(
    current = names(rcDtf)
  )

  # Get new names ready to add
  newNames <- dplyr::transmute(
    metaDtf,
    field_name = field_name,
    field_type = field_type,
    field_choices = dplyr::if_else(
      field_type == "checkbox",
      select_choices_or_calculations,
      ''
    ),
    newName = dplyr::if_else(
      validName,
      newName,
      field_name
    )
  )

  # Turn choices for checkboxes into nested dtf
  newNames <- dplyr::mutate(
    newNames,
    field_choices = dplyr::if_else(
      field_type == "checkbox",
      stringr::str_extract_all(field_choices, '[:alnum:]*(?=,\\s)'),
      list("")
    )
  )

  # unnest to get one row per check option
  newNames <- tidyr::unnest(newNames, field_choices)

  # Rename newName to work for checkboxes
  newNames <- dplyr::mutate(
    newNames,
    dplyr::across(
      c(field_name, newName),
      ~ dplyr::if_else(
        field_type == "checkbox",
        paste0(., "___", field_choices),
        .
      )
    )
  )

  # Join with current names
  nameMap <- dplyr::left_join(
    nameMap,
    dplyr::select(newNames, newName, field_name),
    by = c("current" = "field_name")
  )

  # TEMPORARY: Fill in blanks
  nameMap <- dplyr::mutate(
    nameMap,
    newName = dplyr::if_else(
      is.na(newName),
      current,
      newName
    )
  )

  # Copy over new names and return
  dtf <- rcDtf
  names(dtf) <- nameMap$newName
  return(dtf)
}

##### 2.3) camr_ckdict #####
#' Verify REDCap data dictionary.
#'
#' @param dict REDCap metadata table as downloaded by [redcap_read()].
#'
#' @author Michael Pascale
#'
#' @export

camr_ckdict <- function (
    dict ) {

  dict[is.na(dict)] <- ''
  field_prefixes_list <- list()
  var_instrument_list <- list()
  var_name_list <- c()
  issue_list <- data.frame(
    severity = integer(),
    issue = character(),
    field = character(),
    variable = character()
  )

  add_issue <- function(
    issue_list,
    severity_level,
    field_name,
    field_type,
    description,
    variable = "" ) {

    new_issue <- list(
      severity = severity_level,
      field = field_name,
      type = field_type,
      issue = paste(
        description, collapse = ""
      ),
      variable = variable
    )

    out <- dplyr::bind_rows(
      issue_list,
      new_issue
    )

    return( out )
  }

  for (i in 1:nrow(dict)) {

    row <- dict[i,]
    field_type <- row$field_type
    field_name <- row$field_name
    field_instrument <- row$form_name
    field_choices <- row$select_choices_or_calculations
    field_annotation <- row$field_annotation

    #### 2.3.1) Validate Field Name ####

    if (stringr::str_length(field_name) > 26) {

      issue_list <- add_issue(
        issue_list,
        3, field_name, field_type,
        c( 'Field name is greater than 26 characters long.' )
      )

    }

    field_prefix <- stringr::str_match(field_name, '^[:alnum:]+')
    field_suffix <- stringr::str_match(field_name, '_[:alnum:]+$')

    if (is.na(field_prefix) || is.na(field_suffix)) {

      issue_list <- add_issue(
        issue_list,
        2, field_name, field_type,
        c( 'Field name is not of standard format.' )
      )

    }

    #### 2.3.2) Validate VARNAME ####

    if (field_type != 'descriptive') {
      var_name <-
        stringr::str_match(field_annotation, '(?<=VARNAME=)[\\w\\.]+')

      if (is.na(var_name)) {

        issue_list <- add_issue(
          issue_list,
          2, field_name, field_type,
          c( 'Field has no VARNAME defined in its field annotations.' )
        )

      }

      var_decomposed <-
        stringr::str_match(
          var_name, '(\\w+?)\\.(\\w+?)\\.(?:(\\w+?)\\.)?(.*)'
        )
      var_group <- var_decomposed[2]
      var_type <- var_decomposed[3]
      var_instrument <- var_decomposed[4]
      var_tail <- var_decomposed[5]

      #### 2.3.3) Check VARNAME Format ####

      if(any(is.na(c(var_group, var_type, var_tail)))) {

        issue_list <- add_issue(
          issue_list,
          2, field_name, field_type,
          c( 'Field has an impropertly formatted ',
             'VARNAME in its field annotations.'),
          var_name
        )

        next
      }

      if(!is.element(var_group, c('IDS', 'SBJ', 'INV', 'QCC'))) {

        issue_list <- add_issue(
          issue_list,
          2, field_name, field_type,
          c( 'Field has an invalid group tag ',
             'in its VARNAME annotation.'),
          var_name
        )

      }

      content <- c('DAT', 'INT', 'DBL', 'CHR', 'CLC',
                   'MCQ', 'CKB', 'YNQ', 'FLE', 'VAS', 'DSC')

      if(!is.element(var_type, content)) {

        issue_list <- add_issue(
          issue_list,
          2, field_name, field_type,
          c( 'Field has an invalid type tag in ',
             'its VARNAME annotation.'),
          var_name
        )

      }

      if (!is.na(var_instrument)) {
        if (is.null(unlist(field_prefixes_list[field_prefix]))) {
          field_prefixes_list[field_prefix] = var_instrument
        } else {
          if (field_prefixes_list[field_prefix] != var_instrument) {

            issue_list <- add_issue(
              issue_list,
              2, field_name, field_type,
              c( 'Field has an instrument tag in its VARNAME ',
                 'annotation which is inconsistent with other ',
                 'field of the same prefix.'),
              var_name
            )

          }
        }

        if (is.null(unlist(var_instrument_list[var_instrument]))) {
          var_instrument_list[var_instrument] = field_prefix
        } else {
          if (var_instrument_list[var_instrument] != field_prefix){

            issue_list <- add_issue(
              issue_list,
              2, field_name, field_type,
              c( 'Instrument tag is utilized with multiple ',
                 'different field prefixes.'),
              var_name
            )

          }
        }
      }

      var_name_list <- c(var_name_list, var_name)

      #### 2.3.4) Check Field Type Against VARNAME ####

      if (field_type == 'text') {
        validation <- row$text_validation_type_or_show_slider_number

        if ( !is.na(validation) &&
             any(
               (validation == '' && var_type != 'CHR'),
               (validation == 'integer' && var_type != 'INT'),
               (validation == 'number' &&
                !is.element(var_type, c('INT', 'DBL'))),
               (stringr::str_detect(validation, 'number\\ddp') &&
                var_type != 'DBL'),
               (stringr::str_detect(validation, '^(date|time)') &&
                var_type != 'DAT'),
               (is.element(validation, c('alpha_only', 'email',
                                         'phone', 'zipcode', 'ssn',
                                         'mrn_10d')) && var_type != 'CHR')
             )
        ) {

          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond ',
               'with type tag in VARNAME annotation.'),
            var_name
          )

        }


      } else if (field_type == 'radio') {

        pairs <- stringr::str_split(field_choices, '\\|')
        pairs <- sapply(pairs, stringr::str_trim)
        pairs <- sapply(pairs, stringr::str_match, '^\\d+,\\s*\\d+$')
        integer_choices <- all(!is.na(pairs))

        if (var_type == 'INT') {

          if (!integer_choices)
            issue_list <- add_issue(
              issue_list,
              2, field_name, field_type,
              c( 'Field lacks integer choices despite ',
                 'type tag in its VARNAME annotation.'),
              var_name
            )

        } else if (var_type != 'MCQ') {
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond ',
               'with type tag in VARNAME annotation. ',
               'Recommend MCQ.'),
            var_name
          )
        } else if (integer_choices) {
          issue_list <- add_issue(
            issue_list,
            3, field_name, field_type,
            c( 'Field has only integer choices despite ',
               'type tag in its VARNAME annotation.'),
            var_name
          )
        }

      } else if (field_type == 'checkbox') {

        if (var_type != 'CKB')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with ',
               'type tag in VARNAME annotation. Recommend CKB.'),
            var_name
          )

      } else if (field_type == 'yesno') {

        if (var_type != 'YNQ')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with type tag ',
               'in VARNAME annotation. Recommend YNQ.'),
            var_name
          )

      } else if (field_type == 'dropdown') {

        if (var_type != 'MCQ')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with type tag',
               ' in VARNAME annotation. Recommend MCQ.'),
            var_name
          )

      } else if (field_type == 'calc') {

        if (var_type != 'CLC')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with type tag ',
               'in VARNAME annotation. Recommend CLC.'),
            var_name
          )

      } else if (field_type == 'file') {

        if (var_type != 'FLE')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with type tag ',
               'in VARNAME annotation. Recommend FLE.'),
            var_name
          )

      } else if (field_type == 'notes') {

        if (var_type != 'CHR')
          issue_list <- add_issue(
            issue_list,
            2, field_name, field_type,
            c( 'Field validation does not correspond with type tag ',
               'in VARNAME annotation. Recommend CHR.'),
            var_name
          )

      } else {
        issue_list <- add_issue(
          issue_list,
          1, field_name, field_type,
          c( 'Field validation invalid.'),
          var_name
        )
      }

      #### 2.3.5) Check Quality Control Fields ####

      if (!is.na(field_suffix)) {
        if (field_suffix == 'compby') {

          if (var_group != 'QCC' || var_type != 'CHR'
              || var_tail != 'compby')
            issue_list <- add_issue(
              issue_list,
              1, field_name, field_type,
              c( 'Field is for quality control but VARNAME annotation ',
                 'is not of form QCC.CHR.XXXX.compby.'),
              var_name
            )

        } else if (field_suffix == 'ptinitials') {

          if (var_group != 'QCC' || var_type != 'CHR' ||
              var_tail != 'ptinitials')
            issue_list <- add_issue(
              issue_list,
              1, field_name, field_type,
              c( 'Field is for quality control but VARNAME annotation ',
                 'is not of form QCC.CHR.XXXX.ptinitials.'),
              var_name
            )

        } else if (field_suffix == 'date') {

          if (var_group != 'QCC' || var_type != 'DAT' || var_tail != 'date')
            issue_list <- add_issue(
              issue_list,
              1, field_name, field_type,
              c( 'Field is for quality control but VARNAME annotation is ',
                 'not of form QCC.CHR.XXXX.date.'),
              var_name
            )

        } else if (field_suffix == 'ptinitials_date') {

          if (var_group != 'QCC' || var_type != 'DAT' ||
              var_tail != 'ptinitials_date')
            issue_list <- add_issue(
              issue_list,
              1, field_name, field_type,
              c( 'Field is for quality control but VARNAME annotation is ',
                 'not of form QCC.CHR.XXXX.ptinitials_date.'),
              var_name
            )

        }
      }

      #### 2.3.6) Construct New VARNAME ####

      new_var_name <- stringr::str_c(
        na.omit(c(var_group, var_type,
                  var_instrument, var_tail)), collapse='.')
      new_annotation <- stringr::str_replace(
        field_annotation, '(?<=VARNAME=)[\\w\\.]+', new_var_name)
    }
  }

  for (dup in  dict$field_name[which(duplicated(dict$field_name))])
    issue_list <- add_issue(
      issue_list,
      1, dup, "",
      c( 'Field name not unique.')
    )

  for (dup in var_name_list[which(duplicated(var_name_list))])
    issue_list <- add_issue(
      issue_list,
      1, "", "",
      c( 'VARNAME not unique.'),
      dup
    )

  calculation_references <- unlist(stringr::str_match_all(
    dict$select_choices_or_calculations, '(?<=\\[)\\w+(?=\\])'))
  for (ref in calculation_references[
    which(!(calculation_references %in% dict$field_name))
  ])
    issue_list <- add_issue(
      issue_list,
      1, ref, "",
      c( 'Field does not exist but is referenced in a calculation.')
    )

  branching_references <- unlist(stringr::str_match_all(
    dict$branching_logic, '(?<=\\[)\\w+(?=\\])'))
  for (ref in branching_references[
    which(!(branching_references %in% dict$field_name))])
    issue_list <- add_issue(
      issue_list,
      1, ref, "",
      c( 'Field does not exist but is referenced in branching logic.')
    )

  return( issue_list )
}

##### 2.4) camr_check_name_conventions #####
#' Check Naming Conventions
#'
#' Checks naming conventions of a processed dataframe,
#' following standards set on the CAM Wiki.
#'
#' @param df_any A dataframe.
#'
#' @author Michael Pascale
#'
#' @returns Prints invalid column names to the console.
#'
#' @export
#' @md

camr_check_name_conventions <- function(df_any) {

  for (chr_name in names(df_any)) {
    m_match <- str_match(chr_name, '(?<group>\\w{3})\\.(?<type>\\w{3})\\.(?<topic>\\w+)\\.?(?<tail>.+)?')

    if (any(is.na(m_match[1, c('group', 'type', 'topic')]))) {
      warning('Variable name is of invalid format: ', chr_name)
      next
    }

    if (!(m_match[[1, 'group']] %in% c(
      'IDX', 'IDS', 'SBJ', 'SSS',
      'BIO', 'INV', 'QCC', 'VST'
    ))) {
      warning('Variable name contains invalid group: ', chr_name)
      next
    }

    if (!switch(
      m_match[[1, 'type']],
      INT = is.integer,
      DBL = is.numeric,
      CHR = is.character,
      FCT = is.factor,
      DAT = is.Date,
      DTM = is.POSIXt,
      DUR = is.duration,
      TIM = is.period,
      LGL = is.logical,
      {
        warning('Variable name contains invalid type: ', chr_name)
        next
      }
    )(df_any[[chr_name]])) {
      warning(
        'Variable name does not reflect type: ', chr_name,
        ' (', class(df_any[[chr_name]]), ': ', mode(df_any[[chr_name]]), ')'
      )
    }
  }

}

#### 3) Functions to Process REDCap Data ####
#####  3.1) camr_pivot_redcap_eav #####
#' Extract tables from REDCap EAV downloads
#'
#' `camr_download_redcap()` downloads data in entity-attribute-value (EAV)
#' format (one row per datapoint).
#'
#' @param df_redcap_eav A dataframe with REDCap data in EAV format.
#' @param vchr_fields A set of REDCap field names to extract.
#' @param lgl_collapse_checkboxes Whether to collapse checkbox fields with '|'.
#' @param chr_event_pattern Optional. A pattern to validate event names against.
#' @param chr_field_column Optional. Defaults to the API provided "field_name".
#' @param chr_value_column Optional. Defaults to the API provided "value".
#' @param chr_event_column Optional. Defaults to the API provided "redcap_event_name".
#'
#' @return A dataframe, with the columns of df_redcap_eav and those specified by
#' vchr_fields.
#'
#' @keywords internal
#' @export
camr_pivot_redcap_eav <- function (
    df_redcap_eav,
    vchr_fields,
    lgl_collapse_checkboxes=FALSE,
    chr_event_pattern=NULL,
    chr_field_column='field_name',
    chr_value_column='value',
    chr_event_column='redcap_event_name'
) {
  assert_data_frame(df_redcap_eav)
  assert_character(vchr_fields)
  assert_logical(lgl_collapse_checkboxes, len=1, null.ok=FALSE)
  assert_string(chr_event_pattern, min.chars=1, null.ok=TRUE)
  assert_string(chr_field_column)
  assert_string(chr_value_column)
  assert_string(chr_event_column)
  assert_names(colnames(df_redcap_eav), must.include=(c(chr_field_column, chr_value_column, chr_event_column)))


  df_redcap_eav |>
    filter(
      !!sym(chr_field_column) %in% vchr_fields,
    ) |>
    camr_assert(
      is.null(chr_event_pattern) || all(str_detect(!!sym(chr_event_column), chr_event_pattern)),
      paste0('Events in column ', chr_event_column, ' of df_redcap_eav must match "', chr_event_pattern, '".')
    ) |>
    # TODO: Check repeating forms.
    pivot_wider(
      names_from=chr_field_column,
      values_from=chr_value_column,
      values_fn=(if (lgl_collapse_checkboxes) \(vchr) paste(vchr, collapse='|') else NULL)
    )
}



#' Combine REDCap Checkbox Fields into one Column
#'
#' Use this function within `dplyr::mutate`.
#'
#' @param cols          Tidyselect expression.
#' @param fn_rename     Function with which to rename collapsed columns. Default removes the '___' prefix.
#' @param fn_predicate  Function which transforms each checkbox field value to TRUE/FALSE. Default is x == 1.
#' @param collapse      Character to separate collapsed values. Defaut is '|'.
#' @param as.factor     Create a factor where the levels are all the possible combinations of values. Default is FALSE.
#'
#' @return Character or factor vector of collapsed checkbox values.
#' @export
camr_combine_redcap_checkboxes <-
  function (
    cols,
    fn_rename=\(x) str_remove(x, '^.*___'),
    fn_predicate=\(x) {
      if (!all(x %in% 0:1))
        stop('Specified columns are not all 0 or 1.')
      x == '1'
    },
    collapse='|',
    as.factor=FALSE
  ) {
    # Use this function within mutate(). cols is a tidyselect specification.
    df_subset <- across(cols)

    # Function to be applied to each row. Get the names of columns that have a 1.
    acc <- \(x) names(df_subset)[which(fn_predicate(x))]

    # Rename the values if the user does not want to keep the original column names.
    if (!is.null(fn_rename))
      acc <- compose(fn_rename, acc)

    # Combine the values into a single string.
    acc <- compose(\(x) paste(x, collapse=collapse), acc)

    # Apply the function and return, unless the user wants a factor.
    if (isFALSE(as.factor))
      return(apply(df_subset, 1, acc))

    # The user wants a factor, so define a function to extract the factor levels.
    combinedlevels <- function(labels) {
      n <- length(labels)
      l <- (2^n) -1
      if (l >= 1023)
        stop('Combination factor would be too large. Set as.factor=FALSE.')
      m <- apply(matrix(1:l, l), 1, \(x) as.integer(intToBits(x)))[1:n,]
      f <- apply(m, 2, \(x) paste(labels[!!x], collapse=collapse))
      f[order(colSums(m))]
    }

    # Extract all the possible combinations to produce levels for the factor.
    fl <- combinedlevels(fn_rename(names(df_subset)))

    # Apply the function and convert to factor.
    factor(apply(df_subset, 1, acc), fl)
  }
rettopnivek/camrprojects documentation built on March 26, 2024, 9:17 a.m.