redcap: R Utilities For REDCap

Documented in parse_logging

#' Convert a REDCap project log file to a tidy data frame
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' REDCap project log files have a complicated format where multiple types of
#' information are contained in a single column. For example the column `action`
#' contains the relevant record ID, the type of action that was taken (e.g.
#' Updated / Created / Deleted), and sometimes further details about the source
#' of the action (e.g. API / Import / Automatic field calculation). The
#' `details` column contains a string of variable/value combinations describing
#' any changes (e.g. "var1 = '0', var2 = '1', var3(1) = checked"), and may also
#' contain the relevant repeat instance number (e.g. "\[instance = 3\]").
#'
#' The parse_logging() function tidies up the log file by splitting the record
#' ID, action, action type, and repeat instance into separate columns.
#' Optionally, the string of variable/value changes in the `details` column may
#' be further transformed to long format to yield a single row for each
#' combination of variable and value.
#'
#' Note that this function only deals with log entries of type Created / Deleted
#' / Updated Record. All other log entries (e.g. Data Export, Manage/Design,
#' Edited Role, User Assigned to Role) will be filtered out.
#'
#' @param x REDCap project log file (data frame), e.g. returned by
#'   [`project_logging`]
#' @param format_long Logical indicating whether to transform the log file to
#'   long format, with one row per variable-value combination. Defaults to
#'   `FALSE`.
#' @param dict A REDCap metadata dictionary (data frame), e.g. returned by
#'   [`meta_dictionary`]. Only needed when argument `format_long` is `TRUE`.
#'
#' @return
#' A [`tibble`][tibble::tbl_df]-style data frame with 8 columns:
#'
#' \describe{
#'   \item{`rowid`}{Row number based on original log file. There may be gaps for
#'   rows that have been excluded from the output because they reflected an
#'   action type other than create / delete / update.}
#'   \item{`timestamp`}{unchanged from original log file}
#'   \item{`username`}{unchanged from original log file}
#'   \item{`action`}{One of "Created Record", "Deleted Record", or "Updated
#'   Record", extracted from original `details` column}
#'   \item{`action_type`}{Parenthetical details, if any, extracted from original
#'   `action` column (e.g. "(API)", "(import)", "(Auto calculation)")}
#'   \item{`record_id`}{Record ID, extracted from original `action` column}
#'   \item{`redcap_repeat_instance`}{Instance number (integer), extracted from
#'   original `details` column. Note that 1st instances are not explicitly
#'   specified in the log file and will appear as NA}
#'   \item{`details`}{String of variable value pairs (e.g. "var1 = '0', var2 =
#'   '1', var3(1) = checked"), reflecting the data that was modified}
#' }
#'
#' If argument `format_long` is `TRUE` the `details` column will be replaced
#' with three other columns:
#'
#' \describe{
#'   \item{`form_name`}{Form name, joined from metadata dictionary based on
#'   variable `field_name`. Will be `<NA>` in cases where field name has been
#'   changed or removed and therefore doesn't appear in the dictionary, or for
#'   fields not associated with a specific form like
#'   `redcap_data_access_group`.}
#'   \item{`field_name`}{Field name, extracted from original `details` column}
#'   \item{`value`}{Value, extracted from original `details` column}
#' }
#'
#' @examples
#' \dontrun{
#' conn <- rconn(
#'   url = "https://redcap.msf.fr/api/",
#'   token = Sys.getenv("MY_REDCAP_TOKEN")
#' )
#'
#' parse_logging(project_logging(conn))
#' }
#'
#' @importFrom dplyr `%>%` select filter mutate relocate if_else left_join n any_of
#' @importFrom stringr str_extract
#' @importFrom purrr map
#' @importFrom tidyr unnest
#' @importFrom rlang .data
#' @export parse_logging
parse_logging <- function(x, format_long = FALSE, dict = NULL) {


  if (format_long & is.null(dict)) {
    stop("If format_long = TRUE, argument dict must be provided", call. = FALSE)
  }

  # ## check that all action types are covered (should be nrow = 0)
  # x %>%
  #   filter(!grepl("^(Created?|Deleted?|Updated?) Record", action, ignore.case = TRUE)) %>%
  #   filter(!grepl("^(Created?|Deleted?|Updated?) (Role|User)", action, ignore.case = TRUE)) %>%
  #   filter(!grepl("^Data Export", action, ignore.case = TRUE)) %>%
  #   filter(!grepl("^Manage/Design", action, ignore.case = TRUE)) %>%
  #   filter(!grepl("^(Edited|Renamed) Role", action, ignore.case = TRUE)) %>%
  #   filter(!grepl("^User (Assigned to|Removed from) Role", action, ignore.case = TRUE)) %>%
  #   dplyr::count(action)


  ## filter log file to entries related to records (create/update/delete)
  log_file_records <- x %>%
    mutate(rowid = seq_len(n()), .before = 1) %>%
    filter(grepl("^(Created?|Deleted?|Updated?) [Rr]ecord", .data$action, ignore.case = TRUE))


  ## parse action (create/delete/update), action type (API/import/NA), record ID, and repeat instance
  log_parse <- log_file_records %>%
    filter(!grepl("^\\[instance = \\d+\\]$", .data$details)) %>% # rm if details contains only e.g. "[instance = d+]"
    filter(!is.na(.data$details)) %>%                            # rm if empty details
    mutate(
      action_raw = .data$action,
      action = stringr::str_extract(.data$action_raw, "(Created?|Updated?|Deleted?) [Rr]ecord"),
      action_type = stringr::str_extract(.data$action_raw, "\\(.*\\)"),
      record_id = gsub("(Created?|Updated?|Deleted?) [Rr]ecord (\\(.*\\) )*", "", .data$action_raw),
      # note approach below is much faster than single regex statement with PERL look-behind
      redcap_repeat_instance = stringr::str_extract(.data$details, "\\[instance = \\d+\\]"),
      redcap_repeat_instance = as.integer(stringr::str_extract(.data$redcap_repeat_instance, "[[:digit:]]+")),
      details = gsub("\\[instance = \\d+\\]\\, ", "", .data$details),
      dag = grepl("Assign record to Data Access Group", .data$details),
      details = if_else(.data$dag, stringr::str_extract(.data$details, "redcap_data_access_group \\= \\'.*\\'"), .data$details)
    ) %>%
    select(-any_of(c("action_raw", "dag", "record"))) %>%
    relocate(c("action", "action_type", "record_id", "redcap_repeat_instance"), .after = "username")


  if (format_long) {

    ## prep field_name to allowing joining form_name to log
    dict_prep <- dict %>%
      mutate(field_name = gsub("___(\\d+)$", "(\\1)", .data$field_name)) %>%
      select("field_name", "form_name")

    ## convert to long-format
    log_parse <- log_parse %>%
      mutate(field_and_value = purrr::map(.data$details, parse_value_var)) %>%
      tidyr::unnest("field_and_value") %>%
      mutate(
        field_name = stringr::str_extract(.data$field_and_value, "^.*(?= \\= )"),
        value = stringr::str_extract(.data$field_and_value, "(?<= \\= ).*$"),
        value = gsub("^\\'|\\'$", "", .data$value),
        value = dplyr::if_else(.data$value == "", NA_character_, .data$value)
      ) %>%
      select(-any_of(c("details", "field_and_value"))) %>%
      left_join(dict_prep, by = "field_name") %>%
      relocate("form_name", .after = "record_id")
  }


  ## return
  log_parse
}



#' function to extract var/value combinations from strings in log file
#' strings have form: "vars1 = '...', var2 = '...', var3 = '...'"
#' e.g. "v2_01_send = 'OK', v2_01_comments = 'La participante n'a pas', v2_01_r_rempli = '1110'"
#' @noRd
#' @importFrom stringr str_locate_all
#' @importFrom stringr str_sub
parse_value_var <- function (x) {
  # vars normally should follow [a-z][a-z0-9_]+, but "__GROUPID__ = ''" also appears in some logs
  x_vars <- stringr::str_locate_all(x, "[a-zA-Z0-9_]+(\\(\\d+\\))? \\= (\\'|checked|unchecked)")[[1]]

  if (nrow(x_vars) <= 1) {
    out <- x
  } else {
    vars_start <- x_vars[,1]
    # var/val token ends 3 chars before next var
    vars_end   <- c(x_vars[,1][-1] - 3, nchar(x))
    out <- stringr::str_sub(x, vars_start, vars_end)
  }

  out
}