R/check_dataset.R

Defines functions truncate_messages min_req_dwc_message check_min_req_dwc summary_message format_messages_from_checks add_table_headers check_all check_dataset

Documented in check_dataset

#' Check a dataset for Darwin Core conformance
#'
#' @description
#' Run a test suite of checks to test whether a `data.frame` or `tibble`
#' conforms to Darwin Core Standard.
#'
#' While most users will only want to call `suggest_workflow()`,
#' the underlying check functions are exported for detailed work, or for
#' debugging. This function is useful for users experienced with
#' Darwin Core Standard or for final dataset checks.
#' @param .df A tibble against which checks should be run
#' @returns Invisibly returns the input data frame, but primarily called for the
#' side-effect of running check functions on that input.
#' @details
#' `check_dataset()` is modelled after [devtools::test()]. It runs a
#' series of checks, then supplies a summary of passed/failed checks and
#' error messages.
#'
#' Checks run by `check_dataset()` are the same that would
#' be run automatically by various `set_` functions in a piped workflow. This
#' function allows users with only minor expected updates to check their entire
#' dataset without the need for `set_` functions.
#'
#' @examples \donttest{
#' df <- tibble::tibble(
#'   scientificName = c("Crinia Signifera", "Crinia Signifera", "Litoria peronii"),
#'   latitude = c(-35.27, -35.24, -35.83),
#'   longitude = c(149.33, 149.34, 149.34),
#'   eventDate = c("2010-10-14", "2010-10-14", "2010-10-14"),
#'   status = c("present", "present", "present")
#'   )
#'
#' # Run a test suite of checks for Darwin Core Standard conformance
#' # Checks are only run on columns with names that match Darwin Core terms
#' df |>
#'   check_dataset()
#' }
#'
#' @importFrom rlang inform
#' @importFrom purrr map
#' @importFrom cli cli_bullets
#' @importFrom cli col_blue
#' @importFrom cli col_green
#' @importFrom cli col_red
#' @importFrom cli col_yellow
#' @importFrom cli style_italic
#' @importFrom cli cli_h3
#' @importFrom cli ansi_align
#' @importFrom cli ansi_nchar
#' @importFrom cli cat_line
#' @importFrom purrr map
#' @importFrom dplyr bind_rows
#' @importFrom dplyr slice_head
#' @importFrom dplyr group_split
#' @importFrom tidyr unnest
#' @order 1
#' @export
check_dataset <- function(.df){

  # dwc_terms
  fields <- colnames(.df)
  available_checks <- fn_to_term_table() |>
    bind_rows() |>
    select("term") |>
    pull()
  checkable_fields <- fields[fields %in% available_checks]

  # If measurementOrFact is a column, add nested column names
  if(any(fields %in% "measurementOrFact")) {
    checkable_fields <- c(checkable_fields,
                          "measurementValue", "measurementID", "measurementUnit", "measurementType")
  }

  # find fields in .df with available checks
  check_functions_names <- c(glue("check_{checkable_fields}"))
  check_functions <- as.list(check_functions_names)
  names(check_functions) <- checkable_fields

  ## Run Checks

  # inform user
  cli::cli_alert_info("Testing data")
  for(i in 1:100) {
    wait(0.001)
  }

  # build table
  add_table_headers(checkable_fields)
  invisible() # prevent df results from printing with headers

  # browser()
  # check all checkable fields, save fields & error messages
  check_results <- check_functions_names |>
    map(~ check_all(.x, .df, checkable_fields)) |>
    bind_rows()

  # print result summary
  summary_message(check_results, checkable_fields)
  cat_line()

  ## Darwin Core compliance
  # inform user
  cli::cli_alert_info("Checking Darwin Core compliance")
  for(i in 1:100) {
    wait(0.001)
  }

  dwc_compliant <- check_min_req_dwc(checkable_fields)
  min_req_dwc_message(dwc_compliant)

  ## Error Messages

  # truncate
  gt_msg_max <- ifelse(length(check_results$messages) > 5, TRUE, FALSE)
  check_results <- truncate_messages(check_results, gt_msg_max)

  if(length(check_results$messages) > 0) {

    dwc_spinny_message(paste0("Collecting error messages..."))

  # split messages by function for message formatting
    results_split <- check_results |>
      tidyr::unnest(cols = "messages") |>
      mutate(
        term = factor("term", levels = unique("term")) # maintain original term order
        ) |>
      group_split("term")

  # print preserved errors in a nice format
    results_split |>
      map(~ format_messages_from_checks(.x))

  } else {
    if(isTRUE(dwc_compliant)) {
      # celebrate
      cat_line(paste0("\n", add_emoji(), " ", col_green("All column checks pass!"), "\n"))
    }
  }

  invisible(.df)
}


#' Check all fields that match Darwin Core terms in a data frame
#'
#' @description
#' Runs checks on all columns that match Darwin Core terms. `check_all()` does this by
#' detecting and matching matched Darwin Core terms to their associated `check_` function.
#'
#' `check_all()` runs in a similar way to `devtools::test()`, whereby it will run and
#' report the results of checks "live", then return a summary table and
#' any error messages returned.
#'
#' @importFrom cli cli_progress_step
#' @importFrom cli cli_progress_update
#' @importFrom cli ansi_align
#' @importFrom cli ansi_nchar
#' @importFrom rlang cnd_muffle
#' @importFrom rlang exec
#' @importFrom stringr str_remove_all
#' @importFrom tibble tibble
#' @noRd
#' @keywords Internal
check_all <- function(fn, .df, checkable_fields) {

  # message saving & counting setup
  m_counter <- 0
  msgs <- list()
  passing <- list()
  all_results <- list()

  # message format setup
  field_nchar <- max(ansi_nchar(checkable_fields))
  fn_name <- str_remove_all(fn, 'check_')
  progress_msg <- paste0(
    # ansi_align(" ", max(ansi_nchar(symbol$tick))), " ",
                         ansi_align(glue("| {m_counter}"), ansi_nchar("| E")), " ",
                         ansi_align(glue("{passing} |"), ansi_nchar("P |")), " ",
                         ansi_align(glue("{fn_name}"), field_nchar), " "
  )

  # run live checks
  tryCatch(withCallingHandlers(
    {
      progress <- cli_progress_step("{progress_msg}", spinner = TRUE) # prints message
      exec(fn, .df) # runs check function
    },
    message = function(m) {
      # update counter if a galax message is triggered
      if (inherits(m, "galax_message")) {
        m_counter <<- m_counter + 1
        msgs <<- append(msgs, m$message)
        passing <<- col_red(symbol$cross)
        progress_msg <<- paste0(
          # ansi_align(" ", max(ansi_nchar(symbol$tick))), " ",
                                ansi_align(glue("| {m_counter}"), ansi_nchar("| E")), " ",
                                ansi_align(glue("{passing} |"), ansi_nchar("P |")), " ",
                               ansi_align(glue("{fn_name}"), field_nchar), " "
        )
        cli_progress_update(id = progress)
        cnd_muffle(m)
      } else {
        passing <<- col_green(symbol$tick)
        progress_msg <<- paste0(
          # ansi_align(" ", max(ansi_nchar(symbol$tick))), " ",
                                ansi_align(glue("| {m_counter}"), ansi_nchar("| E")), " ",
                                ansi_align(glue("{passing} |"), ansi_nchar("P |")), " ",
                               ansi_align(glue("{fn_name}"), field_nchar), " "
        )
      }
    }),

    finally = {
      # capture all messages somewhere
      results <- tibble(
        term = fn_name,
        check_function = fn,
        messages = msgs
      )
      return(results)
    }
  )
}

#' Format table headers for `check_all()`
#'
#' @importFrom cli ansi_align
#' @importFrom cli ansi_nchar
#' @importFrom cli col_yellow
#' @importFrom cli col_green
#'
#' @noRd
#' @keywords Internal
add_table_headers <- function(row_values) {
  # browser()
  headers <- paste0(
    ansi_align(col_green(symbol$tick), max(ansi_nchar(symbol$tick))), " ",
    ansi_align(glue("| {col_yellow('E')}"), max(ansi_nchar("| E"))), " ",
    ansi_align(glue("{col_green('P')} |"), max(ansi_nchar(glue("{symbol$tick} |")))), " ",
    ansi_align("Column", max(ansi_nchar(row_values)))

  )
  cat_line(headers)
}

#' Format each saved message from `check_all()` nicely
#'
#' @importFrom cli cat_line
#' @importFrom cli cli_rule
#' @importFrom cli style_bold
#' @importFrom cli col_yellow
#'
#' @noRd
#' @keywords Internal
format_messages_from_checks <- function(df) {
  # retrieve term & message
  term <- df$term |> unique()
  m <- paste0(df$messages)

  # format & print
  cat_line()
  cli_rule("{col_yellow(style_bold('Error'))} in {term}")
  cat_line()
  cat_line(m)
  cat_line()
}

#' Build `check_all()` summary message
#'
#' @importFrom cli cat_line
#' @importFrom cli col_red
#' @importFrom cli col_green
#' @noRd
#' @keywords Internal
summary_message <- function(results, checkable_fields) {
  n_errors <- length(results$messages)
  n_passing_fields <- length(checkable_fields) - length(unique(results$term))

  # message
  cat_line()
  cli_div(theme = list(rule = list("line-type" = "double")))
  cli::cli_rule(left = "Results")
  cli_end()
  cat_line()
  cat_line(glue("[ {col_yellow('Errors')}: {n_errors} | {col_green('Pass')}: {n_passing_fields} ]"))
}


#' Check whether data meets minimum requirements
#'
#' @description
#' Simple check for whether data meets minimum requirements to be accepted as a
#' Darwin Core archive. The check is a simplifed version of the underlying check
#' in `suggest_workflow()`.
#' @noRd
#' @keywords Internal
check_min_req_dwc <- function(checkable_fields) {

  # message
  dwc_spinny_message(glue("Data meets minimum Darwin Core column requirements"))

  # check matching user columns with minimum required DwC terms
  req_terms_results <- check_required_terms(checkable_fields)
  is_dwc_compliant <- all(req_terms_results$result == "pass")
  return(is_dwc_compliant)
}

#' Build message about results of `check_min_req_dwc()`
#'
#' @importFrom cli cat_line
#' @importFrom cli col_red
#' @importFrom cli col_green
#' @importFrom cli cli_progress_step
#' @noRd
#' @keywords Internal
min_req_dwc_message <- function(is_dwc_compliant) {

  if(isTRUE(is_dwc_compliant)) {
    complies_text <- "Data meets minimum Darwin Core column requirements"
    cli::cli_status_clear()
    cat_line(glue("{col_green(symbol$tick)} {complies_text}"))
  } else {
    noncomplies_text <- "Data does not meet minimum Darwin Core column requirements"
    cli::cli_status_clear()
    cat_line(glue("{col_red(symbol$cross)} {noncomplies_text}"))
    cli_bullets(c(i = "Use `suggest_workflow()` to see more information."))
  }
  cat_line()

}


#' Truncate list of messages
#'
#' @description
#' Truncates list of messages if greater than 5.
#'
#' @importFrom cli cat_line
#' @importFrom cli cli_h3
#' @importFrom cli col_green
#' @importFrom dplyr slice_head
#' @noRd
#' @keywords Internal
truncate_messages <- function(check_results, gt_msg_max) {
  if(isTRUE(gt_msg_max)) {
    check_results <- check_results |>
      slice_head(n = 5)

    cli_h3(col_yellow(style_italic("Truncating to first 5 error messages")))
  }

  return(check_results)
}

#' Advanced `check_all()` with separate message, warning, error tracking
#' @noRd
#' @keywords Internal
# check_all_advanced <- function(fn, .df, checkable_fields) {
#
#   # message saving & counting setup
#   m_counter <- 0
#   w_counter <- 0
#   e_counter <- 0
#   msgs <- list()
#   wrns <- list()
#   errs <- list()
#   all_results <- list()
#
#   # message format setup
#   field_nchar <- max(ansi_nchar(checkable_fields))
#   fn_name <- stringr::str_remove_all(fn, 'check_')
#   progress_msg <- paste0(ansi_align(glue("{fn_name}"), field_nchar), " ",
#                          ansi_align(glue("{m_counter}"), ansi_nchar(2)), " ",
#                          ansi_align(glue("{w_counter}"), ansi_nchar(2)), " ",
#                          ansi_align(glue("{e_counter}"), ansi_nchar(2)), " "
#   )
#
#   # run check functions
#   tryCatch(withCallingHandlers(
#     {
#       progress <- cli::cli_progress_step("{progress_msg}", spinner = TRUE) # prints message
#       rlang::exec(fn, .df) # runs check function
#     },
#     message = function(m) {
#       if (inherits(m, "galax_message")) {
#         m_counter <<- m_counter + 1
#         msgs <<- append(msgs, m$message)
#         progress_msg <<- paste0(ansi_align(glue("{fn_name}"), field_nchar), " ",
#                                 ansi_align(glue("{m_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{w_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{e_counter}"), ansi_nchar(2)), " "
#         )
#         cli::cli_progress_update(id = progress)
#         rlang::cnd_muffle(m)
#       }
#     },
#     warning = function(w) {
#       if (inherits(w, "galax_warning")) {
#         w_counter <<- w_counter + 1
#         wrns <<- append(wrns, w$message)
#         progress_msg <<- paste0(ansi_align(glue("{fn_name}"), field_nchar), " ",
#                                 ansi_align(glue("{m_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{w_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{e_counter}"), ansi_nchar(2)), " "
#         )
#         cli::cli_progress_update(id = progress)
#         rlang::cnd_muffle(w)
#       }
#     }),
#     error = function(e) {
#       if (inherits(e, "galax_error")) {
#         e_counter <<- e_counter + 1
#         errs <<- append(errs, e)
#         progress_msg <<- paste0(ansi_align(glue("{fn_name}"), field_nchar), " ",
#                                 ansi_align(glue("{m_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{w_counter}"), ansi_nchar(2)), " ",
#                                 ansi_align(glue("{e_counter}"), ansi_nchar(2)), " "
#         )
#         cli::cli_progress_update(id = progress)
#         rlang::cnd_muffle(e)
#       }
#     },
#
#     finally = {
#       # capture all messages somewhere
#       tibble(
#         term = fn_name,
#         check_function = fn,
#         messages = msgs,
#         warnings = wrns,
#         errors = errs
#       )
#     }
#   )
# }

Try the corella package in your browser

Any scripts or data that you put into this service are public.

corella documentation built on April 4, 2025, 12:20 a.m.