R/suggest_workflow.R

Defines functions add_emoji check_required_terms required_terms build_req_terms_table fn_to_term_table full_workflow_message additional_functions_message suggest_functions_message minreq_terms_message matching_terms_message check_contains_terms check_dataframe dwc_spinny_message suggest_workflow

Documented in suggest_workflow

#' Suggest a workflow to make data comply with Darwin Core Standard
#'
#' @description
#' Checks whether a `data.frame` or `tibble` conforms to Darwin
#' Core Standard and suggests how to standardise a data frame that is not
#' standardised to minimum Darwin Core requirements. This is intended as
#' users' go-to function for figuring out how to get started standardising
#' their data.
#'
#' Output provides a summary to users about which column names
#' match valid Darwin Core terms, the minimum required
#' column names/terms (and which ones are missing), and a suggested workflow to
#' add any missing terms.
#' @param .df A `data.frame`/`tibble` against which checks should be run
#' @returns Invisibly returns the input `data.frame`/`tibble`, but primarily
#' called for the side-effect of running check functions on that input.
#' @examples
#' df <- tibble::tibble(
#'   scientificName = c("Callocephalon fimbriatum", "Eolophus roseicapilla"),
#'   latitude = c(-35.310, "-35.273"), # deliberate error for demonstration purposes
#'   longitude = c(149.125, 149.133),
#'   eventDate = c("14-01-2023", "15-01-2023"),
#'   status = c("present", "present")
#' )
#'
#' # Summarise whether your data conforms to Darwin Core Standard.
#' # See a suggested workflow to amend or add missing information.
#' df |>
#'   suggest_workflow()
#'
#' @order 1
#' @export
suggest_workflow <- function(.df){
  dwc_spinny_message("Checking Darwin Core terms")
  check_dataframe(.df)
  invisible(.df)
}


#' Theatrics
#' @importFrom cli make_spinner
#' @noRd
#' @keywords Internal
dwc_spinny_message <- function(message) {

  # define the spinner
  spinny <- make_spinner(
    which = "dots2",
    template = paste0("{spin}", glue(" {message}"))
  )

  # update the spinner 100 times
  for(i in 1:100) {
    spinny$spin()
    wait(.01)
  }

  # clear the spinner from the status bar
  spinny$finish()
}


#' Checks dataframe and column names
#'
#' @description
#' Checks whether user dataframe contains sf `geometry` and preserves this
#' information, then checks matching user columns to darwin core terms.
#' @noRd
#' @keywords Internal
check_dataframe <- function(.df,
                         level = c("inform", "warn", "abort")){
  level <- match.arg(level)
  is_sf <- inherits(.df, "sf")

  result <- tibble(dwc_terms = colnames(.df)) |>
    check_contains_terms(dwc_terms = dwc_terms,
                         is_sf = is_sf,
                         level = level)
  .df
}


#' Match Darwin Core terms to column names
#'
#' @description
#' This is the main workhorse function of `suggest_workflow()`.
#'
#' @param .df vector of values
#' @param dwc_terms vector of valid Darwin Core terms against which .df should be compared
#' @importFrom cli cli_div
#' @importFrom cli cli_h1
#' @importFrom cli cli_h2
#' @importFrom cli cli_h3
#' @importFrom cli cli_end
#' @importFrom dplyr pull
#' @importFrom dplyr filter
#' @importFrom dplyr distinct
#' @importFrom rlang is_empty
#' @noRd
#' @keywords Internal
check_contains_terms <- function(.df,
                                 dwc_terms,
                                 is_sf,
                                 level = "inform",
                                 call = caller_env()
){
  check_is_dataframe(.df)
  field_name <- colnames(.df)[[1]]
  user_column_names <- .df |>
    pull(field_name) |>
    unique() |>
    sort()
  name_lookup <- user_column_names %in% dwc_terms$term

  ## Matching column names to darwin core terms

  # matches
  matched_values <- user_column_names[name_lookup]
  unmatched_values <- user_column_names[!name_lookup]
  all_cols_match <- is_empty(unmatched_values)

  ## Minimum required terms

  # retrieve required term match results
  req_terms_results <- check_required_terms(user_column_names)

  ## Suggested workflow & Additional functions

  # Function matching for suggested workflow
  main_functions <- fn_to_term_table()$main
  other_functions <- fn_to_term_table()$optional

  suggested_functions <- main_functions |>
    filter(!.data$term %in% matched_values) |>
    distinct(.data$set_function) |>
    pull("set_function")

  optional_functions <- other_functions |>
    filter(.data$term %in% matched_values) |>
    distinct(.data$set_function) |>
    pull("set_function")

  # this wraps text (which might not be optimal for this table)
  # withr::with_options(
  #   list(cli.width = 80),
  #   full_workflow_message()
  # )

  full_workflow_message(matched_values,
                        unmatched_values,
                        all_cols_match,
                        req_terms_results,
                        suggested_functions,
                        is_sf,
                        optional_functions)

  .df
}


#' Build "Matching terms" message
#'
#' @importFrom cli ansi_collapse
#' @importFrom cli cat_line
#' @importFrom cli cli_text
#' @importFrom cli cli_bullets
#' @importFrom cli cli_par
#' @importFrom cli col_green
#'
#' @noRd
#' @keywords Internal
matching_terms_message <- function(matched_values,
                                   unmatched_values,
                                   all_cols_match) {

  # concatenate matched & unmatched fields
  matched_string <- ansi_collapse(glue("{matched_values}"), sep = ", ", last = ", ")
  unmatched_string <- ansi_collapse(glue("{unmatched_values}"), sep = ", ", last = ", ")

  results_summary <- paste0("Matched {length(matched_values)} of {sum(length(matched_values), length(unmatched_values))} column name{?s} to DwC terms:")

  # message
  cat_line()
  cat_line(cli_text(results_summary))
  cli_bullets(c("v" = "Matched: {.field {matched_string}}"))
  cli_bullets(c("x" = "Unmatched: {col_red({unmatched_string})}"))
  cli_par()
  if(isTRUE(all_cols_match)) {
    # celebrate
    cat_line(paste0("\n", add_emoji(), " ", col_green("All column names matched DwC terms!"), "\n"))
  }
}

#' build "Matching terms" message
#'
#' @importFrom cli cat_line
#' @importFrom cli col_green
#' @noRd
#' @keywords Internal
minreq_terms_message <- function(req_terms_results) {

  # create message components
  req_terms_table <- build_req_terms_table(req_terms_results)
  all_req_terms_found <- all(req_terms_results$result == "pass")

  # message
  cat_line()
  cat_line(req_terms_table)
  if(isTRUE(all_req_terms_found)) {
    # celebrate
    cat_line(paste0("\n", add_emoji(), " ", col_green("All minimum column requirements met!"), "\n"))
  }
}

#' build suggested workflow message
#'
#' @importFrom cli ansi_collapse
#' @importFrom cli cat_line
#' @importFrom cli cli_text
#' @importFrom cli cli_div
#' @importFrom cli cli_alert
#' @importFrom cli cli_end
#' @importFrom cli style_italic
#' @importFrom rlang is_empty
#' @noRd
#' @keywords Internal
suggest_functions_message <- function(suggested_functions,
                                      is_sf,
                                      .envir = parent.frame()) {

  # if POINT sf class, suggest `set_coordinates_sf()`
  if(isTRUE(is_sf)) {
    # add
    suggested_functions <- c("set_coordinates_sf()", suggested_functions)
  }

  # add pipe when there are multiple suggested functions
  if(length(suggested_functions) > 1) {
    suggested_functions_piped <- c(paste0(utils::head(suggested_functions, -1), " |> "),
                                   utils::tail(suggested_functions, 1))
  } else {
    suggested_functions_piped <- suggested_functions
  }

  # test whether user doesn't need any additional functions
  workflow_is_empty <- is_empty(suggested_functions_piped)

    if(!any(workflow_is_empty)) {
      cat_line(style_italic(paste0("\n", "To make your data Darwin Core compliant, use the following workflow:", "\n")))
      cli_text("df |>")
      cli_div(theme = list(.alert = list(`margin-left` = 2, before = "")))
      lapply(suggested_functions_piped, cli_alert, .envir = .envir)
      cli_end()

    } else {
      cat_line(paste0("\n", add_emoji(), " ", col_green("Your dataframe is Darwin Core compliant!"), "\n"))
      cat_line(cli_text(paste0("Run checks, or use your dataframe to build a Darwin Core Archive with {.pkg galaxias}:\n")))
      cli_text("df |>")
      cli_div(theme = list(.alert = list(`margin-left` = 2, before = "")))
      lapply(paste0("check_dataset()"), cli_alert, .envir = .envir)
      cli_end()
    }
}


#' build additional functions message
#'
#' @importFrom cli col_grey
#' @importFrom cli cli_text
#' @importFrom cli cli_bullets
#' @importFrom cli cli_text
#' @importFrom cli cli_fmt
#' @importFrom cli ansi_collapse
#' @noRd
#' @keywords Internal
additional_functions_message <- function(optional_functions,
                                         .envir = parent.frame()) {

  # add list of optional functions
  if(length(optional_functions) >= 1) {
    optional_functions_string <- ansi_collapse(glue("`{optional_functions}`"),
                                               sep = ", ",
                                               last = ", ",
                                               trunc = 3)
    optional_functions_message <- paste0(
      "{optional_functions_string}") |> cli_text() |> cli_fmt()
  } else {
    optional_functions_message <- NA
  }

  if(!is.na(optional_functions_message)) {
    cli_text(paste0("Based on your matched terms, you can also add to your pipe: ", "\n"))
    cli_bullets(c("*" = optional_functions_message))
  }
  cli_bullets(c("i" = col_grey("See all `set_` functions at http://corella.ala.org.au/reference/index.html#add-rename-or-edit-columns-to-match-darwin-core-terms")))
}


#' Build full workfow message for `suggest_workflow()`
#'
#' @importFrom cli cli_div
#' @importFrom cli cli_end
#' @importFrom cli cli_h1
#' @importFrom cli cli_h3
#'
#' @noRd
#' @keywords Internal
full_workflow_message <- function(matched_values,
                                  unmatched_values,
                                  all_cols_match,
                                  req_terms_results,
                                  suggested_functions,
                                  is_sf,
                                  optional_functions) {

  # build final message with separate components

  # DwC terms
  cli_div()
  cli_h1("Matching Darwin Core terms")
  matching_terms_message(matched_values,
                         unmatched_values,
                         all_cols_match)

  cli_h1("Minimum required Darwin Core terms")
  minreq_terms_message(req_terms_results)
  cli_end()

  # Suggested workflow
  cli_h1("Suggested workflow")
  suggest_functions_message(suggested_functions,
                            is_sf)

  cli_h3(col_grey("Additional functions"))
  additional_functions_message(optional_functions)
}


#' Table of Darwin Core terms and their corresponding `set_` function
#'
#' @importFrom tibble lst
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @noRd
#' @keywords Internal
fn_to_term_table <- function() {

  main_terms <- c("basisOfRecord", "occurrenceID", "scientificName",
                  "occurrenceID", "scientificName", "decimalLatitude",
                  "decimalLongitude", "geodeticDatum",
                  "coordinateUncertaintyInMeters", "eventDate")

  terms_table <- corella::darwin_core_terms |>
    select("set_function", "term")

  main <- terms_table |>
    filter(.data$term %in% main_terms)

  optional <- terms_table |>
    filter(!.data$term %in% main_terms) |>
    filter(!is.na(.data$set_function))

  # main <- tibble::tribble(
  #   ~"set_function", ~"dwc_term",
  #   "set_occurrences()", "basisOfRecord",
  #   "set_occurrences()", "occurrenceID",
  #   "set_scientific_name()", "scientificName",
  #   "set_coordinates()", "decimalLatitude",
  #   "set_coordinates()", "decimalLongitude",
  #   "set_coordinates()", "geodeticDatum",
  #   "set_coordinates()", "coordinateUncertaintyInMeters",
  #   "set_datetime()", "eventDate"
  # )

  # optional <- tibble::tribble(
  #   ~"set_function", ~"dwc_term",
  #   "set_locality()", "continent",
  #   "set_locality()", "country",
  #   "set_locality()", "countryCode",
  #   "set_locality()", "stateProvince",
  #   "set_locality()", "locality",
  #   "set_taxonomy()", "kingdom",
  #   "set_taxonomy()", "phylum",
  #   "set_taxonomy()", "class",
  #   "set_taxonomy()", "order",
  #   "set_taxonomy()", "family",
  #   "set_taxonomy()", "genus",
  #   # "set_taxonomy()", "species",
  #   "set_taxonomy()", "specificEpithet",
  #   "set_taxonomy()", "vernacularName",
  #   "set_abundance()", "individualCount",
  #   "set_abundance()", "organismQuantity",
  #   "set_abundance()", "organismQuantityType",
  #   "set_abundance()", "organismQuantity",
  #   "set_collection()", "datasetID",
  #   "set_collection()", "datasetName",
  #   "set_collection()", "catalogNumber",
  #   "set_coordinates()", "coordinatePrecision",
  #   "set_scientific_name()", "taxonRank",
  #   "set_scientific_name()", "scientificNameAuthorship",
  #   "set_datetime()", "year",
  #   "set_datetime()", "month",
  #   "set_datetime()", "day",
  #   "set_datetime()", "eventTime",
  #   "set_individual_traits()", "individualID",
  #   "set_individual_traits()", "lifeStage",
  #   "set_individual_traits()", "sex",
  #   "set_individual_traits()", "vitality",
  #   "set_individual_traits()", "reproductiveCondition",
  #   "set_observer()", "recordedBy",
  #   "set_observer()", "recordedByID",
  #   "set_events()", "eventID",
  #   "set_events()", "eventType",
  #   "set_events()", "parentEventID",
  #   "set_license()", "license",
  #   "set_license()", "rightsHolder",
  #   "set_license()", "accessRights",
  #   "set_measurements()", "measurementValue",
  #   "set_measurements()", "measurementID",
  #   "set_measurements()", "measurementUnit",
  #   "set_measurements()", "measurementType"
  # )

  table <- lst(main, optional) # named list

  return(table)
}


#' Build table for messaging about minimum required terms
#'
#' @importFrom cli ansi_align
#' @importFrom cli ansi_collapse
#' @importFrom cli ansi_nchar
#' @importFrom cli col_blue
#' @importFrom cli col_green
#' @importFrom cli col_red
#' @importFrom cli symbol
#' @importFrom dplyr case_when
#' @importFrom dplyr group_by
#' @importFrom dplyr select
#' @importFrom dplyr full_join
#' @importFrom dplyr join_by
#' @importFrom rlang .data
#' @importFrom tidyr replace_na
#' @importFrom tidyr unnest
#' @noRd
#' @keywords Internal
build_req_terms_table <- function(req_terms) {

  # Unnest & concatenate terms by group
  missing_results <- req_terms |>
    select(-"matched") |>
    unnest(cols = "missing") |>
    group_by(.data$term_group) |>
    mutate( # glue names
      missing = ansi_collapse(.data$missing, sep = ", ", last = ", ")
    ) |>
    unique()

  matched_results <- req_terms |>
    select(-"missing") |>
    unnest(cols = "matched") |>
    group_by(.data$term_group) |>
    mutate( # glue names
      matched = ansi_collapse(.data$matched, sep = ", ", last = ", ")
    ) |>
    unique()

  req_terms_message <- missing_results |>
    full_join(matched_results,
              join_by("term_group", "result")) |>
    # remove other Identifier terms if one or more are matched
    mutate(
      missing = case_when(
        .data$term_group == "Identifier (at least one)" & !is.na(.data$matched) ~ NA,
        .default = .data$missing
        )) |>
    # add blank space for correct message formatting
    tidyr::replace_na(list(missing = stringr::str_pad("-", width = 16, side = "right"),
                           matched = stringr::str_pad("-", width = 16, side = "right")))

  # Group terms found vs missing
  pass <- req_terms_message |>
    filter(.data$result == "pass")

  failed <- req_terms_message |>
    filter(.data$result == "fail")

  pass_group <- glue("{pass$term_group}")
  pass_matched <- glue("{pass$matched}")
  pass_missing <- glue("{pass$missing}")
  fail_group <- glue("{failed$term_group}")
  fail_matched <- glue("{failed$matched}")
  fail_missing <- glue("{failed$missing}")

  headers <- paste0(
    "  ",
    ansi_align(col_blue("Type"), max(ansi_nchar(c(pass_group, fail_group)))), " ",
    ansi_align(col_blue("Matched term(s)"), max(ansi_nchar(c(pass_matched, fail_matched)))), " ",
    ansi_align(col_blue("Missing term(s)"), max(ansi_nchar(c(pass_missing, fail_missing)))),"\n",
    collapse = "\n"
  )

  bullets_found <- paste0(paste0(
    col_green(symbol$tick), " ",
    ansi_align(pass_group, max(ansi_nchar(c(pass_group, fail_group)))), " ",
    ansi_align(col_green(pass_matched), max(ansi_nchar(c(pass_matched, fail_matched)))), " ",
    ansi_align(col_red(pass_missing), max(ansi_nchar(c(pass_missing, fail_missing)))), " ",
    collapse = "\n"
  ), "\n")

  bullets_missing <- paste0(paste0(
    col_red(symbol$cross), " ",
    ansi_align(fail_group, max(ansi_nchar(c(pass_group, fail_group)))), " ",
    ansi_align(col_green(fail_matched), max(ansi_nchar(c(pass_matched, fail_matched)))), " ",
    ansi_align(col_red(fail_missing), max(ansi_nchar(c(pass_missing, fail_missing)))), " ",
    collapse = "\n"
  ), "\n")


  # Remove tick when all terms are matched or missing
  if(nrow(pass) == 0) {
    bullets_found <- NULL
  }

  if(nrow(failed) == 0) {
    # celebrate
    bullets_missing <- NULL
  }

  # final message
  paste0(
    headers,
    bullets_found,
    bullets_missing
  )
}

#' Minimum required terms for a Darwin Core compliant data archive
#' @noRd
#' @keywords Internal
required_terms <- function() {
  terms <- list(
    identifier = c(
      "occurrenceID",
      "catalogNumber",
      "recordNumber"
    ),
    basis = c(
      "basisOfRecord"
    ),
    name = c(
      "scientificName"
    ),
    location = c(
      "decimalLatitude",
      "decimalLongitude",
      "geodeticDatum",
      "coordinateUncertaintyInMeters"
    ),
    date = c(
      "eventDate"
    )
  )
}

#' Return missing minimum required terms
#' @importFrom dplyr bind_rows
#' @noRd
#' @keywords Internal
check_required_terms <- function(user_column_names) {

  terms <- required_terms()

  id <- tibble(
    term_group = "Identifier (at least one)",
    missing = list(terms$identifier[!terms$identifier %in% user_column_names]),
    matched = list(terms$identifier[terms$identifier %in% user_column_names])
  ) |>
    mutate(
      result = ifelse(length(missing[[1]]) != 3, "pass", "fail")
    )
  basis <- tibble(
    term_group = "Record type",
    missing = list(terms$basis[!terms$basis %in% user_column_names]),
    matched = list(terms$basis[terms$basis %in% user_column_names])
  ) |>
    mutate(
      result = ifelse(length(missing[[1]]) == 0, "pass", "fail")
    )
  name <- tibble(
    term_group = "Scientific name",
    missing = list(terms$name[!terms$name %in% user_column_names]),
    matched = list(terms$name[terms$name %in% user_column_names])
  ) |>
    mutate(
      result = ifelse(length(missing[[1]]) == 0, "pass", "fail")
    )
  location <- tibble(
    term_group = "Location",
    missing = list(terms$location[!terms$location %in% user_column_names]),
    matched = list(terms$location[terms$location %in% user_column_names])
  ) |>
    mutate(
      result = ifelse(length(missing[[1]]) == 0, "pass", "fail")
    )
  date <- tibble(
    term_group= "Date/Time",
    missing = list(terms$date[!terms$date %in% user_column_names]),
    matched = list(terms$date[terms$date %in% user_column_names])
  ) |>
    mutate(
      result = ifelse(length(missing[[1]]) == 0, "pass", "fail")
    )

  # combine
  all_terms <- bind_rows(id, basis, name, location, date)

  # convert empty row value to NULL
  result <- all_terms |>
    mutate(
      missing = lapply(.data$missing, function(x) if(identical(x, character(0))) NULL else x),
      matched = lapply(.data$matched, function(x) if(identical(x, character(0))) NULL else x)
    )

  return(result)
}


#' Add happy emoji
#' @noRd
#' @keywords Internal
add_emoji <- function() {
  emoji <- c(
    "\U0001f600", # smile
    "\U0001f973", # party face
    "\U0001f638", # cat grin
    "\U0001f308", # rainbow
    "\U0001f947", # gold medal
    "\U0001f389", # party popper
    "\U0001f38a" # confetti ball
  )
  sample(emoji, 1)
}

Try the corella package in your browser

Any scripts or data that you put into this service are public.

corella documentation built on April 4, 2025, 12:20 a.m.