R/dct_check_sci_name.R

Defines functions dct_check_sci_name check_sci_name_is_uniq check_sci_name_not_na

Documented in dct_check_sci_name

#' check_sci_name sub-check: check that no scientificName is missing
#' Required columns:
#' - scientificName
#' @inherit dct_check_sci_name
#' @param run Logical; should this check be run? If FALSE, return NULL
#' @noRd
#' @autoglobal
check_sci_name_not_na <- function(tax_dat,
                                  on_fail = dct_options()$on_fail,
                                  on_success = dct_options()$on_success,
                                  run = TRUE,
                                  quiet = dct_options()$quiet) {
  # Early exit with NULL if req'd cols not present
  if (!"scientificName" %in% colnames(tax_dat) || run == FALSE) {
    return(NULL)
  }

  # Check for missing ID
  missing_sci_name <- tax_dat$scientificName[is.na(tax_dat$scientificName)]

  # Format results
  if (on_fail == "error") {
    assertthat::assert_that(
      length(missing_sci_name) == 0,
      msg = glue::glue(
        "check_sci_name failed
         scientificName detected with missing value
         {make_msg('scientificName', missing_sci_name)}
      "
      )
    )
  }
  if (on_fail == "summary") {
    err_msg <- "scientificName detected with missing value"
    assert_that_d(
      length(missing_sci_name) == 0,
      data = tibble::tibble(
        scientificName = missing_sci_name,
        check = "check_sci_name",
        error = err_msg
      ),
      msg = err_msg,
      quiet = quiet
    )
  }
  if (on_success == "data") {
    return(tax_dat)
  }
  if (on_success == "logical") {
    return(TRUE)
  }
}

#' check_sci_name sub-check: check that all scientificName values are unique
#' Required columns:
#' - scientificName
#' @inherit check_sci_name_not_na
#' @noRd
#' @autoglobal
check_sci_name_is_uniq <- function(tax_dat,
                                   on_fail = dct_options()$on_fail,
                                   on_success = dct_options()$on_success,
                                   run = TRUE,
                                   quiet = dct_options()$quiet) {
  # Early exit with NULL if req'd cols not present
  if (!"scientificName" %in% colnames(tax_dat) || run == FALSE) {
    return(NULL)
  }

  # Check for duplicated ID
  duplicated_sci_name <- tax_dat$scientificName[
    duplicated(tax_dat$scientificName)
  ]

  bad_tax_id <- NULL
  if ("taxonID" %in% colnames(tax_dat)) {
    bad_rows <- tax_dat[
      tax_dat$scientificName %in% duplicated_sci_name,
      c("taxonID", "scientificName")
    ]
    bad_rows <- unique(bad_rows)
    bad_tax_id <- bad_rows$taxonID
    duplicated_sci_name <- bad_rows$scientificName
  }

  # Format results
  if (on_fail == "error") {
    assertthat::assert_that(
      length(duplicated_sci_name) == 0,
      msg = glue::glue(
        "check_sci_name failed
         scientificName detected with duplicated value
         {make_msg('scientificName', duplicated_sci_name)}
      "
      )
    )
  }
  if (on_fail == "summary") {
    err_msg <- "scientificName detected with duplicated value"
    assert_that_d(
      length(duplicated_sci_name) == 0,
      data = tibble::tibble(
        taxonID = bad_tax_id,
        scientificName = duplicated_sci_name,
        check = "check_sci_name",
        error = err_msg
      ),
      msg = err_msg,
      quiet = quiet
    )
  }
  if (on_success == "data") {
    return(tax_dat)
  }
  if (on_success == "logical") {
    return(TRUE)
  }
}

#' Check scientificName
#'
#' Check for correctly formatted scientificName column in Darwin Core
#' taxonomic data.
#'
#' The following rules are enforced:
#' - scientificName may not be missing (NA)
#' - scientificName must be unique
#'
#' @param tax_dat `r param_tax_dat`
#' @param on_fail `r param_on_fail`
#' @param on_success `r param_on_success`
#' @param quiet `r param_quiet`
#'
#' @inherit dct_check_taxon_id return
#' @example inst/examples/dct_check_sci_name.R
#' @autoglobal
#' @export
#'
dct_check_sci_name <- function(tax_dat,
                               on_fail = dct_options()$on_fail,
                               on_success = dct_options()$on_success,
                               quiet = dct_options()$quiet) {
  # Check input format
  assertthat::assert_that(
    inherits(tax_dat, "data.frame"),
    msg = "'tax_dat' must be of class 'data.frame'"
  )
  assertthat::assert_that(assertthat::is.string(on_fail))
  assertthat::assert_that(assertthat::is.string(on_success))
  assertthat::assert_that(
    on_fail %in% c("error", "summary"),
    msg = "on_fail must be one of 'error' or 'summary'"
  )
  assertthat::assert_that(
    on_success %in% c("data", "logical"),
    msg = "on_success must be one of 'data' or 'logical'"
  )

  # Run main checks
  check_res <- list(
    # Check for required columns
    assert_col(
      tax_dat, "scientificName", "character",
      req_by = "check_sci_name", on_fail = on_fail, quiet = quiet
    ),
    # Check taxonID not NA
    check_sci_name_not_na(
      tax_dat,
      on_fail = on_fail, on_success = "logical", quiet = quiet
    ),
    # Check taxonID is unique
    check_sci_name_is_uniq(
      tax_dat,
      on_fail = on_fail, on_success = "logical", quiet = quiet
    )
  ) |>
    # drop any NULL results
    purrr::compact()

  # Format results
  if (on_fail == "summary") {
    if (any_not_true(check_res)) {
      return(unique(bind_rows_f(check_res)))
    }
  }
  if (on_success == "data") {
    return(tax_dat)
  }
  if (on_success == "logical") {
    return(TRUE)
  }
}

Try the dwctaxon package in your browser

Any scripts or data that you put into this service are public.

dwctaxon documentation built on May 29, 2024, 5:53 a.m.