# test-idMatchR.R
# In BeeBDC: Occurrence Data Cleaning

requireNamespace("dplyr")
requireNamespace("stringr")

  # Load the flagged example dataset from BeeBDC
beesFlagged <- BeeBDC::beesFlagged

  # Build a stand-in "priorData" dataset from the first fifty rows only
priorRun <- dplyr::slice_head(beesFlagged, n = 50)


  #### 1.0 Exclude ASP ####
  # Renumber every database_id in the current data, then run idMatchR so that the first fifty
  # records can be matched back to the database_id values they carry in priorRun
testOut <- BeeBDC::idMatchR(
  currentData = dplyr::mutate(
    beesFlagged,
      # Remove the first run of digits from each id and append the row number instead
    database_id = paste0(stringr::str_replace(database_id, "[0-9]+", ""),
                         dplyr::row_number())),
  priorData = priorRun,
  # Earlier entries are given preference over later ones
  matchBy = dplyr::lst(c("gbifID"),
                       c("catalogNumber", "institutionCode", "dataSource"),
                       c("occurrenceID", "dataSource"),
                       c("recordId", "dataSource"),
                       c("id"),
                       # Because INHS was entered as its own dataset but is now included in
                       # the GBIF download...
                       c("catalogNumber", "institutionCode")),
  # Static datasets to exclude from matching, identified by their database_id prefix
  # (the text before the first underscore)
  excludeDataset = c("ASP", "BMin", "BMont", "CAES", "EaCO", "Ecd", "EcoS",
                     "Gai", "KP", "EPEL", "CAES", "EaCO", "FSCA", "SMC", "Lic", "Arm"))


# Tally how the database_ids returned by idMatchR compare with the original and renumbered ids
  # Returned ids that are present among the original database_ids
resultsMatched <- sum(testOut$database_id %in% beesFlagged$database_id)
  # Returned ids that are still one of the renumbered ids that were passed in as currentData
resultsExcluded <- sum(testOut$database_id %in% dplyr::pull(
  dplyr::mutate(beesFlagged,
                database_id = paste0(stringr::str_replace(database_id, "[0-9]+", ""),
                                     dplyr::row_number())),
  database_id))
  # Returned ids that are absent from the original database_ids. This is the complement of
  # resultsMatched; it used to be an exact copy of that expression, so the "not matched"
  # expectation merely repeated the "matched" one.
  # NOTE(review): the expected value of 50 assumes testOut has 100 rows - confirm.
resultsNotMatched <- sum(!(testOut$database_id %in% beesFlagged$database_id))

# Check the expected counts and then the output format (data frames and tibbles are a special
# case of lists, hence the "list" type).
# Each description names the scenario so that a failure here can be told apart from the
# other idMatchR run in this file.
testthat::test_that("idMatchR results successfully matched (ASP excluded)", {
  testthat::expect_equal(resultsMatched, 50)
})
testthat::test_that("idMatchR results not matched (ASP excluded)", {
  testthat::expect_equal(resultsNotMatched, 50)
})
testthat::test_that("idMatchR results excluded because in excludeDatasets (ASP excluded)", {
  testthat::expect_equal(resultsExcluded, 1)
})

testthat::test_that("idMatchR expected class (ASP excluded)", {
  testthat::expect_type(testOut, "list")
})


  #### 2.0 Don't exclude ASP ####
# Same set-up as before: renumber every database_id in the current data and run idMatchR so
# that the first fifty records can be matched back to their database_id values in priorRun
testOut2 <- BeeBDC::idMatchR(
  currentData = dplyr::mutate(
    beesFlagged,
      # Remove the first run of digits from each id and append the row number instead
    database_id = paste0(stringr::str_replace(database_id, "[0-9]+", ""),
                         dplyr::row_number())),
  priorData = priorRun,
  # Earlier entries are given preference over later ones
  matchBy = dplyr::lst(c("gbifID"),
                       c("catalogNumber", "institutionCode", "dataSource"),
                       c("occurrenceID", "dataSource"),
                       c("recordId", "dataSource"),
                       c("id"),
                       # Because INHS was entered as its own dataset but is now included in
                       # the GBIF download...
                       c("catalogNumber", "institutionCode")),
  # Datasets can be excluded from matching by their database_id prefix (the text before the
  # first underscore); this time nothing is excluded, so the ASP data take part as well
  excludeDataset = NULL)

# Tally how the database_ids returned by idMatchR compare with the original and renumbered ids
  # Returned ids that are present among the original database_ids
resultsMatched <- sum(testOut2$database_id %in% beesFlagged$database_id)
  # Returned ids that are still one of the renumbered ids that were passed in as currentData
resultsExcluded <- sum(testOut2$database_id %in% dplyr::pull(
  dplyr::mutate(beesFlagged,
                database_id = paste0(stringr::str_replace(database_id, "[0-9]+", ""),
                                     dplyr::row_number())),
  database_id))
  # Returned ids that are absent from the original database_ids. This is the complement of
  # resultsMatched; it used to be an exact copy of that expression, so the "not matched"
  # expectation merely repeated the "matched" one.
  # NOTE(review): the expected value of 50 assumes testOut2 has 100 rows - confirm.
resultsNotMatched <- sum(!(testOut2$database_id %in% beesFlagged$database_id))

# Check the expected counts and then the output format (data frames and tibbles are a special
# case of lists, hence the "list" type).
# Each description names the scenario so that a failure here can be told apart from the
# other idMatchR run in this file.
testthat::test_that("idMatchR results successfully matched (nothing excluded)", {
  testthat::expect_equal(resultsMatched, 50)
})
testthat::test_that("idMatchR results not matched (nothing excluded)", {
  testthat::expect_equal(resultsNotMatched, 50)
})
testthat::test_that("idMatchR results excluded because in excludeDatasets (nothing excluded)", {
  testthat::expect_equal(resultsExcluded, 0)
})

# Mirror the class check made for the first run
testthat::test_that("idMatchR expected class (nothing excluded)", {
  testthat::expect_type(testOut2, "list")
})