# test-dupeSummary.R
# In BeeBDC: Occurrence Data Cleaning

requireNamespace("dplyr")


# Synthetic occurrence table (12 rows, all with dataSource "GBIF") that is
# passed to BeeBDC::dupeSummary() below. Some rows repeat coordinates,
# scientificName, recordedBy and catalogue numbers; rows 1-2 share a gbifID and
# rows 9-10 share an occurrenceID.
# NOTE(review): "fakeID2" appears twice in database_id and "fakeID3" never
# appears - confirm whether that is intentional before changing it, since the
# expected duplicate counts may depend on it.
testData <- dplyr::tibble(
  database_id = c(
    "fakeID1", "fakeID2", "fakeID2", "fakeID4", "fakeID5", "fakeID6",
    "fakeID7", "fakeID8", "fakeID9", "fakeID10", "fakeID11", "fakeID12"
  ),
  decimalLatitude = c(
    1.111, 2.111, 3.111, 4.111, 5.111, 6.111,
    6.111, 6.111, 7.111, 7.111, 7.111, 7.111
  ),
  decimalLongitude = c(
    10.111, 11.111, 12.111, 13.111, 14.111, 15.111,
    15.111, 16.111, 16.111, 16.111, 17.111, 18.111
  ),
  scientificName = c(
    "a", "b", "c", "d", "e", "f",
    "f", "f", "g", "g", "h", "g"
  ),
  # Row 10 has no event date.
  eventDate = c(
    "1988-10-13 00:00:00 UTC",
    "1930-06-06 00:00:00 UTC",
    "1994-01-01 00:00:00 UTC",
    "1987-08-06 00:00:00 UTC",
    "1995-03-31 00:00:00 UTC",
    "2000-05-01 00:00:00 UTC",
    "1998-01-01 00:00:00 UTC",
    "1994-01-01 00:00:00 UTC",
    "1994-01-01 00:00:00 UTC",
    NA_character_,
    "1994-01-01 00:00:00 UTC",
    "1972-04-28 00:00:00 UTC"
  ),
  recordedBy = c(
    "bee-er1, bee-er2", "bee-er3", "bee-er4", "bee-er5",
    rep("bee-er6", 5),
    "bee-er7", "bee-er1", "bee-er2"
  ),
  catalogNumber = c(
    "beeCode0001", "beeCode0002", "beeCode0003", "beeCode0004",
    rep("beeCode0006", 5),
    "be01", "boe6", "beeCode0006"
  ),
  otherCatalogNumbers = c(
    "otherBeeCode0001", "otherBeeCode0002", "otherBeeCode0003",
    "otherBeeCode0004", "otherBeeCode0006", "otherBeeCode0006",
    "otherBeeCode0007", "otherBeeCode0007", "otherBeeCode0001",
    "otherBeeCode0045", "otherBeeCode0678", "otherBeeCode194"
  ),
  institutionCode = c(
    rep("beeHouse", 5),
    rep("waspHouse", 4),
    rep("beeHouse", 3)
  ),
  # Only the first two rows carry a gbifID, and it is the same one.
  gbifID = c(rep("asdflgjkh11", 2), rep(NA_character_, 10)),
  # Only rows 9 and 10 carry an occurrenceID, and it is the same one.
  occurrenceID = c(
    rep(NA_character_, 8),
    rep("duplicatecode12345", 2),
    rep(NA_character_, 2)
  ),
  # recordId is missing for every row.
  recordId = rep(NA_character_, 12),
  id = letters[seq_len(12)],
  dataSource = rep("GBIF", 12)
)


# Run BeeBDC::dupeSummary() on the synthetic testData table and keep the result
# in check_time; every expectation further down inspects this object.
# path is set to the session temporary directory (presumably where
# dupeSummary writes any output files - confirm against its documentation).
check_time <- BeeBDC::dupeSummary(
  data = testData,
  path = tempdir(),
  # Options are "ID", "collectionInfo", or "both"
  duplicatedBy = "collectionInfo", 
  # The columns to generate completeness info from (and to sort by completeness)
  completeness_cols = c("decimalLatitude",  "decimalLongitude",
                        "scientificName", "eventDate"),
  # idColumns = c("gbifID", "occurrenceID", "recordId","id"),
  # The columns to ADDITIONALLY consider when finding duplicates in collectionInfo
  collectionCols = c("decimalLatitude", "decimalLongitude", "scientificName", "eventDate", 
                     "recordedBy"),
  # The columns to combine, one-by-one, with the collectionCols
  collectInfoColumns = c("catalogNumber", "otherCatalogNumbers"),
  # Custom comparisons - each list element is a set of columns to compare.
  # RAW custom comparisons do not use the character and number thresholds
  CustomComparisonsRAW = dplyr::lst(c("catalogNumber", "institutionCode", "scientificName")),
  # Other custom comparisons use the character and number thresholds
  CustomComparisons = dplyr::lst(c("gbifID", "scientificName"),
                                  c("occurrenceID", "scientificName"),
                                  c("recordId", "scientificName"),
                                  c("id", "scientificName")),
  # The order in which you want to KEEP duplicates, based on data source.
  # See unique(testData$dataSource) for the sources present; in this fixture
  # every row is "GBIF".
  sourceOrder = c("CAES", "Gai", "Ecd","BMont", "BMin", "EPEL", "ASP", "KP", "EcoS", "EaCO",
                  "FSCA", "Bal", "SMC", "Lic", "Arm",
                  "USGS", "ALA", "GBIF","SCAN","iDigBio"),
  # Prefix ordering is done using the database_id prefix, not the dataSource prefix.
  # None of the fixture's database_id values ("fakeID...") start with these prefixes.
  prefixOrder = c("Paige", "Dorey"),
  # Thresholds referred to by the (non-RAW) custom comparisons above.
  characterThreshold = 2,
  numberThreshold = 3,
  numberOnlyThreshold = 5
)

  # Get counts of the number of expected kept duplicates and duplicats
test_keptDuplicates <-  sum(check_time$duplicateStatus == "Kept duplicate")
test_duplicates <- sum(check_time$duplicateStatus == "Duplicate")

# Test duplicate numbers
testthat::test_that("dupeSummary kept duplicates", {
  testthat::expect_equal(test_keptDuplicates, 2)
})
testthat::test_that("dupeSummary duplicates", {
  testthat::expect_equal(test_duplicates, 3)
})
# Test concordance with TRUE/FALSE and number of duplicates
testthat::test_that("dupeSummary duplicates", {
  testthat::expect_equal(test_duplicates, sum(check_time$.duplicates == FALSE))
})

# Test class
# The underlying type of the returned object is expected to be "list".
testthat::test_that("dupeSummary expected class - simple", {
  testthat::expect_type(check_time, "list")
})
# The returned object is expected to carry a tibble class ("tbl_df" or "tbl").
# inherits() compares whole class names, so no regex matching on the class
# attribute (and no stringr call) is needed; it is TRUE when either name is
# among the object's classes.
testthat::test_that("dupeSummary expected class - complex", {
  testthat::expect_true(inherits(check_time, c("tbl_df", "tbl")))
})