Nothing
requireNamespace("dplyr")
testData <- dplyr::tibble(
database_id = c("fakeID1","fakeID2", "fakeID2", "fakeID4","fakeID5", "fakeID6", "fakeID7",
"fakeID8","fakeID9", "fakeID10","fakeID11","fakeID12"),
decimalLatitude = c(1.111, 2.111, 3.111, 4.111, 5.111, 6.111, 6.111, 6.111, 7.111, 7.111, 7.111, 7.111),
decimalLongitude = c(10.111, 11.111, 12.111, 13.111, 14.111, 15.111, 15.111, 16.111, 16.111, 16.111, 17.111, 18.111),
scientificName = c("a", "b", "c", "d", "e", "f", "f", "f", "g", "g", "h", "g"),
eventDate = c("1988-10-13 00:00:00 UTC", "1930-06-06 00:00:00 UTC", "1994-01-01 00:00:00 UTC",
"1987-08-06 00:00:00 UTC", "1995-03-31 00:00:00 UTC", "2000-05-01 00:00:00 UTC",
"1998-01-01 00:00:00 UTC", "1994-01-01 00:00:00 UTC","1994-01-01 00:00:00 UTC",
NA,"1994-01-01 00:00:00 UTC",
"1972-04-28 00:00:00 UTC"),
recordedBy = c("bee-er1, bee-er2", "bee-er3", "bee-er4", "bee-er5", "bee-er6","bee-er6",
"bee-er6", "bee-er6", "bee-er6", "bee-er7", "bee-er1", "bee-er2"),
catalogNumber = c("beeCode0001","beeCode0002","beeCode0003","beeCode0004","beeCode0006","beeCode0006",
"beeCode0006","beeCode0006","beeCode0006","be01","boe6","beeCode0006"),
otherCatalogNumbers = c("otherBeeCode0001","otherBeeCode0002","otherBeeCode0003","otherBeeCode0004",
"otherBeeCode0006","otherBeeCode0006",
"otherBeeCode0007","otherBeeCode0007","otherBeeCode0001","otherBeeCode0045",
"otherBeeCode0678","otherBeeCode194"),
institutionCode = c("beeHouse","beeHouse","beeHouse","beeHouse","beeHouse","waspHouse","waspHouse",
"waspHouse","waspHouse","beeHouse","beeHouse","beeHouse"),
gbifID = c("asdflgjkh11","asdflgjkh11",NA_character_,NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),
occurrenceID = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_,"duplicatecode12345","duplicatecode12345",NA_character_,NA_character_),
recordId = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,
NA_character_,NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),
id = c("a","b","c","d","e","f","g","h","i","j","k","l"),
dataSource = c("GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF","GBIF")
)
check_time <- BeeBDC::dupeSummary(
data = testData,
path = tempdir(),
# options are "ID","collectionInfo", or "both"
duplicatedBy = "collectionInfo",
# The columns to generate completeness info from (and to sort by completness)
completeness_cols = c("decimalLatitude", "decimalLongitude",
"scientificName", "eventDate"),
# idColumns = c("gbifID", "occurrenceID", "recordId","id"),
# The columns to ADDITIONALLY consider when finding duplicates in collectionInfo
collectionCols = c("decimalLatitude", "decimalLongitude", "scientificName", "eventDate",
"recordedBy"),
# The columns to combine, one-by-one with the collectionCols
collectInfoColumns = c("catalogNumber", "otherCatalogNumbers"),
# Custom comparisons — as a list of columns to compare
# RAW custom comparisons do not use the character and number thresholds
CustomComparisonsRAW = dplyr::lst(c("catalogNumber", "institutionCode", "scientificName")),
# Other custom comparisons use the character and number thresholds
CustomComparisons = dplyr::lst(c("gbifID", "scientificName"),
c("occurrenceID", "scientificName"),
c("recordId", "scientificName"),
c("id", "scientificName")),
# The order in which you want to KEEP duplicated based on data source
# try unique(check_time$dataSource)
sourceOrder = c("CAES", "Gai", "Ecd","BMont", "BMin", "EPEL", "ASP", "KP", "EcoS", "EaCO",
"FSCA", "Bal", "SMC", "Lic", "Arm",
"USGS", "ALA", "GBIF","SCAN","iDigBio"),
# Prefix ordering is done using the database_id prefix, not the dataSource prefix.
prefixOrder = c("Paige", "Dorey"),
characterThreshold = 2,
numberThreshold = 3,
numberOnlyThreshold = 5
)
# Get counts of the number of expected kept duplicates and duplicats
test_keptDuplicates <- sum(check_time$duplicateStatus == "Kept duplicate")
test_duplicates <- sum(check_time$duplicateStatus == "Duplicate")
# Test duplicate numbers
testthat::test_that("dupeSummary kept duplicates", {
testthat::expect_equal(test_keptDuplicates, 2)
})
testthat::test_that("dupeSummary duplicates", {
testthat::expect_equal(test_duplicates, 3)
})
# Test concordance with TRUE/FALSE and number of duplicates
testthat::test_that("dupeSummary duplicates", {
testthat::expect_equal(test_duplicates, sum(check_time$.duplicates == FALSE))
})
# Test class
testthat::test_that("dupeSummary expected class - simple", {
testthat::expect_type(check_time, "list")
})
testthat::test_that("dupeSummary expected class - complex", {
testthat::expect_true(any(stringr::str_detect(attributes(check_time)$class, "tbl_df|tbl")))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.