test-countryNameCleanR.R
In BeeBDC: Occurrence Data Cleaning

requireNamespace("readr")
requireNamespace("dplyr")
library(dplyr) ## could not use %>% without loading as library


# make yourself some test data which has common problem names for the USA and one row that is already correct
testData <- dplyr::tribble(
              ~database_id, ~decimalLatitude, ~decimalLongitude,                  ~country, ~countryCode,                 ~scientificName, ~dataSource, ~recordedBy,        ~order,  ~family, ~genus, ~specificEpithet, ~scientificNameAuthorship,
               "fake SCAN",            38.54,        -121.75667,           "United States",           NA, "Apis mellifera Linneaus, 1758",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                  "U.S.A.",           NA, "Apis mellifera Linneaus, 1759",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                  "U.S.A.",           NA, "Apis mellifera Linneaus, 1760",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                      "US",           NA, "Apis mellifera Linneaus, 1761",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                     "USA",           NA, "Apis mellifera Linneaus, 1762",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                     "usa",           NA, "Apis mellifera Linneaus, 1763",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,           "UNITED STATES",           NA, "Apis mellifera Linneaus, 1764",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,"United States of America",           NA, "Apis mellifera Linneaus, 1758",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758",
               "fake SCAN",            38.54,        -121.75667,                     "usa",           NA, "Apis mellifera Linneaus, 1765",      "fake",          NA, "Hymenoptera", "apidae", "Apis",      "mellifera",          "Linnaeus, 1758"
              )

# common problem table from the function documentation
commonProblems <- dplyr::tibble(problem = c('U.S.A.', 'US','USA','usa','UNITED STATES',
                                            'United States','U.S.A','MX','CA','Bras.','Braz.',
                                            'Brasil','CNMI','USA TERRITORY: PUERTO RICO'),
                                 fix = c('United States of America','United States of America',
                                        'United States of America','United States of America',
                                        'United States of America','United States of America',
                                        'United States of America','Mexico','Canada','Brazil',
                                        'Brazil','Brazil','Northern Mariana Islands','PUERTO.RICO'))


# run the function!
testOut <- BeeBDC::countryNameCleanR(data = testData, commonProblems = commonProblems)


# check that country column has only one value (correct value)
numberOfCountries <- testOut %>% group_by(country) %>% summarise()


# number of NA values in fix column
numberNAs <- sum(is.na(testOut$fix))


# Test the number of expected fixed and NA values in the fix column and
# verify addition of new column
testthat::test_that("number of country names in the fixed country column" , {
  testthat::expect_equal(ncol(numberOfCountries), 1)
})

testthat::test_that("number of unchanged rows in data", {
  testthat::expect_equal(numberNAs, 1)
})