R/jbd_coordCountryInconsistent.R
In BeeBDC: Occurrence Data Cleaning

Documented in jbd_coordCountryInconsistent

   # This function was written by James B Dorey starting from the 3rd-7th of June 2022.
# It is intended to replace the bdc function bdc_coordinates_country_inconsistent for larger datasets
# where it is simply not feasible. 
  # Initial function finished: _____
  # For questions, please ask James at jbdorey[at]me.com

#' Flags coordinates that are inconsistent with the stated country name
#' 
#' Compares stated country name in an occurrence record with record's coordinates using 
#' rnaturalearth data. The prefix, jbd_ is meant
#' to distinguish this function from the original [bdc::bdc_coordinates_country_inconsistent()].
#' This functions will preferably use the countryCode and country_suggested columns 
#' generated by `bdc::bdc_country_standardized()`; please run it on your dataset prior to running
#' this function.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param lon Character. The name of the column to use as longitude. Default = "decimalLongitude".
#' @param lat Character. The name of the column to use as latitude. Default = "decimalLatitude".
#' @param scale Numeric or character. To be passed to [rnaturalearth::ne_countries()]'s scale.
#' Scale of map to return, one of 110, 50, 10 or "small", "medium", "large". 
#' Smaller values return higher-resolution maps.
#' @param pointBuffer Numeric. Amount to buffer points, in decimal degrees. If the point is outside 
#' of a country, but within this point buffer, it will not be flagged. Default = 0.01.
#' @param mc.cores Numeric. If > 1, the st_intersects function will run in parallel
#' using mclapply using the number of cores specified. If = 1 then it will be run using a serial
#' loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#'  Default = 1.
#' @param stepSize Numeric. The number of occurrences to process in each chunk. Default = 1000000.
#'
#' @return The input occurrence data with a new column, .coordinates_country_inconsistent
#' @export
#' 
#' @importFrom dplyr %>%
#'
#' @examples
#' if(requireNamespace("rnaturalearthdata")){
#' beesRaw_out <- jbd_coordCountryInconsistent(
#'   data = BeeBDC::beesRaw,
#'   lon = "decimalLongitude",
#'   lat = "decimalLatitude",
#'   scale = 50,
#'   pointBuffer = 0.01)
#' } # END if require

jbd_coordCountryInconsistent <- function(
    data = NULL,
    lon = "decimalLongitude",
    lat = "decimalLatitude",
    scale = 50,
    pointBuffer = 0.01,
    stepSize = 1000000,
    mc.cores = 1){
  
  database_id <- decimalLatitude <- decimalLongitude <- country <- name_long <- iso_a2 <- 
    geometry <- admin <- sovereignt <- name <- . <- NULL
  .coordinates_outOfRange <- .coordinates_empty <- indexMatch <- BeeBDC_order <- NULL
  countryCode <- country_suggested <- NULL
  .coordinates_empt <- .data <- .coordinates_empty <- .coordinates_outOfRange <- isna_ <- NULL
  
startTime <- Sys.time()
requireNamespace("rnaturalearth")
requireNamespace("dplyr")
requireNamespace("ggspatial")
requireNamespace("mgsub")
requireNamespace("terra")

  #### 0.0 Prep ####
    ###### 0.1 fatal errors ####
if(!any(colnames(data) %in% "country")){
stop("There is no column called 'country' in the dataset. This is a minimum requirement.")
  }
  
    ###### 0.2 Coord quality ####
if(!any(colnames(data) %in% ".coordinates_outOfRange")){
    writeLines("No '.coordinates_outOfRange' column found, running bdc_coordinates_outOfRange...")
  
  bdc_coordinates_outOfRange_internal <- function(data, lat = "decimalLatitude", lon = "decimalLongitude") {
      .data <- .coordinates_outOfRange <- NULL
      data_filtered <- data %>%
        dplyr::select(dplyr::all_of(c(lon, lat))) %>%
        dplyr::rename(lon = dplyr::all_of(lon), lat = dplyr::all_of(lat)) %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), ~ as.numeric(.x)))
      data_flag <- data_filtered %>% dplyr::mutate(.coordinates_outOfRange = dplyr::case_when(
            lat < -90 | lat > 90 ~ FALSE,
            lon < -180 | lon > 180 ~ FALSE,TRUE ~ TRUE)) %>%
        dplyr::select(.coordinates_outOfRange)
      df <- dplyr::bind_cols(data, data_flag)
      message(paste("\nbdc_coordinates_outOfRange:\nFlagged",
          sum(df$.coordinates_outOfRange == FALSE),
          "records.\nOne column was added to the database.\n"))
      return(df)
    }

  data <- bdc_coordinates_outOfRange_internal(
    data = data,
    lat = lat,
    lon = lon)
}
###### 0.3 columns present ####
if(!any(colnames(data) %in% ".coordinates_empty")){
  writeLines("No '.coordinates_empty' column found, running bdc_coordinates_empty")
  
  check_col_internal <- function(data, col) {
    for (i in seq_along(col)) {
      if (!col[i] %in% colnames(data)) {
        stop("Column `", col[i], "` is not present in the data", call. = FALSE)}}}
  
  bdc_coordinates_empty_internal <- function(data, lat = "decimalLatitude", lon = "decimalLongitude") {
      if (!is.data.frame(data)) {
        stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE)}
      check_col_internal(data, c(lat, lon))
      df <- data %>%
        dplyr::select(dplyr::all_of(c(lat, lon))) %>%
        dplyr::mutate(dplyr::across(dplyr::all_of(c(lat, lon)), ~ as.numeric(.x))) %>%
        dplyr::mutate(isna_ = dplyr::if_any(dplyr::all_of(c(lat, lon)), ~ is.na(.x)),
          .coordinates_empty = ifelse(isna_, FALSE, TRUE)) %>%
        dplyr::select(.coordinates_empty)
      df <- dplyr::bind_cols(data, df)
      message(paste("\nbdc_coordinates_empty:\nFlagged",
          sum(df$.coordinates_empty == FALSE),
          "records.\nOne column was added to the database.\n"))
      return(df)
    }
  
  data <- bdc_coordinates_empty_internal(
    data = data,
    lat = lat,
    lon = lon)
}
if(!any(colnames(data) %in% "country_suggested")){
  writeLines(paste0("No 'country_suggested' column found, adding an empty (NA) placeholder. This",
                    " column can be added by running bdc::bdc_country_standardized() on the ",
                    "input data."))
  data <- data %>%
    dplyr::mutate(country_suggested = NA_character_)
}
if(!any(colnames(data) %in% "countryCode")){
  writeLines(paste0("No 'countryCode' column found, adding an empty (NA) placeholder. This",
                    " column can be added by running bdc::bdc_country_standardized() on the ",
                    "input data."))
  data <- data %>%
    dplyr::mutate(countryCode = NA_character_)
}


  # Remove poor-quality coordinates
dataR <- data %>%
  dplyr::filter(!.coordinates_outOfRange == FALSE) %>%
  dplyr::filter(!.coordinates_empty == FALSE) 


    # Reduce dataset 
  dataR <- dataR %>%
    dplyr::select(
      tidyselect::any_of(c("database_id", "decimalLatitude", "decimalLongitude",
                         "country", "countryCode","country_suggested"))) %>%
      # Remove lat/lon NAs
    dplyr::filter(!is.na(decimalLatitude)) %>% dplyr::filter(!is.na(decimalLongitude))
  
  #### 1.1 Terrestrial map ####
      ##### 1.1 rnaturalearth DL ####
  writeLines(" - Downloading naturalearth map...")
  suppressWarnings({
  # Download the rnaturalearth countries
vectEarth <- rnaturalearth::ne_countries(scale = scale, type = "countries", 
                                    returnclass = "sf" )%>%
  dplyr::select(name_long, iso_a2, geometry, admin, sovereignt, name) %>%
  sf::st_make_valid()
# Simplify the world map ONCE to be used later
simplePoly <- vectEarth %>% sf::st_drop_geometry() %>%
  dplyr::mutate(indexMatch = dplyr::row_number())

  # Repair gemoetries
sf::sf_use_s2(FALSE)
})

  #### 2.0 Extractions ####
    ##### 2.1 Create function 1 ####
intersectFun <- function(sp){
  suppressMessages({
  extracted <- sf::st_intersects(sp, vectEarth, sparse = TRUE) %>% 
    # return a tibble with the index of each match or NA where there was no match
    dplyr::tibble(indexMatch = .) 
    # If first element is full, unlist each one
      extracted <- extracted %>%
      dplyr::mutate(indexMatch = indexMatch %>% as.character() %>% as.numeric() )
    # rejoin
  extracted <- extracted %>%
    dplyr::left_join(simplePoly,
                     by = "indexMatch") %>%
    # Add in the database_id
    dplyr::bind_cols(sp)
  }) # END suppressWarnings
  return(extracted)
  }# END intersectFun function



    ##### 2.2 Country name ####
writeLines(" - Extracting initial country names without buffer...")
suppressWarnings({
      # Turn the points into an sf object
sp <- sf::st_as_sf(dataR, coords = c(lon, lat),
                   crs = sf::st_crs("WGS84"))
  # Extract the country for the points from the vectEarth map
  country_extracted = sp %>%
    # Make a new column with the ordering of rows
    dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
    # Group by the row number and step size
    dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
    # Split the dataset up into a list by group
    dplyr::group_split(.keep = TRUE) %>%
    # Run the actual function
    parallel::mclapply(., intersectFun,
                       mc.cores = mc.cores
    ) %>%
    # Combine the lists of tibbles
    dplyr::bind_rows()
})
  

    ##### 2.3 Failures ####
  # Find those records that don't match.
failed_extract <- country_extracted %>%
    # Find the mis-matched countries
  dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
    # Remove NA countries
  dplyr::filter(!is.na(country)) %>%
  dplyr::tibble() %>%
  sf::st_as_sf(crs = sf::st_crs("WGS84"))
    # Replace some country names as needed and re-remove
failed_extract$country <-
  mgsub::mgsub(failed_extract$country, 
               pattern = c("Martinique", "French guiana", "Federated States of Micronesia"),
               replacement = c("France", "France", "Micronesia"))
  # Remove new matches
failed_extract <- failed_extract %>%
  # Find the mis-matched countries
  dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
  # Remove NA countries
  dplyr::filter(!is.na(country)) %>%
  dplyr::tibble() %>%
  sf::st_as_sf(crs = sf::st_crs("WGS84"))
# remove spent dataset
rm(country_extracted)


    ##### 2.4 Buffer fails ####
writeLines(" - Buffering naturalearth map by pointBuffer...")
  # Buffer the natural earth map
suppressWarnings({
  vectEarth <- vectEarth %>% 
  sf::st_buffer(dist = pointBuffer)
})

writeLines(" - Extracting FAILED country names WITH buffer...")
# Extract the country for the points from the vectEarth map
suppressWarnings({
  failed_extract_2 = failed_extract %>%
    dplyr::select(database_id, country, geometry, country_suggested, countryCode) %>%
    # Make a new column with the ordering of rows
    dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
    # Group by the row number and step size
    dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
    # Split the dataset up into a list by group
    dplyr::group_split(.keep = TRUE) %>%
    # Run the actual function
    parallel::mclapply(., intersectFun,
                       mc.cores = mc.cores
    ) %>%
    # Combine the lists of tibbles
    dplyr::bind_rows()
})
    # Find MATCHES #
  # With country
fExtr_1 <- failed_extract_2 %>% 
  dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
                                         tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name)))
  # With iso_a2
fExtr_2 <- failed_extract_2 %>% 
  dplyr::filter(tolower(iso_a2) %in% c(tolower(country), tolower(countryCode)))

ids2keep <- dplyr::bind_rows(fExtr_1, fExtr_2) %>%
  # Find the mis-matched countries
  dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
                                         tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
                                                   tolower(sovereignt), tolower(name))) %>%
  dplyr::filter(tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
    # Keep only the database id
  dplyr::select(database_id)

  #### 3.0 Final fails ####
# Get the final fails by removing those to keep from the failed list
ids2remove <- failed_extract_2 %>%
  dplyr::filter(!database_id %in% ids2keep$database_id) %>%
  dplyr::select(database_id)


  #### 4.0 Finals ####
    # Create new column
data <- data %>% 
  dplyr::mutate(.coordinates_country_inconsistent = !database_id %in% ids2remove$database_id)

    # return message
message(paste("\njbd_coordinates_country_inconsistent:\nFlagged", 
              format(sum(data$.coordinates_country_inconsistent == FALSE, na.rm = TRUE), big.mark = ","),
              "records.\nThe column, '.coordinates_country_inconsistent',",
              "was added to the database.\n"), sep = "")
 endTime <- Sys.time()
    # Time output
 message(paste(
   " - Completed in ", 
   round(difftime(endTime, startTime), digits = 2 ),
   " ",
   units(round(endTime - startTime, digits = 2)),
   sep = ""))

  # Return the data
return(data)
} # END function