Nothing
# This function was written by James B Dorey starting from the 3rd-7th of June 2022.
# It is intended to replace the bdc function bdc_coordinates_country_inconsistent for larger datasets
# where it is simply not feasible.
# Initial function finished: _____
# For questions, please ask James at jbdorey[at]me.com
#' Flags coordinates that are inconsistent with the stated country name
#'
#' Compares stated country name in an occurrence record with record's coordinates using
#' rnaturalearth data. The prefix, jbd_ is meant
#' to distinguish this function from the original [bdc::bdc_coordinates_country_inconsistent()].
#' This functions will preferably use the countryCode and country_suggested columns
#' generated by `bdc::bdc_country_standardized()`; please run it on your dataset prior to running
#' this function.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param lon Character. The name of the column to use as longitude. Default = "decimalLongitude".
#' @param lat Character. The name of the column to use as latitude. Default = "decimalLatitude".
#' @param scale Numeric or character. To be passed to [rnaturalearth::ne_countries()]'s scale.
#' Scale of map to return, one of 110, 50, 10 or "small", "medium", "large".
#' Smaller values return higher-resolution maps.
#' @param pointBuffer Numeric. Amount to buffer points, in decimal degrees. If the point is outside
#' of a country, but within this point buffer, it will not be flagged. Default = 0.01.
#' @param mc.cores Numeric. If > 1, the st_intersects function will run in parallel
#' using mclapply using the number of cores specified. If = 1 then it will be run using a serial
#' loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#' Default = 1.
#' @param stepSize Numeric. The number of occurrences to process in each chunk. Default = 1000000.
#'
#' @return The input occurrence data with a new column, .coordinates_country_inconsistent
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' if(requireNamespace("rnaturalearthdata")){
#' beesRaw_out <- jbd_coordCountryInconsistent(
#' data = BeeBDC::beesRaw,
#' lon = "decimalLongitude",
#' lat = "decimalLatitude",
#' scale = 50,
#' pointBuffer = 0.01)
#' } # END if require
jbd_coordCountryInconsistent <- function(
data = NULL,
lon = "decimalLongitude",
lat = "decimalLatitude",
scale = 50,
pointBuffer = 0.01,
stepSize = 1000000,
mc.cores = 1){
database_id <- decimalLatitude <- decimalLongitude <- country <- name_long <- iso_a2 <-
geometry <- admin <- sovereignt <- name <- . <- NULL
.coordinates_outOfRange <- .coordinates_empty <- indexMatch <- BeeBDC_order <- NULL
countryCode <- country_suggested <- NULL
.coordinates_empt <- .data <- .coordinates_empty <- .coordinates_outOfRange <- isna_ <- NULL
startTime <- Sys.time()
requireNamespace("rnaturalearth")
requireNamespace("dplyr")
requireNamespace("ggspatial")
requireNamespace("mgsub")
requireNamespace("terra")
#### 0.0 Prep ####
###### 0.1 fatal errors ####
if(!any(colnames(data) %in% "country")){
stop("There is no column called 'country' in the dataset. This is a minimum requirement.")
}
###### 0.2 Coord quality ####
if(!any(colnames(data) %in% ".coordinates_outOfRange")){
writeLines("No '.coordinates_outOfRange' column found, running bdc_coordinates_outOfRange...")
bdc_coordinates_outOfRange_internal <- function(data, lat = "decimalLatitude", lon = "decimalLongitude") {
.data <- .coordinates_outOfRange <- NULL
data_filtered <- data %>%
dplyr::select(dplyr::all_of(c(lon, lat))) %>%
dplyr::rename(lon = dplyr::all_of(lon), lat = dplyr::all_of(lat)) %>%
dplyr::mutate(dplyr::across(dplyr::everything(), ~ as.numeric(.x)))
data_flag <- data_filtered %>% dplyr::mutate(.coordinates_outOfRange = dplyr::case_when(
lat < -90 | lat > 90 ~ FALSE,
lon < -180 | lon > 180 ~ FALSE,TRUE ~ TRUE)) %>%
dplyr::select(.coordinates_outOfRange)
df <- dplyr::bind_cols(data, data_flag)
message(paste("\nbdc_coordinates_outOfRange:\nFlagged",
sum(df$.coordinates_outOfRange == FALSE),
"records.\nOne column was added to the database.\n"))
return(df)
}
data <- bdc_coordinates_outOfRange_internal(
data = data,
lat = lat,
lon = lon)
}
###### 0.3 columns present ####
if(!any(colnames(data) %in% ".coordinates_empty")){
writeLines("No '.coordinates_empty' column found, running bdc_coordinates_empty")
check_col_internal <- function(data, col) {
for (i in seq_along(col)) {
if (!col[i] %in% colnames(data)) {
stop("Column `", col[i], "` is not present in the data", call. = FALSE)}}}
bdc_coordinates_empty_internal <- function(data, lat = "decimalLatitude", lon = "decimalLongitude") {
if (!is.data.frame(data)) {
stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE)}
check_col_internal(data, c(lat, lon))
df <- data %>%
dplyr::select(dplyr::all_of(c(lat, lon))) %>%
dplyr::mutate(dplyr::across(dplyr::all_of(c(lat, lon)), ~ as.numeric(.x))) %>%
dplyr::mutate(isna_ = dplyr::if_any(dplyr::all_of(c(lat, lon)), ~ is.na(.x)),
.coordinates_empty = ifelse(isna_, FALSE, TRUE)) %>%
dplyr::select(.coordinates_empty)
df <- dplyr::bind_cols(data, df)
message(paste("\nbdc_coordinates_empty:\nFlagged",
sum(df$.coordinates_empty == FALSE),
"records.\nOne column was added to the database.\n"))
return(df)
}
data <- bdc_coordinates_empty_internal(
data = data,
lat = lat,
lon = lon)
}
if(!any(colnames(data) %in% "country_suggested")){
writeLines(paste0("No 'country_suggested' column found, adding an empty (NA) placeholder. This",
" column can be added by running bdc::bdc_country_standardized() on the ",
"input data."))
data <- data %>%
dplyr::mutate(country_suggested = NA_character_)
}
if(!any(colnames(data) %in% "countryCode")){
writeLines(paste0("No 'countryCode' column found, adding an empty (NA) placeholder. This",
" column can be added by running bdc::bdc_country_standardized() on the ",
"input data."))
data <- data %>%
dplyr::mutate(countryCode = NA_character_)
}
# Remove poor-quality coordinates
dataR <- data %>%
dplyr::filter(!.coordinates_outOfRange == FALSE) %>%
dplyr::filter(!.coordinates_empty == FALSE)
# Reduce dataset
dataR <- dataR %>%
dplyr::select(
tidyselect::any_of(c("database_id", "decimalLatitude", "decimalLongitude",
"country", "countryCode","country_suggested"))) %>%
# Remove lat/lon NAs
dplyr::filter(!is.na(decimalLatitude)) %>% dplyr::filter(!is.na(decimalLongitude))
#### 1.1 Terrestrial map ####
##### 1.1 rnaturalearth DL ####
writeLines(" - Downloading naturalearth map...")
suppressWarnings({
# Download the rnaturalearth countries
vectEarth <- rnaturalearth::ne_countries(scale = scale, type = "countries",
returnclass = "sf" )%>%
dplyr::select(name_long, iso_a2, geometry, admin, sovereignt, name) %>%
sf::st_make_valid()
# Simplify the world map ONCE to be used later
simplePoly <- vectEarth %>% sf::st_drop_geometry() %>%
dplyr::mutate(indexMatch = dplyr::row_number())
# Repair gemoetries
sf::sf_use_s2(FALSE)
})
#### 2.0 Extractions ####
##### 2.1 Create function 1 ####
intersectFun <- function(sp){
suppressMessages({
extracted <- sf::st_intersects(sp, vectEarth, sparse = TRUE) %>%
# return a tibble with the index of each match or NA where there was no match
dplyr::tibble(indexMatch = .)
# If first element is full, unlist each one
extracted <- extracted %>%
dplyr::mutate(indexMatch = indexMatch %>% as.character() %>% as.numeric() )
# rejoin
extracted <- extracted %>%
dplyr::left_join(simplePoly,
by = "indexMatch") %>%
# Add in the database_id
dplyr::bind_cols(sp)
}) # END suppressWarnings
return(extracted)
}# END intersectFun function
##### 2.2 Country name ####
writeLines(" - Extracting initial country names without buffer...")
suppressWarnings({
# Turn the points into an sf object
sp <- sf::st_as_sf(dataR, coords = c(lon, lat),
crs = sf::st_crs("WGS84"))
# Extract the country for the points from the vectEarth map
country_extracted = sp %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., intersectFun,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
})
##### 2.3 Failures ####
# Find those records that don't match.
failed_extract <- country_extracted %>%
# Find the mis-matched countries
dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
# Remove NA countries
dplyr::filter(!is.na(country)) %>%
dplyr::tibble() %>%
sf::st_as_sf(crs = sf::st_crs("WGS84"))
# Replace some country names as needed and re-remove
failed_extract$country <-
mgsub::mgsub(failed_extract$country,
pattern = c("Martinique", "French guiana", "Federated States of Micronesia"),
replacement = c("France", "France", "Micronesia"))
# Remove new matches
failed_extract <- failed_extract %>%
# Find the mis-matched countries
dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
# Remove NA countries
dplyr::filter(!is.na(country)) %>%
dplyr::tibble() %>%
sf::st_as_sf(crs = sf::st_crs("WGS84"))
# remove spent dataset
rm(country_extracted)
##### 2.4 Buffer fails ####
writeLines(" - Buffering naturalearth map by pointBuffer...")
# Buffer the natural earth map
suppressWarnings({
vectEarth <- vectEarth %>%
sf::st_buffer(dist = pointBuffer)
})
writeLines(" - Extracting FAILED country names WITH buffer...")
# Extract the country for the points from the vectEarth map
suppressWarnings({
failed_extract_2 = failed_extract %>%
dplyr::select(database_id, country, geometry, country_suggested, countryCode) %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., intersectFun,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
})
# Find MATCHES #
# With country
fExtr_1 <- failed_extract_2 %>%
dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name)))
# With iso_a2
fExtr_2 <- failed_extract_2 %>%
dplyr::filter(tolower(iso_a2) %in% c(tolower(country), tolower(countryCode)))
ids2keep <- dplyr::bind_rows(fExtr_1, fExtr_2) %>%
# Find the mis-matched countries
dplyr::filter(!tolower(country) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(!tolower(country_suggested) %in% c(tolower(name_long), tolower(admin),
tolower(sovereignt), tolower(name))) %>%
dplyr::filter(tolower(iso_a2) %in% c(tolower(country), tolower(countryCode))) %>%
# Keep only the database id
dplyr::select(database_id)
#### 3.0 Final fails ####
# Get the final fails by removing those to keep from the failed list
ids2remove <- failed_extract_2 %>%
dplyr::filter(!database_id %in% ids2keep$database_id) %>%
dplyr::select(database_id)
#### 4.0 Finals ####
# Create new column
data <- data %>%
dplyr::mutate(.coordinates_country_inconsistent = !database_id %in% ids2remove$database_id)
# return message
message(paste("\njbd_coordinates_country_inconsistent:\nFlagged",
format(sum(data$.coordinates_country_inconsistent == FALSE, na.rm = TRUE), big.mark = ","),
"records.\nThe column, '.coordinates_country_inconsistent',",
"was added to the database.\n"), sep = "")
endTime <- Sys.time()
# Time output
message(paste(
" - Completed in ",
round(difftime(endTime, startTime), digits = 2 ),
" ",
units(round(endTime - startTime, digits = 2)),
sep = ""))
# Return the data
return(data)
} # END function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.