# This function was written by James Dorey to chunk the bdc_country_from_coordinates function
# to allow bigger datasets to be analysed without consuming too much RAM.
# This function was written on the 12th of May 2022. For questions, please email jbdorey[at]me.com
#' Get country names from coordinates
#'
#' Because the [bdc::bdc_country_from_coordinates()] function is very RAM-intensive, this wrapper
#' allows a user to specify chunk-sizes and only analyse a small portion of the occurrence data at a
#' time. The prefix jbd_ is used to highlight the difference between this function and the original
#' [bdc::bdc_country_from_coordinates()].
#'
#' @param data A data frame or tibble. Occurrence records to use as input.
#' @param lat Character. The name of the column to use as latitude. Default = "decimalLatitude".
#' @param lon Character. The name of the column to use as longitude. Default = "decimalLongitude".
#' @param country Character. The name of the column containing country names. Default = "country".
#' @param stepSize Numeric. The number of occurrences to process in each chunk. Default = 1000000.
#' @param chunkStart Numeric. The chunk number to start from. This can be > 1 when you need to
#' restart the function from a certain chunk. For example, can be used if R failed unexpectedly.
#' @param path Character. The directory path to a folder in which to save the running countrylist
#' csv file.
#' @param scale Passed to rnaturalearth's ne_countries().
#' Scale of map to return, one of 110, 50, 10, or 'small', 'medium', 'large'. Default = "medium".
#' @param mc.cores Numeric. If > 1, the function will run in parallel
#' using parallel::mclapply() with the number of cores specified. If = 1, it will be run using a
#' serial loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#' Default = 1.
#'
#' @return A data frame with database_id and country columns
#' that must be re-merged with the input data.
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' if(requireNamespace("rnaturalearthdata")){
#' library("dplyr")
#' data(beesFlagged)
#' HomePath = tempdir()
#' # Tibble of common issues in country names and their replacements
#' commonProblems <- dplyr::tibble(problem = c('U.S.A.', 'US','USA','usa','UNITED STATES',
#' 'United States','U.S.A','MX','CA','Bras.','Braz.','Brasil','CNMI','USA TERRITORY: PUERTO RICO'),
#' fix = c('United States of America','United States of America',
#' 'United States of America','United States of America',
#' 'United States of America','United States of America',
#' 'United States of America','Mexico','Canada','Brazil','Brazil',
#' 'Brazil','Northern Mariana Islands','Puerto Rico'))
#'
#' beesFlagged <- beesFlagged %>%
#' # Replace a name to test
#' dplyr::mutate(country = stringr::str_replace_all(country, "Brazil", "Brasil"))
#'
#' beesFlagged_out <- countryNameCleanR(
#' data = beesFlagged,
#' commonProblems = commonProblems)
#'
#' suppressWarnings(
#' countryOutput <- jbd_CfC_chunker(data = beesFlagged_out,
#' lat = "decimalLatitude",
#' lon = "decimalLongitude",
#' country = "country",
#' # How many rows to process at a time
#' stepSize = 1000000,
#' # Start row
#' chunkStart = 1,
#' path = HomePath,
#' scale = "medium"),
#' classes = "warning")
#'
#'
#' # Left join these datasets
#' beesFlagged_out <- left_join(beesFlagged_out, countryOutput, by = "database_id") %>%
#' # merge the two country name columns into the "country" column
#' dplyr::mutate(country = dplyr::coalesce(country.x, country.y)) %>%
#' # remove the now redundant country columns
#' dplyr::select(!c(country.x, country.y)) %>%
#' # put the column back
#' dplyr::relocate(country) %>%
#' # Remove duplicates if they arose!
#' dplyr::distinct()
#'
#' # Remove illegal characters
#' beesFlagged_out$country <- beesFlagged_out$country %>%
#' stringr::str_replace(., pattern = paste("\\[", "\\]", "\\?",
#' sep= "|"), replacement = "")
#' } # END if require
jbd_CfC_chunker <- function(data = NULL,
                            lat = "decimalLatitude",
                            lon = "decimalLongitude",
                            country = "country",
                            # How many rows to process at a time
                            stepSize = 1000000,
                            # Start row
                            chunkStart = 1,
                            scale = "medium",
                            path = tempdir(),
                            mc.cores = 1){
BeeBDC_order <- . <- .data <- id_temp <- name_long <- geometry <- inData <- country_OG <- NULL
#### 0.0 Prep ####
startTime <- Sys.time()
##### 0.1 nChunks ####
# Find the number of chunks needed to complete the run
nChunks = ceiling(nrow(data)/stepSize)
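# For example, 2,500,000 input rows with stepSize = 1,000,000 gives ceiling(2.5) = 3 chunks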
# Find the max nrow
nrowMax <- nrow(data)
# The chunkEnd is the same as the stepSize initially, but the chunkEnd will change with each
# iteration
##### 0.3 chunkEnd ####
chunkEnd = (chunkStart + stepSize) - 1
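# For example, chunkStart = 1 with stepSize = 1,000,000 gives chunkEnd = 1,000,000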
##### 0.4 Text out ####
# Write user output
message(paste(" - Running chunker with:", "\n",
"stepSize = ",
format(stepSize, big.mark=",",scientific=FALSE), "\n",
"chunkStart = ",
format(chunkStart, big.mark=",",scientific=FALSE), "\n",
"chunkEnd = ",
format(chunkEnd, big.mark=",",scientific=FALSE),
sep = ""))
#### 1.0 Parallel ####
##### 1.1 Input function for parallel ####
funCoordCountry <-
function(inData) {
suppressWarnings({
check_require_cran("rnaturalearth")
# check_require_github("ropensci/rnaturalearthdata")
})
# create an id_temp
inData$id_temp <- 1:nrow(inData)
minimum_colnames <- c(lat, lon)
if(!all(minimum_colnames %in% colnames(inData))) {
stop(
"These columns names were not found in your database: ",
paste(minimum_colnames[!minimum_colnames %in% colnames(inData)],
collapse = ", "),
call. = FALSE
)}
# check if inData has a country column
has_country <- any(colnames(inData) == country)
if(!has_country) {
inData$country <- NA}
# converts coordinates columns to numeric
inData <- inData %>%
dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]),
decimalLongitude = as.numeric(.data[[lon]]))
worldmap <- rnaturalearth::ne_countries(scale = scale, returnclass = "sf") %>%
sf::st_make_valid()
data_no_country <- inData %>%
dplyr::filter(is.na(country) | country == "")
if(nrow(data_no_country) == 0) {
inData <- inData %>% dplyr::select(-id_temp)
}else{
# converts coordinates columns to spatial points
suppressWarnings({
data_no_country <-
CoordinateCleaner::cc_val(
x = data_no_country,
lon = lon,
lat = lat,
verbose = FALSE
) %>%
sf::st_as_sf(
.,
coords = c("decimalLongitude", "decimalLatitude"),
remove = FALSE
) %>%
sf::st_set_crs(., sf::st_crs(worldmap))
})
worldmap <-
sf::st_as_sf(worldmap) %>% dplyr::select(name_long)
# Extract country names from coordinates
suppressWarnings({
suppressMessages({
ext_country <-
data_no_country %>%
dplyr::select(id_temp, geometry) %>%
sf::st_intersection(., worldmap)
})
})
ext_country$geometry <- NULL
res <- dplyr::left_join(data_no_country, ext_country, by = "id_temp") %>%
dplyr::distinct(id_temp, .keep_all = TRUE)
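# Use the temporary row ids to write the matched polygon names (name_long) back into
# the country column of the input chunk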
id_replace <- res$id_temp
inData[id_replace, "country"] <- res$name_long
inData <- inData %>% dplyr::select(-id_temp)
}
return(dplyr::as_tibble(inData))
}
##### 1.2 Run mclapply ####
# User output
writeLines(paste(" - Starting parallel operation. Unlike the serial operation (mc.cores = 1)",
", a parallel operation will not provide running feedback. Please be patient",
" as this function may take some time to complete. Each chunk will be run on",
" a seperate thread so also be aware of RAM usage."))
loop_check_pf = data %>%
# Make a new column with the ordering of rows
dplyr::mutate(BeeBDC_order = dplyr::row_number()) %>%
# Group by the row number and step size
dplyr::group_by(BeeBDC_group = ceiling(BeeBDC_order/stepSize)) %>%
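# (e.g., with stepSize = 1,000,000, rows 1-1,000,000 form group 1,
# rows 1,000,001-2,000,000 form group 2, and so on)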
# Split the dataset up into a list by group
dplyr::group_split(.keep = TRUE) %>%
# Run the actual function
parallel::mclapply(., funCoordCountry,
mc.cores = mc.cores
) %>%
# Combine the lists of tibbles
dplyr::bind_rows()
CountryList = dplyr::tibble(database_id = loop_check_pf$database_id,
country = loop_check_pf$country,
BeeBDC_order = loop_check_pf$BeeBDC_order) %>%
# Arrange these
dplyr::arrange(BeeBDC_order) %>%
# Remove extra columns
dplyr::select(!tidyselect::any_of("BeeBDC_order"))
#### 2.0 Return ####
colnames(CountryList) <- c("database_id", "country")
endTime <- Sys.time()
message(paste(
" - Completed in ",
round(difftime(endTime, startTime), digits = 2 ),
" ",
units(round(endTime - startTime, digits = 2)),
sep = ""))
# Get a subset of the input data
data <- data %>%
dplyr::select(tidyselect::any_of(c("database_id", "country")))
# Clean a little
CountryList <- CountryList %>%
# Drop na rows
tidyr::drop_na(country)
# Get a summary of the output
summaryTable <- CountryList %>%
dplyr::left_join(data, by = "database_id",
suffix = c("", "_OG")) %>%
# Flag changed == 1 for records that previously had no country name (country_OG is NA)
dplyr::mutate(changed = dplyr::if_else(is.na(country_OG), 1, 0))
writeLines(paste0(" - We have updated the country names of ",
format(sum(summaryTable$changed), big.mark = ","),
" occurrences that previously had no country name assigned."))
return(CountryList)
} # END function
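
# A minimal parallel-usage sketch (not run); `occurrences` is a hypothetical tibble with
# database_id, decimalLatitude, decimalLongitude, and country columns. On Linux or macOS,
# mc.cores may be set above 1; Windows machines must keep mc.cores = 1.
# countryOutput <- jbd_CfC_chunker(data = occurrences,
#                                  stepSize = 500000,
#                                  mc.cores = 2,
#                                  path = tempdir())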