#' @title Associate other data sources with gadm IDs
#' @description Intelligently bind together data with gadm IDs at all scales.
#' @author Francis Windram
#'
#' @param df the source data to bind gadm IDs to. This **must** contain decimal lonlat data!
#' @param lonlat_names a vector containing the column names of the longitude and latitude columns **IN THAT ORDER**!
#' @param cache_location path to cache location (defaults to a temporary user directory, or one set by [set_default_ohvbd_cache()]).
#' @param basereq the url of the AREAdata database (usually generated by [ad_basereq()]). If `NA`, uses the default.
#'
#' @return A data frame of `df` with added gadm name/ID columns.
#'
#' @section Caching:
#' This will **ALWAYS** get and cache gid level 2 data sources. These files are about 80MB total, so if you are running on a metered connection do beware of this.
#'
#'
#' @examplesIf interactive()
#' vtdf <- search_hub("Aedes aegypti", "vt") |>
#' tail(20) |>
#' fetch() |>
#' glean(cols = c(
#' "DatasetID",
#' "Latitude",
#' "Longitude",
#' "Interactor1Genus",
#' "Interactor1Species"
#' ), returnunique = TRUE) |>
#' assoc_gadm(lonlat_names = c("Longitude", "Latitude"))
#'
#'
#' @export
#'
assoc_gadm <- function(
  df,
  lonlat_names = c("Longitude", "Latitude"),
  cache_location = NULL,
  basereq = ad_basereq()
) {
  # NOTE(review): basereq is currently unused in this function body — confirm
  # whether it should be forwarded to fetch_gadm_sfs() or deprecated.

  # Remember db attr of input data so it can be restored on the output
  datatype <- ohvbd_db(df)
  # Always load gid level 2 as it's quickest AND a complete superset of GID1 and 0
  gid <- 2
  # gbif-classed objects wrap the actual records in a $data element
  if (inherits(df, "gbif")) {
    df <- df$data
  }
  # Use LonLats to find an appropriate gadm id
  if (length(lonlat_names) != 2) {
    cli::cli_abort(c(
      "x" = "Longitude and Latitude column names must be provided as vector of length 2!",
      "i" = "You provided {.val {lonlat_names}} (detected length = {col_red(length(lonlat_names))})."
    ))
  }
  # Fail early with a clear message if the requested columns are absent
  missing_cols <- setdiff(lonlat_names, colnames(df))
  if (length(missing_cols) > 0) {
    cli::cli_abort(c(
      "x" = "Column{?s} {.val {missing_cols}} not found in {.arg df}!",
      "i" = "Check the {.arg lonlat_names} argument."
    ))
  }
  # Find GADM shapefile for searching
  gadm_sf <- fetch_gadm_sfs(gid = gid, cache_location = cache_location)
  cli::cli_progress_message("{cli::symbol$pointer} Finding distinct lonlats...")
  # Coerce lonlat columns to numeric (via character, in case they are factors)
  orig_lonlat <- df |>
    select(all_of(lonlat_names)) |>
    mutate(across(everything(), \(x) as.numeric(as.character(x))))
  # Make distinct points into a SpatVect (dedup keeps the spatial query small)
  points <- terra::vect(orig_lonlat |> distinct(), geom = lonlat_names)
  cli::cli_alert_success("Created search vector.")
  startproc <- lubridate::now()
  cli::cli_progress_message(
    "{cli::symbol$pointer} Finding gadm names for {.val {length(points)}} latlon{?s}..."
  )
  # Locate which shape each point is in and get the appropriate names for aligning with AD
  gadm_point_ids <- terra::extract(gadm_sf, points)
  endproc <- lubridate::now()
  totaltime <- lubridate::as.duration(lubridate::interval(startproc, endproc)) # nolint: object_usage_linter
  cli::cli_alert_success("Found gadm names in {.val {totaltime}}.")
  # Deduplicate ids if necessary.
  # This probably happens when polys overlap in the shapefile
  duplicated_ids <- duplicated(gadm_point_ids$id.y)
  if (any(duplicated_ids)) {
    cli::cli_alert_warning(
      "Multiple gadm id returns @ data ind{?ex/ices}: {.val {as.character(gadm_point_ids$id.y[which(duplicated_ids)])}}."
    )
    cli::cli_alert_info(
      "Removing duplicates {col_yellow('(check the results are what you want!)')}"
    )
    # Just take the first response. It is likely that both are correct.
    ids_to_remove <- which(duplicated_ids)
    gadm_point_ids <- gadm_point_ids[-ids_to_remove, ]
  }
  # Keep only the gadm name/ID columns for the merge
  gadm_point_ids <- gadm_point_ids |> select(starts_with(c("NAME", "GID")))
  cli::cli_progress_message("{cli::symbol$pointer} Merging with original dataset...")
  # Recreate the original dataset at full length: join the deduplicated
  # point -> gadm lookup back onto every original row by its lonlat pair
  gadm_point_ids <- left_join(
    orig_lonlat,
    bind_cols(orig_lonlat |> distinct(), gadm_point_ids),
    by = lonlat_names
  )
  gadm_point_ids <- gadm_point_ids |> select(starts_with(c("NAME", "GID")))
  outdata <- bind_cols(df, gadm_point_ids)
  cli::cli_alert_success("Merge complete.")
  cli::cli_progress_done()
  # Restore the db attribute recorded from the input
  ohvbd_db(outdata) <- datatype
  return(outdata)
}