Nothing
#' @title Associate other data sources with AREAdata data
#' @description Intelligently bind together data from AREAdata and other sources at a given spatial scale.
#' @author Francis Windram
#'
#' @param data the source data to bind AREAdata to. This **must** contain decimal lonlat data!
#' @param areadata the AREAdata to bind, usually from [fetch_ad()].
#' @param targetdate **ONE OF** the following:
#' * The date to search for in ISO 8601 (e.g. "2020", "2021-09", or "2022-09-21").
#' * The start date for a range of dates.
#' * A character vector of fully specified dates to search for (i.e. "yyyy-mm-dd").
#' @param enddate The (exclusive) end of the range of dates to search for. If this is unfilled, only the `targetdate` is searched for.
#' @param gid the spatial scale to retrieve (0 = country-level, 1=province-level...). (Note: this will preferentially use the gid level of `areadata` if present.)
#' @param lonlat_names a vector containing the column names of the longitude and latitude columns **IN THAT ORDER**!
#' @param cache_location path to cache location (defaults to a temporary user directory, or one set by [set_default_ohvbd_cache()]).
#' @param basereq the url of the AREAdata database (usually generated by [ad_basereq()]). If `NA`, uses the default.
#'
#' @return A matrix of the `data` with added columns extracted from `areadata`.
#'
#' @section Date ranges:
#' The date range is a partially open interval. That is to say the lower bound (`targetdate`) is inclusive, but the upper bound (`enddate`) is exclusive.
#'
#' For example a date range of "2020-08-04" - "2020-08-12" will return the 7 days from the 4th through to the 11th of August, but *not* the 12th.
#'
#' @section Date inference:
#'
#' In cases where a full date is not provided, the earliest date possible with the available data is chosen.
#'
#' So "2020-04" will internally become "2020-04-01".
#'
#' If an incomplete date is specified as the `targetdate` and no `enddate` is specified, the range to search is inferred from the minimum temporal scale provided in `targetdate`.
#'
#' For example "2020-04" will be taken to mean the month of April in 2020, and the `enddate` will internally be set to "2020-05-01".
#'
#' @examplesIf interactive()
#' vtdf <- search_hub("Aedes aegypti", "vt") |>
#' tail(20) |>
#' fetch() |>
#' glean(cols = c(
#' "DatasetID",
#' "Latitude",
#' "Longitude",
#' "Interactor1Genus",
#' "Interactor1Species"
#' ), returnunique = TRUE)
#' areadata <- fetch_ad(metric="temp", gid=2, use_cache=TRUE)
#' ad_extract_working <- assoc_ad(vtdf, areadata,
#' targetdate = c("2021-08-04"), enddate=c("2021-08-06"),
#' gid=2, lonlat_names = c("Longitude", "Latitude"))
#'
#' @concept areadata
#'
#' @export
#'
assoc_ad <- function(
data,
areadata,
targetdate = NA,
enddate = NA,
gid = 0,
lonlat_names = c("Longitude", "Latitude"),
cache_location = NULL,
basereq = ad_basereq()
) {
# Remember db attr of input data
datatype <- ohvbd_db(data)
# Cast data to a df
data <- as.data.frame(data)
check_provenance(areadata, "ad", altfunc = NULL, objtype = "Argument {.arg areadata}")
# TODO: Rework this to use assoc_gadm under the hood.
if (length(lonlat_names) != 2) {
cli::cli_abort(c(
"x" = "Longitude and Latitude column names must be provided as vector of length 2!",
"i" = "You provided {.val {lonlat_names}} (detected length = {col_red(length(lonlat_names))})."
))
}
data_colnames <- colnames(data)
if (!all(lonlat_names %in% data_colnames)) {
cli::cli_alert_warning("{.arg lonlat_names} {.val {lonlat_names}} not found in {.arg data}!")
cli::cli_alert_info("Attempting to infer from column names...")
lon_options <- c("longitude", "lon", "long", "sample_long_dd", "decimallongitude")
lat_options <- c("latitude", "lat", "sample_lat_dd", "decimallatitude")
lon_matched <- data_colnames[which(tolower(data_colnames) %in% lon_options)]
lat_matched <- data_colnames[which(tolower(data_colnames) %in% lat_options)]
if (length(lon_matched) > 0 && length(lat_matched) > 0) {
lonlat_names <- c(lon_matched[1], lat_matched[1])
cli::cli_alert_success("Found {.val {lonlat_names}}!")
} else {
cli::cli_abort(
c("x" = "Could not infer Longitude/Latitude columns.",
"!" = "Please provide these as a vector of length 2."))
}
}
metric <- attr(areadata, "metric") %||% "metric"
gid <- attr(areadata, "gid") %||% gid
name_label <- c("NAME_0", "GID_1", "GID_2")
final_name <- name_label[gid + 1]
# Find GADM shapefile for searching
cli::cli_progress_message("Fetching gadm shapefile...")
gadm_sf <- fetch_gadm_sfs(gid = gid, cache_location = cache_location)
# Find latlons for quick processing
cli::cli_progress_message("Selecting latlons...")
orig_lonlat <- data |>
select(all_of(lonlat_names)) |>
mutate_all(function(x) as.numeric(as.character(x)))
# Make distinct points into a SpatVect
cli::cli_progress_message("Casting lonlat points to vector...")
points <- terra::vect(orig_lonlat |> distinct(), geom = lonlat_names)
# Locate which shape each point is in and get the appropriate names for aligning with AD
cli::cli_progress_message("Extracting points (this may take a while)...")
gadm_point_ids <- terra::extract(gadm_sf[, final_name], points)
cli::cli_alert_success("Extraction complete.")
# Prepare the names for association
gadm_point_ids[, final_name] <- gsub(" ", "_", gadm_point_ids[, final_name])
# Recreate the original dataset at full length
cli::cli_progress_message("Lengthening point data...")
gadm_point_ids <- left_join(
orig_lonlat,
bind_cols(orig_lonlat |> distinct(), gadm_point_ids),
by = lonlat_names
)
# Get only the unique entries (no NAs)
places <- gadm_point_ids |>
select(any_of(final_name)) |>
na.omit() |>
unique()
# Pull out the data from AD
cli::cli_progress_message("Extracting AD data...")
ad_extracted <- areadata |>
glean_ad(
targetdate = targetdate,
enddate = enddate,
places = places[, final_name],
gid = gid
)
# Make rownames into a column ready for left join with ids
ad_extracted <- rownames_to_column(data.frame(ad_extracted), var = final_name)
# Make extracted data the same length as the original data
cli::cli_progress_done(result = "Done!")
suppressMessages(
final_extract <- left_join(
select(gadm_point_ids, any_of(final_name)),
ad_extracted
)
)
# Get rid of joining column
final_extract <- final_extract |> select(-one_of(final_name))
# Put together the output
outdata <- bind_cols(data, final_extract)
# Rename the AD variables based upon their metric
outdata <- outdata |>
rename_with(~ gsub("X", paste0(metric, "_"), .), starts_with("X"))
# When only one metric col is found, just name it as the metric
if (ncol(final_extract) == 1) {
colnames(outdata) <- c(colnames(outdata)[1:length(outdata) - 1], metric) # nolint: seq_linter
}
# TODO: Possibly worth filtering returned columns based on the unique dates provided in a date column! This significantly reduced postprocessing effort.
if (!is.null(datatype)) {
outdata <- new_ohvbd.data.frame(outdata, datatype)
}
return(outdata)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.