R/assoc_ad.R

Defines functions assoc_ad

Documented in assoc_ad

#' @title Associate other data sources with AREAdata data
#' @description Intelligently bind together data from AREAdata and other sources at a given spatial scale.
#' @author Francis Windram
#'
#' @param data the source data to bind AREAdata to. This **must** contain decimal lonlat data!
#' @param areadata the AREAdata to bind, usually from [fetch_ad()].
#' @param targetdate **ONE OF** the following:
#' * The date to search for in ISO 8601 (e.g. "2020", "2021-09", or "2022-09-21").
#' * The start date for a range of dates.
#' * A character vector of fully specified dates to search for (i.e. "yyyy-mm-dd").
#' @param enddate The (exclusive) end of the range of dates to search for. If this is unfilled, only the `targetdate` is searched for.
#' @param gid the spatial scale to retrieve (0 = country-level, 1=province-level...). (Note: this will preferentially use the gid level of `areadata` if present.)
#' @param lonlat_names a vector containing the column names of the longitude and latitude columns **IN THAT ORDER**!
#' @param cache_location path to cache location (defaults to a temporary user directory, or one set by [set_default_ohvbd_cache()]).
#' @param basereq the url of the AREAdata database (usually generated by [ad_basereq()]). If `NA`, uses the default.
#'
#' @return A matrix of the `data` with added columns extracted from `areadata`.
#'
#' @section Date ranges:
#' The date range is a partially open interval. That is to say the lower bound (`targetdate`) is inclusive, but the upper bound (`enddate`) is exclusive.
#'
#' For example, a date range of "2020-08-04" - "2020-08-12" will return the 8 days from the 4th through to the 11th of August, but *not* the 12th.
#'
#' @section Date inference:
#'
#' In cases where a full date is not provided, the earliest date possible with the available data is chosen.
#'
#' So "2020-04" will internally become "2020-04-01".
#'
#' If an incomplete date is specified as the `targetdate` and no `enddate` is specified, the range to search is inferred from the minimum temporal scale provided in `targetdate`.
#'
#' For example "2020-04" will be taken to mean the month of April in 2020, and the `enddate` will internally be set to "2020-05-01".
#'
#' @examplesIf interactive()
#' vtdf <- search_hub("Aedes aegypti", "vt") |>
#'   tail(20) |>
#'   fetch() |>
#'   glean(cols = c(
#'     "DatasetID",
#'     "Latitude",
#'     "Longitude",
#'     "Interactor1Genus",
#'     "Interactor1Species"
#'     ), returnunique = TRUE)
#' areadata <- fetch_ad(metric="temp", gid=2, use_cache=TRUE)
#' ad_extract_working <- assoc_ad(vtdf, areadata,
#'                                     targetdate = c("2021-08-04"), enddate=c("2021-08-06"),
#'                                     gid=2, lonlat_names = c("Longitude", "Latitude"))
#'
#' @concept areadata
#'
#' @export
#'

assoc_ad <- function(
  data,
  areadata,
  targetdate = NA,
  enddate = NA,
  gid = 0,
  lonlat_names = c("Longitude", "Latitude"),
  cache_location = NULL,
  basereq = ad_basereq()
) {
  # Remember the db attribute of the input data so it can be restored on the
  # output object at the end.
  datatype <- ohvbd_db(data)

  # Cast data to a plain data.frame for predictable column indexing
  data <- as.data.frame(data)

  # Abort early if areadata did not come from fetch_ad()
  check_provenance(areadata, "ad", altfunc = NULL, objtype = "Argument {.arg areadata}")

  # NOTE(review): `basereq` is accepted and documented but never used in this
  # body — presumably reserved for future direct fetching; confirm before
  # removing from the signature.

  # TODO: Rework this to use assoc_gadm under the hood.

  if (length(lonlat_names) != 2) {
    cli::cli_abort(c(
        "x" = "Longitude and Latitude column names must be provided as vector of length 2!",
        "i" = "You provided {.val {lonlat_names}} (detected length = {col_red(length(lonlat_names))})."
      ))
  }

  data_colnames <- colnames(data)

  # If the stated lon/lat columns are absent, attempt a case-insensitive match
  # against a set of common column-name variants before giving up.
  if (!all(lonlat_names %in% data_colnames)) {
    cli::cli_alert_warning("{.arg lonlat_names} {.val {lonlat_names}} not found in {.arg data}!")
    cli::cli_alert_info("Attempting to infer from column names...")
    lon_options <- c("longitude", "lon", "long", "sample_long_dd", "decimallongitude")
    lat_options <- c("latitude", "lat", "sample_lat_dd", "decimallatitude")
    lon_matched <- data_colnames[which(tolower(data_colnames) %in% lon_options)]
    lat_matched <- data_colnames[which(tolower(data_colnames) %in% lat_options)]
    if (length(lon_matched) > 0 && length(lat_matched) > 0) {
      # Take the first match of each in original column order
      lonlat_names <- c(lon_matched[1], lat_matched[1])
      cli::cli_alert_success("Found {.val {lonlat_names}}!")
    } else {
      cli::cli_abort(
        c("x" = "Could not infer Longitude/Latitude columns.",
          "!" = "Please provide these as a vector of length 2."))
    }
  }

  # Metric name used later to label output columns; fall back to "metric"
  metric <- attr(areadata, "metric") %||% "metric"

  # Prefer the gid level carried by the AREAdata object over the argument
  gid <- attr(areadata, "gid") %||% gid

  # GADM identifier column for each supported spatial scale (gid 0, 1, 2)
  name_label <- c("NAME_0", "GID_1", "GID_2")
  final_name <- name_label[gid + 1]
  # Find GADM shapefile for searching
  cli::cli_progress_message("Fetching gadm shapefile...")
  gadm_sf <- fetch_gadm_sfs(gid = gid, cache_location = cache_location)

  # Pull out just the coordinate columns, coercing to numeric
  # (via character, so factor columns convert by label rather than level code)
  cli::cli_progress_message("Selecting latlons...")
  orig_lonlat <- data |>
    select(all_of(lonlat_names)) |>
    mutate_all(function(x) as.numeric(as.character(x)))

  # Make distinct points into a SpatVector so each unique location is
  # extracted only once.
  cli::cli_progress_message("Casting lonlat points to vector...")
  points <- terra::vect(orig_lonlat |> distinct(), geom = lonlat_names)

  # Locate which shape each point is in and get the appropriate names for
  # aligning with AREAdata.
  cli::cli_progress_message("Extracting points (this may take a while)...")
  gadm_point_ids <- terra::extract(gadm_sf[, final_name], points)

  cli::cli_alert_success("Extraction complete.")

  # AREAdata place names use underscores where GADM uses spaces
  gadm_point_ids[, final_name] <- gsub(" ", "_", gadm_point_ids[, final_name])

  # Re-expand the distinct-point results back to the original row order/length
  cli::cli_progress_message("Lengthening point data...")
  gadm_point_ids <- left_join(
    orig_lonlat,
    bind_cols(orig_lonlat |> distinct(), gadm_point_ids),
    by = lonlat_names
  )

  # Unique place identifiers actually needed from AREAdata (no NAs)
  places <- gadm_point_ids |>
    select(any_of(final_name)) |>
    na.omit() |>
    unique()

  # Pull the requested dates/places out of the AREAdata matrix
  cli::cli_progress_message("Extracting AD data...")
  ad_extracted <- areadata |>
    glean_ad(
      targetdate = targetdate,
      enddate = enddate,
      places = places[, final_name],
      gid = gid
    )

  # Make rownames (place ids) into a column ready for left join with ids
  ad_extracted <- rownames_to_column(data.frame(ad_extracted), var = final_name)

  # Make extracted data the same length as the original data
  cli::cli_progress_done(result = "Done!")
  suppressMessages(
    final_extract <- left_join(
      select(gadm_point_ids, any_of(final_name)),
      ad_extracted
    )
  )

  # Get rid of joining column
  final_extract <- final_extract |> select(-one_of(final_name))

  # Bind the extracted AD columns onto the caller's original data
  outdata <- bind_cols(data, final_extract)

  # data.frame() prefixes syntactically-invalid date names with "X"
  # (e.g. "X2021.08.04"); strip only that leading "X" (anchored) rather than
  # replacing every "X" in the name, and substitute the metric prefix.
  outdata <- outdata |>
    rename_with(~ sub("^X", paste0(metric, "_"), .x), starts_with("X"))
  # When only one metric column was extracted, name it as the bare metric
  if (ncol(final_extract) == 1) {
    colnames(outdata)[ncol(outdata)] <- metric
  }

  # TODO: Possibly worth filtering returned columns based on the unique dates provided in a date column! This significantly reduced postprocessing effort.
  # Restore the db attribute remembered at the top, if there was one
  if (!is.null(datatype)) {
    outdata <- new_ohvbd.data.frame(outdata, datatype)
  }

  return(outdata)
}

Try the ohvbd package in your browser

Any scripts or data that you put into this service are public.

ohvbd documentation built on March 10, 2026, 1:07 a.m.