R/assoc_gadm.R
In ohvbd: One Health VBD Hub

Documented in assoc_gadm

#' @title Associate other data sources with gadm IDs
#' @description Intelligently bind together data with gadm IDs at all scales.
#' @author Francis Windram
#'
#' @param df the source data to bind gadm IDs to. This **must** contain decimal lonlat data!
#' @param lonlat_names a vector containing the column names of the longitude and latitude columns **IN THAT ORDER**!
#' @param cache_location path to cache location (defaults to a temporary user directory, or one set by [set_default_ohvbd_cache()]).
#' @param basereq the url of the AREAdata database (usually generated by [ad_basereq()]). If `NA`, uses the default.
#'
#' @return A matrix of the `data` with added gadm columns.
#'
#' @section Caching:
#' This will **ALWAYS** get and cache gid level 2 data sources. These files are about 80MB total, so if you are running on a metered connection do beware of this.
#'
#'
#' @examplesIf interactive()
#' vtdf <- search_hub("Aedes aegypti", "vt") |>
#'   tail(20) |>
#'   fetch() |>
#'   glean(cols = c(
#'     "DatasetID",
#'     "Latitude",
#'     "Longitude",
#'     "Interactor1Genus",
#'     "Interactor1Species"
#'     ), returnunique = TRUE) |>
#'   assoc_gadm(lonlat_names = c("Longitude", "Latitude"))
#'
#'
#' @export
#'

assoc_gadm <- function(
  df,
  lonlat_names = c("Longitude", "Latitude"),
  cache_location = NULL,
  basereq = ad_basereq()
) {
  # Remember db attr of input data
  datatype <- ohvbd_db(df)

  # Always load gid level 2 as it's quickest AND a complete superset of GID1 and 0
  gid <- 2

  if ("gbif" %in% class(df)) {
    df <- df$data
  }

  # Use LonLats to find an appropriate gadm id

  if (length(lonlat_names) != 2) {
    cli::cli_abort(c(
      "x" = "Longitude and Latitude column names must be provided as vector of length 2!",
      "i" = "You provided {.val {lonlat_names}} (detected length = {col_red(length(lonlat_names))})."
    ))
  }

  # Find GADM shapefile for searching
  gadm_sf <- fetch_gadm_sfs(gid = gid, cache_location = cache_location)

  cli::cli_progress_message("{cli::symbol$pointer} Finding distinct lonlats...")
  # Find latlons for quick processing
  orig_lonlat <- df |>
    select(all_of(lonlat_names)) |>
    mutate_all(function(x) as.numeric(as.character(x)))
  # Make distinct points into a SpatVect
  points <- terra::vect(orig_lonlat |> distinct(), geom = lonlat_names)
  cli::cli_alert_success("Created search vector.")

  startproc <- lubridate::now()

  cli::cli_progress_message(
    "{cli::symbol$pointer} Finding gadm names for {.val {length(points)}} latlon{?s}..."
  )
  # Locate which shape each point is in and get the appropriate names for aligning with AD

  gadm_point_ids <- terra::extract(gadm_sf, points)

  endproc <- lubridate::now()
  totaltime <- as.duration(interval(startproc, endproc)) # nolint: object_usage_linter
  cli::cli_alert_success("Found gadm names in {.val {totaltime}}.")

  # Deduplicate ids if necessary.
  # This probably happens when polys overlap in the shapefile
  duplicated_ids <- duplicated(gadm_point_ids$id.y)
  if (any(duplicated_ids)) {
    cli::cli_alert_warning(
      "Multiple gadm id returns @ data ind{?ex/ices}: {.val {as.character(gadm_point_ids$id.y[which(duplicated_ids)])}}."
    )
    cli::cli_alert_info(
      "Removing duplicates {col_yellow('(check the results are what you want!)')}"
    )

    # Just take the first response. It is likely that both are correct.
    ids_to_remove <- which(duplicated_ids)
    gadm_point_ids <- gadm_point_ids[-ids_to_remove, ]
  }

  gadm_point_ids <- gadm_point_ids |> select(starts_with(c("NAME", "GID")))

  cli::cli_progress_message("{cli::symbol$pointer} Merging with original dataset...")
  # Recreate the original dataset at full length

  gadm_point_ids <- left_join(
    orig_lonlat,
    bind_cols(orig_lonlat |> distinct(), gadm_point_ids),
    by = lonlat_names
  )

  gadm_point_ids <- gadm_point_ids |> select(starts_with(c("NAME", "GID")))

  outdata <- bind_cols(df, gadm_point_ids)
  cli::cli_alert_success("Merge complete.")
  cli::cli_progress_done()

  ohvbd_db(outdata) <- datatype

  return(outdata)
}