R/arc-select.R

Defines functions determine_format supports_pbf validate_page_size set_record_offsets match_fields validate_results_count count_results validate_params add_offset update_params check_inherits_any obj_is_layer obj_check_layer get_query_resps collect_layer arc_select

Documented in arc_select update_params

#' Query a Feature Service
#'
#' [arc_select()] takes a `FeatureLayer`, `Table`, of `ImageServer` object and
#' returns data from the layer as an `sf` object or `data.frame` respectively.
#'
#' @param x an object of class `FeatureLayer`, `Table`, or `ImageServer`.
#' @param fields a character vector of the field names that you wish to be
#'   returned. By default all fields are returned.
#' @param where a simple SQL where statement indicating which features should be
#'   selected.
#' @param crs the spatial reference to be returned. If the CRS is different than
#'   the CRS for the input `FeatureLayer`, a transformation will occur
#'   server-side. Ignored if x is a `Table`.
#' @param geometry default `TRUE`. If geometries should be returned. Ignored for
#'   `Table` objects.
#' @inheritParams prepare_spatial_filter
#' @param n_max the maximum number of features to return. By default returns
#'   every feature available. Unused at the moment.
#' @param page_size the maximum number of features to return per request. Useful when requests return a 500 error code. See Details.
#' @param ... additional query parameters passed to the API.
#' @inheritParams arc_open
#'
#' @details
#'
#' See [reference documentation](https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm#GUID-BC2AD141-3386-49FB-AA09-FF341145F614) for possible arguments.
#'
#' `FeatureLayers` can contain very dense geometries with a lot of coordinates.
#' In those cases, the feature service may time out before all geometries can
#' be returned. To address this issue, we can reduce the number of features
#' returned per each request by reducing the value of the `page_size` parameter.
#'
#' `arc_select()` works by sending a single request that counts the number of
#' features that will be returned by the current query. That number is then used
#' to calculate how many "pages" of responses are needed to fetch all the results.
#' The number of features returned (page size) is set to the `maxRecordCount`
#' property of the layer by default. However, by setting `page_size` to be
#' smaller than the `maxRecordCount` we can return fewer geometries per page and
#' avoid time outs.
#'
#' `r lifecycle::badge("experimental")`
#'
#' @export
#' @examples
#' \dontrun{
#' # define the feature layer url
#' furl <- paste0(
#'   "https://services3.arcgis.com/ZvidGQkLaDJxRSJ2/arcgis/rest",
#'   "/services/PLACES_LocalData_for_BetterHealth/FeatureServer/0"
#' )
#'
#' flayer <- arc_open(furl)
#'
#' arc_select(
#'   flayer,
#'   fields = c("StateAbbr", "TotalPopulation")
#' )
#'
#' arc_select(
#'   flayer,
#'   fields = c("OBJECTID", "PlaceName"),
#'   where = "TotalPopulation > 1000000"
#' )
#' }
#' @returns An sf object, or a data.frame
arc_select <- function(
  x,
  ...,
  fields = NULL,
  where = NULL,
  crs = sf::st_crs(x),
  geometry = TRUE,
  filter_geom = NULL,
  predicate = "intersects",
  n_max = Inf,
  page_size = NULL,
  token = arc_token()
) {
  # Developer note:
  # For this function we extract the query object and manipulate the elements
  # inside of the query object to modify our request. We then splice those
  # values back into `x` and send our request
  # note that everything that goes into our query must be the json that will
  # be sent directly to the API request which is why we convert it to json
  # before we use `update_params()`
  check_inherits_any(x, c("FeatureLayer", "Table", "ImageServer"))
  check_number_whole(n_max, min = 0, allow_infinite = TRUE)
  check_string(where, allow_null = TRUE, allow_empty = FALSE)
  check_character(fields, allow_null = TRUE)

  # extract the query object
  query <- attr(x, "query")

  # if dots provided we check that all elements are named
  dots <- rlang::list2(...)
  check_dots_named(dots)

  # extract dots names
  dots_names <- names(dots)

  # insert into query
  for (i in seq_along(dots)) {
    key <- dots_names[i]
    val <- dots[[i]]
    # check that the value is a scalar and non-empty
    check_string(val, arg = key, allow_empty = FALSE)

    # insert into query
    query[[key]] <- val
  }

  # handle fields and where clause if missing
  fields <- fields %||% query[["outFields"]]

  # make sure that fields actually exist
  fields <- match_fields(
    fields = fields,
    values = c(x[["fields"]][["name"]], "")
  )

  # include the fields the query
  query[["outFields"]] <- fields

  # include the where clause if present
  query[["where"]] <- where %||% query[["where"]]

  # set returnGeometry depending on on geometry arg
  query[["returnGeometry"]] <- geometry

  # handle filter geometry if not missing
  if (!is.null(filter_geom) && inherits(x, "FeatureLayer")) {
    spatial_filter <- prepare_spatial_filter(
      filter_geom,
      crs = crs,
      predicate = predicate
    )

    # append spatial filter fields to the query
    query <- c(query, spatial_filter)
  } else if (!is.null(filter_geom)) {
    # warn if filter_geom is supplied but object is not a FeatureLayer
    cli::cli_warn(
      "{.arg filter_geom} is ignored when {.arg x} is
      {.obj_simple_type {.cls {class(x)}}}."
    )

    filter_geom <- NULL
  }

  # handle SR if not missing
  if (!is.na(crs)) {
    query[["outSR"]] <- jsonify::to_json(validate_crs(crs)[[1]], unbox = TRUE)
  }

  # update the parameters based on our query list
  x <- update_params(x, !!!query)

  # send the request
  collect_layer(x, n_max = n_max, token = token, page_size = page_size, ...)
}

#' Query a FeatureLayer or Table object
#'
#' [collect_layer()] is the "workhorse" function that actually executes the
#' queries for FeatureLayer or Table objects.
#'
#' @noRd
collect_layer <- function(
  x,
  n_max = Inf,
  token = arc_token(),
  page_size = NULL,
  ...,
  error_call = rlang::caller_env()
) {
  # 1. Make base request
  # 2. Identify necessary query parameters
  # 3. Figure out offsets and update query parameters
  # 4. Make list of requests
  # 5. Make requests
  # 6. Identify errors (if any) -- skip for now
  # 7. Parse:

  # sets token and agent
  req <- arc_base_req(x[["url"]], token)

  # determine if the layer can query
  can_query <- switch(
    class(x),
    "FeatureLayer" = grepl("query", x[["capabilities"]], ignore.case = TRUE),
    "Table" = grepl("query", x[["capabilities"]], ignore.case = TRUE),
    "ImageServer" = x[["supportsAdvancedQueries"]],
    FALSE
  )

  # throw warning if the layer cannot query
  if (!can_query) {
    cli::cli_alert_danger(
      "{class(x)} {.val {x[['name']]}} does not support querying"
    )
  }

  # extract existing query
  query <- attr(x, "query")

  # if the outSR isn't set, set it to be the same as x
  if (inherits(x, "FeatureLayer") && is.null(query[["outSR"]])) {
    query[["outSR"]] <- jsonify::to_json(
      validate_crs(sf::st_crs(x))[[1]],
      unbox = TRUE
    )
  }

  # retain outFields vector and create flag
  out_fields <- query[["outFields"]]
  has_out_fields <- !is.null(out_fields) && !identical(out_fields, "*")

  # parameter validation ----------------------------------------------------
  # get existing parameters

  # determine_format() chooses between pbf and json
  out_f <- determine_format(x, call = error_call)

  query_params <- validate_params(
    query,
    out_f
  )

  # Offsets -----------------------------------------------------------------

  # count the number of features in a query
  n_feats <- count_results(
    req = req,
    query = query_params,
    n_max = n_max,
    error_call = error_call
  )

  all_resps <- get_query_resps(
    x = x,
    req = req,
    n_feats = n_feats,
    page_size = page_size,
    query_params = query_params,
    error_call = error_call
  )

  if (out_f == "pbf") {
    res <- arcpbf::resps_data_pbf(all_resps)
  } else {
    # fetch the results
    res <- lapply(
      all_resps,
      # all_resps[!has_error],
      function(x) {
        parse_esri_json(
          httr2::resp_body_string(x),
          call = error_call
        )
      }
    )

    # combine results
    res <- rbind_results(res, call = error_call)
  }

  # Drop fields that aren't selected to avoid returning OBJECTID when not
  # selected
  if (rlang::is_named(res) && has_out_fields) {
    out_fields <- c(out_fields, attr(res, "sf_column"))
    match_nm <- match(tolower(out_fields), tolower(names(res)))
    res <- res[, match_nm[!is.na(match_nm)], drop = FALSE]
  }

  if (rlang::is_empty(res)) {
    cli::cli_alert_info("No features returned from query")
    return(res)
  }

  if (inherits(res, "sf") && is.na(sf::st_crs(res))) {
    sf::st_crs(res) <- sf::st_crs(x)
  }

  if (nrow(res) < n_feats) {
    # See https://github.com/R-ArcGIS/arcgislayers/issues/110
    cli::cli_warn(
      c(
        "Results include fewer than the expected {n_feats} features.",
        "*" = "Try setting {.arg page_size} to a smaller value to make
        sure results include all available features."
      )
    )
  }

  res
}

#' Get query responses with handling for layers that don't support pagination
#' @noRd
get_query_resps <- function(
  req,
  x,
  n_feats,
  page_size = NULL,
  query_params = list(),
  error_call = rlang::caller_env()
) {
  # If pagination is not supported, we create one query and return the results
  # in a list with a warning. This way the maximum number of results is returned
  # but the user is also informed that they will not get tha maximum number of
  # records. Otherwise, we continue and utilize the pagination
  if (isFALSE(x[["advancedQueryCapabilities"]][["supportsPagination"]])) {
    if (n_feats > x[["maxRecordCount"]]) {
      cli::cli_warn(
        c(
          "{class(x)} {.val {x[['name']]}} does not support pagination and
          complete results can't be returned.",
          "i" = "{n_feats} features are selected by the query and the maximum
          is {x[['maxRecordCount']]} records."
        )
      )
    }

    req <- httr2::req_body_form(
      httr2::req_url_path_append(req, "query"),
      !!!query_params
    )

    resp <- httr2::req_perform(req, error_call = error_call)

    return(list(resp))
  }

  # create a list of record counts based on number of features, page size and max records
  record_offsets <- set_record_offsets(
    n_feats = n_feats,
    page_size = page_size,
    max_records = x[["maxRecordCount"]],
    error_call = error_call
  )

  # create a list of requests from the offset and page sizes
  all_requests <- mapply(
    add_offset,
    .offset = record_offsets[["offsets"]],
    .page_size = record_offsets[["counts"]],
    MoreArgs = list(.req = req, .params = query_params),
    SIMPLIFY = FALSE
  )

  # make all requests and store responses in list
  httr2::req_perform_parallel(all_requests, on_error = "continue")
}


# utility -----------------------------------------------------------------

#' Check if an object is a FeatureLayer or Table object
#'
#' [obj_check_layer()] errors if an object does not inherit either the
#' FeatureLayer or Table class.
#'
#' @param x A `FeatureLayer` or `Table` class object created with [arc_open()].
#' @inheritParams rlang::args_error_context
#' @keywords internal
#' @noRd
obj_check_layer <- function(
  x,
  arg = rlang::caller_arg(x),
  call = rlang::caller_env()
) {
  check_inherits_any(
    x,
    class = c("FeatureLayer", "Table"),
    arg = arg,
    call = call
  )
}

#' @noRd
obj_is_layer <- function(x) {
  rlang::inherits_any(x, c("FeatureLayer", "Table"))
}

#' Check if an object inherits from a set of classes
#'
#' [check_inherits_any()] wraps [rlang::inherits_any()] to error if an object
#' does not inherit any of a set of classes.
#'
#' @inheritParams cli::cli_vec
#' @inheritParams rlang::inherits_any
#' @inheritParams rlang::args_error_context
#' @keywords internal
#' @noRd
check_inherits_any <- function(
  x,
  class,
  arg = rlang::caller_arg(x),
  call = rlang::caller_env()
) {
  if (rlang::inherits_any(x, class)) {
    return(invisible(NULL))
  }

  class <- cli::cli_vec(
    class,
    style = list("before" = "`", "after" = "`", "vec-last" = " or ")
  )

  cli::cli_abort(
    "{.arg {arg}} must be a {class} object, not {.obj_simple_type {.cls {class(x)}}}.",
    call = call
  )
}

#' Modify query parameters
#'
#' [update_params()] takes named arguments and updates the query.
#'
#' @param x a `FeatureLayer` or `Table` object
#' @param ... key value pairs of query parameters and values.
#' @export
#' @examples
#' \dontrun{
#' furl <- paste0(
#'   "https://services.arcgis.com/P3ePLMYs2RVChkJx/ArcGIS/rest/services/",
#'   "USA_Major_Cities_/FeatureServer/0"
#' )
#'
#' flayer <- arc_open(furl)
#' update_params(flayer, outFields = "NAME")
#' }
#' @returns An object of the same class as `x`
update_params <- function(x, ...) {
  query <- attr(x, "query")
  params <- rlang::list2(...)

  for (name in names(params)) {
    query[[name]] <- params[[name]]
  }

  attr(x, "query") <- query
  x
}

#' Add an offset to a query parameters
#'
#' [add_offset()] takes a list of query parameters and creates a query request.
#' Importantly, this creates the paginated results that will be needed for
#' Feature Layers with more than 2000 observations.
#'
#' @keywords internal
#' @noRd
add_offset <- function(.req, .offset, .page_size, .params) {
  .req <- httr2::req_url_path_append(.req, "query")
  httr2::req_body_form(
    .req,
    !!!.params,
    resultOffset = .offset,
    resultRecordCount = .page_size
  )
}

#' Validate query parameters
#'
#' [validate_params()] ensures that the parameters are set to minimally
#' acceptable values.
#'
#' @keywords internal
#' @noRd
validate_params <- function(params, f = "json") {
  if (!is.null(params[["outFields"]])) {
    params[["outFields"]] <- paste0(params[["outFields"]], collapse = ",")
  } else {
    # if output fields are missing set to "*"
    params[["outFields"]] <- "*"
  }

  # if where is missing set it to 1=1
  params[["where"]] <- params[["where"]] %||% "1=1"

  # set output type to geojson if we return geometry, json if not
  if (
    is.null(params[["returnGeometry"]]) || isTRUE(params[["returnGeometry"]])
  ) {
    params[["f"]] <- f
  } else {
    params[["f"]] <- f
  }

  params
}

# Given a query, determine how many features will be returned
#' @noRd
count_results <- function(
  req,
  query,
  n_max = Inf,
  error_call = rlang::caller_env()
) {
  n_req <- httr2::req_body_form(
    httr2::req_url_path_append(req, "query"),
    # count results should always use json
    !!!validate_params(query, query[["f"]]),
    returnCountOnly = "true"
  )

  resp <- httr2::resp_body_string(
    httr2::req_perform(
      httr2::req_url_query(n_req, f = "json"),
      error_call = error_call
    )
  )

  n_results <- RcppSimdJson::fparse(resp)[["count"]]

  # identify the number of pages needed to return all features
  validate_results_count(n_results, n_max = n_max, error_call = error_call)
}


#' Set and validate n_feats based on n_max
#' @noRd
validate_results_count <- function(
  n_results = NULL,
  n_max = Inf,
  error_call = rlang::caller_env()
) {
  if (is.null(n_results)) {
    cli::cli_abort(
      c(
        "Can't determine the number of requested features.",
        "i" = "Did you set custom parameters via {.arg ...} or
        use an invalid {.arg where} argument?"
      ),
      call = error_call
    )
  } else if (!is.infinite(n_max) && (n_results > n_max)) {
    # TODO: Implement a verbose parameter that can enable this message
    # See https://github.com/R-ArcGIS/arcgislayers/pull/180#issuecomment-2049631271
    # cli::cli_bullets(
    #   c(
    #     "i" = "Query results limited to {n_max} out of {n_feats} available feature{?s}.",
    #     "!" = "Increase {.arg n_max} value to return all selected features."
    #     )
    # )

    n_results <- n_max
  }

  if (is.numeric(n_results)) {
    return(n_results)
  }

  cli::cli_abort(
    c(
      "Can't determine the number of requested features.",
      "*" = "Set {.arg n_max} or check to make sure query parameters are valid."
    ),
    call = error_call
  )
}

#' Match fields
#'
#' [match_fields()] ensures that fields passed to [arc_select()] match
#' permissible values.
#'
#' @keywords internal
#' @noRd
match_fields <- function(
  fields,
  values = NULL,
  multiple = TRUE,
  error_arg = rlang::caller_arg(fields),
  error_call = rlang::caller_env()
) {
  if (is.null(fields) || identical(fields, "*")) {
    return(fields)
  }

  if (all(tolower(fields) %in% tolower(values))) {
    return(fields)
  }

  rlang::arg_match(
    fields,
    values = values,
    multiple = multiple,
    error_arg = error_arg,
    error_call = error_call
  )
}

#' Set record counts to retrieve based on page size and number of pages
#' @noRd
set_record_offsets <- function(
  n_feats = NULL,
  page_size = NULL,
  max_records = NULL,
  error_call = rlang::caller_env()
) {
  # set page size based on the maximum allowed to be returned
  page_size <- validate_page_size(
    page_size,
    max_records = max_records,
    error_call = error_call
  )

  # calculate the total number of requests to be made
  n_pages <- ceiling(n_feats / page_size)
  # these values get passed to `resultOffset`
  offsets <- (1:n_pages - 1) * page_size
  # create vector of page sizes to be passed to `resultRecordCount`
  counts <- rep(page_size, n_pages)
  # modify the last offset to have `resultRecordCount` of the remainder
  # this lets us get an exact value
  counts[n_pages] <- n_feats - offsets[n_pages]

  list(
    "offsets" = offsets,
    "counts" = counts
  )
}

#' Set page size and check page size validity
#'
#' @noRd
validate_page_size <- function(
  page_size = NULL,
  max_records = NULL,
  error_call = rlang::caller_env()
) {
  if (is.numeric(page_size)) {
    # coerce to integer if page_size is numeric
    page_size <- as.integer(page_size)
  }

  check_number_whole(page_size, min = 1, allow_null = TRUE, call = error_call)
  check_number_whole(
    page_size,
    # bug in the standalone checks
    # needs to be a double and cannot be used with
    # max at the same time which is why it is brought into two calls
    max = as.double(max_records),
    allow_null = TRUE,
    call = error_call
  )

  # if page_size is null, use max records (default)
  page_size <- page_size %||% max_records

  if (is.numeric(max_records) && (page_size > max_records)) {
    cli::cli_abort(
      "{.arg page_size} ({page_size}) can't be more than than the layer
      {.field maxRecordCount} property ({max_records}).",
      call = error_call
    )
  }

  page_size
}


# Protocol Buffer helpers ------------------------------------------------

supports_pbf <- function(
  x,
  arg = rlang::caller_arg(x),
  call = rlang::caller_call()
) {
  check_inherits_any(
    x,
    class = c("FeatureLayer", "Table", "ImageServer"),
    arg = arg,
    call = call
  )

  # extract supported query formats
  query_formats_raw <- x[["supportedQueryFormats"]]

  # perform a check to make sure the supported query formats are
  # actually there if not return false. This shouldn't happen though.
  if (is.null(query_formats_raw)) {
    return(FALSE)
  }

  # split and convert to lower case
  formats <- tolower(strsplit(query_formats_raw, ", ")[[1]])
  # if for some reason the first element is null we return false
  # note sure of the utility of this check though.

  if (is.null(formats)) {
    return(FALSE)
  }

  # perform the check
  "pbf" %in% formats
}

determine_format <- function(
  x,
  arg = rlang::caller_arg(x),
  call = rlang::caller_call()
) {
  use_pbf <- supports_pbf(x, arg, call)
  if (use_pbf) {
    "pbf"
  } else {
    "json"
  }
}

Try the arcgislayers package in your browser

Any scripts or data that you put into this service are public.

arcgislayers documentation built on April 16, 2025, 1:10 a.m.