# Takes dataframe of inputs, filters out duplicates and invalid rows to limit
# the number of API requests and avoid errors, applies rate-limiting if
# necessary, then iterates over rows in input dataframe making requests and
# stacking all the 1-row results, then the results are joined back to the
# original inputs so that the final return dataframe has the same number of rows
# in the same order and contains the input columns (with the names of the
# geo_*() arguments rather than API names).
geoclient_reqs <- function(inputs, operation, creds, rate_limit) {
if (!(is_logical(rate_limit, 1L))) stop_glue("`rate_limit` must be either TRUE or FALSE")
if (rate_limit) {
# 100 calls per IP per second, 2,500 calls per minute, 500,000 calls per day
geoclient_req <- ratelimitr::limit_rate(geoclient_req, ratelimitr::rate(n = 2500, period = 60))
}
inputs_dedup <- inputs %>% dplyr::distinct() %>% drop_invalid_rows(operation)
if (nrow(inputs_dedup) == 0) {
all_invalid <- inputs %>%
dplyr::mutate(!!"no_results" := TRUE) %>%
fix_input_names(operation)
return(all_invalid)
}
pb <- progress::progress_bar$new(total = nrow(inputs_dedup))
ret <- purrr::pmap_dfr(
inputs_dedup,
geoclient_req,
operation = operation,
creds = creds,
pb = pb
)
# We know the geocoding results are exactly aligned with the inputs, so safe
# to bind cols here
results_dedup <- inputs_dedup %>%
fix_input_names(operation) %>%
dplyr::bind_cols(ret)
# To make sure all the final results are returned in the exact order as the
# inputs we need to start with the full non-deduplicated inputs and left join
# on the results.
inputs %>%
fix_input_names(operation) %>%
dplyr::left_join(
results_dedup,
by = colnames(.)
) %>%
dplyr::mutate(!!"no_results" := replace_na(!!sym("no_results"), TRUE)) # Rows dropped by drop_invalid_rows()
}
# Makes a single API request (Geoclient does not support a single request for
# multiple input locations).
# Inputs: takes a length-1 vector for each of the API query parameters
# Returns: API response as a dataframe
geoclient_req <- function(..., operation, creds, pb = NULL) {
if (!is_null(pb) && !pb$finished) pb$tick()
# Build query param list, removing element if NA (eg. address borough/zip)
params <- purrr::splice(...) %>% purrr::discard(is_na)
resp <- rGET(
glue::glue("https://api.nyc.gov/geo/geoclient/v1/{operation}"),
config = httr::add_headers(
"Accept" = "application/json",
"Ocp-Apim-Subscription-Key" = creds$key
),
query = params
)
auth_failed <- try(httr::content(resp)[[1]][[1]] == "401", silent = TRUE)
if (is_true(auth_failed)) {
stop_glue(
"Authentication failed: Geoclient API key is invalid.
See ?geoclient_api_key for details on how to aquire valid credentials."
)
}
# Sometimes bad inputs can cause 500 Internal Server Error. I think for these
# it should just return the empty tibble like when returns null.
geoclient_stop_for_status(resp)
if (httr::status_code(resp) %in% c(400, 429, 500)) {
return(dplyr::tibble(no_results = TRUE))
}
if (operation == "search") {
parsed <- content_as_json_UTF8(resp)[["results"]][["response"]]
} else {
parsed <- content_as_json_UTF8(resp)[[operation]]
}
# If these is no response (bad inputs, but request executed normally) return a
# 1-row tibble with column indicating the issue
placeholder <- dplyr::tibble(no_results = is_empty(parsed))
if (is_empty(parsed)) {
return(placeholder)
}
# Sometimes, for no clear reason, it will return two rows for one request with
# no important differences, so slice one row
# TODO: Look into this more. I don't remember but this could have to do with
# single-input-search returning multiple possible matches
parsed <- dplyr::as_tibble(parsed) %>% dplyr::slice(1)
dplyr::bind_cols(placeholder, dplyr::as_tibble(parsed))
}
# For the geoclient API request we rename the inputs, but for the return
# dataframe we want to use the R function argument names for consistency. These
# are always at the beginning of the dataframe.
fix_input_names <- function(.data, operation) {
# All other operation the number of user args and api inputs are the same,
# except for BBL which gets split. So concatenate it back together
if (operation == "bbl") {
ret <- .data %>%
dplyr::mutate(!!"input_bbl" := stringr::str_c(!!sym("borough"), !!sym("block"), !!sym("lot"))) %>%
dplyr::select(-(1:3)) %>%
dplyr::select(!!sym("input_bbl"), dplyr::everything())
return(ret)
}
input_names <- switch(
operation,
address = c("house_number", "street", "borough", "zip"),
bbl = "bbl",
bin = "bin",
blockface = c("on_street", "cross_street_1", "cross_street_2", "borough"),
intersection = c("cross_street_1", "cross_street_2", "borough"),
place = c("place", "borough", "zip"),
search = "location"
)
colnames(.data)[seq_along(input_names)] <- paste0("input_", input_names)
.data
}
# If their are mandatory inputs that are missing the API will return either
# nothing or raise http errors. Either way it's not useful to make these
# requests, so we drop all these rows before making the requests and join back
# the responses with the original inputs so the final dataframe returned to the
# user has the same number of rows as the inputs
drop_invalid_rows <- function(.data, operation) {
mandatory_vars <- switch(operation,
address = c("houseNumber", "street"),
bbl = c("borough", "block", "lot"),
bin = "bin",
blockface = c("onStreet", "crossStreetOne", "crossStreetTwo", "borough"),
intersection = c("crossStreetOne", "crossStreetTwo", "borough"),
place = "name",
search = "input"
)
ret <- .data %>% dplyr::filter_at(mandatory_vars, dplyr::all_vars(!are_na(.)))
# Borough and zip can't both be NA
if (operation %in% c("address", "place")) {
ret <- ret %>% dplyr::filter(!(are_na(!!sym("borough")) & are_na(!!sym("zip"))))
}
ret
}
replace_na <- function(x, replace) {
dplyr::if_else(are_na(x), replace, x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.