R/geocode.R

#' Geocode
#'
#' Geocodes (finds latitude and longitude of) a location using the Google
#' Geocoding API. Note: To use Google's Geocoding API, you must first enable the
#' API in the Google Cloud Platform Console. See [register_google()].
#'
#' Note: [geocode()] uses Google's Geocoding API to geocode addresses. Please
#' take care not to disclose sensitive information.
#' \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8972108/} suggest various
#' alternative options for such data.
#'
#' @param location a character vector of street addresses or place names (e.g.
#'   `"1600 pennsylvania avenue, washington dc" or "Baylor University"`)
#' @param output amount of output, `"latlon"`, `"latlona"`, `"more"`, or `"all"`
#' @param source "google" for Google (note: "dsk" is defunct)
#' @param force force online query even if cached.
#' @param urlonly return only the url?
#' @param override_limit override the current query rate
#' @param nameType in some cases, Google returns both a long name and a short
#'   name. this parameter allows the user to specify which to grab.
#' @param ext top level domain (e.g. "com", "co.nz"); helpful for non-US users
#' @param inject character string to add to the url or named character vector of
#'   key-value pairs to be injected (e.g. c("a" = "b") get converted to "a=b"
#'   and appended to the query)
#' @param data a data frame or equivalent
#' @param path path to file
#' @param overwrite in [load_geocode_cache()], should the current cache be
#'   wholly replaced with the one on file?
#' @param ... In [mutate_geocode()], arguments to pass to [geocode()]. In
#'   [write_geocode_cache()], arguments to pass to [saveRDS()].
#' @return If `output` is `"latlon"`, `"latlona"`, or `"more"`, a tibble
#'   (classed data frame). If `"all"`, a list.
#' @author David Kahle \email{david@@kahle.io}
#' @seealso \url{https://developers.google.com/maps/documentation/geocoding/},
#'   \url{https://developers.google.com/maps/documentation/javascript/geocoding/},
#'   \url{https://developers.google.com/maps/documentation/geocoding/usage-and-billing/},
#'   \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8972108/}
#'
#'
#' @name geocode
#' @examples
#'
#' \dontrun{ requires Google API key, see ?register_google
#'
#' ## basic usage
#' ########################################
#'
#' # geocoding is most commonly used for addresses
#' geocode("1600 Amphitheatre Parkway, Mountain View, CA")
#' geocode("1600 Amphitheatre Parkway, Mountain View, CA", urlonly = TRUE)
#'
#' # google can also geocode colloquial names of places
#' geocode("the white house")
#'
#' # geocode can also accept character vectors of places
#' geocode(c("the white house", "washington dc"))
#'
#'
#'
#' ## types of output
#' ########################################
#'
#' geocode("waco texas")
#' geocode("waco texas", output = "latlona")
#' geocode("waco texas", output = "more")
#' str(geocode("waco texas", output = "all"))
#'
#' geocode(c("waco, texas", "houston, texas"))
#' geocode(c("waco, texas", "houston, texas"), output = "latlona")
#' geocode(c("waco, texas", "houston, texas"), output = "all") %>% str(4)
#'
#'
#'
#' ## mutate_geocode
#' ########################################
#'
#' # mutate_geocode is used to add location columns to an existing dataset
#' # that has location information
#'
#' df <- data.frame(
#'   address = c("1600 Pennsylvania Avenue, Washington DC", "", "houston texas"),
#'   stringsAsFactors = FALSE
#' )
#'
#' mutate_geocode(df, address)
#' df %>% mutate_geocode(address)
#'
#'
#' ## known issues
#' ########################################
#'
#' # in some cases geocode finds several locations
#'
#'
#' }
#'
#'











#' @rdname geocode
#' @export
geocode <- function (
  location,
  output = c("latlon", "latlona", "more", "all"),
  source = c("google", "dsk"),
  force = ifelse(source == "dsk", FALSE, TRUE),
  urlonly = FALSE,
  override_limit = FALSE,
  nameType = c("long", "short"),
  ext = "com",
  inject = "",
  ...
) {

  # basic parameter check
  stopifnot(is.character(location))
  output <- match.arg(output)
  nameType <- match.arg(nameType)
  source   <- match.arg(source)


  # source checking
  if (source == "google" && !has_google_key() && !urlonly) {
    cli::cli_abort("Google now requires an API key; see {.fn ggmap::register_google}.")
  }
  # if (source == "dsk") stop("datasciencetoolkit.org terminated its map service, sorry!")


  # vectorize for many locations
  if (length(location) > 1) {

    out <- location %>%
      map(~ geocode(.x,
        "output" = output,
        "source" = source,
        "messaging" = messaging,
        "inject" = inject,
        "force" = force,
        "urlonly" = urlonly
        )
      )

    if (output == "all") return(out)

    out <- out %>% map(~ as_tibble(as.list(.x))) %>% bind_rows()
    return(out)

  }


  # return NA for location == ""
  if (location == "") return(return_failed_geocode(output))


  # set url base (protocol + fqdn + path + "?")
  url_base <- switch(source,
    "google" = glue("https://maps.googleapis.{ext}/maps/api/geocode/json?"),
       "dsk" = "http://www.datasciencetoolkit.org/maps/api/geocode/json?"
  )


  # initialize the url query
  url_query <- location %>% str_trim() %>% str_replace_all(" +", "+") %>% URLencode(reserved = FALSE) %>% c("address" = .)
  #                                        address
  # "1600+Amphitheatre+Parkway,+Mountain+View,+CA"


  # add google account stuff to query, if applicable
  if (source == "google") {
    url_query <- c(url_query, "client" = google_client(), "signature" = google_signature(), "key" = google_key())
    url_query <- url_query[!is.na(url_query)]
  }


  # form url
  url_query_inline <- str_c(names(url_query), url_query, sep = "=", collapse = "&")
  url <- str_c(url_base, url_query_inline)


  # inject any remaining stuff
  if (inject != "") {
    if (is.null(names(inject))) {
      url <- str_c(url, inject, sep = "&")
    } else {
      url <- str_c(url, str_c(names(inject), inject, sep = "=", collapse = "&"), sep = "&")
    }
  }


  # encode
  url <- URLencode( enc2utf8(url) )
  url <- str_replace_all(url, "#", "%23") # selectively url-encode


  # return early if user only wants url
  if(urlonly) if(showing_key()) return(url) else return(scrub_key(url))


  # hash for caching
  url_hash <- digest::digest(scrub_key(url))


  # lookup info if on file
  if (location_is_cached(url_hash) && force == FALSE) {

    gc <- geocode_cache()[[url_hash]]

  } else {

    # if using google, throttle/update google query limit
    if (source == "google") throttle_google_geocode_query_rate(url_hash, queries_sought = 1L, override = override_limit)

    # message url
    if (showing_key()) source_url_msg(url) else source_url_msg(scrub_key(url))

    # query server
    response <- httr::GET(url)

    # deal with bad responses
    if (response$status_code != 200L) {
      warning(
        tryCatch(stop_for_status(response),
          "http_400" = function(c) "HTTP 400 Bad Request",
          "http_402" = function(c) "HTTP 402 Payment Required - May indicate over Google query limit",
          "http_403" = function(c) "HTTP 403 Forbidden - Server refuses, is the API enabled?",
          "http_404" = function(c) "HTTP 404 Not Found - Server reports page not found",
          "http_414" = function(c) "HTTP 414 URI Too Long - URL query too long",
          "http_500" = function(c) "HTTP 500 Internal Server Error - If dsk, try Google",
          "http_503" = function(c) "HTTP 503 Service Unavailable - Server bogged down, try later"
        )
      )
      return(return_failed_geocode(output))
    }

    # grab content
    gc <- httr::content(response)

    # cache it
    cache_geocoded_info(url_hash, gc)

  }


  # did geocode fail?
  if (gc$status != "OK") {
    warning(
      glue("Geocoding \"{str_trunc(location, 20)}\" failed with error:"),
      "\n", gc$error_message, "\n",
      call. = FALSE, immediate. = TRUE, noBreaks. = FALSE
    )
    return(tibble("lon" = NA_real_, "lat" = NA_real_))
  }


  # return if you want full output
  if (output == "all") return(gc)



  # more than one location found?
  if (length(gc$results) > 1L) {
    cli::cli_warn("\"{stringr::str_trunc(location, 20)}\" not uniquely geocoded, using \"{tolower(gc$results[[1]]$formatted_address)}\"")
  }



  # format geocoded data
  NULLtoNA <- function (x) {
    if (is.null(x)) return(NA) else x
  }

  gcdf <- with(gc$results[[1]], {
  	tibble(
      "lon" = NULLtoNA(geometry$location$lng),
      "lat" = NULLtoNA(geometry$location$lat),
      "type" = tolower(NULLtoNA(types[1])),
      "loctype" = tolower(NULLtoNA(geometry$location_type)),
      "address" = location, # dsk doesn't give the address
      "north" = NULLtoNA(geometry$viewport$northeast$lat),
      "south" = NULLtoNA(geometry$viewport$southwest$lat),
      "east" = NULLtoNA(geometry$viewport$northeast$lng),
      "west" = NULLtoNA(geometry$viewport$southwest$lng)
    )
  })



  # add address
  if (source == "google") gcdf$address <- tolower(NULLtoNA(gc$results[[1]]$formatted_address))


  if (output == "latlon") return(gcdf[,c("lon","lat")])
  if (output == "latlona") return(gcdf[,c("lon","lat","address")])
  if (output == "more") return(gcdf)


  # parse json when output == "more"
  name_to_grab  <- if(nameType == "long") "long_name" else "short_name"
  output_values <- vapply(gc$results[[1]]$address_components, function (x) x[[name_to_grab]], character(1))
  output_names <- vapply(gc$results[[1]]$address_components, function (x) {
      if (length(x$types) == 0) return("query")
      unlist(x$types)[1]
    },
    character(1)
  )
  gcdf_more <- as_tibble(as.list(output_values))
  names(gcdf_more) <- output_names

  tibble(gcdf, gcdf_more)
}






















#' @rdname geocode
#' @export
mutate_geocode <- function (data, location, ...){
  locs <- data[[deparse(substitute(location))]]
  gcdf <- geocode(locs, ...)
  dplyr::bind_cols(data, gcdf)
}












throttle_google_geocode_query_rate <- function (url_hash, queries_sought, override) {

  if (exists(".google_geocode_query_times", ggmap_environment)) {

    .google_geocode_query_times <- get(".google_geocode_query_times", envir = ggmap_environment)

    queries_used_in_last_second <- with(.google_geocode_query_times, sum(queries[time >= Sys.time() - 1L]))

    if (!override && (queries_used_in_last_second + queries_sought > google_second_limit())) Sys.sleep(.2) # can do better

    assign(
      ".google_geocode_query_times",
      bind_rows(.google_geocode_query_times, tibble("time" = Sys.time(), "url" = url_hash, "queries" = queries_sought)),
      envir = ggmap_environment
    )

  } else {

    assign(".google_geocode_query_times", tibble("time" = Sys.time(), "url" = url_hash, "queries" = queries_sought), envir = ggmap_environment)

  }

  invisible()

}










#' @export
#' @rdname geocode
geocodeQueryCheck <- function () {

  .Deprecated(msg = "As of mid-2018, Google no longer has daily query limits.")
  queries <- NA; rm(queries)

  if (exists(".google_geocode_query_times", ggmap_environment)) {

    .google_geocode_query_times <- get(".google_geocode_query_times", ggmap_environment)

    google_geocode_queries_in_last_24hrs <-
      .google_geocode_query_times %>%
        dplyr::filter(time >= Sys.time() - 24L*60L*60L) %>%
        dplyr::select(queries) %>%
        sum()

  	remaining <- google_day_limit() - google_geocode_queries_in_last_24hrs
    cli::cli_alert_info("{remaining} Google geocoding queries remaining.")

  } else {

  	remaining <- google_day_limit()
  	cli::cli_alert_info("{remaining} Google geocoding queries remaining.")

  }

  invisible(remaining)
}






#' @export
#' @rdname geocode
geocode_cache <- function () {

  if (!exists(".geocode_cache", envir = ggmap_environment)) {
    assign(".geocode_cache", list(), ggmap_environment)
  }

  get(".geocode_cache", envir = ggmap_environment)

}







cache_geocoded_info <- function (url_hash, data) {

  if (!exists(".geocode_cache", envir = ggmap_environment)) assign(".geocode_cache", list(), ggmap_environment)

  assign(
    ".geocode_cache",
    c(geocode_cache(), structure(list(data), names = url_hash)),
    envir = ggmap_environment
  )

  invisible()

}








location_is_cached <- function (url_hash) {
  if (!exists(".geocode_cache", envir = ggmap_environment)) return(FALSE)
  if (url_hash %notin% names(geocode_cache())) return(FALSE)
  TRUE
}










return_failed_geocode <- function (output) {
  if (output == "latlon") {
    return(tibble("lon" = NA_real_, "lat" = NA_real_))
  } else if (output == "latlona") {
    return(tibble("lon" = NA_real_, "lat" = NA_real_, "address" = NA_character_))
  } else if (output == "more") {
    return(tibble(
      "lon" = NA_real_, "lat" = NA_real_, "type" = NA_character_, "address" = NA_character_,
      "north" = NA_real_, "south" = NA_real_, "east" = NA_real_, "west" = NA_real_
    ))
  } else {
    return(NA)
  }
}









#' @export
#' @rdname geocode
write_geocode_cache <- function (path, ...) {

  saveRDS(
    object = geocode_cache(),
    file = path,
    ...
  )

}





#' @export
#' @rdname geocode
load_geocode_cache <- function(path, overwrite = FALSE) {

  if (!exists(".geocode_cache", envir = ggmap_environment)) {
    assign(".geocode_cache", list(), ggmap_environment)
  }

  if (overwrite) {
    assign(".geocode_cache", readRDS(path), ggmap_environment)
  } else {
    assign(
      ".geocode_cache",
      c(geocode_cache(), readRDS(path)),
      ggmap_environment
    )
  }

}






#' @export
#' @rdname geocode
clear_geocode_cache <- function(path) {

  assign(".geocode_cache", list(), ggmap_environment)

}
dkahle/ggmap documentation built on March 3, 2024, 8:19 a.m.