R/geocode_combine.R

Defines functions geocode_combine geo_combine get_global_params_parameter_documentation get_queries_parameter_documentation

Documented in geocode_combine geo_combine

get_queries_parameter_documentation <- function() {
  return(c(
    "a list of queries, each provided as a list of parameters. The queries are",
    "executed by the [geocode] function in the order provided.",
    "(ex. `list(list(method = 'osm'), list(method = 'census'), ...)`)"
  ))
}

get_global_params_parameter_documentation <- function() {
  return(c(
    "a list of parameters to be used for all queries",
    "(ex. `list(address = 'address', full_results = TRUE)`)"
  ))
}


#' Combine multiple geocoding queries
#'
#' @description Passes address inputs in character vector form to the
#'  [geocode_combine] function for geocoding.
#'
#'  Note that address inputs must be specified for queries either with the `queries` parameter (for each query)
#'  or the `global_params` parameter (for all queries). For example `global_params = list(address = 'address')`
#'  passes addresses provided in the `address` parameter to all queries.
#'
#' @param queries `r get_queries_parameter_documentation()`
#' @param global_params `r get_global_params_parameter_documentation()`
#' @inheritParams geo
#' @param ... arguments passed to the [geocode_combine] function
#' @inherit geo return
#' @examples
#' \donttest{
#'
#' options(tidygeocoder.progress_bar = FALSE)
#' example_addresses <- c("100 Main St New York, NY", "Paris", "Not a Real Address")
#'
#' geo_combine(
#'   queries = list(
#'     list(method = "census"),
#'     list(method = "osm")
#'   ),
#'   address = example_addresses,
#'   global_params = list(address = "address")
#' )
#'
#' geo_combine(
#'   queries = list(
#'     list(method = "arcgis"),
#'     list(method = "census", mode = "single"),
#'     list(method = "census", mode = "batch")
#'   ),
#'   global_params = list(address = "address"),
#'   address = example_addresses,
#'   cascade = FALSE,
#'   return_list = TRUE
#' )
#'
#' geo_combine(
#'   queries = list(
#'     list(method = "arcgis", address = "city"),
#'     list(method = "osm", city = "city", country = "country")
#'   ),
#'   city = c("Tokyo", "New York"),
#'   country = c("Japan", "United States"),
#'   cascade = FALSE
#' )
#' }
#' @seealso [geocode_combine] [geo] [geocode]
#' @export
geo_combine <- function(queries, global_params = list(), address = NULL,
                        street = NULL, city = NULL, county = NULL, state = NULL, postalcode = NULL,
                        country = NULL, lat = lat, long = long, ...) {

  # NSE - converts lat and long parameters to character values
  lat <- rm_quote(deparse(substitute(lat)))
  long <- rm_quote(deparse(substitute(long)))

  # Check address arguments -------------------------------------
  address_pack <- package_addresses(
    address, street, city, county,
    state, postalcode, country
  )

  # prepare data for geocode_combine() function -----------------
  input_df <- tibble::tibble(
    address = address,
    street = street, city = city, county = county, state = state, postalcode = postalcode, country = country
  )

  # pass arguments to geocode_combine() as a list. lat and long arguments did not work when passed directly
  arguments_to_pass <- c(
    list(
      .tbl = input_df, queries = queries, global_params = global_params, lat = lat, long = long
    ),
    list(...)
  )

  return(
    do.call(geocode_combine, arguments_to_pass)
  )
}


#' Combine multiple geocoding queries
#'
#' @description Executes multiple geocoding queries on a dataframe input and combines
#'  the results. To use a character vector input instead, see the [geo_combine] function.
#'  Queries are executed by the [geocode] function. See example usage
#'  in `vignette("tidygeocoder")`.
#'
#'  Query results are by default labelled to show which query produced each result. Labels are either
#'  placed in a `query` column (if `return_list = FALSE`) or used as the names of the returned list
#'  (if `return_list = TRUE`). By default the `method` parameter value of each query is used as a query label.
#'  If the same `method` is used in multiple queries then a number is added according
#'  to the order of the queries (ie. `osm1`, `osm2`, ...). To provide your own custom query labels
#'  use the `query_names` parameter.
#'
#' @param queries `r get_queries_parameter_documentation()`
#' @param global_params `r get_global_params_parameter_documentation()`
#' @param return_list if TRUE then results from each service will be returned as separate
#'   dataframes. If FALSE (default) then all results will be combined into a single dataframe.
#' @param cascade if TRUE (default) then only addresses that are not found by a geocoding
#'   service will be attempted by subsequent queries. If FALSE then all queries will
#'   attempt to geocode all addresses.
#' @param query_names optional vector with one label for each query provided
#'   (ex. `c('geocodio batch', 'geocodio single')`).
#' @inheritParams geocode
#' @inherit geo return
#' @examples
#' \donttest{
#'
#' library(dplyr, warn.conflicts = FALSE)
#'
#' sample_addresses %>%
#'   geocode_combine(
#'     queries = list(list(method = "census"), list(method = "osm")),
#'     global_params = list(address = "addr"), cascade = TRUE
#'   )
#'
#' more_addresses <- tibble::tribble(
#'   ~street_address, ~city, ~state, ~zip_cd,
#'   "624 W DAVIS ST #1D", "BURLINGTON", "NC", 27215,
#'   "201 E CENTER ST #268", "MEBANE", "NC", 27302,
#'   "100 Wall Street", "New York", "NY", 10005,
#'   "Bucharest", NA, NA, NA
#' )
#'
#' more_addresses %>%
#'   geocode_combine(
#'     queries = list(
#'       list(method = "census", mode = "batch"),
#'       list(method = "census", mode = "single"),
#'       list(method = "osm")
#'     ),
#'     global_params = list(
#'       street = "street_address",
#'       city = "city", state = "state", postalcode = "zip_cd"
#'     ),
#'     query_names = c("census batch", "census single", "osm")
#'   )
#'
#' more_addresses %>%
#'   geocode_combine(
#'     queries = list(
#'       list(
#'         method = "census", mode = "batch", street = "street_address",
#'         city = "city", state = "state", postalcode = "zip_cd"
#'       ),
#'       list(method = "arcgis", address = "street_address")
#'     ),
#'     cascade = FALSE,
#'     return_list = TRUE
#'   )
#' }
#' @seealso [geo_combine] [geo] [geocode]
#' @export
geocode_combine <-
  function(.tbl,
           queries,
           global_params = list(),
           return_list = FALSE,
           cascade = TRUE,
           query_names = NULL,
           lat = "lat",
           long = "long") {

    # NSE - converts lat and long parameters to character values
    lat <- rm_quote(deparse(substitute(lat)))
    long <- rm_quote(deparse(substitute(long)))

    stopifnot(
      is.data.frame(.tbl),
      is.list(queries),
      # either queries is an empty list or contains other lists
      length(queries) == 0 || all(sapply(queries, is.list)),
      is.list(global_params), is.logical(return_list), is.logical(cascade),
      is.null(query_names) || is.character(query_names)
    )

    # combine all parameter lists into a list of lists
    all_param_lists <- queries
    all_param_lists[[length(queries) + 1]] <- global_params

    # get all parameter names
    all_param_names <- unlist(sapply(all_param_lists, names))

    # which columns are used to store addresses
    used_address_colnames <- unique(unlist(sapply(
      all_param_lists,
      function(x) unlist(x[names(x) %in% pkg.globals$address_arg_names], use.names = FALSE)
    )))

    if (cascade == TRUE) {
      for (query in all_param_lists) {
        if (!is.null(query[["return_input"]]) && query[["return_input"]] == FALSE) {
          stop("return_input must be set to TRUE for geocode_combine()", call. = FALSE)
        }
      }
    }

    # add global arguments to each query
    queries_prepped <- lapply(queries, function(x) {
      c(
        list(.tbl = .tbl, lat = lat, long = long),
        global_params,
        x
      )
    })

    # Set default query names and check user input query names
    if (is.null(query_names)) {
      # default query names to the method arguments and fill in 'osm' if the method argument isn't provided
      query_names <- unlist(lapply(queries_prepped, function(q) if (!is.null(q[["method"]])) q[["method"]] else "osm"))

      # number duplicate query names if necessary (to prevent duplicates)
      # ie. 'osm1', 'osm2', etc.
      for (name in unique(query_names)) {
        # if the given name occurs more than once in query_names then iterate through and add numbers
        if ((sum(query_names == name)) > 1) {
          i <- 1
          dup_num <- 1
          for (i in 1:length(query_names)) {
            if (query_names[[i]] == name) {
              query_names[[i]] <- paste0(query_names[[i]], as.character(dup_num), collapse = "")
              dup_num <- dup_num + 1
            }
          }
        }
      }
    } else {
      if (length(query_names) != length(queries)) {
        stop("query_names parameter must contain one name per query provided. See ?geocode_combine")
      }

      if (any(duplicated(query_names)) == TRUE) {
        stop("query_names values should be unique. See ?geocode_combine")
      }

      if (any(trimws(query_names) == "")) {
        stop("query_names values should not be blank. See ?geocode_combine")
      }
    }

    # Sanity check all queries (ie. make sure no queries have a mistyped argument, etc.)
    for (query in queries_prepped) {
      query[["no_query"]] <- TRUE

      tryCatch(
        expr = {
          suppressMessages(do.call(geocode, query))
        },
        error = function(e) {
          message("The following error was produced:\n")
          message(e)

          message("\n\nBy these query parameters:\n")
          for (name in names(query)) {
            # don't display .tbl parameter for now
            if (name != ".tbl") message(paste0(name, " = ", query[[name]]))
          }
        },
        finally = {
          message("")
        }
      )
    }

    # iterate through the queries (list of lists) and execute each query
    # aggregate results in list object
    all_results <- list()
    not_found <- tibble::tibble()
    for (i in 1:length(queries_prepped)) {
      query <- queries_prepped[[i]]
      if (cascade == TRUE) {
        # adjust the input dataframe based on which addresses were not found in prior query
        if (nrow(not_found) != 0) {
          query[[".tbl"]] <- not_found
        } else if (i != 1) {
          break # break loop - all addresses are geocoded
        }
      }

      # use geocode() to execute the query
      result <- do.call(geocode, query)
      na_indices <- is.na(result[[lat]]) | is.na(result[[long]])

      # which addresses were not found
      not_found <- result[na_indices, intersect(colnames(result), colnames(.tbl))]
      found <- result[!is.na(result[[lat]]) & !is.na(result[[long]]), ]

      # aggregate results. if using cascade then separate the not-found addresses
      all_results <- if (cascade == TRUE) c(all_results, list(found)) else c(all_results, list(result))
    }

    names(all_results) <- query_names[1:length(all_results)]

    # if there are addresses that no method found then in cascade then
    # separate them into their own list item
    if (cascade == TRUE) {
      if (nrow(not_found) != 0) all_results[[length(all_results) + 1]] <- not_found
    }

    # bind all results into one dataframe if return_list == FALSE
    # otherwise return list
    if (return_list == TRUE) {
      return(all_results)
    } else {

      # label the dataframes contained in the all_results list with a 'query' column
      all_results_labeled <- mapply(function(x, y) dplyr::bind_cols(x, tibble::tibble(query = y)),
        all_results, names(all_results),
        SIMPLIFY = FALSE
      )

      # put all results into a single dataframe
      bound_data <- dplyr::bind_rows(all_results_labeled)

      # remove .id column if it is present
      bound_data <- bound_data[!names(bound_data) %in% ".id"]

      # reorder the dataset to match the order it was received in before returning it
      proper_order <- unique(.tbl[used_address_colnames])
      proper_order[[".id"]] <- 1:nrow(proper_order)

      # join to get our .id column so we can sort the output dataset
      bound_data_joined <- dplyr::left_join(bound_data, proper_order, by = used_address_colnames)

      # sort the dataset
      bound_data_joined <- bound_data_joined[order(bound_data_joined[[".id"]]), ]

      # remove .id column before returning the dataset
      return(bound_data_joined[!names(bound_data_joined) %in% ".id"])
    }
  }
jessecambon/tidygeocoder documentation built on Jan. 26, 2023, 4:03 p.m.