#' Get GPS (lon, lat) from address
#'
#' @param addr Address string
#' @param source API source
#' @param parallel Parallel connection
#' @param n_cpu
#' @param rate Connection rate per second
#' @param use_tor Boolean. Default TRUE. Use tor or not.
#'
#' @describeIn geocode Get GPS from address vector
#' @return GPS data.table
#' @export
#' @import magrittr httr rvest data.table parallel pbapply
#'
#' @examples
#' addr <- c("台北市中正區羅斯福路一段2號",
#' "台北市中正區貴陽街一段120號")
#' geocode(addr)
geocode <- function(addr, precise = FALSE, source = "google",
parallel = TRUE, n_cpu = -1L,
rate = 200, use_tor = TRUE) {
# addr <- c("台北市中正區羅斯福路一段2號",
# "台北市中正區貴陽街一段120號")
if (!is.vector(addr)) {
stop("values must be character vecter")
}
n_oginial_addr <- length(addr)
addr <- unique(addr)
message(sprintf("%d unique address out of %d input",
length(addr), n_oginial_addr))
if (n_cpu != 1 && length(addr) >= 10) {
is_parallel <- TRUE
} else {
is_parallel <- FALSE
}
if (n_cpu == -1L) {n_cpu <- parallel::detectCores() - 1}
if (use_tor) message("(Use tor in crawling)");
if (is_parallel) {
message(sprintf("(Use %s clusters)", n_cpu))
cl <- parallel::makeCluster(n_cpu)
print(cl)
on.exit(parallel::stopCluster(cl))
worker.init <- function(packages) {
for (p in packages) {
library(p, character.only=TRUE)
}
invisible(NULL)
}
invisible(
parallel::clusterCall(cl, worker.init, c('magrittr', 'httr', 'rvest', 'data.table'))
)
out <- pbapply::pbsapply(addr, geocode_, rate = rate, use_tor = use_tor,
simplify = FALSE, USE.NAMES = TRUE,
cl = cl) %>%
rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
} else {
out <- pbapply::pbsapply(addr, geocode_, rate, use_tor = use_tor,
simplify = FALSE, USE.NAMES = TRUE) %>%
rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
}
# Fetch 2nd time
left <- out[is.na(lat), addr]
if (length(left) > 0) {
message(sprintf("Fetch 2nd time for left %d addr...", length(left)))
if (length(left) > 500 && !use_tor) {
use_tor <- TRUE
message(sprintf("Use tor for left %d data", length(left)))
}
if (is_parallel) {
message(sprintf("(Use %s clusters)", n_cpu))
temp <- left %>%
pbapply::pbsapply(geocode_, rate = rate, use_tor = use_tor,
simplify = FALSE, USE.NAMES = TRUE,
cl = cl) %>%
rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
} else {
temp <- left %>%
pbapply::pbsapply(., geocode_, rate = rate, use_tor = use_tor,
simplify = FALSE, USE.NAMES = TRUE) %>%
rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
}
out <- rbindlist(list(out[!is.na(lat),], temp), fill=TRUE, use.names = TRUE)
}
# # Fetch 3rd time (w/o tor)
# left <- out[is.na(lat), addr]
# if (length(left) > 500) {
# use_tor <- TRUE
# message(sprintf("Using tor for left %d data", length(left)))
# } else {
# use_tor <- FALSE
# }
# if (is_parallel) {
# temp <- left %>%
# pbapply::pbsapply(geocode_, rate = rate, use_tor = use_tor,
# simplify = FALSE, USE.NAMES = TRUE,
# cl = cl) %>%
# rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
# } else {
# temp <- left %>%
# sapply(., FUN = geocode_, rate = rate, use_tor = use_tor,
# simplify = FALSE, USE.NAMES = TRUE) %>%
# rbindlist(idcol = "addr", fill=TRUE, use.names = TRUE)
# }
# out <- rbindlist(list(out[!is.na(lat),], temp), fill=TRUE, use.names = TRUE)
to_clean <- setdiff(names(out), c("addr", "msg"))
out[msg == "Over limit", (to_clean) := NA]
if (precise) {
to_clean <- setdiff(names(out), "addr")
out[village == "", (to_clean) := NA]
}
out[]
}
geocode_ <- function(addr, rate=200, use_tor = FALSE, ...) {
# addr <- "台北市中正區羅斯福路一段2號"
## to be nice :)
Sys.sleep(rexp(1, rate)) # sleep expo dist at rate per sec
url <- "http://api.map.com.tw/net/GraphicsXY.aspx"
get_ <- function(use_tor, max_try = 3) {
if (use_tor) {
old <- set_config(use_proxy("socks5://localhost:9050"))
on.exit({
set_config(old, override = TRUE)
})
# set_config(verbose())
}
i <- 1
while (i <= max_try) {
err <- tryCatch({
res <- GET(url,
add_headers(
Referer = "http://www.map.com.tw/",
`User-Agent` = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
),
query = list(
search_class = "address",
SearchWord = addr %>% clean_addr,
searchkey = "D43A19151569F32A449B7EDCB8555165B68B5F95"))
break
}, error = function(e) {
i <<- i + 1
if (i == max_try) {
if (e$message == "Couldn't connect to server" && use_tor) {
warning("TOR connection may be faild. Restart TOR with `sudo killall tor; tor &`", call. = FALSE)
stop(e)
}
stop(e)
}
system("sudo killall tor; tor&")
res <<- NULL
invisible(e)
})
}
res
}
res <- get_(use_tor)
if (http_error(res)) {
return(
data.table(
lat = NA_character_,
lng = NA_character_,
no = NA_character_,
bheigh2 = NA_character_,
bcount2 = NA_character_,
baddr2 = NA_character_,
bname2 = NA_character_,
village = NA_character_,
road = NA_character_,
msg = NA_character_,
precision = NA_character_
)
)
}
out <- res %>%
content("text") %>%
stringr::str_replace_all('^\\(|\\)$', '') %>%
jsonlite::fromJSON() %>%
.[[1]] %>%
as.data.table
out[!is.na(lat),
addr_norm := paste0(baddr2, bname2, village, road)]
out[, c("lat", "lng") := lapply(.SD, as.numeric), .SDcols = c("lat", "lng")]
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.