# R/get_twrail.R

#' Scrape the Taiwan Railways (臺鐵) listing pages and geocode the addresses
#'
#' Downloads the index page, follows every link found under \code{#Table3},
#' reads the \code{table[width="96\%"]} table on each linked page (name,
#' address, telephone), cleans the name and address columns, geocodes the
#' addresses with \code{geocode()}, and returns all entries in one table.
#'
#' @details
#' Cleaning applied to each scraped table:
#' \itemize{
#'   \item \code{store_nm} is trimmed and cut at the first \code{"("}.
#'   \item \code{addr} loses an optional leading part ending in \code{":"}
#'     and is cut at the first carriage return.
#'   \item \code{tel_no} is coerced to character.
#' }
#' Each distinct address is geocoded once and the coordinates are joined
#' back to the entries by \code{addr}.
#'
#' The function performs HTTP requests and stops with an error when a
#' request fails or when the index page contains no section links.
#'
#' @seealso
#' url: \url{http://www.railway.gov.tw/tw/CP.aspx?sn=3606&n=6829}
#'
#' @examples
#' \dontrun{
#' dt <- get_twrail()
#' }
#'
#' @return A \code{data.table} with one row per scraped entry. Columns, in
#'   order: \code{full_nm}, \code{brand_nm}, \code{keyword}, \code{store_nm},
#'   \code{addr}, \code{lon_x}, \code{lat_y}, \code{store_url}, followed by
#'   the remaining columns (\code{tel_no}, \code{data_time}).
#'
#' @import geocode
#' @export
get_twrail <- function() {
  index_url <- "http://www.railway.gov.tw/tw/CP.aspx?sn=3606&n=6829"

  res <- GET(index_url)
  httr::stop_for_status(res)
  hrefs <- res %>%
    content() %>%
    html_nodes("#Table3 a") %>%
    html_attr("href")
  # Anchors without an href yield NA, which paste() would turn into ".../NA".
  hrefs <- hrefs[!is.na(hrefs)]
  # paste() recycles a zero-length vector into the bare base URL, so fail
  # early with a clear message instead of scraping the wrong page.
  if (length(hrefs) == 0L) {
    stop("No section links found under '#Table3' at ", index_url,
         call. = FALSE)
  }
  rail_sections <- paste("http://www.railway.gov.tw/tw", hrefs, sep = "/")

  # Fetch one section page and return its cleaned three-column table.
  scrape_section <- function(url) {
    res2 <- GET(url)
    httr::stop_for_status(res2)
    tbl <- res2 %>%
      content(encoding = "UTF-8") %>%
      html_node('table[width="96%"]') %>%
      html_table() %>%
      data.table() %>%
      setnames(c("store_nm", "addr", "tel_no"))
    # Keep only the part of the name before the first "(".
    tbl[, store_nm := store_nm %>%
          str_trim() %>%
          str_extract("^[^(]+")]
    # Drop an optional "...:" prefix and anything after a carriage return.
    tbl[, addr := addr %>% str_match("(?:.+:)?([^\r]*)") %>% .[, 2]]
    # html_table() may guess a non-character type; keep the column type
    # identical across sections so the tables bind cleanly.
    tbl[, tel_no := as.character(tel_no)]
    tbl
  }

  # Collect every section first and bind once instead of growing a table
  # inside a loop.
  tbl_all <- rbindlist(lapply(rail_sections, scrape_section))

  # detectCores() can return NA or 1; always request at least one worker.
  n_cpu <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)

  # Geocode each distinct address once and keep one coordinate row per
  # address, so the join below cannot duplicate entries that share an address.
  addrs <- unique(tbl_all[, addr]) %>%
    geocode(n_cpu = n_cpu) %>%
    .[, .(addr, lat_y = lat, lon_x = lng)] %>%
    unique(by = "addr")
  out_dt <- addrs[tbl_all, on = .(addr)]

  # Add brand name, search keyword, source URL and scrape timestamp.
  out_dt[, `:=`(
    brand_nm = "臺鐵",
    keyword = "台鐵",
    store_url = index_url,
    data_time = format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
  )]
  out_dt[, full_nm := paste0(brand_nm, store_nm)]

  # Move the key columns to the front; the remaining columns keep their order.
  key_var <- c("full_nm", "brand_nm", "keyword", "store_nm", "addr",
               "lon_x", "lat_y", "store_url")
  setcolorder(out_dt, c(key_var, setdiff(names(out_dt), key_var)))

  out_dt
}
# leoluyi/address_crawler documentation built on May 21, 2019, 5:09 a.m.