R/get_rtmart.R

#' 大潤發
#' 
#' @seealso 
#' url: \url{http://official.rt-mart.com.tw/store_new.asp} \cr
#' data: \url{http://official.rt-mart.com.tw/store_info_new.asp?fno=&no=&id=12}
#' 
#' @importFrom geocode geocode
#' @examples
#' dt <- get_rtmart()
#' 
#' @return data.table
#' @export
get_rtmart <- function () {
  url <- "http://official.rt-mart.com.tw/store_new.asp"
  doc <- GET(url) %>% content("text") %>% read_html
  
  store_nm <- doc %>% html_nodes("#store_list li a") %>% html_text
  store_url <- doc %>% html_nodes("#store_list li a") %>% html_attr("href") %>% 
    paste("http://official.rt-mart.com.tw", ., sep = "/")
  
  store_url <- setNames(store_url, store_nm) %>% as.list
  store_url[c("總公司", "網路購物")] <- NULL
  out <- lapply(store_url, get_rtmart_)
  out_dt <- rbindlist(out)
  out_dt <- rbindlist(list(out_dt, list(store_nm = "基隆店", 
                                        addr = "基隆市安樂區安和二街15號",
                                        worktime = "09:00~22:00")))
  addrs <- out_dt[, addr] %>%
    geocode(n_cpu = parallel::detectCores()-1) %>% 
    .[, .(addr, lat_y = lat, lon_x = lng)]
  out_dt <- addrs[out_dt, on = .(addr)]
  
  # add brand name
  out_dt[, `:=`(brand_nm = "大潤發", keyword = "rtmart")]
  
  ## add url, time, full name
  out_dt[, store_url := "http://official.rt-mart.com.tw/store_new.asp"][
    , data_time := format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")][
      , full_nm := paste0(brand_nm, store_nm)]
  
  key_var <- c("full_nm", "brand_nm", "keyword", "store_nm", "addr",
               "lon_x", "lat_y", "store_url")
  ## move brand_nm, key_word to first two colmuns
  setcolorder(out_dt, c(key_var, setdiff(names(out_dt), key_var)))
  
  out_dt[]
}

get_rtmart_ <- function(url) {
  # url = "http://official.rt-mart.com.tw/store_info_new.asp?fno=&no=&id=12"
  doc <- GET(url) %>% content("text") %>% read_html
  
  tryCatch({
    out <- data.table(
      store_nm = doc %>% html_node("td.red") %>% html_text,
      addr = doc %>% html_node("tr > td:contains('地  址') ~ td ~ td") %>%
        html_text %>% str_extract("^[^\\[\\s]+"),
      worktime = doc %>% html_nodes("tr > td:contains('營業時間') ~ td ~ td") %>%
        html_text %>% str_trim
    )
  }, error = function(e) {
    out <<- NULL
  })
  
  out
}
leoluyi/address_crawler documentation built on May 21, 2019, 5:09 a.m.