# R/get_wellcome.R

#' Scrape all Wellcome supermarket (Taiwan) store locations
#'
#' Posts a wildcard store-name search to the Wellcome store locator, reads the
#' highest page number from the pagination links of the first result page,
#' fetches every page with \code{get_wellcome_()} and row-binds the results.
#'
#' The following columns are then added by reference: \code{brand_nm} (the
#' chain's Chinese brand name), \code{keyword} (\code{"Wellcome"}),
#' \code{store_url} (the brand's web site), \code{data_time} (time of the
#' crawl formatted as \code{"\%Y-\%m-\%dT\%H:\%M:\%S\%z"}) and \code{full_nm}
#' (\code{brand_nm} pasted in front of \code{store_nm}).
#'
#' @seealso
#' url: \url{http://www.wellcome.com.tw/CHT/HOME/Store} \cr
#' data: \url{http://www.wellcome.com.tw/CHT/Home/Search}
#'
#' @examples
#' \dontrun{
#' dt <- get_wellcome()
#' }
#'
#' @return A data.table with one row per store. The columns \code{full_nm},
#'   \code{brand_nm}, \code{keyword}, \code{store_nm}, \code{addr},
#'   \code{lon_x}, \code{lat_y} and \code{store_url} come first, followed by
#'   every remaining column. An error is raised when no page could be parsed.
#' @export
get_wellcome <- function() {
  url <- "http://www.wellcome.com.tw/CHT/Home/Search"

  # First request is only used to discover how many result pages exist.
  doc1 <- POST(
    url,
    body = list(
      strCity = "",
      strArea = "",
      strRoad = "",
      strStoreName = "%",
      strRoadSearch = "",
      strBusinessHr = "0",
      page = "1",
      flag = "2",
      strLanguag = "CHT"
    )
  ) %>%
    content()

  page_nums <- doc1 %>%
    html_nodes("#divPaging li a") %>%
    html_attr("datapage") %>%
    as.integer()
  page_nums <- page_nums[!is.na(page_nums)]

  # When no pagination links are present, still fetch the first page instead
  # of failing on max() of an empty vector.
  max_page <- max(1L, page_nums)

  out_list <- lapply(seq_len(max_page), get_wellcome_)
  # rbindlist() skips NULL entries, i.e. pages that could not be parsed.
  out_dt <- rbindlist(out_list)

  if (ncol(out_dt) == 0L) {
    stop("No store data could be parsed from ", url, call. = FALSE)
  }

  # Add brand name, brand web site and crawl time (constant for every row).
  out_dt[, `:=`(
    brand_nm = "頂好",
    keyword = "Wellcome",
    store_url = "http://www.wellcome.com.tw/",
    data_time = format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
  )]
  out_dt[, full_nm := paste0(brand_nm, store_nm)]

  # Move the key columns to the front; the remaining columns keep their order.
  key_var <- c(
    "full_nm", "brand_nm", "keyword", "store_nm", "addr",
    "lon_x", "lat_y", "store_url"
  )
  setcolorder(out_dt, c(key_var, setdiff(names(out_dt), key_var)))

  out_dt
}


#' Fetch and parse one result page of the Wellcome store search
#'
#' Posts the wildcard store search for the requested page and extracts one row
#' per store from the returned document.
#'
#' @param page Page number to request; only the first element is used and it
#'   is coerced to integer.
#'
#' @return A data.table with the character columns \code{store_nm},
#'   \code{addr}, \code{lat_y}, \code{lon_x}, \code{tel_no} and
#'   \code{work_time}, or \code{NULL} (with a warning) when the page could not
#'   be parsed into a table. Errors raised by the HTTP request itself are not
#'   caught.
#' @noRd
get_wellcome_ <- function(page) {
  page <- as.integer(page[1])

  url <- "http://www.wellcome.com.tw/CHT/Home/Search"
  doc <- POST(
    url,
    body = list(
      strCity = "",
      strArea = "",
      strRoad = "",
      strStoreName = "%",
      strRoadSearch = "",
      strBusinessHr = "0",
      page = as.character(page),
      flag = "2",
      strLanguag = "CHT"
    )
  ) %>%
    content()

  # Return the value of tryCatch() directly: assigning from inside the handler
  # with `<<-` would create a variable in the global environment.
  tryCatch(
    {
      address_nodes <- doc %>% html_nodes(".result address")

      # Both coordinates are captured from the "q=<lat>%2c<lon>" part of the
      # onclick attribute; column 2 is the first number, column 3 the second.
      coords <- address_nodes %>%
        html_attr("onclick") %>%
        str_match("(?:q=[+-]?)([0-9.]+)(?:%2c[+-]?)([0-9.]+)")

      data.table(
        store_nm = doc %>%
          html_nodes(".result dt") %>%
          html_text(),
        # Text fields keep only what follows the first ":" of the node text.
        addr = address_nodes %>%
          html_text() %>%
          str_extract("(?<=:).*"),
        lat_y = coords[, 2],
        lon_x = coords[, 3],
        tel_no = doc %>%
          html_nodes(".result dd.tel") %>%
          html_text() %>%
          str_extract("(?<=:).*"),
        work_time = doc %>%
          html_nodes(".result time") %>%
          html_text() %>%
          str_extract("(?<=:).*")
      )
    },
    error = function(e) {
      warning(
        "Failed to parse Wellcome store page ", page, ": ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
# leoluyi/address_crawler documentation built on May 21, 2019, 5:09 a.m.