R/get_hilife.R

#' Get HiLife store list
#' 
#' Scrapes the Hi-Life convenience-store list. Its store-inquiry page uses
#' essentially the same provider as FamilyMart's, so the scraping flow is
#' largely the same.
#'  
#' @seealso
#' url: \url{http://www.hilife.com.tw/storeInquiry_name.aspx} \cr
#' data: \url{http://www.hilife.com.tw/storeInquiry_name.aspx}
#' 
#' @param max_iter maximum number of attempts for the store-list POST request
#' 
#' @return data.table of Hi-Life stores (name, address, phone, coordinates)
#' @import httr
#' @import rvest
#' @import data.table
#' @import stringr
#' @import pbapply
#' @importFrom magrittr %>%
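#' 
#' @examples
#' \dontrun{
#' ## minimal usage sketch; hits the live Hi-Life site, so network access is required
#' hilife_dt <- get_hilife(max_iter = 3)
#' hilife_dt[, .(full_nm, addr, lon_x, lat_y)]
#' }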
#' @export
get_hilife <- function(max_iter = 5) {
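  ## `ua` below is assumed to be a package-level httr::user_agent() object
  ## shared by the other crawlers in this package.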
  url = "http://www.hilife.com.tw/storeInquiry_name.aspx"
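
  ## The store-inquiry page is an ASP.NET WebForm: the hidden __VIEWSTATE,
  ## __VIEWSTATEGENERATOR and __EVENTVALIDATION tokens are scraped from an
  ## initial GET and echoed back in the search POST below.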
  
  doc_init <- GET(url, ua) %>% content()
  VS <- doc_init %>% html_node("#__VIEWSTATE") %>% html_attr("value")
  VS_GEN <- doc_init %>% html_node("#__VIEWSTATEGENERATOR") %>% html_attr("value")
  EVENTVALIDATION <- doc_init %>% html_node("#__EVENTVALIDATION") %>% html_attr("value")
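
  ## Retry the search POST up to `max_iter` times; transient failures are logged
  ## and only the final failure aborts the call.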
  
  for (i in 1:max_iter) {
    tryCatch({
      res <- POST(url,
                  ua,
                  add_headers(
                    Referer = "http://www.hilife.com.tw/storeInquiry_name.aspx"
                  ),
                  body = list(
                    `__VIEWSTATE` = VS,
                    `__EVENTVALIDATION` = EVENTVALIDATION,
                    `__VIEWSTATEGENERATOR` = VS_GEN,
                    btnQuery = "+搜++尋+",
                    txtKeyWord = "%"),
                  encode = "form")
      break
    }, error = function(e) {
      futile.logger::flog.error(conditionMessage(e))
      if (i == max_iter) {
        stop("Max try exceeded - ", conditionMessage(e), call. = FALSE)
      }
    })
  }
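
  ## Parse the search-results table; the unused X1 and X5 columns are dropped,
  ## leaving store name, address and phone.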
  
  res_xml <- content(res)
  out_dt <- res_xml %>% 
    html_nodes("div.searchResults > table") %>% 
    html_table() %>% 
    as.data.table() %>% 
    .[, c("X1", "X5") := NULL]
  setnames(out_dt, c("store_nm", "addr", "tel_no"))
  
  ## data cleansing: drop rows with a missing store name, then de-duplicate by name
  out_dt <- na.omit(out_dt, "store_nm") %>%
    unique(by = "store_nm")
  
  ## Get GIS coordinates from the Hi-Life getGoogleSpot.ashx endpoint
  hilife_gis <- function(x) {
    # x: numeric shop id sent to the endpoint
    res_google <- POST("http://www.hilife.com.tw/getGoogleSpot.ashx",
                       ua,
                       body = list(city_name="", town_name="",shop_id = x),
                       encode = "form")
    
    out <- data.table() # pre-allocate
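
    ## A successful response is JSON whose Chinese field names (經度/緯度/名稱/說明)
    ## correspond to longitude, latitude, store name and an HTML description blob.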
    
    if (!http_error(res_google)) {
      out <- res_google %>% 
        content("text", encoding = "UTF-8") %>% 
        jsonlite::fromJSON() %>% 
        as.data.table()
    } else {
      return(out)
    }
    
    if (length(out)) {
      setnames(out,
               c("經度", "緯度", "名稱", "說明"),
               c("lon_x", "lat_y", "store_nm", "misc"))
      out[, store_nm := store_nm %>% 
            str_replace("^萊爾富\\s*[A-Z0-9]+\\s+", "")]
      
      misc_html <- xml2::read_html(out[,misc])
      out[, addr := misc_html %>%
            html_text() %>%
            str_match("地址:(.+?)(?:電話:\\d+[-]\\d+)?\\s*$") %>%
            .[, 2]]
      out[, tel_no := misc_html %>%
            html_text() %>%
            str_match("電話:(\\d+[-]?\\d+)\\s*$") %>%
            .[, 2]]
      out[, misc := misc_html %>%
            html_nodes("img") %>%
            html_attr("title") %>%
            na.omit() %>%
            paste0(., collapse = ";")]
    }

    out[]  # return the (possibly empty) result explicitly
  }
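
  ## Shop ids on the getGoogleSpot.ashx endpoint appear to be sequential integers,
  ## so iterate 1..nrow(out_dt) and fetch coordinates for each id with a progress bar.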
  
  result_gps_list <- pblapply(
    seq_along(out_dt[, store_nm]),
    hilife_gis
  )
  
  result_gps <- result_gps_list %>% 
    rbindlist(use.names = TRUE, fill = TRUE) %>% 
    unique(by = "store_nm") %>% 
    na.omit("store_nm")
  
  ## The GIS results already carry store name, address, phone and coordinates,
  ## so they replace the table scraped above; the join is kept only for reference.
  out_dt <- result_gps
  # out_dt[result_gps, on = "store_nm"]

  # add brand name
  out_dt[, `:=`(brand_nm = "萊爾富", keyword = "HiLife")]
  
  ## add url, time, full name
  out_dt[, store_url := "http://www.hilife.com.tw"][
    , data_time := format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")][
      , full_nm := paste0(brand_nm, store_nm)]
  
  key_var <- c("full_nm", "brand_nm", "keyword", "store_nm", "addr",
               "lon_x", "lat_y", "store_url")
               
  ## move the key columns (full_nm, brand_nm, keyword, ...) to the front
  setcolorder(out_dt, c(key_var, setdiff(names(out_dt), key_var)))
  
  out_dt[]
}