# draft/getWoodpeckers.R

# Scrape the Woodpeckers pharmacy store list from woodpeckers.com.tw.
#
# Reads the location index page, follows every location link found there and
# extracts the store name, telephone number and address from the detail
# blocks of each linked page.
#
# Returns a data.frame with character columns brand_nm, store_nm, tel_no and
# addr, one row per detail block. When the index page yields no links, a
# zero-row data.frame with the same columns is returned.
#
# NOTE(review): HTTP status codes are not checked here; a failed request is
# parsed like any other response body. Confirm whether that is intended.
getWoodpeckers <- function(){
  brand      <- '啄木鳥藥局'
  detail_css <- "body > div.content_area > div.right_area > div.location_item_area > div.detail_area"

  # Download a URL and parse the response body as an HTML document.
  fetch_html <- function(page_url) {
    res <- GET(page_url)
    res <- content(res, 'text', encoding = 'utf8')
    htmlParse(res, encoding = "utf8")
  }

  # Zero-row skeleton so that empty pages / an empty index still produce a
  # result with the expected columns.
  empty_result <- data.frame(brand_nm = character(0),
                             store_nm = character(0),
                             tel_no   = character(0),
                             addr     = character(0),
                             stringsAsFactors = FALSE)

  # 1. Collect the link of every location page from the index page.
  html_link <- fetch_html('http://woodpeckers.com.tw/info/location')
  link <- cssApply(html_link, '.location_button_item> h3 > a', cssLink)
  if (length(link) == 0) {
    return(empty_result)
  }
  # Convert to UTF-8 first, then URL-encode; after encoding, any '-' still
  # has to be rewritten as %2d (done per link below).
  link <- toUTF8(link)

  # 2. Fetch each location page and extract its store records.
  scrape_page <- function(page_link) {
    page_url <- str_replace_all(URLencode(page_link), '-', '%2d')
    html <- fetch_html(page_url)
    data <- cssApply(html, detail_css, cssCharacter)
    if (length(data) == 0) {
      # No detail blocks on this page: contribute no rows instead of failing
      # in data.frame() on mismatched column lengths.
      return(empty_result)
    }

    # Strip all whitespace so the field labels can be used as delimiters.
    data <- str_replace_all(data, '\r|\n|\t|[:space:]', '')
    store_nm <- str_replace_all(str_extract(data, '.+(電話)'), '電話', '')
    tel_no <- str_replace_all(str_extract(data, '電話.+門市地址'),
                              '電話:|門市地址', '')
    addr <- str_replace_all(str_extract(data, '門市地址.+(門市地圖|營業時間)'),
                            '門市地址:|門市地圖|營業時間', '')
    # Drop the parenthesised remark inside the address.
    addr <- str_replace(addr, '\\(.+\\)', '')

    data.frame(brand_nm = rep(brand, length(data)),
               store_nm = store_nm,
               tel_no   = tel_no,
               addr     = addr,
               stringsAsFactors = FALSE)
  }

  pages <- lapply(seq_along(link), function(i) scrape_page(link[i]))
  do.call(rbind, pages)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.