# draft/getHongya.R

#' Scrape the Hongya store list
#'
#' Reads the total record count from the `page_box` element of the first
#' listing page, derives the number of listing pages (16 records per page),
#' and collects columns 2-4 of the third HTML table found on every page.
#'
#' Relies on `GET`/`content`, `htmlParse`/`xpathSApply`/`xmlValue`/
#' `readHTMLTable` and `str_extract`/`str_replace_all` being in scope
#' (presumably from httr, XML and stringr -- confirm against the package
#' imports).
#'
#' @return A data frame with columns `brand_nm` (the constant "hongya"),
#'   `store_nm`, `tel_no` and `addr`. Has zero rows when the site reports no
#'   records or no page contains the expected table.
getHongya <- function(){
  
  url_template <- 'http://www.hongya88.com.tw/place/?page=%s&parent_id=1199'
  rows_per_page <- 16
  col_names <- c('brand_nm','store_nm','tel_no', 'addr')
  
  # Result returned when there is nothing to scrape.
  empty_result <- data.frame(
    brand_nm = character(0),
    store_nm = character(0),
    tel_no = character(0),
    addr = character(0),
    stringsAsFactors = FALSE
  )
  
  # Download one listing page and parse it into an HTML document.
  fetch_page <- function(page_no) {
    res <- GET(sprintf(url_template, page_no))
    res <- content(res,'text', encoding = 'utf8')
    htmlParse(res, encoding = "utf8")
  }
  
  first_page <- fetch_page(1)
  
  # The pager text contains the total record count between the two markers.
  page_box <- xpathSApply(first_page, '//*[@id="page_box"]', xmlValue)
  if (length(page_box) == 0) {
    stop("Element 'page_box' not found on ", sprintf(url_template, 1),
         call. = FALSE)
  }
  total <- str_replace_all(str_extract(page_box, '共.+筆'),'共 | 筆','')
  # ceiling() rounds up so that a partially filled last page is still fetched.
  n_pages <- ceiling(as.numeric(total) / rows_per_page)[1]
  if (is.na(n_pages)) {
    stop("Could not read the record count from 'page_box' on ",
         sprintf(url_template, 1), call. = FALSE)
  }
  
  # seq_len() yields an empty sequence for 0 pages, unlike 1:0.
  page_rows <- lapply(seq_len(n_pages), function(i) {
    # Page 1 was already downloaded to read the record count; reuse it.
    html <- if (i == 1) first_page else fetch_page(i)
    tables <- readHTMLTable(html)
    # `[3][[1]]` gives NULL instead of an error when fewer than 3 tables exist.
    store_table <- tables[3][[1]]
    if (is.null(store_table)) {
      return(NULL)
    }
    # Drop the first row (presumably a header row -- confirm) and keep the
    # three columns that become store_nm, tel_no and addr.
    store_table[-1, 2:4]
  })
  page_rows <- Filter(Negate(is.null), page_rows)
  
  if (length(page_rows) == 0) {
    return(empty_result)
  }
  
  data_fin <- do.call(rbind, page_rows)
  data_fin <- cbind('hongya', data_fin )
  names(data_fin) <- col_names
  data_fin
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.