# draft/getSoNet.R

#' Scrape the SO-NET store list
#'
#' Parses the SO-NET "place" index page, follows every link found in its
#' `NAV` list, and extracts the store name, phone number and address cells
#' from each linked page.
#'
#' Relies on `htmlParse()`, `xpathSApply()`, `xmlValue()` and `xmlGetAttr()`
#' and on `str_replace_all()` being in scope (presumably the XML and stringr
#' packages, attached or imported elsewhere -- confirm against the package
#' imports).
#'
#' @return A data.frame with character columns `brand_nm`, `store_nm`,
#'   `addr`, `tel_no` and `data_dt` (the run date formatted as `yyyymmdd`),
#'   one row per store; `NULL` when the index page lists no sub-pages.
getSoNet <- function() {
  index_url <- 'http://www.so-net.net.tw/place/index.html'

  # Collapse the text of every node matched by `path` to a whitespace-free
  # character vector. `unlist()` guards against `xpathSApply()` returning an
  # empty list when nothing matches.
  extract_text <- function(doc, path) {
    raw <- as.character(unlist(xpathSApply(doc, path, xmlValue)))
    str_replace_all(raw, '(\r|\n| )+', '')
  }

  index_doc <- htmlParse(index_url, encoding = 'big5')
  # Read only the href attribute: collecting every attribute (as xmlAttrs
  # does) would turn class/title/target values into bogus page URLs.
  page_url <- xpathSApply(
    index_doc, '//*[@id="NAV"]/ul/li/a[@href]', xmlGetAttr, 'href'
  )
  page_url <- as.character(unlist(page_url))
  # sprintf() yields character(0) for zero links, so the lapply() below is
  # simply skipped instead of requesting an NA URL.
  want_url <- sprintf('http://www.so-net.net.tw/place/%s', page_url)

  # Stamp every row with the same run date, even if the scrape spans midnight.
  data_dt <- format(Sys.Date(), '%Y%m%d')

  pages <- lapply(want_url, function(url) {
    doc <- htmlParse(url, encoding = 'big5')
    store_nm <- extract_text(
      doc, '//*[@class="BOX"]/table/tr/td[1][@class="LEFT2"]'
    )
    # The phone XPath has no class filter, so it matches one extra leading
    # cell (presumably the header row -- verify against the page markup);
    # drop it to line up with the name and address cells.
    tel_no <- extract_text(doc, '//*[@class="BOX"]/table/tr/td[2]')[-1]
    addr <- extract_text(
      doc, '//*[@class="BOX"]/table/tr/td[3][@class="ADD"]'
    )

    n_store <- length(store_nm)
    # Fail loudly rather than let data.frame() recycle and misalign columns.
    if (length(tel_no) != n_store || length(addr) != n_store) {
      stop(
        'Column length mismatch on ', url,
        ': store_nm=', n_store,
        ', tel_no=', length(tel_no),
        ', addr=', length(addr),
        call. = FALSE
      )
    }

    # rep() keeps the constant columns valid for pages with zero stores.
    data.frame(
      brand_nm = rep('SO-NET', n_store),
      store_nm = store_nm,
      addr = addr,
      tel_no = tel_no,
      data_dt = rep(data_dt, n_store),
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, pages)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.