# draft/getJeJoy.R

# Scrape the branch list published at je.joy.com.tw/mobile/branch_list.php.
#
# The function walks the site in three steps:
#   1. read the city names from the "select" drop-down of the landing page,
#   2. for every city, read the sub-area codes from the "select2" drop-down,
#   3. for every city / sub-area page, read store name, address and phone
#      number from the tables inside the element with class "wrap".
#
# Arguments:
#   delay  Seconds to wait after each branch page is fetched. Fetching too
#          frequently makes the site return errors; increase this value if
#          requests start failing. Defaults to 3, the previously hard-coded
#          value.
#
# Returns:
#   A data.frame with character columns brand_nm, store_nm, addr, tel_no and
#   data_dt (the scrape date as YYYYMMDD). A zero-row data.frame with the same
#   columns is returned when no branch could be found.
getJeJoy <- function(delay = 3) {
  stopifnot(is.numeric(delay), length(delay) == 1L, !is.na(delay), delay >= 0)

  base_url <- 'http://je.joy.com.tw/mobile/branch_list.php'

  # Download a page through httr and parse the body as UTF-8 HTML.
  fetch_page <- function(url) {
    resp <- GET(url)
    htmlParse(content(resp, 'text', encoding = 'utf8'), encoding = 'utf8')
  }

  # Return the "value" attribute of an <option> node, or NA when it has none.
  # Reading the attribute by name keeps other attributes (e.g. "selected")
  # from being mistaken for a sub-area code.
  option_value <- function(node) {
    attrs <- xmlAttrs(node)
    if (is.null(attrs) || !('value' %in% names(attrs))) {
      return(NA_character_)
    }
    attrs[['value']]
  }

  # Extract the city names; the first <option> is skipped, as before
  # (presumably the placeholder entry -- confirm against the page).
  index_page <- fetch_page(base_url)
  area_option <- as.character(unlist(
    xpathSApply(index_page, '//*[@id="select"]/option', xmlValue)
  ))
  area_option <- area_option[-1]
  area_option <- vapply(area_option, URLencode, character(1), USE.NAMES = FALSE)

  # Build one URL per city / sub-area combination. Cities without any usable
  # sub-area code contribute no URL at all, so no post-hoc filtering of
  # "bcode=selected" links is needed.
  wantURLs <- unlist(lapply(area_option, function(city) {
    city_page <- fetch_page(sprintf('%s?city=%s', base_url, city))
    bcodes <- as.character(unlist(
      xpathSApply(city_page, '//*[@id="select2"]/option', option_value)
    ))
    bcodes <- bcodes[-1]
    bcodes <- bcodes[!is.na(bcodes) & nzchar(bcodes)]
    if (length(bcodes) == 0L) {
      return(character(0))
    }
    sprintf('%s?city=%s&bcode=%s', base_url, city, bcodes)
  }), use.names = FALSE)

  # Computed once so that every row carries the same scrape date.
  data_dt <- format(Sys.Date(), '%Y%m%d')

  # Read all branches listed on a single page; NULL when the page has none.
  scrape_branch <- function(url) {
    page <- fetch_page(url)
    # Text of the second cell of table row `row` in every branch table.
    cell <- function(row) {
      xpath <- sprintf('//*[@class="wrap"]/table/tr[%d]/td[2]', row)
      as.character(unlist(xpathSApply(page, xpath, xmlValue)))
    }
    store_nm <- cell(1L)
    if (length(store_nm) == 0L) {
      return(NULL)
    }
    addr <- cell(2L)
    addr <- str_replace_all(addr, '(\r|\n| |地址:)+', '')
    # Drop anything from the first opening parenthesis onwards.
    addr <- str_replace_all(addr, '([(]).+$', '')
    tel_no <- cell(3L)
    tel_no <- str_replace_all(tel_no, '(\r|\n| |電話:)+', '')
    data.frame(
      brand_nm = '佳音英語',
      store_nm = store_nm,
      addr = addr,
      tel_no = tel_no,
      data_dt = data_dt,
      stringsAsFactors = FALSE
    )
  }

  # Fetch every branch page, pausing between requests to avoid being blocked.
  pages <- vector('list', length(wantURLs))
  for (i in seq_along(wantURLs)) {
    branch <- scrape_branch(wantURLs[i])
    if (!is.null(branch)) {
      pages[[i]] <- branch
    }
    Sys.sleep(delay)
  }
  pages <- Filter(Negate(is.null), pages)

  if (length(pages) == 0L) {
    return(data.frame(
      brand_nm = character(0),
      store_nm = character(0),
      addr = character(0),
      tel_no = character(0),
      data_dt = character(0),
      stringsAsFactors = FALSE
    ))
  }

  FINISH <- do.call(rbind, pages)
  # Strip trailing remarks that start with an ASCII or full-width (U+FF08)
  # opening parenthesis. The previous pattern '(().+$' had unbalanced
  # parentheses and is not a valid regular expression.
  FINISH$addr <- str_replace_all(FINISH$addr, '[(\uFF08].+$', '')
  # Strip trailing remarks that start with a lenticular bracket.
  FINISH$addr <- str_replace_all(FINISH$addr, '(【).+$', '')
  FINISH
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.