# draft/getWhirlPool.R

#' Scrape the Whirlpool Taiwan service-location listing
#'
#' Downloads the first listing page to work out how many result pages exist,
#' then fetches every page and collects the rows of the
#' `ContentPlaceHolder1_Datagrid1` table into a single data frame.
#'
#' Relies on `GET()`, `content()`, `xpathSApply()`, `xmlValue()` and
#' `str_replace_all()` already being available in the calling environment.
#'
#' @return A data frame with character columns `brand_nm`, `area`, `store_nm`,
#'   `addr` and `data_dt` (the run date formatted as `YYYYMMDD`), one row per
#'   listed store. Stops with an error when the page count cannot be derived
#'   from the pager on the first page.
getWhirlPool <- function() {
  base_url <- 'http://www.whirlpool.com.tw/Service/'

  # Download one URL and return the parsed document.
  fetch_page <- function(url) {
    content(GET(url, encoding = 'utf8'), encoding = 'utf8')
  }

  # Text of every cell in column `k` of the listing table.
  grid_column <- function(doc, k) {
    xpathSApply(
      doc,
      sprintf('//*[@id="ContentPlaceHolder1_Datagrid1"]/tr/td[%d]', k),
      xmlValue
    )
  }

  # --- Work out how many result pages there are ----
  first_page <- fetch_page(base_url)
  unselected <- xpathSApply(
    first_page, '//*[@class="PageUnSelected"]', xmlValue
  )
  next_label <- xpathSApply(first_page, '//*[@class="PageNext"]', xmlValue)
  # Strip the "下" / "頁" characters so only the numeric part of the label is
  # left.
  next_label <- str_replace_all(next_label, '([下]|[頁])+', '')

  # Page count = unselected page links + the selected page + the number left
  # in the "next" label.
  # NOTE(review): the formula is kept exactly as originally written; confirm
  # it against the live pager markup.
  page_num <- (length(unselected) + 1 + as.numeric(next_label))[1]
  if (!is.finite(page_num) || page_num < 1) {
    stop(
      "Could not determine the number of listing pages from ", base_url,
      call. = FALSE
    )
  }

  # --- Build the URL of every result page ----
  page_urls <- sprintf(
    'http://www.whirlpool.com.tw/Service/?ctl00$ContentPlaceHolder1$Datagrid1=%s,1,0,30',
    seq_len(floor(page_num))
  )

  # --- Extract the store rows from each page ----
  # Computed once so every row carries the same snapshot date.
  run_date <- format(Sys.Date(), '%Y%m%d')

  scrape_page <- function(url) {
    doc <- fetch_page(url)
    area <- grid_column(doc, 1L)
    # The first column yields one more cell than the others; drop the last.
    # NOTE(review): presumably that extra cell is the pager row -- confirm.
    area <- area[-length(area)]
    store_nm <- grid_column(doc, 2L)
    addr <- grid_column(doc, 3L)
    # Prefix the address with its area, then remove line breaks and spaces.
    # NOTE(review): the second blank in the pattern may be a non-breaking
    # space in the original file -- confirm before editing the regex.
    addr <- str_replace_all(paste(area, addr), '(\r|\n| | )+', '')
    data.frame(
      brand_nm = '惠而浦',
      area = area,
      store_nm = store_nm,
      addr = addr,
      data_dt = run_date,
      stringsAsFactors = FALSE
    )
  }

  do.call(rbind, lapply(page_urls, scrape_page))
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.