draft/getHtc.R

#' Scrape the hTC Taiwan "where to buy" store list.
#'
#' Downloads the first result page, reads the pagination links to find out
#' how many pages exist, then collects the store name and address from every
#' page.
#'
#' Relies on `htmlParse`, `xpathSApply`, `xmlValue` and `str_replace_all`
#' already being in scope (they are not loaded inside this function).
#'
#' @return A data.frame with one row per store and the character columns
#'   `brand_nm` (always "hTC"), `store_nm`, `addr` and `data_dt` (today's
#'   date formatted as yyyymmdd). Zero rows if no store was found.
getHtc <- function() {
  url_template <- paste0(
    'http://www.htc.com/tw/support/where_to_buy_iframe.aspx',
    '?folderid=868&page=%d'
  )

  # xpathSApply() returns an empty list() when nothing matches; normalise
  # every lookup to a plain character vector so later steps never see a list.
  node_text <- function(doc, xpath) {
    as.character(unlist(xpathSApply(doc, xpath, xmlValue)))
  }
  strip_blanks <- function(x) str_replace_all(x, '(\r|\n| )+', '')

  # The page parameter is zero-based, so page 0 is the first page.
  first_doc <- htmlParse(sprintf(url_template, 0L), encoding = 'utf8')

  page_labels <- node_text(first_doc, '//*[@class="pages"]/a')
  if (length(page_labels) > 0) {
    # Presumably the last pagination anchor is a "next" link rather than a
    # page number -- TODO confirm against the live markup.
    page_labels <- page_labels[-length(page_labels)]
  }
  page_labels <- trimws(page_labels)
  page_nums <- as.integer(page_labels[grepl('^[0-9]+$', page_labels)])

  # The page count is the LARGEST numeric label. Compare as integers: as
  # strings "9" sorts after "10". Fall back to one page when the site shows
  # no pagination links at all.
  n_pages <- max(1L, page_nums, na.rm = TRUE)
  page_urls <- sprintf(url_template, seq_len(n_pages) - 1L)

  today <- format(Sys.Date(), '%Y%m%d')

  pages <- lapply(seq_len(n_pages), function(i) {
    # Page 0 has already been downloaded above; do not fetch it twice.
    doc <- if (i == 1L) {
      first_doc
    } else {
      htmlParse(page_urls[i], encoding = 'utf8')
    }

    store_nm <- strip_blanks(node_text(doc, '//*[@class="store-name"]'))

    addr <- node_text(
      doc,
      '//*[@class="store-list"]/div/table/tr/td[@class="right-td"]'
    )
    addr <- strip_blanks(addr)
    # Non-breaking spaces; written as a Unicode escape so the pattern does
    # not depend on the session running in a UTF-8 locale.
    addr <- str_replace_all(addr, '(\u00a0)+', '')
    # Drop everything from the first "(" to the end of the address.
    addr <- str_replace_all(addr, '([(]).+$', '')

    if (length(store_nm) != length(addr)) {
      stop(
        'Store names and addresses do not line up on ', page_urls[i],
        ' (', length(store_nm), ' names, ', length(addr), ' addresses)',
        call. = FALSE
      )
    }

    # rep() keeps the constant columns in step with the scraped ones, so a
    # page without stores yields a zero-row frame instead of an error.
    n_store <- length(store_nm)
    data.frame(
      brand_nm = rep('hTC', n_store),
      store_nm = store_nm,
      addr = addr,
      data_dt = rep(today, n_store),
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, pages)
}
leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.