# draft/getThaiTown.R

# Scrape the store listing pages under http://www.thaitown.com.tw/index.php?cid=3
# and collect every store found into one data frame.
#
# Takes no arguments. Performs one HTTP request for the index page plus one
# request per `sid` link found on it.
#
# Returns a data.frame with character columns `brand_nm` (always '瓦城'),
# `store_nm`, `addr` and `tel_no`, one row per store. Rows keep the `sid` of
# the page they came from in their row names. Returns NULL when the index
# page yields no `sid` links.
#
# NOTE(review): GET/content are presumably httr's and the xpath helpers the
# XML package's -- confirm against the package imports.
getThaiTown <- function(){
  
  # Download `url` and parse the body into an XML-package HTML document.
  # The text is parsed explicitly instead of relying on content()'s automatic
  # parser, whose return type for HTML depends on the installed httr version
  # and may not be something xpathSApply can query.
  fetch_html <- function(url){
    res <- GET(url)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    httr::stop_for_status(res)
    txt <- content(res, as = 'text', encoding = 'UTF-8')
    XML::htmlParse(txt, asText = TRUE, encoding = 'UTF-8')
  }
  
  # Evaluate `xpath` on `doc` and always hand back a plain character vector,
  # including character(0) when nothing matches.
  xpath_chr <- function(doc, xpath, ...){
    as.character(unlist(xpathSApply(doc, xpath, ...)))
  }
  
  # Build the URL of every per-`sid` listing page linked from the index page.
  # The result is named by `sid`.
  getsidurl <- function(){
    doc <- fetch_html('http://www.thaitown.com.tw/index.php?cid=3')
    
    # Select the href attributes directly: this does not depend on every
    # anchor carrying an identical set of attributes.
    hrefs <- xpath_chr(doc, '//div/ul[@class="idx3_item"]/li/a/@href')
    
    sids <- str_replace(str_extract(hrefs, 'sid=[0-9]+$'), 'sid=', '')
    # Links that do not end in `sid=<digits>` give NA; skip them rather than
    # requesting a bogus "...sid=NA" page.
    sids <- sids[!is.na(sids)]
    
    # sprintf is vectorised; names are kept so the combined result carries
    # the originating sid in its row names.
    stats::setNames(
      sprintf('http://www.thaitown.com.tw/index.php?cid=3&sid=%s', sids),
      sids
    )
  }
  
  # Extract name, address and telephone of every store on one listing page.
  getAddr <- function(URL){
    doc <- fetch_html(URL)
    name <- xpath_chr(doc, '//div[@class="store_tit"]', xmlValue)
    addr <- xpath_chr(doc, '//div[@class="store_addr"]', xmlValue)
    tel <- xpath_chr(doc, '//div[@class="store_tel"]', xmlValue)
    
    # Strip Latin letters and spaces from the store name.
    name <- str_replace_all(name, '[a-z]|[A-Z]| ', '')
    # Keep the text up to the '【' marker, then drop the label and the marker.
    # str_extract yields exactly one value (or NA) per node, so `addr` stays
    # a character vector aligned with `name` even when a node has no marker.
    addr <- str_replace_all(str_extract(addr, '.+【'), '地址:|【', '')
    tel <- str_replace_all(tel, '電話:| ', '')
    
    # rep() keeps the brand column in step with the number of stores, so a
    # page without stores produces a zero-row frame instead of an error.
    data.frame(
      brand_nm = rep('瓦城', length(name)),
      store_nm = name,
      addr = addr,
      tel_no = tel,
      stringsAsFactors = FALSE
    )
  }
  
  # Returned visibly so that calling getThaiTown() at the console prints it.
  do.call(rbind, lapply(getsidurl(), getAddr))
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.