# draft/getPerngyuh.R

#' Scrape the store listing of the Perngyuh site into a data frame
#'
#' Reads the pagination links from the first listing page
#' (`store.php?cID=1`), fetches every linked page and extracts the store
#' name, address and telephone cells from each one.
#'
#' @return A data.frame with character columns `brand_nm`, `store_nm`,
#'   `addr` and `tel_no`; a zero-row data.frame with the same columns when
#'   no pagination links are found.
#' @details Stops with an error when an HTTP request fails or when a page
#'   yields differing numbers of name / address / telephone cells.
getPerngyuh <- function() {
  site_root <- "http://www.perngyuh.com.tw/"

  # Absolute XPath of the cell holding both the pagination table (table[1])
  # and the store table (table[3]); shared by every query below.
  listing_xpath <- paste0(
    "/html/body/table[1]/tr/td/table/tr/td[2]/table/tr[2]/td/table/tr/td",
    "/table[2]/tr/td"
  )

  # Zero-row result with the same columns as a populated one.
  empty_stores <- data.frame(
    brand_nm = character(0),
    store_nm = character(0),
    addr = character(0),
    tel_no = character(0),
    stringsAsFactors = FALSE
  )

  # Download `url` and return it as an XML-package HTML document.
  fetch_page <- function(url) {
    res <- httr::GET(url)
    # Fail loudly instead of scraping an HTTP error page.
    httr::stop_for_status(res)
    html <- httr::content(res, as = "text", encoding = "UTF-8")
    # Parse explicitly so the XML-package XPath helpers below always receive
    # an XML-package document, whatever parser httr::content() would pick
    # by default.
    XML::htmlParse(html, asText = TRUE, encoding = "UTF-8")
  }

  # Run an XPath query (relative to `listing_xpath`) and return the text of
  # the matched nodes as a plain character vector (character(0) if none).
  node_text <- function(doc, rel_path) {
    found <- XML::xpathSApply(doc, paste0(listing_xpath, rel_path), XML::xmlValue)
    as.character(unlist(found))
  }

  # Collect the absolute URLs of all listing pages from the pagination bar.
  # NOTE(review): assumes the pagination anchors cover every page that should
  # be scraped (including the first one) -- confirm against the live site.
  list_page_urls <- function() {
    doc <- fetch_page(paste0(site_root, "store.php?cID=1"))
    hrefs <- XML::xpathSApply(
      doc,
      paste0(listing_xpath, "/table[1]/tr/td/a"),
      XML::xmlGetAttr,
      "href"
    )
    # unlist() drops anchors without an href and flattens list results.
    hrefs <- as.character(unlist(hrefs))
    # sprintf() (unlike paste0) yields character(0) for zero-length input.
    sprintf("http://www.perngyuh.com.tw/%s", hrefs)
  }

  # Extract the name / address / telephone cells of one listing page.
  scrape_stores <- function(URL) {
    doc <- fetch_page(URL)
    name <- node_text(doc, "/table[3]/tr/td[1]")
    addr <- node_text(doc, "/table[3]/tr/td[3]/a")
    tel <- node_text(doc, "/table[3]/tr/td[2]")

    # Strip the "TEL:" label and any spaces from the telephone cell.
    tel <- stringr::str_replace_all(tel, 'TEL:| ', '')

    # data.frame() would silently recycle vectors whose lengths divide
    # evenly, pairing names with the wrong address; refuse instead.
    if (length(name) != length(addr) || length(name) != length(tel)) {
      stop(
        "Mismatched field counts on ", URL,
        " (name: ", length(name),
        ", addr: ", length(addr),
        ", tel: ", length(tel), ")",
        call. = FALSE
      )
    }

    data.frame(
      brand_nm = rep('芃諭名品', length(name)),
      store_nm = name,
      addr = addr,
      tel_no = tel,
      stringsAsFactors = FALSE
    )
  }

  page_urls <- list_page_urls()
  if (length(page_urls) == 0L) {
    return(empty_stores)
  }

  # Unnamed list in, so the combined frame gets plain sequential row names.
  do.call(rbind, lapply(page_urls, scrape_stores))
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.