# draft/getPhcafe.R

#' Scrape the store list published on www.phcafe.com.tw
#'
#' Downloads the store index page (`sell.php`), keeps the links whose text
#' ends in "地區", fetches each linked page and extracts store names,
#' addresses and telephone numbers from it.
#'
#' Relies on `GET()`/`content()`, `htmlParse()`, `cssApply()`/`cssCharacter`/
#' `cssLink` and the `str_*()` helpers being attached by the caller.
#'
#' @return A data.frame with character columns `brand_nm`, `store_nm`,
#'   `addr` and `tel_no`, one row per store found. `NULL` when the index
#'   page contains no matching link.
getPhcafe <- function() {
  # Position, among all `tr > td` cells of a region page, of the cell whose
  # text is parsed for store details.
  # NOTE(review): hard-coded and tied to the current page layout -- confirm
  # it is still correct whenever the site changes.
  store_cell <- 179L

  # Download a page and parse it as UTF-8 HTML.
  # NOTE(review): `encoding` is not a documented argument of httr::GET();
  # it is kept as in the original call -- confirm whether it has any effect.
  fetch_page <- function(url) {
    res <- GET(url, encoding='utf8')
    htmlParse(content(res, "text", encoding = "utf8"), encoding = "utf8")
  }

  # Keep the lines matching `pattern` (only the matched part) and strip all
  # whitespace from them.
  extract_field <- function(lines, pattern) {
    hits <- str_extract(lines, pattern)
    str_replace_all(hits[!is.na(hits)], '[:space:]', '')
  }

  # Index page: anchor texts and their hrefs come from the same selector,
  # so they are expected to line up element by element.
  index_doc <- fetch_page('http://www.phcafe.com.tw/sell.php')
  link_text <- cssApply(index_doc, 'tr > td > a', cssCharacter)
  link_href <- cssApply(index_doc, 'tr > td > a', cssLink)

  # Subsetting the href vector directly (instead of a cbind()-ed matrix)
  # keeps a single match from collapsing into a dimensionless vector.
  is_region <- !is.na(str_extract(link_text, '^.+地區$'))

  # Links to each regional sub-page. sprintf() returns character(0) for
  # zero matches, so no bogus URL is produced.
  wantPages <- sprintf('http://www.phcafe.com.tw/%s', link_href[is_region])

  # Extract one data.frame of stores from a single regional page.
  getinfo <- function(URL) {
    doc <- fetch_page(URL)
    cells <- cssApply(doc, 'tr > td', cssCharacter)
    if (length(cells) < store_cell) {
      warning(
        sprintf('%s has only %d table cells; expected at least %d',
                URL, length(cells), store_cell),
        call. = FALSE
      )
    }
    lines <- unlist(strsplit(cells[store_cell], '\n'))

    store_nm <- extract_field(lines, '.+店$')
    tel_no   <- extract_field(lines, '[0-9].+-[0-9]{6,}')
    addr     <- extract_field(lines, '.+(F|樓|號)')

    # The three vectors are paired by position. data.frame() would silently
    # recycle them when one length is a multiple of another, misaligning
    # the rows, so refuse to continue instead.
    n <- length(store_nm)
    if (length(addr) != n || length(tel_no) != n) {
      stop(
        sprintf('%s: found %d store names, %d addresses and %d phone numbers',
                URL, n, length(addr), length(tel_no)),
        call. = FALSE
      )
    }

    # rep() keeps the constant column the same length as the others, so a
    # page without stores yields a zero-row data.frame rather than an error.
    data.frame(
      brand_nm = rep('品皇咖啡', n),
      store_nm = store_nm,
      addr = addr,
      tel_no = tel_no,
      stringsAsFactors = FALSE
    )
  }

  allList <- lapply(wantPages, getinfo)
  do.call(rbind, allList)
}
# leoluyi/address_crawler documentation built on May 21, 2019, 5:09 a.m.