# draft/getMaxxis.R

#' Scrape the Maxxis (瑪吉斯) store list from www.cst.com.tw
#'
#' Walks three levels of pages: the store index page, every region page
#' linked from it, and every individual store page linked from those. One
#' HTTP request is issued per page, sequentially.
#'
#' @return A data.frame with one row per store page and the character
#'   columns `brand_nm`, `store_nm`, `store_owner`, `addr` and `tel_no`,
#'   or `NULL` when no store page links were found.
getMaxxis <- function(){
  # sprintf() template shared by every relative link found on the site
  storeFmt <- 'http://www.cst.com.tw/pubtw/store/%s'

  # Download one page and parse it into an HTML document.
  fetchDoc <- function(URL) {
    res <- GET(URL, encoding='utf8')
    txt <- httr::content(res, "text", encoding = "utf8")
    htmlParse(txt, encoding = "utf8")
  }

  # Collect the absolute URLs of all store pages linked from one region page.
  # Example: 'http://www.cst.com.tw/pubtw/store/store_Taipei_2.html'
  getPages <- function(URL) {
    doc <- fetchDoc(URL)

    links <- unlist(cssApply(doc,'li > a',cssLink))
    links <- links[nchar(links)>0]  # skip anchors with an empty link

    sprintf(storeFmt, links)
  }

  # Read one store page into a one-row data.frame.
  # Example: 'http://www.cst.com.tw/pubtw/store/tp/store_tp_004.html'
  getInfo <- function(URL){
    doc <- fetchDoc(URL)

    # Table cells are laid out row-wise in pairs; the second cell of each
    # pair is the value that is kept.
    cells <- unlist(cssApply(doc,'tr > td',cssCharacter))
    cells <- str_replace_all(cells,'[:space:]','')
    tbl <- matrix(cells, ncol = 2, byrow = TRUE)

    # Drop the dealer row so the remaining values sit at fixed positions.
    # Rows and column are indexed in one step: subsetting the rows first
    # would collapse a single remaining row to a plain vector and make the
    # subsequent column lookup fail.
    keep <- !(tbl[,1]=='所屬經銷商:')
    values <- tbl[keep, 2]

    # Keep only the part of the address that precedes the parenthesis.
    # NOTE(review): an address without any '(' yields NA here - confirm
    # that this is intended.
    addr <- str_extract(values[3],'^.+\\(一廠|^.+\\(')
    addr <- str_replace(addr,'\\(一廠|\\(','')

    data.frame(brand_nm='瑪吉斯',
               store_nm=values[1],
               store_owner=values[2],
               addr=addr,
               tel_no=values[4],
               stringsAsFactors=FALSE)
  }

  # Index page: pick up the links to the per-county/city pages.
  indexDoc <- fetchDoc('http://www.cst.com.tw/pubtw/store/store_2011-2-2.html')
  regionLinks <- cssApply(indexDoc,'div#left > p > a',cssLink)
  regionLinks <- unlist(str_extract_all(regionLinks,'^store.+'))
  wantPages <- sprintf(storeFmt, regionLinks)

  UrlList <- lapply(wantPages, getPages)
  allList <- lapply(unlist(UrlList), getInfo)

  # Return the combined table visibly; ending on an assignment would
  # return it invisibly.
  do.call(rbind, allList)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.