# draft/getSmallNorth.R

# Scrape the branch pages of the 小北百貨 chain from www.small-north.url.tw.
#
# Downloads the branch index page, collects the link of every branch page,
# scrapes each of them and row-binds the per-page results.
#
# Returns a data.frame with the character columns brand_nm, store_nm, addr,
# tel_no and open_time (NULL when the index page yields no branch links,
# because rbind() over an empty list is NULL).
getSmallNorth <- function(){
  # Download a page and parse the response body as a UTF-8 HTML document.
  fetch_page <- function(URL){
    res <- GET(URL, user_agent("mozilla/5.0"))
    htmlParse(content(res, "text", encoding = "utf8"), encoding = "utf8")
  }

  # Remove every whitespace character from each string.
  strip_space <- function(x) str_replace_all(x, '[:space:]', '')

  # data.frame() stops with "arguments imply differing number of rows" when
  # one column has length 0 and the others length 1; use NA for a field that
  # was not found so a single malformed page does not abort the whole scrape.
  na_if_empty <- function(x) if (length(x) == 0) NA_character_ else x

  index_doc <- fetch_page('http://www.small-north.url.tw/page03.html')
  result <- cssApply(index_doc, 'tr > td.aab > a', cssLink)

  # Links to the individual branch pages
  wantPages <- sprintf('http://www.small-north.url.tw/%s', result)

  # Scrape one branch page into a data.frame.
  getinfo <- function(URL){
    doc <- fetch_page(URL)

    # The store name is taken from the cell(s) whose text starts with the
    # breadcrumb "首頁"; the breadcrumb prefix is then removed.
    cells <- cssApply(doc, ' tr > td', cssCharacter)
    store_nm <- cells[!is.na(str_extract(cells, '^首頁.+$'))]
    store_nm <- str_replace(strip_space(store_nm), '首頁>分店訊息>', '')

    info <- cssApply(doc, ' tr(2) > td > table tr(1) > td.aai', cssCharacter)
    if (length(info) > 2) {
      # More than two cells: the first two are joined into the address and
      # the third is used as the opening hours.
      addr <- str_replace(strip_space(paste0(info[1], info[2])), '。住址:', '')
      open_time <- strip_space(info[3])
    } else {
      # Otherwise: first cell is the address, second the opening hours
      # (indexing past the end yields NA, which the string helpers keep).
      addr <- str_replace(strip_space(info[1]), '。住址:|住址:', '')
      open_time <- strip_space(info[2])
    }
    open_time <- str_replace(open_time, '。', '')

    tel_cells <- cssApply(doc, ' tr(2) > td > table tr(1) > td.aad', cssCharacter)
    tel_no <- str_replace(strip_space(tel_cells), '。電話:|電話:', '')

    data.frame(
      brand_nm = '小北百貨',
      store_nm = na_if_empty(store_nm),
      addr = addr,
      tel_no = na_if_empty(tel_no),
      open_time = open_time,
      stringsAsFactors = FALSE
    )
  }

  do.call(rbind, lapply(wantPages, getinfo))
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.