# draft/getSushiExpress.R

#' Scrape the store list published on sushiexpress.com.tw (listing `bid=5`).
#'
#' Downloads the first listing page to discover how many pages exist, then
#' walks every page, collecting the info tables and the `.ltitle` store
#' names, and assembles them into one data frame.
#'
#' Relies on `GET`/`content`, `htmlParse`/`readHTMLTable`,
#' `cssApply`/`cssCharacter` and `str_extract`/`str_replace` being attached
#' by the caller (presumably httr, XML, CSSApply and stringr -- confirm
#' against the package imports).
#'
#' @return A data.frame of character columns `brand_nm`, `store_nm`, `addr`,
#'   `tel_no` and `tm`, one row per store.
getSushiExpress <- function(){
  base_url <- 'http://sushiexpress.com.tw/location.php?bid=5&ipage=%s'

  # Download one listing page and return it as a parsed HTML document.
  fetch_page <- function(page_no) {
    page_url <- sprintf(base_url, page_no)
    raw_html <- content(GET(page_url, encoding='utf8'), as='text', encoding='utf8')
    htmlParse(raw_html, encoding='utf8')
  }

  # Number of pages: read the pagination links on the first page.
  first_doc  <- fetch_page(1L)
  page_links <- cssApply(first_doc, "dd.pagenumber>a" , cssCharacter)
  page_txt   <- str_extract(page_links,'[0-9]{1,2}')
  # Compare page numbers numerically: max() on strings is lexicographic,
  # so "9" would beat "10" and later pages would be skipped.
  page_nums  <- as.integer(page_txt[!is.na(page_txt)])
  # No pagination links found: treat the listing as a single page.
  n_pages    <- if (length(page_nums) > 0) max(page_nums) else 1L

  # All pages
  info1    <- vector("list", n_pages)
  store_nm <- vector("list", n_pages)
  for (i in seq_len(n_pages)){
    # Page 1 is already downloaded; only fetch the remaining pages.
    doc           <- if (i == 1L) first_doc else fetch_page(i)
    tables        <- readHTMLTable(doc)
    info1[[i]]    <- do.call(rbind,tables)
    store_nm[[i]] <- gsub("店別:","",cssApply(doc, ".ltitle" , cssCharacter))
  }

  info2 <- do.call(rbind,info1)
  # Fold the value column into rows of four fields.
  # NOTE(review): assumes every store contributes exactly four table rows
  # (address, phone, hours, one more) -- confirm against the live page.
  info3 <- as.data.frame(matrix(info2$V2, ncol=4, byrow=TRUE), stringsAsFactors = FALSE)
  # Drop the first parenthesised note and the first newline from the address.
  info4 <- str_replace(str_replace(info3$V1,'\\(.+\\)',''),'\n','')
  nm1   <- unlist(store_nm)
  datas <- data.frame('定食8',nm1,info4,info3$V2,info3$V3, stringsAsFactors = FALSE)

  names(datas) <- c('brand_nm','store_nm','addr','tel_no','tm')
  datas
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.