# draft/getSubway.R

#' Scrape the Subway store list from www.twsubway.com
#'
#' Downloads the store-locator page, reads the total number of stores from the
#' "共 ... 項" text, then requests every result page and pulls the store name,
#' address and phone number out of the list items with regular expressions.
#'
#' Relies on `GET`, `content`, `htmlParse`, `cssApply`, `cssCharacter`,
#' `str_extract_all` and `str_replace_all` being in scope (presumably from
#' httr, XML, CSSApply and stringr -- confirm against the package imports).
#'
#' @return A data.frame with character columns `brand_nm` (always "subway"),
#'   `store_nm`, `addr` and `tel_no`; zero rows when the site reports 0 stores.
getSubway <- function(){

  # Download one URL and return the parsed HTML document.
  fetch_html <- function(url) {
    res <- GET(url, encoding='utf8')
    txt <- content(res, "text", encoding='utf8')
    htmlParse(txt, encoding = "utf8")
  }

  # The page-count arithmetic below assumes 10 stores per result page
  # (same assumption as the original `total/10`).
  page_size <- 10L

  # --- total number of stores, taken from the "共 N 項" summary text ----------
  html <- fetch_html("http://www.twsubway.com/www/include/index.php?Page=4")
  summary_txt <- cssApply(html,"#pane_locator > div ", cssCharacter)
  total <- as.integer(str_replace_all(unlist(str_extract_all(summary_txt,'共.+項')), '(共|項|[:space:])', ""))
  if (length(total) != 1L || is.na(total) || total < 0L) {
    stop("Could not read the total number of stores from the locator page",
         call. = FALSE)
  }

  if (total == 0L) {
    return(data.frame(brand_nm = character(0), store_nm = character(0),
                      addr = character(0), tel_no = character(0),
                      stringsAsFactors = FALSE))
  }

  # ceiling() rather than floor() + 1: the latter asks for one extra (empty)
  # page whenever `total` is an exact multiple of the page size.
  total_page <- as.integer(ceiling(total / page_size))

  # --- scrape a single result page (1-based index) --------------------------
  scrape_page <- function(i) {
    # The site numbers pages from 0, hence i - 1.
    url <- sprintf("http://www.twsubway.com/www/include/index.php?pageNum_content01=%s&totalRows_content01=%s&Page=4", i-1, total)
    items <- cssApply(fetch_html(url),"#pane_locator > ul > ul li", cssCharacter)

    # Store name: the tab-prefixed line, stripped of whitespace and the
    # "BEYOND PLAZA" decorations.
    store_nm <- str_replace_all(unlist(str_extract_all(items, "\t.+\n")), '(\t|\n|[:space:]|BEYONDPLAZA   PLAZA)', "")
    store_nm <- str_replace_all(store_nm, 'BEYONDPLAZA', "")

    # Address: text between the store-name line and the phone number, followed
    # by site-specific clean-ups for individual stores.
    addr <- str_replace_all(unlist(str_extract_all(items, "(店|醫院|PLAZA)\n.+\n.+(\\)|09)")), '\\(.+\\)|\\(|店|\n|[:space:]|BEYOND PLAZA|頂樓 (國道三號-清水服務區)|\\(|09|[0-9]{2}\\)', "")
    addr <- str_replace_all(addr, '(\\-靜宜大學宜園餐廳1F|副樓|1\\-2/F|01011櫃位|1F&B1|2620\\-28|2277\\-|(林口長庚紀念醫院美食街))', "-")
    addr <- str_replace_all(addr, '\\-|^[0-9]{3}|頂樓|PLAZA|第.+中心', "")

    # Phone number: the line holding "NN)" or "NNNN-"; "(" is dropped and ")"
    # becomes "-".
    tel_no <- str_replace_all(unlist(str_extract_all(items, "\n.+([0-9]{2}\\)|[0-9]{4}-).+\n")), '\n|[:space:]|\\(', "")
    tel_no <- str_replace_all(tel_no, '\\)', "-")

    # data.frame() silently recycles columns whose lengths divide evenly, which
    # would misalign names, addresses and phone numbers; fail loudly instead.
    n_found <- c(length(store_nm), length(addr), length(tel_no))
    if (length(unique(n_found)) != 1L) {
      stop(sprintf("Page %d: extracted %d names, %d addresses and %d phone numbers",
                   i, n_found[1], n_found[2], n_found[3]),
           call. = FALSE)
    }

    # `stringsAsFactors` was misspelled in the original, which added a stray
    # fifth column (dropped afterwards) and left the strings as factors on
    # R < 4.0.
    data.frame(brand_nm = rep("subway", n_found[1]),
               store_nm = store_nm,
               addr = addr,
               tel_no = tel_no,
               stringsAsFactors = FALSE)
  }

  pages <- lapply(seq_len(total_page), scrape_page)

  # Bind however many pages the site reports instead of a fixed 14.
  do.call(rbind, pages)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.