# draft/getEnTie.R

#' Scrape EnTie Bank (安泰銀行) branch locations
#'
#' Downloads the five `serviceLocation-unit_<n>.html` pages from
#' www.entiebank.com.tw, reads the `onclick` attribute of every link in the
#' branch table and pulls the branch name, phone number and address out of
#' that attribute text with regular expressions.
#'
#' @return A data.frame with character columns `brand_nm` (always
#'   "安泰銀行"), `store_nm`, `tel_no` and `addr`, one row per link that
#'   carries an `onclick` attribute. A field whose pattern does not match is
#'   `NA`. Pages without any such link contribute no rows; if no page yields
#'   a link, a zero-row data.frame with the same columns is returned.
getEnTie <- function(){
  n_pages <- 5
  # Loop invariants: build the user agent and the XPath once.
  ua <- user_agent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36")
  link_xpath <- '//*[@id="pageCTBg2"]/table[3]/tr/td/table/tr/td/a'

  # Fetch one location page and return its branches as a data.frame,
  # or NULL when the page contains no usable links.
  scrape_page <- function(i) {
    page_url <- sprintf('http://www.entiebank.com.tw/header/serviceLocation-unit_%s.html', i)
    resp <- GET(page_url, ua)
    # NOTE(review): the response object is handed to htmlParse() directly;
    # this presumably relies on its character conversion yielding the body
    # text -- confirm against the installed httr/XML versions.
    doc <- htmlParse(resp, asText = TRUE, encoding='utf8')

    # Read the attribute per node instead of indexing a simplified matrix of
    # all attributes: that indexing fails when there are no nodes or when the
    # nodes do not share one attribute set. unlist() drops links that have
    # no onclick attribute (xmlGetAttr() returns NULL for those).
    onclick <- unlist(xpathSApply(doc, link_xpath, xmlGetAttr, "onclick"))
    if (length(onclick) == 0L) {
      return(NULL)
    }
    # toUTF8() is a helper defined outside this block; presumably it
    # normalises the text encoding -- verify against its definition.
    onclick <- toUTF8(onclick)

    # Branch name: text from the first comma through "分行" or "部", with
    # digits, Latin letters and punctuation removed afterwards.
    store_nm <- str_extract(onclick, '\\,.+(分行)|\\,.+(部)')
    store_nm <- str_replace_all(store_nm, '[0-9]|[A-Z]|[a-z]|&|=|-|_','')
    store_nm <- str_replace_all(store_nm, '\\,|\\.','')
    store_nm <- gsub("'","",store_nm)

    # Phone number: two digits, a dash, then eight or seven digits.
    tel_no <- str_extract(onclick, '[0-9]{2}-[0-9]{8}|[0-9]{2}-[0-9]{7}')

    # Address: the span from the phone number through "Branch"/"Dept", with
    # the phone number, Latin letters, commas, whitespace and quotes removed.
    addr <- str_extract(onclick, '[0-9]{2}-[0-9]{8}.+Branch|[0-9]{2}-[0-9]{8}.+Dept|[0-9]{2}-[0-9]{7}.+Branch|[0-9]{2}-[0-9]{7}.+Dept')
    addr <- str_replace_all(addr, '[0-9]{2}-[0-9]{8}|[0-9]{2}-[0-9]{7}|[A-Z]|[a-z]|\\,|[:space:]','')
    addr <- gsub("'","",addr)

    data.frame(brand_nm='安泰銀行', store_nm, tel_no, addr, stringsAsFactors = FALSE)
  }

  pages <- lapply(seq_len(n_pages), scrape_page)
  pages <- Filter(Negate(is.null), pages)

  if (length(pages) == 0L) {
    # Nothing scraped: keep the column layout so callers can still rbind().
    return(data.frame(
      brand_nm = character(0),
      store_nm = character(0),
      tel_no = character(0),
      addr = character(0),
      stringsAsFactors = FALSE
    ))
  }

  do.call(rbind, pages)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.