# draft/get3up.R

#' Scrape the store list published on www.3up.com.tw
#'
#' Parses the four `chain*.htm` pages of the site (decoded as big5) and reads
#' the first three `<td>` cells of every element whose class is `graylight`
#' or `green`: store name, address and telephone number. Whitespace runs are
#' stripped from each cell.
#'
#' Rows are ordered as all `graylight` rows (pages in the order listed below)
#' followed by all `green` rows. A page that has no row of a given class
#' contributes zero rows instead of raising an error.
#'
#' @return A data.frame with character columns `brand_nm` (constant),
#'   `store_nm`, `addr`, `tel_no` and `data_dt` (today's date formatted as
#'   yyyymmdd). The value is returned visibly.
get3up <- function() {
  want_pages <- c(
    'http://www.3up.com.tw/chainNorth.htm',
    'http://www.3up.com.tw/chainCentrality.htm',
    'http://www.3up.com.tw/chainSouth.htm',
    'http://www.3up.com.tw/chainEast.htm'
  )
  # Store rows are looked up under each of these class attribute values.
  row_classes <- c('graylight', 'green')

  # Runs of line breaks / spaces removed from every cell.
  # NOTE(review): the two space alternatives look identical here; the second
  # one may be a non-breaking or full-width space -- confirm before editing.
  blank_pattern <- '(\r|\n| | )+'

  # Computed once so every row of a single run carries the same stamp.
  data_dt <- format(Sys.Date(), '%Y%m%d')

  # Fetch and parse each page a single time; both row classes reuse the result.
  docs <- lapply(want_pages, htmlParse, encoding = 'big5')

  # Cleaned text of the col-th <td> of every row carrying `row_class`.
  read_cell <- function(doc, row_class, col) {
    xpath <- sprintf('//*[@class="%s"]/td[%d]', row_class, col)
    cells <- xpathSApply(doc, xpath, xmlValue)
    # unlist()/as.character() normalise the empty result returned when nothing
    # matches into character(0).
    str_replace_all(as.character(unlist(cells)), blank_pattern, '')
  }

  # One data.frame per (page, row class); zero rows when nothing matches.
  read_rows <- function(doc, row_class) {
    store_nm <- read_cell(doc, row_class, 1L)
    addr <- read_cell(doc, row_class, 2L)
    tel_no <- read_cell(doc, row_class, 3L)
    n_rows <- length(store_nm)
    data.frame(
      # Constants are repeated explicitly: a length-1 value next to a
      # zero-length column makes data.frame() stop with a row-count error.
      brand_nm = rep('三上數位', n_rows),
      store_nm = store_nm,
      addr = addr,
      tel_no = tel_no,
      data_dt = rep(data_dt, n_rows),
      stringsAsFactors = FALSE
    )
  }

  per_class <- lapply(row_classes, function(row_class) {
    do.call(rbind, lapply(docs, read_rows, row_class = row_class))
  })

  do.call(rbind, per_class)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.