# draft/getMizuno.R

#' Scrape the MIZUNO Taiwan store list
#'
#' Reads the first page of the store locator to find the number of result
#' pages (taken from the "last page" navigator link), then downloads every
#' page, parses its store table and row-binds the per-page results.
#'
#' The function uses `GET()` / `content()` (httr), `htmlParse()` /
#' `readHTMLTable()` (XML) and `str_extract()` / `str_replace()` /
#' `str_replace_all()` (stringr) by bare name, so those packages must be
#' attached (or imported) by the caller.
#'
#' @return A data.frame with one row per store and the character columns
#'   `brand_nm`, `store_nm`, `addr` and `tel_no`. `tel_no` is `NA` for a store
#'   whose cell contains no recognisable phone number.
getMizuno <- function() {
  page_url <- "http://www.mizuno.com.tw/06sale/shop_list.aspx?type=0&city=&area=&CurrentPage=%s"
  brand_nm <- "美津濃(MIZUNO)"
  # Position of the store listing among all <table> elements of a result page.
  # NOTE(review): hard-coded against the site's layout at the time of writing.
  store_table_index <- 17L

  # Download one URL and return its body as UTF-8 text.
  fetch_text <- function(url) {
    res <- GET(url)
    if (res$status_code >= 400L) {
      stop("Request failed (HTTP ", res$status_code, "): ", url, call. = FALSE)
    }
    # Decoding is the job of content(); GET() itself has no `encoding` argument.
    content(res, "text", encoding = "UTF-8")
  }

  # Turn the HTML of one result page into a data.frame of stores.
  parse_page <- function(html, url) {
    doc <- htmlParse(html, asText = TRUE, encoding = "UTF-8")
    tables <- readHTMLTable(doc)
    if (length(tables) < store_table_index) {
      stop("Store table not found on ", url, call. = FALSE)
    }
    store_table <- tables[[store_table_index]]
    if (is.null(store_table) || NCOL(store_table) < 2L) {
      stop("Store table not found on ", url, call. = FALSE)
    }

    # The second column holds the payload; strip all whitespace and discard
    # cells that are missing or empty.
    cells <- str_replace_all(as.character(store_table[, 2]), "[:space:]", "")
    cells <- cells[!is.na(cells) & nzchar(cells)]

    # Cells come in pairs: "<name><phone>" followed by "<zip><address>".
    if (length(cells) %% 2L != 0L) {
      warning("Unpaired trailing cell dropped on ", url, call. = FALSE)
      cells <- cells[-length(cells)]
    }
    store_info <- matrix(cells, ncol = 2, byrow = TRUE)
    name_tel <- store_info[, 1]

    # The store name is everything before the first digit of the cell.
    store_nm <- str_replace_all(str_extract(name_tel, ".+?[0-9]"), "[0-9]$", "")
    # A cell without any digit carries no phone number: it is all name.
    no_digit <- is.na(store_nm)
    store_nm[no_digit] <- name_tel[no_digit]

    # Exactly one phone number per store (the first match, or NA) so that the
    # column always lines up with `store_nm` and `addr`.
    tel_no <- str_extract(
      name_tel,
      "([0-9]+-[0-9]+-[0-9]+)|([0-9]{8})|([0-9]+-[0-9]+)"
    )

    # Drop the leading digits (postal code) from the address cell.
    addr <- str_replace_all(store_info[, 2], "^[0-9]+", "")

    data.frame(
      # rep() keeps the frame valid even when the page lists no store.
      brand_nm = rep(brand_nm, length(store_nm)),
      store_nm = store_nm,
      addr = addr,
      tel_no = tel_no,
      stringsAsFactors = FALSE
    )
  }

  # The "last page" navigator link carries the total page count in its href.
  first_page <- fetch_text(sprintf(page_url, 1L))
  last_link <- str_extract(
    first_page,
    "PageNavigator1_HyperLinkLastPage.+最後頁"
  )
  maxpage <- as.integer(
    str_replace(str_extract(last_link, "CurrentPage=[0-9]+"), "CurrentPage=", "")
  )
  if (length(maxpage) != 1L || is.na(maxpage) || maxpage < 1L) {
    stop("Could not determine the number of result pages", call. = FALSE)
  }

  page_urls <- sprintf(page_url, seq_len(maxpage))
  pages <- lapply(page_urls, function(url) parse_page(fetch_text(url), url))
  do.call(rbind, pages)
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.