# draft/getFarmer.R

#' Scrape the farmers' association directory from www.farmer.org.tw
#'
#' Reads the pager on the first directory page to learn how many result pages
#' exist, downloads every page (`basic_book.aspx?pq=1..n`) and extracts, for
#' each entry, its name plus the phone, fax and address found in the first
#' list item of the entry's second table row.
#'
#' A field whose label cannot be located in the entry text is returned as
#' `NA` instead of a mis-sliced string.
#'
#' @return A data.frame with character columns `brand_nm`, `store_nm`, `addr`,
#'   `tel_no`, `fax_no` and `data_dt` (scrape date formatted "YYYYMMDD"), one
#'   row per entry; `NULL` if no page yielded any entry.
getFarmer <- function() {
  base_url <- "http://www.farmer.org.tw/basic_book.aspx"
  tel_label <- "電話:"
  fax_label <- "傳真:"
  addr_label <- "會址:"

  # Download `url` and parse it with the XML package. The body is fetched as
  # text and parsed explicitly so the result is always an XML-package
  # document that xpathSApply() can query, whatever content() would return
  # by default for HTML.
  fetch_page <- function(url) {
    res <- GET(url)
    httr::stop_for_status(res)
    html <- content(res, as = "text", encoding = "UTF-8")
    XML::htmlParse(html, asText = TRUE, encoding = "UTF-8")
  }

  # Text of every node matching `xpath`, with all whitespace removed.
  node_text <- function(doc, xpath) {
    raw <- as.character(unlist(xpathSApply(doc, xpath, xmlValue)))
    str_replace_all(raw, "\\s+", "")
  }

  # substr() that yields NA wherever the delimiting label was not found.
  slice_field <- function(x, from, to, found) {
    out <- substr(x, from, to)
    out[!found] <- NA_character_
    out
  }

  # Drop the leading label (as a literal string, not a character class, so
  # the same characters inside the value survive), then "/" and ":" and any
  # trailing parenthesised remark.
  clean_field <- function(x, label) {
    x <- sub(label, "", x, fixed = TRUE)
    x <- str_replace_all(x, "[/:]+", "")
    str_replace_all(x, "([(]).+$", "")
  }

  # Turn one parsed result page into a data.frame (NULL if it has no entry).
  parse_page <- function(doc, scraped_on) {
    store_nm <- node_text(doc, '//*[@class="book text"]/tr[1]')
    details <- node_text(doc, '//*[@class="book text"]/tr[2]/td/ul/li[1]')
    if (length(store_nm) != length(details)) {
      stop("Entry names and entry details do not line up on a result page.",
           call. = FALSE)
    }
    if (length(store_nm) == 0L) {
      return(NULL)
    }

    fax_at <- as.integer(regexpr(fax_label, details))
    addr_at <- as.integer(regexpr(addr_label, details))

    tel_raw <- slice_field(details, 1L, fax_at - 1L, fax_at > 0L)
    fax_raw <- slice_field(details, fax_at, addr_at - 1L,
                           fax_at > 0L & addr_at > 0L)
    addr_raw <- slice_field(details, addr_at, nchar(details), addr_at > 0L)

    data.frame(
      brand_nm = "中華民國農會",
      store_nm = store_nm,
      addr = clean_field(addr_raw, addr_label),
      tel_no = clean_field(tel_raw, tel_label),
      fax_no = clean_field(fax_raw, fax_label),
      data_dt = scraped_on,
      stringsAsFactors = FALSE
    )
  }

  # The last pager link carries the highest page number in its `pq` query.
  pager <- xpathSApply(
    fetch_page(base_url),
    '//*[@id="ContentPlaceHolder1_dp_book"]/a',
    XML::xmlGetAttr, "href"
  )
  hrefs <- as.character(unlist(pager))
  last_href <- hrefs[length(hrefs)]
  n_pages <- suppressWarnings(
    as.integer(sub("^.*pq=([0-9]+).*$", "\\1", last_href))
  )
  if (length(n_pages) != 1L || is.na(n_pages) || n_pages < 1L) {
    stop("Could not determine the number of result pages from the pager.",
         call. = FALSE)
  }

  scraped_on <- format(Sys.Date(), "%Y%m%d")
  page_urls <- paste0(base_url, "?pq=", seq_len(n_pages))
  pages <- lapply(page_urls, function(url) {
    parse_page(fetch_page(url), scraped_on)
  })

  do.call(rbind, Filter(Negate(is.null), pages))
}
# leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.