R/get_tp_metro.R

#' Taipei Metro
#'
#' Taipei Metro
#'  
#' @seealso 
#' url: \url{http://web.metro.taipei/} \cr
#' data: \url{http://web.metro.taipei/c/selectstation2010.asp}
#' 
#' @examples
#' \dontrun{
#' tp_metro <- get_tp_metro()
#' }
#' 
#' @return data.table
#' @export
get_tp_metro <- function () {
  
  station_ids <- get_station_id()
  all_stations_info_ls <- lapply(station_ids, get_tp_metro_)
  out_dt <- rbindlist(all_stations_info_ls)
  
  # add brand name
  out_dt[, `:=`(brand_nm = "臺北捷運", keyword = "Taipei Metro")]
  
  ## add url, time, full name
  out_dt[, store_url := "http://web.metro.taipei/"][
    , data_time := format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")][
      , full_nm := paste0(brand_nm, store_nm)]
  
  key_var <- c("full_nm", "brand_nm", "keyword", "store_nm", "addr",
               "lon_x", "lat_y", "store_url")
  ## move brand_nm, key_word to first two colmuns
  setcolorder(out_dt, c(key_var, setdiff(names(out_dt), key_var)))
  
  out_dt  
}


# Helper functions --------------------------------------------------------

#Get all urls
get_station_id <- function() {
  url = 'http://web.metro.taipei/c/selectstation2010.asp'
  res = GET(url, ua) %>% content(encoding = 'utf8')
  station_id <- res %>% 
    html_nodes("#Station option") %>% 
    html_attr("value")
  station_id
}

#Get Station Info
get_tp_metro_ <- function(station_id) {
  # station_id = "BL23-031"
  url <- "http://web.metro.taipei/c/stationdetail2010.asp"
  res_xml <- POST(url, ua, 
                  body = list(ID=station_id),
                  encode = "form") %>% 
    content(encoding = 'utf8')
  
  tables <- res_xml %>% html_nodes('table[bgcolor="#c0c0c0"]') %>% 
    rvest::html_table(fill = TRUE)
  station_info <- tables[[1]] %>% t %>% data.table %>% .[-1]
  station_info <- station_info %>% setnames(c(
    "store_nm", "addr", "elevator", "info_center", 
    "water", "toilet", "bicycle"
  )) 
  
  # 車站地址數 >1 的部分待之後再修正
  station_info[, addr := addr %>% 
                 str_extract_all("\\d{5}.{6,}?號") %>% 
                 .[[1]] %>% paste(collapse="\n") %>% 
                 str_replace_all("\n", " ")]
  
  # Store name
  station_info[, store_nm := paste0(store_nm, "站")]
  
  # GPS
  gis_address <- res_xml %>% 
    as.character() %>% 
    str_match("googlemap\\.asp\\?Longitude=([0-9]+\\.[0-9]+)&amp;Latitude=([0-9]+\\.[0-9]+)") %>% 
    .[, 2:3]
  
  station_info[, `:=`(
    lon_x = as.double(gis_address[1]),
    lat_y = as.double(gis_address[2])
  )]
  station_info
}
leoluyi/address_crawler documentation built on May 21, 2019, 5:09 a.m.