R/get_tp_metro.R

#' Taipei Metro
#'
#' Taipei Metro
#'  
#' @seealso 
#' url: \url{http://web.metro.taipei/} \cr
#' data: \url{http://web.metro.taipei/c/selectstation2010.asp}
#' 
#' @return data.table
#' @export
get_tp_metro <- function () {
  #Get all urls
  getAllStations <- function() {
    url = 'http://web.metro.taipei/c/selectstation2010.asp'
    res = content(GET(url, encoding = 'utf8'), 'text', encoding = 'utf8')
    station_url = str_extract_all(res, 'href=\\\"stationdetail2010\\.asp\\?ID\\=[0-9]{3}')
    station_url = do.call(rbind, station_url)
    station_url = gsub("href=", "", station_url)
    station_url = gsub('"', '', station_url)
    for (i in 1:length(station_url)){
      station_url[i] = sprintf('http://web.metro.taipei/c/%s', station_url[i])
    }
    return(station_url)
  }
  
  all_stations = getAllStations()
  
  #Get Station Info
  getStationInfo <- function(url){
    res = content(GET(url, encoding = 'utf8'), 'text', encoding = 'utf8')
    html = htmlParse(res, encoding = 'utf8')
    tables = readHTMLTable(html)
    station_info = tables[[3]]
    station_name = names(station_info)[2]
    station_info = as.character(station_info[, 2])
    station_info = c(station_name, station_info)
    gis_address = str_extract(res, "Longitude=[0-9]{3}\\.[0-9]{4,6}\\&Latitude\\=[0-9]{2}\\.[0-9]{4,6}")
    gis_address =str_split(gis_address, '&')[[1]]
    gisX = as.double(gsub('Longitude\\=', '', gis_address[1]))
    gisY = as.double(gsub('Latitude\\=', '', gis_address[2]))
    station_info = data.frame(t(station_info), gisX, gisY, stringsAsFactors = FALSE)
    colnames(station_info) = c("station_nm", "addr", "elevator", "info_center", "bicycle", "gisX", "gisY")
    return(station_info)
  }
  
  all_stations_info = list()
  for (i in 1:length(all_stations)){
    if (i %% 10 == 0){Sys.sleep(1)}
    all_stations_info[[i]] = getStationInfo(all_stations[i])
  }
  
  all_stations_info = do.call(rbind, all_stations_info)
  
  #Data Cleaning
  #車站地址數 >1 的部分待之後再修正
  addr = gsub("[[:space:]]", "", all_stations_info$addr)
  # 20150911 fix addr 
  addr = strsplit(str_trim(addr),'號')
  addr = paste0(sapply(addr,function(x) x[[1]]),"號")
  addr = gsub('^.{0,5}:', '', addr)
  addr = gsub('^[0-9]+', '', addr)
  elevator = gsub("[[:space:]]", "", all_stations_info$elevator)
  all_stations_info$addr = addr
  all_stations_info$elevator = elevator
  
  final_output = data.frame(brand_nm = 'Taipei_MRT', store_nm = all_stations_info$station_nm, all_stations_info[, -1])
  a=strsplit(str_trim(final_output$addr),'號')
  return(final_output)
}
leoluyi/CRMaddress documentation built on May 21, 2019, 5:08 a.m.