R/download_links.R

Defines functions download_links download_area_links

Documented in download_area_links download_links

#' Download links for areas, countries and stations
#' 
#' For polite scraping, 5 sec interval is set in download_links(), 
#' it takes about 15 minutes to get all station links. 
#' Please use existing links by "data(station_links)", 
#' if you do not need to renew links. 
#' You can see web page as below. 
#' https://www.data.jma.go.jp/gmd/cpd/monitor/nrmlist/
#' 
#' @name download_links
#' @param url  A String to specify target html.
#' @return     A string vector of url links, or NULL when failed.
#' @examples
#' # If you want links for all countries and all sations, remove head().
#' # The codes take over 5 sec because of poliste scraping.
#' \donttest{
#' library(magrittr)
#' library(stringi)
#' library(dplyr)
#' library(tibble)
#' area_links <- download_area_links()
#' station_links <- NULL
#' continent     <- NULL
#' continents <- 
#'   c("\\u30a2\\u30d5\\u30ea\\u30ab",
#'     "\\u30a2\\u30b8\\u30a2",
#'     "\\u5357\\u30a2\\u30e1\\u30ea\\u30ab", 
#'     "\\u5317\\u4e2d\\u30a2\\u30e1\\u30ea\\u30ab",
#'     "\\u30aa\\u30bb\\u30a2\\u30cb\\u30a2",
#'     "\\u30e8\\u30fc\\u30ed\\u30c3\\u30d1")
#' area_links <- head(area_links, 1)  # for test
#' for(i in seq_along(area_links)){
#'     print(stringr::str_c("area: ", i, " / ", length(area_links)))
#'     country_links <- download_links(area_links[i])
#'     country_links <- head(country_links, 1)  # for test
#'     for(j in seq_along(country_links)){
#'         print(stringr::str_c("    country: ", j, " / ", length(country_links)))
#'         links <- download_links(country_links[j])
#'         station_links <- c(station_links, links)
#'         continent     <- c(continent,     rep(continents[i], length(links)))
#'     }
#' }
#' station_links <- tibble::tibble(url = station_links, continent = continent)
#' station_links
#' }
#' @export
download_area_links <- function(
  url = "https://www.data.jma.go.jp/gmd/cpd/monitor/nrmlist/"){
  html <- gracefully_fail(url)
  if(is.null(html)) return(NULL)
  href <- 
    html %>%
    rvest::read_html() %>%
    rvest::html_elements("#main") %>%
    rvest::html_elements("area") %>%
    rvest::html_attr("href") %>%
    stringr::str_sub(start = 3)
  return(stringr::str_c(url, href))
}

#' @rdname download_links
#' @export
download_links <- function(url){
  sleep()
  html <- gracefully_fail(url)
  if(is.null(html)) return(NULL)
  href <- 
    html %>%
    rvest::read_html() %>%
    rvest::html_elements("#main") %>%
    rvest::html_elements("a") %>% 
    rvest::html_attr("href")
  url <- 
    "https://www.data.jma.go.jp/gmd/cpd/monitor/nrmlist/" %>%
    stringr::str_c(href)
  return(url)
}

Try the clidatajp package in your browser

Any scripts or data that you put into this service are public.

clidatajp documentation built on March 7, 2023, 8:10 p.m.