R/get_wasserportal_stations_table.R

Defines functions print_invalid_hrefs extract_hrefs

#' Wasserportal Berlin: get stations overview table
#'
#' @param type type of stations table to retrieve. Valid options defined in
#' \code{\link{get_overview_options}}, default: get_overview_options()$groundwater$level
#' @param url_wasserportal base url to Wasserportal berlin (default:
#' \code{\link{wasserportal_base_url}}
#' @return data frame with master data of selected monitoring stations
#' @export
#' @importFrom rvest html_node html_table html_nodes html_attr
#' @importFrom stringr str_remove_all
#' @importFrom xml2 read_html
#' @importFrom dplyr bind_cols
#' @importFrom tibble tibble
#' @examples
#' types <- wasserportal::get_overview_options()
#' str(types)
#' sw_l <- wasserportal::get_wasserportal_stations_table(type = types$surface_water$water_level)
#' str(sw_l)

get_wasserportal_stations_table <- function (
    type = get_overview_options()$groundwater$level,
    url_wasserportal = wasserportal_base_url()
)
{
  if (! is.null(type)) {
    type <- match.arg(type, unlist(get_overview_options()))
  }

  overview_url <- paste0(
    url_wasserportal,
    "/messwerte.php?",
    url_parameter_string(anzeige = "tabelle", thema = type)
  )

  html <- xml2::read_html(overview_url)

  pegeltab <- rvest::html_node(html, xpath = '//*[@id="pegeltab"]')

  if (is.na(pegeltab)) {
    stop(
      "Could not find element with id 'pegeltab' in HTML returned by ",
      overview_url, call. = FALSE
    )
  }

  # Convert the HTML table into a data frame
  overview_table <- rvest::html_table(pegeltab)

  # Get the column captions from the table header
  captions <- html %>%
    rvest::html_nodes(xpath = '//table[@id="pegeltab"]/thead/tr/th') %>%
    rvest::html_text()

  # Identify columns "Messstellennummer" and "Ganglinie"
  column_id <- grep("Mess.?stellen.?nummer", captions)
  column_graph <- grep("Gang.?linie", captions)

  stopifnot(length(column_id) == 1L)
  stopifnot(length(column_graph) == 1L)

  # Function to create xpath expression to match the cells in column i
  xpath_column <- function(i) {
    sprintf('//table[@id="pegeltab"]/tbody/tr/td[%d]', i)
  }

  # Look for hyperlinks in column "Messstellennummer"
  hrefs_id <- html %>%
    rvest::html_nodes(xpath = xpath_column(column_id)) %>%
    extract_hrefs()

  # Look for hyperlinks in column "Ganglinie"
  hrefs_graph <- html %>%
    rvest::html_nodes(xpath = xpath_column(column_graph)) %>%
    extract_hrefs()

  # Do not combine both links
  #
  # # The wasserportal-related hyperlinks in column "Ganglinie" are slightly
  # # different from those in column "Messstellennummer". Adapt the links in
  # # column "Ganglinie" before "merging" them with the links in column
  # # "Messstellennummer".
  # hrefs_graph <- multi_substitute(hrefs_graph, list(
  #   "anzeige=[^&]+" = "anzeige=i",
  #   "stable=gwq" = "stable=gws"
  # ))
  #
  # # "Merge" hrefs_id with hrefs_graph: Use hrefs_id if not NA else hrefs_graph
  # # and warn if both are given but different
  # hrefs <- parallel_non_na(hrefs_id, hrefs_graph)
  #
  # # Report about differing hrefs in the two columns
  # #print_invalid_hrefs(hrefs)

  # Prefix the wasserportal-related hyperlinks with the wasserportal base URL
  add_baseurl <- function(hrefs) {

  is_not_na <- !is_na_or_empty(hrefs)

  if(sum(is_not_na) > 0) {
  is_wasserportal <- startsWith(hrefs, "station.php") & is_not_na

  hrefs[is_wasserportal] <- sprintf(
    "%s/%s",
    url_wasserportal,
    hrefs[is_wasserportal]
  )
  } else {
   hrefs <- NA_character_
  }

  hrefs
  }

  overview_table[[column_graph]] <- add_baseurl(hrefs_graph)

  names(overview_table) <- names(overview_table) %>%
    stringr::str_remove_all("-") %>%
    subst_special_chars()


  dplyr::bind_cols(
    overview_table,
    tibble::tibble(stammdaten_link = add_baseurl(hrefs_id))
  )


}

# extract_hrefs ----------------------------------------------------------------
extract_hrefs <- function(x)
{
  hrefs <- rep(NA_character_, length(x))

  links <- rvest::html_node(x, "a")

  has_link <- !is.na(links)

  hrefs[has_link] <- rvest::html_attr(links[has_link], "href")

  hrefs
}

# print_invalid_hrefs ----------------------------------------------------------
print_invalid_hrefs <- function(hrefs)
{
  invalid <- attr(hrefs, "invalid")

  if (is.null(invalid)) {
    return()
  }

  message("There are different hrefs in column 1 and column 8 of the table.")
  print(invalid)
}
KWB-R/wasserportal documentation built on June 6, 2024, 10:26 a.m.