knitr::opts_chunk$set(
  collapse = TRUE,
  eval = TRUE,
  comment = "#>"
)

chk_online <- FALSE
chk_stations <- FALSE
chk_ts <- FALSE

Introduction

This vignette shows how to create a tidy data frame for the stations' and time series' data from Hydroscope. We will use the Hydroscope's sub-domains kyy, ypaat and emy, because their servers are maintened by the National Technical University Of Athens and work seamlessly.

This vignette requires an internet connection to run and that the Hydroscope's subdomains to be online. We can check if the sub-domains are online using the pingr package:

library(pingr)

# helper function to check if a sub-domain is online
online <- function(h_url){
  !is.na(pingr::ping(h_url, count = 1))
}

# check if sub-domains are online
kyy   <- online("kyy.hydroscope.gr")
emy   <- online("emy.hydroscope.gr")
ypaat <- online("ypaat.hydroscope.gr")
chk_online <- kyy & emy & ypaat
print(paste("All sub-domains are online:", chk_online))

Note that the following chucks will be evaluated only if all sub-domains are online and if the retrieved data have some expected variables. Downloading will take some time, depending on your internet connection and the Hydroscope's servers traffic.

Download data

With the following code we can download Hydroscope's sub-domains' data to a named list.

library(hydroscoper)
library(tibble)
library(plyr)

subdomain <- c("kyy", "ypaat", "emy")

# download all databases
hydro_data <- lapply(subdomain, function(x) get_database(x))
names(hydro_data) <- subdomain

Stations

First of all, we must check if there are specific variables at the sub-domain's stations' data, because we will merge these data to a new data frame.

# check if all expected variables exist in all subdomains stations data
vars <- c("id", "name", "owner", "point", "altitude", "water_basin",
          "water_division")
chk1 <- laply(subdomain, function(id) {
    all(vars %in% names(hydro_data[[id]]$stations))
  })

# check water_basin, water_division and owner variables
vars2 <- c("id", "name")
chk2 <- laply(subdomain, function(id) { 
  all((vars2 %in% names(hydro_data[[id]]$water_basin)) &
        (vars2 %in% names(hydro_data[[id]]$water_division)) &
        (vars2 %in% names(hydro_data[[id]]$owner))
  )})

chk_stations <- all(chk1) & all(chk2)
print(paste("All expected variables exist in stations' related data:",
            chk_stations))

With the following code we will create a tidy data frame with the stations data. Note that the Greek terms are translated to English.

stations <- ldply(subdomain, function(id){

  tmp <-  hydro_data[[id]]

  # extract dataframes to join
  wbas <- tmp$water_basins[c("id", "name")]
  wdiv <- tmp$water_divisions[c("id", "name")]
  owners <- tmp$owners[c("id", "name")]

  # remove area from water_basin
  wbas$name <- laply(wbas$name, function(y) gsub("\\([^()]*\\)", "", y))

  # translate names
  wdiv$name <- hydro_translate(wdiv$name, "division")
  owners$name <- hydro_translate(owners$name, "owner")

  # rename dataframes variables
  names(wbas) <- c("water_basin", "water_basin_name")
  names(wdiv) <- c("water_division", "water_division_name")
  names(owners) <- c("owner", "owner_name")

  # merge data
  res <- merge(tmp$stations, wbas, by = "water_basin", all.x = TRUE)
  res <- merge(res, wdiv, by = "water_division", all.x = TRUE)
  res <- merge(res, owners, by = "owner", all.x = TRUE)

  # create coords
  coords <- hydro_coords(res$point)

  # create a data frame with all the data
  data.frame(station_id = as.integer(res$id),
             name = as.character(res$name),
             water_basin = res$water_basin_name,
             water_division = res$water_division_name,
             owner = res$owner_name,
             longitude = as.numeric(coords$long),
             latitude = as.numeric(coords$lat),
             altitude = as.numeric(res$altitude),
             subdomain = rep(id, nrow(res)),
             stringsAsFactors = FALSE)

})
stations <- as_tibble(stations)
stations

Time series

Similarly, we will check if there are specific variables at each sub-domain's time series' data.

# check if variables exists in timeseries data
vart <- c("id", "gentity", "start_date_utc", "end_date_utc", "variable",
          "time_step", "unit_of_measurement")
chk3 <- laply(subdomain, function(id) { 
  all(vart %in% names(hydro_data[[id]]$timeseries))
  })

# check if variables exists in hydrometeorological variables and timesteps
vart2 <- c("id", "descr")
chk4 <- laply(subdomain, function(id) {
  all(vart2 %in% names(hydro_data[[id]]$variables)) &
  all(vart2 %in% names(hydro_data[[id]]$time_steps))
})

# check if variables exists in unit_of_measurement
vart3 <- c("id", "symbol")
chk5 <- laply(subdomain, function(id) {
  all(vart3 %in% names(hydro_data[[id]]$units))
})

chk_ts <- all(chk3) & all(chk4) & all(chk5)
print(paste("All expected variables exist in time series' data:",
            chk_ts))

With the following code we will, also, create a tidy data frame with the time series data with translated terms.

# create stations' dataframe
timeseries <- ldply(subdomain, function(id){

  tmp <- hydro_data[[id]]

  # create data frames to join
  var_names <- tmp$variables[c("id", "descr")]
  ts_names <-  tmp$time_steps[c("id", "descr")]
  ts_units <-tmp$units[c("id", "symbol")]

  # translate names
  var_names$descr <- hydro_translate(var_names$descr, "variable")
  ts_names$descr <- hydro_translate(ts_names$descr, "timestep")

  # rename dataframes variables
  names(var_names) <- c("variable", "variable_name")
  names(ts_names) <- c("time_step", "time_step_name")
  names(ts_units) <- c("unit_of_measurement", "unit_of_measurement_name")

  # merge data
  res <- merge(tmp$timeseries, var_names, by = "variable", all.x = TRUE)
  res <- merge(res, ts_names, by = "time_step", all.x = TRUE)
  res <- merge(res, ts_units, by = "unit_of_measurement", all.x = TRUE)

  data.frame(
    "time_id" = as.integer(res$id),
    "station_id" = as.integer(res$gentity),
    "variable" = as.character(res$variable_name),
    "timestep" = as.character(res$time_step_name),
    "units" = as.character(res$unit_of_measurement_name),
    "start_date" = as.character(res$start_date_utc),
    "end_date" = as.character(res$end_date_utc),
    "subdomain" = rep(id, nrow(res)), 
    stringsAsFactors = FALSE)
})
timeseries <- as_tibble(timeseries)
timeseries


ropensci/hydroscoper documentation built on July 20, 2024, 2:06 p.m.