R/aemet_helpers.R

Defines functions .aemet_monthly_yearly_carpentry .aemet_daily_carpentry .aemet_current_day_carpentry .get_data_aemet .get_info_aemet .check_status_aemet .create_aemet_path

#' Create the path elements for AEMET API
#'
#' Path vectors for AEMET API to use with httr::GET
#'
#' @param api_options Option list as generated by \link{\code{aemet_options}}
#'
#' @noRd
.create_aemet_path <- function(api_options) {

  # we need the resolution to create the corresponding path
  resolution <- api_options$resolution
  # we need to transform the dates to the character string specific format for the AEMET path.
  # We will use a stamp function:
  aemet_stamp <- lubridate::stamp("2020-12-25T00:00:00UTC", orders = "YOmdHMS", quiet = TRUE)

  # current day
  if (resolution == 'current_day') {
    return(c('opendata', 'api', 'observacion', 'convencional', 'todas'))
  }

  # daily
  if (resolution == 'daily') {
    return(
      c(
        'opendata', 'api', 'valores', 'climatologicos', 'diarios', 'datos',
        'fechaini', aemet_stamp(api_options$start_date),
        'fechafin', aemet_stamp(api_options$end_date),
        'todasestaciones'
      )
    )
  }

  # monthly
  # monthly API does not work for now
  if (resolution %in% c('monthly', 'yearly')) {
    # stop if stations is null. For monthly API, one and only one station must be provided
    if (length(api_options$stations) < 1) {
      cli::cli_abort(c(
        "AEMET API for monthly/yearly aggregated values needs one station provided"
      ))
    }

    # issue a warning if more than one station is provided
    if (length(api_options$stations) > 1) {
      cli::cli_warn(c(
        "AEMET API for monthly/yearly aggregated values only accepts one station per query.\n",
        "Only the first station provided ({.val {api_options$stations[1]}}) will be used."
      ))
    }

    return(
      c(
        'opendata', 'api', 'valores', 'climatologicos', 'mensualesanuales', 'datos',
        'anioini', lubridate::year(api_options$start_date), 'aniofin', lubridate::year(api_options$end_date),
        'estacion', api_options$stations[1]
      )
    )
  }

  # not recognised resolution
  cli::cli_abort(c(
    "{.arg {api_options$resolution}} is not a valid temporal resolution for AEMET. Please see aemet_options help for more information"
  ))
}

#' Check status and errors for AEMET
#'
#' Check status and erros for AEMET
#'
#' In the AEMET API we have to do a lot of GETs and checking the status of the response before
#' parsing it to avoid errors (html responses instead of json due to limits, among other things).
#' Why this? Easy, AEMET API does not return data and metadata directly, but a link to a transitory
#' page with the metadata and another link to a transitory page with the data. This means that for
#' a normal daily query the process is the following:
#' \itemize{
#'   \item{1. GET to the correct daily path}
#'   \item{2. Check statuses and response}
#'   \item{3. GET to the data link}
#'   \item{4. Check statuses and response}
#'   \item{5. GET to the metadata link}
#'   \item{6. Check statuses and response}
#'   \item{7. GET to the stations info, to get coords, internally using .get_info helper}
#'   \item{8. Check statuse and response}
#'   \item{9. GET to the stations info data link}
#'   \item{10. Check statuses and response}
#'   \item{11. GET to the stations info metadata link}
#'   \item{12. Check statuses and response}
#' }
#' This makes 6 calls, each of one has the code to check the statuses, because checking only the
#' main call does not avoid getting errors due to limits in the following calls to data or metadata.
#' This means, 6 times repeating the code for checking the statuses and changing lines in 6 places
#' when trying to debug or change something.
#'
#' @section Rationale:
#' So, the rationale is the following, a function accepting \code{...} that will be passed to
#' httr::GET, and internally checking statuses. If error, return the error code and the message.
#' In the main function, if the error returned is API limit, wait 60 seconds, if other, stop
#' and give the correct message.
#'
#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Arguments for httr::GET
#'
#' @noRd
.check_status_aemet <- function(...) {

  # GET step
  api_response <- safe_api_access(type = 'rest', ...)
  # api_response <- httr::GET(...)
  response_status <- httr::status_code(api_response)

  # and now the status checks
  if (response_status == 404) {
    res <- list(
      status = 'Error',
      code = response_status,
      message = glue::glue(
        "Unable to connect to AEMET API at {api_response$url}: {httr::http_status(api_response)$message}"
      )
    )
    return(res)
  }

  if (response_status == 401) {
    res <- list(
      status = 'Error',
      code = response_status,
      message = glue::glue("Invalid API Key: {httr::http_status(api_response)$message}")
    )
    return(res)
  }

  if (response_status == 429) {
    res <- list(
      status = 'Error',
      code = response_status,
      message = glue::glue(
        "API request limit reached: {httr::http_status(api_response)$message}"
      )
    )
    return(res)
  }

  # the content check
  if (stringr::str_detect(httr::http_type(api_response), "html")) {

    # uupps, an html was returned, instead of a json or plain text.
    html_text <- httr::content(api_response, 'text')

    # this can be an html with the infamous hidden 429 error, so we check that
    if (stringr::str_detect(html_text, "429 Too Many Requests")) {
      res <- list(
        status = 'Error',
        code = 429,
        message = "API request limit reached, taking a cooldown of 60 seconds to reset."
      )
    } else {
      res <- list(
        status = 'Error',
        code =  response_status,
        message = glue::glue("AEMET API returned an error: {html_text}")
      )
    }

    return(res)
  }

  # Now, finally, we are able to access the response content (because now we are sure is json)
  response_content <- jsonlite::fromJSON(httr::content(api_response, as = 'text', encoding = 'ISO-8859-15'))

  # the last check, even with 200 code (ok) it can be no data
  if (!rlang::is_null(response_content$estado) && response_content$estado != 200) {
    res <- list(
      status = 'Error',
      code = response_content$estado,
      message = glue::glue("AEMET API returned no data: {response_content$descripcion}")
    )
    return(res)
  }

  # If we reach here, is because everything went well
  res <- list(
    status = 'OK',
    code = response_status,
    message = "Data received",
    content = response_content
  )

  return(res)
}

#' Get info for the aemet stations
#'
#' Get info for the aemet stations
#'
#' @noRd

.get_info_aemet <- function(api_options) {
  # path
  path_resolution <- c(
    'opendata', 'api', 'valores', 'climatologicos', 'inventarioestaciones', 'todasestaciones'
  )

  # create httr config to execute only if in linux, due to the ubuntu 20.04 update to seclevel 2
  config_httr_aemet <- switch(
    Sys.info()["sysname"],
    'Linux' = httr::config(ssl_cipher_list = 'DEFAULT@SECLEVEL=1'),
    httr::config()
  )

  # Status check ------------------------------------------------------------------------------------------
  # now we need to check the status of the response (general status), and the status of the AEMET (specific
  # query status). They can differ, as you can reach succesfully AEMET API (200) but the response can be
  # empty due to errors in the dates or stations (404) or simply the api key is incorrect (xxx).
  # This is done with .check_status_aemet helper, which return a list with the status, and if success the
  # content parsed already
  api_status_check <- .check_status_aemet(
    "https://opendata.aemet.es",
    httr::add_headers(api_key = api_options$api_key),
    path = path_resolution,
    httr::user_agent('https://github.com/emf-creaf/meteospain'),
    config = config_httr_aemet
  )

  if (api_status_check$status != 'OK') {
    # if api request limit reached, do a recursive call to the function after 60 seconds
    if (api_status_check$code == 429) {
      return(.manage_429_errors(api_status_check, api_options, .get_info_aemet))
    } else {
      cli::cli_abort(c(
        x = api_status_check$code,
        i = api_status_check$message
      ))
    }
  }

  response_content <- api_status_check$content

  # Response data and metadata ----------------------------------------------------------------------------
  # Now, as stated in the .check_status_aemet rationale, we need to access data (in this case we don't need
  # metadata)
  stations_info_check <- .check_status_aemet(
    response_content$datos,
    httr::user_agent('https://github.com/emf-creaf/meteospain'),
    config = config_httr_aemet
  )

  if (stations_info_check$status != 'OK') {
    # if api request limit reached, do a recursive call to the function after 60 seconds
    if (stations_info_check$code == 429) {
      return(.manage_429_errors(api_status_check, api_options, .get_info_aemet))
    } else {
      cli::cli_abort(c(
        x = stations_info_check$code,
        i = stations_info_check$message
      ))
    }
  }

  # Data transformation ----------------------------------------------------------------------------------
  # We can finally take the station info data frame and do the necessary transformations
  stations_info_check$content |>
    dplyr::as_tibble() |>
    # add service name, to identify the data if joining with other services
    dplyr::mutate(service = 'aemet') |>
    dplyr::select(
      "service", station_id = "indicativo", station_name = "nombre",
      station_province = "provincia", altitude = "altitud", latitude = "latitud",
      longitude = "longitud"
    ) |>
    # latitude and longitude are in strings with the cardinal letter. We need to transform that to numeric
    # and negative when S or W.
    dplyr::mutate(
      altitude = as.numeric(stringr::str_replace_all(.data$altitude, ',', '.')),
      altitude = units::set_units(.data$altitude, "m"),
      latitude = .aemet_coords_generator(.data$latitude),
      longitude = .aemet_coords_generator(.data$longitude)
    ) |>
    sf::st_as_sf(coords = c('longitude', 'latitude'), crs = 4326)

}

#' Get data from AEMET
#'
#' Get data from AEMET service
#'
#' For current_day and daily, there is no need of supply the stations_id in the query,
#' as the data is not so big. So, in case of stations provided, we can filter later, after getting
#' the data. This also has the advantage of using only one query, reducing the probability of reaching
#' the API limit per minute or total.
#'
#' @param api_options Option list as generated by \link{\code{aemet_options}}
#'
#' @noRd
.get_data_aemet <- function(api_options) {
  # All necessary things for the GET ----------------------------------------------------------------------
  # create api path
  path_resolution <- .create_aemet_path(api_options)

  # create httr config to execute only if in linux, due to the ubuntu 20.04 update to seclevel 2
  config_httr_aemet <- switch(
    Sys.info()["sysname"],
    'Linux' = httr::config(ssl_cipher_list = 'DEFAULT@SECLEVEL=1'),
    httr::config()
  )

  # GET and Status check ------------------------------------------------------------------------------------------
  # now we need to check the status of the response (general status), and the status of the AEMET (specific
  # query status). They can differ, as you can reach succesfully AEMET API (200) but the response can be
  # empty due to errors in the dates or stations (404) or simply the api key is incorrect (xxx).
  # This is done with .check_status_aemet helper, which return a list with the status, and if success the
  # content parsed already
  api_status_check <- .check_status_aemet(
    "https://opendata.aemet.es",
    httr::add_headers(api_key = api_options$api_key),
    path = path_resolution,
    httr::user_agent('https://github.com/emf-creaf/meteospain'),
    config = config_httr_aemet
  )

  if (api_status_check$status != 'OK') {
    # if api request limit reached, do a recursive call to the function after 60 seconds
    if (api_status_check$code == 429) {
      return(.manage_429_errors(api_status_check, api_options, .get_data_aemet))
    } else {
      cli::cli_abort(c(
        x = api_status_check$code,
        i = api_status_check$message
      ))
    }
  }

  response_content <- api_status_check$content

  # Response data and metadata ----------------------------------------------------------------------------
  # Now, as stated in the .check_status_aemet rationale, we need to access data and metadata
  stations_data_check <- .check_status_aemet(
    response_content$datos,
    httr::user_agent('https://github.com/emf-creaf/meteospain'),
    config = config_httr_aemet
  )

  if (stations_data_check$status != 'OK') {
    # if api request limit reached, do a recursive call to the function after 60 seconds
    if (stations_data_check$code == 429) {
      return(.manage_429_errors(api_status_check, api_options, .get_data_aemet))
    } else {
      cli::cli_abort(c(
        x = stations_data_check$code,
        i = stations_data_check$message
      ))
    }
  }

  stations_metadata_check <- .check_status_aemet(
    response_content$metadatos,
    httr::user_agent('https://github.com/emf-creaf/meteospain'),
    config = config_httr_aemet
  )

  if (stations_metadata_check$status != 'OK') {
    # if api request limit reached, do a recursive call to the function after 60 seconds
    if (stations_metadata_check$code == 429) {
      return(.manage_429_errors(api_status_check, api_options, .get_data_aemet))
    } else {
      cli::cli_abort(c(
        x = stations_metadata_check$code,
        i = stations_metadata_check$message
      ))
    }
  }
  # We also need the stations info
  stations_info <- .get_info_aemet(api_options)

  # Filter expression for stations ------------------------------------------------------------------------
  # In case stations were supplied, we need also to filter them
  filter_expression <- TRUE
  # update filter if there is stations supplied, but not for monthly. In monthly only one
  # station must be used, so the filtering is unnecesary
  if (!rlang::is_null(api_options$stations)) {
    filter_expression <- switch(
      api_options$resolution,
      'current_day' = rlang::expr(.data$idema %in% api_options$stations),
      'daily' = rlang::expr(.data$indicativo %in% api_options$stations),
      'monthly' = TRUE,
      'yearly' = TRUE
    )
  }

  # Resolution specific carpentry -------------------------------------------------------------------------
  # Now, current day and daily have differences, in the names of the variables and also
  # in the need to join the stations data to offer coords. We can branch the code with ifs, repeating the
  # common steps in the data carpentry or we can create the specific functions and have only one common pipe.
  resolution_specific_carpentry <- switch(
    api_options$resolution,
    'current_day' = .aemet_current_day_carpentry,
    'daily' = .aemet_daily_carpentry,
    'monthly' = .aemet_monthly_yearly_carpentry,
    'yearly' = .aemet_monthly_yearly_carpentry
  )

  # Data transformation -----------------------------------------------------------------------------------
  res <- stations_data_check$content |>
    dplyr::as_tibble() |>
    # remove unwanted stations
    dplyr::filter(!! filter_expression) |>
    # apply the resolution-specific transformations
    resolution_specific_carpentry(stations_info, resolution = api_options$resolution) |>
    # arrange data
    dplyr::arrange(.data$timestamp, .data$station_id) |>
    # reorder variables to be consistent among all services
    relocate_vars() |>
    # ensure we have an sf
    sf::st_as_sf()


  # Check if any stations were returned -------------------------------------------------------------------
  if ((!is.null(api_options$stations)) & nrow(res) < 1) {
    cli::cli_abort(c(
      x = "Station(s) provided have no data for the dates selected.",
      "Available stations with data for the actual query are:",
      glue::glue_collapse(
        c(unique(stations_data_check$content$indicativo), unique(stations_data_check$content$idema)),
        sep = ', ', last = ' and '
      )
    ))
  }

  # Copyright message -------------------------------------------------------------------------------------
  cli::cli_inform(c(
    i = copyright_style(stations_metadata_check$content$copyright),
    legal_note_style(stations_metadata_check$content$notaLegal)
  ))

  # Return ------------------------------------------------------------------------------------------------
  return(res)
}


# resolution_specific_carpentry -------------------------------------------------------------------------
.aemet_current_day_carpentry <- function(data, stations_info, ...) {
  data |>
    dplyr::select(dplyr::any_of(c(
      timestamp = "fint", station_id = "idema", station_name = "ubi",
      altitude = "alt",
      temperature = "ta",
      min_temperature = "tamin",
      max_temperature = "tamax",
      relative_humidity = "hr",
      precipitation = "prec",
      wind_speed = "vv",
      wind_direction = "dv",
      insolation = "inso",
      longitude = "lon", latitude = "lat"
    ))) |>
    # create any variable missing
    .create_missing_vars(
      var_names = c(
        "temperature", "min_temperature", "max_temperature", "relative_humidity",
        "precipitation", "wind_speed", "wind_direction", "insolation"
      )
    ) |>
    # units
    dplyr::mutate(
      service = 'aemet',
      timestamp = lubridate::as_datetime(.data$timestamp),
      altitude = units::set_units(.data$altitude, "m"),
      temperature = units::set_units(.data$temperature, "degree_C"),
      min_temperature = units::set_units(.data$min_temperature, "degree_C"),
      max_temperature = units::set_units(.data$max_temperature, "degree_C"),
      relative_humidity = units::set_units(.data$relative_humidity, "%"),
      precipitation = units::set_units(.data$precipitation, "L/m^2"),
      wind_speed = units::set_units(.data$wind_speed, "m/s"),
      wind_direction = units::set_units(.data$wind_direction, "degree"),
      insolation = units::set_units(.data$insolation, "hours")
    ) |>
    dplyr::left_join(stations_info, by = c('service', 'station_id', 'station_name', 'altitude')) |>
    sf::st_as_sf(coords = c('longitude', 'latitude'), crs = 4326)
}

.aemet_daily_carpentry <- function(data, stations_info, ...) {
  data |>
    dplyr::select(dplyr::any_of(c(
      timestamp = "fecha",
      station_id = "indicativo", station_name = "nombre", station_province = "provincia",
      mean_temperature = "tmed",
      min_temperature = "tmin",
      max_temperature = "tmax",
      precipitation = "prec",
      mean_wind_speed = "velmedia",
      # wind_direction = "dir",
      insolation = "sol"
    ))) |>
    # create any variable missing
    .create_missing_vars(
      var_names = c(
        "mean_temperature", "min_temperature", "max_temperature",
        "precipitation", "mean_wind_speed", "insolation"
      )
    ) |>
    # variables are characters, with "," as decimal point, so....
    dplyr::mutate(
      service = 'aemet',
      timestamp = lubridate::as_datetime(.data$timestamp),
      mean_temperature = as.numeric(stringr::str_replace_all(.data$mean_temperature, ',', '.')),
      min_temperature = as.numeric(stringr::str_replace_all(.data$min_temperature, ',', '.')),
      max_temperature = as.numeric(stringr::str_replace_all(.data$max_temperature, ',', '.')),
      precipitation = suppressWarnings(as.numeric(stringr::str_replace_all(.data$precipitation, ',', '.'))),
      mean_wind_speed = as.numeric(stringr::str_replace_all(.data$mean_wind_speed, ',', '.')),
      # wind_direction = as.numeric(stringr::str_replace_all(.data$wind_direction, ',', '.')),
      insolation = as.numeric(stringr::str_replace_all(.data$insolation, ',', '.')),
      # and set the units also
      mean_temperature = units::set_units(.data$mean_temperature, "degree_C"),
      min_temperature = units::set_units(.data$min_temperature, "degree_C"),
      max_temperature = units::set_units(.data$max_temperature, "degree_C"),
      precipitation = units::set_units(.data$precipitation, "L/m^2"),
      mean_wind_speed = units::set_units(.data$mean_wind_speed, "m/s"),
      # wind_direction = units::set_units(.data$wind_direction, degree),
      insolation = units::set_units(.data$insolation, "h")
    ) |>
    dplyr::left_join(stations_info, by = c('service', 'station_id', 'station_name', 'station_province'))
}

.aemet_monthly_yearly_carpentry <- function(data, stations_info, resolution) {

  # resolution depending negate argument
  negate_filter <- FALSE
  if (resolution == "monthly") {
    negate_filter <- TRUE
  }
  # data carpentry
  data |>
    dplyr::select(dplyr::any_of(c(
          timestamp = "fecha",
          station_id = "indicativo",
          # temperatures
          mean_temperature = "tm_mes",
          mean_min_temperature = "tm_min",
          mean_max_temperature = "tm_max",
          # rh
          mean_relative_humidity = "hr",
          # precipitation
          total_precipitation = "p_mes",
          days_precipitation = "np_001",
          # wind
          mean_wind_speed = "w_med",
          # radiation
          mean_insolation = "inso",
          mean_global_radiation = "glo"
    ))) |>
    # remove yearly or monthly values, depending on resolution
    dplyr::filter(
      stringr::str_detect(.data$timestamp, "-13", negate = negate_filter)
    ) |>
    # remove any "-13" for yearly values (if monthly, this step dont do anything), for
    # the timestamp parsing to work
    dplyr::mutate(
      timestamp = stringr::str_remove(.data$timestamp, "-13")
    ) |>
    # create any variable missing
    .create_missing_vars(
      var_names = c(
        "mean_temperature", "mean_min_temperature", "mean_max_temperature",
        "mean_relative_humidity", "total_precipitation", "days_precipitation",
        "mean_wind_speed", "mean_insolation", "mean_global_radiation"
      )
    ) |>
    # timestamp has to be parsed, "ym" for monthly values, "y" for yearly, and
    # variables are characters, with "," as decimal point, so....
    dplyr::mutate(
      service = 'aemet',
      timestamp = lubridate::parse_date_time(.data$timestamp, orders = c("ym", "y")),
      mean_temperature = as.numeric(stringr::str_replace_all(.data$mean_temperature, ',', '.')),
      mean_min_temperature = as.numeric(stringr::str_replace_all(.data$mean_min_temperature, ',', '.')),
      mean_max_temperature = as.numeric(stringr::str_replace_all(.data$mean_max_temperature, ',', '.')),
      total_precipitation = suppressWarnings(as.numeric(stringr::str_replace_all(.data$total_precipitation, ',', '.'))),
      mean_wind_speed = as.numeric(stringr::str_replace_all(.data$mean_wind_speed, ',', '.')),
      mean_relative_humidity = as.numeric(stringr::str_replace_all(.data$mean_relative_humidity, ',', '.')),
      days_precipitation = as.numeric(stringr::str_replace_all(.data$days_precipitation, ',', '.')),
      mean_insolation = as.numeric(stringr::str_replace_all(.data$mean_insolation, ',', '.')),
      # global radiation is in 10*kJ/m2, so we multiply by 10 to set the units later to kJ/m2
      mean_global_radiation = 10*as.numeric(stringr::str_replace_all(.data$mean_global_radiation, ',', '.')),
      # and set the units also
      mean_temperature = units::set_units(.data$mean_temperature, "degree_C"),
      mean_min_temperature = units::set_units(.data$mean_min_temperature, "degree_C"),
      mean_max_temperature = units::set_units(.data$mean_max_temperature, "degree_C"),
      total_precipitation = units::set_units(.data$total_precipitation, "L/m^2"),
      mean_wind_speed = units::set_units(.data$mean_wind_speed, "km/h"),
      mean_relative_humidity = units::set_units(.data$mean_relative_humidity, "%"),
      days_precipitation = units::set_units(.data$days_precipitation, "days"),
      mean_insolation = units::set_units(.data$mean_insolation, "hours"),
      mean_global_radiation = units::set_units(.data$mean_global_radiation, "kJ/m^2")
    ) |>
    dplyr::left_join(stations_info, by = c('service', 'station_id'))
}

Try the meteospain package in your browser

Any scripts or data that you put into this service are public.

meteospain documentation built on May 29, 2024, 1:59 a.m.