# R/adapter_CL_DGA.R
#
# Defines functions: timeseries.hydro_service_CL_DGA, .cl_fetch_q_daily,
# .cl_cr2_download_q_daily, stations.hydro_service_CL_DGA,
# .cl_dga_split_station_name, .cl_dga_dms_to_dd, .cl_dga_utm_to_wgs84,
# .cl_dga_read_stationlist, .cl_dga_stationlist_url,
# .cl_dga_stationlist_page, timeseries_parameters.hydro_service_CL_DGA,
# register_CL_DGA

# ==== Chile (DGA via CR2 Explorador) adapter =================================
# Stations:
#   - Official DGA station list (Excel) linked from:
#       https://dga.mop.gob.cl/estadisticas-estaciones-dga/
#     "Listado de estaciones activas de la Direcci\u00F3n General de Aguas"
#     Columns (per site description): c\u00F3digo, nombre, tipo de estaci\u00F3n, cuenca,
#     coordenadas.  We auto-detect columns by name and coerce the code column
#     to text to preserve leading zeros.
#
# Time series:
#   - Daily mean discharge compiled by CR2 (Caudales Medios Diarios - CR2),
#     exposed via Explorador Clim\u00E1tico:
#       https://explorador.cr2.cl
#     We use the same "request.php?options=..." pattern as the existing
#     `chile()` helper you showed, for `variable.id = "qflxDaily"`.
#   - We always download the full series for a gauge and then filter by
#     date range in R (like your `chile()` function).
#
# Notes:
#   - Adapter exposes only parameter = "water_discharge".
#   - Station metadata comes from DGA; time series from CR2 (which compiles
#     DGA + other official data).
#   - Daily data are returned with a `timestamp` at midnight UTC.

# -----------------------------------------------------------------------------
# Registration
# -----------------------------------------------------------------------------

#' @keywords internal
#' @noRd
register_CL_DGA <- function() {
  # Provider metadata for the Chilean DGA/CR2 adapter: throttled to a
  # single request per second against the CR2 Explorador endpoint and
  # unauthenticated.
  cfg <- list(
    provider_id   = "CL_DGA",
    provider_name = "Direcci\u00F3n General de Aguas (via CR2 Explorador)",
    country       = "Chile",
    base_url      = "https://explorador.cr2.cl",
    rate_cfg      = list(n = 1L, period = 1),  # 1 request / second
    auth          = list(type = "none")
  )
  do.call(register_service_usage, cfg)
}

#' @export
timeseries_parameters.hydro_service_CL_DGA <- function(x, ...) {
  # The adapter currently exposes a single parameter: daily mean discharge.
  "water_discharge"
}

# -----------------------------------------------------------------------------
# Station list (from DGA Excel)
# -----------------------------------------------------------------------------

.cl_dga_stationlist_page <- function() {
  # Single source of truth for the DGA "Estadisticas" landing page, which
  # links to the official station-list workbook.
  page_url <- "https://dga.mop.gob.cl/estadisticas-estaciones-dga/"
  page_url
}

.cl_dga_stationlist_url <- function() {
  # Scrape the DGA "Estadisticas" page and return the absolute URL of the
  # current station-list workbook (.xlsx). The page layout is not stable,
  # so we collect every .xlsx link and prefer ones whose path mentions the
  # station listing.
  #
  # Raises (rlang::abort) when the page cannot be fetched, cannot be
  # parsed, or contains no .xlsx link at all.
  page_url <- .cl_dga_stationlist_page()

  req <- httr2::request(page_url) |>
    httr2::req_user_agent(
      "hydrodownloadR (+https://github.com/your-org/hydrodownloadR)"
    )

  resp <- try(perform_request(req), silent = TRUE)
  if (inherits(resp, "try-error")) {
    rlang::abort("CL_DGA: failed to fetch Estadisticas page for station list.")
  }

  html <- try(
    xml2::read_html(httr2::resp_body_string(resp)),
    silent = TRUE
  )
  if (inherits(html, "try-error")) {
    rlang::abort("CL_DGA: could not parse Estadisticas HTML for station list.")
  }

  links <- rvest::html_elements(html, "a")
  hrefs <- trimws(rvest::html_attr(links, "href"))
  hrefs <- hrefs[!is.na(hrefs)]

  # Candidates: any link ending in .xlsx (optionally with a query string).
  xlsx <- hrefs[grepl("\\.xlsx($|\\?)", hrefs, ignore.case = TRUE)]

  if (!length(xlsx)) {
    rlang::abort(
      "CL_DGA: could not find any .xlsx link on Estadisticas page."
    )
  }

  # Prefer links that look like the national station listing. One
  # case-insensitive alternation replaces the previous chain of grepl()
  # calls, where "Estaciones" was already subsumed by "Estacion".
  preferred <- xlsx[
    grepl("Listado|Estacion|Nacional|Informe", xlsx, ignore.case = TRUE)
  ]

  cand <- if (length(preferred)) preferred else xlsx
  url  <- cand[1]

  # Resolve relative links against the page we scraped.
  if (!grepl("^https?://", url)) {
    url <- xml2::url_absolute(url, page_url)
  }
  url
}

.cl_dga_read_stationlist <- function() {
  # Download the DGA station-list workbook and read its first sheet.
  # The sheet starts with a report header, hence skip = 15; the station
  # code column is re-read as text so codes keep their leading zeros.
  #
  # Returns a data frame (as produced by readxl::read_xlsx).
  url <- .cl_dga_stationlist_url()

  tmp <- tempfile(fileext = ".xlsx")
  on.exit(unlink(tmp), add = TRUE)  # fix: don't leave the download behind

  req <- httr2::request(url) |>
    httr2::req_user_agent(
      "hydrodownloadR (+https://github.com/your-org/hydrodownloadR)"
    )

  resp <- try(perform_request(req), silent = TRUE)
  if (inherits(resp, "try-error")) {
    rlang::abort(paste("CL_DGA: failed to download station list from", url))
  }

  bin <- httr2::resp_body_raw(resp)
  writeBin(bin, tmp)

  # First pass with guessed types, just to locate the station code column
  # by name (e.g. "Estaci\u00F3n" or "C\u00F3digo").
  raw0 <- readxl::read_xlsx(tmp, sheet = 1, skip = 15)
  nm0  <- names(raw0)

  id_idx <- which(
    grepl("estaci", nm0, ignore.case = TRUE) |
      grepl("c[o\u00F3]digo", nm0, ignore.case = TRUE)
  )[1]

  if (is.na(id_idx)) {
    # Fallback: no recognizable id column, use the guessed-type read as-is.
    return(raw0)
  }

  # Second pass: force the id column to text to preserve leading zeros.
  col_types <- rep("guess", length(nm0))
  col_types[id_idx] <- "text"

  readxl::read_xlsx(
    tmp,
    sheet     = 1,
    skip      = 15,
    col_types = col_types
  )
}

# Datum = UTM zone (18, 19, ...); we map to EPSG:327xx (southern hemisphere).
.cl_dga_utm_to_wgs84 <- function(east, north, datum) {
  # Vectorised UTM -> WGS84 conversion. `datum` carries the UTM zone
  # number; all Chilean stations sit in the southern hemisphere, so the
  # EPSG code is 32700 + zone. Rows with unparseable or out-of-range
  # inputs come back as NA in both outputs.
  e <- suppressWarnings(as.numeric(east))
  n <- suppressWarnings(as.numeric(north))
  z <- suppressWarnings(as.integer(datum))

  lon <- rep(NA_real_, length(e))
  lat <- rep(NA_real_, length(e))

  valid <- !is.na(e) & !is.na(n) & !is.na(z) &
    z >= 1 & z <= 60

  if (any(valid)) {
    # Transform zone by zone: each UTM zone is a distinct CRS.
    for (zn in sort(unique(z[valid]))) {
      sel <- valid & z == zn
      if (!any(sel)) next

      sf_pts <- sf::st_as_sf(
        data.frame(E = e[sel], N = n[sel]),
        coords = c("E", "N"),
        crs    = 32700 + zn,  # WGS84 / UTM zone <zn>S
        remove = FALSE
      )
      xy <- sf::st_coordinates(sf::st_transform(sf_pts, 4326))

      lon[sel] <- xy[, "X"]
      lat[sel] <- xy[, "Y"]
    }
  }

  list(lon = lon, lat = lat)
}

.cl_dga_dms_to_dd <- function(x, is_lon = FALSE) {
  # Parse "degrees minutes seconds" strings (optionally suffixed with a
  # hemisphere letter: S, W, or O for "Oeste") into decimal degrees.
  # Because Chile lies entirely in the S/W hemispheres, any value still
  # positive after parsing is flipped negative.
  #
  # `is_lon` is kept for interface compatibility; the forced S/W sign
  # makes it unnecessary at the moment.
  if (is.null(x)) return(NA_real_)

  x <- trimws(as.character(x))
  x[x == ""] <- NA_character_

  res  <- rep(NA_real_, length(x))
  sign <- rep(1, length(x))

  # explicit hemisphere letters (if present)
  s_idx <- grepl("[Ss]$", x)
  w_idx <- grepl("[WwOo]$", x)  # O = Oeste
  sign[s_idx | w_idx] <- -1

  # Extract D / M / S groups
  m <- regexec("(-?\\d+)\\D+(\\d+)\\D+(\\d+(?:\\.\\d*)?)", x)
  parts <- regmatches(x, m)

  for (i in seq_along(parts)) {
    p <- parts[[i]]
    if (length(p) != 4) next

    d  <- suppressWarnings(as.numeric(p[2]))
    m_ <- suppressWarnings(as.numeric(p[3]))
    s  <- suppressWarnings(as.numeric(p[4]))
    if (is.na(d) || is.na(m_) || is.na(s)) next

    # Fix: with a negative degree field the minutes/seconds must be
    # subtracted, not added ("-33 27 0" is -33.45, not -32.55). Build the
    # magnitude first, then apply the sign from either the leading minus
    # or the hemisphere letter.
    mag <- abs(d) + m_ / 60 + s / 3600
    res[i] <- if (d < 0) -mag else mag * sign[i]
  }

  # Chile is S/W; if still positive, flip sign
  pos <- !is.na(res) & res > 0
  if (any(pos)) {
    res[pos] <- -res[pos]
  }

  res
}

.cl_dga_split_station_name <- function(estacion_col) {
  # Split full DGA station labels like "RIO CAMARONES EN CONANOXA" into a
  # river part ("RIO CAMARONES") and a location part ("CONANOXA"). Labels
  # that do not follow the "RIO ... EN ..." pattern keep the full label as
  # the location and get an NA river.
  full_name <- normalize_utf8(trimws(as.character(estacion_col)))
  river     <- rep(NA_character_, length(full_name))
  location  <- full_name

  # The split pattern captures river (lazy, up to the first " EN ") and
  # location; ÍI covers accented and plain spellings of "RIO".
  split_pat <- "^(R[\u00cdI]O\\s+.+?)\\s+EN\\s+(.+)$"
  hit <- grepl("R[\u00cdI]O\\s+.+\\s+EN\\s+.+", full_name, ignore.case = TRUE)

  if (any(hit)) {
    river[hit]    <- sub(split_pat, "\\1", full_name[hit], ignore.case = TRUE)
    location[hit] <- sub(split_pat, "\\2", full_name[hit], ignore.case = TRUE)
  }

  list(
    original = full_name,
    river    = river,
    location = location
  )
}


#' @export
stations.hydro_service_CL_DGA <- function(x, ...) {
  # Build the station table from the official DGA Excel listing.
  # Only fluviometric (river gauge) stations are kept; coordinates come
  # from the UTM columns when available, with the DMS Latitud/Longitud
  # columns as a fallback. Returns a tibble with one row per station.
  raw_tbl <- .cl_dga_read_stationlist()

  if (!NROW(raw_tbl)) {
    rlang::abort("CL_DGA: station list .xlsx appears to be empty.")
  }

  # --- filter by Tipo de Estaci\u00F3n -------------------------------------------
  tipo_col <- col_or_null(raw_tbl, "Tipo de Estaci\u00F3n")
  station_type <- NULL

  if (!is.null(tipo_col)) {
    tipo_chr <- normalize_utf8(trimws(as.character(tipo_col)))

    # only keep stations where the type string contains FLUVIOMETRICA
    has_fl <- grepl("FLUVIOMETRICA", tipo_chr, ignore.case = TRUE)

    keep_type <- has_fl

    raw_tbl      <- raw_tbl[keep_type, , drop = FALSE]
    station_type <- tipo_chr[keep_type]
  }

  if (!NROW(raw_tbl)) {
    # No stations match -> return an empty tibble. Fix: the column order
    # now matches the populated result below, so callers always see the
    # same schema regardless of how many stations matched.
    return(tibble::tibble(
      country               = x$country,
      provider_id           = x$provider_id,
      provider_name         = x$provider_name,
      station_id            = character(),
      station_name_original = character(),
      station_name          = character(),
      river                 = character(),
      lat                   = numeric(),
      lon                   = numeric(),
      area                  = numeric(),
      altitude              = numeric(),
      station_type          = character()
    ))
  }

  # if there was no tipo_col at all, but we still have rows:
  if (is.null(station_type)) {
    station_type <- rep(NA_character_, NROW(raw_tbl))
  }

  # --- columns via col_or_null() ---------------------------------------------

  # Station code (ID)
  station_id_col <- col_or_null(raw_tbl, "C\u00F3digo")

  # Station name (full DGA label, e.g. "RIO CAMARONES EN CONANOXA")
  estacion_col <- col_or_null(raw_tbl, "Estaci\u00F3n")

  # Coordinates (UTM and Datum)
  utm_n_col <- col_or_null(raw_tbl, "UTM WGS84 Norte(m)")
  utm_e_col <- col_or_null(raw_tbl, "UTM WGS84 Este(m)")
  datum_col <- col_or_null(raw_tbl, "Datum")

  # DMS coordinates
  lat_dms_col <- col_or_null(raw_tbl, "Latitud")
  lon_dms_col <- col_or_null(raw_tbl, "Longitud")

  # Altitude
  alt_col <- col_or_null(raw_tbl, "Altitud m.s.n.m")

  # --- basic cleaning --------------------------------------------------------

  station_id <- trimws(as.character(station_id_col))

  # Split station name into river + location, but keep the original label
  name_split <- .cl_dga_split_station_name(estacion_col)
  station_name_original <- name_split$original
  river                 <- name_split$river
  station_name          <- name_split$location

  utm_n   <- utm_n_col
  utm_e   <- utm_e_col
  datum   <- datum_col
  lat_dms <- lat_dms_col
  lon_dms <- lon_dms_col
  alt_raw <- alt_col

  # --- 1) primary: UTM to WGS84 ----------------------------------------------

  utm_coords <- .cl_dga_utm_to_wgs84(
    east  = utm_e,
    north = utm_n,
    datum = datum
  )
  lon_num <- utm_coords$lon
  lat_num <- utm_coords$lat

  # --- 2) fallback: DMS Latitud/Longitud ------------------------------------

  lat_dms_dd <- .cl_dga_dms_to_dd(lat_dms, is_lon = FALSE)
  lon_dms_dd <- .cl_dga_dms_to_dd(lon_dms, is_lon = TRUE)

  # Only fill coordinates that UTM conversion could not provide.
  missing_lat <- is.na(lat_num) & !is.na(lat_dms_dd)
  missing_lon <- is.na(lon_num) & !is.na(lon_dms_dd)

  lat_num[missing_lat] <- lat_dms_dd[missing_lat]
  lon_num[missing_lon] <- lon_dms_dd[missing_lon]

  altitude <- suppressWarnings(as.numeric(as.character(alt_raw)))

  # --- final tibble ----------------------------------------------------------

  out <- tibble::tibble(
    country               = x$country,
    provider_id           = x$provider_id,
    provider_name         = x$provider_name,
    station_id            = station_id,
    station_name_original = station_name_original, # full DGA label
    station_name          = station_name,          # location only
    river                 = river,                 # e.g. "RIO CAMARONES"
    lat                   = lat_num,
    lon                   = lon_num,
    area                  = NA_real_,              # not provided by the listing
    altitude              = altitude,
    station_type          = station_type           # only types with FLUVIOMETRICA
  )

  # Drop rows without a usable ID
  out <- out[!is.na(out$station_id) & nzchar(out$station_id), ]

  out
}

# -----------------------------------------------------------------------------
# Internal helpers for time series (CR2 Explorador)
# -----------------------------------------------------------------------------

.cl_cr2_download_q_daily <- function(site) {
  # Download the complete CR2 daily-discharge series for one gauge.
  # site: 8-digit DGA code as character, e.g. "01001002".
  #
  # The Explorador "request.php?options=..." endpoint takes a large
  # URL-encoded JSON blob; we splice the site code into a fixed request
  # asking for the full qflxDaily series exported as CSV, then fetch the
  # CSV whose URL is embedded in the response body.

  original <- paste0(
    "https://explorador.cr2.cl/request.php?options=",
    "{%22variable%22:{%22id%22:%22qflxDaily%22,%22var%22:%22caudal%22,",
    "%22intv%22:%22daily%22,%22season%22:%22year%22,%22stat%22:%22mean%22,",
    "%22minFrac%22:80},%22time%22:{%22start%22:-946771200,%22end%22:",
    "1631664000,%22months%22:%22A%C3%B1o%20completo%22},",
    "%22anomaly%22:{%22enabled%22:false,%22type%22:%22dif%22,",
    "%22rank%22:%22no%22,%22start_year%22:1980,%22end_year%22:2010,",
    "%22minFrac%22:70},%22map%22:{%22stat%22:%22mean%22,%22minFrac%22:10,",
    "%22borderColor%22:%227F7F7F%22,%22colorRamp%22:%22Jet%22,",
    "%22showNaN%22:false,%22limits%22:{%22range%22:[5,95],",
    "%22size%22:[4,12],%22type%22:%22prc%22}},%22series%22:{%22sites%22:[%22"
  )

  ending <- paste0(
    "%22],%22start%22:null,%22end%22:null},%22export%22:{%22map%22:",
    "%22Shapefile%22,%22series%22:%22CSV%22,%22view%22:{%22frame%22:",
    "%22Vista%20Actual%22,%22map%22:%22roadmap%22,%22clat%22:-18.0036,",
    "%22clon%22:-69.6331,%22zoom%22:5,%22width%22:461,%22height%22:2207}},",
    "%22action%22:[%22export_series%22]}"
  )

  website <- paste0(original, site, ending)

  # be polite with CR2 server
  Sys.sleep(0.25)

  s <- rvest::session(website)
  body_txt <- s |>
    rvest::html_element("body") |>
    rvest::html_text()

  # Extract the generated CSV's URL from the HTML-ish response.
  csv_url <- gsub("(.*)(https://.*)(\"}}})", "\\2", body_txt)

  outpath <- tempfile(fileext = ".csv")
  on.exit(unlink(outpath), add = TRUE)  # fix: remove the temp CSV when done
  utils::download.file(csv_url, outpath, quiet = TRUE)

  original_data <- readr::read_delim(outpath, show_col_types = FALSE)
  # Fix: strip *all* whitespace from header names; sub() only removed the
  # first run.
  names(original_data) <- gsub("\\s+", "", names(original_data))

  tibble::as_tibble(original_data)
}

.cl_fetch_q_daily <- function(x,
                              station_id,
                              rng,
                              mode) {
  # Fetch the CR2 daily-discharge series for one station and normalise it
  # into the package's long time-series layout. Returns an empty tibble
  # (with a warning) when the download fails or the CSV looks unexpected.
  site <- trimws(as.character(station_id))

  raw <- try(.cl_cr2_download_q_daily(site), silent = TRUE)
  if (inherits(raw, "try-error") || !NROW(raw)) {
    rlang::warn(paste0("CL_DGA/CR2: download failed for station ", site))
    return(tibble::tibble())
  }

  # CR2 CSVs carry year/month/day/value under Spanish column names.
  required_cols <- c("agno", "mes", "dia", "valor")
  if (!all(required_cols %in% names(raw))) {
    rlang::warn(paste0(
      "CL_DGA/CR2: unexpected columns for station ", site,
      " (missing one of: ", paste(required_cols, collapse = ", "), ")."
    ))
    return(tibble::tibble())
  }

  yy <- suppressWarnings(as.integer(raw$agno))
  mm <- suppressWarnings(as.integer(raw$mes))
  dd <- suppressWarnings(as.integer(raw$dia))

  obs_date  <- suppressWarnings(as.Date(sprintf("%04d-%02d-%02d", yy, mm, dd)))
  obs_value <- suppressWarnings(as.numeric(raw$valor))

  # Keep only rows with a parseable date and numeric value; in "range"
  # mode, additionally clip to the requested window.
  keep <- !is.na(obs_date) & !is.na(obs_value)
  if (identical(mode, "range")) {
    keep <- keep &
      obs_date >= rng$start_date &
      obs_date <= rng$end_date
  }

  if (!any(keep)) {
    return(tibble::tibble())
  }

  tibble::tibble(
    country       = x$country,
    provider_id   = x$provider_id,
    provider_name = x$provider_name,
    station_id    = site,
    parameter     = "water_discharge",
    # Daily values are stamped at midnight UTC.
    timestamp     = as.POSIXct(obs_date[keep], tz = "UTC"),
    value         = obs_value[keep],
    unit          = "m^3/s",
    quality_code  = NA_character_,
    source_url    = x$base_url
  )
}



#' @export
timeseries.hydro_service_CL_DGA <- function(x,
                                            parameter = c("water_discharge"),
                                            stations    = NULL,
                                            start_date  = NULL,
                                            end_date    = NULL,
                                            mode        = c("complete", "range"),
                                            exclude_quality = NULL,
                                            ...) {
  # Download daily mean discharge for the requested stations and return a
  # long tibble (one row per station/day).
  #
  # Fix: match.arg() with a single choice already guarantees
  # parameter == "water_discharge" (anything else errors), so the previous
  # explicit abort() for other values was unreachable and is removed.
  parameter <- match.arg(parameter)
  mode      <- match.arg(mode)

  # NOTE(review): exclude_quality is accepted for interface consistency but
  # unused here -- this adapter sets quality_code to NA for all rows.

  rng <- resolve_dates(mode, start_date, end_date)

  # --------------------------------------------------------------------------
  # station_id vector
  # - default: only stations with station_type containing "FLUVIOMETRICA"
  # - if stations are given explicitly, use them as-is
  # --------------------------------------------------------------------------
  if (is.null(stations)) {
    st <- stations.hydro_service_CL_DGA(x)

    if ("station_type" %in% names(st)) {
      is_fl <- grepl("FLUVIOMETRICA", st$station_type, ignore.case = TRUE)
      st    <- st[is_fl & !is.na(st$station_id), , drop = FALSE]
    }

    station_vec <- st$station_id
  } else {
    station_vec <- stations
  }

  station_vec <- unique(trimws(as.character(station_vec)))
  if (!length(station_vec)) {
    return(tibble::tibble())
  }

  # batching + rate limit -----------------------------------------------------
  # Batches only drive progress-bar granularity; downloads happen one
  # station at a time through the rate-limited fetcher below.

  batches <- chunk_vec(station_vec, 20L)

  pb <- progress::progress_bar$new(
    total  = length(batches),
    format = "CL_DGA [:bar] :current/:total (:percent) eta: :eta"
  )

  fetch_one <- function(st_id) {
    .cl_fetch_q_daily(
      x          = x,
      station_id = st_id,
      rng        = rng,
      mode       = mode
    )
  }

  # Throttle per the registered rate config (default: 1 request / second).
  limited <- ratelimitr::limit_rate(
    fetch_one,
    rate = ratelimitr::rate(
      n      = x$rate_cfg$n %||% 1L,
      period = x$rate_cfg$period %||% 1
    )
  )

  out <- lapply(batches, function(batch) {
    pb$tick()
    dplyr::bind_rows(lapply(batch, limited))
  })

  dplyr::bind_rows(out)
}

# Try the hydrodownloadR package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# hydrodownloadR documentation built on Feb. 25, 2026, 5:08 p.m.