# R/lake_sources.R
#
# Defines functions: assign_sites_to_lakes, load_lake_file, standardize_osm_sf,
# download_lake_osm, download_lake_osm_single, query_osm_by_name,
# extract_osm_polys, cluster_sites, get_lake_boundary
#
# Documented in: assign_sites_to_lakes, get_lake_boundary

# ==============================================================================
# Lake Boundary Acquisition Functions
# ==============================================================================

#' Get Lake Boundary
#'
#' Get lake boundary polygon(s) either from OpenStreetMap or from a local file.
#'
#' @param sites A data.frame with latitude and longitude columns, or an sf object.
#' @param file Optional file path to a shapefile or geopackage with lake boundaries.
#'
#' @return A list with elements:
#'   \item{all_lakes}{sf object with lake polygons in UTM projection}
#'   \item{sites}{sf object with site points in UTM projection}
#'   \item{utm_epsg}{EPSG code for the UTM projection used}
#'
#' @details
#' If \code{file} is provided, the lake boundary is loaded from the file.
#' Otherwise, the function downloads lake boundaries from OpenStreetMap
#' based on the bounding box of the provided sites.
#'
#' @examples
#' \donttest{
#' csv_path <- system.file("extdata", "sample_sites.csv", package = "lakefetch")
#' sites <- load_sites(csv_path)
#' lake_data <- get_lake_boundary(sites)
#' }
#'
#' @export
get_lake_boundary <- function(sites, file = NULL) {
  # Delegate to the local-file loader when a boundary file is supplied;
  # otherwise download lake polygons from OpenStreetMap around the sites.
  if (is.null(file)) {
    download_lake_osm(sites)
  } else {
    load_lake_file(sites, file)
  }
}

#' Group Sites into Spatial Clusters
#'
#' Groups sites by proximity using a grid-based approach. Sites within 0.1
#' degrees (~11 km) of each other are placed in the same cluster.
#'
#' @param sites_sf sf object with site points in WGS84
#' @return List of integer vectors, each containing row indices for one cluster
#' @noRd
cluster_sites <- function(sites_sf) {
  grid_size <- 0.1  # cell width in degrees (~11 km at the equator)

  # Bin each site into a 0.1-degree grid cell; sites sharing a cell form one
  # cluster. NOTE(review): this is grid binning, not distance clustering --
  # two sites closer than 0.1 degrees can still fall in adjacent cells.
  xy <- sf::st_coordinates(sites_sf)
  cell_id <- paste(floor(xy[, 1] / grid_size),
                   floor(xy[, 2] / grid_size),
                   sep = "_")

  # One integer vector of row indices per occupied grid cell
  unname(split(seq_len(nrow(sites_sf)), cell_id))
}

#' Extract Polygons from OSM Response
#'
#' @param osm_data Result from osmdata::osmdata_sf()
#' @return List of standardized sf data frames
#' @noRd
extract_osm_polys <- function(osm_data) {
  # Collect the polygon and multipolygon layers from an osmdata result,
  # standardize each, and return the non-empty ones as a list of sf frames.
  out <- list()

  polys <- osm_data$osm_polygons
  if (!is.null(polys) && nrow(polys) > 0) {
    std <- standardize_osm_sf(polys)
    if (!is.null(std) && nrow(std) > 0) {
      out <- c(out, list(std))
      message("    Found ", nrow(std), " polygons")
    }
  }

  mpolys <- osm_data$osm_multipolygons
  if (!is.null(mpolys) && nrow(mpolys) > 0) {
    std <- standardize_osm_sf(mpolys)
    if (!is.null(std) && nrow(std) > 0) {
      out <- c(out, list(std))
      message("    Found ", nrow(std), " multipolygons")
    }
  }

  out
}

#' Query OSM by Name(s) Using Regex Matching
#'
#' Accepts a single name or vector of names, builds regex OR pattern,
#' and queries Overpass for matching water bodies. Uses substring matching
#' so "Crooked" matches "Crooked Lake".
#'
#' @param bbox Named numeric vector with left, bottom, right, top
#' @param names Character vector of lake names
#' @param overpass_servers Character vector of Overpass API server URLs
#' @param max_attempts Maximum retry attempts
#' @return osmdata result or NULL
#' @noRd
query_osm_by_name <- function(bbox, names, overpass_servers, max_attempts = 3) {
  # Escape regex special characters in each name, then join with | so a
  # single Overpass query matches any of the requested lake names.
  escape_regex <- function(x) {
    gsub("([.|()\\\\+*?\\[\\]^${}])", "\\\\\\1", x, perl = TRUE)
  }
  escaped <- vapply(names, escape_regex, character(1), USE.NAMES = FALSE)
  name_pattern <- paste(escaped, collapse = "|")

  # Retry loop: rotate round-robin through the server list, one server per
  # attempt. A successful query returns from the enclosing function directly
  # out of the tryCatch() expression.
  for (attempt in seq_len(max_attempts)) {
    server <- overpass_servers[((attempt - 1) %% length(overpass_servers)) + 1]

    tryCatch({
      osmdata::set_overpass_url(server)
      osm_query <- osmdata::opq(bbox = bbox, timeout = 120)
      osm_query <- osmdata::add_osm_feature(osm_query,
                                             key = "natural", value = "water")
      # value_exact = FALSE requests regex (substring) matching on the name
      # tag, so "Crooked" also matches "Crooked Lake".
      osm_query <- osmdata::add_osm_feature(osm_query,
                                             key = "name", value = name_pattern,
                                             value_exact = FALSE)
      result <- osmdata::osmdata_sf(osm_query)
      # Restore the default Overpass server before returning the result
      osmdata::set_overpass_url(overpass_servers[1])
      return(result)
    }, error = function(e) {
      msg <- conditionMessage(e)
      if (attempt < max_attempts) {
        # Back off only for server-side / timeout errors; other errors are
        # reported and retried immediately on the next server.
        if (grepl("500|502|503|504|timeout|Timeout", msg, ignore.case = TRUE)) {
          wait_time <- attempt * 5
          message("    Server error, trying another server in ", wait_time, "s...")
          Sys.sleep(wait_time)
        } else {
          message("    Error: ", msg)
        }
      } else {
        message("    Failed after ", max_attempts, " attempts: ", msg)
      }
      NULL
    })
  }
  # All attempts failed: best-effort reset of the Overpass URL, then NULL to
  # signal "no result" to the caller.
  tryCatch(osmdata::set_overpass_url(overpass_servers[1]), error = function(e) NULL)
  return(NULL)
}

#' Download Water Bodies from OSM for a Single Bounding Box
#'
#' Queries the Overpass API for water bodies within a bounding box. Optionally
#' tries a name-filtered query first to reduce data volume.
#'
#' @param bbox_vec Named numeric vector with left, bottom, right, top
#' @param lake_names Optional character vector of lake names to try first
#' @param overpass_servers Character vector of Overpass API server URLs
#' @param name_only If TRUE, skip broad natural=water fallback when name query
#'   fails. Used in many-cluster mode to reduce API load.
#' @return List of sf data frames with water body polygons
#' @noRd
download_lake_osm_single <- function(bbox_vec, lake_names = NULL,
                                     overpass_servers = NULL,
                                     name_only = FALSE) {
  # Default server pool: main server plus mirrors (main is often overloaded)
  if (is.null(overpass_servers)) {
    overpass_servers <- c(
      "https://overpass-api.de/api/interpreter",
      "https://overpass.kumi.systems/api/interpreter",
      "https://maps.mail.ru/osm/tools/overpass/api/interpreter"
    )
  }

  # Helper to query OSM with retries across multiple servers. Mirrors the
  # retry structure of query_osm_by_name() but for a single key=value tag.
  # Returns the osmdata result, or NULL after max_attempts failures.
  query_osm_robust <- function(bbox, key, value, max_attempts = 3) {
    for (attempt in seq_len(max_attempts)) {
      # Round-robin server selection, one server per attempt
      server <- overpass_servers[((attempt - 1) %% length(overpass_servers)) + 1]

      tryCatch({
        osmdata::set_overpass_url(server)
        osm_query <- osmdata::opq(bbox = bbox, timeout = 90)
        osm_query <- osmdata::add_osm_feature(osm_query, key = key, value = value)
        result <- osmdata::osmdata_sf(osm_query)
        # Restore default server before returning from the enclosing helper
        osmdata::set_overpass_url(overpass_servers[1])
        return(result)
      }, error = function(e) {
        msg <- conditionMessage(e)
        if (attempt < max_attempts) {
          # Back off only on server-side / timeout errors
          if (grepl("500|502|503|504|timeout|Timeout", msg, ignore.case = TRUE)) {
            wait_time <- attempt * 3
            message("    Server error, trying another server in ", wait_time, "s...")
            Sys.sleep(wait_time)
          } else {
            message("    Error: ", msg)
          }
        } else {
          message("    Failed after ", max_attempts, " attempts: ", msg)
        }
        NULL
      })
    }
    # All attempts failed: best-effort URL reset, NULL signals failure
    tryCatch(osmdata::set_overpass_url(overpass_servers[1]), error = function(e) NULL)
    return(NULL)
  }

  # Accumulates standardized sf frames from each query
  water_list <- list()

  # Try name-filtered queries first if lake names are available (much smaller
  # responses than a broad natural=water query over the same bbox)
  name_query_sufficient <- FALSE
  if (!is.null(lake_names) && length(lake_names) > 0) {
    # Drop NA / blank names before querying
    lake_names <- unique(lake_names[!is.na(lake_names) & nchar(trimws(lake_names)) > 0])
    if (length(lake_names) > 0) {
      message("    Trying name-filtered query for: ",
              paste(lake_names, collapse = ", "))
      # One query per name; any hit marks the name pass as sufficient
      for (lname in lake_names) {
        osm_result <- query_osm_by_name(bbox_vec, lname, overpass_servers)
        if (!is.null(osm_result)) {
          name_polys <- extract_osm_polys(osm_result)
          if (length(name_polys) > 0) {
            water_list <- c(water_list, name_polys)
            name_query_sufficient <- TRUE
          }
        }
      }
      if (name_query_sufficient) {
        message("    Name-filtered query found results, skipping broad query")
      }
    }
  }

  # Fall back to broad queries if name-filtered query didn't find anything
  # Skip broad fallback when name_only=TRUE (many-cluster mode to reduce API load)
  if (!name_query_sufficient && !name_only) {
    message("    Querying natural=water...")
    osm_result <- query_osm_robust(bbox_vec, "natural", "water")
    if (!is.null(osm_result)) {
      water_list <- c(water_list, extract_osm_polys(osm_result))
    }

    message("    Querying water=lake...")
    osm_result <- query_osm_robust(bbox_vec, "water", "lake")
    if (!is.null(osm_result)) {
      water_list <- c(water_list, extract_osm_polys(osm_result))
    }
  } else if (!name_query_sufficient && name_only) {
    message("    No results for name query, skipping broad query (name-only mode)")
  }

  return(water_list)
}

#' Download Lake Boundary from OpenStreetMap
#'
#' Downloads lake boundaries from OpenStreetMap. For geographically spread-out
#' sites, clusters them and queries per-cluster to avoid downloading the entire
#' world's water bodies.
#'
#' @param sites_df Data frame with latitude and longitude columns
#'
#' @return A list with all_lakes, sites, and utm_epsg
#'
#' @noRd
download_lake_osm <- function(sites_df) {

  message("Converting to spatial format...")

  # Disable S2 spherical geometry to avoid topology errors in lat/long
  # operations. Restore the caller's PREVIOUS setting on exit rather than
  # unconditionally forcing TRUE (the caller may have had S2 disabled).
  old_s2 <- sf::sf_use_s2()
  sf::sf_use_s2(FALSE)
  on.exit(sf::sf_use_s2(old_s2), add = TRUE)

  # Convert to sf object (WGS84)
  if (inherits(sites_df, "sf")) {
    sites_sf <- sf::st_transform(sites_df, 4326)
  } else {
    sites_sf <- sf::st_as_sf(sites_df,
                              coords = c("longitude", "latitude"),
                              crs = 4326)
  }

  # Overpass servers to try (main server often overloaded)
  overpass_servers <- c(
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://maps.mail.ru/osm/tools/overpass/api/interpreter"
  )

  # Detect lake name column if available (used for name-filtered queries)
  lake_name_col <- NULL
  for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
    if (col %in% names(sites_sf)) {
      lake_name_col <- col
      break
    }
  }

  # Create bounding box and check site spread (larger of x/y extent, degrees)
  bbox <- sf::st_bbox(sites_sf)
  site_spread <- max(bbox["xmax"] - bbox["xmin"], bbox["ymax"] - bbox["ymin"])

  message("Downloading lake boundaries from OpenStreetMap...")

  water_list <- list()

  if (site_spread <= 0.5) {
    # --- Small spread: single bbox query ---
    # Buffer the bbox by 10% of the spread, clamped to [0.01, 0.05] degrees
    bbox_buffer <- min(0.05, max(0.01, site_spread * 0.1))
    bbox[1] <- bbox[1] - bbox_buffer
    bbox[2] <- bbox[2] - bbox_buffer
    bbox[3] <- bbox[3] + bbox_buffer
    bbox[4] <- bbox[4] + bbox_buffer

    bbox_vec <- c(bbox["xmin"], bbox["ymin"], bbox["xmax"], bbox["ymax"])
    names(bbox_vec) <- c("left", "bottom", "right", "top")

    message("  Bounding box: [", paste(round(bbox_vec, 4), collapse = ", "), "]")

    # Get lake names for name-filtered query
    cluster_lake_names <- NULL
    if (!is.null(lake_name_col)) {
      cluster_lake_names <- unique(sites_sf[[lake_name_col]])
    }

    water_list <- download_lake_osm_single(bbox_vec, cluster_lake_names,
                                           overpass_servers)
  } else {
    # --- Large spread: per-cluster small bbox queries ---
    # Each cluster gets a tiny bbox (~0.1 degree) and a single natural=water
    # query. Small bboxes return quickly with just nearby water bodies.
    clusters <- cluster_sites(sites_sf)
    n_clusters <- length(clusters)

    message("  Site spread is ", round(site_spread, 1),
            " degrees - querying ", n_clusters, " site clusters")

    # Progress bar for interactive sessions
    use_pb <- interactive()
    if (use_pb) {
      pb <- utils::txtProgressBar(min = 0, max = n_clusters, style = 3)
    }
    start_time <- Sys.time()

    # Track failed clusters and rotate servers
    query_count <- 0
    failed_clusters <- integer(0)

    for (ci in seq_along(clusters)) {
      cluster_idx <- clusters[[ci]]
      cluster_sf <- sites_sf[cluster_idx, ]

      # Compute small bbox around cluster sites with 0.05 degree buffer
      cl_bbox <- sf::st_bbox(cluster_sf)
      cl_buffer <- 0.05  # ~5.5 km buffer around site(s)
      cl_bbox[1] <- cl_bbox[1] - cl_buffer
      cl_bbox[2] <- cl_bbox[2] - cl_buffer
      cl_bbox[3] <- cl_bbox[3] + cl_buffer
      cl_bbox[4] <- cl_bbox[4] + cl_buffer

      cl_bbox_vec <- c(cl_bbox["xmin"], cl_bbox["ymin"],
                       cl_bbox["xmax"], cl_bbox["ymax"])
      names(cl_bbox_vec) <- c("left", "bottom", "right", "top")

      # Single natural=water query per cluster (small bbox = fast)
      # Try up to 3 attempts across different servers
      cluster_success <- FALSE
      for (attempt in 1:3) {
        if (cluster_success) break
        # Offset by query_count so successive clusters start on different
        # servers, spreading load across the pool
        server_idx <- ((query_count + attempt - 1) %% length(overpass_servers)) + 1
        tryCatch({
          osmdata::set_overpass_url(overpass_servers[server_idx])
          osm_query <- osmdata::opq(bbox = cl_bbox_vec, timeout = 30)
          osm_query <- osmdata::add_osm_feature(osm_query,
                                                 key = "natural", value = "water")
          osm_result <- osmdata::osmdata_sf(osm_query)
          water_list <- c(water_list, extract_osm_polys(osm_result))
          cluster_success <- TRUE
        }, error = function(e) {
          if (attempt < 3) {
            Sys.sleep(2)
          }
        })
      }
      if (!cluster_success) {
        failed_clusters <- c(failed_clusters, ci)
      }

      query_count <- query_count + 1

      # Update progress bar
      if (use_pb) {
        utils::setTxtProgressBar(pb, ci)
      }

      # Rate limit: 1 second between queries to respect Overpass usage policy
      if (ci < n_clusters) {
        Sys.sleep(1)
      }
    }

    if (use_pb) {
      close(pb)
    }

    elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "mins"))
    message("  Download complete in ", round(elapsed, 1), " minutes")

    if (length(failed_clusters) > 0) {
      message("  WARNING: ", length(failed_clusters),
              " cluster(s) failed after 3 attempts: ",
              paste(failed_clusters, collapse = ", "))
      message("  Some lake boundaries may be missing. ",
              "Try running again or provide a local boundary file.")
    }

    # Reset to default server
    tryCatch(osmdata::set_overpass_url(overpass_servers[1]),
             error = function(e) NULL)
  }

  # Auto-detect UTM zone from the sites' centroid
  site_coords <- if (inherits(sites_df, "sf")) {
    sf::st_coordinates(sf::st_centroid(sf::st_union(sites_df)))
  } else {
    c(mean(sites_df$longitude), mean(sites_df$latitude))
  }
  utm_zone <- floor((site_coords[1] + 180) / 6) + 1
  # Scalar condition: plain if/else, not vectorized ifelse().
  # EPSG 326xx = UTM north, 327xx = UTM south.
  utm_epsg <- if (site_coords[2] >= 0) {
    as.numeric(paste0("326", sprintf("%02d", utm_zone)))
  } else {
    as.numeric(paste0("327", sprintf("%02d", utm_zone)))
  }

  # Check if we found ANY water bodies
  if (length(water_list) == 0) {
    warning("No water bodies found in OpenStreetMap - creating approximate boundary")

    # FALLBACK: Create a buffer around all points, then take the convex hull
    sites_utm_temp <- sf::st_transform(sites_sf, utm_epsg)
    buffer_dist <- 5000  # metres (UTM units)
    buffered <- sf::st_buffer(sites_utm_temp, dist = buffer_dist)
    lake_approx_utm <- sf::st_union(buffered)
    lake_approx_utm <- sf::st_convex_hull(lake_approx_utm)

    # Same columns as the OSM path so downstream code treats it uniformly
    lake_approx_utm <- sf::st_sf(
      name = "Approximate Boundary",
      osm_id = "fallback",
      area_km2 = as.numeric(sf::st_area(lake_approx_utm)) / 1e6,
      geometry = sf::st_geometry(lake_approx_utm)
    )

    return(list(
      all_lakes = lake_approx_utm,
      sites = sites_utm_temp,
      utm_epsg = utm_epsg
    ))
  }

  # Combine all water polygons
  message("  Combining results...")
  all_water <- do.call(rbind, water_list)
  message("  Total water bodies found: ", nrow(all_water))

  message("  Auto-detected UTM Zone: ", utm_zone,
          if (site_coords[2] >= 0) "N" else "S")

  # Transform EVERYTHING to UTM before spatial operations
  message("  Transforming to UTM for analysis...")
  all_water_utm <- sf::st_transform(all_water, utm_epsg)
  sites_utm <- sf::st_transform(sites_sf, utm_epsg)

  # Process each lake polygon - extract largest part if multipolygon
  message("  Processing lake polygons...")
  processed_lakes <- list()

  for (i in seq_len(nrow(all_water_utm))) {
    tryCatch({
      lake_poly <- all_water_utm[i, ]

      # Skip empty geometries
      if (sf::st_is_empty(lake_poly)) next

      geom_type <- sf::st_geometry_type(lake_poly, by_geometry = FALSE)

      # Keep only the largest polygon part of compound geometries
      if (geom_type %in% c("MULTIPOLYGON", "GEOMETRYCOLLECTION")) {
        lake_parts <- sf::st_cast(lake_poly, "POLYGON")
        lake_areas <- sf::st_area(lake_parts)
        lake_poly <- lake_parts[which.max(lake_areas), ]
      }

      # st_is_valid() may return NA; treat NA as needing repair
      validity <- sf::st_is_valid(lake_poly)
      if (is.na(validity) || !all(validity)) {
        lake_poly <- sf::st_make_valid(lake_poly)
      }

      processed_lakes[[length(processed_lakes) + 1]] <- lake_poly
    }, error = function(e) {
      # Skip problematic geometries silently
      NULL
    })
  }

  # Guard: if every geometry failed processing, do.call(rbind, list()) would
  # return NULL and cascade into confusing errors below - fail clearly.
  if (length(processed_lakes) == 0) {
    stop("All downloaded water geometries failed processing; ",
         "try running again or provide a local boundary file.", call. = FALSE)
  }

  all_lakes_utm <- do.call(rbind, processed_lakes)

  # Calculate areas for reference
  all_lakes_utm$area_km2 <- as.numeric(sf::st_area(all_lakes_utm)) / 1e6

  # Filter out tiny water bodies (< 0.0001 km2 = 100 m2)
  min_area_km2 <- 0.0001
  tiny_mask <- all_lakes_utm$area_km2 < min_area_km2
  n_tiny <- sum(tiny_mask)
  if (n_tiny > 0) {
    message("  Filtered ", n_tiny, " water bodies < ", min_area_km2, " km2")
    all_lakes_utm <- all_lakes_utm[!tiny_mask, ]
  }

  # Deduplicate by osm_id (keep largest polygon if duplicates)
  unique_ids <- unique(all_lakes_utm$osm_id)
  if (length(unique_ids) < nrow(all_lakes_utm)) {
    message("  Deduplicating ", nrow(all_lakes_utm) - length(unique_ids),
            " duplicate lake entries...")
    deduped_list <- lapply(unique_ids, function(id) {
      matches <- all_lakes_utm[all_lakes_utm$osm_id == id, ]
      if (nrow(matches) > 1) {
        matches[which.max(matches$area_km2), ]
      } else {
        matches
      }
    })
    all_lakes_utm <- do.call(rbind, deduped_list)
  }

  message("  Processed ", nrow(all_lakes_utm), " unique lake polygons")
  message("  Total area: ", round(sum(all_lakes_utm$area_km2), 2), " km2")

  return(list(
    all_lakes = all_lakes_utm,
    sites = sites_utm,
    utm_epsg = utm_epsg
  ))
}

#' Standardize and Validate OSM sf Data
#'
#' @param osm_sf sf object from osmdata
#' @return Standardized sf object with osm_id, name, geometry
#' @noRd
standardize_osm_sf <- function(osm_sf) {
  # Reduce an osmdata sf layer to the three columns the rest of the pipeline
  # relies on (osm_id, name, geometry), repairing invalid geometries.
  # Returns NULL for NULL/empty input so callers can skip missing layers.
  if (is.null(osm_sf) || nrow(osm_sf) == 0) return(NULL)

  # Make geometries valid first
  osm_sf <- sf::st_make_valid(osm_sf)

  # Find columns case-insensitively ([1] gives NA when no column matches)
  col_names_lower <- tolower(names(osm_sf))
  osm_id_col <- which(col_names_lower == "osm_id")[1]
  name_col <- which(col_names_lower == "name")[1]

  # Extract only essential columns; missing columns become NA
  result <- data.frame(
    osm_id = if (!is.na(osm_id_col)) as.character(osm_sf[[osm_id_col]]) else NA_character_,
    name = if (!is.na(name_col)) as.character(osm_sf[[name_col]]) else NA_character_,
    stringsAsFactors = FALSE
  )
  result <- sf::st_sf(result, geometry = sf::st_geometry(osm_sf))

  # Remove invalid geometries. st_is_valid() can return NA for geometries it
  # cannot assess; treat NA as invalid so the mask is a clean logical vector.
  # (Previously an NA here errored in if() and injected NA rows on subset.)
  valid_mask <- sf::st_is_valid(result)
  valid_mask[is.na(valid_mask)] <- FALSE
  if (!all(valid_mask)) {
    message("      (Removed ", sum(!valid_mask), " invalid geometries)")
    result <- result[valid_mask, ]
  }

  return(result)
}

#' Load Lake from Local File
#'
#' @param sites_df Data frame with latitude and longitude columns
#' @param lake_file_path Path to shapefile or geopackage
#'
#' @return A list with all_lakes, sites, and utm_epsg
#'
#' @noRd
load_lake_file <- function(sites_df, lake_file_path) {

  message("Loading lake boundary from file: ", lake_file_path)

  # Convert sites to sf (WGS84)
  if (inherits(sites_df, "sf")) {
    sites_sf <- sf::st_transform(sites_df, 4326)
  } else {
    sites_sf <- sf::st_as_sf(sites_df,
                              coords = c("longitude", "latitude"),
                              crs = 4326)
  }

  # Load lake shapefile / geopackage
  lake_wgs84 <- sf::st_read(lake_file_path, quiet = TRUE)

  # Compare CRS objects rather than the $input string, which varies with how
  # the CRS was written ("EPSG:4326", "WGS 84", full WKT, ...). Fail clearly
  # when the file carries no CRS at all (transform would error cryptically).
  file_crs <- sf::st_crs(lake_wgs84)
  if (is.na(file_crs)) {
    stop("Lake boundary file has no coordinate reference system: ",
         lake_file_path, call. = FALSE)
  }
  if (file_crs != sf::st_crs(4326)) {
    lake_wgs84 <- sf::st_transform(lake_wgs84, 4326)
  }

  # Auto-detect UTM zone from the lake centroid
  lake_centroid <- sf::st_coordinates(sf::st_centroid(sf::st_union(lake_wgs84)))
  utm_zone <- floor((lake_centroid[1] + 180) / 6) + 1
  # Scalar condition: plain if/else (EPSG 326xx = north, 327xx = south)
  utm_epsg <- if (lake_centroid[2] >= 0) {
    as.numeric(paste0("326", sprintf("%02d", utm_zone)))
  } else {
    as.numeric(paste0("327", sprintf("%02d", utm_zone)))
  }

  # Transform to UTM
  lake_utm <- sf::st_transform(lake_wgs84, utm_epsg)
  sites_utm <- sf::st_transform(sites_sf, utm_epsg)

  # Extract largest polygon if the file holds a compound geometry
  geom_type <- sf::st_geometry_type(lake_utm, by_geometry = FALSE)
  if (any(geom_type %in% c("MULTIPOLYGON", "GEOMETRYCOLLECTION"))) {
    lake_parts <- sf::st_cast(lake_utm, "POLYGON")
    lake_areas <- sf::st_area(lake_parts)
    lake_utm <- lake_parts[which.max(lake_areas), ]
  }

  if (!all(sf::st_is_valid(lake_utm))) {
    lake_utm <- sf::st_make_valid(lake_utm)
  }

  lake_area_km2 <- as.numeric(sf::st_area(lake_utm)) / 1e6
  message("  Lake area: ", round(lake_area_km2, 2), " km2")

  # Add required columns for multi-lake compatibility with the OSM path
  lake_utm$osm_id <- "local_file"
  lake_utm$name <- basename(lake_file_path)
  lake_utm$area_km2 <- lake_area_km2

  return(list(
    all_lakes = lake_utm,
    sites = sites_utm,
    utm_epsg = utm_epsg
  ))
}

#' Assign Sites to Lakes
#'
#' Perform spatial join to assign each site to its containing lake polygon.
#'
#' @param sites_sf sf object with site points
#' @param water_polygons sf object with lake polygons
#' @param tolerance_m Buffer distance for matching sites near lake edges
#'
#' @return sf object with sites and added columns for lake_osm_id, lake_name, lake_area_km2
#'
#' @examples
#' \donttest{
#' csv_path <- system.file("extdata", "sample_sites.csv", package = "lakefetch")
#' sites <- load_sites(csv_path)
#' lake_data <- get_lake_boundary(sites)
#'
#' # Assign sites to their containing lakes
#' sites_assigned <- assign_sites_to_lakes(
#'   lake_data$sites,
#'   lake_data$all_lakes,
#'   tolerance_m = 50
#' )
#'
#' # Check assignments
#' table(sites_assigned$lake_name)
#' }
#'
#' @export
assign_sites_to_lakes <- function(sites_sf, water_polygons, tolerance_m = NULL) {

  # Default tolerance comes from the package option store (get_opt is a
  # package-internal accessor defined elsewhere)
  if (is.null(tolerance_m)) {
    tolerance_m <- get_opt("gps_tolerance_m")
  }

  message("Assigning sites to lakes...")

  n_sites <- nrow(sites_sf)
  sites_sf$lake_osm_id <- NA_character_
  sites_sf$lake_name <- NA_character_
  sites_sf$lake_area_km2 <- NA_real_

  # First pass: direct intersection.
  # st_join() duplicates a site row once per intersecting polygon, so carry a
  # temporary row index through the join to map results back to sites_sf.
  # (Matching back via a "Site" name column fails when site names repeat or
  # the column is absent: the length-0/length->1 if() condition errors.)
  message("  Checking direct intersections...")
  sites_sf$.row_id <- seq_len(n_sites)
  sites_in_lakes <- sf::st_join(sites_sf, water_polygons, join = sf::st_intersects, left = TRUE)

  # Handle sites that matched directly
  direct_matches <- !is.na(sites_in_lakes$osm_id)
  if (any(direct_matches)) {
    for (i in which(direct_matches)) {
      ri <- sites_in_lakes$.row_id[i]
      # Keep only the first matching polygon per site
      if (is.na(sites_sf$lake_osm_id[ri])) {
        sites_sf$lake_osm_id[ri] <- sites_in_lakes$osm_id[i]
        sites_sf$lake_name[ri] <- sites_in_lakes$name[i]
        lake_idx <- which(water_polygons$osm_id == sites_in_lakes$osm_id[i])[1]
        if (!is.na(lake_idx) && "area_km2" %in% names(water_polygons)) {
          sites_sf$lake_area_km2[ri] <- water_polygons$area_km2[lake_idx]
        }
      }
    }
    message("    ", sum(!is.na(sites_sf$lake_osm_id)), " sites matched directly")
  }
  sites_sf$.row_id <- NULL  # drop the temporary key before returning

  # Second pass: buffer check for unmatched sites
  # Only match sites that are close to the SHORELINE of a lake (not just near the lake polygon)
  # This prevents assigning points to the wrong lake when multiple lakes are nearby
  unmatched_idx <- which(is.na(sites_sf$lake_osm_id))

  if (length(unmatched_idx) > 0 && tolerance_m > 0) {
    message("  Checking ", length(unmatched_idx),
            " unmatched sites with ", tolerance_m, "m tolerance...")

    for (i in unmatched_idx) {
      site_geom <- sf::st_geometry(sites_sf)[i]

      # Find distance to the BOUNDARY (shoreline) of each lake, not just the polygon
      # This ensures we only match sites that are genuinely close to a lake edge
      shoreline_distances <- sapply(seq_len(nrow(water_polygons)), function(j) {
        lake_boundary <- tryCatch({
          sf::st_boundary(water_polygons[j, ])
        }, error = function(e) {
          # Fallback cast when st_boundary fails on this geometry
          sf::st_cast(water_polygons[j, ], "MULTILINESTRING")
        })
        as.numeric(sf::st_distance(site_geom, lake_boundary))
      })

      # Find lakes within tolerance distance of their shoreline
      within_tolerance <- which(shoreline_distances <= tolerance_m)

      if (length(within_tolerance) > 0) {
        # Pick the lake with the closest shoreline
        closest_idx <- within_tolerance[which.min(shoreline_distances[within_tolerance])]
        lake_match <- water_polygons[closest_idx, ]

        sites_sf$lake_osm_id[i] <- lake_match$osm_id
        sites_sf$lake_name[i] <- lake_match$name
        if ("area_km2" %in% names(lake_match)) {
          sites_sf$lake_area_km2[i] <- lake_match$area_km2
        }
      }
    }

    matched_after_buffer <- sum(!is.na(sites_sf$lake_osm_id))
    matched_before_buffer <- n_sites - length(unmatched_idx)
    buffer_matches <- matched_after_buffer - matched_before_buffer
    message("    ", buffer_matches, " additional sites matched within tolerance")
  }

  # Third pass: name-based matching for remaining unmatched sites
  # If sites have a lake.name or similar column, try to match by name
  unmatched_idx <- which(is.na(sites_sf$lake_osm_id))
  if (length(unmatched_idx) > 0) {
    # Check if sites have a lake name column we can use
    site_lake_col <- NULL
    for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
      if (col %in% names(sites_sf)) {
        site_lake_col <- col
        break
      }
    }

    if (!is.null(site_lake_col)) {
      message("  Trying name-based matching using '", site_lake_col, "' column...")

      # Get unique lake names from unmatched sites
      unmatched_lake_names <- unique(sites_sf[[site_lake_col]][unmatched_idx])
      unmatched_lake_names <- unmatched_lake_names[!is.na(unmatched_lake_names)]

      # Get OSM lake names (lowercase for comparison)
      osm_names <- tolower(trimws(water_polygons$name))
      osm_names[is.na(osm_names)] <- ""

      for (site_lake_name in unmatched_lake_names) {
        site_lake_lower <- tolower(trimws(site_lake_name))

        # Try exact match first
        exact_match <- which(osm_names == site_lake_lower)

        # If no exact match, try partial/fuzzy matching
        if (length(exact_match) == 0) {
          # Check if OSM name contains site lake name or vice versa
          # (nchar(x) > 3 avoids spurious hits on very short OSM names)
          partial_match <- which(
            grepl(site_lake_lower, osm_names, fixed = TRUE) |
            sapply(osm_names, function(x) grepl(x, site_lake_lower, fixed = TRUE) && nchar(x) > 3)
          )
          if (length(partial_match) > 0) {
            exact_match <- partial_match[1]
          }
        }

        if (length(exact_match) > 0) {
          # Found a matching lake - assign all unmatched sites with this lake name,
          # but only if they are within a relaxed (5x) distance of the polygon
          lake_match <- water_polygons[exact_match[1], ]
          site_indices <- unmatched_idx[sites_sf[[site_lake_col]][unmatched_idx] == site_lake_name]
          name_match_tolerance <- tolerance_m * 5

          matched_count <- 0
          skipped_sites <- character(0)
          for (idx in site_indices) {
            site_dist <- as.numeric(sf::st_distance(sf::st_geometry(sites_sf)[idx], lake_match))
            if (site_dist <= name_match_tolerance) {
              sites_sf$lake_osm_id[idx] <- lake_match$osm_id
              sites_sf$lake_name[idx] <- lake_match$name
              if ("area_km2" %in% names(lake_match)) {
                sites_sf$lake_area_km2[idx] <- lake_match$area_km2
              }
              matched_count <- matched_count + 1
            } else {
              skipped_sites <- c(skipped_sites, sites_sf$Site[idx])
            }
          }
          if (matched_count > 0) {
            message("    Matched ", matched_count, " sites to '", lake_match$name,
                    "' via name matching from '", site_lake_name, "'")
          }
          if (length(skipped_sites) > 0) {
            warning("Skipped ", length(skipped_sites), " site(s) named '", site_lake_name,
                    "': too far from OSM lake '", lake_match$name,
                    "' (>", name_match_tolerance, "m). ",
                    "Site(s) may be on a different water body. ",
                    "Sites: ", paste(skipped_sites, collapse = ", "))
          }
        } else {
          # Report unmatched lake name
          n_unmatched_this_lake <- sum(sites_sf[[site_lake_col]][unmatched_idx] == site_lake_name)
          message("    WARNING: No OSM lake found matching '", site_lake_name,
                  "' (", n_unmatched_this_lake, " sites)")
        }
      }
    }
  }

  # Summary
  matched <- sum(!is.na(sites_sf$lake_osm_id))
  unmatched <- sum(is.na(sites_sf$lake_osm_id))

  message("  Site assignment summary:")
  message("    Matched: ", matched, "/", n_sites)

  if (unmatched > 0) {
    message("    Unmatched: ", unmatched)

    # Provide diagnostic information for unmatched sites
    unmatched_sites <- sites_sf[is.na(sites_sf$lake_osm_id), ]
    unmatched_coords <- sf::st_coordinates(unmatched_sites)

    # Check if unmatched sites have a lake name column
    lake_col <- NULL
    for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
      if (col %in% names(unmatched_sites)) {
        lake_col <- col
        break
      }
    }

    if (!is.null(lake_col)) {
      unmatched_lakes <- unique(unmatched_sites[[lake_col]])
      message("    Unmatched sites claim to be in: ", paste(unmatched_lakes, collapse = ", "))
    }

    # Show coordinate range of unmatched sites
    message("    Unmatched coordinate range:")
    message("      Lat: ", round(min(unmatched_coords[, 2]), 4), " to ",
            round(max(unmatched_coords[, 2]), 4))
    message("      Lon: ", round(min(unmatched_coords[, 1]), 4), " to ",
            round(max(unmatched_coords[, 1]), 4))

    # Suggest increasing tolerance or checking OSM
    message("    TIP: Try lakefetch_options(gps_tolerance_m = 100) for larger buffer")
    message("    TIP: Check if the lake exists in OpenStreetMap at openstreetmap.org")
  }

  # Clean up lake names - look up from water_polygons if name is NA
  for (i in which(is.na(sites_sf$lake_name) & !is.na(sites_sf$lake_osm_id))) {
    lake_id <- sites_sf$lake_osm_id[i]
    lake_area <- sites_sf$lake_area_km2[i]

    # First try: Find any polygon with this osm_id that has a name
    matching_polys <- water_polygons[water_polygons$osm_id == lake_id, ]
    if (nrow(matching_polys) > 0) {
      poly_names <- matching_polys$name[!is.na(matching_polys$name) & matching_polys$name != ""]
      if (length(poly_names) > 0) {
        sites_sf$lake_name[i] <- poly_names[1]
        next
      }
    }

    # Second try: Find another lake with very similar area (likely duplicate polygon)
    # This handles cases where OSM has the same lake with different IDs
    if (!is.na(lake_area) && lake_area > 0) {
      area_tolerance <- 0.01  # 1% tolerance
      similar_area_idx <- which(
        abs(water_polygons$area_km2 - lake_area) / lake_area < area_tolerance &
        !is.na(water_polygons$name) &
        water_polygons$name != ""
      )
      if (length(similar_area_idx) > 0) {
        sites_sf$lake_name[i] <- water_polygons$name[similar_area_idx[1]]
      }
    }
  }

  # Final fallback for any still-missing names
  sites_sf$lake_name <- ifelse(
    is.na(sites_sf$lake_name) & !is.na(sites_sf$lake_osm_id),
    paste0("Unknown Lake (ID: ", sites_sf$lake_osm_id, ")"),
    sites_sf$lake_name
  )

  # Show lakes with sites
  if (matched > 0) {
    lake_counts <- table(sites_sf$lake_osm_id, useNA = "no")
    message("  Sites per lake:")
    for (lake_id in names(lake_counts)) {
      lake_nm <- sites_sf$lake_name[sites_sf$lake_osm_id == lake_id][1]
      if (is.na(lake_nm)) lake_nm <- paste0("ID: ", lake_id)
      message("    ", lake_nm, ": ", lake_counts[lake_id], " sites")
    }
  }

  return(sites_sf)
}

# Note: the lines below are website footer text from the package's HTML
# documentation page, not R code; kept as a comment so the file parses.
# "Try the lakefetch package in your browser. Any scripts or data that you
#  put into this service are public. lakefetch documentation built on
#  March 20, 2026, 5:10 p.m."