Nothing
# ==============================================================================
# Lake Boundary Acquisition Functions
# ==============================================================================
#' Get Lake Boundary
#'
#' Get lake boundary polygon(s) either from OpenStreetMap or from a local file.
#'
#' @param sites A data.frame with latitude and longitude columns, or an sf object.
#' @param file Optional file path to a shapefile or geopackage with lake boundaries.
#'
#' @return A list with elements:
#' \item{all_lakes}{sf object with lake polygons in UTM projection}
#' \item{sites}{sf object with site points in UTM projection}
#' \item{utm_epsg}{EPSG code for the UTM projection used}
#'
#' @details
#' If \code{file} is provided, the lake boundary is loaded from the file.
#' Otherwise, the function downloads lake boundaries from OpenStreetMap
#' based on the bounding box of the provided sites.
#'
#' @examples
#' \donttest{
#' csv_path <- system.file("extdata", "sample_sites.csv", package = "lakefetch")
#' sites <- load_sites(csv_path)
#' lake_data <- get_lake_boundary(sites)
#' }
#'
#' @export
get_lake_boundary <- function(sites, file = NULL) {
if (!is.null(file)) {
return(load_lake_file(sites, file))
} else {
return(download_lake_osm(sites))
}
}
#' Group Sites into Spatial Clusters
#'
#' Groups sites by proximity using a grid-based approach. Sites within 0.1
#' degrees (~11 km) of each other are placed in the same cluster.
#'
#' @param sites_sf sf object with site points in WGS84
#' @return List of integer vectors, each containing row indices for one cluster
#' @noRd
cluster_sites <- function(sites_sf) {
coords <- sf::st_coordinates(sites_sf)
grid_size <- 0.1 # ~11 km
# Round coordinates to nearest grid cell
grid_x <- floor(coords[, 1] / grid_size)
grid_y <- floor(coords[, 2] / grid_size)
grid_key <- paste(grid_x, grid_y, sep = "_")
# Group indices by grid cell
clusters <- split(seq_len(nrow(sites_sf)), grid_key)
return(unname(clusters))
}
#' Extract Polygons from OSM Response
#'
#' @param osm_data Result from osmdata::osmdata_sf()
#' @return List of standardized sf data frames
#' @noRd
extract_osm_polys <- function(osm_data) {
result <- list()
if (!is.null(osm_data$osm_polygons) && nrow(osm_data$osm_polygons) > 0) {
std_poly <- standardize_osm_sf(osm_data$osm_polygons)
if (!is.null(std_poly) && nrow(std_poly) > 0) {
result[[length(result) + 1]] <- std_poly
message(" Found ", nrow(std_poly), " polygons")
}
}
if (!is.null(osm_data$osm_multipolygons) && nrow(osm_data$osm_multipolygons) > 0) {
std_mpoly <- standardize_osm_sf(osm_data$osm_multipolygons)
if (!is.null(std_mpoly) && nrow(std_mpoly) > 0) {
result[[length(result) + 1]] <- std_mpoly
message(" Found ", nrow(std_mpoly), " multipolygons")
}
}
return(result)
}
#' Query OSM by Name(s) Using Regex Matching
#'
#' Accepts a single name or vector of names, builds regex OR pattern,
#' and queries Overpass for matching water bodies. Uses substring matching
#' so "Crooked" matches "Crooked Lake".
#'
#' @param bbox Named numeric vector with left, bottom, right, top
#' @param names Character vector of lake names
#' @param overpass_servers Character vector of Overpass API server URLs
#' @param max_attempts Maximum retry attempts
#' @return osmdata result or NULL
#' @noRd
query_osm_by_name <- function(bbox, names, overpass_servers, max_attempts = 3) {
# Escape regex special characters in each name, then join with |
escape_regex <- function(x) {
gsub("([.|()\\\\+*?\\[\\]^${}])", "\\\\\\1", x, perl = TRUE)
}
escaped <- vapply(names, escape_regex, character(1), USE.NAMES = FALSE)
name_pattern <- paste(escaped, collapse = "|")
for (attempt in seq_len(max_attempts)) {
server <- overpass_servers[((attempt - 1) %% length(overpass_servers)) + 1]
tryCatch({
osmdata::set_overpass_url(server)
osm_query <- osmdata::opq(bbox = bbox, timeout = 120)
osm_query <- osmdata::add_osm_feature(osm_query,
key = "natural", value = "water")
osm_query <- osmdata::add_osm_feature(osm_query,
key = "name", value = name_pattern,
value_exact = FALSE)
result <- osmdata::osmdata_sf(osm_query)
osmdata::set_overpass_url(overpass_servers[1])
return(result)
}, error = function(e) {
msg <- conditionMessage(e)
if (attempt < max_attempts) {
if (grepl("500|502|503|504|timeout|Timeout", msg, ignore.case = TRUE)) {
wait_time <- attempt * 5
message(" Server error, trying another server in ", wait_time, "s...")
Sys.sleep(wait_time)
} else {
message(" Error: ", msg)
}
} else {
message(" Failed after ", max_attempts, " attempts: ", msg)
}
NULL
})
}
tryCatch(osmdata::set_overpass_url(overpass_servers[1]), error = function(e) NULL)
return(NULL)
}
#' Download Water Bodies from OSM for a Single Bounding Box
#'
#' Queries the Overpass API for water bodies within a bounding box. Optionally
#' tries a name-filtered query first to reduce data volume.
#'
#' @param bbox_vec Named numeric vector with left, bottom, right, top
#' @param lake_names Optional character vector of lake names to try first
#' @param overpass_servers Character vector of Overpass API server URLs
#' @param name_only If TRUE, skip broad natural=water fallback when name query
#' fails. Used in many-cluster mode to reduce API load.
#' @return List of sf data frames with water body polygons
#' @noRd
download_lake_osm_single <- function(bbox_vec, lake_names = NULL,
overpass_servers = NULL,
name_only = FALSE) {
if (is.null(overpass_servers)) {
overpass_servers <- c(
"https://overpass-api.de/api/interpreter",
"https://overpass.kumi.systems/api/interpreter",
"https://maps.mail.ru/osm/tools/overpass/api/interpreter"
)
}
# Helper to query OSM with retries across multiple servers
query_osm_robust <- function(bbox, key, value, max_attempts = 3) {
for (attempt in seq_len(max_attempts)) {
server <- overpass_servers[((attempt - 1) %% length(overpass_servers)) + 1]
tryCatch({
osmdata::set_overpass_url(server)
osm_query <- osmdata::opq(bbox = bbox, timeout = 90)
osm_query <- osmdata::add_osm_feature(osm_query, key = key, value = value)
result <- osmdata::osmdata_sf(osm_query)
osmdata::set_overpass_url(overpass_servers[1])
return(result)
}, error = function(e) {
msg <- conditionMessage(e)
if (attempt < max_attempts) {
if (grepl("500|502|503|504|timeout|Timeout", msg, ignore.case = TRUE)) {
wait_time <- attempt * 3
message(" Server error, trying another server in ", wait_time, "s...")
Sys.sleep(wait_time)
} else {
message(" Error: ", msg)
}
} else {
message(" Failed after ", max_attempts, " attempts: ", msg)
}
NULL
})
}
tryCatch(osmdata::set_overpass_url(overpass_servers[1]), error = function(e) NULL)
return(NULL)
}
water_list <- list()
# Try name-filtered queries first if lake names are available
name_query_sufficient <- FALSE
if (!is.null(lake_names) && length(lake_names) > 0) {
lake_names <- unique(lake_names[!is.na(lake_names) & nchar(trimws(lake_names)) > 0])
if (length(lake_names) > 0) {
message(" Trying name-filtered query for: ",
paste(lake_names, collapse = ", "))
for (lname in lake_names) {
osm_result <- query_osm_by_name(bbox_vec, lname, overpass_servers)
if (!is.null(osm_result)) {
name_polys <- extract_osm_polys(osm_result)
if (length(name_polys) > 0) {
water_list <- c(water_list, name_polys)
name_query_sufficient <- TRUE
}
}
}
if (name_query_sufficient) {
message(" Name-filtered query found results, skipping broad query")
}
}
}
# Fall back to broad queries if name-filtered query didn't find anything
# Skip broad fallback when name_only=TRUE (many-cluster mode to reduce API load)
if (!name_query_sufficient && !name_only) {
message(" Querying natural=water...")
osm_result <- query_osm_robust(bbox_vec, "natural", "water")
if (!is.null(osm_result)) {
water_list <- c(water_list, extract_osm_polys(osm_result))
}
message(" Querying water=lake...")
osm_result <- query_osm_robust(bbox_vec, "water", "lake")
if (!is.null(osm_result)) {
water_list <- c(water_list, extract_osm_polys(osm_result))
}
} else if (!name_query_sufficient && name_only) {
message(" No results for name query, skipping broad query (name-only mode)")
}
return(water_list)
}
#' Download Lake Boundary from OpenStreetMap
#'
#' Downloads lake boundaries from OpenStreetMap. For geographically spread-out
#' sites, clusters them and queries per-cluster to avoid downloading the entire
#' world's water bodies.
#'
#' @param sites_df Data frame with latitude and longitude columns
#'
#' @return A list with all_lakes, sites, and utm_epsg
#'
#' @noRd
download_lake_osm <- function(sites_df) {
message("Converting to spatial format...")
# Disable S2 spherical geometry to avoid topology errors
sf::sf_use_s2(FALSE)
on.exit(sf::sf_use_s2(TRUE), add = TRUE)
# Convert to sf object (WGS84)
if (inherits(sites_df, "sf")) {
sites_sf <- sf::st_transform(sites_df, 4326)
} else {
sites_sf <- sf::st_as_sf(sites_df,
coords = c("longitude", "latitude"),
crs = 4326)
}
# Overpass servers to try (main server often overloaded)
overpass_servers <- c(
"https://overpass-api.de/api/interpreter",
"https://overpass.kumi.systems/api/interpreter",
"https://maps.mail.ru/osm/tools/overpass/api/interpreter"
)
# Detect lake name column if available
lake_name_col <- NULL
for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
if (col %in% names(sites_sf)) {
lake_name_col <- col
break
}
}
# Create bounding box and check site spread
bbox <- sf::st_bbox(sites_sf)
site_spread <- max(bbox["xmax"] - bbox["xmin"], bbox["ymax"] - bbox["ymin"])
message("Downloading lake boundaries from OpenStreetMap...")
water_list <- list()
if (site_spread <= 0.5) {
# --- Small spread: single bbox query (current behavior) ---
bbox_buffer <- min(0.05, max(0.01, site_spread * 0.1))
bbox[1] <- bbox[1] - bbox_buffer
bbox[2] <- bbox[2] - bbox_buffer
bbox[3] <- bbox[3] + bbox_buffer
bbox[4] <- bbox[4] + bbox_buffer
bbox_vec <- c(bbox["xmin"], bbox["ymin"], bbox["xmax"], bbox["ymax"])
names(bbox_vec) <- c("left", "bottom", "right", "top")
message(" Bounding box: [", paste(round(bbox_vec, 4), collapse = ", "), "]")
# Get lake names for name-filtered query
cluster_lake_names <- NULL
if (!is.null(lake_name_col)) {
cluster_lake_names <- unique(sites_sf[[lake_name_col]])
}
water_list <- download_lake_osm_single(bbox_vec, cluster_lake_names,
overpass_servers)
} else {
# --- Large spread: per-cluster small bbox queries ---
# Each cluster gets a tiny bbox (~0.1 degree) and a single natural=water
# query. Small bboxes return quickly with just nearby water bodies.
clusters <- cluster_sites(sites_sf)
n_clusters <- length(clusters)
message(" Site spread is ", round(site_spread, 1),
" degrees - querying ", n_clusters, " site clusters")
# Progress bar for interactive sessions
use_pb <- interactive()
if (use_pb) {
pb <- utils::txtProgressBar(min = 0, max = n_clusters, style = 3)
}
start_time <- Sys.time()
# Track failed clusters and rotate servers
query_count <- 0
failed_clusters <- integer(0)
for (ci in seq_along(clusters)) {
cluster_idx <- clusters[[ci]]
cluster_sf <- sites_sf[cluster_idx, ]
# Compute small bbox around cluster sites with 0.05 degree buffer
cl_bbox <- sf::st_bbox(cluster_sf)
cl_buffer <- 0.05 # ~5.5 km buffer around site(s)
cl_bbox[1] <- cl_bbox[1] - cl_buffer
cl_bbox[2] <- cl_bbox[2] - cl_buffer
cl_bbox[3] <- cl_bbox[3] + cl_buffer
cl_bbox[4] <- cl_bbox[4] + cl_buffer
cl_bbox_vec <- c(cl_bbox["xmin"], cl_bbox["ymin"],
cl_bbox["xmax"], cl_bbox["ymax"])
names(cl_bbox_vec) <- c("left", "bottom", "right", "top")
# Single natural=water query per cluster (small bbox = fast)
# Try up to 3 attempts across different servers
cluster_success <- FALSE
for (attempt in 1:3) {
if (cluster_success) break
server_idx <- ((query_count + attempt - 1) %% length(overpass_servers)) + 1
tryCatch({
osmdata::set_overpass_url(overpass_servers[server_idx])
osm_query <- osmdata::opq(bbox = cl_bbox_vec, timeout = 30)
osm_query <- osmdata::add_osm_feature(osm_query,
key = "natural", value = "water")
osm_result <- osmdata::osmdata_sf(osm_query)
water_list <- c(water_list, extract_osm_polys(osm_result))
cluster_success <- TRUE
}, error = function(e) {
if (attempt < 3) {
Sys.sleep(2)
}
})
}
if (!cluster_success) {
failed_clusters <- c(failed_clusters, ci)
}
query_count <- query_count + 1
# Update progress bar
if (use_pb) {
utils::setTxtProgressBar(pb, ci)
}
# Rate limit: 1 second between queries to respect Overpass usage policy
if (ci < n_clusters) {
Sys.sleep(1)
}
}
if (use_pb) {
close(pb)
}
elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "mins"))
message(" Download complete in ", round(elapsed, 1), " minutes")
if (length(failed_clusters) > 0) {
message(" WARNING: ", length(failed_clusters),
" cluster(s) failed after 3 attempts: ",
paste(failed_clusters, collapse = ", "))
message(" Some lake boundaries may be missing. ",
"Try running again or provide a local boundary file.")
}
# Reset to default server
tryCatch(osmdata::set_overpass_url(overpass_servers[1]),
error = function(e) NULL)
}
# Auto-detect UTM zone from sites
site_coords <- if (inherits(sites_df, "sf")) {
sf::st_coordinates(sf::st_centroid(sf::st_union(sites_df)))
} else {
c(mean(sites_df$longitude), mean(sites_df$latitude))
}
utm_zone <- floor((site_coords[1] + 180) / 6) + 1
utm_epsg <- ifelse(site_coords[2] >= 0,
as.numeric(paste0("326", sprintf("%02d", utm_zone))),
as.numeric(paste0("327", sprintf("%02d", utm_zone))))
# Check if we found ANY water bodies
if (length(water_list) == 0) {
warning("No water bodies found in OpenStreetMap - creating approximate boundary")
# FALLBACK: Create a buffer around all points
sites_utm_temp <- sf::st_transform(sites_sf, utm_epsg)
buffer_dist <- 5000
buffered <- sf::st_buffer(sites_utm_temp, dist = buffer_dist)
lake_approx_utm <- sf::st_union(buffered)
lake_approx_utm <- sf::st_convex_hull(lake_approx_utm)
lake_approx_utm <- sf::st_sf(
name = "Approximate Boundary",
osm_id = "fallback",
area_km2 = as.numeric(sf::st_area(lake_approx_utm)) / 1e6,
geometry = sf::st_geometry(lake_approx_utm)
)
return(list(
all_lakes = lake_approx_utm,
sites = sites_utm_temp,
utm_epsg = utm_epsg
))
}
# Combine all water polygons
message(" Combining results...")
all_water <- do.call(rbind, water_list)
message(" Total water bodies found: ", nrow(all_water))
message(" Auto-detected UTM Zone: ", utm_zone,
ifelse(site_coords[2] >= 0, "N", "S"))
# Transform EVERYTHING to UTM before spatial operations
message(" Transforming to UTM for analysis...")
all_water_utm <- sf::st_transform(all_water, utm_epsg)
sites_utm <- sf::st_transform(sites_sf, utm_epsg)
# Process each lake polygon - extract largest part if multipolygon
message(" Processing lake polygons...")
processed_lakes <- list()
for (i in seq_len(nrow(all_water_utm))) {
tryCatch({
lake_poly <- all_water_utm[i, ]
# Skip empty geometries
if (sf::st_is_empty(lake_poly)) next
geom_type <- sf::st_geometry_type(lake_poly, by_geometry = FALSE)
if (geom_type %in% c("MULTIPOLYGON", "GEOMETRYCOLLECTION")) {
lake_parts <- sf::st_cast(lake_poly, "POLYGON")
lake_areas <- sf::st_area(lake_parts)
lake_poly <- lake_parts[which.max(lake_areas), ]
}
validity <- sf::st_is_valid(lake_poly)
if (is.na(validity) || !all(validity)) {
lake_poly <- sf::st_make_valid(lake_poly)
}
processed_lakes[[length(processed_lakes) + 1]] <- lake_poly
}, error = function(e) {
# Skip problematic geometries silently
NULL
})
}
all_lakes_utm <- do.call(rbind, processed_lakes)
# Calculate areas for reference
all_lakes_utm$area_km2 <- as.numeric(sf::st_area(all_lakes_utm)) / 1e6
# Filter out tiny water bodies (< 0.0001 km2 = 100 m2)
min_area_km2 <- 0.0001
tiny_mask <- all_lakes_utm$area_km2 < min_area_km2
n_tiny <- sum(tiny_mask)
if (n_tiny > 0) {
message(" Filtered ", n_tiny, " water bodies < ", min_area_km2, " km2")
all_lakes_utm <- all_lakes_utm[!tiny_mask, ]
}
# Deduplicate by osm_id (keep largest polygon if duplicates)
unique_ids <- unique(all_lakes_utm$osm_id)
if (length(unique_ids) < nrow(all_lakes_utm)) {
message(" Deduplicating ", nrow(all_lakes_utm) - length(unique_ids),
" duplicate lake entries...")
deduped_list <- lapply(unique_ids, function(id) {
matches <- all_lakes_utm[all_lakes_utm$osm_id == id, ]
if (nrow(matches) > 1) {
matches[which.max(matches$area_km2), ]
} else {
matches
}
})
all_lakes_utm <- do.call(rbind, deduped_list)
}
message(" Processed ", nrow(all_lakes_utm), " unique lake polygons")
message(" Total area: ", round(sum(all_lakes_utm$area_km2), 2), " km2")
return(list(
all_lakes = all_lakes_utm,
sites = sites_utm,
utm_epsg = utm_epsg
))
}
#' Standardize and Validate OSM sf Data
#'
#' @param osm_sf sf object from osmdata
#' @return Standardized sf object with osm_id, name, geometry
#' @noRd
standardize_osm_sf <- function(osm_sf) {
if (is.null(osm_sf) || nrow(osm_sf) == 0) return(NULL)
# Make geometries valid first
osm_sf <- sf::st_make_valid(osm_sf)
# Find columns case-insensitively
col_names_lower <- tolower(names(osm_sf))
osm_id_col <- which(col_names_lower == "osm_id")[1]
name_col <- which(col_names_lower == "name")[1]
# Extract only essential columns
result <- data.frame(
osm_id = if (!is.na(osm_id_col)) as.character(osm_sf[[osm_id_col]]) else NA_character_,
name = if (!is.na(name_col)) as.character(osm_sf[[name_col]]) else NA_character_,
stringsAsFactors = FALSE
)
result <- sf::st_sf(result, geometry = sf::st_geometry(osm_sf))
# Remove any invalid geometries
valid_mask <- sf::st_is_valid(result)
if (!all(valid_mask)) {
message(" (Removed ", sum(!valid_mask), " invalid geometries)")
result <- result[valid_mask, ]
}
return(result)
}
#' Load Lake from Local File
#'
#' @param sites_df Data frame with latitude and longitude columns
#' @param lake_file_path Path to shapefile or geopackage
#'
#' @return A list with all_lakes, sites, and utm_epsg
#'
#' @noRd
load_lake_file <- function(sites_df, lake_file_path) {
message("Loading lake boundary from file: ", lake_file_path)
# Convert sites to sf
if (inherits(sites_df, "sf")) {
sites_sf <- sf::st_transform(sites_df, 4326)
} else {
sites_sf <- sf::st_as_sf(sites_df,
coords = c("longitude", "latitude"),
crs = 4326)
}
# Load lake shapefile
lake_wgs84 <- sf::st_read(lake_file_path, quiet = TRUE)
if (!sf::st_crs(lake_wgs84)$input %in% c("EPSG:4326", "WGS 84")) {
lake_wgs84 <- sf::st_transform(lake_wgs84, 4326)
}
# Auto-detect UTM zone
lake_centroid <- sf::st_coordinates(sf::st_centroid(sf::st_union(lake_wgs84)))
utm_zone <- floor((lake_centroid[1] + 180) / 6) + 1
utm_epsg <- ifelse(lake_centroid[2] >= 0,
as.numeric(paste0("326", sprintf("%02d", utm_zone))),
as.numeric(paste0("327", sprintf("%02d", utm_zone))))
# Transform to UTM
lake_utm <- sf::st_transform(lake_wgs84, utm_epsg)
sites_utm <- sf::st_transform(sites_sf, utm_epsg)
# Extract largest polygon if needed
geom_type <- sf::st_geometry_type(lake_utm, by_geometry = FALSE)
if (any(geom_type %in% c("MULTIPOLYGON", "GEOMETRYCOLLECTION"))) {
lake_parts <- sf::st_cast(lake_utm, "POLYGON")
lake_areas <- sf::st_area(lake_parts)
lake_utm <- lake_parts[which.max(lake_areas), ]
}
if (!all(sf::st_is_valid(lake_utm))) {
lake_utm <- sf::st_make_valid(lake_utm)
}
lake_area_km2 <- as.numeric(sf::st_area(lake_utm)) / 1e6
message(" Lake area: ", round(lake_area_km2, 2), " km2")
# Add required columns for multi-lake compatibility
lake_utm$osm_id <- "local_file"
lake_utm$name <- basename(lake_file_path)
lake_utm$area_km2 <- lake_area_km2
return(list(
all_lakes = lake_utm,
sites = sites_utm,
utm_epsg = utm_epsg
))
}
#' Assign Sites to Lakes
#'
#' Perform spatial join to assign each site to its containing lake polygon.
#'
#' @param sites_sf sf object with site points
#' @param water_polygons sf object with lake polygons
#' @param tolerance_m Buffer distance for matching sites near lake edges
#'
#' @return sf object with sites and added columns for lake_osm_id, lake_name, lake_area_km2
#'
#' @examples
#' \donttest{
#' csv_path <- system.file("extdata", "sample_sites.csv", package = "lakefetch")
#' sites <- load_sites(csv_path)
#' lake_data <- get_lake_boundary(sites)
#'
#' # Assign sites to their containing lakes
#' sites_assigned <- assign_sites_to_lakes(
#' lake_data$sites,
#' lake_data$all_lakes,
#' tolerance_m = 50
#' )
#'
#' # Check assignments
#' table(sites_assigned$lake_name)
#' }
#'
#' @export
assign_sites_to_lakes <- function(sites_sf, water_polygons, tolerance_m = NULL) {
if (is.null(tolerance_m)) {
tolerance_m <- get_opt("gps_tolerance_m")
}
message("Assigning sites to lakes...")
n_sites <- nrow(sites_sf)
sites_sf$lake_osm_id <- NA_character_
sites_sf$lake_name <- NA_character_
sites_sf$lake_area_km2 <- NA_real_
# First pass: direct intersection
message(" Checking direct intersections...")
sites_in_lakes <- sf::st_join(sites_sf, water_polygons, join = sf::st_intersects, left = TRUE)
# Handle sites that matched directly
direct_matches <- !is.na(sites_in_lakes$osm_id)
if (any(direct_matches)) {
for (i in which(direct_matches)) {
site_name <- sites_in_lakes$Site[i]
if (is.na(sites_sf$lake_osm_id[sites_sf$Site == site_name])) {
sites_sf$lake_osm_id[sites_sf$Site == site_name] <- sites_in_lakes$osm_id[i]
sites_sf$lake_name[sites_sf$Site == site_name] <- sites_in_lakes$name[i]
lake_idx <- which(water_polygons$osm_id == sites_in_lakes$osm_id[i])[1]
if (!is.na(lake_idx) && "area_km2" %in% names(water_polygons)) {
sites_sf$lake_area_km2[sites_sf$Site == site_name] <- water_polygons$area_km2[lake_idx]
}
}
}
message(" ", sum(!is.na(sites_sf$lake_osm_id)), " sites matched directly")
}
# Second pass: buffer check for unmatched sites
# Only match sites that are close to the SHORELINE of a lake (not just near the lake polygon)
# This prevents assigning points to the wrong lake when multiple lakes are nearby
unmatched_idx <- which(is.na(sites_sf$lake_osm_id))
if (length(unmatched_idx) > 0 && tolerance_m > 0) {
message(" Checking ", length(unmatched_idx),
" unmatched sites with ", tolerance_m, "m tolerance...")
for (i in unmatched_idx) {
site_geom <- sf::st_geometry(sites_sf)[i]
# Find distance to the BOUNDARY (shoreline) of each lake, not just the polygon
# This ensures we only match sites that are genuinely close to a lake edge
shoreline_distances <- sapply(seq_len(nrow(water_polygons)), function(j) {
lake_boundary <- tryCatch({
sf::st_boundary(water_polygons[j, ])
}, error = function(e) {
sf::st_cast(water_polygons[j, ], "MULTILINESTRING")
})
as.numeric(sf::st_distance(site_geom, lake_boundary))
})
# Find lakes within tolerance distance of their shoreline
within_tolerance <- which(shoreline_distances <= tolerance_m)
if (length(within_tolerance) > 0) {
# Pick the lake with the closest shoreline
closest_idx <- within_tolerance[which.min(shoreline_distances[within_tolerance])]
lake_match <- water_polygons[closest_idx, ]
sites_sf$lake_osm_id[i] <- lake_match$osm_id
sites_sf$lake_name[i] <- lake_match$name
if ("area_km2" %in% names(lake_match)) {
sites_sf$lake_area_km2[i] <- lake_match$area_km2
}
}
}
matched_after_buffer <- sum(!is.na(sites_sf$lake_osm_id))
matched_before_buffer <- n_sites - length(unmatched_idx)
buffer_matches <- matched_after_buffer - matched_before_buffer
message(" ", buffer_matches, " additional sites matched within tolerance")
}
# Third pass: name-based matching for remaining unmatched sites
# If sites have a lake.name or similar column, try to match by name
unmatched_idx <- which(is.na(sites_sf$lake_osm_id))
if (length(unmatched_idx) > 0) {
# Check if sites have a lake name column we can use
site_lake_col <- NULL
for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
if (col %in% names(sites_sf)) {
site_lake_col <- col
break
}
}
if (!is.null(site_lake_col)) {
message(" Trying name-based matching using '", site_lake_col, "' column...")
# Get unique lake names from unmatched sites
unmatched_lake_names <- unique(sites_sf[[site_lake_col]][unmatched_idx])
unmatched_lake_names <- unmatched_lake_names[!is.na(unmatched_lake_names)]
# Get OSM lake names (lowercase for comparison)
osm_names <- tolower(trimws(water_polygons$name))
osm_names[is.na(osm_names)] <- ""
for (site_lake_name in unmatched_lake_names) {
site_lake_lower <- tolower(trimws(site_lake_name))
# Try exact match first
exact_match <- which(osm_names == site_lake_lower)
# If no exact match, try partial/fuzzy matching
if (length(exact_match) == 0) {
# Check if OSM name contains site lake name or vice versa
partial_match <- which(
grepl(site_lake_lower, osm_names, fixed = TRUE) |
sapply(osm_names, function(x) grepl(x, site_lake_lower, fixed = TRUE) && nchar(x) > 3)
)
if (length(partial_match) > 0) {
exact_match <- partial_match[1]
}
}
if (length(exact_match) > 0) {
# Found a matching lake - assign all unmatched sites with this lake name
lake_match <- water_polygons[exact_match[1], ]
site_indices <- unmatched_idx[sites_sf[[site_lake_col]][unmatched_idx] == site_lake_name]
name_match_tolerance <- tolerance_m * 5
matched_count <- 0
skipped_sites <- character(0)
for (idx in site_indices) {
site_dist <- as.numeric(sf::st_distance(sf::st_geometry(sites_sf)[idx], lake_match))
if (site_dist <= name_match_tolerance) {
sites_sf$lake_osm_id[idx] <- lake_match$osm_id
sites_sf$lake_name[idx] <- lake_match$name
if ("area_km2" %in% names(lake_match)) {
sites_sf$lake_area_km2[idx] <- lake_match$area_km2
}
matched_count <- matched_count + 1
} else {
skipped_sites <- c(skipped_sites, sites_sf$Site[idx])
}
}
if (matched_count > 0) {
message(" Matched ", matched_count, " sites to '", lake_match$name,
"' via name matching from '", site_lake_name, "'")
}
if (length(skipped_sites) > 0) {
warning("Skipped ", length(skipped_sites), " site(s) named '", site_lake_name,
"': too far from OSM lake '", lake_match$name,
"' (>", name_match_tolerance, "m). ",
"Site(s) may be on a different water body. ",
"Sites: ", paste(skipped_sites, collapse = ", "))
}
} else {
# Report unmatched lake name
n_unmatched_this_lake <- sum(sites_sf[[site_lake_col]][unmatched_idx] == site_lake_name)
message(" WARNING: No OSM lake found matching '", site_lake_name,
"' (", n_unmatched_this_lake, " sites)")
}
}
}
}
# Summary
matched <- sum(!is.na(sites_sf$lake_osm_id))
unmatched <- sum(is.na(sites_sf$lake_osm_id))
message(" Site assignment summary:")
message(" Matched: ", matched, "/", n_sites)
if (unmatched > 0) {
message(" Unmatched: ", unmatched)
# Provide diagnostic information for unmatched sites
unmatched_sites <- sites_sf[is.na(sites_sf$lake_osm_id), ]
unmatched_coords <- sf::st_coordinates(unmatched_sites)
# Check if unmatched sites have a lake name column
lake_col <- NULL
for (col in c("lake.name", "lake_name", "lakename", "Lake", "lake", "waterbody")) {
if (col %in% names(unmatched_sites)) {
lake_col <- col
break
}
}
if (!is.null(lake_col)) {
unmatched_lakes <- unique(unmatched_sites[[lake_col]])
message(" Unmatched sites claim to be in: ", paste(unmatched_lakes, collapse = ", "))
}
# Show coordinate range of unmatched sites
message(" Unmatched coordinate range:")
message(" Lat: ", round(min(unmatched_coords[, 2]), 4), " to ",
round(max(unmatched_coords[, 2]), 4))
message(" Lon: ", round(min(unmatched_coords[, 1]), 4), " to ",
round(max(unmatched_coords[, 1]), 4))
# Suggest increasing tolerance or checking OSM
message(" TIP: Try lakefetch_options(gps_tolerance_m = 100) for larger buffer")
message(" TIP: Check if the lake exists in OpenStreetMap at openstreetmap.org")
}
# Clean up lake names - look up from water_polygons if name is NA
for (i in which(is.na(sites_sf$lake_name) & !is.na(sites_sf$lake_osm_id))) {
lake_id <- sites_sf$lake_osm_id[i]
lake_area <- sites_sf$lake_area_km2[i]
# First try: Find any polygon with this osm_id that has a name
matching_polys <- water_polygons[water_polygons$osm_id == lake_id, ]
if (nrow(matching_polys) > 0) {
poly_names <- matching_polys$name[!is.na(matching_polys$name) & matching_polys$name != ""]
if (length(poly_names) > 0) {
sites_sf$lake_name[i] <- poly_names[1]
next
}
}
# Second try: Find another lake with very similar area (likely duplicate polygon)
# This handles cases where OSM has the same lake with different IDs
if (!is.na(lake_area) && lake_area > 0) {
area_tolerance <- 0.01 # 1% tolerance
similar_area_idx <- which(
abs(water_polygons$area_km2 - lake_area) / lake_area < area_tolerance &
!is.na(water_polygons$name) &
water_polygons$name != ""
)
if (length(similar_area_idx) > 0) {
sites_sf$lake_name[i] <- water_polygons$name[similar_area_idx[1]]
}
}
}
# Final fallback for any still-missing names
sites_sf$lake_name <- ifelse(
is.na(sites_sf$lake_name) & !is.na(sites_sf$lake_osm_id),
paste0("Unknown Lake (ID: ", sites_sf$lake_osm_id, ")"),
sites_sf$lake_name
)
# Show lakes with sites
if (matched > 0) {
lake_counts <- table(sites_sf$lake_osm_id, useNA = "no")
message(" Sites per lake:")
for (lake_id in names(lake_counts)) {
lake_nm <- sites_sf$lake_name[sites_sf$lake_osm_id == lake_id][1]
if (is.na(lake_nm)) lake_nm <- paste0("ID: ", lake_id)
message(" ", lake_nm, ": ", lake_counts[lake_id], " sites")
}
}
return(sites_sf)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.