# Chunk defaults for this vignette: collapsed output marked with "#>",
# warnings/messages suppressed, and a consistent default figure size.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  fig.width = 8,
  fig.height = 6
)
Spatial data integration is essential for combining information from different sources and scales. The GeoSpatialSuite package provides the universal_spatial_join() function that handles all major spatial data combinations with automatic method detection and robust error handling.
By the end of this vignette, you will be able to:
# Attach the packages used throughout this vignette.
library(geospatialsuite)
library(terra) # SpatRaster handling
library(sf)    # vector (sf) handling

# Confirm the core GeoSpatialSuite functions are available in this session.
test_function_availability()
The universal_spatial_join() function automatically detects data types and selects the appropriate spatial operation:
# Simulated field sites: 20 random points with an id and a crop type.
n_sites <- 20
field_sites <- data.frame(
  site_id = paste0("Site_", seq_len(n_sites)),
  lon = runif(n_sites, -83.5, -83.0),
  lat = runif(n_sites, 40.2, 40.7),
  crop_type = sample(c("corn", "soybeans", "wheat"), n_sites, replace = TRUE)
)

# Simulated satellite imagery: a 50 x 50 NDVI raster on the same extent.
satellite_raster <- rast(
  nrows = 50, ncols = 50,
  xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
)
# Use ncell() instead of a hard-coded 2500 so changing the raster
# dimensions cannot silently break this assignment.
values(satellite_raster) <- runif(ncell(satellite_raster), 0.2, 0.9)
names(satellite_raster) <- "ndvi"

# Extract raster values at the point locations; method = "auto" detects
# the points-on-raster case and selects "extract".
extracted_results <- universal_spatial_join(
  source_data = field_sites,
  target_data = satellite_raster,
  method = "auto",
  verbose = TRUE
)

# Result keeps the original columns plus the extracted NDVI column.
head(extracted_results)
print(names(extracted_results))
# Repeat the extraction, this time averaging cells within ~100 m of each point.
buffered_extraction <- universal_spatial_join(
  source_data = field_sites,
  target_data = satellite_raster,
  method = "extract",
  buffer_distance = 0.001, # roughly 100 m expressed in degrees
  summary_function = "mean",
  verbose = TRUE
)

# Side-by-side comparison of point versus buffered values per site.
comparison_data <- data.frame(
  site_id = extracted_results$site_id,
  point_extraction = extracted_results$extracted_ndvi,
  buffer_extraction = buffered_extraction$extracted_ndvi
)
comparison_data$difference <- abs(
  comparison_data$point_extraction - comparison_data$buffer_extraction
)
print(summary(comparison_data$difference))
# Five synthetic management zones, each described by a centre point and type.
management_zones <- data.frame(
  zone_id = 1:5,
  x_center = runif(5, -83.4, -83.1),
  y_center = runif(5, 40.3, 40.6),
  zone_type = sample(c("irrigated", "dryland"), 5, replace = TRUE)
)

# Buffer the centres (~2 km) so the points become polygon zones.
zones_sf <- st_as_sf(
  management_zones,
  coords = c("x_center", "y_center"),
  crs = 4326
)
zones_polygons <- st_buffer(zones_sf, dist = 0.02)

# Zonal statistics: note the argument order — raster first, polygons second.
zonal_results <- universal_spatial_join(
  source_data = satellite_raster,
  target_data = zones_polygons,
  method = "zonal",
  summary_function = "mean",
  verbose = TRUE
)
head(zonal_results)
# Run the zonal join once per summary statistic and collect the new column.
summary_functions <- c("mean", "median", "max", "min", "sd")
zonal_multi_stats <- list()

for (func in summary_functions) {
  result <- universal_spatial_join(
    source_data = satellite_raster,
    target_data = zones_polygons,
    method = "zonal",
    summary_function = func,
    verbose = FALSE
  )
  # The statistic column is whichever one the polygons did not already have.
  stat_col <- setdiff(names(result), names(zones_polygons))
  zonal_multi_stats[[func]] <- result[[stat_col[1]]]
}

# One row per zone, one column per statistic.
zone_summary <- data.frame(
  zone_id = zones_polygons$zone_id,
  zone_type = zones_polygons$zone_type,
  mean_ndvi = zonal_multi_stats$mean,
  median_ndvi = zonal_multi_stats$median,
  max_ndvi = zonal_multi_stats$max,
  min_ndvi = zonal_multi_stats$min,
  sd_ndvi = zonal_multi_stats$sd
)
print(zone_summary)
# Two rasters on the same extent at different resolutions.
high_res_raster <- rast(
  nrows = 100, ncols = 100,
  xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
)
# ncell() replaces the hard-coded cell counts (10000 / 400) so the
# dimensions above can be edited without breaking the value assignment.
values(high_res_raster) <- runif(ncell(high_res_raster), 0.3, 0.8)
names(high_res_raster) <- "detailed_ndvi"

low_res_raster <- rast(
  nrows = 20, ncols = 20,
  xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
)
values(low_res_raster) <- runif(ncell(low_res_raster), 0.1, 0.6)
names(low_res_raster) <- "coarse_data"

# Resample the fine raster onto the coarse grid, averaging contributing cells.
resampled_result <- universal_spatial_join(
  source_data = high_res_raster,
  target_data = low_res_raster,
  method = "resample",
  summary_function = "mean",
  verbose = TRUE
)

cat("Original resolution:", res(high_res_raster), "\n")
cat("Resampled resolution:", res(resampled_result), "\n")
# Aggregate to a coarser grid: scale_factor > 1 means fewer, larger cells.
aggregated_raster <- universal_spatial_join(
  source_data = high_res_raster,
  target_data = NULL, # scaling by factor needs no target raster
  method = "resample",
  scale_factor = 5,
  summary_function = "mean",
  verbose = TRUE
)

# Disaggregate to a finer grid: scale_factor < 1 means smaller cells.
disaggregated_raster <- universal_spatial_join(
  source_data = low_res_raster,
  target_data = NULL,
  method = "resample",
  scale_factor = 0.5, # 2x finer resolution
  verbose = TRUE
)

cat("Original low res:", res(low_res_raster), "\n")
cat("Disaggregated res:", res(disaggregated_raster), "\n")
# Three simplified "counties": centre points buffered into large circles.
counties <- data.frame(
  county = c("Franklin", "Delaware", "Union"),
  x_center = c(-83.0, -83.1, -83.3),
  y_center = c(40.0, 40.4, 40.2)
)
counties_sf <- st_as_sf(
  counties,
  coords = c("x_center", "y_center"),
  crs = 4326
)
counties_polygons <- st_buffer(counties_sf, dist = 0.15) # county-scale areas

# Point-in-polygon overlay: attach county attributes to each field site.
sites_with_counties <- universal_spatial_join(
  source_data = field_sites,
  target_data = counties_polygons,
  method = "overlay",
  verbose = TRUE
)
head(sites_with_counties)
# Eight synthetic weather stations with elevation and temperature attributes.
weather_stations <- data.frame(
  station_id = paste0("WX_", 1:8),
  longitude = runif(8, -83.6, -82.9),
  latitude = runif(8, 40.1, 40.8),
  elevation = runif(8, 200, 400),
  avg_temp = runif(8, 10, 15)
)

# Attach each field site to its nearest station; the join computes the
# spatial relationships (including distances) internally.
nearest_results <- universal_spatial_join(
  source_data = field_sites,
  target_data = weather_stations,
  method = "nearest",
  verbose = TRUE
)
head(nearest_results)
# Helper: a 30 x 30 raster on the study extent filled with uniform noise.
# Replaces three copies of the same rast() call and the magic count 900
# (now derived from ncell()).
make_env_raster <- function(min_val, max_val) {
  r <- rast(
    nrows = 30, ncols = 30,
    xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
  )
  values(r) <- runif(ncell(r), min_val, max_val)
  r
}

# Three environmental layers with plausible value ranges.
raster_datasets <- list(
  elevation = make_env_raster(200, 400),
  temperature = make_env_raster(8, 18),
  precipitation = make_env_raster(800, 1200)
)

# Extract each layer at the field sites, accumulating one column per layer.
environmental_data <- field_sites
for (var_name in names(raster_datasets)) {
  extraction_result <- universal_spatial_join(
    source_data = environmental_data,
    target_data = raster_datasets[[var_name]],
    method = "extract",
    verbose = FALSE
  )
  # The extracted column is the one not already present on the data.
  new_col <- setdiff(names(extraction_result), names(environmental_data))
  environmental_data[[var_name]] <- extraction_result[[new_col[1]]]
}
head(environmental_data)
# Synthetic digital elevation model (DEM) on the study extent.
dem_raster <- rast(
  nrows = 60, ncols = 60,
  xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
)
# ncell() replaces the hard-coded 3600 so the grid size can change safely.
values(dem_raster) <- runif(ncell(dem_raster), 200, 500)
names(dem_raster) <- "elevation"

# Derive terrain variables from the DEM and attach them to the field sites.
terrain_results <- integrate_terrain_analysis(
  vector_data = field_sites,
  elevation_raster = dem_raster,
  terrain_vars = c("slope", "aspect", "TRI", "TPI"),
  extraction_method = "extract"
)
head(terrain_results)
# Ten points expressed in UTM Zone 17N coordinates (metres).
utm_points <- data.frame(
  id = 1:10,
  x_utm = runif(10, 300000, 350000),
  y_utm = runif(10, 4450000, 4480000)
)
utm_sf <- st_as_sf(utm_points, coords = c("x_utm", "y_utm"), crs = 32617)

# The raster stays in geographic WGS84 (EPSG:4326).
wgs84_raster <- satellite_raster

# The join reprojects automatically when source and target CRS differ.
utm_extraction <- universal_spatial_join(
  source_data = utm_sf,
  target_data = wgs84_raster,
  method = "extract",
  verbose = TRUE # prints the CRS conversion messages
)
head(utm_extraction)
# Request the joined result in a specific CRS (UTM Zone 17N).
projected_result <- universal_spatial_join(
  source_data = field_sites,
  target_data = satellite_raster,
  method = "extract",
  crs_target = 32617,
  verbose = TRUE
)

# Verify the coordinate reference system of the output.
st_crs(projected_result)
# Copy the NDVI raster and blank out 500 random cells to simulate data gaps.
sparse_raster <- satellite_raster
# sample(n, k) already draws from 1..n — no need to materialise 1:ncell().
values(sparse_raster)[sample(ncell(sparse_raster), 500)] <- NA
names(sparse_raster) <- "sparse_data"

# Extract under three different missing-data strategies.
strategies <- c("remove", "nearest", "zero")
na_handling_results <- list()
for (strategy in strategies) {
  result <- universal_spatial_join(
    source_data = field_sites,
    target_data = sparse_raster,
    method = "extract",
    na_strategy = strategy,
    verbose = FALSE
  )
  extracted_col <- names(result)[grepl("extracted", names(result))]
  na_handling_results[[strategy]] <- result[[extracted_col[1]]]
}

# One column of extracted values per strategy.
na_comparison <- data.frame(
  site_id = field_sites$site_id,
  remove_na = na_handling_results$remove,
  nearest_fill = na_handling_results$nearest,
  zero_fill = na_handling_results$zero
)

# Count remaining NAs per strategy. vapply (not sapply) pins the return
# type, so an empty input cannot silently change the result's shape.
vapply(na_comparison[-1], function(x) sum(is.na(x)), integer(1))
# Build the NDVI raster at four aggregation levels (1 = original).
scales <- c(1, 2, 4, 8)
multi_scale_data <- list()
base_raster <- satellite_raster

# Loop variable renamed to `sc` so it does not shadow base::scale().
for (sc in scales) {
  label <- paste0("scale_", sc)
  if (sc == 1) {
    multi_scale_data[[label]] <- base_raster
  } else {
    multi_scale_data[[label]] <- universal_spatial_join(
      source_data = base_raster,
      target_data = NULL,
      method = "resample",
      scale_factor = sc,
      summary_function = "mean"
    )
  }
}

# Extract the field-site values at each scale level in turn.
multi_scale_extraction <- field_sites
for (scale_name in names(multi_scale_data)) {
  result <- universal_spatial_join(
    source_data = multi_scale_extraction,
    target_data = multi_scale_data[[scale_name]],
    method = "extract",
    verbose = FALSE
  )
  new_col <- names(result)[!names(result) %in% names(multi_scale_extraction)]
  multi_scale_extraction[[scale_name]] <- result[[new_col[1]]]
}

# Keep the site id plus one column per scale for inspection.
scale_columns <- names(multi_scale_extraction)[
  grepl("scale_", names(multi_scale_extraction))
]
scale_analysis <- multi_scale_extraction[c("site_id", scale_columns)]
head(scale_analysis)
# Eight monitoring points with two soil attributes.
sparse_monitoring <- data.frame(
  monitor_id = 1:8,
  longitude = runif(8, -83.4, -83.1),
  latitude = runif(8, 40.3, 40.6),
  soil_ph = runif(8, 6.0, 7.5),
  organic_matter = runif(8, 2, 6)
)

# Knock out a few observations to create gaps worth filling.
sparse_monitoring$soil_ph[c(2, 5)] <- NA
sparse_monitoring$organic_matter[c(3, 7)] <- NA

# Fill the gaps by nearest-neighbour spatial interpolation.
interpolated_data <- spatial_interpolation_comprehensive(
  spatial_data = sparse_monitoring,
  target_variables = c("soil_ph", "organic_matter"),
  method = "NN",
  verbose = TRUE
)

# Original versus gap-filled values, side by side.
comparison <- data.frame(
  original_ph = sparse_monitoring$soil_ph,
  interpolated_ph = interpolated_data$soil_ph,
  original_om = sparse_monitoring$organic_matter,
  interpolated_om = interpolated_data$organic_matter
)
print(comparison)
# A 5,000-point dataset to exercise chunked processing.
large_dataset <- data.frame(
  point_id = seq_len(5000),
  x = runif(5000, -83.5, -83.0),
  y = runif(5000, 40.2, 40.7),
  measurement = runif(5000, 0, 100)
)

# chunk_size caps how many points are extracted per pass, bounding memory use.
chunked_extraction <- universal_spatial_join(
  source_data = large_dataset,
  target_data = satellite_raster,
  method = "extract",
  chunk_size = 1000,
  verbose = TRUE
)

cat("Processed", nrow(chunked_extraction), "points successfully\n")
# Build three 200 x 200 single-band rasters with random values.
# The list is preallocated rather than grown inside the loop, and ncell()
# replaces the hard-coded 40000.
raster_list <- vector("list", 3)
for (i in seq_len(3)) {
  r <- rast(nrows = 200, ncols = 200, xmin = -84, xmax = -82, ymin = 40, ymax = 42)
  values(r) <- runif(ncell(r), 0, 1)
  names(r) <- paste0("band_", i)
  raster_list[[i]] <- r
}

# Cell-by-cell addition of the first two bands, ignoring NA cells.
combined_raster <- raster_to_raster_ops(
  raster1 = raster_list[[1]],
  raster2 = raster_list[[2]],
  operation = "add",
  handle_na = "ignore",
  verbose = TRUE
)

# Mean composite of the two bands via the resample pathway.
mean_composite <- universal_spatial_join(
  source_data = raster_list[[1]],
  target_data = raster_list[[2]],
  method = "resample",
  summary_function = "mean",
  verbose = TRUE
)
# Fifteen farm fields with a crop type and a staggered planting date.
farm_fields <- data.frame(
  field_id = paste0("Field_", LETTERS[1:15]),
  longitude = runif(15, -83.4, -83.1),
  latitude = runif(15, 40.3, 40.6),
  crop_type = sample(c("corn", "soybeans"), 15, replace = TRUE),
  planting_date = as.Date("2023-05-01") + sample(0:20, 15, replace = TRUE)
)

# Buffer the field centres (~800 m) into polygon field boundaries.
farm_sf <- st_as_sf(farm_fields, coords = c("longitude", "latitude"), crs = 4326)
field_polygons <- st_buffer(farm_sf, dist = 0.008)

# Environmental layers: reuse NDVI and DEM, add a soil-moisture raster.
environmental_rasters <- list(
  ndvi = satellite_raster,
  elevation = dem_raster,
  soil_moisture = rast(
    nrows = 40, ncols = 40,
    xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
  )
)
# ncell() replaces the hard-coded 1600 cell count.
values(environmental_rasters$soil_moisture) <-
  runif(ncell(environmental_rasters$soil_moisture), 0.1, 0.4)

# Per-field average of each environmental layer via zonal statistics.
field_analysis <- farm_fields
for (env_var in names(environmental_rasters)) {
  zonal_result <- universal_spatial_join(
    source_data = environmental_rasters[[env_var]],
    target_data = field_polygons,
    method = "zonal",
    summary_function = "mean",
    verbose = FALSE
  )
  # Pull out the zonal statistic column added by the join.
  stat_col <- names(zonal_result)[grepl("zonal", names(zonal_result))]
  field_analysis[[paste0("avg_", env_var)]] <- zonal_result[[stat_col[1]]]
}
head(field_analysis)
# Six watersheds described by outlet location and drainage area.
watersheds <- data.frame(
  watershed_id = paste0("WS_", 1:6),
  outlet_lon = runif(6, -83.4, -83.1),
  outlet_lat = runif(6, 40.3, 40.6),
  area_km2 = runif(6, 10, 100)
)
watersheds_sf <- st_as_sf(
  watersheds,
  coords = c("outlet_lon", "outlet_lat"),
  crs = 4326
)

# Buffer radius scales with sqrt(area), so buffer area tracks nominal area.
watersheds_polygons <- st_buffer(
  watersheds_sf,
  dist = sqrt(watersheds$area_km2) * 0.002
)

# Zonal summaries from several rasters, each with its own statistic.
watershed_stats <- watersheds
raster_sources <- list(
  mean_elevation = dem_raster,
  mean_ndvi = satellite_raster,
  vegetation_variability = satellite_raster # summarised with "sd" below
)
summary_functions <- list(
  mean_elevation = "mean",
  mean_ndvi = "mean",
  vegetation_variability = "sd"
)

for (var_name in names(raster_sources)) {
  result <- universal_spatial_join(
    source_data = raster_sources[[var_name]],
    target_data = watersheds_polygons,
    method = "zonal",
    summary_function = summary_functions[[var_name]],
    verbose = FALSE
  )
  stat_col <- names(result)[grepl("zonal", names(result))]
  watershed_stats[[var_name]] <- result[[stat_col[1]]]
}
print(watershed_stats)
# A raster whose grid and extent deliberately differ from the field sites.
mismatched_raster <- rast(
  nrows = 25, ncols = 30,     # different aspect ratio
  xmin = -83.6, xmax = -82.9, # slightly different extent
  ymin = 40.1, ymax = 40.8
)
# ncell() replaces the hard-coded 750 cell count.
values(mismatched_raster) <- runif(ncell(mismatched_raster), 0, 1)

# The join aligns the grids internally before extracting.
robust_extraction <- universal_spatial_join(
  source_data = field_sites,
  target_data = mismatched_raster,
  method = "extract",
  verbose = TRUE # reports the alignment steps
)

# Points far outside the raster: extraction succeeds and yields NA values.
empty_region <- data.frame(
  x = c(-90, -89),
  y = c(35, 36)
)
empty_sf <- st_as_sf(empty_region, coords = c("x", "y"), crs = 4326)

empty_result <- universal_spatial_join(
  source_data = empty_sf,
  target_data = satellite_raster,
  method = "extract",
  verbose = TRUE
)
print(empty_result)
# Time the extraction for increasing point counts.
data_sizes <- c(10, 50, 100, 500)
performance_results <- data.frame(
  n_points = data_sizes,
  processing_time = numeric(length(data_sizes))
)

for (i in seq_along(data_sizes)) {
  n <- data_sizes[i]
  test_points <- data.frame(
    id = seq_len(n),
    lon = runif(n, -83.4, -83.1),
    lat = runif(n, 40.3, 40.6)
  )

  start_time <- Sys.time()
  result <- universal_spatial_join(
    source_data = test_points,
    target_data = satellite_raster,
    method = "extract",
    verbose = FALSE
  )
  end_time <- Sys.time()

  # Fix: force seconds explicitly. as.numeric(end - start) uses whatever
  # units difftime auto-selects (seconds, minutes, ...), so rows in this
  # table could silently be on different scales.
  performance_results$processing_time[i] <-
    as.numeric(difftime(end_time, start_time, units = "secs"))
}
print(performance_results)
# Derive several vegetation indices from simulated red and NIR bands.
vegetation_stack <- calculate_multiple_indices(
  red = satellite_raster * 0.7, # simulated red band
  nir = satellite_raster,
  indices = c("NDVI", "DVI", "RVI"),
  output_stack = TRUE
)

# Mean of every index within each management-zone polygon.
vegetation_by_zone <- universal_spatial_join(
  source_data = zones_polygons,
  target_data = vegetation_stack,
  method = "extract",
  buffer_distance = 0, # polygons need no extra buffer
  summary_function = "mean",
  verbose = TRUE
)

# Average the extracted indices by zone type (irrigated vs dryland).
zone_veg_summary <- aggregate(
  vegetation_by_zone[grepl("extracted", names(vegetation_by_zone))],
  by = list(zone_type = vegetation_by_zone$zone_type),
  FUN = mean,
  na.rm = TRUE
)
print(zone_veg_summary)
# Water indices from simulated green, NIR, and SWIR1 bands.
water_index_stack <- calculate_multiple_water_indices(
  green = satellite_raster * 0.8, # simulated green band
  nir = satellite_raster,
  swir1 = satellite_raster * 0.5, # simulated SWIR1 band
  indices = c("NDWI", "MNDWI", "NDMI"),
  output_stack = TRUE
)

# Twelve water-quality monitoring sites with field measurements.
water_sites <- data.frame(
  site_id = paste0("WQ_", 1:12),
  longitude = runif(12, -83.4, -83.1),
  latitude = runif(12, 40.3, 40.6),
  measured_turbidity = runif(12, 5, 25),
  measured_chlorophyll = runif(12, 8, 35)
)

# Mean index value within ~200 m of each monitoring site.
remote_vs_field <- universal_spatial_join(
  source_data = water_sites,
  target_data = water_index_stack,
  method = "extract",
  buffer_distance = 0.002, # ~200 m in degrees
  summary_function = "mean",
  verbose = TRUE
)

# Correlate field measurements against the remotely sensed indices.
correlations <- cor(
  remote_vs_field[c("measured_turbidity", "measured_chlorophyll")],
  remote_vs_field[grepl("extracted", names(remote_vs_field))],
  use = "complete.obs"
)
print(correlations)
# Build a pyramid of the NDVI raster at four aggregation scales.
multi_scale_analysis <- multiscale_operations(
  spatial_data = satellite_raster,
  target_scales = c(1, 2, 4, 8),
  operation = "mean",
  pyramid = TRUE
)

# Extract the field-site values at every pyramid level.
scale_comparison <- field_sites
for (scale_name in names(multi_scale_analysis)) {
  result <- universal_spatial_join(
    source_data = scale_comparison,
    target_data = multi_scale_analysis[[scale_name]],
    method = "extract",
    verbose = FALSE
  )
  new_col <- names(result)[!names(result) %in% names(scale_comparison)]
  scale_comparison[[scale_name]] <- result[[new_col[1]]]
}

# One column per scale level, alongside the site id.
scale_vars <- names(scale_comparison)[grepl("scale_", names(scale_comparison))]
scale_effects <- scale_comparison[c("site_id", scale_vars)]
head(scale_effects)
# Two aligned rasters for arithmetic: reuse NDVI plus a second random layer.
raster_a <- satellite_raster
raster_b <- rast(
  nrows = 50, ncols = 50,
  xmin = -83.5, xmax = -83.0, ymin = 40.2, ymax = 40.7
)
# ncell() replaces the hard-coded 2500 cell count.
values(raster_b) <- runif(ncell(raster_b), 0.1, 0.7)

# a + b, resampling to align grids if needed.
addition_result <- raster_to_raster_ops(
  raster1 = raster_a,
  raster2 = raster_b,
  operation = "add",
  align_method = "resample",
  verbose = TRUE
)

# a - b, ignoring NA cells.
difference_result <- raster_to_raster_ops(
  raster1 = raster_a,
  raster2 = raster_b,
  operation = "subtract",
  handle_na = "ignore",
  verbose = TRUE
)

# a / b.
ratio_result <- raster_to_raster_ops(
  raster1 = raster_a,
  raster2 = raster_b,
  operation = "ratio",
  verbose = TRUE
)

# Sample all three result layers at the field sites in one extraction.
math_results <- universal_spatial_join(
  source_data = field_sites,
  target_data = c(addition_result, difference_result, ratio_result),
  method = "extract",
  verbose = TRUE
)
# Point map of the field sites coloured by extracted elevation.
integrated_map <- create_spatial_map(
  spatial_data = environmental_data,
  fill_variable = "elevation",
  color_scheme = "terrain",
  title = "Field Sites with Environmental Data",
  point_size = 5
)

# Polygon map of the management zones coloured by their zonal NDVI mean.
zonal_map <- create_spatial_map(
  spatial_data = zonal_results,
  fill_variable = names(zonal_results)[grepl("zonal", names(zonal_results))][1],
  map_type = "polygons",
  color_scheme = "viridis",
  title = "Management Zones - Mean NDVI"
)
# Rebuild the point-vs-buffer comparison, including the absolute difference.
comparison_data <- data.frame(
  site_id = field_sites$site_id,
  point_extraction = extracted_results$extracted_ndvi,
  buffer_extraction = buffered_extraction$extracted_ndvi,
  difference = abs(
    extracted_results$extracted_ndvi - buffered_extraction$extracted_ndvi
  )
)

# Scatter the two extraction methods against each other.
plot(
  comparison_data$point_extraction,
  comparison_data$buffer_extraction,
  xlab = "Point Extraction",
  ylab = "Buffer Extraction",
  main = "Point vs Buffer Extraction Comparison"
)
abline(0, 1, col = "red", lty = 2) # 1:1 reference line
#' Validate a spatial object before it enters a workflow.
#'
#' For sf objects: repairs invalid geometries with st_make_valid(), after
#' warning how many were found. For SpatRaster objects: warns when fewer
#' than 10% of cells contain data. Returns the (possibly repaired) object.
validate_spatial_data <- function(data) {
  if (inherits(data, "sf")) {
    bad <- !st_is_valid(data)
    if (any(bad)) {
      warning(paste("Found", sum(bad), "invalid geometries"))
      data <- st_make_valid(data)
    }
  }
  if (inherits(data, "SpatRaster")) {
    cell_values <- values(data, mat = FALSE)
    coverage <- (sum(!is.na(cell_values)) / ncell(data)) * 100
    if (coverage < 10) {
      warning(paste("Low data coverage:", round(coverage, 1), "%"))
    }
  }
  return(data)
}

# Run both of our main inputs through the validator.
validated_sites <- validate_spatial_data(
  st_as_sf(field_sites, coords = c("lon", "lat"), crs = 4326)
)
validated_raster <- validate_spatial_data(satellite_raster)
# Quick-reference table: which join method fits which data combination.
method_guide <- data.frame(
  Source_Type = c("Points", "Polygons", "Raster", "Raster", "Points"),
  Target_Type = c("Raster", "Raster", "Polygons", "Raster", "Points"),
  Recommended_Method = c("extract", "extract", "zonal", "resample", "nearest"),
  Use_Case = c(
    "Sample at locations", "Area averages", "Regional stats",
    "Align data", "Find closest"
  ),
  stringsAsFactors = FALSE
)
print(method_guide)
# One-line optimisation tips, printed as "name : tip".
optimization_tips <- list(
  large_datasets = "Use chunk_size parameter to control memory usage",
  different_crs = "Let the function handle CRS conversion automatically",
  missing_data = "Choose appropriate na_strategy for your analysis needs",
  multiple_variables = "Process variables separately for better error handling",
  visualization = "Use quick_map() for fast preliminary visualization"
)

# Map over names and values together; invisible() suppresses Map's return
# value so only the cat() output appears.
invisible(Map(
  function(tip_name, tip) cat(tip_name, ":", tip, "\n"),
  names(optimization_tips),
  optimization_tips
))
This vignette demonstrated comprehensive spatial data integration using GeoSpatialSuite:
Key functions covered: universal_spatial_join() — core integration function with auto-detection; raster_to_raster_ops() — mathematical operations between rasters; multiscale_operations() — multi-scale analysis capabilities; spatial_interpolation_comprehensive() — fill missing data spatially; integrate_terrain_analysis() — specialized terrain integration. This work was developed by the GeoSpatialSuite team with contributions from: Olatunde D. Akanbi, Erika I. Barcelos, and Roger H. French.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.