mlspatial:Spatial Machine Learning workflow
In mlspatial: Machine Learning and Mapping for Spatial Epidemiology

knitr::opts_chunk$set(collapse = TRUE, comment = "#>", echo = TRUE, message = FALSE, warning = FALSE)

olddir <- getwd()
setwd(tempdir())
setwd(olddir)

oldopt <- options(digits = 4)
options(oldopt)

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(mlspatial)
library(dplyr)
library(ggplot2)
library(tmap)
library(sf)
library(spdep)
library(randomForest)
library(xgboost) 
library(e1071) 
library(caret)
library(tidyr)

# Quick thematic map with country labels
plot(africa_shp)
qtm(africa_shp)
qtm(africa_shp, text = "NAME")  # Replace with actual field name

tm_shape(africa_shp) +
  tm_polygons() +
  tm_text("NAME", size = 0.5) +  # Replace with correct column
  tm_title ("Africa Countries")

ggplot(africa_shp) +
  geom_sf(fill = "lightgray", color = "black") +
  geom_sf_text(aes(label = NAME), size = 2) +  # Replace as needed
  ggtitle("Africa countries") +
  theme_minimal()

# Join data
mapdata <- join_data(africa_shp, panc_incidence, by = "NAME")


## OR Joining/ merging my data and shapefiles
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")   
## OR mapdata <- left_join(nat, codata, by = "DISTRICT_N")
str(mapdata)

#Visualize Pancreatic cancer Incidence by countries
#Basic map with labels
# quantile map
p1 <- tm_shape(mapdata) + 
  tm_fill("incidence", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Incidence")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p2 <- tm_shape(mapdata) + 
  tm_fill("female", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Female")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p3 <- tm_shape(mapdata) + 
  tm_fill("male", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Male")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p4 <- tm_shape(mapdata) + 
  tm_fill("ageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Age:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p5 <- tm_shape(mapdata) + 
  tm_fill("agec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Age:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p6 <- tm_shape(mapdata) + 
  tm_fill("agea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Age:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p7 <- tm_shape(mapdata) + 
  tm_fill("fageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Female:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p8 <- tm_shape(mapdata) + 
  tm_fill("fagec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Female:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p9 <- tm_shape(mapdata) + 
  tm_fill("fagea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Female:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p10 <- tm_shape(mapdata) + 
  tm_fill("mageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Male:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p11 <- tm_shape(mapdata) + 
  tm_fill("magec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Male:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p12 <- tm_shape(mapdata) + 
  tm_fill("magea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Male:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)

current.mode <- tmap_mode("plot")
tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, 
             widths = c(.75, .75))
tmap_mode(current.mode)

## Machine Learning Model building
# 1. Random Forest Regression
# Train Random Forest
set.seed(123)
rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                           magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, 
                         importance = TRUE)

# View model results
print(rf_model)
varImpPlot(rf_model)

#Plot Variable Importance
library(ggplot2)
importance_df <- data.frame(
  Variable = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "IncNodePurity"])

ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance")

# 2. Gradient Boosting Machine (XGBoost)
# Prepare the data
x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                         magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1]
y <- mapdata$incidence

# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_vars, label = y)

# Train model
xgb_model <- xgboost(data = dtrain, 
                     objective = "reg:squarederror", 
                     nrounds = 100,
                     max_depth = 4,
                     eta = 0.1,
                     verbose = 0)

# Feature importance
xgb.importance(model = xgb_model)

# 3. Support Vector Regression (SVR)
# Train SVR model
svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                   magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, 
                 type = "eps-regression", 
                 kernel = "radial")

# Summary and predictions
summary(svr_model)
#mapdata$pred_svr <- predict(svr_model)

# Model Evaluation (predictions):
# evaluate each model step-by-step:
# 1. Random Forest Evaluation
rf_preds <- predict(rf_model, newdata = mapdata)
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
print(rf_metrics)

# 2. XGBoost Evaluation
xgb_preds <- predict(xgb_model, newdata = x_vars)
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
print(xgb_metrics)

# 3. SVR Evaluation
svr_preds <- predict(svr_model, newdata = mapdata)
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
print(svr_metrics)

# Compare All Models
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]))

print(model_eval)

#Choosing the Best Model
#Lowest RMSE and MAE = most accurate predictions
#Highest R² = best variance explanation
#Sort and interpret:
model_eval[order(model_eval$RMSE), ]  # Best = Top row

#### Models Predicted plots
# Create a data frame from your table
model_preds <- data.frame(rf_preds, xgb_preds, svr_preds)
# Add observation ID
model_preds$ID <- 1:nrow(model_preds)

# Reshape for plotting
model_long <- model_preds %>%
  tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted")

# Plot
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
  geom_line(linewidth = 0.5) +
  labs(title = "Model Predictions Over Observations",
       x = "Observation", y = "Predicted Incidence") +
  theme_minimal()

## plot actual vs predicted 
oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side

# Random Forest
plot(mapdata$incidence, mapdata$rf_pred,
     xlab = "Observed", ylab = "Predicted",
     main = "Random Forest", pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2)

# XGBoost
plot(mapdata$incidence, mapdata$xgb_pred,
     xlab = "Observed", ylab = "Predicted",
     main = "XGBoost", pch = 19, col = "darkgreen")
abline(0, 1, col = "red", lwd = 2)

# SVR
plot(mapdata$incidence, mapdata$svr_pred,
     xlab = "Observed", ylab = "Predicted",
     main = "SVR", pch = 19, col = "purple")
abline(0, 1, col = "red", lwd = 2)

par(oldpar)

## Actual vs predicted plot with correlations

library(ggplot2)
library(ggpubr)  # For stat_cor

mapdata$rf_pred <- predict(rf_model)
mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars)
mapdata$svr_pred <- predict(svr_model, newdata = mapdata)

# Random Forest Plot
p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
  labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# XGBoost Plot
p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
  labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# SVR Plot
p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) +
  geom_point(color = "purple", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
  labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE)

## CROSS-VALIDATION
#Step 1: Prepare common setup
# Set seed for reproducibility
set.seed(123)

# Define 5-fold cross-validation
cv_control <- trainControl(method = "cv", number = 5)

# 1. Random Forest
library(randomForest)

rf_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "rf",
  trControl = cv_control,
  tuneLength = 3,
  importance = TRUE
)
print(rf_cv)

# 2. XGBoost
library(xgboost)

xgb_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "xgbTree",
  trControl = cv_control,
  tuneLength = 3
)
print(xgb_cv)

# 3. Support Vector Regression (SVR)
library(e1071)
library(kernlab)

svr_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "svmRadial",
  trControl = cv_control,
  preProcess = c("center", "scale"),  # SVR often benefits from scaling
  tuneLength = 3
)
print(svr_cv)


# Compare All Models (from CV)
results <- resamples(list(
  RF = rf_cv,
  XGB = xgb_cv,
  SVR = svr_cv
))

# Summary of RMSE, MAE, Rsquared
summary(results)

# Boxplot of performance
bwplot(results)

## Spatial maps of predicted values of each model
# 1. Random Forest Spatial Map
mapdata$pred_rf <- predict(rf_model, newdata = mapdata)

tm_shape(mapdata) + 
  tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens",  style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)

# 2. XGBoost Spatial Map
# Ensure matrix used in training
mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars)

tm_shape(mapdata) + 
  tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)

# 3. Support Vector Regression (SVR) Spatial Map
mapdata$pred_svr <- predict(svr_model, newdata = mapdata)

tm_shape(mapdata) + 
  tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)

# Compare Side by Side
tmap_arrange(
  tm_shape(mapdata) + 
    tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens",  style = "quantile"), 
            fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + 
    tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                             frame = TRUE, component.autoscale = FALSE),
  tm_shape(mapdata) + 
    tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"), 
            fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + 
    tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                             frame = TRUE, component.autoscale = FALSE),
  tm_shape(mapdata) + 
    tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
            fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + 
    tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                             frame = TRUE, component.autoscale = FALSE),
  nrow = 1
)

# Predicted Residuals
# we've already trained all 3 models and have `mapdata$incidence` as your actual values.
### Step 1: Generate predictions and residuals for each model
# Random Forest
rf_preds <- predict(rf_model, newdata = mapdata)
rf_resid <- mapdata$incidence - rf_preds

# XGBoost
xgb_preds <- predict(xgb_model, newdata = x_vars)  # x_vars = model.matrix(...)
xgb_resid <- mapdata$incidence - xgb_preds

# SVR
svr_preds <- predict(svr_model, newdata = mapdata)
svr_resid <- mapdata$incidence - svr_preds

### Step 2: Combine into a single data frame
residuals_df <- data.frame(
  actual = mapdata$incidence,
  rf_pred = rf_preds,
  rf_resid = rf_resid,
  xgb_pred = xgb_preds,
  xgb_resid = xgb_resid,
  svr_pred = svr_preds,
  svr_resid = svr_resid
)

# export
library(writexl)

### Compare residual distributions

boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid,
        names = c("RF", "XGB", "SVR"),
        main = "Model Residuals",
        ylab = "Prediction Error (Residual)")

## Spatial maps of residual values from each model
#Add residuals to mapdata
#You should already have these from the previous steps:
mapdata$rf_resid <- residuals_df$rf_resid
mapdata$xgb_resid <- residuals_df$xgb_resid
mapdata$svr_resid <- residuals_df$svr_resid

# Set tmap mode to plot (static map)
tmap_mode("plot")

# Create individual residual maps
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile", 
                                                     midpoint = 0), fill.legend = tm_legend(title = "Inci_rf_resid")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                                          frame = TRUE, component.autoscale = FALSE)

map_xgb <- tm_shape(mapdata) + 
  tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile", 
                                                      midpoint = 0), fill.legend = tm_legend(title = "Inci_xgb_resid")) + tm_borders(fill_alpha = .2) + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
            frame = TRUE, component.autoscale = FALSE)
map_svr <- tm_shape(mapdata) + 
  tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_svr_resid")) + tm_borders(fill_alpha = .2) + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
            frame = TRUE, component.autoscale = FALSE)

#Step 3: Combine maps in a grid
# Combine maps in a grid layout
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

##Barplot and Spatial maps for RMSE and MAE
#Step 1: Calculate RMSE and MAE for each model
# Random Forest
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)

# XGBoost
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)

# SVR
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)

#Step 2: Combine into a summary data frame
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])
)

print(model_eval)

#Visualize MAE and RMSE
oldpar <- par(mfrow = c(1, 3))

#Barplot of RMSE
barplot(model_eval$RMSE, names.arg = model_eval$Model, 
        col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE")

#Barplot of MAE
barplot(model_eval$MAE, names.arg = model_eval$Model, 
        col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE")
#Barplot of MAE
barplot(model_eval$Rsquared, names.arg = model_eval$Model, 
        col = "grey", las = 1, main = "Model Rsquared", ylab = "MAE")

par(oldpar)

#Add metrics to residual maps as captions
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", midpoint = 0),
          title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2),
                         "\nMAE: ", round(rf_metrics["MAE"], 2))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_xgb <- tm_shape(mapdata) +
  tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0),
          title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2),
                         "\nMAE: ", round(xgb_metrics["MAE"], 2))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_svr <- tm_shape(mapdata) +
  tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0),
          title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2),
                         "\nMAE: ", round(svr_metrics["MAE"], 2))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

#Global Moran’s I, Local Moran’s I (LISA), Cluster categories (e.g., High-High, Low-Low), Maps: Moran’s I map,
#LISA clusters, High-High clusters using the predicted values from the four machine learning models
#Assumptions: Your spatial data is in mapdata (an sf object).
#Predicted values for each model are already stored in: rf_preds, xgb_preds, svr_preds, mlp_preds
#STEP 1: Create Spatial Weights (if not done yet)
neighbors <- poly2nb(mapdata) #if this gives warning, use the below codes
mapdata <- st_as_sf(mapdata)  # If it's not already sf
mapdata <- st_make_valid(mapdata)  # Fix any invalid geometries
neighbors <- poly2nb(mapdata, snap = 1e-15) ## Try 1e-6, 1e-5, or higher if needed. You can adjust snap upward incrementally until the warnings disappear or are reduced

listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE)

#STEP 2: Define a function to compute Moran’s I and cluster categories
analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) {
  # Standardize predicted values
  mapdata$val_st <- scale(values)[, 1]
  # Compute lag
  mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE)
  # Global Moran's I
  global_moran <- moran.test(values, listw, zero.policy = TRUE)
  # Local Moran's I (LISA)
  lisa <- localmoran(values, listw, zero.policy = TRUE)
  lisa_df <- as.data.frame(lisa)
  #rename p-value column
  colnames(lisa_df)[5] <- "Pr_z"
  # Add to mapdata
  mapdata$Ii <- lisa_df$Ii
  mapdata$Z_Ii <- lisa_df$Z.I
  mapdata$Pr_z <- lisa_df$Pr_z
  #mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"]
  # Classify clusters
  mapdata <- mapdata %>%
    mutate(
      cluster = case_when(
        val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High",
        val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low",
        val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High",
        val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low",
        TRUE ~ "Not Significant"
      )
    )
  return(list(
    map = mapdata,
    global_moran = global_moran
  ))
}

#STEP 3: Run the function for each model
rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest")
xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost")
svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR")

#STEP 4: Mapping LISA Clusters and High-High Areas for Random Forest
tmap_mode("plot")
# LISA Cluster Map.  fill.scale =tm_scale_intervals(values = "-RdBu")
tm_rf <- tm_shape(rf_result$map) +
  tm_fill(
    "cluster",
    fill.scale = tm_scale(values = c(
      "High-High" = "red",
      "Low-Low" = "blue",
      "Low-High" = "lightblue",
      "High-Low" = "pink",
      "Not Significant" = "gray90")),
    fill.legend = tm_legend(title = "LISA Clusters - RF")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

# Mapping LISA Clusters and High-High Areas for XGBoost
tmap_mode("plot")
# LISA Cluster Map
tm_xgb <- tm_shape(xgb_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

#Mapping LISA Clusters and High-High Areas for SVR
tmap_mode("plot")
# LISA Cluster Map
tm_svr <- tm_shape(svr_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - SVR")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1)
#You can also arrange maps side-by-side using tmap_arrange().

#View Global Moran’s I Results
#These print the test statistic and p-values for global spatial autocorrelation of predictions.
rf_result$global_moran
xgb_result$global_moran
svr_result$global_moran