knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This vignette serves as a quick guide to Spatial Logistic Gaussian Process (SLGP) modeling, with a focus on predicting distributions for integer outputs.
We illustrate the model's capabilities using the Boston Housing dataset @harrison_hedonic_1978, a widely used benchmark in statistical modeling and regression analysis.
In this vignette, we demonstrate how to model the distribution of rad (index of accessibility to radial highways, between 1 and 24 where 24 indicates the best accessibility and 1 the worst) as a function of medv (median value of owner-occupied homes) and dis (weighted distance to employment centers) using Spatial Logistic Gaussian Processes (SLGPs). Since rad is an integer-valued variable, our goal is to adjust the implementation for this.
library(dplyr) # Load the dataset (available in MASS package) require(MASS) data("Boston", package = "MASS") df <- Boston range_response <- range(df$rad) range_x <- matrix(c(c(1, 13), range(df$medv)), # Use c(1, 13) instead of range = c(1.1296 12.1265) # For easier chunks ncol=2, byrow=TRUE)
We represent the data.
library(ggplot2) library(ggpubr) # Histogram: Distribution of med by age bin ggplot(df, aes(x = dis, y=medv, fill=as.factor(rad))) + geom_point(pch=21)+ labs(x = "Median value of owner-occupied homes [k$]", y = "weighted distance to employment centers", title = "Scatter-plot of accessibility to radial highways") + theme_minimal()+ scale_fill_viridis_d(option = "viridis", guide = guide_legend(nrow = 3, title = "Index of accessibility", barheight = unit(2, units = "mm"), barwidth = unit(55, units = "mm"), title.position = 'top', label.position = "bottom", title.hjust = 0.5))+ theme(legend.position = "bottom")
We propose mapping the value 24 to 9 for rad, resulting in a more compact domain
df$rad <- ifelse(df$rad==24, 9, df$rad) range_response <- range(df$rad) # Histogram: Distribution of med by age bin ggplot(df, aes(x = dis, y=medv, fill=factor(rad, levels=seq(9)))) + geom_point(pch=21)+ labs(x = "Median value of owner-occupied homes [k$]", y = "weighted distance to employment centers", title = "Scatter-plot of accessibility to radial highways") + theme_minimal()+ scale_fill_viridis_d(option = "viridis", guide = guide_legend(nrow = 3, title = "Index of accessibility", barheight = unit(2, units = "mm"), barwidth = unit(55, units = "mm"), title.position = 'top', label.position = "bottom", title.hjust = 0.5))+ theme(legend.position = "bottom")
We can also display the empirical probabilities. For that, we use "bins" for the samples, as there are no replicates.
# Create interval variables for medv and dis df <- df %>% mutate(medv_bin = factor(paste0("medv in ", cut(medv, breaks = seq(0, 50, by = 10), include.lowest = FALSE, right = TRUE)), levels = paste0("medv in ", cut(seq(50, 1, -10), breaks = seq(0, 50, by = 10), include.lowest = FALSE, right = TRUE))), dis_bin = factor(paste0("dis in ", cut(dis, breaks = seq(1, 13, by = 2), include.lowest = FALSE, right = TRUE)), levels = paste0("dis in ", cut(seq(2, 13, 2), breaks = seq(1, 13, by = 2), include.lowest = FALSE, right = TRUE)))) ggplot(df, aes(x = rad, y = after_stat(prop))) + geom_bar(fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, width=0.4)+ facet_grid(medv_bin ~ dis_bin) + labs(x = "Index of accessibility to radial highways (RAD)", y = "Proportion", title = "Distribution of RAD across bins of various home value and distance to employment centers") + theme_minimal() + theme(legend.position = "bottom")+ scale_x_continuous(breaks = c(1:9))
library(SLGP) modelMAP <- slgp(rad~dis+medv, # Use a formula with two indexing variables data=df, method="MAP", #Maximum a posteriori estimation scheme basisFunctionsUsed = "RFF", interpolateBasisFun="WNN", # Accelerate inference hyperparams = list(lengthscale=c(0.1, 0.15, 0.15), # Applied to normalised data # So 0.15 is 15% of the range of values sigma2=1), nIntegral = 9, #or length(seq(seq(range_response[1], range_response[2]))) sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability predictorsLower= c(range_x[,1]), predictorsUpper= c(range_x[,2]), responseRange= range_response, opts_BasisFun = list(nFreq=150, MatParam=5/2))
We can represent the conditional probabilities. For that, we use "bins" for the samples, as there are no replicates. We display the histograms of values in these bins compared to SLGP predictions of the probabilities at the center of the bins.
library(viridis) dfGrid <- data.frame(expand.grid(seq(range_x[1, 1], range_x[1, 2], 0.5), seq(range_x[2, 1], range_x[2, 2], 1), seq(range_response[1], range_response[2]))) colnames(dfGrid) <- c("dis", "medv", "rad") pred <- predictSLGP_newNode(SLGPmodel=modelMAP, newNodes = dfGrid, nIntegral = 9) pred[, -c(1:3)] <- pred[, -c(1:3)] * diff(range_response) /(diff(range_response) +1) # Goes from values to integrate over a domain to discrete probabilities df_plot <- pred %>% filter(dis %in% seq(2, 12, 2))%>% filter(medv %in% seq(5, 45, 10)) %>% mutate(medv_bin=paste0("medv in (", medv-5,",", medv+5,"]"))%>% mutate(dis_bin=paste0("dis in (", dis-1,",", dis+1,"]")) df_counts <- df %>% group_by(medv_bin, dis_bin) %>% summarise(count = paste0("n=", n()), .groups = "keep") ggplot() + geom_bar(data=df, mapping=aes(x = rad-0.1, y = after_stat(prop)), fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, width=0.4) + geom_col(data=df_plot, mapping=aes(x = rad+0.1, y = pdf_1), col="red", fill="grey", lwd=0.2, alpha=0.7, lty=2, width=0.4) + geom_text(data=df_counts, mapping=aes(x = 1.5, y = 1, label=count), col="grey10", size = 3, vjust = 1)+ facet_grid(medv_bin ~ dis_bin) + labs(x = "Index of accessibility to radial highways (RAD)", y = "Proportion", title = "Distribution of RAD across bins of various medv and dis values (blue histogram)\nVS SLGP at the center of these bins") + theme_minimal() + theme(legend.position = "bottom")+ scale_x_continuous(breaks = c(1:9))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.