Nothing
## ----setup, include = FALSE---------------------------------------------------
# Check if required packages are available
has_suggested <- requireNamespace("sf", quietly = TRUE) &&
requireNamespace("disdat", quietly = TRUE) &&
requireNamespace("purrr", quietly = TRUE) &&
requireNamespace("tidyr", quietly = TRUE) &&
requireNamespace("tibble", quietly = TRUE) &&
requireNamespace("ggplot2", quietly = TRUE)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 7,
out.width = "100%",
eval = has_suggested
)
## ----check-packages, echo = FALSE, results = "asis", eval = TRUE--------------
if (!has_suggested) {
cat("**Note:** This vignette requires the following packages to run: `sf`, `disdat`, `purrr`, `tidyr`, `tibble`, and `ggplot2`. Please install them to view the full vignette output.\n\n")
cat("```r\n")
cat('install.packages(c("sf", "disdat", "purrr", "tidyr", "tibble", "ggplot2"))\n')
cat("```\n\n")
}
## ----libraries, message=FALSE-------------------------------------------------
library(dplyr)
library(purrr)
library(tidyr)
library(tibble)
library(ggplot2)
library(terra)
library(sf)
library(disdat)
library(MIAmaxent)
## ----load-data----------------------------------------------------------------
# Choose a species (change this to model a different species!)
selected_species <- "swi01" # `disdat` has anonymized species codes: swi01 to swi30
# Define predictor variables
predictors <- c(
"bcc", "calc", "ccc", "ddeg", "nutri", "pday",
"precyy", "sfroyy", "slope", "sradyy", "swb", "tavecc", "topo"
)
# Extract presence-only (PO) data for the selected species
swi_po <- disPo("SWI") # function from `disdat` package
po_records <- swi_po[swi_po$spid == selected_species, ]
cat("Number of presence records:", nrow(po_records), "\n")
# Background data (10,000 random locations)
background <- disBg("SWI") # function from `disdat` package
# Combine presence and background into training data frame
# RV = 1 for presences, NA for background
train_data <- rbind(
data.frame(RV = 1, po_records[, predictors]),
data.frame(RV = NA, background[, predictors])
)
cat("Training presences:", sum(train_data$RV == 1, na.rm = TRUE), "\n")
## ----plot-po, fig.height=4----------------------------------------------------
studyarea <- disBorder("SWI")
po_sf <- st_as_sf(po_records, coords = c("x", "y"), crs = 21781)
ggplot() +
geom_sf(data = studyarea, fill = "lightgrey") +
geom_sf(data = po_sf, color = "blue", size = 0.5) +
labs(
title = paste("Presence records for species", selected_species),
x = "Easting (m)", y = "Northing (m)"
) +
theme_minimal()
## -----------------------------------------------------------------------------
# Rename columns to more informative names
col_name_map <- c(
RV = "RV",
bcc = "broadleaf.cover",
calc = "bedrock.calcareous",
ccc = "conifer.cover",
ddeg = "growing.degree.days",
nutri = "soil.nutrients",
pday = "precip.days",
precyy = "annual.precip",
sfroyy = "summer.frost.freq",
slope = "slope",
sradyy = "solar.radiation",
swb = "water.balance",
tavecc = "temp.coldest.month",
topo = "topographic.position"
)
names(train_data) <- col_name_map[names(train_data)]
# Convert the categorical variable to factor
train_data$bedrock.calcareous <- as.factor(train_data$bedrock.calcareous)
## ----single-fop, fig.height=4-------------------------------------------------
plotFOP(train_data, "temp.coldest.month")
## ----fop-data-extraction, echo=FALSE, fig.show='hide'-------------------------
# Identify continuous variables (exclude RV and categorical variables)
continuous_vars <- setdiff(names(train_data), c("RV", "bedrock.calcareous"))
# Extract FOP data for all continuous variables
fop_results <- map(continuous_vars, \(x) plotFOP(train_data, x))
names(fop_results) <- continuous_vars
## ----fop-combine-data, echo=FALSE---------------------------------------------
# Combine FOP data into a single data frame
fop_combined <- map2_dfr(
fop_results, continuous_vars,
\(x, y) mutate(x$FOPdata, variable = y)
)
# Prepare raw training data for density plots
train_data_long <- train_data %>%
select(all_of(continuous_vars)) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value")
## ----fop-faceted-plot, fig.height=10, echo=FALSE------------------------------
# Create a faceted plot to compare FOP across all continuous variables
ggplot() +
# Grey density showing data distribution (scaled to fit on same axis)
geom_density(
data = train_data_long, aes(x = value, y = after_stat(scaled)),
fill = "grey90", color = NA
) +
# FOP points and loess line
geom_point(data = fop_combined, aes(x = intEV, y = intRV), size = 1.5) +
geom_line(data = fop_combined, aes(x = intEV, y = loess), color = "red") +
facet_wrap(~variable, scales = "free_x", ncol = 3) +
labs(
x = "Environmental variable value",
y = "Frequency of Observed Presence (FOP)"
) +
theme_bw() +
theme(strip.text = element_text(size = 8))
## ----derive-vars--------------------------------------------------------------
# Create derived variables using multiple transformation types
# L = Linear, M = Monotonic, D = Deviation (unimodal),
# HF = Forward hinge, HR = Reverse hinge, T = Threshold
train_DVs <- deriveVars(train_data,
transformtype = c("L", "M", "D", "HF", "HR", "T"),
allsplines = TRUE,
quiet = TRUE
)
## ----examine-dvs--------------------------------------------------------------
# Example: some of the DVs created for temperature
head(names(train_DVs$dvdata$temp.coldest.month))
## ----select-dvs---------------------------------------------------------------
# Select the best DVs for each individual EV
train_DV_select <- selectDVforEV(train_DVs$dvdata,
alpha = 0.001,
quiet = TRUE
)
## ----examine-selected-dvs-----------------------------------------------------
# Example: DVs selected for temperature
names(train_DV_select$dvdata$temp.coldest.month)
## ----select-evs---------------------------------------------------------------
# Select which EVs (represented by their best DVs) to include
train_model <- selectEV(train_DV_select$dvdata,
alpha = 0.001,
interaction = FALSE,
quiet = TRUE
)
# View the selection trail, only the top model from each round
selection_trail <- train_model$selection[!duplicated(train_model$selection$round), ]
knitr::kable(selection_trail, digits = c(0, 0, 0, 3, 1, 0, 3)) # Print the data.frame as a table
## ----final-model--------------------------------------------------------------
# View the final model
train_model$selectedmodel$formula
## ----response-curves, fig.height=4--------------------------------------------
# Get the most important variable (first selected)
top_EV <- train_model$selection$variables[1]
# Plot response curve for the most important variable
plotResp(
train_model$selectedmodel,
train_DVs$transformations,
top_EV
)
## ----compare-fop-response, fig.show='hold', fig.height=4----------------------
plotFOP(train_data, top_EV)
## ----all-responses, include=FALSE---------------------------------------------
evs <- (train_model$selectedmodel$coefficients)[-1] |>
names() |>
sub("_.*$", "", x = _) |>
unique()
for (ev in evs[-1]) {
plotResp(
train_model$selectedmodel,
train_DVs$transformations,
ev
)
}
train_model$selectedmodel$formula
## ----load-data-pa-------------------------------------------------------------
# Extract presence-absence (PA) data
swi_pa <- disPa("SWI") # function from `disdat` package
# Merge with environmental data by location
swi_env <- disEnv("SWI") # function from `disdat` package
pa_merged <- merge(swi_pa, swi_env)
# Create validation data frame with RV and environmental variables (parallel to train_data)
validation_data <- data.frame(
RV = pa_merged[, selected_species],
pa_merged[, predictors]
)
# Rename columns to match training data
names(validation_data) <- col_name_map[names(validation_data)]
# Convert the categorical variable to factor
validation_data$bedrock.calcareous <- as.factor(validation_data$bedrock.calcareous)
cat("Validation presences:", sum(validation_data$RV == 1), "\n")
cat("Validation absences:", sum(validation_data$RV == 0), "\n")
## ----plot-pa, fig.height=4----------------------------------------------------
pa_sf <- st_as_sf(swi_pa, coords = c("x", "y"), crs = 21781)
# Convert to factor for discrete colors (presence/absence)
pa_sf[[selected_species]] <- as.factor(pa_sf[[selected_species]])
ggplot() +
geom_sf(data = studyarea, fill = "lightgrey") +
geom_sf(data = pa_sf, aes(color = .data[[selected_species]]), size = 0.5) +
scale_color_manual(
values = c("0" = "red", "1" = "blue"),
labels = c("0" = "Absence", "1" = "Presence"),
name = "Record type"
) +
labs(
title = paste("Presence-absence records for species", selected_species),
x = "Easting (m)", y = "Northing (m)"
) +
theme_minimal()
## ----validation-auc, fig.height=4---------------------------------------------
# Calculate validation AUC
auc_result <- testAUC(
model = train_model$selectedmodel,
transformations = train_DVs$transformations,
data = validation_data
)
## ----reconstruct-spatial, include=FALSE---------------------------------------
# To create spatial predictions directly, we need raster data
# Here we reconstruct the (incomplete) raster from coordinates, and metadata instead
# Create projection data (similar to validation data but with coordinates)
proj_data <- data.frame(
x = pa_merged$x,
y = pa_merged$y,
pa_merged[, predictors]
)
# Rename columns to match training data
col_name_map <- c(
x = "x",
y = "y",
col_name_map
)
names(proj_data) <- col_name_map[names(proj_data)]
# Convert the categorical variable to factor
proj_data$bedrock.calcareous <- as.factor(proj_data$bedrock.calcareous)
# SWI
# Prepared by Niklaus E. Zimmermann and Antoine Guisan
# 13 variables: see SWI\01_metadata_SWI_environment.csv
# Coordinate reference system: Transverse, spheroid Bessel (note all data has had a constant shift applied to it)
# EPSG:21781
# Units: meters
# Raster cell size: 100m
# Create a raster to fill in with predictions
# Here we assume env_data has columns 'x' and 'y' for coordinates
coordinates <- proj_data[, c("x", "y")]
env_raster <- rast(
xmin = min(coordinates$x),
xmax = max(coordinates$x),
ymin = min(coordinates$y),
ymax = max(coordinates$y),
resolution = 100,
crs = "EPSG:21781"
)
# Create a simple prediction map using the environmental data
predictions <- projectModel(
model = train_model$selectedmodel,
transformations = train_DVs$transformations,
data = proj_data
)
# Rasterize the predictions
pred_raster <- rasterize(as.matrix(coordinates), env_raster,
values = predictions$output$PRO
)
plot(pred_raster, main = "Predicted Probability of Occurrence")
## ----auc-po, include=FALSE----------------------------------------------------
testAUC(
model = train_model$selectedmodel,
transformations = train_DVs$transformations,
data = train_data
)
## ----bv-setup-----------------------------------------------------------------
# Create a tibble to store results for three alpha levels
model_comparison <- tibble(
alpha = c(1e-1, 1e-3, 1e-9)
)
## ----bv-split-----------------------------------------------------------------
set.seed(123) # For reproducibility
# Create indices for 70/30 split
n_total <- nrow(train_data)
train_idx <- sample(1:n_total, size = floor(0.7 * n_total))
# Split the data into training and test sets
train_70 <- train_data[train_idx, ]
test_30 <- train_data[-train_idx, ]
# Derive variables for the training set
train_DVs_70 <- deriveVars(train_70,
transformtype = c("L", "M", "D", "HF", "HR", "T"),
allsplines = TRUE,
quiet = TRUE
)
## ----bv-fit-models------------------------------------------------------------
# Add model fitting results using map
model_comparison <- model_comparison %>%
mutate(
# Step 1: Select DVs for each EV
dv_selection = map(alpha, \(a) selectDVforEV(train_DVs_70$dvdata,
alpha = a,
quiet = TRUE
)),
# Step 2: Select EVs for final model
ev_selection = map2(
dv_selection, alpha,
\(dv, a) selectEV(dv$dvdata,
alpha = a,
interaction = FALSE,
quiet = TRUE
)
),
# Extract number of DVs in final model
n_dvs = map_int(ev_selection, \(ev) length(ev$selectedmodel$coefficients) - 1)
)
# Display intermediate results
model_comparison %>%
select(alpha, n_dvs)
## ----bv-evaluate, fig.show='hide', warning=FALSE------------------------------
# Add AUC calculations
model_comparison <- model_comparison %>%
mutate(
# AUC on 30% test set (internal evaluation)
auc_test = map_dbl(
ev_selection,
\(ev) testAUC(
model = ev$selectedmodel,
transformations = train_DVs_70$transformations,
data = test_30
)
),
# AUC on independent validation set (external evaluation)
auc_validation = map_dbl(
ev_selection,
\(ev) testAUC(
model = ev$selectedmodel,
transformations = train_DVs_70$transformations,
data = validation_data
)
)
)
# Display final results table
results_table <- model_comparison %>%
select(alpha, n_dvs, auc_test, auc_validation)
knitr::kable(results_table, digits = c(10, 0, 3, 3))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.