#' @title Function to evaluate relative importance of each variable.
#' @description Evaluate relative importance of each variable within the model
#' using the following methods:
#' \itemize{
#' \item{Jackknife test based on AUC ratio and Pearson correlation with the
#' result of the model using all variables}
#' \item{SHapley Additive exPlanations (SHAP) according to Shapley values}}
#' @param model (`isolation_forest`) The extended isolation forest SDM. It could be
#' the `model` item of a `POIsotree` object made by function \code{\link{isotree_po}}.
#' @param pts_occ (`sf`) The `sf` style table that
#' includes the training occurrence locations.
#' @param pts_occ_test (`sf`, or `NULL`) The `sf` style
#' table that includes the occurrence locations for testing.
#' If `NULL`, it is set to the same as `pts_occ`. The default is `NULL`.
#' @param variables (`stars`) The `stars` object of environmental variables. It should have
#' multiple `attributes` instead of `dims`. If you have a `raster` object instead, you
#' could use \code{\link{st_as_stars}} to convert it to `stars`, or use
#' \code{\link{read_stars}} to read the source data directly as a `stars` object.
#' @param shap_nsim (`integer`) The number of Monte Carlo repetitions in SHAP
#' method to use for estimating each Shapley value. See details in documentation of
#' function \code{\link{explain}} in package `fastshap`.
#' @param visualize (`logical`) If `TRUE`, plot the analysis figures.
#' The default is `FALSE`.
#' @param seed (`integer`) The seed for any random process. The default is `10L`.
#' @return (`VariableAnalysis`) A list of
#' \itemize{
#' \item{variables (`vector` of `character`) The names of environmental variables}
#' \item{pearson_correlation (`tibble`) A table of the Jackknife test results based on Pearson correlation}
#' \item{full_AUC_ratio (`tibble`) A table of the AUC ratios of the training and test datasets
#' using all variables, which act as references for the Jackknife test}
#' \item{AUC_ratio (`tibble`) A table of the Jackknife test results based on AUC ratio}
#' \item{SHAP (`list`) A list of two `tibble`s of Shapley values for the training and
#' test datasets, respectively}
#' }
#'
#' @seealso
#' \code{\link{plot.VariableAnalysis}}, \code{\link{print.VariableAnalysis}},
#' \code{\link{explain}} in `fastshap`
#'
#' @details
#' \bold{Jackknife test} of variable importance is reflected as the decrease
#' in model performance when an environmental variable is used on its own or is
#' excluded from the environmental variable pool. In this function, we use
#' Pearson correlation and AUC ratio as the performance metrics.
#'
#' \bold{Pearson correlation} is the correlation between the predictions generated by
#' a jackknife model (one variable used alone, or all variables except one) and the
#' predictions generated by the full model; it serves as the assessment of model performance.
#'
#' The area under the ROC curve (AUC) is a threshold-independent evaluator of
#' model performance, which needs both presence and absence data. A ROC curve is
#' generated by plotting the proportion of correctly predicted presences on the
#' y-axis against 1 minus the proportion of correctly predicted absences on the x-axis
#' for all thresholds. Multiple approaches have been used to evaluate the accuracy of
#' presence-only models. Peterson et al. (2008) modified the AUC by plotting the
#' proportion of presences falling above a range of thresholds against the
#' proportion of cells of the whole area falling above the same range of thresholds.
#' This is the so-called \bold{AUC ratio} that is used in this package.
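#'
#' Below is a minimal, illustrative sketch of this modified-AUC idea. It is an
#' assumption for illustration only, not necessarily the exact implementation used
#' internally: the proportion of presences above each threshold is plotted against
#' the proportion of all cells above the same threshold, and the resulting area is
#' divided by 0.5, the expectation for a random prediction.
#'
#' ```r
#' # Hypothetical helper, for illustration only
#' modified_auc_ratio <- function(pred_occ, pred_all, n_thresholds = 100) {
#'   ths <- seq(0, 1, length.out = n_thresholds)
#'   # Proportion of presences (y) and of all cells (x) above each threshold
#'   prop_occ  <- sapply(ths, function(t) mean(pred_occ >= t, na.rm = TRUE))
#'   prop_area <- sapply(ths, function(t) mean(pred_all >= t, na.rm = TRUE))
#'   ord <- order(prop_area)
#'   # Trapezoidal integration of the curve, then ratio to the random
#'   # expectation (area = 0.5)
#'   auc <- sum(diff(prop_area[ord]) *
#'                (head(prop_occ[ord], -1) + tail(prop_occ[ord], -1)) / 2)
#'   auc / 0.5
#' }
#' ```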
#'
#' \bold{SHapley Additive exPlanations (SHAP)} uses Shapley values to evaluate variable importance.
#' The larger the absolute Shapley value, the more important the variable is.
#' A positive Shapley value means a positive effect on the prediction, while a negative
#' Shapley value means a negative effect. Please check the references for more details if you are interested.
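#'
#' For completeness, in the notation of the references below, the Shapley value of
#' variable \eqn{i} for one observation is
#' \deqn{\phi_i = \sum_{S \subseteq F \setminus \{i\}}
#' \frac{|S|! (|F| - |S| - 1)!}{|F|!}
#' \left[ f_{S \cup \{i\}}(x_{S \cup \{i\}}) - f_S(x_S) \right]}
#' where \eqn{F} is the set of all variables and \eqn{f_S(x_S)} denotes the expected
#' model output when only the variables in subset \eqn{S} are known. The `fastshap`
#' package approximates these values by Monte Carlo sampling (`shap_nsim` repetitions).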
#'
#' @references
#' \itemize{
#' \item{Peterson,
#' A. Townsend, Monica Papeş, and Jorge Soberón. "Rethinking receiver operating
#' characteristic analysis applications in ecological niche modeling."
#' \emph{Ecological modelling} 213.1 (2008): 63-72.\doi{10.1016/j.ecolmodel.2007.11.008}}
#' \item{Strumbelj, Erik,
#' and Igor Kononenko. "Explaining prediction models and individual predictions
#' with feature contributions." \emph{Knowledge and information systems}
#' 41.3 (2014): 647-665.\doi{10.1007/s10115-013-0679-x}}
#' \item{\href{http://proceedings.mlr.press/v119/sundararajan20b.html}{Sundararajan,
#' Mukund, and Amir Najmi. "The many Shapley values for model explanation."
#' \emph{International Conference on Machine Learning}. PMLR, 2020.}}
#' \item{\url{https://github.com/bgreenwell/fastshap}}
#' \item{\url{https://github.com/slundberg/shap}}
#' }
#'
#' @importFrom dplyr select tibble filter sample_n as_tibble pull
#' @importFrom sf st_as_sf st_drop_geometry
#' @importFrom stars st_as_stars st_xy2sfc st_get_dimension_values st_extract
#' @importFrom isotree isolation.forest
#' @importFrom fastshap explain
#' @importFrom stats cor predict na.omit
#' @importFrom tidyselect all_of
#' @importFrom rlang .data
#' @export
#' @examples
#' \donttest{
#' # Using a pseudo presence-only occurrence dataset of
#' # virtual species provided in this package
#' library(dplyr)
#' library(sf)
#' library(stars)
#' library(itsdm)
#'
#' data("occ_virtual_species")
#' obs_df <- occ_virtual_species %>% filter(usage == "train")
#' eval_df <- occ_virtual_species %>% filter(usage == "eval")
#' x_col <- "x"
#' y_col <- "y"
#' obs_col <- "observation"
#'
#' # Format the observations
#' obs_train_eval <- format_observation(
#' obs_df = obs_df, eval_df = eval_df,
#' x_col = x_col, y_col = y_col, obs_col = obs_col,
#' obs_type = "presence_only")
#'
#' env_vars <- system.file(
#' 'extdata/bioclim_tanzania_10min.tif',
#' package = 'itsdm') %>% read_stars() %>%
#' slice('band', c(1, 5, 12, 16))
#'
#' # With imperfect_presence mode,
#' mod <- isotree_po(
#' obs_mode = "imperfect_presence",
#' obs = obs_train_eval$obs,
#' obs_ind_eval = obs_train_eval$eval,
#' variables = env_vars, ntrees = 10,
#' sample_size = 0.8, ndim = 2L,
#' seed = 123L, nthreads = 1,
#' response = FALSE,
#' spatial_response = FALSE,
#' check_variable = FALSE)
#'
#' var_analysis <- variable_analysis(
#' model = mod$model,
#' pts_occ = mod$observation,
#' pts_occ_test = mod$independent_test,
#' variables = mod$variables)
#' plot(var_analysis)
#' }
#'
variable_analysis <- function(model,
pts_occ,
# Independent test
pts_occ_test = NULL,
variables,
shap_nsim = 100,
visualize = FALSE,
seed = 10) {
# Check inputs
checkmate::assert_data_frame(pts_occ)
checkmate::assert_data_frame(pts_occ_test, null.ok = TRUE)
if (is.null(pts_occ_test)) {
warning(paste0('pts_occ_test is NULL, so it is set to pts_occ. ',
'The results for train and test will be the same.'))
pts_occ_test <- pts_occ
}
checkmate::assert_class(variables, 'stars')
checkmate::assert_int(shap_nsim)
checkmate::assert_logical(visualize)
checkmate::assert_int(seed)
bands <- names(variables)
# Background samples
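## Background points are sampled from the raster cells (one per occurrence or
## test point); they are pooled with the occurrences when computing Pearson
## correlations and reused to balance the SHAP baselines below.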
set.seed(seed)
n_samples <- nrow(pts_occ) + nrow(pts_occ_test)
if_replace <- n_samples > length(variables[[1]])
samples_bg <- variables %>% select(bands[1]) %>%
st_xy2sfc(as_points = TRUE) %>% st_as_sf() %>%
select(.data$geometry) %>%
sample_n(n_samples, replace = if_replace)
# Do full prediction
## Raster
var_pred_full <- predict(variables, model)
## Invert the score so that larger values indicate higher suitability
var_pred_full <- 1 - var_pred_full
## Stretch result to be comparable with other results
full_pred_var <- .stars_stretch(var_pred_full)
# Extract values
full_pred_occ <- st_extract(
x = full_pred_var,
at = pts_occ) %>%
pull(.data$prediction)
full_pred_occ_test <- st_extract(
x = full_pred_var,
at = pts_occ_test) %>%
pull(.data$prediction)
full_pred_bg <- st_extract(
x = full_pred_var,
at = samples_bg) %>%
pull(.data$prediction)
## AUC ratio of the full model (occurrence predictions vs. all raster cells)
full_auc_train <- .auc_ratio(na.omit(full_pred_occ),
na.omit(as.vector(full_pred_var[[1]])))
full_auc_test <- .auc_ratio(na.omit(full_pred_occ_test),
na.omit(as.vector(full_pred_var[[1]])))
# Process
## Extract values
var_occ <- st_extract(x = variables, at = pts_occ) %>%
st_as_sf() %>% na.omit() %>% st_drop_geometry()
var_occ_test <- st_extract(x = variables, at = pts_occ_test) %>%
st_as_sf() %>% na.omit() %>% st_drop_geometry()
## Jackknife: for each variable, fit a model using only that variable and a model excluding it
var_each <- do.call(rbind, lapply(bands, function(nm){
# Model with only this variable
this_var_occ <- var_occ %>% select(all_of(nm))
this_vars <- variables %>% select(all_of(nm))
## Fit model
args_iforest <- c(
list(data=this_var_occ, seed=model$random_seed, nthreads=model$nthreads),
model$params
)
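## With a single predictor the model reduces to a standard (ndim = 1) isolation
## forest, so cap the per-tree column count and replace parameter values that
## are only valid for ndim > 1.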
args_iforest$ndim <- 1
args_iforest$ncols_per_tree <- min(ncol(args_iforest$data),
args_iforest$ncols_per_tree)
if (args_iforest$new_categ_action == "impute") {
args_iforest$new_categ_action <- "weighted"
}
this_model <- do.call(isolation.forest, args_iforest)
## Prediction
## Raster
this_vars_pred <- predict(this_vars, this_model)
## Stretch result to be comparable with other results
this_vars_pred <- 1 - this_vars_pred
this_vars_pred <- .stars_stretch(this_vars_pred)
# Extract values
this_occ_pred <- st_extract(
x = this_vars_pred,
at = pts_occ) %>%
pull(.data$prediction)
this_occ_test_pred <- st_extract(
x = this_vars_pred,
at = pts_occ_test) %>%
pull(.data$prediction)
# Background values
this_pred_bg <- st_extract(
x = this_vars_pred,
at = samples_bg) %>%
pull(.data$prediction)
## Calculate metrics
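## Pearson correlation between the full-model and single-variable predictions,
## pooled over occurrence and background points (training and test separately)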
r_only_train <- cor(c(full_pred_occ, full_pred_bg[1:nrow(pts_occ)]),
c(this_occ_pred, this_pred_bg[1:nrow(pts_occ)]),
use = 'complete.obs')
r_only_test <- cor(c(full_pred_occ_test,
full_pred_bg[(nrow(pts_occ) + 1): n_samples]),
c(this_occ_test_pred,
this_pred_bg[(nrow(pts_occ) + 1): n_samples]),
use = 'complete.obs')
auc_only_train <- .auc_ratio(na.omit(this_occ_pred),
na.omit(as.vector(this_vars_pred[[1]])))
auc_only_test <- .auc_ratio(na.omit(this_occ_test_pred),
na.omit(as.vector(this_vars_pred[[1]])))
# Cleaning up
rm(this_var_occ, this_vars, this_model, this_occ_pred,
this_occ_test_pred, this_vars_pred, this_pred_bg)
# Model with variables except the chosen one
## Subset dataset
nms <- setdiff(bands, nm)
except_var_occ <- var_occ %>% select(all_of(nms))
except_vars <- variables %>% select(all_of(nms))
## Fit model
args_iforest <- c(
list(data=except_var_occ, seed=model$random_seed, nthreads=model$nthreads),
model$params
)
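## Cap ndim and ncols_per_tree by the number of remaining variables, and
## adjust actions that are invalid for the resulting ndim.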
args_iforest$ndim <- min(ncol(args_iforest$data), args_iforest$ndim)
args_iforest$ncols_per_tree <- min(ncol(args_iforest$data), args_iforest$ncols_per_tree)
if (args_iforest$ndim == 1) {
if (args_iforest$new_categ_action == "impute") {
args_iforest$new_categ_action <- "weighted"
}
} else {
if (args_iforest$missing_action == "divide") {
args_iforest$missing_action <- "impute"
}
if (args_iforest$missing_action == "weighted") {
args_iforest$missing_action <- "impute"
}
}
except_model <- do.call(isolation.forest, args_iforest)
## Prediction
## Raster
except_vars_pred <- predict(except_vars, except_model)
## Stretch result to be comparable with other results
except_vars_pred <- 1 - except_vars_pred
except_vars_pred <- .stars_stretch(except_vars_pred)
# Extract values
except_occ_pred <- st_extract(
x = except_vars_pred,
at = pts_occ) %>%
pull(.data$prediction)
except_occ_test_pred <- st_extract(
x = except_vars_pred,
at = pts_occ_test) %>%
pull(.data$prediction)
# Background values
except_pred_bg <- st_extract(
x = except_vars_pred,
at = samples_bg) %>%
pull(.data$prediction)
## Calculate metrics
r_except_train <- cor(c(full_pred_occ, full_pred_bg[1:nrow(pts_occ)]),
c(except_occ_pred, except_pred_bg[1:nrow(pts_occ)]),
use = 'complete.obs')
r_except_test <- cor(c(full_pred_occ_test,
full_pred_bg[(nrow(pts_occ) + 1): n_samples]),
c(except_occ_test_pred,
except_pred_bg[(nrow(pts_occ) + 1): n_samples]),
use = 'complete.obs')
auc_except_train <- .auc_ratio(na.omit(except_occ_pred),
na.omit(as.vector(except_vars_pred[[1]])))
auc_except_test <- .auc_ratio(na.omit(except_occ_test_pred),
na.omit(as.vector(except_vars_pred[[1]])))
# Clean up
rm(nms, except_var_occ, except_vars,
except_model, except_occ_pred, except_occ_test_pred,
except_vars_pred, except_pred_bg)
# Output
tibble(variable = rep(nm, 8),
metrics = c(rep('pearson_correlation', 4), rep('AUC_ratio', 4)),
method = rep(c(rep('Only', 2), rep('Without', 2)), 2),
usage = rep(c('Train', 'Test'), 4),
value = c(r_only_train, r_only_test,
r_except_train, r_except_test,
auc_only_train, auc_only_test,
auc_except_train, auc_except_test))
}))
# NOTE: The permutation feature importance method does not work for isolation forest
# because each split is independent of the rank of the values. Its tree structure is
# fundamentally different from tree structures such as random forest.
# ALTERNATIVE: the SHAP method, which uses Shapley values from game theory.
# SHAP feature importance
## Training
### Add background points so that the average prediction is balanced around 0.5
vars_bg <- st_extract(
x = variables,
at = samples_bg[1:nrow(pts_occ), ]) %>%
st_drop_geometry()
set.seed(seed)
shap_train <- explain(model, X = var_occ, nsim = shap_nsim,
newdata = rbind(var_occ, vars_bg),
pred_wrapper = .pfun_shap)
shap_train <- as.data.frame(shap_train) # For fastshap >= 0.1.0
## Test
### Add background points so that the average prediction is balanced around 0.5
vars_bg <- st_extract(
x = variables,
at = samples_bg[(nrow(pts_occ) + 1): n_samples, ]) %>%
st_drop_geometry()
set.seed(seed)
shap_test <- explain(model, X = var_occ, nsim = shap_nsim,
newdata = rbind(var_occ_test, vars_bg),
pred_wrapper = .pfun_shap)
shap_test <- as.data.frame(shap_test) # For fastshap >= 0.1.0
rm(vars_bg, variables, samples_bg, var_occ, var_occ_test)
# Output
out <- list(variables = bands,
pearson_correlation =
var_each %>%
filter(.data$metrics == 'pearson_correlation') %>%
select(-.data$metrics),
full_AUC_ratio =
tibble(full_auc_train = full_auc_train,
full_auc_test = full_auc_test),
AUC_ratio =
var_each %>%
filter(.data$metrics == 'AUC_ratio') %>%
select(-.data$metrics),
SHAP = list(train = as_tibble(shap_train),
test = as_tibble(shap_test)))
class(out) <- append("VariableAnalysis", class(out))
# Visualize
if (visualize) {
print(plot(out))
}
# Return
return(out)
}
# variable_analysis end