#' @title Function to evaluate relative importance of each variable.
#' @description Evaluate the relative importance of each variable within the model
#' using the following methods:
#' \itemize{
#' \item{Jackknife test based on AUC ratio and Pearson correlation with the
#' result of the model using all variables}
#' \item{SHapley Additive exPlanations (SHAP) based on Shapley values}}
#' @param model (`isolation_forest`) The extended isolation forest SDM. It could be
#' the item `model` of `POIsotree` made by function \code{\link{isotree_po}}.
#' @param pts_occ (`sf`) The `sf` style table that
#' includes training occurrence locations.
#' @param pts_occ_test (`sf`, or `NULL`) The `sf` style
#' table that includes occurrence locations for testing.
#' If `NULL`, it would be set the same as `pts_occ`. The default is `NULL`.
#' @param variables (`stars`) The `stars` of environmental variables. It should have
#' multiple `attributes` instead of `dims`. If you have a `raster` object instead, you
#' could use \code{\link{st_as_stars}} to convert it to `stars`, or use
#' \code{\link{read_stars}} to read the source data directly as a `stars` object.
#' @param shap_nsim (`integer`) The number of Monte Carlo repetitions in the SHAP
#' method used to estimate each Shapley value. See details in the documentation of
#' function \code{\link{explain}} in package `fastshap`. The default is `100L`.
#' @param visualize (`logical`) If `TRUE`, plot the analysis figures.
#' The default is `FALSE`.
#' @param seed (`integer`) The seed for any random process. The default is `10L`.
#' @return (`VariableAnalysis`) A list of
#' \itemize{
#' \item{variables (`vector` of `character`) The names of environmental variables}
#' \item{pearson_correlation (`tibble`) A table of the Jackknife test based on Pearson correlation}
#' \item{full_AUC_ratio (`tibble`) A table of AUC ratios of the training and test datasets using all variables,
#' which acts as the reference for the Jackknife test}
#' \item{AUC_ratio (`tibble`) A table of the Jackknife test based on AUC ratio}
#' \item{SHAP (`list` of `tibble`) Tables of Shapley values for the training and test datasets separately}
#' }
#'
#' @seealso
#' \code{\link{plot.VariableAnalysis}}, \code{\link{print.VariableAnalysis}}
#' \code{\link{explain}} in `fastshap`
#'
#' @details
#' \bold{Jackknife test} of variable importance is reflected as the decrease
#' in model performance when an environmental variable is used singly or is
#' excluded from the environmental variable pool. In this function, we use
#' Pearson correlation and AUC ratio as the performance metrics.
#'
#' \bold{Pearson correlation} here is the correlation between the predictions made by
#' the jackknife model (a single variable used or one variable excluded) and the
#' predictions made by the full model, and serves as the assessment of model performance.
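#'
#' As an illustration of the idea (the vector names are hypothetical), the metric
#' boils down to:
#' \preformatted{
#' # pred_full: full-model predictions at occurrence and background locations
#' # pred_jack: predictions of the jackknife model (a single variable used or
#' #            one variable excluded) at the same locations
#' cor(pred_full, pred_jack, use = "complete.obs")
#' }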
#'
#' The area under the ROC curve (AUC) is a threshold-independent evaluator of
#' model performance, which needs both presence and absence data. A ROC curve is
#' generated by plotting the proportion of correctly predicted presences on the
#' y-axis against 1 minus the proportion of correctly predicted absences on the
#' x-axis for all thresholds. Multiple approaches have been used to evaluate the
#' accuracy of presence-only models. Peterson et al. (2008) modified the AUC by
#' plotting the proportion of presences falling above a range of thresholds
#' against the proportion of cells of the whole area falling above the same
#' thresholds. This is the so-called \bold{AUC ratio} used in this package.
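#'
#' A minimal numeric sketch of this idea follows (it is not the exact internal
#' implementation, and the vector names are hypothetical):
#' \preformatted{
#' # occ_pred: predictions at presence locations
#' # all_pred: predictions over all cells of the study area
#' thresholds <- seq(0, 1, by = 0.01)
#' x <- sapply(thresholds, function(t) mean(all_pred >= t, na.rm = TRUE))
#' y <- sapply(thresholds, function(t) mean(occ_pred >= t, na.rm = TRUE))
#' # Area under the modified curve (trapezoidal rule), then the ratio
#' # against the null expectation of 0.5
#' auc_mod <- sum(-diff(x) * (head(y, -1) + tail(y, -1)) / 2)
#' auc_ratio <- auc_mod / 0.5
#' }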
#'
#' \bold{SHapley Additive exPlanations (SHAP)} uses Shapley values to evaluate variable
#' importance. The larger the absolute Shapley value, the more important the variable is.
#' Positive Shapley values indicate a positive effect, while negative Shapley values
#' indicate a negative effect. Please check the references for more details if you are interested.
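#'
#' As an illustration (the table name is hypothetical), a per-variable importance
#' score can be summarised from a table of Shapley values by averaging their
#' absolute values:
#' \preformatted{
#' # shap_tbl: one column per variable, one row per location
#' sort(colMeans(abs(shap_tbl)), decreasing = TRUE)
#' }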
#'
#' @references
#' \itemize{
#' \item{Peterson, A. Townsend, Monica Papeş, and Jorge Soberón. "Rethinking
#' receiver operating characteristic analysis applications in ecological niche
#' modeling." \emph{Ecological Modelling} 213.1 (2008): 63-72.
#' \doi{10.1016/j.ecolmodel.2007.11.008}}
#' \item{Strumbelj, Erik, and Igor Kononenko. "Explaining prediction models and
#' individual predictions with feature contributions." \emph{Knowledge and
#' Information Systems} 41.3 (2014): 647-665. \doi{10.1007/s10115-013-0679-x}}
#' \item{\href{http://proceedings.mlr.press/v119/sundararajan20b.html}{Sundararajan,
#' Mukund, and Amir Najmi. "The many Shapley values for model explanation."
#' \emph{International Conference on Machine Learning}. PMLR, 2020.}}
#' \item{\url{https://github.com/bgreenwell/fastshap}}
#' \item{\url{https://github.com/slundberg/shap}}
#' }
#'
#' @importFrom dplyr select tibble filter sample_n as_tibble pull
#' @importFrom sf st_as_sf st_drop_geometry
#' @importFrom stars st_as_stars st_xy2sfc st_get_dimension_values st_extract
#' @importFrom isotree isolation.forest
#' @importFrom fastshap explain
#' @importFrom stats cor predict na.omit
#' @importFrom tidyselect all_of
#' @importFrom rlang .data
#' @export
#' @examples
#' \donttest{
#' # Using a pseudo presence-only occurrence dataset of
#' # virtual species provided in this package
#' library(dplyr)
#' library(sf)
#' library(stars)
#' library(itsdm)
#'
#' data("occ_virtual_species")
#' obs_df <- occ_virtual_species %>% filter(usage == "train")
#' eval_df <- occ_virtual_species %>% filter(usage == "eval")
#' x_col <- "x"
#' y_col <- "y"
#' obs_col <- "observation"
#'
#' # Format the observations
#' obs_train_eval <- format_observation(
#' obs_df = obs_df, eval_df = eval_df,
#' x_col = x_col, y_col = y_col, obs_col = obs_col,
#' obs_type = "presence_only")
#'
#' env_vars <- system.file(
#' 'extdata/bioclim_tanzania_10min.tif',
#' package = 'itsdm') %>% read_stars() %>%
#' slice('band', c(1, 5, 12, 16))
#'
#' # With imperfect_presence mode,
#' mod <- isotree_po(
#' obs_mode = "imperfect_presence",
#' obs = obs_train_eval$obs,
#' obs_ind_eval = obs_train_eval$eval,
#' variables = env_vars, ntrees = 10,
#' sample_size = 0.8, ndim = 2L,
#' seed = 123L, nthreads = 1,
#' response = FALSE,
#' spatial_response = FALSE,
#' check_variable = FALSE)
#'
#' var_analysis <- variable_analysis(
#' model = mod$model,
#' pts_occ = mod$observation,
#' pts_occ_test = mod$independent_test,
#' variables = mod$variables)
#' plot(var_analysis)
#' }
#'
variable_analysis <- function(model,
pts_occ,
# Independent test
pts_occ_test = NULL,
variables,
shap_nsim = 100,
visualize = FALSE,
seed = 10) {
# Check inputs
checkmate::assert_data_frame(pts_occ)
checkmate::assert_data_frame(pts_occ_test, null.ok = TRUE)
if (is.null(pts_occ_test)) {
warning(paste0('pts_occ_test is NULL, set it to pts_occ. ',
'The results of train and test would be the same.'))
pts_occ_test <- pts_occ}
checkmate::assert_class(variables, 'stars')
checkmate::assert_int(shap_nsim)
checkmate::assert_logical(visualize)
checkmate::assert_int(seed)
bands <- names(variables)
# Background samples
set.seed(seed)
n_samples <- nrow(pts_occ) + nrow(pts_occ_test)
if_replace <- n_samples > length(variables[[1]])
samples_bg <- variables %>% select(bands[1]) %>%
st_xy2sfc(as_points = TRUE) %>% st_as_sf() %>%
select(.data$geometry) %>%
sample_n(n_samples, replace = if_replace)
# Do full prediction
## Raster
var_pred_full <- predict(variables, model)
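## NOTE: isotree returns anomaly scores in [0, 1], where larger values mean
## stronger outliers; the score is inverted below so that larger values read
## as higher environmental suitability.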
## Stretch result to be comparable with other results
var_pred_full <- 1 - var_pred_full
# Extract values
full_pred_var <- .stars_stretch(var_pred_full)
full_pred_occ <- st_extract(
x = full_pred_var,
at = pts_occ) %>%
pull(.data$prediction)
full_pred_occ_test <- st_extract(
x = full_pred_var,
at = pts_occ_test) %>%
pull(.data$prediction)
full_pred_bg <- st_extract(
x = full_pred_var,
at = samples_bg) %>%
pull(.data$prediction)
## AUC background and ratio
full_auc_train <- .auc_ratio(na.omit(full_pred_occ),
na.omit(as.vector(full_pred_var[[1]])))
full_auc_test <- .auc_ratio(na.omit(full_pred_occ_test),
na.omit(as.vector(full_pred_var[[1]])))
# Process
## Extract values
var_occ <- st_extract(x = variables, at = pts_occ) %>%
st_as_sf() %>% na.omit() %>% st_drop_geometry()
var_occ_test <- st_extract(x = variables, at = pts_occ_test) %>%
st_as_sf() %>% na.omit() %>% st_drop_geometry()
## Start
var_each <- do.call(rbind, lapply(bands, function(nm){
# Model with only this variable
this_var_occ <- var_occ %>% select(all_of(nm))
this_vars <- variables %>% select(all_of(nm))
## Fit model
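## Re-fit an isolation forest on this single variable, reusing the
## hyper-parameters stored in the full model and adjusting those that are
## invalid for a one-column input (ndim, ncols_per_tree, new_categ_action).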
args_iforest <- c(
list(data=this_var_occ, seed=model$random_seed, nthreads=model$nthreads),
model$params
)
args_iforest$ndim <- 1
args_iforest$ncols_per_tree <- min(ncol(args_iforest$data),
args_iforest$ncols_per_tree)
if (args_iforest$new_categ_action == "impute") {
args_iforest$new_categ_action <- "weighted"
}
this_model <- do.call(isolation.forest, args_iforest)
## Prediction
## Raster
this_vars_pred <- predict(this_vars, this_model)
## Stretch result to be comparable with other results
this_vars_pred <- 1 - this_vars_pred
this_vars_pred <- .stars_stretch(this_vars_pred)
# Extract values
this_occ_pred <- st_extract(
x = this_vars_pred,
at = pts_occ) %>%
pull(.data$prediction)
this_occ_test_pred <- st_extract(
x = this_vars_pred,
at = pts_occ_test) %>%
pull(.data$prediction)
# Background values
this_pred_bg <- st_extract(
x = this_vars_pred,
at = samples_bg) %>%
pull(.data$prediction)
## Calculate metrics
r_only_train <- cor(c(full_pred_occ, full_pred_bg[1:nrow(pts_occ)]),
c(this_occ_pred, this_pred_bg[1:nrow(pts_occ)]),
use = 'complete.obs')
r_only_test <- cor(c(full_pred_occ_test,
full_pred_bg[(nrow(pts_occ) + 1): n_samples]),
c(this_occ_test_pred,
this_pred_bg[(nrow(pts_occ) + 1): n_samples]),
use = 'complete.obs')
auc_only_train <- .auc_ratio(na.omit(this_occ_pred),
na.omit(as.vector(this_vars_pred[[1]])))
auc_only_test <- .auc_ratio(na.omit(this_occ_test_pred),
na.omit(as.vector(this_vars_pred[[1]])))
# Cleaning up
rm(this_var_occ, this_vars, this_model, this_occ_pred,
this_occ_test_pred, this_vars_pred, this_pred_bg)
# Model with variables except the chosen one
## Subset dataset
nms <- setdiff(bands, nm)
except_var_occ <- var_occ %>% select(all_of(nms))
except_vars <- variables %>% select(all_of(nms))
## Fit model
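## Re-fit an isolation forest without the chosen variable, capping ndim and
## ncols_per_tree at the reduced number of columns and falling back to
## supported categorical- and missing-value actions.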
args_iforest <- c(
list(data=except_var_occ, seed=model$random_seed, nthreads=model$nthreads),
model$params
)
args_iforest$ndim <- min(ncol(args_iforest$data), args_iforest$ndim)
args_iforest$ncols_per_tree <- min(ncol(args_iforest$data), args_iforest$ncols_per_tree)
if (args_iforest$ndim == 1) {
if (args_iforest$new_categ_action == "impute") {
args_iforest$new_categ_action <- "weighted"
}
} else {
if (args_iforest$missing_action == "divide") {
args_iforest$missing_action <- "impute"
}
if (args_iforest$missing_action == "weighted") {
args_iforest$missing_action <- "impute"
}
}
except_model <- do.call(isolation.forest, args_iforest)
## Prediction
## Raster
except_vars_pred <- predict(except_vars, except_model)
## Stretch result to be comparable with other results
except_vars_pred <- 1 - except_vars_pred
except_vars_pred <- .stars_stretch(except_vars_pred)
# Extract values
except_occ_pred <- st_extract(
x = except_vars_pred,
at = pts_occ) %>%
pull(.data$prediction)
except_occ_test_pred <- st_extract(
x = except_vars_pred,
at = pts_occ_test) %>%
pull(.data$prediction)
# Background values
except_pred_bg <- st_extract(
x = except_vars_pred,
at = samples_bg) %>%
pull(.data$prediction)
## Calculate metrics
r_except_train <- cor(c(full_pred_occ, full_pred_bg[1:nrow(pts_occ)]),
c(except_occ_pred, except_pred_bg[1:nrow(pts_occ)]),
use = 'complete.obs')
r_except_test <- cor(c(full_pred_occ_test,
full_pred_bg[(nrow(pts_occ) + 1): n_samples]),
c(except_occ_test_pred,
except_pred_bg[(nrow(pts_occ) + 1): n_samples]),
use = 'complete.obs')
auc_except_train <- .auc_ratio(na.omit(except_occ_pred),
na.omit(as.vector(except_vars_pred[[1]])))
auc_except_test <- .auc_ratio(na.omit(except_occ_test_pred),
na.omit(as.vector(except_vars_pred[[1]])))
# Clean up
rm(nms, except_var_occ, except_vars,
except_model, except_occ_pred, except_occ_test_pred,
except_vars_pred, except_pred_bg)
# Output
tibble(variable = rep(nm, 8),
metrics = c(rep('pearson_correlation', 4), rep('AUC_ratio', 4)),
method = rep(c(rep('Only', 2), rep('Without', 2)), 2),
usage = rep(c('Train', 'Test'), 4),
value = c(r_only_train, r_only_test,
r_except_train, r_except_test,
auc_only_train, auc_only_test,
auc_except_train, auc_except_test))
}))
# NOTE: The permutation feature importance method does not work for isolation
# forest because each split is independent of the rank of values. Its tree
# structure is fundamentally different from that of tree ensembles such as
# random forest.
# ALTERNATIVE: the SHAP method, which uses Shapley values from game theory.
# SHAP feature importance
## Training
### Add background points so that the average prediction value is around 0.5
vars_bg <- st_extract(
x = variables,
at = samples_bg[1:nrow(pts_occ), ]) %>%
st_drop_geometry()
set.seed(seed)
shap_train <- explain(model, X = var_occ, nsim = shap_nsim,
newdata = rbind(var_occ, vars_bg),
pred_wrapper = .pfun_shap)
shap_train <- as.data.frame(shap_train) # For fastshap >= 0.1.0
## Test
### Add background points so that the average prediction value is around 0.5
vars_bg <- st_extract(
x = variables,
at = samples_bg[(nrow(pts_occ) + 1): n_samples, ]) %>%
st_drop_geometry()
set.seed(seed)
shap_test <- explain(model, X = var_occ, nsim = shap_nsim,
newdata = rbind(var_occ_test, vars_bg),
pred_wrapper = .pfun_shap)
shap_test <- as.data.frame(shap_test) # For fastshap >= 0.1.0
rm(vars_bg, variables, samples_bg, var_occ, var_occ_test)
# Output
out <- list(variables = bands,
pearson_correlation =
var_each %>%
filter(.data$metrics == 'pearson_correlation') %>%
select(-.data$metrics),
full_AUC_ratio =
tibble(full_auc_train = full_auc_train,
full_auc_test = full_auc_test),
AUC_ratio =
var_each %>%
filter(.data$metrics == 'AUC_ratio') %>%
select(-.data$metrics),
SHAP = list(train = as_tibble(shap_train),
test = as_tibble(shap_test)))
class(out) <- append("VariableAnalysis", class(out))
# Visualize
if (visualize) {
print(plot(out))
}
# Return
return(out)
}
# variable_analysis end
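
# For orientation, a minimal sketch of the kind of prediction wrapper that
# `pred_wrapper = .pfun_shap` above expects. This is not the package's actual
# unexported helper; it only illustrates the contract required by
# fastshap::explain(): take the fitted model and a data.frame `newdata`, and
# return a numeric vector on the same (inverted) scale used elsewhere in this
# function. The name is hypothetical.
.pfun_shap_sketch <- function(object, newdata) {
# isotree anomaly scores: higher means more anomalous; invert to suitability
1 - predict(object, newdata = newdata, type = "score")
}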