R/marginal_response.R
In itsdm: Isolation Forest-Based Presence-Only Species Distribution Modeling

Documented in marginal_response

#' @title Calculate marginal responses of each variables.
#' @description Calculate the marginal responses of each variables within the model.
#' @param model (Any predictive model). In this package, it is `isolation_forest`.
#' It could be the item `model` of `POIsotree` made by function \code{\link{isotree_po}}.
#' @param var_occ (`data.frame`, `tibble`) The `data.frame` style table that
#' include values of environmental variables at occurrence locations.
#' @param variables (`stars`) The `stars` of environmental variables. It should have
#' multiple `attributes` instead of `dims`. If you have `raster` object instead, you
#' could use \code{\link{st_as_stars}} to convert it to `stars` or use
#' \code{\link{read_stars}} directly read source data as a `stars`. You also could
#' use item `variables` of `POIsotree` made by function \code{\link{isotree_po}}.
#' @param si (`integer`) The number of samples to generate response curves.
#' If it is too small, the response curves might be biased.
#' The default value is `1000`.
#' @param visualize (`logical`) if `TRUE`, plot the response curves.
#' The default is `FALSE`.
#' @return (`MarginalResponse`) A nested list of
#' \itemize{
#' \item{responses_cont (`list`) A list of response values of continuous variables}
#' \item{responses_cat (`list`) A list of response values of categorical variables}
#' }
#'
#' @seealso
#' \code{\link{plot.MarginalResponse}}
#'
#' @details
#' The values show how each environmental variable affects the modeling
#' prediction. They show how the predicted result changes as each environmental
#' variable is varied while keeping all other environmental variables at average
#' sample value. They might be hard to interpret if there are strongly correlated
#' variables. The users could use \code{\link{dim_reduce}} function to remove
#' the strong correlation from original environmental variable stack.
#'
#' @references
#' \itemize{
#' \item{Elith, Jane,
#' et al. "The evaluation strip: a new and robust method for plotting predicted
#' responses from species distribution models." \emph{Ecological modelling}
#' 186.3 (2005): 280-289.\doi{10.1016/j.ecolmodel.2004.12.007}}
#' }
#'
#' @importFrom dplyr select slice as_tibble pull n %>%
#' @importFrom stars st_as_stars
#' @importFrom stats predict setNames
#' @importFrom rlang :=
#' @export
#' @examples
#' # Using a pseudo presence-only occurrence dataset of
#' # virtual species provided in this package
#' library(dplyr)
#' library(sf)
#' library(stars)
#' library(itsdm)
#'
#' # Prepare data
#' data("occ_virtual_species")
#' obs_df <- occ_virtual_species %>% filter(usage == "train")
#' eval_df <- occ_virtual_species %>% filter(usage == "eval")
#' x_col <- "x"
#' y_col <- "y"
#' obs_col <- "observation"
#'
#' # Format the observations
#' obs_train_eval <- format_observation(
#'   obs_df = obs_df, eval_df = eval_df,
#'   x_col = x_col, y_col = y_col, obs_col = obs_col,
#'   obs_type = "presence_only")
#'
#' env_vars <- system.file(
#'   'extdata/bioclim_tanzania_10min.tif',
#'   package = 'itsdm') %>% read_stars() %>%
#'   slice('band', c(1, 5, 12, 16))
#'
#' # With imperfect_presence mode,
#' mod <- isotree_po(
#'   obs_mode = "imperfect_presence",
#'   obs = obs_train_eval$obs,
#'   obs_ind_eval = obs_train_eval$eval,
#'   variables = env_vars, ntrees = 10,
#'   sample_size = 0.8, ndim = 2L,
#'   seed = 123L, nthreads = 1,
#'   response = FALSE,
#'   spatial_response = FALSE,
#'   check_variable = FALSE)
#'
#' marginal_responses <- marginal_response(
#'   model = mod$model,
#'   var_occ = mod$vars_train,
#'   variables = mod$variables)
#' plot(marginal_responses)
#' #'
marginal_response <- function(model,
                              var_occ,
                              variables,
                              si = 1000,
                              visualize = FALSE) {

  # Check inputs
  checkmate::assert_data_frame(var_occ)
  checkmate::assert_int(si)
  checkmate::assert_class(variables, 'stars')
  stopifnot(length(dim(variables)) <= 2)
  checkmate::assert_logical(visualize)
  bands <- names(variables)
  stopifnot(all(bands %in% colnames(var_occ)))

  # Reformat data
  ## Split numeric and categorical
  isfacor <- as.vector(sapply(variables, is.factor))
  bands_cont <- bands[!isfacor]
  bands_cat <- bands[isfacor]

  # Do full prediction
  ## Raster
  var_pred_full <- predict(variables, model)
  ## Stretch result to be comparable with other results
  var_pred_full <- 1 - var_pred_full

  # Numeric variables
  ## Not limited to data volume, could generate as many as possible
  ## pseudo observations in [min, max], so the function could make
  ## smoother curves.
  if (length(bands_cont) > 0) {
    mins <- sapply(bands_cont, function(nm) {
      min(variables %>% pull(nm), na.rm = T)})
    maxs <- sapply(bands_cont, function(nm) {
      max(variables %>% pull(nm), na.rm = T)})
    vals_cont <- do.call(
      cbind, lapply(1:length(mins), function(m) {
      seq(from = mins[m], to = maxs[m],
          length.out = si)})) %>%
      data.frame() %>%
      setNames(bands_cont)

    # Get means and expand
    means <- lapply(bands, function(nm) {
      if (nm %in% bands_cont) {
        mean(var_occ %>% pull(nm), na.rm = T)
      } else {
        .mode(var_occ %>% pull(nm))
      }}) %>%
      setNames(bands) %>%
      as_tibble() %>%
      slice(rep(1:n(), each = nrow(vals_cont)))
    responses_cont <- lapply(names(vals_cont), function(nm) {
      vals_tmp <- means
      vals_tmp <- vals_tmp %>%
        mutate('{nm}' := vals_cont %>% pull(nm))
      pred_tmp <- predict(model, vals_tmp)
      pred_tmp <- 1 - pred_tmp
      pred_tmp <- .stars_stretch(var_pred_full, new_values = pred_tmp)
      data.frame(y = pred_tmp,
                 x = vals_cont %>% pull(nm)) %>%
        setNames(c("y", "x"))
    })
    names(responses_cont) <- bands_cont
    rm(mins, maxs, vals_cont, means)
  } else responses_cont <- NULL

  # Categorical variables
  ## The number of pseudo observations is limited to factor levels
  ## So categorical variables should generate one by one
  if (length(bands_cat) > 0) {
    responses_cat <- lapply(bands_cat, function(nm) {
      vals_this <- var_occ %>% pull(nm) %>%
        levels() %>% as.factor()

      means_this <- lapply(bands, function(nm) {
        if (nm %in% bands_cont) {
          mean(var_occ %>% pull(nm), na.rm = T)
        } else {
          .mode(var_occ %>% pull(nm))
        }}) %>%
        setNames(bands) %>%
        as_tibble() %>%
        slice(rep(1:n(), each = length(vals_this)))

      means_this <- means_this %>%
        mutate('{nm}' := vals_this)
      pred_tmp <- predict(model, means_this)
      pred_tmp <- 1 - pred_tmp
      pred_tmp <- .stars_stretch(var_pred_full, new_values = pred_tmp)
      data.frame(y = pred_tmp,
                 x = vals_this) %>%
        setNames(c("y", "x"))
    }) %>% setNames(bands_cat)
  } else responses_cat <- NULL

  # Put together
  responses <- list(responses_cont = responses_cont,
                    responses_cat = responses_cat)
  class(responses) <- append("MarginalResponse", class(responses))

  # Visualize
  if (visualize) {
    print(plot(responses))
  }

  # Return
  return(responses)
}

# marginal_response end