R/explore_partition_extrapolation.R
In kuenm2: Detailed Development of Ecological Niche Models

Documented in explore_partition_extrapolation

#' Analysis of extrapolation risks in partitions using the MOP metric
#'
#' @description
#' This function calculates environmental dissimilarities and identifies
#' non-analogous conditions by comparing the training data against the test data
#' for each partition, using the MOP (Mobility-Oriented Parity) metric.
#'
#' @usage explore_partition_extrapolation(data, include_train_background = TRUE,
#'                                        include_test_background = FALSE,
#'                                        variables = NULL,
#'                                        mop_type = "detailed",
#'                                        calculate_distance = TRUE,
#'                                        where_distance = "all",
#'                                        progress_bar = FALSE, ...)
#'
#' @param data an object of class `prepared_data` returned by the
#' `prepare_data()` function.
#' @param include_train_background (logical) whether to include the background
#' points used in training to define the environmental range of the training
#' data. If set to FALSE, only the environmental conditions of the training
#' presence records will be considered. Default is TRUE, meaning both presence
#' and background points are used.
#' @param include_test_background (logical) whether to compute MOP for both the
#' test presence records and the background points not used during training.
#' Default is FALSE, meaning MOP will be calculated only for the test presences.
#' @param variables (character) names of the variables to be used in the MOP
#' calculation. Default is NULL, meaning all variables in `data` will be used.
#' @param mop_type (character) type of MOP analysis to be performed. Options
#' available are "basic", "simple" and "detailed". Default is 'simples'. See
#' \code{\link{projection_mop}}() for more details.
#' @param calculate_distance (logical) whether to calculate distances
#' (dissimilarities) between train and test data. Default is TRUE.
#' @param where_distance (character) specifies which values in train data should
#' be used to calculate distances. Options are: "in_range" (only conditions
#' within the train range), "out_range" (only conditions outside the
#' train range), and "all" (all conditions). Default is "all".
#' @param progress_bar (logical) whether to display a progress bar during
#' processing. Default is FALSE.
#' @param ... additional arguments passed to \code{\link[mop]{mop}()}.
#'
#' @return
#' A `data.frame` containing:
#' - MOP distances (if `calculate_distance = TRUE`);
#' - an indicator of whether environmental conditions at each test record fall
#' within the training range;
#' - the number of variables outside the training range;
#' - the names of variables with values lower or higher than the training range;
#' - if the `prepared_data` object includes categorical variables, it will also
#' contain columns indicating which values in the testing data were not present
#' in the training data.
#'
#' @export
#' @importFrom terra vect rasterize mask coltab
#' @importFrom mop mop
#' @importFrom stats setNames
#' @examples
#' #Prepare data
#' # Import occurrences
#' data(occ_data, package = "kuenm2")
#'
#' # Import raster layers
#' var <- terra::rast(system.file("extdata", "Current_variables.tif",
#'                                package = "kuenm2"))
#'
#' # Prepare data for maxnet model
#' sp_swd <- prepare_data(algorithm = "maxnet", occ = occ_data,
#'                        x = "x", y = "y",
#'                        raster_variables = var,
#'                        species = occ_data[1, 1],
#'                        n_background = 100,
#'                        categorical_variables = "SoilType",
#'                        features = c("l", "lq"),
#'                        r_multiplier = 1,
#'                        partition_method = "kfolds")
#'
#' # Analysis of extrapolation risks in partitions
#' res <- explore_partition_extrapolation(data = sp_swd)
#'
explore_partition_extrapolation <- function(data,
                                            include_train_background = TRUE,
                                            include_test_background = FALSE,
                                            variables = NULL,
                                            mop_type = "detailed",
                                            calculate_distance = TRUE,
                                            where_distance = "all",
                                            progress_bar = FALSE, ...){
  #Check data
  if (missing(data)) {
    stop("Argument 'data' must be defined.")
  }
  if (!inherits(data, "prepared_data")) {
    stop("'data' must be a 'prepared_data' object.")
  }
  if (!inherits(include_train_background, "logical")) {
    stop("'include_train_background' must be 'logical'.")
  }
  if (!inherits(include_test_background, "logical")) {
    stop("'include_test_background' must be 'logical'.")
  }
  if (!is.null(variables)) {
    if(!inherits(variables, "logical")) {
    stop("'variables' must be 'character' or NULL.")}
    var_out <- setdiff(variables, colnames(data$calibration_data))
    if(length(var_out) > 0){
      stop("Some 'variables' provided are absent from data")
    }
  }

  mop_type_out <- setdiff(mop_type, c("basic", "simple", "detailed"))
  if (length(mop_type_out) > 0) {
    stop("Invalid 'mop_type' provided.",
         "\nAvailable options are: 'basic', 'simple', or 'detailed'.")
  }

  if(!inherits(calculate_distance, "logical")){
    stop("Argument 'calculate_distance' must be 'logical'.")
  }

  distance_out <- setdiff(where_distance, c("in_range", "out_range", "all"))
  if (length(distance_out) > 0) {
    stop("Invalid 'where_distance' provided.",
         "\nAvailable options are: 'in_range', 'out_range', and 'all'.")
  }


  if(!inherits(progress_bar, "logical")){
    stop("Argument 'progress_bar' must be 'logical'.")
  }

  #Remove categorical if necessary
  if(!is.null(data$categorical_variables)){
    v <- setdiff(colnames(data$calibration_data), c("pr_bg",
                                                    data$categorical_variables))
  } else {
    v <- setdiff(colnames(data$calibration_data),
                 data$categorical_variables)
  }

  #Remove variables, if necessary
  if(!is.null(variables)){
    v <- intersect(variables, v)
  }

  #Create list to save partition results
  res <- list()

  #Get number of partitions
  n_partitions <- length(data$part_data)

  if (progress_bar) {
    pb <- txtProgressBar(min = 0, max = n_partitions, style = 3)
    progress <- function(n) {
      setTxtProgressBar(pb, n)
    }
    opts <- list(progress = progress)
  } else {
    opts <- NULL
  }

  for(i in 1:n_partitions){
    #Get partition name
    i_name <- names(data$part_data[i])

    #Get test and train
    partition_i <- data$part_data[[i_name]]
    test_i <- data$calibration_data[partition_i, ]
    train_i <- data$calibration_data[-partition_i, ]

    #Exclude background, if necessary
    if(!include_train_background){
      train_i <- train_i[train_i$pr_bg == 1, ]
    }
    if(!include_test_background){
      test_i <- test_i[test_i$pr_bg == 1, ]
    }

    #Run analysis
    res_i <- mop_with_records(train_data = train_i, test_data = test_i,
                              variables = v,
                              categorical_variables = data$categorical_variables,
                              mop_type = mop_type,
                              calculate_distance = calculate_distance,
                              where_distance = where_distance,
                              progress_bar = FALSE, ...)

    res[[i_name]][["test_data"]] <- res_i$mop_records
    #Append pr_bg information
    if(include_test_background){
      res[[i_name]][["test_data"]]$pr_bg <- data$calibration_data[partition_i,
                                                                  "pr_bg"]
    } else {
      res[[i_name]][["test_data"]]$pr_bg <- 1
    }


    #Append xy information
    if(!is.null(data$data_xy)){
      if(!include_test_background){
        xy <- cbind(data$data_xy[partition_i, ],
                    "pr_bg" = data$calibration_data[partition_i, "pr_bg"])
        xy <- xy[xy$pr_bg == 1, c("x", "y")]
      } else {
        xy <- data$data_xy[partition_i, ]
      }

    res[[i_name]][["test_data"]]<- cbind(xy,
                                         res[[i_name]][["test_data"]])}

    # #Append index, for test
    # res[[i_name]][["test_data"]]$index <- partition_i

    # Sets the progress bar to the current state
    if (progress_bar) setTxtProgressBar(pb, i)

  } #End of for i

  #Summarize data
  all_res <- lapply(names(res), function(i) {
    cbind("Partition" = i, res[[i]]$test_data)
  })
  all_res <- do.call("rbind", all_res)

  #Organize columns
  c_order <- intersect(c("Partition", "pr_bg", "x", "y"), colnames(all_res))
  c_order <- c(c_order, setdiff(colnames(all_res), c_order))
  all_res <- all_res[, c_order]

  #Reorganize calibration data
  partition_order <- unlist(data$part_data)

  calib_data <- data$calibration_data[partition_order, ]


  #Return final results
  res_final <- list("Mop_results" = all_res,
                    "calibration_data" = calib_data,
                    "categorical_variables" = data$categorical_variables,
                    "pca" = data$pca)
  class(res_final) <- "explore_partition"
  return(res_final)
  } #End of function