spatialRF: Easy Spatial Modeling with Random Forest

Documented in rank_spatial_predictors

#' @title Ranks spatial predictors
#' @description Ranks spatial predictors generated by [mem_multithreshold()] or [pca_multithreshold()] by their effect in reducing the Moran's I of the model residuals (`ranking.method = "effect"`), or by their own Moran's I (`ranking.method = "moran"`).
#'
#' In the former case, one model of the type `y ~ predictors + spatial_predictor_X` is fitted per spatial predictor, and the Moran's I of this model's residuals is compared with the one of the model without spatial predictors (`y ~ predictors`), to finally rank the spatial predictor from maximum to minimum difference in Moran's I.
#'
#' In the latter case, the spatial predictors are ordered by their Moran's I alone (this is the faster option).
#'
#' In both cases, spatial predictors that are redundant with others at a Pearson correlation > 0.5 and spatial predictors with no effect (no reduction of Moran's I  or Moran's I of the spatial predictor equal or lower than 0) are removed.
#'
#' This function has been designed to be used internally by [rf_spatial()] rather than directly by a user.
#' @param data Data frame with a response variable and a set of predictors. Default: `NULL`
#' @param dependent.variable.name Character string with the name of the response variable. Must be in the column names of `data`. Default: `NULL`
#' @param predictor.variable.names Character vector with the names of the predictive variables. Every element of this vector must be in the column names of `data`. Default: `NULL`
#' @param reference.moran.i Moran's I of the residuals of the model without spatial predictors. Default: `1`
#' @param distance.matrix Squared matrix with the distances among the records in `data`. The number of rows of `distance.matrix` and `data` must be the same. If not provided, the computation of the Moran's I of the residuals is omitted. Default: `NULL`
#' @param distance.thresholds Numeric vector with neighborhood distances. All distances in the distance matrix below each value in `dustance.thresholds` are set to 0 for the computation of Moran's I. If `NULL`, it defaults to seq(0, max(distance.matrix), length.out = 4). Default: `NULL`
#' @param ranger.arguments List with \link[ranger]{ranger} arguments. See [rf] or [rf_repeat] for further details.
#' @param spatial.predictors.df Data frame of spatial predictors.
#' @param ranking.method Character, method used by to rank spatial predictors. The method "effect" ranks spatial predictors according how much each predictor reduces Moran's I of the model residuals, while the method "moran" ranks them by their own Moran's I. Default: `"moran"`.
#' @param verbose Logical, ff `TRUE`, messages and plots generated during the execution of the function are displayed, Default: `TRUE`
#' @param n.cores Integer, number of cores to use for parallel execution. Creates a socket cluster with `parallel::makeCluster()`, runs operations in parallel with `foreach` and `%dopar%`, and stops the cluster with `parallel::clusterStop()` when the job is done. Default: `parallel::detectCores() - 1`
#' @param cluster A cluster definition generated with `parallel::makeCluster()`. If provided, overrides `n.cores`. When `cluster = NULL` (default value), and `model` is provided, the cluster in `model`, if any, is used instead. If this cluster is `NULL`, then the function uses `n.cores` instead. The function does not stop a provided cluster, so it should be stopped with `parallel::stopCluster()` afterwards. The cluster definition is stored in the output list under the name "cluster" so it can be passed to other functions via the `model` argument, or using the `%>%` pipe. Default: `NULL`
#' @return A list with four slots:
#' \itemize{
#' \item `method`: Character, name of the method used to rank the spatial predictors.
#'  \item `criteria`: Data frame with two different configurations depending on the ranking method. If `ranking.method = "effect"`, the columns contain the names of the spatial predictors, the r-squared of the model, the Moran's I of the model residuals, the difference between the Moran's I of the model including the given spatial predictor, and the Moran's I of the model fitted without spatial predictors, and the interpretation of the Moran's I value. If `ranking.method = "moran"`, only the name of the spatial predictor and it's Moran's I are in the output data frame.
#'  \item `ranking`: Ordered character vector with the names of the spatial predictors selected.
#'  \item `spatial.predictors.df`: data frame with the selected spatial predictors in the order of the ranking.
#' }
#' @examples
#' if(interactive()){
#'
#'  #loading distance matrix
#'  data(distance_matrix)
#'
#'  #computing Moran's Eigenvector Maps
#'  mem.df <- mem(
#'   distance.matrix = distance_matrix[1:50, 1:50],
#'   distance.threshold = 0
#'  )
#'
#'  #ranking by the Moran's I of the spatial predictor
#'  rank <- rank_spatial_predictors(
#'   distance.matrix = distance_matrix[1:50, 1:50],
#'   distance.thresholds = 0,
#'   spatial.predictors.df = mem.df,
#'   ranking.method = "moran",
#'   n.cores = 1
#'  )
#'
#'  #checking Moran's I of MEMs
#'  rank$criteria
#'
#'  #checking rank of MEMs
#'  rank$ranking
#' }
#' @rdname rank_spatial_predictors
#' @export
rank_spatial_predictors <- function(
  data = NULL,
  dependent.variable.name = NULL,
  predictor.variable.names = NULL,
  distance.matrix = NULL,
  distance.thresholds = NULL,
  ranger.arguments = NULL,
  spatial.predictors.df = NULL,
  ranking.method = c("moran", "effect"),
  reference.moran.i = 1,
  verbose = FALSE,
  n.cores = parallel::detectCores() - 1,
  cluster = NULL
){

  #predictor.variable.names comes from auto_vif or auto_cor
  if(!is.null(predictor.variable.names)){
    if(inherits(predictor.variable.names, "variable_selection")){
      predictor.variable.names <- predictor.variable.names$selected.variables
    }
  }

  #testing method argument
  ranking.method <- match.arg(
    arg = ranking.method,
    choices = c("moran", "effect"),
    several.ok = FALSE
    )

  #add write.forest = FALSE to ranger.arguments
  if(is.null(ranger.arguments)){
    ranger.arguments <- list()
  }
  ranger.arguments$write.forest <- TRUE
  ranger.arguments$importance <- "none"
  ranger.arguments$local.importance <- FALSE
  ranger.arguments$keep.inbag <- FALSE
  ranger.arguments$num.trees <- 500

  #reference.moran.i
  if(is.null(reference.moran.i)){reference.moran.i <- 1}


  #CLUSTER SETUP
  #cluster is provided
  if(!is.null(cluster)){

    #n.cores <- NULL
    n.cores <- NULL

    #flat to not stop cluster after execution
    stop.cluster <- FALSE

  } else {

    #creates and registers cluster
    cluster <- parallel::makeCluster(
      n.cores,
      type = "PSOCK"
    )

    #registering cluster
    doParallel::registerDoParallel(cl = cluster)

    #flag to stop cluster
    stop.cluster <- TRUE

  }

  #parallelized loop
  spatial.predictors.i <- NULL
  spatial.predictors.order <- foreach::foreach(
    spatial.predictors.i = seq(1, ncol(spatial.predictors.df)),
    .combine = "rbind",
    .verbose = verbose
    ) %dopar% {

    #3.2.3.1 preparing data

    #spatial predictor name
    spatial.predictors.name.i <- colnames(spatial.predictors.df)[spatial.predictors.i]

    #computing reduction in Moran's I
    if(ranking.method == "effect"){

      #training data
      data.i <- data.frame(
        data,
        spatial.predictors.df[, spatial.predictors.i]
      )
      colnames(data.i)[ncol(data.i)] <- spatial.predictors.name.i

      #new predictor.variable.names
      predictor.variable.names.i <- c(predictor.variable.names, spatial.predictors.name.i)

      #fitting model I
      m.i <- spatialRF::rf(
        data = data.i,
        dependent.variable.name = dependent.variable.name,
        predictor.variable.names = predictor.variable.names.i,
        seed = spatial.predictors.i,
        distance.matrix = distance.matrix,
        distance.thresholds = distance.thresholds,
        scaled.importance = FALSE,
        ranger.arguments = ranger.arguments,
        verbose = FALSE
      )

      #out.df
      out.i <- data.frame(
        spatial.predictors.name = spatial.predictors.name.i,
        model.r.squared = m.i$performance$r.squared.oob,
        moran.i = m.i$residuals$autocorrelation$max.moran,
        p.value = m.i$residuals$autocorrelation$per.distance$p.value[1],
        ranking.criteria = reference.moran.i - m.i$residuals$autocorrelation$max.moran
      )

    }

    #computing Moran's I of the spatial predictors
    if(ranking.method == "moran"){

      #moran's I of spatial predictor
      m.i <- spatialRF::moran(
        x = spatial.predictors.df[, spatial.predictors.i],
        distance.matrix = distance.matrix,
        distance.threshold = distance.thresholds[1],
        verbose = FALSE
      )

      #out.df
      out.i <- data.frame(
        spatial.predictors.name = spatial.predictors.name.i,
        ranking.criteria = m.i$test$moran.i,
        interpretation = m.i$test$interpretation
      )

    }

    #returning output
    return(out.i)

  } #end of parallelized loop

  #variables to avoid check complaints
  ranking.criteria <- NULL
  interpretation <- NULL

  #getting only spatial predictors with positive spatial correlation
  if(ranking.method == "moran"){
    spatial.predictors.order <- dplyr::filter(
      spatial.predictors.order,
      interpretation == "Positive spatial correlation"
    )
  }

  #arranging by Moran's I
  spatial.predictors.order <- dplyr::arrange(
    spatial.predictors.order,
    dplyr::desc(ranking.criteria)
    )

  #return output
  out.list <- list()
  out.list$method <- ranking.method
  out.list$criteria <- spatial.predictors.order
  out.list$ranking <- spatial.predictors.order$spatial.predictors.name
  out.list$spatial.predictors.df <- spatial.predictors.df[, spatial.predictors.order$spatial.predictors.name]

  #stopping cluster
  if(stop.cluster == TRUE){
    parallel::stopCluster(cl = cluster)
  }

  #returning output list
  out.list

}