#' @title Tuning of random forest hyperparameters via spatial cross-validation
#' @description Finds the optimal set of random forest hyperparameters `num.trees`, `mtry`, and `min.node.size` via grid search, by maximizing the model's R squared (or AUC when the response variable is binomial) as estimated through spatial cross-validation with [rf_evaluate()].
#' @param model A model fitted with [rf()]. The training data and ranger arguments are taken directly from `model$ranger.arguments`. The function stops with an error if this argument is `NULL`. Default: `NULL`
#' @param num.trees Numeric integer vector with candidate numbers of trees to fit on each model. Default: `c(500, 1000, 2000)`.
#' @param mtry Numeric integer vector with candidate numbers of predictors to randomly select from the complete pool of predictors on each tree split. Default: `as.integer(seq(1, length(predictor.variable.names) - 1, length.out = 4))`
#' @param min.node.size Numeric integer vector with candidate values for the minimal number of cases in a terminal node. Default: `c(5, 10, 20, 40)`
#' @param xy Data frame or matrix with two columns named "x" and "y" containing the case coordinates. If `NULL`, the function attempts to retrieve the coordinates from `model$ranger.arguments$xy`, and stops with an error if they are not found there either. Default: `NULL`
#' @param repetitions Integer, number of independent spatial folds to use during the cross-validation. Default: `30`.
#' @param training.fraction Proportion between 0.2 and 0.9 indicating the fraction of records to be used for model training. Default: `0.75`
#' @param seed Integer, random seed to facilitate reproducibility. If set to a given number, the results of the function are always the same. Default: `1`.
#' @param verbose Logical. If `TRUE`, messages and plots generated during the execution of the function are displayed. Default: `TRUE`
#' @param n.cores Integer, number of cores to use for parallel execution. Creates a socket cluster with `parallel::makeCluster()`, runs operations in parallel with `foreach` and `%dopar%`, and stops the cluster with `parallel::stopCluster()` when the job is done. Default: `parallel::detectCores() - 1`
#' @param cluster A cluster definition generated with `parallel::makeCluster()`. If provided, overrides `n.cores`. When `cluster = NULL` (default value), and `model` is provided, the cluster in `model`, if any, is used instead. If this cluster is `NULL`, then the function uses `n.cores` instead. The function does not stop a provided cluster, so it should be stopped with `parallel::stopCluster()` afterwards. The cluster definition is stored in the output list under the name "cluster" so it can be passed to other functions via the `model` argument, or using the `%>%` pipe. Default: `NULL`
#' @return A model with a new slot named `tuning`, containing a data frame with the results of the tuning analysis.
#' @seealso [rf_evaluate()]
#' @examples
#' if(interactive()){
#'
#' #loading example data
#' data(plant_richness_df)
#' data(distance_matrix)
#'
#' #fitting model to tune
#' out <- rf(
#'   data = plant_richness_df,
#'   dependent.variable.name = "richness_species_vascular",
#'   predictor.variable.names = colnames(plant_richness_df)[5:21],
#'   distance.matrix = distance_matrix,
#'   distance.thresholds = 0,
#'   n.cores = 1
#' )
#'
#' #model tuning
#' tuning <- rf_tuning(
#'   model = out,
#'   num.trees = c(100, 500),
#'   mtry = c(2, 8),
#'   min.node.size = c(5, 10),
#'   xy = plant_richness_df[, c("x", "y")],
#'   n.cores = 1
#' )
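#'
#' #inspecting the tuning results stored in the model's "tuning" slot
#' tuning$tuning$tuning.df
#' plot_tuning(tuning)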
#'
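#' #sketch: passing a user-defined cluster instead of 'n.cores'
#' #(not run; 'my.cluster' is a hypothetical name, and a user-provided
#' #cluster must be stopped manually afterwards)
#' #my.cluster <- parallel::makeCluster(2, type = "PSOCK")
#' #tuning <- rf_tuning(
#' #  model = out,
#' #  xy = plant_richness_df[, c("x", "y")],
#' #  cluster = my.cluster
#' #)
#' #parallel::stopCluster(my.cluster)
#'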
#' }
#' @importFrom rlang sym
#' @rdname rf_tuning
#' @export
rf_tuning <- function(
model = NULL,
num.trees = NULL,
mtry = NULL,
min.node.size = NULL,
xy = NULL,
repetitions = 30,
training.fraction = 0.75,
seed = 1,
verbose = TRUE,
n.cores = parallel::detectCores() - 1,
cluster = NULL
){
#declaring variables to avoid R CMD check notes about undefined globals
metric <- NULL
num.trees.i <- NULL
mtry.i <- NULL
min.node.size.i <- NULL
if(is.null(model)){
stop("The argument 'model' is empty, there is no model to tune.")
} else {
#getting training data and arguments from the model's ranger.arguments slot
ranger.arguments <- model$ranger.arguments
data <- ranger.arguments$data
dependent.variable.name <- ranger.arguments$dependent.variable.name
predictor.variable.names <- ranger.arguments$predictor.variable.names
distance.matrix <- ranger.arguments$distance.matrix
distance.thresholds <- ranger.arguments$distance.thresholds
#getting cluster from model if "cluster" is not provided
if(is.null(cluster) && "cluster" %in% names(model)){
cluster <- model$cluster
}
#getting xy
if(is.null(xy)){
if(is.null(model$ranger.arguments$xy)){
stop("The argument 'xy' is required for spatial cross-validation.")
} else {
xy <- model$ranger.arguments$xy
}
}
}
if(!all(c("x", "y") %in% colnames(xy))){
stop("The column names of 'xy' must be 'x' and 'y'.")
}
if(nrow(xy) != nrow(data)){
stop("nrow(xy) and nrow(data) (stored in model$ranger.arguments$data) must be the same.")
}
#saving model class for later
model.class <- class(model)
#testing if the data is binary
is.binary <- is_binary(
data = data,
dependent.variable.name = dependent.variable.name
)
if(is.binary == TRUE){
metric <- "auc"
} else {
metric <- "r.squared"
}
#saving slots if it's an rf_spatial model
if(inherits(model, "rf_spatial")){
spatial <- model$spatial
}
#mtry
if(is.null(mtry)){
mtry <- as.integer(seq(1, length(predictor.variable.names) - 1, length.out = 4))
} else {
if(max(mtry) > length(predictor.variable.names)){
if(verbose == TRUE){
message("'mtry' values larger than the number of predictors were removed.")
}
mtry <- mtry[mtry <= length(predictor.variable.names)]
}
}
mtry <- as.integer(mtry)
#min.node.size
if(is.null(min.node.size)){
min.node.size <- c(5, 10, 20, 40)
} else {
if(max(min.node.size) > floor(nrow(data)/4)){
min.node.size <- min.node.size[min.node.size <= floor(nrow(data)/4)]
if(verbose == TRUE){
message(paste0(
"'min.node.size' values larger than ",
floor(nrow(data)/4),
" were removed."
)
)
}
}
}
min.node.size <- as.integer(min.node.size)
#num.trees
if(is.null(num.trees)){
num.trees <- c(500, 1000, 2000)
}
num.trees <- as.integer(num.trees)
#combining values
combinations <- expand.grid(
num.trees = num.trees,
mtry = mtry,
min.node.size = min.node.size
)
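#as an illustrative sketch, num.trees = c(500, 1000), mtry = c(2, 4), and
#min.node.size = c(5, 10) yield a 2 x 2 x 2 = 8 row grid, with one row per
#candidate hyperparameter combination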
if(verbose == TRUE){
message(
paste0(
"Exploring ",
nrow(combinations),
" combinations of hyperparameters."
)
)
}
#CLUSTER SETUP
#cluster is provided
if(!is.null(cluster)){
n.cores <- NULL
#flag to avoid stopping a user-provided cluster after execution
stop.cluster <- FALSE
} else {
#creates and registers cluster
cluster <- parallel::makeCluster(
n.cores,
type = "PSOCK"
)
#flag to stop cluster
stop.cluster <- TRUE
}
#registering cluster
doParallel::registerDoParallel(cl = cluster)
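#registerDoParallel() sets the cluster as the active foreach backend, so the
#%dopar% loop below dispatches one model fit and evaluation per row of
#'combinations' to the cluster workers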
#looping through combinations
tuning <- foreach::foreach(
num.trees.i = combinations$num.trees,
mtry.i = combinations$mtry,
min.node.size.i = combinations$min.node.size,
.combine = "rbind",
.verbose = FALSE
) %dopar% {
#filling ranger arguments
if(!is.null(ranger.arguments)){
ranger.arguments.i <- ranger.arguments
} else {
ranger.arguments.i <- list()
}
ranger.arguments.i$num.trees <- num.trees.i
ranger.arguments.i$mtry <- mtry.i
ranger.arguments.i$min.node.size <- min.node.size.i
ranger.arguments.i$importance <- "none"
ranger.arguments.i$num.threads <- 1
ranger.arguments.i$save.memory <- TRUE
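#importance = "none" skips the variable importance computation, while
#num.threads = 1 and save.memory = TRUE keep each worker single-threaded and
#lightweight; parallelism happens across hyperparameter combinations rather
#than within each forest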
#fit model with new hyperparameters
m.i <- spatialRF::rf(
data = data,
dependent.variable.name = dependent.variable.name,
predictor.variable.names = predictor.variable.names,
distance.matrix = distance.matrix,
distance.thresholds = distance.thresholds,
ranger.arguments = ranger.arguments.i,
scaled.importance = FALSE,
seed = seed,
n.cores = 1,
verbose = FALSE
)
#evaluate with spatial cross-validation
m.i <- spatialRF::rf_evaluate(
model = m.i,
xy = xy,
repetitions = repetitions,
training.fraction = training.fraction,
metrics = metric,
seed = seed,
verbose = FALSE,
n.cores = 1
)
#getting performance measures
m.i.performance <- spatialRF::get_evaluation(m.i)
m.i.performance <- m.i.performance[m.i.performance$model == "Testing", c("metric", "median")]
#if the model is spatial
if(inherits(model, "rf_spatial")){
#getting interpretation of Moran's I if the model is rf_spatial
moran.i.interpretation <- m.i$residuals$autocorrelation$per.distance$interpretation[1]
#getting performance
m.i.performance <- data.frame(
metric = m.i.performance[m.i.performance$metric == metric, "median"],
moran.i.interpretation = moran.i.interpretation
)
} else {
#performance without Moran's I for non-spatial models
m.i.performance <- data.frame(
metric = m.i.performance[m.i.performance$metric == metric, "median"]
)
}
colnames(m.i.performance)[1] <- metric
return(m.i.performance)
}#end of parallelized loop
#binding with combinations
tuning <- cbind(
combinations,
tuning
) %>%
dplyr::arrange(dplyr::desc(!!rlang::sym(metric)))
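#after sorting by the performance metric in descending order, the first row
#of 'tuning' holds the best hyperparameter combination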
#getting metric name
metric.name <- colnames(tuning)[!(colnames(tuning) %in% c("num.trees", "mtry", "min.node.size", "moran.i.interpretation"))]
#preparing tuning list
tuning.list <- list()
tuning.list$metric <- metric.name
tuning.list$tuning.df <- tuning
#subset if rf_spatial
if(inherits(model, "rf_spatial")){
#remove hyperparameter combinations yielding spatially correlated residuals
tuning <- dplyr::filter(
tuning,
moran.i.interpretation == "No spatial correlation"
)
#stop if all results increase spatial autocorrelation of the residuals
if(nrow(tuning) == 0){
message("This spatial model cannot be tuned: all hyperparameter combinations increase the spatial autocorrelation of the residuals.")
if(stop.cluster == TRUE){
parallel::stopCluster(cl = cluster)
}
return(model)
}
}
#best hyperparameters into ranger arguments
ranger.arguments$num.trees <- tuning[1, "num.trees"]
ranger.arguments$mtry <- tuning[1, "mtry"]
ranger.arguments$min.node.size <- tuning[1, "min.node.size"]
#fitting tuned model
model.tuned <- rf(
data = data,
dependent.variable.name = dependent.variable.name,
predictor.variable.names = predictor.variable.names,
ranger.arguments = ranger.arguments,
distance.matrix = distance.matrix,
distance.thresholds = distance.thresholds,
xy = xy,
n.cores = n.cores,
verbose = FALSE
)
#keeping class
class(model.tuned) <- unique(c(class(model.tuned), model.class))
#adding tuning slot
model.tuned$tuning <- tuning.list
#adding the variable importance slot if rf_spatial
if(inherits(model, "rf_spatial")){
model.tuned$importance <- prepare_importance_spatial(model = model.tuned)
}
#comparing metric of the old model and the tuned one
#evaluate model
model <- spatialRF::rf_evaluate(
model = model,
xy = xy,
repetitions = repetitions,
training.fraction = training.fraction,
seed = seed,
verbose = FALSE,
n.cores = n.cores,
cluster = cluster
)
#evaluate model tuned
model.tuned <- spatialRF::rf_evaluate(
model = model.tuned,
xy = xy,
repetitions = repetitions,
training.fraction = training.fraction,
seed = seed,
verbose = FALSE,
n.cores = n.cores,
cluster = cluster
)
#extract r.squared values
new.performance <- round(
model.tuned$evaluation$aggregated[model.tuned$evaluation$aggregated$model == "Testing" &
model.tuned$evaluation$aggregated$metric == metric.name,
"median"
],
3
)
old.performance <- round(
model$evaluation$aggregated[
model$evaluation$aggregated$model == "Testing" &
model$evaluation$aggregated$metric == metric.name,
"median"
],
3
)
#performance difference
performance.gain <- new.performance - old.performance
#if there is r-squared gain
if(performance.gain > 0){
#plot tuning
if(verbose == TRUE){
suppressWarnings(plot_tuning(model.tuned))
message("Best hyperparameters:")
message(paste0(" - num.trees: ", tuning[1, "num.trees"]))
message(paste0(" - mtry: ", tuning[1, "mtry"]))
message(paste0(" - min.node.size: ", tuning[1, "min.node.size"]))
message(paste0(
"gain in ",
metric.name,
": ",
round(performance.gain, 4)
)
)
}
#adding gain
model.tuned$tuning$performance.gain <- performance.gain
#adding selection of spatial predictors
if(inherits(model, "rf_spatial")){
model.tuned$spatial <- spatial
}
#stopping the cluster, or storing it in the output if provided by the user
if(stop.cluster == TRUE){
parallel::stopCluster(cl = cluster)
} else {
model.tuned$cluster <- cluster
}
return(model.tuned)
#tuned model does not outperform the original one
} else {
if(verbose == TRUE){
message(
paste0(
"The tuned model (",
metric.name,
" = ",
new.performance,
") does not outperform the original one (",
metric.name,
" = ",
old.performance,
"). Tuning not required, returning the original model."
)
)
}
#adding tuning slot
model$tuning <- tuning.list
#adding plot to the tuning slot
model$tuning$plot <- plot_tuning(
model,
verbose = FALSE
)
#adding r squared gain
model$tuning$performance.gain <- performance.gain
#stopping cluster
if(stop.cluster == TRUE){
parallel::stopCluster(cl = cluster)
} else {
model$cluster <- cluster
}
if(verbose == TRUE){
message("Tuning results stored in model$tuning.")
}
return(model)
}
}