#' Build ecological Diagnostic Tool
#'
#' This function builds an ecological Diagnostic Tool (DT) that
#' predicts an Impairment Probability for one or several anthropogenic
#' pressures.
#'
#' The function takes as input two tables: one with a categorical description
#' (quality classes) of samples by one or several anthropogenic `pressures` and
#' the second with the values of biological `metrics` calculated from the
#' community data from the same samples.
#'
#' For each pressure (i.e. column in the `pressures` table), a model is built
#' and saved in the directory given by the `pathDT` argument. The whole set of
#' models (DT units) saved in this directory constitute the DT.
#'
#' Each DT unit is a probability random forest model built using the
#' [ranger][ranger::ranger] function to predict the probability of a community
#' being impaired by the pressure considered based on the biological metrics
#' exhibited by the communities.
#'
#' For each DT unit, the given metrics and pressures tables are split into
#' training and test data sets. This is performed using the trainingFrac
#' argument, which specifies the proportion of the data (once observations
#' with missing pressure are removed) from each pressure level (low or
#' impaired) that is used to constitute the training data (stratified
#' sampling). By default, trainingFrac refers to the observations (rows of
#' metrics and pressures) but if a grouping vector (e.g. site ID) is given to
#' the argument samplingUnit, then the training data set is built by sampling
#' among sampling units and not among the rows. If a site has observations
#' with different pressure levels (low or impaired), then the level occurring
#' with the highest frequency is allocated to the site.
#'
#' The hyper-parameters of the [ranger][ranger::ranger] model are given in the
#' params argument that could accept one or several values per parameter. If
#' several values are given, an optimization procedure using
#' [tuneParamsMultiCrit][mlr::tuneParamsMultiCrit] is performed to identify the
#' parameter set exhibiting the best trade-off between performance (AUC) and
#' execution time. Two optimization algorithms are implemented: a grid search
#' and a genetic optimization algorithm. If the argument `calibGenNb` is larger
#' than one then the genetic algorithm is used and the space search for each
#' parameter is determined by the minimum and maximum values given in `params`.
#' If `calibGenNb` is smaller or equal to 1, then a grid search testing all the
#' `params` value combinations is performed.
#'
#' @param metrics a data frame with samples in rows and biological metrics in
#' columns
#'
#' @param pressures a data frame with samples in rows and pressure information
#' in columns (one per pressure category). The table is filled with quality
#' classes (e.g. low or impaired)
#'
#' @param pathDT character string, the path where the built models will be saved
#'
#' @param params a named list with the values, one or two (minimum, maximum) of
#' the following parameters:
#' - num.trees: Number of trees to grow;
#' - mtry: Number of variables randomly sampled as candidates at each split;
#' - sample.fraction: Proportion of samples to draw;
#' - min.node.size: Minimum size of terminal nodes.
#'
#' @param CVfolds an integer indicating the number of parts made from the
#' training data set and used to calibrate the model hyper-parameters.
#'
#' @param low,impaired character vectors with the labels of the pressure classes
#' (in `pressures`) corresponding to low impact and impaired situations,
#' respectively.
#'
#' @param nIter an integer indicating the number of ranger RF models created for
#' each pressure type. nIter larger than 1 allow to estimate prediction
#' uncertainty and improve model robustness.
#'
#' @param nCores an integer indicating the number of CPU cores available to
#' parallelize the calibration step
#'
#' @param trainingFrac a number between 0 and 1 indicating which proportion of
#'   the data set will be used to train the model
#'
#' @param samplingUnit a vector with a length equal to the number of rows of
#' metrics and pressures indicating to which group each observation belongs
#' to. The training and test data sets will be obtained by sampling these
#' groups and not the observations (except if samplingUnit = NULL, the
#' default).
#'
#' @param calibPopSize numeric. The size of the population used by the genetic
#' algorithm used to calibrate the parameters.
#'
#' @param calibGenNb numeric. The number of generations used by the genetic
#' algorithm used to calibrate the parameters. (calibGenNb + 1) * calibPopSize
#' gives the total number of iterations performed by the calibration
#' algorithm.
#'
#' @param seed numeric. The seed used for the random number generator
#'
#' @return nothing, the models and used data are saved as .rda objects in the
#' directory corresponding to the pathDT argument.
#'
#' @seealso [ranger][ranger::ranger]
#' [tuneParamsMultiCrit][mlr::tuneParamsMultiCrit]
#'
#' @importFrom dplyr "%>%"
#' @export
build_DT <- function(metrics,
                     pressures,
                     low = "low",
                     impaired = "impaired",
                     pathDT,
                     params = list(num.trees = 500,
                                   mtry = 25,
                                   sample.fraction = 0.632,
                                   min.node.size = 25),
                     CVfolds = 5,
                     nIter = 1L,
                     nCores = 1L,
                     trainingFrac = 1,
                     samplingUnit = NULL,
                     calibPopSize = 100,
                     calibGenNb = 1,
                     seed = 20181025) {
  set.seed(seed)

  # Dummy bindings to silence R CMD check NOTEs about non-standard
  # evaluation in the dplyr verbs below
  num.trees <- mtry <- sample.fraction <- min.node.size <- ellapsedTime <-
    AUC <- auc_diff <- time_diff <- NULL

  # showWarnings = FALSE: do not warn when the output directory already exists
  dir.create(path = pathDT, recursive = TRUE, showWarnings = FALSE)

  if (nrow(pressures) != nrow(metrics)) {
    stop("pressures and metrics tables should have the same lines")
  }

  # FIX: build the log path with file.path() so the log is written *inside*
  # pathDT. The previous paste0(pathDT, "log.csv") fused the directory name
  # and the file name (e.g. "modelslog.csv") whenever pathDT lacked a
  # trailing separator; the model .rda files below already used file.path().
  logFile <- file.path(pathDT, "log.csv")
  cat("Building DT", file = logFile)

  # One DT unit (probability random forest) is built per pressure column
  for (p in colnames(pressures)) {
    cat("\n", p, "\n", sep = "")
    cat("\n", p, "\n", sep = "", file = logFile, append = TRUE)

    pressure <- NULL

    # Recode the pressure column into a two-level factor (low/impaired;
    # any other label becomes NA) and split observations into training and
    # test sets, possibly sampling by group (samplingUnit)
    allData <- data.frame(pressure = pressures[[p]],
                          metrics) %>%
      dplyr::mutate(
        pressure = dplyr::case_when(pressure %in% low ~ "low",
                                    pressure %in% impaired ~ "impaired",
                                    TRUE ~ NA_character_) %>%
          factor(x = ., levels = c("low", "impaired"))) %>%
      split_training_test(data = ., frac = trainingFrac,
                          group = samplingUnit)
    trainingData <- allData$training
    testData <- allData$test
    rm(allData)

    # All columns except the first ("pressure") are biological metrics
    selMetrics <- colnames(trainingData)[-1]

    if (any(sapply(params, length) > 1)) {
      # Several candidate values for at least one hyper-parameter:
      # run the calibration (grid search or genetic algorithm, depending
      # on calibGenNb — see calibrate_DT)
      cat("\n calibration...\n")
      cat("\n calibration...\n", file = logFile, append = TRUE)

      tunedModel <- calibrate_DT(trainingData = trainingData,
                                 CVfolds = CVfolds,
                                 selMetrics = selMetrics,
                                 params = params,
                                 p = p,
                                 nCores = nCores,
                                 calibPopSize = calibPopSize,
                                 calibGenNb = calibGenNb)

      # Log every parameter set tested during the optimization
      testedParams <- tunedModel$opt.path %>%
        as.data.frame() %>%
        dplyr::select(num.trees, mtry, sample.fraction, min.node.size,
                      AUC = auc.test.mean,
                      ellapsedTime = timeboth.test.mean)
      suppressWarnings(utils::write.table(testedParams,
                                          file = logFile,
                                          sep = ";", append = TRUE,
                                          row.names = FALSE))

      # Keep the first solution of the Pareto front (AUC vs execution time)
      bestParams <- tunedModel$x[[1]] %>%
        as.data.frame() %>%
        dplyr::bind_cols(tunedModel$y %>%
                           as.data.frame() %>%
                           dplyr::slice(1) %>%
                           dplyr::rename(AUC = auc.test.mean,
                                         ellapsedTime = timeboth.test.mean))
    } else {
      # A single value per hyper-parameter: use them as-is, no calibration
      bestParams <- do.call(what = "cbind", args = params) %>%
        data.frame() %>%
        dplyr::mutate(AUC = NA, ellapsedTime = NA)
    }

    cat("\n best parameter values\n", file = logFile, append = TRUE)
    suppressWarnings(utils::write.table(bestParams,
                                        file = logFile,
                                        sep = ";", append = TRUE,
                                        row.names = FALSE))

    # SMOTE over-sampling rate that balances the two pressure classes
    overSamplingRatio <- max(table(trainingData$pressure)) /
      min(table(trainingData$pressure))

    # Probability ranger RF wrapped in a SMOTE over-sampler, with the
    # selected hyper-parameter values
    learner <- mlr::makeLearner(cl = "classif.ranger",
                                predict.type = "prob",
                                num.threads = nCores,
                                replace = FALSE) %>%
      mlr::setHyperPars(learner = .,
                        par.vals = bestParams %>%
                          dplyr::select(-AUC, -ellapsedTime) %>%
                          as.list()) %>%
      mlr::makeSMOTEWrapper(learner = ., sw.rate = overSamplingRatio)

    task <- mlr::makeClassifTask(data = trainingData[, c("pressure",
                                                         selMetrics)],
                                 target = "pressure",
                                 positive = "impaired")

    # B632 bootstrap: nIter models per pressure, allowing uncertainty
    # estimation when nIter > 1
    DTunit <- mlr::bootstrapB632(learner = learner,
                                 task = task,
                                 iters = nIter,
                                 stratify = TRUE,
                                 models = TRUE,
                                 keep.pred = TRUE,
                                 measures = list(mlr::auc, mlr::timeboth),
                                 show.info = FALSE)

    # Decision threshold: probability at which the sensitivity (TPR) and
    # specificity (TNR) curves intersect
    thresholdData <-
      mlr::generateThreshVsPerfData(obj = DTunit,
                                    measures = list(mlr::tpr, mlr::tnr))$data
    funTPR <- with(thresholdData, stats::approxfun(x = threshold, y = tpr))
    funTNR <- with(thresholdData, stats::approxfun(x = threshold, y = tnr))
    DTunit$threshold <-
      stats::optimize(f = function(x) abs(funTPR(x) - funTNR(x)),
                      interval = c(0, 1))$minimum

    # Persist the DT unit together with the data used to build/test it
    save(DTunit, trainingData, testData, task,
         file = file.path(pathDT, paste0("model_", p, ".rda")))
    cat(" DONE")
  }
}