# Suitability mapping based on ensembles of modelling algorithms: comparison of different algorithms and calibration

### Description

The basic function `ensemble.test`

allows to evaluate different algorithms for (species) suitability modelling, including maximum entropy (MAXENT), boosted regression trees, random forests, generalized linear models (including stepwise selection of explanatory variables), generalized additive models (including stepwise selection of explanatory variables), multivariate adaptive regression splines, regression trees, artificial neural networks, flexible discriminant analysis, support vector machines, the BIOCLIM algorithm, the DOMAIN algorithm and the Mahalanobis algorithm. These sets of functions were developed in parallel with the `biomod2`

package, especially for inclusion of the maximum entropy algorithm, but also to allow for a more direct integration with the BiodiversityR package, more direct handling of model formulae and greater focus on mapping. Researchers and students of species distribution are strongly encouraged to familiarize themselves with all the options of the BIOMOD and dismo packages.

### Usage

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | ```
ensemble.test(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE, ext = NULL,
k = 0, pt = NULL, at = NULL, CIRCLES.at = FALSE, CIRCLES.d = 100000,
TrainData = NULL, TestData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = TRUE,
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
evaluations.keep = FALSE,
models.list = NULL, models.keep = FALSE,
models.save = FALSE, species.name = "Species001",
AUC.weights = TRUE, ENSEMBLE.tune = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
ENSEMBLE.weight.min = 0.05,
input.weights = NULL,
MAXENT = 1, GBM = 1, GBMSTEP = 1, RF = 1, GLM = 1, GLMSTEP = 1,
GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0, EARTH = 1,
RPART = 1, NNET = 1, FDA = 1, SVM = 1 , SVME = 1,
BIOCLIM = 1, DOMAIN = 1, MAHAL = 1,
GEODIST = 0,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
formulae.defaults = TRUE, maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000, MAXENT.BackData = NULL,
MAXENT.path=paste(getwd(), "/models/maxent_", species.name, sep=""),
GBM.formula = NULL, GBM.n.trees = 2001,
GBMSTEP.gbm.x = 2:(ncol(TrainData.vars)+1), GBMSTEP.tree.complexity = 5,
GBMSTEP.learning.rate = 0.005, GBMSTEP.bag.fraction = 0.5,
GBMSTEP.step.size = 100,
RF.formula = NULL, RF.ntree = 751,
RF.mtry = floor(sqrt(ncol(TrainData.vars))),
GLM.formula = NULL, GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, STEP.formula = NULL, GLMSTEP.scope = NULL,
GLMSTEP.k = 2,
GAM.formula = NULL, GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.formula = NULL, MGCV.select = FALSE,
MGCVFIX.formula = NULL,
EARTH.formula = NULL,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.formula = NULL, RPART.xval = 50,
NNET.formula = NULL, NNET.size = 8, NNET.decay = 0.01,
FDA.formula = NULL,
SVM.formula = NULL,
SVME.formula = NULL,
MAHAL.shape = 1,
RASTER.format = "raster")
ensemble.test.splits(x = NULL, p = NULL, a = NULL, an = 1000,
CIRCLES.at = FALSE, CIRCLES.d = 100000,
excludep = FALSE, ext = NULL,
k = 4,
TrainData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE,
data.keep = FALSE,
species.name = "Species001",
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
AUC.weights = TRUE, ENSEMBLE.tune = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
ENSEMBLE.weight.min = 0.05,
input.weights = NULL,
MAXENT = 1, GBM = 1, GBMSTEP = 1, RF = 1, GLM = 1, GLMSTEP = 1,
GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0, EARTH = 1,
RPART = 1, NNET = 1, FDA = 1, SVM = 1 , SVME = 1,
BIOCLIM = 1, DOMAIN = 1, MAHAL = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
formulae.defaults = TRUE, maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000, MAXENT.BackData = NULL,
MAXENT.path = paste(getwd(), "/models/maxent_", species.name, sep=""),
GBM.formula = NULL, GBM.n.trees = 2001,
GBMSTEP.gbm.x = 2:(ncol(TrainData1)), GBMSTEP.tree.complexity = 5,
GBMSTEP.learning.rate = 0.005,
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100,
RF.formula = NULL, RF.ntree = 751, RF.mtry = floor(sqrt(ncol(TrainData1)-1)),
GLM.formula = NULL, GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, STEP.formula = NULL, GLMSTEP.scope = NULL, GLMSTEP.k = 2,
GAM.formula = NULL, GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.formula = NULL, MGCV.select = FALSE,
MGCVFIX.formula = NULL,
EARTH.formula = NULL,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.formula = NULL, RPART.xval = 50,
NNET.formula = NULL, NNET.size = 8, NNET.decay = 0.01,
FDA.formula = NULL,
SVM.formula = NULL,
SVME.formula = NULL,
MAHAL.shape = 1)
ensemble.test.gbm(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE, ext = NULL,
k = 4,
TrainData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE,
species.name = "Species001",
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL,
GBMSTEP.gbm.x = 2:(ncol(TrainData.orig)),
complexity = c(3:6), learning = c(0.005, 0.002, 0.001),
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100)
ensemble.test.nnet(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE,
ext = NULL, k = 4,
TrainData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE,
species.name = "Species001",
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL,
formulae.defaults = TRUE, maxit = 100,
NNET.formula = NULL,
sizes = c(2, 4, 6, 8), decays = c(0.1, 0.05, 0.01, 0.001) )
ensemble.drop1(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE, ext = NULL,
k = 0, pt = NULL, at = NULL, CIRCLES.at = FALSE, CIRCLES.d = 100000,
TrainData = NULL, TestData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE,
species.name = "Species001",
difference = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
input.weights = NULL,
MAXENT = 1, GBM = 1, GBMSTEP = 1, RF = 1, GLM = 1, GLMSTEP = 1,
GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0, EARTH = 1,
RPART = 1, NNET = 1, FDA = 1, SVM = 1, SVME = 1,
BIOCLIM = 1, DOMAIN = 1, MAHAL = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000, MAXENT.BackData = NULL,
MAXENT.path = paste(getwd(), "/models/maxent_", species.name, sep=""),
GBM.n.trees = 2001,
GBMSTEP.tree.complexity = 5, GBMSTEP.learning.rate = 0.005,
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100,
RF.ntree = 751,
GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, GLMSTEP.scope = NULL, GLMSTEP.k = 2,
GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.select = FALSE,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.xval = 50,
NNET.size = 8, NNET.decay = 0.01,
MAHAL.shape = 1)
ensemble.weights(weights = c(0.9, 0.8, 0.7, 0.5),
best = 0, min.weight = 0,
exponent = 1, digits = 4)
ensemble.strategy(TrainData = NULL, TestData = NULL,
verbose = FALSE,
ENSEMBLE.best = c(4:10), ENSEMBLE.min = c(0.7),
ENSEMBLE.exponent = c(1, 2, 4, 6, 8) )
ensemble.formulae(x, factors = NULL, dummy.vars = NULL)
ensemble.threshold(eval, threshold.method="spec_sens", threshold.sensitivity=0.9,
threshold.PresenceAbsence=FALSE, Pres, Abs)
``` |

### Arguments

`x` |
RasterStack object ( |

`p` |
presence points used for calibrating the suitability models, typically available in 2-column (lon, lat) dataframe; see also |

`a` |
background points used for calibrating the suitability models (except for |

`an` |
number of background points for calibration to be selected with |

`excludep` |
parameter that indicates (if |

`ext` |
an Extent object to limit the selection of background points to a sub-region of |

`k` |
If larger than 1, the number of groups to split between calibration (k-1) and evaluation (1) data sets (for example, |

`pt` |
presence points used for evaluating the suitability models, available in 2-column (lon, lat) dataframe; see also |

`at` |
background points used for evaluating the suitability models, available in 2-column (lon, lat) dataframe; see also |

`CIRCLES.at` |
If |

`CIRCLES.d` |
Radius in m of circular neighbourhoods (created with |

`TrainData` |
dataframe with first column 'pb' describing presence (1) and absence (0) and other columns containing explanatory variables; see also |

`TestData` |
dataframe with first column 'pb' describing presence (1) and absence (0) and other columns containing explanatory variables; see also |

`VIF` |
Estimate the variance inflation factors based on a linear model calibrated on the training data (if |

`COR` |
Provide information on the correlation between the numeric explanatory variables (if |

`SINK` |
Append the results to a text file in subfolder 'outputs' (if |

`PLOTS` |
Plot the results of evaluation for the various suitability models (if |

`threshold.method` |
Method to calculate the threshold between predicted absence and presence; possibilities include |

`threshold.sensitivity` |
Sensitivity value for |

`threshold.PresenceAbsence` |
If |

`evaluations.keep` |
Keep the results of evaluations (if |

`models.list` |
list with 'old' model objects such as |

`models.keep` |
store the details for each suitability modelling algorithm (if |

`models.save` |
Save the list with model details to a file (if |

`species.name` |
Name by which the model details will be saved to a file; see also argument |

`data.keep` |
Keep the data for each k-fold cross-validation run (if |

`AUC.weights` |
If |

`ENSEMBLE.tune` |
Determine weights for the ensemble model based on AUC values through internal cross-validation procedures (if |

`ENSEMBLE.best` |
The number of individual suitability models to be used in the consensus suitability map (based on a weighted average). In case this parameter is smaller than 1 or larger than the number of positive input weights of individual models, then all individual suitability models with positive input weights are included in the consensus suitability map. In case a vector is provided, |

`ENSEMBLE.min` |
The minimum input weight (typically corresponding to AUC values) for a model to be included in the ensemble. In case a vector is provided, function |

`ENSEMBLE.exponent` |
Exponent applied to AUC values to convert AUC values into weights (for example, an exponent of 2 converts input weights of 0.7, 0.8 and 0.9 into 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). See details. |

`ENSEMBLE.weight.min` |
The minimum output weight for models included in the ensemble, applying to weights that sum to one. Note that |

`input.weights` |
array with numeric values for the different modelling algorithms; if |

`MAXENT` |
number: if larger than 0, then a maximum entropy model ( |

`GBM` |
number: if larger than 0, then a boosted regression trees model ( |

`GBMSTEP` |
number: if larger than 0, then a stepwise boosted regression trees model ( |

`RF` |
number: if larger than 0, then a random forest model ( |

`GLM` |
number: if larger than 0, then a generalized linear model ( |

`GLMSTEP` |
number: if larger than 0, then a stepwise generalized linear model ( |

`GAM` |
number: if larger than 0, then a generalized additive model ( |

`GAMSTEP` |
number: if larger than 0, then a stepwise generalized additive model ( |

`MGCV` |
number: if larger than 0, then a generalized additive model ( |

`MGCVFIX` |
number: if larger than 0, then a generalized additive model with fixed d.f. regression splines ( |

`EARTH` |
number: if larger than 0, then a multivariate adaptive regression spline model ( |

`RPART` |
number: if larger than 0, then a recursive partioning and regression tree model ( |

`NNET` |
number: if larger than 0, then an artificial neural network model ( |

`FDA` |
number: if larger than 0, then a flexible discriminant analysis model ( |

`SVM` |
number: if larger than 0, then a support vector machine model ( |

`SVME` |
number: if larger than 0, then a support vector machine model ( |

`BIOCLIM` |
number: if larger than 0, then the BIOCLIM algorithm ( |

`DOMAIN` |
number: if larger than 0, then the DOMAIN algorithm ( |

`MAHAL` |
number: if larger than 0, then the Mahalanobis algorithm ( |

`PROBIT` |
If |

`GEODIST` |
number: if larger than 0, then the geoDist algorithm ( |

`Yweights` |
chooses how cases of presence and background (absence) are weighted; |

`layer.drops` |
vector that indicates which layers should be removed from RasterStack |

`factors` |
vector that indicates which variables are factors; see also |

`dummy.vars` |
vector that indicates which variables are dummy variables (influences formulae suggestions) |

`formulae.defaults` |
Suggest formulae for most of the models (if |

`maxit` |
Maximum number of iterations for some of the models. See also |

`MAXENT.a` |
background points used for calibrating the maximum entropy model ( |

`MAXENT.an` |
number of background points for calibration to be selected with |

`MAXENT.BackData` |
dataframe containing explanatory variables for the background locations. This information will be used for calibrating the maximum entropy model ( |

`MAXENT.path` |
path to the directory where output files of the maximum entropy model are stored; see also |

`GBM.formula` |
formula for the boosted regression trees algorithm; see also |

`GBM.n.trees` |
total number of trees to fit for the boosted regression trees model; see also |

`GBMSTEP.gbm.x` |
indices of column numbers with explanatory variables for stepwise boosted regression trees; see also |

`GBMSTEP.tree.complexity` |
complexity of individual trees for stepwise boosted regression trees; see also |

`GBMSTEP.learning.rate` |
weight applied to individual trees for stepwise boosted regression trees; see also |

`GBMSTEP.bag.fraction` |
proportion of observations used in selecting variables for stepwise boosted regression trees; see also |

`GBMSTEP.step.size` |
number of trees to add at each cycle for stepwise boosted regression trees (should be small enough to result in a smaller holdout deviance than the initial number of trees [50]); see also |

`RF.formula` |
formula for random forest algorithm; see also |

`RF.ntree` |
number of trees to grow for random forest algorithm; see also |

`RF.mtry` |
number of variables randomly sampled as candidates at each split for random forest algorithm; see also |

`GLM.formula` |
formula for the generalized linear model; see also |

`GLM.family` |
description of the error distribution and link function for the generalized linear model; see also |

`GLMSTEP.steps` |
maximum number of steps to be considered for stepwise generalized linear model; see also |

`STEP.formula` |
formula for the "starting model" to be considered for stepwise generalized linear model; see also |

`GLMSTEP.scope` |
range of models examined in the stepwise search; see also |

`GLMSTEP.k` |
multiple of the number of degrees of freedom used for the penalty (only k = 2 gives the genuine AIC); see also |

`GAM.formula` |
formula for the generalized additive model; see also |

`GAM.family` |
description of the error distribution and link function for the generalized additive model; see also |

`GAMSTEP.steps` |
maximum number of steps to be considered in the stepwise generalized additive model; see also |

`GAMSTEP.scope` |
range of models examined in the step-wise search n the stepwise generalized additive model; see also |

`GAMSTEP.pos` |
parameter expected to be set to 1 to allow for fitting of the stepwise generalized additive model |

`MGCV.formula` |
formula for the generalized additive model; see also |

`MGCV.select` |
if |

`MGCVFIX.formula` |
formula for the generalized additive model with fixed d.f. regression splines; see also |

`EARTH.formula` |
formula for the multivariate adaptive regression spline model; see also |

`EARTH.glm` |
list of arguments to pass on to |

`RPART.formula` |
formula for the recursive partioning and regression tree model; see also |

`RPART.xval` |
number of cross-validations for the recursive partioning and regression tree model; see also |

`NNET.formula` |
formula for the artificial neural network model; see also |

`NNET.size` |
number of units in the hidden layer for the artificial neural network model; see also |

`NNET.decay` |
parameter of weight decay for the artificial neural network model; see also |

`FDA.formula` |
formula for the flexible discriminant analysis model; see also |

`SVM.formula` |
formula for the support vector machine model; see also |

`SVME.formula` |
formula for the support vector machine model; see also |

`MAHAL.shape` |
parameter that influences the transformation of output values of |

`RASTER.format` |
Format of the raster files that will be generated for the GEODIST model. See |

`complexity` |
vector with values of complexity of individual trees ( |

`learning` |
vector with values of weights applied to individual trees ( |

`sizes` |
vector with values of number of units in the hidden layer for the artificial neural network model; see also |

`decays` |
vector with values of weight decay for the artificial neural network model; see also |

`difference` |
if |

`weights` |
input weights for the |

`best` |
The number of final weights. In case this parameter is smaller than 1 or larger than the number of positive input weights of individual models, then this parameter is ignored. |

`min.weight` |
The minimum input weight to be included in the output. |

`exponent` |
Exponent applied to AUC values to convert AUC values into weights (for example, an exponent of 2 converts input weights of 0.7, 0.8 and 0.9 into 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). See details. |

`digits` |
Number of number of decimal places in the output weights; see also |

`verbose` |
If |

`eval` |
ModelEvaluation object obtained by |

`Pres` |
Suitabilities (probabilities) at presence locations |

`Abs` |
Suitabilities (probabilities) at background locations |

### Details

The basic function `ensemble.test`

first calibrates individual suitability models based on presence locations `p`

and background locations `a`

, then evaluates these suitability models based on presence locations `pt`

and background locations `at`

. While calibrating and testing individual models, results obtained via the `evaluate`

function are shown in the GUI, and possibly plotted (`PLOTS`

) or saved (`evaluations.keep`

).

As an alternative to providing presence locations `p`

, models can be calibrated with data provided in `TrainData`

. In case that both `p`

and `TrainData`

are provided, then models will be calibrated with `TrainData`

.

Calibration of the maximum entropy (MAXENT) algorithm is not based on background locations `a`

, but based on background locations `MAXENT.a`

instead. However, to compare evaluations with evaluations of other algorithms, during evaluations of the MAXENT algorithm, presence locations `p`

and background locations `a`

are used (and not background locations `MAXENT.a`

).

As the Mahalanobis function (`mahal`

) does not always provide values within a range of 0 - 1, the output values are rescaled by first subtracting the value of `1 - MAHAL.shape`

from each prediction, followed by calculating the absolute value, followed by calculating the reciprocal value and finally multiplying this reciprocal value with `MAHAL.shape`

. As this rescaling method does not estimate probabilities, inclusion in the calculation of a (weighted) average of ensemble probabilities may be problematic (the same applies to other distance-based methods).

With parameter `ENSEMBLE.best`

, the subset of best models (evaluated by the individual AUC values) can be selected and only those models will be used for calculating the ensemble model (in other words, weights for models not included in the ensemble will be set to zero). It is possible to further increase the contribution to the ensemble model for models with higher AUC values through parameter `ENSEMBLE.exponent`

. With `ENSEMBLE.exponent = 2`

, AUC values of 0.7, 0.8 and 0.9 are converted into weights of 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). With `ENSEMBLE.exponent = 4`

, AUC values of 0.7, 0.8 and 0.9 are converted into weights of 0.7^4=0.2401, 0.8^4=0.4096 and 0.9^2=0.6561).

`ENSEMBLE.tune`

will result in an internal procedure whereby the best selection of parameter values for `ENSEMBLE.min`

, `ENSEMBLE.best`

or `ENSEMBLE.exponent`

can be identified. Through a factorial procedure, the ensemble model with best AUC for a specific combination of parameter values is identified. The procedure also provides the weights that correspond to the best ensemble.

Function `ensemble.test.splits`

splits the presence and background locations in a user-defined (`k`

) number of subsets (i.e. k-fold cross-validation), then sequentially calibrates individual suitability models with (`k-1`

) combined subsets and evaluates those with the remaining one subset, whereby each subset is used once for evaluation in the user-defined number (`k`

) of runs. For example, `k = 4`

results in splitting the locations in 4 subsets, then using one of these subsets in turn for evaluations (see also `kfold`

). Note that for the maximum entropy (MAXENT) algorithm, the same background data will be used in each cross-validation run (this is based on the assumption that a large number (~10000) of background locations are used).

Among the output from function `ensemble.test.splits`

are suggested weights for an ensemble model (`output.weights`

and `output.weights.AUC`

), and information on the respective AUC values of the ensemble model with the suggested weights for each of the (`k`

) subsets. Suggested weights `output.weights`

are calculated as the average of the weights of the different algorithms (submodels) of the `k`

ensembles. Suggested weights `output.weights.AUC`

are calculated as the average of the AUC of the different algorithms of the for the `k`

runs.

Function `ensemble.test.gbm`

allows to test various combinations of parameters `tree.complexity`

and `learning.rate`

for the `gbm.step`

model.

Function `ensemble.test.nnet`

allows to test various combinations of parameters `size`

and `decay`

for the `nnet`

model.

Function `ensemble.drop1`

allows to test the effects of leaving out each of the explanatory variables, and comparing these results with the "full" model. Note that option of `difference = TRUE`

may result in positive values, indicating that the model without the explanatory variable having larger AUC than the "full" model. A procedure is included to estimate the deviance of a model based on the fitted values, using -2 * (sum(x*log(x)) + sum((1-x)*log(1-x))) where x is a vector of the fitted values for a respective model. (It was checked that this procedure results in similar deviance estimates for the null and 'full' models for glm, but note that it is not certain whether deviance can be calculated in a similar way for other submodels.)

Function `ensemble.formulae`

provides suggestions for formulae that can be used for `ensemble.test`

and `ensemble.raster`

. This function is always used internally by the `ensemble.drop1`

function.

The `ensemble.weights`

function is used internally by the `ensemble.test`

and `ensemble.raster`

functions, using the input weights for the different suitability modelling algorithms. Ties between input weights result in the same output weights.

The `ensemble.strategy`

function is used internally by the `ensemble.test`

function, using the train and test data sets with predictions of the different suitability modelling algorithms and different combinations of parameters `ENSEMBLE.best`

, `ENSEMBLE.min`

and `ENSEMBLE.exponent`

. The final ensemble model is based on the parameters that generate the best AUC.

The `ensemble.threshold`

function is used internally by the `ensemble.test`

, `ensemble.mean`

and `ensemble.plot`

functions. `threshold.mean`

and `threshold.min`

result in calculating the mean or minimum value of threshold methods that resulted in better results in a study by Liu et al. (Ecography 28: 385-393. 2005) with threshold availabe in `threshold`

(`prevalence`

, `spec_sens`

and `equal_spec_sens`

) or `optimal.thresholds`

(`ObsPrev`

, `MeanProb`

, `MaxSens+Spec`

, `Sens=Spec`

and `MinROCdist`

).

### Value

Function `ensemble.test`

(potentially) returns a list with results from evaluations (via `evaluate`

) of calibration and test runs of individual suitability models.

Function `ensemble.test.splits`

returns a matrix with, for each individual suitability model, the AUC of each run and the average AUC over the runs. Models are sorted by the average AUC. The average AUC for each model can be used as input weights for the `ensemble.raster`

function.

Functions `ensemble.test.gbm`

and `ensemble.test.nnet`

return a matrix with, for each combination of model parameters, the AUC of each run and the average AUC. Models are sorted by the average AUC.

### Author(s)

Roeland Kindt (World Agroforestry Centre)

### References

Buisson L, Thuiller W, Casajus N, Lek S and Grenouillet G. 2010. Uncertainty in ensemble forecasting of species distribution. Global Change Biology 16: 1145-1157

Liu C, Berry PM, Dawson TP and Pearson RC. 2005. Selecting thresholds of occurrence in the prediction of species distributions. Ecography 28: 385-393

### See Also

`ensemble.raster`

### Examples

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | ```
## Not run:
# based on examples in the dismo package
# get predictor variables
library(dismo)
predictor.files <- list.files(path=paste(system.file(package="dismo"), '/ex', sep=''),
pattern='grd', full.names=TRUE)
predictors <- stack(predictor.files)
# subset based on Variance Inflation Factors
predictors <- subset(predictors, subset=c("bio5", "bio6",
"bio16", "bio17", "biome"))
predictors
predictors@title <- "base"
# presence points
presence_file <- paste(system.file(package="dismo"), '/ex/bradypus.csv', sep='')
pres <- read.table(presence_file, header=TRUE, sep=',')[,-1]
# the kfold function randomly assigns data to groups;
# groups are used as calibration (1/5) and training (4/5) data
groupp <- kfold(pres, 5)
pres_train <- pres[groupp != 1, ]
pres_test <- pres[groupp == 1, ]
# choose background points
ext <- extent(-90, -32, -33, 23)
background <- randomPoints(predictors, n=1000, ext=ext, extf=1.00)
colnames(background)=c('lon', 'lat')
groupa <- kfold(background, 5)
backg_train <- background[groupa != 1, ]
backg_test <- background[groupa == 1, ]
# formulae for random forest and generalized linear model
# compare with: ensemble.formulae(predictors, factors=c("biome"))
rfformula <- as.formula(pb ~ bio5+bio6+bio16+bio17)
glmformula <- as.formula(pb ~ bio5 + I(bio5^2) + I(bio5^3) +
bio6 + I(bio6^2) + I(bio6^3) + bio16 + I(bio16^2) + I(bio16^3) +
bio17 + I(bio17^2) + I(bio17^3) )
# fit four ensemble models (RF, GLM, BIOCLIM, DOMAIN)
ensemble.nofactors <- ensemble.test(x=predictors, p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
MAXENT=0, GBM=0, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=0, GAM=0,
GAMSTEP=0, MGCV=0, MGCVFIX=0,EARTH=0, RPART=0, NNET=0, FDA=0,
SVM=0, SVME=0, BIOCLIM=1, DOMAIN=1, MAHAL=0,
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, evaluations.keep=TRUE,
RF.formula=rfformula,
GLM.formula=glmformula)
# fit four ensemble models (RF, GLM, BIOCLIM, DOMAIN) using default formulae
# variable 'biome' is not included as explanatory variable
# results are provided in a file in the 'outputs' subfolder of the working
# directory
ensemble.nofactors <- ensemble.test(x=predictors,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
layer.drops="biome",
species.name="Bradypus",
SINK=TRUE,
MAXENT=0, GBM=0, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=0, GAM=0,
GAMSTEP=0, MGCV=0, MGCVFIX=0, EARTH=0, RPART=0, NNET=0, FDA=0,
SVM=0, SVME=0, BIOCLIM=1, DOMAIN=1, MAHAL=0,
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, evaluations.keep=TRUE,
formulae.defaults=TRUE)
# after fitting the individual algorithms (submodels),
# transform predictions with a probit link.
ensemble.nofactors <- ensemble.test(x=predictors,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
layer.drops="biome",
species.name="Bradypus",
SINK=TRUE,
ENSEMBLE.min=0.6,
MAXENT=0, GBM=0, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=0, GAM=0,
GAMSTEP=0, MGCV=0, MGCVFIX=0, EARTH=0, RPART=0, NNET=0, FDA=0,
SVM=0, SVME=0, BIOCLIM=1, DOMAIN=1, MAHAL=0,
PROBIT=TRUE,
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, evaluations.keep=TRUE,
formulae.defaults=TRUE)
# instead of providing presence and background locations, provide data.frames
# because 'biome' is a factor, RasterStack and extent need to be provided
# to check for levels in the Training and Testing data set
TrainData1 <- prepareData(x=predictors, p=pres_train, b=backg_train,
factors=c("biome"), xy=FALSE)
TestData1 <- prepareData(x=predictors, p=pres_test, b=backg_test,
factors=c("biome"), xy=FALSE)
ensemble.factors1 <- ensemble.test(x=predictors, ext=ext,
TrainData=TrainData1, TestData=TestData1,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
SINK=TRUE,
MAXENT=1, GBM=1, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=1, GAM=1,
GAMSTEP=1, MGCV=1, MGCVFIX=1, EARTH=1, RPART=1, NNET=1, FDA=1,
SVM=1, SVME=1, BIOCLIM=1, DOMAIN=1, MAHAL=0,
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, evaluations.keep=TRUE)
# compare different methods of calculating ensembles
ensemble.factors2 <- ensemble.test(x=predictors, ext=ext,
TrainData=TrainData1, TestData=TestData1,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
SINK=TRUE,
MAXENT=1, GBM=1, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=1, GAM=1,
GAMSTEP=1, MGCV=1, MGCVFIX=1, EARTH=1, RPART=1, NNET=1, FDA=1,
SVM=1, SVME=1, BIOCLIM=1, DOMAIN=1, MAHAL=0,
ENSEMBLE.best=c(4:10), ENSEMBLE.exponent=c(1, 2, 4, 6, 8),
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, evaluations.keep=TRUE)
# test performance of different suitability models
# data are split in 4 subsets, each used once for evaluation
ensemble.nofactors2 <- ensemble.test.splits(x=predictors, ext=ext,
p=pres, a=background, k=4,
layer.drops=c("biome"),
species.name="Bradypus",
SINK=TRUE,
MAXENT=1, GBM=1, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=1, GAM=1,
GAMSTEP=1, MGCV=1, MGCVFIX=1, EARTH=1, RPART=1, NNET=1, FDA=1,
SVM=1, SVME=1, BIOCLIM=1, DOMAIN=1, MAHAL=0,
ENSEMBLE.best=0, ENSEMBLE.exponent=c(1, 2, 4, 6, 8),
ENSEMBLE.min=0.7,
Yweights="BIOMOD", factors="biome",
PLOTS=FALSE, formulae.defaults=TRUE,
GBMSTEP.learning.rate=0.002)
ensemble.nofactors2
# test the result of leaving out one of the variables from the model
# note that positive differences indicate that the model without the variable
# has higher AUC than the full model
ensemble.variables <- ensemble.drop1(x=predictors, ext=ext,
p=pres, a=background, k=5,
layer.drops=c("bio6", "bio1", "bio12"),
species.name="Bradypus",
SINK=TRUE,
difference=TRUE,
VIF=TRUE,
MAXENT=0, GBM=1, GBMSTEP=0, RF=1, GLM=1, GLMSTEP=1, GAM=1,
GAMSTEP=1, MGCV=1, MGCVFIX=1, EARTH=1, RPART=1, NNET=1, FDA=1,
SVM=1, SVME=1, BIOCLIM=0, DOMAIN=0, MAHAL=0,
ENSEMBLE.best=0, ENSEMBLE.exponent=c(1, 2, 4, 6, 8),
ENSEMBLE.min=0.7,
Yweights="BIOMOD", factors="biome",
GBMSTEP.learning.rate=0.002)
ensemble.variables
## End(Not run)
``` |