knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) options(rmarkdown.html_vignette.check_title = FALSE)
library(ntbox)
We demonstrate the use of the model selection protocol implemented in ntbox by using occurrence data for the giant hummingbird (Patagona gigas). The occurrence records and the modeling layers are available in the ntbox
package (if you haven't installed it go to this link where you will find the installation notes for each of the operating systems).
Let's load the R
packages that we are going to use in our session
library(ntbox) library(raster) library(rgl) library(stringr) set.seed(111)
With the following commands, you can load into your R
session the geographic records
# Occurrence data for the giant hummingbird (Patagona gigas) pg <- utils::read.csv(system.file("extdata/p_gigas.csv", package = "ntbox")) head(pg)
As you can see in the table there are 4 columns: "species", "longitude" and "latitude" and "type" (with train or test as factors). The data points with the train labels are going to be used for model calibration and the test points will be used for selection.
Let's load the environmental information and display it
# Bioclimatic layers path wcpath <- list.files(system.file("extdata/bios", package = "ntbox"), pattern = ".tif$",full.names = TRUE) wc <- raster::stack(wcpath) plot(wc[[1]])
The environmental information is for South America and here we will project our niche model ...
Now we split the occurrence data into train and test using the type variable; the choice of the data is normally done using different partition methods, but in our case, the data previously was split using a random partition.
# Split occs in train and test pgL <- base::split(pg,pg$type) pg_train <- pgL$train pg_test <- pgL$test
The following code extracts the environmental information for both train and test data
pg_etrain <- raster::extract(wc,pg_train[,c("longitude", "latitude")], df=TRUE) pg_etrain <- pg_etrain[,-1] head(pg_etrain) pg_etest <- raster::extract(wc,pg_test[,c("longitude", "latitude")], df=TRUE) pg_etest <- pg_etest[,-1] head(pg_etest)
Select the non-correlated variables using a threshold correlation value of 0.8
env_varsL <- ntbox::correlation_finder(cor(pg_etrain,method = "spearman"), threshold = 0.8, verbose = F) env_vars <- env_varsL$descriptors print(env_vars )
Now we specify the number of variables to fit the ellipsoid models; in the example, we will fit for 3,5, and 6 dimensions
nvarstest <- c(3,5,6)
This parameter is to specify the proportion of training points that will be used to fit the minimum volume ellipsoid [@VanAelst2009].
# Level level <- 0.99
This background data is just to compute the partial ROC test
env_bg <- ntbox::sample_envbg(wc,10000)
For selecting the model we will use an arbitrary value of 6 percent of omission; it is not a rule but accepted omission rates are those bellow 10%. We will ask the function to return the partial ROC value [@Peterson2008]
omr_criteria <- 0.06 proc <- TRUE
To calibrate the models ntbox
estimates all combinations ($C^n_k$) of $n$ variables,
taken $k$ at a time for each $k= 2,3,\dots, m$, where $m$ is lesser than $n$.
It is known that
$$\displaystyle C^n_k =\frac {n!}{k!(n-k)!}.$$
Now we just need to use the function ellipsoid_selection
to run the model calibration and selection protocol.
e_selct <- ntbox::ellipsoid_selection(env_train = pg_etrain, env_test = pg_etest, env_vars = env_vars, level = level, nvarstest = nvarstest, env_bg = env_bg, omr_criteria= omr_criteria, proc = proc)
Let's see the first 20 rows of the results
head(e_selct,20)
With the following lines of code, I am going to display the model in the first row of the table
# Best ellipsoid model for "omr_criteria" bestvarcomb <- stringr::str_split(e_selct$fitted_vars,",")[[1]] # Ellipsoid model (environmental space) best_mod <- ntbox::cov_center(pg_etrain[,bestvarcomb], mve = T, level = 0.99, vars = 1:length(bestvarcomb)) # Projection model in geographic space mProj <- ntbox::ellipsoidfit(wc[[bestvarcomb]], centroid = best_mod$centroid, covar = best_mod$covariance, level = 0.99,size = 3) raster::plot(mProj$suitRaster) points(pg[,c("longitude","latitude")],pch=20,cex=0.5)
# Best ellipsoid model for "omr_criteria" bestvarcomb <- stringr::str_split(e_selct$fitted_vars,",")[[1]] # Ellipsoid model (environmental space) best_mod <- ntbox::cov_center(pg_etrain[,bestvarcomb], mve = T, level = 0.99, vars = 1:length(bestvarcomb)) # Projection model in geographic space mProj <- ntbox::ellipsoidfit(wc[[bestvarcomb]], centroid = best_mod$centroid, covar = best_mod$covariance, level = 0.99,size = 3) if(length(bestvarcomb)==3){ rglwidget() } raster::plot(mProj$suitRaster) points(pg[,c("longitude","latitude")],pch=20,cex=0.5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.