###################################################################################################
##' @name BIOMOD_FormatingData
##' @author Damien Georges, Wilfried Thuiller
##'
##' @title Format input data, and select pseudo-absences if wanted, for usage in \pkg{biomod2}
##'
##' @description This function gathers together all input data needed (\emph{xy,
##' presences/absences, explanatory variables, and the same for evaluation data if available}) to
##' run \pkg{biomod2} models. It allows to select pseudo-absences if no absence data is available,
##' with different strategies (see Details).
##'
##'
##' @param dir.name (\emph{optional, default} \code{.}) \cr
##' A \code{character} corresponding to the modeling folder
##' @param resp.name a \code{character} corresponding to the species name
##'
##' @param resp.var a \code{vector}, a \code{\link[terra:vect]{SpatVector}}
##' without associated data (\emph{if presence-only}),
##' or a \code{\link[terra:vect]{SpatVector}} object containing binary data
##' (\code{0} : absence, \code{1} : presence, \code{NA} : indeterminate)
##' for a single species that will be used to build the species distribution model(s)
##' \cr \emph{Note that old format from \pkg{sp} are still supported such as
##' \code{SpatialPoints} (if presence-only) or \code{SpatialPointsDataFrame}
##' object containing binary data.}
##'
##' @param expl.var a \code{matrix}, \code{data.frame}, \code{\link[terra:vect]{SpatVector}}
##' or \code{\link[terra:rast]{SpatRaster}} object containing the explanatory variables
##' (in columns or layers) that will be used to build the species distribution model(s)
##' \cr \emph{Note that old format from \pkg{raster} and \pkg{sp} are still supported such as
##' \code{RasterStack} and \code{SpatialPointsDataFrame} objects. }
##'
##' @param resp.xy (\emph{optional, default} \code{NULL}) \cr
##' If \code{resp.var} is a \code{vector}, a 2-columns \code{matrix} or \code{data.frame}
##' containing the corresponding \code{X} and \code{Y} coordinates that will be
##' used to build the species distribution model(s)
##'
##' @param eval.resp.var (\emph{optional, default} \code{NULL}) \cr
##' A \code{vector}, a \code{\link[terra:vect]{SpatVector}}
##' without associated data (\emph{if presence-only}),
##' or a \code{\link[terra:vect]{SpatVector}} object containing binary data
##' (\code{0} : absence, \code{1} : presence, \code{NA} : indeterminate)
##' for a single species that will be used to
##' evaluate the species distribution model(s) with independent data
##' \cr \emph{Note that old format from \pkg{sp} are still supported such as
##' \code{SpatialPoints} (if presence-only) or \code{SpatialPointsDataFrame}
##' object containing binary data.}
##' @param eval.expl.var (\emph{optional, default} \code{NULL}) \cr
##' A \code{matrix}, \code{data.frame}, \code{\link[terra:vect]{SpatVector}}
##' or \code{\link[terra:rast]{SpatRaster}} object containing the explanatory variables (in
##' columns or layers) that will be used to evaluate the species distribution model(s) with
##' independent data.
##' \cr \emph{Note that old format from \pkg{raster} and \pkg{sp} are still supported such as
##' \code{RasterStack} and \code{SpatialPointsDataFrame} objects. }
##' @param eval.resp.xy (\emph{optional, default} \code{NULL}) \cr
##' If \code{eval.resp.var} is a \code{vector}, a 2-columns \code{matrix} or \code{data.frame}
##' containing the corresponding \code{X} and \code{Y} coordinates that will be used to evaluate
##' the species distribution model(s) with independent data
##'
##' @param PA.nb.rep (\emph{optional, default} \code{0}) \cr
##' If pseudo-absence selection, an \code{integer} corresponding to the number of sets
##' (repetitions) of pseudo-absence points that will be drawn
##' @param PA.strategy (\emph{optional, default} \code{NULL}) \cr
##' If pseudo-absence selection, a \code{character} defining the strategy that will be used to
##' select the pseudo-absence points. Must be \code{random}, \code{sre}, \code{disk} or
##' \code{user.defined} (see Details)
##' @param PA.nb.absences (\emph{optional, default} \code{1000}) \cr
##' If pseudo-absence selection, and \code{PA.strategy = 'random'} or \code{PA.strategy = 'sre'}
##' or \code{PA.strategy = 'disk'}, an \code{integer} corresponding to the number of pseudo-absence
##' points that will be selected for each pseudo-absence repetition (true absences included). \cr
##' It can also be a \code{vector} of the same length as \code{PA.nb.rep} containing \code{integer}
##' values corresponding to the different numbers of pseudo-absences to be selected
##' @param PA.sre.quant (\emph{optional, default} \code{0.025}) \cr
##' If pseudo-absence selection and \code{PA.strategy = 'sre'}, a \code{numeric} between \code{0}
##' and \code{0.5} defining the half-quantile used to make the \code{sre} pseudo-absence selection
##' (see Details)
##' @param PA.dist.min (\emph{optional, default} \code{0}) \cr
##' If pseudo-absence selection and \code{PA.strategy = 'disk'}, a \code{numeric} defining the
##' minimal distance to presence points used to make the \code{disk} pseudo-absence selection
##' (in meters, see Details)
##' @param PA.dist.max (\emph{optional, default} \code{NULL}) \cr
##' If pseudo-absence selection and \code{PA.strategy = 'disk'}, a \code{numeric} defining the
##' maximal distance to presence points used to make the \code{disk} pseudo-absence selection
##' (in meters, see Details)
##' @param PA.user.table (\emph{optional, default} \code{NULL}) \cr
##' If pseudo-absence selection and \code{PA.strategy = 'user.defined'}, a \code{matrix} or
##' \code{data.frame} with as many rows as \code{resp.var} values, as many columns as
##' \code{PA.nb.rep}, and containing \code{TRUE} or \code{FALSE} values defining which points
##' will be used to build the species distribution model(s) for each repetition (see Details)
##'
##' @param na.rm (\emph{optional, default} \code{TRUE}) \cr
##' A \code{logical} value defining whether points having one or several missing values for
##' explanatory variables should be removed from the analysis or not
##' @param filter.raster (\emph{optional, default} \code{FALSE}) \cr
##' If \code{expl.var} is of raster type, a \code{logical} value defining whether \code{resp.var}
##' is to be filtered when several points occur in the same raster cell
##'
##'
##' @return
##'
##' A \code{\link{BIOMOD.formated.data}} object that can be used to build species distribution
##' model(s) with the \code{\link{BIOMOD_Modeling}} function. \cr
##' \href{https://biomodhub.github.io/biomod2/reference/BIOMOD.formated.data.html}{\code{print/show}},
##' \href{https://biomodhub.github.io/biomod2/reference/plot.html}{\code{plot}} and
##' \href{https://biomodhub.github.io/biomod2/reference/summary.html}{\code{summary}} functions
##' are available to have a summary of the created object.
##'
##'
##' @details
##'
##' This function gathers and formats all input data needed to run \pkg{biomod2} models. It
##' supports different kind of inputs (e.g. \code{matrix},
##' \code{\link[terra:vect]{SpatVector}}, \code{\link[terra:rast]{SpatRaster}})
##' and provides different methods to select pseudo-absences if needed. \cr \cr
##'
##' \bold{Concerning explanatory variables and XY coordinates :}
##' \itemize{
##' \item if \code{\link[terra:rast]{SpatRaster}}, \code{RasterLayer} or \code{RasterStack}
##' provided for \code{expl.var} or \code{eval.expl.var}, \cr \pkg{biomod2} will extract
##' the corresponding values from XY coordinates provided :
##' \itemize{
##' \item either through \code{resp.xy} or \code{eval.resp.xy} respectively
##' \item or \code{resp.var} or \code{eval.resp.var}, if provided as
##' \code{\link[terra:vect]{SpatVector}} or \code{SpatialPointsDataFrame}
##' }
##' \emph{Be sure to give the objects containing XY coordinates in the same projection
##' system as the raster objects !}
##'
##' \item if \code{data.frame} or \code{matrix} provided for \code{expl.var} or
##' \code{eval.expl.var}, \cr \pkg{biomod2} will simply merge it (\code{cbind})
##' with \code{resp.var} without considering XY coordinates. \cr
##' \emph{Be sure to give explanatory and response values in the same row order !}
##' }
##'
##' \bold{Concerning pseudo-absence selection (see \code{\link{bm_PseudoAbsences}}) :}
##' \itemize{
##' \item if both presence and absence data are available, and there are enough absences :
##' set \code{PA.nb.rep = 0} and no pseudo-absence will be selected.
##' \item if no absence data is available, several pseudo-absence repetitions
##' are recommended (to estimate the effect of pseudo-absence selection), as well as high
##' number of pseudo-absence points. \cr
##' \emph{Be sure not to select more pseudo-absence points than maximum number of pixels in
##' the studied area !}
##' \item it is possible now to create several pseudo-absence repetitions with different
##' number of points, BUT with the same sampling strategy. \cr \cr \cr \cr
##' }
##'
##' \describe{
##' \item{Response variable}{
##' \pkg{biomod2} models single species at a time (no multi-species). Hence, \code{resp.var}
##' must be a uni-dimensional object (either a \code{vector}, a one-column \code{matrix},
##' \code{data.frame}, a \code{\link[terra:vect]{SpatVector}} (\emph{without associated
##' data - if presence-only}), a \code{SpatialPoints} (\emph{if presence-only}),
##' a \code{SpatialPointsDataFrame} or \code{\link[terra:vect]{SpatVector}} object),
##' containing values among :
##' \itemize{
##' \item \code{1} : presences
##' \item \code{0} : true absences (if any)
##' \item \code{NA} : no information point (might be used to select pseudo-absences if any)
##' }
##' If no true absences are available, pseudo-absence selection must be done. \cr
##' If \code{resp.var} is a non-spatial object (\code{vector}, \code{matrix} or
##' \code{data.frame}), XY coordinates must be provided through \code{resp.xy}. \cr
##' If pseudo-absence points are to be selected, \code{NA} points must be provided in order to
##' select pseudo-absences among them.
##' }
##' \item{Explanatory variables}{
##' Factorial variables are allowed, but might lead to some pseudo-absence strategy or models
##' omissions (e.g. \code{sre}).
##' }
##' \item{Evaluation data}{
##' Although \pkg{biomod2} provides tools to automatically divide dataset into calibration and
##' validation parts through the modeling process (see \code{CV.[..]} parameters in
##' \code{\link{BIOMOD_Modeling}} function ; or \code{\link{bm_CrossValidation}}
##' function), it is also possible (and strongly advised) to directly provide two independent
##' datasets, one for calibration/validation and one for evaluation
##' }
##' \item{Pseudo-absence selection (see \code{\link{bm_PseudoAbsences}})}{
##' If no true absences are available, pseudo-absences must be selected from the
##' \emph{background data}, meaning data for which there is no information on whether the
##' species of interest occurs or not. It corresponds either to the remaining pixels of the
##' \code{expl.var} (if provided as a \code{\link[terra:rast]{SpatRaster}} or \code{RasterStack})
##' or to the points identified as \code{NA} in \code{resp.var} (if \code{expl.var}
##' provided as a \code{matrix} or \code{data.frame}). \cr
##' Several methods are available to do this selection :
##' \describe{
##' \item{random}{all points of initial background are pseudo-absence candidates.
##' \code{PA.nb.absences} are drawn randomly, for each \code{PA.nb.rep} requested.
##' }
##' \item{sre}{pseudo-absences have to be selected in conditions (combination of explanatory
##' variables) that differ in a defined proportion (\code{PA.sre.quant}) from those of
##' presence points. A \emph{Surface Range Envelop} model is first run over the species of
##' interest (see \code{\link{bm_SRE}}), and pseudo-absences are selected outside this envelop. \cr
##' \emph{This case is appropriate when all the species climatic niche has been sampled,
##' otherwise it may lead to over-optimistic model evaluations and predictions !}
##' }
##' \item{disk}{pseudo-absences are selected within circles around presence points defined by
##' \code{PA.dist.min} and \code{PA.dist.max} distance values (in meters). It allows to select
##' pseudo-absence points that are not too close to (avoid same niche and pseudo-replication)
##' or too far (localized sampling strategy) from presences.
##' }
##' \item{user.defined}{pseudo-absences are defined in advance and given as \code{data.frame}
##' through the \code{PA.user.table} parameter.
##' }
##' }
##' }
##' }
##'
##'
##' @keywords dataset format evaluation pseudo-absence
##'
##'
##' @seealso \code{\link{bm_PseudoAbsences}}, \code{\link{BIOMOD_Modeling}}
##' @family Main functions
##'
##'
##' @examples
##' library(terra)
##'
##' # Load species occurrences (6 species available)
##' data(DataSpecies)
##' head(DataSpecies)
##'
##' # Select the name of the studied species
##' myRespName <- 'GuloGulo'
##'
##' # Get corresponding presence/absence data
##' myResp <- as.numeric(DataSpecies[, myRespName])
##'
##' # Get corresponding XY coordinates
##' myRespXY <- DataSpecies[, c('X_WGS84', 'Y_WGS84')]
##'
##' # Load environmental variables extracted from BIOCLIM (bio_3, bio_4, bio_7, bio_11 & bio_12)
##' data(bioclim_current)
##' myExpl <- terra::rast(bioclim_current)
##'
##' \dontshow{
##' myExtent <- terra::ext(0,30,45,70)
##' myExpl <- terra::crop(myExpl, myExtent)
##' }
##'
##' # ---------------------------------------------------------------#
##' # Format Data with true absences
##' myBiomodData <- BIOMOD_FormatingData(resp.var = myResp,
##' expl.var = myExpl,
##' resp.xy = myRespXY,
##' resp.name = myRespName)
##' myBiomodData
##' summary(myBiomodData)
##' plot(myBiomodData)
##'
##'
##' # ---------------------------------------------------------------#
##' # # Transform true absences into potential pseudo-absences
##' # myResp.PA <- ifelse(myResp == 1, 1, NA)
##' #
##' # # Format Data with pseudo-absences : random method
##' # myBiomodData.r <- BIOMOD_FormatingData(resp.var = myResp.PA,
##' # expl.var = myExpl,
##' # resp.xy = myRespXY,
##' # resp.name = myRespName,
##' # PA.nb.rep = 4,
##' # PA.nb.absences = 1000,
##' # PA.strategy = 'random')
##' #
##' # # Format Data with pseudo-absences : disk method
##' # myBiomodData.d <- BIOMOD_FormatingData(resp.var = myResp.PA,
##' # expl.var = myExpl,
##' # resp.xy = myRespXY,
##' # resp.name = myRespName,
##' # PA.nb.rep = 4,
##' # PA.nb.absences = 500,
##' # PA.strategy = 'disk',
##' # PA.dist.min = 5,
##' # PA.dist.max = 35)
##' #
##' # # Format Data with pseudo-absences : SRE method
##' # myBiomodData.s <- BIOMOD_FormatingData(resp.var = myResp.PA,
##' # expl.var = myExpl,
##' # resp.xy = myRespXY,
##' # resp.name = myRespName,
##' # PA.nb.rep = 4,
##' # PA.nb.absences = 1000,
##' # PA.strategy = 'sre',
##' # PA.sre.quant = 0.025)
##' #
##' # # Format Data with pseudo-absences : user.defined method
##' # myPAtable <- data.frame(PA1 = ifelse(myResp == 1, TRUE, FALSE),
##' # PA2 = ifelse(myResp == 1, TRUE, FALSE))
##' # for (i in 1:ncol(myPAtable)) myPAtable[sample(which(myPAtable[, i] == FALSE), 500), i] = TRUE
##' # myBiomodData.u <- BIOMOD_FormatingData(resp.var = myResp.PA,
##' # expl.var = myExpl,
##' # resp.xy = myRespXY,
##' # resp.name = myRespName,
##' # PA.strategy = 'user.defined',
##' # PA.user.table = myPAtable)
##' #
##' # myBiomodData.r
##' # myBiomodData.d
##' # myBiomodData.s
##' # myBiomodData.u
##' # plot(myBiomodData.r)
##' # plot(myBiomodData.d)
##' # plot(myBiomodData.s)
##' # plot(myBiomodData.u)
##'
##'
##' # ---------------------------------------------------------------#
##' # # Select multiple sets of pseudo-absences
##' #
##' # # Transform true absences into potential pseudo-absences
##' # myResp.PA <- ifelse(myResp == 1, 1, NA)
##' #
##' # # Format Data with pseudo-absences : random method
##' # myBiomodData.multi <- BIOMOD_FormatingData(resp.var = myResp.PA,
##' # expl.var = myExpl,
##' # resp.xy = myRespXY,
##' # resp.name = myRespName,
##' # PA.nb.rep = 4,
##' # PA.nb.absences = c(1000, 500, 500, 200),
##' # PA.strategy = 'random')
##' # myBiomodData.multi
##' # summary(myBiomodData.multi)
##' # plot(myBiomodData.multi)
##'
##'
##' @importFrom terra rast crds categories
##'
##' @export
##'
##'
###################################################################################################
BIOMOD_FormatingData <- function(resp.name,
                                 resp.var,
                                 expl.var,
                                 dir.name = '.',
                                 resp.xy = NULL,
                                 eval.resp.var = NULL,
                                 eval.expl.var = NULL,
                                 eval.resp.xy = NULL,
                                 PA.nb.rep = 0,
                                 PA.nb.absences = 1000,
                                 PA.strategy = NULL,
                                 PA.dist.min = 0,
                                 PA.dist.max = NULL,
                                 PA.sre.quant = 0.025,
                                 PA.user.table = NULL,
                                 na.rm = TRUE,
                                 filter.raster = FALSE)
{
  ## Banner is printed with the name exactly as supplied by the caller,
  ## i.e. before the argument check possibly rewrites it
  .bm_cat(paste0(resp.name, " Data Formating"))

  ## 1. check args ------------------------------------------------------------
  ## The checker returns a named list; each element replaces the local
  ## argument of the same name (an element may legitimately be NULL)
  checked <- .BIOMOD_FormatingData.check.args(resp.name,
                                              resp.var,
                                              expl.var,
                                              dir.name,
                                              resp.xy,
                                              eval.resp.var,
                                              eval.expl.var,
                                              eval.resp.xy,
                                              filter.raster)
  resp.var      <- checked[["resp.var"]]
  expl.var      <- checked[["expl.var"]]
  resp.xy       <- checked[["resp.xy"]]
  resp.name     <- checked[["resp.name"]]
  dir.name      <- checked[["dir.name"]]
  eval.resp.var <- checked[["eval.resp.var"]]
  eval.expl.var <- checked[["eval.expl.var"]]
  eval.resp.xy  <- checked[["eval.resp.xy"]]
  rm(checked)

  ## 2. build BIOMOD.formated.data object -------------------------------------
  ## A missing strategy or the literal 'none' means no pseudo-absence
  ## selection; any other value goes through the pseudo-absence constructor
  without.PA <- is.null(PA.strategy) || PA.strategy == 'none'
  formated.data <- if (without.PA) {
    BIOMOD.formated.data(sp = resp.var,
                         xy = resp.xy,
                         env = expl.var,
                         dir.name = dir.name,
                         sp.name = resp.name,
                         eval.sp = eval.resp.var,
                         eval.env = eval.expl.var,
                         eval.xy = eval.resp.xy,
                         na.rm = na.rm,
                         filter.raster = filter.raster)
  } else {
    BIOMOD.formated.data.PA(sp = resp.var,
                            xy = resp.xy,
                            env = expl.var,
                            dir.name = dir.name,
                            sp.name = resp.name,
                            eval.sp = eval.resp.var,
                            eval.env = eval.expl.var,
                            eval.xy = eval.resp.xy,
                            PA.nb.rep = PA.nb.rep,
                            PA.strategy = PA.strategy,
                            PA.nb.absences = PA.nb.absences,
                            PA.dist.min = PA.dist.min,
                            PA.dist.max = PA.dist.max,
                            PA.sre.quant = PA.sre.quant,
                            PA.user.table = PA.user.table,
                            na.rm = na.rm,
                            filter.raster = filter.raster)
  }

  .bm_cat("Done")
  return(formated.data)
}
# Argument Check -------------------------------------------------------------
#
# Validates and normalises the arguments of BIOMOD_FormatingData before the
# data object is built.
#
#   resp.name : must be a single, non-empty character string that is not a
#               file path (no '/'). Any '_' or ' ' is replaced by '.', and the
#               conversion is reported on the console.
#   dir.name  : must be an existing directory.
#   resp.var, expl.var, resp.xy, eval.resp.var, eval.expl.var, eval.resp.xy,
#   filter.raster : forwarded as-is to .BIOMOD.formated.data.check.args().
#
# Returns a named list (resp.var, expl.var, resp.xy, resp.name, dir.name,
# eval.resp.var, eval.expl.var, eval.resp.xy) holding the checked values.
# Stops with an error when resp.name or dir.name is invalid.
.BIOMOD_FormatingData.check.args <- function(resp.name,
                                             resp.var,
                                             expl.var,
                                             dir.name,
                                             resp.xy,
                                             eval.resp.var,
                                             eval.expl.var,
                                             eval.resp.xy,
                                             filter.raster)
{
  ## 0. Checking names (resp.name available ?) --------------------------------
  ## Fail early with an explicit message: NULL, NA or a vector would otherwise
  ## break (or slip through) the scalar `if` tests below in an obscure way
  if (!is.character(resp.name) || length(resp.name) != 1 ||
      is.na(resp.name) || !nzchar(resp.name)) {
    stop("Response variable name must be a single, non-empty character string.")
  }

  if (grepl('/', resp.name, fixed = TRUE)) {
    stop(paste0("Response variable name must be a character, and not a file path."
                , "\n Please refer to dir.name parameter to set a modeling folder."))
  }

  ## `||` (scalar, short-circuit) is the right operator inside `if`
  if (grepl('_', resp.name, fixed = TRUE) || grepl(' ', resp.name, fixed = TRUE)) {
    resp.name <- paste(unlist(strsplit(resp.name, '_', fixed = TRUE)), collapse = '.')
    resp.name <- paste(unlist(strsplit(resp.name, ' ', fixed = TRUE)), collapse = '.')
    ## A name made of a single separator ("_" or " ") collapses to ""
    if (!nzchar(resp.name)) {
      stop("Response variable name must contain characters other than '_' and ' '.")
    }
    cat('\n ! Response variable name was converted into', resp.name)
  }

  if (!dir.exists(dir.name)) {
    stop(paste0("Modeling folder '", dir.name, "' does not exist"))
  }

  ## Data-related arguments are checked by the BIOMOD.formated.data checker
  args <- .BIOMOD.formated.data.check.args(sp = resp.var, env = expl.var, xy = resp.xy
                                           , eval.sp = eval.resp.var, eval.env = eval.expl.var
                                           , eval.xy = eval.resp.xy, filter.raster = filter.raster)

  return(list(resp.var = args$sp,
              expl.var = args$env,
              resp.xy = args$xy,
              resp.name = resp.name,
              dir.name = dir.name,
              eval.resp.var = args$eval.sp,
              eval.expl.var = args$eval.env,
              eval.resp.xy = args$eval.xy))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.