Nothing
#'@title Calculating species marginality and specialization via ENFA and
#' phylogenetic imputation
#'@description The function computes vectors of marginality and specialization
#' according to \cite{Rinnan & Lawler (2019)} via Environmental Niche Factor
#' Analysis (ENFA) and phylogenetic imputation (\cite{Garland & Ives, 2000}).
#' It takes a list of \code{Simple Features} (or \pkg{sf}) objects and a
#' phylogenetic tree to train ENFA and/or ENphylo models. Both model techniques
#' are calibrated and evaluated while accounting for phylogenetic uncertainty.
#' Calibrations are made on a random subset of the data under the bootstrap
#' cross-validation scheme. The predictive power of the different models is
#' estimated using five different evaluation metrics.
#'@usage ENphylo_modeling(input_data, tree, input_mask, obs_col, time_col=NULL,
#' min_occ_enfa=30, boot_test_perc=20, boot_reps=10, swap.args= list(nsim=10,
#' si=0.2, si2=0.2), eval.args=list(eval_metric_for_imputation="AUC",
#' eval_threshold=0.7,output_options="best"),clust=0.5,output.dir)
#'@param input_data a list of \code{sf::data.frame} objects containing species
#' occurrence data in binary format (ones for presence, zero for background
#' points) along with the explanatory continuous variables to be used in ENFA
#' or ENphylo. Each element of the list must be named using the names of the
#' target species. Alternatively, ENFA model outputs generated through
#' \code{ENphylo_modeling} can be supplied as named elements of
#' \code{input_data} list.
#'@param tree an object of class \code{phylo} including all the species listed
#' in \code{input_data}. The tree needs not to be ultrametric or fully
#' dichotomous. Any species in the tree that do not match species in
#' \code{input_data} are automatically dropped from the tree.
#'@param input_mask a \code{SpatRaster} object. It represents the geographical
#' mask defining the spatial domain encompassing the background area enclosing
#' all the species in \code{input_data}.
#'@param obs_col character. Name of the \code{input_data} column containing the
#' vector of species occurrence data in binary format.
#'@param time_col character. Name of the \code{input_data} column containing the
#' time intervals associated to each species presence and background point
#' (optional).
#'@param min_occ_enfa numeric. The minimum number of occurrence data required
#' for a species to be modeled with ENFA.
#'@param boot_test_perc numeric. Percentage of data (ranging between 0 and 100)
#' used to calibrate ENFA and/or ENphylo models within a bootstrap
#' cross-validation scheme. The remaining percentage
#' (\code{100-boot_test_perc}) will be used to evaluate model performances.
#'@param boot_reps numeric. Number of evaluation runs performed within the
#' bootstrap cross-validation scheme to evaluate ENFA and/or ENphylo models. If
#' set to 0, models evaluation is skipped and the internal evaluation element
#' returns \code{NULL}.
#'@param swap.args list of ENphylo parameters. It includes: \itemize{ \item nsim
#' = number of alternative phylogenies generated by altering topology and
#' branch lengths of the reference tree by means of
#' \code{\link[RRphylo]{swapONE}}. \code{nsim} must be greater than or equal to
#' 1 (see details); \item si,si2 = arguments passed to
#' \code{RRphylo::swapONE}.}
#'@param eval.args list of evaluation model parameters. It includes:
#' \itemize{\item eval_metric_for_imputation = evaluation metric used to select
#' the most accurate ENphylo models. The viable options are: \code{"AUC"},
#' \code{"TSS"}, \code{"CBI"}, \code{"SORENSEN"}, or \code{"OMR"};
#' \item eval_threshold = the minimum evaluation score required to assess ENFA
#' and ENphylo performance. ENFA models having
#' \code{eval_metric_for_imputation} lower than \code{eval_threshold} are
#' compared to ENphylo models to keep the one fitting best. Additionally,
#' within ENphylo, models derived from the swapped trees
#' having \code{eval_metric_for_imputation} lower than \code{eval_threshold}
#' are excluded from the output; \item output_options = the strategy adopted to
#' return ENphylo models results (see details). The viable options are:
#' \code{"full"}, \code{"weighted.mean"}, and \code{"best"}.
#' }
#'@param clust numeric. The proportion of cores used to train ENFA and ENphylo
#' models. If \code{NULL}, parallel computing is disabled. It is set at 0.5 by
#' default.
#'@param output.dir the file path wherein \code{ENphylo_modeling} creates
#' "ENphylo_enfa_models" and "ENphylo_imputed_models" folders to store modeling
#' outputs (see details).
#'@author Alessandro Mondanaro, Mirko Di Febbraro, Silvia Castiglione, Carmela
#' Serio, Marina Melchionna, Pasquale Raia
#'@details \code{ENphylo_modeling} automatically arranges \code{input_data} in a
#' suitable format to run ENFA or ENphylo. The internal call of the function is
#' \code{"calibrated_enfa"} for ENFA and \code{"calibrated_imputed"} for
#' ENphylo, respectively.
#'
#'@details \strong{Phylogenetic uncertainty}
#'
#' The function does not work with \code{nsim} < 1 since one of the strongest
#' points of \code{ENphylo_modeling} is to test alternative phylogenies to
#' provide the most accurate reconstruction of species environmental
#' preferences. Similarly, setting \code{nsim = 1} limits the power of the
#' function, as it will use the original tree without generating alternative
#' phylogenies.
#'
#'@details \strong{Phylogenetic Imputation}
#'
#' \code{ENphylo_modeling} automatically switches from ENFA to ENphylo
#' algorithm for any species having less than \code{min_occ_enfa} occurrences
#' or ENFA model accuracy below \code{eval_threshold}. In this latter case, the
#' function performs both models and retains the one performing best according
#' to \code{eval_metric_for_imputation}. Phylogenetic imputation is allowed for
#' up to 30\% of the species on the tree. If the number of species to impute
#' exceeds 30\%, \code{ENphylo_modeling} automatically splits the original tree
#' into smaller subtrees, so that the maximum percentage of imputation is
#' observed. Each subtree is designed to impute phylogenetically distant
#' species and to retain species phylogenetically close to the taxa to be
#' imputed (so that imputation is robust). In this case, the function prints
#' the number of phylogenies used.
#'
#'@details \strong{Outputs}
#'
#' If \code{ENphylo_modeling} runs the ENphylo algorithm, the outputs depend on
#' the strategy adopted by the user through the \code{output_options} argument.
#' If \code{output_options="full"}, all CO matrices and evaluation metrics for
#' all the swapped trees tested are returned. Under
#' \code{output_options="weighted.mean"}, the output consists of a subset of CO
#' matrices and evaluation metrics for those tree swapping iterations achieving
#' a predictive accuracy in terms of \code{eval_metric_for_imputation} above
#' \code{eval_threshold}. Finally, if \code{output_options="best"}, a single CO
#' matrix and evaluation scores list corresponding to the most accurate swapped
#' tree is returned. If any tree swapping iterations under either \code{"best"}
#' or \code{"weighted.mean"} results in accuracy below the threshold, the
#' function automatically switches to \code{"full"} strategy.
#'
#' Eventually, the function creates two new folders, "ENphylo_enfa_models" and
#' "ENphylo_imputed_models", in \code{output.dir}. In each of these folders, a
#' number of new named subfolders equal to the number of modeled species are
#' created. Therein, model outputs and background area are saved as
#' \code{model_outputs.RData} and \code{study_area.tif}, respectively.
#' \code{model_outputs.RData} includes a list of three elements, regardless of
#' whether ENFA or ENphylo is used: \enumerate{ \item
#' \strong{$call} a character specifying the algorithm used to model the
#' species (i.e. ENFA or ENphylo). \item \strong{$formatted data} a list of
#' input data formatted to run either ENFA or ENphylo algorithms. Specifically,
#' the list reports: the presence data points (\code{$input_ones}),
#' the background points (\code{$input_back}),the name
#' of the columns associated to the arguments \code{OBS_col} and
#' \code{time_col} (if specified), the name of the column containing the cell
#' numbers (\code{geoID_col}), and the coordinates of presence data only
#' (\code{$one_coords}). \item \strong{$calibrated_model} a list. The output
#' objects are different depending on whether ENFA or ENphylo is used to model
#' the species:}
#'@details ENFA
#'@details \itemize{\item$call: a character specifying the algorithm used.
#' \item$full_ model: a list containing marginality and specialization factors,
#' the 'co' matrix, the number of significant axes, and all the other objects
#' generated by applying ENFA on the entire occurrence dataset (see
#' \cite{Rinnan et al. 2019} for additional details). \item$evaluation: a
#' matrix containing the evaluation scores of the ENFA model assessed by all
#' possible evaluation metrics (i.e. Area Under the Curve (AUC), True Skill
#' Statistic (TSS), Boyce Index (CBI), Sorensen Index, and Omission Rate (OMR))
#' for each model evaluations run.}
#'@details ENphylo
#'@details \itemize{\item$call: a character specifying the algorithm used.
#' \item$co: a list of the 'co' matrices of length equal to the number of
#' alternative phylogenies tested (i.e. \code{nsim} argument). The number of
#' 'co' matrices also reflects the selected output_option strategy.
#' \item$evaluation: a data.frame containing the evaluation scores of ENphylo
#' model assessed by all possible evaluation metrics for each alternative
#' phylogeny. The output of this object depends on the strategy adopted by the
#' user through the \code{output_options} argument.Specifically, the function
#' internally selects the model (or models) with the highest evaluation score
#' according to the specified evaluation metric.\item$output_options: a
#' character vector including the argument \code{output_options} and
#' \code{eval_metric_for_imputation} set to run the of ENphylo model.}
#'@return The function does not return the output into \code{.GlobalEnv}. Use
#' the function \code{\link{getENphylo_results}} to collect results from local
#' folders.
#'@importFrom ape drop.tip vcv cophenetic.phylo keep.tip
#'@importFrom methods extends as
#'@importFrom parallel detectCores parLapply
#'@importFrom stats hclust cutree as.dist
#'@importFrom terra extend res crop vect rasterize crds ext writeRaster
#'@export
#'@seealso \link{getENphylo_results}; \href{../doc/ENphylo.html}{\code{ENphylo} vignette}
#'@references Rinnan, D. S., & Lawler, J. (2019). Climate-niche factor
#' analysis: a spatial approach to quantifying species vulnerability to climate
#' change. \emph{Ecography}, 42(9), 1494–1503. doi/full/10.1111/ecog.03937
#'@references Garland, T., & Ives, A. R. (2000). Using the past to predict the
#' present: Confidence intervals for regression equations in phylogenetic
#' comparative methods. \emph{American Naturalist}, 155(3),346–364.
#' doi.org/10.1086/303327
#'@references Mondanaro, A., Di Febbraro, M., Castiglione, S., Melchionna, M.,
#' Serio, C., Girardi, G., Blefiore, A.M., & Raia, P. (2023). ENphylo: A new
#' method to model the distribution of extremely rare species. \emph{Methods in
#' Ecology and Evolution}, 14: 911-922. doi:10.1111/2041-210X.14066
#'@examples
#' \donttest{
#' library(ape)
#' library(terra)
#' library(sf)
#' library(RRgeo)
#'
#' newwd<-tempdir()
#' # newwd<-"YOUR_DIRECTORY"
#' latesturl<-RRgeo:::get_latest_version("12734585")
#' curl::curl_download(url = paste0(latesturl,"/files/dat.Rda?download=1"),
#' destfile = file.path(newwd,"dat.Rda"), quiet = FALSE)
#' load(file.path(newwd,"dat.Rda"))
#' read.tree(system.file("exdata/Eucopdata_tree.txt", package="RRgeo"))->tree
#' tree$tip.label<-gsub("_"," ",tree$tip.label)
#' curl::curl_download(paste0(latesturl,"/files/X35kya.tif?download=1"),
#' destfile = file.path(newwd,"X35kya.tif"), quiet = FALSE)
#' rast(file.path(newwd,"X35kya.tif"))->map35
#' project(map35,st_crs(dat[[1]])$proj4string,res = 50000)->map
#'
#' ENphylo_modeling(input_data=dat[c(1,11)],
#' tree=tree,
#' input_mask=map[[1]],
#' obs_col="OBS",
#' time_col="age",
#' min_occ_enfa=15,
#' boot_test_perc=20,
#' boot_reps=10,
#' swap.args=list(nsim=5,si=0.2,si2=0.2),
#' eval.args=list(eval_metric_for_imputation="AUC",
#' eval_threshold=0.7,
#' output_options="best"),
#' clust=NULL,
#' output.dir=newwd)
#'
#'}
ENphylo_modeling<-function (input_data,
tree,
input_mask,
obs_col,
time_col = NULL,
min_occ_enfa = 30,
boot_test_perc = 20,
boot_reps = 10,
swap.args=list(nsim=10,si=0.2,si2=0.2),
eval.args=list(eval_metric_for_imputation="AUC",eval_threshold=0.7,output_options="best"),
clust = 0.5,
output.dir){
if (any(is.null(names(input_data))))
stop("all the elements in input_data list must be provided with a species name")
if (any(!names(input_data) %in% tree$tip.label))
stop("all species in input_data must be on the phylogenetic tree")
if(is.null(output.dir)) stop('argument "output.dir" is missing, with no default')
if(!is.null(swap.args)){
swargs<-c(nsim=10,si=0.2,si2=0.2)
if(any(!names(swargs)%in%names(swap.args)))
swap.args<-c(swap.args,swargs[which(!names(swargs)%in%names(swap.args))])
}
if(!is.null(eval.args)){
evargs<-c(eval_metric_for_imputation="AUC",eval_threshold=0.7,output_options="best")
if(any(!names(evargs)%in%names(eval.args)))
eval.args<-c(eval.args,evargs[which(!names(evargs)%in%names(eval.args))])
}
if (boot_reps > 0)
evaluate_imputed = TRUE else evaluate_imputed = FALSE
if (evaluate_imputed & length(eval.args$eval_metric_for_imputation) >1)
stop("Please, set just one evaluation metric for imputation")
if (evaluate_imputed & length(eval.args$output_options) >1)
stop("Please, set just one output option")
if (any(tree$edge.length == 0))
tree$edge.length[which(tree$edge.length < max(diag(vcv(tree)))/1000)] <- tree$edge.length[which(tree$edge.length <
max(diag(vcv(tree)))/1000)] + max(diag(vcv(tree))) *
0.001
if (!is.null(clust))
clust <- detectCores() * clust
start_data <- input_data[sapply(input_data, function(xx) class(xx)[1] ==
"sf")]
add_data <- input_data[sapply(input_data, function(xx) class(xx)[1] !=
"sf")]
all_models <- mapply(function(data, nam) {
message(paste("\n", "Modelling", nam, "with ENFA", "\n"))
if (is.null(time_col)) {
prova <- subset(data, as.data.frame(data)[, obs_col] ==
0)
} else {
prova <- subset(data, as.data.frame(data)[, obs_col] ==
0 & as.data.frame(data)[, time_col] == unique(as.data.frame(data)[,
time_col])[1])
}
qq <- input_mask
vv <- prova[, obs_col]
vv <- vect(vv)
ex <- extract(qq, vv)
qq1 <- rasterize(vv, qq)
qq_RTP <- apply(crds(qq1), 2, range)
qq_RTP <- ext(qq_RTP[, 1], qq_RTP[, 2])
qq_RTP <- extend(qq_RTP, res(qq)[1]*2)
maskk <- crop(qq, qq_RTP)
mydata <- DATA_PREPARATION(species_input_data = data,
obs_col = obs_col, input_mask = maskk, time_col = time_col)
if (nrow(mydata$ones_coords) >= min_occ_enfa) {
mymodel <- ENFA_CALIBRATION(formatted_data = mydata,
boot_test_perc = boot_test_perc, boot_reps = boot_reps,
sig_axes_selection = "brStick", clust = clust)
message(paste("\n", "Modelling", nam, "with ENFA: done.",
"\n"))
} else {
mymodel = NULL
message(paste("\n", "Modelling", nam, "with ENFA: skipped.",
"\n"))
}
return(list(call = "calibrated_enfa", formatted_data = mydata,
calibrated_model = mymodel))
}, data = start_data, nam = names(start_data), SIMPLIFY = FALSE)
names(all_models) <- names(start_data)
if (length(add_data) > 0) {
add_mm <- unlist(add_data, recursive = FALSE, use.names = FALSE)
names(add_mm) <- names(add_data)
all_models <- c(add_mm, all_models)
} else add_mm <- NULL
all_models <- all_models[match(names(input_data), names(all_models))]
enf_mod <- all_models
if (evaluate_imputed) {
all_models <- lapply(all_models, function(k) {
if (!is.null(k$calibrated_model)) {
if (mean(k$calibrated_model$evaluation[, eval.args$eval_metric_for_imputation]) <
eval.args$eval_threshold) {
k <- list(call = k$call, formatted_data = k$formatted_data,
calibrated_model = NULL)
}
}
k
})
}
if (all(sapply(all_models, function(k) !is.null(k$calibrated_model)) ==
TRUE)) {
message("All species are modelled with ENFA")
lapply(1:length(all_models), function(bb) {
model_outputs <- all_models[bb]
dir.create(file.path(output.dir, "ENphylo_enfa_models",
names(model_outputs)), recursive = TRUE)
writeRaster(all_models[[bb]]$formatted_data$study_area,
filename = paste0(output.dir, "/ENphylo_enfa_models/",
names(model_outputs), "/study_area.tif"),
overwrite = TRUE)
model_outputs[[1]]$formatted_data$study_area <- NULL
save(model_outputs, file = paste0(output.dir,
"/ENphylo_enfa_models/", names(model_outputs),
"/model_outputs.RData"))
})
} else {
if (any(tree$tip.label %in% names(all_models) == FALSE)) {
sp <- tree$tip.label[!tree$tip.label %in% names(all_models)]
tree <- drop.tip(tree, sp)
warning("Some species are not present in input_data. They will be dropped from the tree")
}
all_good <- all_models[!sapply(all_models, function(j) is.null(j$calibrated_model))]
all_out <- all_models[sapply(all_models, function(j) is.null(j$calibrated_model))]
if (Ntip(tree)<=10 && length(all_out) >= length(all_good) * 0.3) {
lapply(1:length(enf_mod), function(bb) {
model_outputs <- enf_mod[bb]
dir.create(file.path(output.dir, "ENphylo_enfa_models",
names(model_outputs)), recursive = TRUE)
writeRaster(enf_mod[[bb]]$formatted_data$study_area,
filename = paste0(output.dir, "/ENphylo_enfa_models/",
names(model_outputs), "/study_area.tif"), overwrite = TRUE)
enf_mod[[1]]$formatted_data$study_area <- NULL
save(model_outputs, file = paste0(output.dir, "/ENphylo_enfa_models/",
names(model_outputs), "/model_outputs.RData"))
})
message("Phylogenetic imputation is unfeasible, too few species. All species are modelled with ENFA")
}else {
if (length(all_out) > length(all_good) * 0.3) {
ENtree(tree=tree,
impa=tree$tip.label[tree$tip.label%in%names(all_out)],
imp.max=0.3)$trees->treeX
message(paste0("Since more than 30% of the total species have to be modelled with phylogenetic imputation, the phylogenetic tree was split in ",
length(treeX), " different phylogenies"))
}else treeX <- list(tree)
myimputed <- list()
for (jj in 1:length(treeX)) {
if (swap.args$nsim == 0)
stop("Please, set nsim as an integer > 0")
enf <- all_models[names(all_models) %in% treeX[[jj]]$tip.label]
myimputed[[jj]] <- IMPUTED_CALIBRATION(ENFA_output = enf,
tree = treeX[[jj]], nsim = swap.args$nsim, si = swap.args$si,
si2 = swap.args$si2, clust = clust, evaluate = evaluate_imputed,
boot_test_perc = boot_test_perc, boot_reps = boot_reps,
output_options = eval.args$output_options, eval.metric = eval.args$eval_metric_for_imputation,
eval_threshold = eval.args$eval_threshold)
}
gc()
myimputed <- unlist(myimputed, recursive = FALSE)
sel <- lapply(names(myimputed), function(x) {
if (is.null(enf_mod[x][[1]]$calibrated_model)) {
TRUE
} else {
a <- mean(enf_mod[x][[1]]$calibrated_model$evaluation[,
eval.args$eval_metric_for_imputation])
b <- mean(myimputed[x][[1]]$evaluation[, eval.args$eval_metric_for_imputation])
b > a
}
})
myimputed <- myimputed[which(sel == TRUE)]
myimputed <- mapply(function(x, y) {
list(call = "calibrated_imputed", formatted_data = y$formatted_data,
calibrated_model = x)
}, x = myimputed, y = all_models[names(myimputed)], SIMPLIFY = FALSE)
w <- match(names(myimputed), names(enf_mod))
rm(all_good)
rm(all_out)
enf_mod[w] <- myimputed
if (length(add_data) < 1) {
enf_mod2 <- enf_mod
} else {
enf_mod2 <- enf_mod[-which(names(enf_mod) %in% names(add_data))]
}
lapply(1:length(enf_mod2), function(bb) {
if (enf_mod2[[bb]]$call == "calibrated_enfa") {
model_outputs <- enf_mod2[bb]
dir.create(file.path(output.dir, "ENphylo_enfa_models",
names(model_outputs)), recursive = TRUE)
writeRaster(enf_mod2[[bb]]$formatted_data$study_area,
filename = paste0(output.dir, "/ENphylo_enfa_models/",
names(model_outputs), "/study_area.tif"),
overwrite = TRUE)
model_outputs[[1]]$formatted_data$study_area <- NULL
save(model_outputs, file = paste0(output.dir,
"/ENphylo_enfa_models/", names(model_outputs),
"/model_outputs.RData"))
}
if (enf_mod2[[bb]]$call == "calibrated_imputed") {
model_outputs <- enf_mod2[bb]
dir.create(file.path(output.dir, "ENphylo_imputed_models",
names(model_outputs)), recursive = TRUE)
writeRaster(enf_mod2[[bb]]$formatted_data$study_area,
filename = paste0(output.dir, "/ENphylo_imputed_models/",
names(model_outputs), "/study_area.tif"),
overwrite = TRUE)
model_outputs[[1]]$formatted_data$study_area <- NULL
save(model_outputs, file = paste0(output.dir,
"/ENphylo_imputed_models/", names(model_outputs),
"/model_outputs.RData"))
}
})
lapply(1:length(enf_mod), function(jj) enf_mod[[jj]]$formatted_data$study_area <<- NULL)
all_models <- enf_mod
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.