R/ENphylo_modeling.R
In RRgeo: Species Distribution Modelling for Rare Species

#'@title Calculating species marginality and specialization via ENFA and
#'  phylogenetic imputation
#'@description The function computes vectors of marginality and specialization
#'  according to \cite{Rinnan & Lawler (2019)} via Environmental Niche Factor
#'  Analysis (ENFA) and phylogenetic imputation (\cite{Garland & Ives, 2000}).
#'  It takes a list of \code{Simple Features} (or \pkg{sf}) objects and a
#'  phylogenetic tree to train ENFA and/or ENphylo models. Both model techniques
#'  are calibrated and evaluated while accounting for phylogenetic uncertainty.
#'  Calibrations are made on a random subset of the data under the bootstrap
#'  cross-validation scheme. The predictive power of the different models is
#'  estimated using five different evaluation metrics.
#'@usage ENphylo_modeling(input_data, tree, input_mask, obs_col, time_col=NULL,
#'  min_occ_enfa=30, boot_test_perc=20, boot_reps=10, swap.args= list(nsim=10,
#'  si=0.2, si2=0.2), eval.args=list(eval_metric_for_imputation="AUC",
#'  eval_threshold=0.7,output_options="best"),clust=0.5,output.dir)
#'@param input_data a list of \code{sf::data.frame} objects containing species
#'  occurrence data in binary format (ones for presence, zero for background
#'  points) along with the explanatory continuous variables to be used in ENFA
#'  or ENphylo. Each element of the list must be named using the names of the
#'  target species. Alternatively, ENFA model outputs generated through
#'  \code{ENphylo_modeling} can be supplied as named elements of
#'  \code{input_data} list.
#'@param tree an object of class \code{phylo} including all the species listed
#'  in \code{input_data}. The tree needs not to be ultrametric or fully
#'  dichotomous. Any species in the tree that do not match species in
#'  \code{input_data} are automatically dropped from the tree.
#'@param input_mask a \code{SpatRaster} object. It represents the geographical
#'  mask defining the spatial domain encompassing the background area enclosing
#'  all the species in \code{input_data}.
#'@param obs_col character. Name of the \code{input_data} column containing the
#'  vector of species occurrence data in binary format.
#'@param time_col character. Name of the \code{input_data} column containing the
#'  time intervals associated to each species presence and background point
#'  (optional).
#'@param min_occ_enfa numeric. The minimum number of occurrence data required
#'  for a species to be modeled with ENFA.
#'@param boot_test_perc numeric. Percentage of data (ranging between 0 and 100)
#'  used to calibrate ENFA and/or ENphylo models within a bootstrap
#'  cross-validation scheme. The remaining percentage
#'  (\code{100-boot_test_perc}) will be used to evaluate model performances.
#'@param boot_reps numeric. Number of evaluation runs performed within the
#'  bootstrap cross-validation scheme to evaluate ENFA and/or ENphylo models. If
#'  set to 0, models evaluation is skipped and the internal evaluation element
#'  returns \code{NULL}.
#'@param swap.args list of ENphylo parameters. It includes: \itemize{ \item nsim
#'  = number of alternative phylogenies generated by altering topology and
#'  branch lengths of the reference tree by means of
#'  \code{\link[RRphylo]{swapONE}}. \code{nsim} must be greater than or equal to
#'  1 (see details); \item si,si2 = arguments passed to
#'  \code{RRphylo::swapONE}.}
#'@param eval.args list of evaluation model parameters. It includes:
#'  \itemize{\item eval_metric_for_imputation = evaluation metric used to select
#'  the most accurate ENphylo models. The viable options are: \code{"AUC"},
#'  \code{"TSS"}, \code{"CBI"}, \code{"SORENSEN"}, or \code{"OMR"};
#'  \item eval_threshold = the minimum evaluation score required to assess ENFA
#'  and ENphylo performance. ENFA models having
#'  \code{eval_metric_for_imputation} lower than \code{eval_threshold} are
#'  compared to ENphylo models to keep the one fitting best. Additionally,
#'  within ENphylo, models derived from the swapped trees
#'  having \code{eval_metric_for_imputation} lower than \code{eval_threshold}
#'  are excluded from the output; \item output_options = the strategy adopted to
#'  return ENphylo models results (see details). The viable options are:
#'  \code{"full"}, \code{"weighted.mean"}, and \code{"best"}.
#'  }
#'@param clust numeric. The proportion of cores used to train ENFA and ENphylo
#'  models. If \code{NULL}, parallel computing is disabled. It is set at 0.5 by
#'  default.
#'@param output.dir the file path wherein \code{ENphylo_modeling} creates
#'  "ENphylo_enfa_models" and "ENphylo_imputed_models" folders to store modeling
#'  outputs (see details).
#'@author Alessandro Mondanaro, Mirko Di Febbraro, Silvia Castiglione, Carmela
#'  Serio, Marina Melchionna, Pasquale Raia
#'@details \code{ENphylo_modeling} automatically arranges \code{input_data} in a
#'  suitable format to run ENFA or ENphylo. The internal call of the function is
#'  \code{"calibrated_enfa"} for ENFA and \code{"calibrated_imputed"} for
#'  ENphylo, respectively.
#'
#'@details \strong{Phylogenetic uncertainty}
#'
#'  The function does not work with \code{nsim} < 1 since one of the strongest
#'  points of \code{ENphylo_modeling} is to test alternative phylogenies to
#'  provide the most accurate reconstruction of species environmental
#'  preferences. Similarly, setting \code{nsim = 1} limits the power of the
#'  function, as it will use the original tree without generating alternative
#'  phylogenies.
#'
#'@details \strong{Phylogenetic Imputation}
#'
#'  \code{ENphylo_modeling} automatically switches from ENFA to ENphylo
#'  algorithm for any species having less than \code{min_occ_enfa} occurrences
#'  or ENFA model accuracy below \code{eval_threshold}. In this latter case, the
#'  function performs both models and retains the one performing best according
#'  to \code{eval_metric_for_imputation}. Phylogenetic imputation is allowed for
#'  up to 30\% of the species on the tree. If the number of species to impute
#'  exceeds 30\%, \code{ENphylo_modeling} automatically splits the original tree
#'  into smaller subtrees, so that the maximum percentage of imputation is
#'  observed. Each subtree is designed to impute phylogenetically distant
#'  species and to retain species phylogenetically close to the taxa to be
#'  imputed (so that imputation is robust). In this case, the function prints
#'  the number of phylogenies used.
#'
#'@details \strong{Outputs}
#'
#'  If \code{ENphylo_modeling} runs the ENphylo algorithm, the outputs depend on
#'  the strategy adopted by the user through the \code{output_options} argument.
#'  If \code{output_options="full"}, all CO matrices and evaluation metrics for
#'  all the swapped trees tested are returned. Under
#'  \code{output_options="weighted.mean"}, the output consists of a subset of CO
#'  matrices and evaluation metrics for those tree swapping iterations achieving
#'  a predictive accuracy in terms of \code{eval_metric_for_imputation} above
#'  \code{eval_threshold}. Finally, if \code{output_options="best"}, a single CO
#'  matrix and evaluation scores list corresponding to the most accurate swapped
#'  tree is returned. If any tree swapping iterations under either \code{"best"}
#'  or \code{"weighted.mean"} results in accuracy below the threshold, the
#'  function automatically switches to \code{"full"} strategy.
#'
#'  Eventually, the function creates two new folders, "ENphylo_enfa_models" and
#'  "ENphylo_imputed_models", in \code{output.dir}. In each of these folders, a
#'  number of new named subfolders equal to the number of modeled species are
#'  created. Therein, model outputs and background area are saved as
#'  \code{model_outputs.RData} and \code{study_area.tif}, respectively.
#'  \code{model_outputs.RData} includes a list of three elements, regardless of
#'  whether ENFA or ENphylo is used: \enumerate{ \item
#'  \strong{$call} a character specifying the algorithm used to model the
#'  species (i.e. ENFA or ENphylo). \item \strong{$formatted data} a list of
#'  input data formatted to run either ENFA or ENphylo algorithms. Specifically,
#'  the list reports: the presence data points (\code{$input_ones}),
#'  the background points (\code{$input_back}),the name
#'  of the columns associated to the arguments \code{OBS_col} and
#'  \code{time_col} (if specified), the name of the column containing the cell
#'  numbers (\code{geoID_col}), and the coordinates of presence data only
#'  (\code{$one_coords}). \item \strong{$calibrated_model} a list. The output
#'  objects are different depending on whether ENFA or ENphylo is used to model
#'  the species:}
#'@details  ENFA
#'@details \itemize{\item$call: a character specifying the algorithm used.
#'  \item$full_ model: a list containing marginality and specialization factors,
#'  the 'co' matrix, the number of significant axes, and all the other objects
#'  generated by applying ENFA on the entire occurrence dataset (see
#'  \cite{Rinnan et al. 2019} for additional details). \item$evaluation: a
#'  matrix containing the evaluation scores of the ENFA model assessed by all
#'  possible evaluation metrics (i.e. Area Under the Curve (AUC), True Skill
#'  Statistic (TSS), Boyce Index (CBI), Sorensen Index, and Omission Rate (OMR))
#'  for each model evaluations run.}
#'@details ENphylo
#'@details \itemize{\item$call: a character specifying the algorithm used.
#'  \item$co: a list of the 'co' matrices of length equal to the number of
#'  alternative phylogenies tested (i.e. \code{nsim} argument). The number of
#'  'co' matrices also reflects the selected output_option strategy.
#'  \item$evaluation: a data.frame containing the evaluation scores of ENphylo
#'  model assessed by all possible evaluation metrics for each alternative
#'  phylogeny. The output of this object depends on the strategy adopted by the
#'  user through the \code{output_options} argument.Specifically, the function
#'  internally selects the model (or models) with the highest evaluation score
#'  according to the specified evaluation metric.\item$output_options:  a
#'  character vector including the argument \code{output_options} and
#'  \code{eval_metric_for_imputation} set to run the of ENphylo model.}
#'@return The function does not return the output into \code{.GlobalEnv}. Use
#'  the function \code{\link{getENphylo_results}} to collect results from local
#'  folders.
#'@importFrom ape drop.tip vcv cophenetic.phylo keep.tip
#'@importFrom methods extends as
#'@importFrom parallel detectCores parLapply
#'@importFrom stats hclust cutree as.dist
#'@importFrom terra extend res crop vect rasterize crds ext writeRaster
#'@export
#'@seealso \link{getENphylo_results}; \href{../doc/ENphylo.html}{\code{ENphylo} vignette}
#'@references Rinnan, D. S., &  Lawler, J. (2019). Climate-niche factor
#'  analysis: a spatial approach to quantifying species vulnerability to climate
#'  change. \emph{Ecography}, 42(9), 1494–1503. doi/full/10.1111/ecog.03937
#'@references Garland, T., & Ives, A. R. (2000). Using the past to predict the
#'  present: Confidence intervals for regression equations in phylogenetic
#'  comparative methods. \emph{American Naturalist}, 155(3),346–364.
#'  doi.org/10.1086/303327
#'@references Mondanaro, A., Di Febbraro, M., Castiglione, S., Melchionna, M.,
#'  Serio, C., Girardi, G., Blefiore, A.M., & Raia, P. (2023). ENphylo: A new
#'  method to model the distribution of extremely rare species. \emph{Methods in
#'  Ecology and Evolution}, 14: 911-922. doi:10.1111/2041-210X.14066
#'@examples
#' \donttest{
#' library(ape)
#' library(terra)
#' library(sf)
#' library(RRgeo)
#'
#' newwd<-tempdir()
#' # newwd<-"YOUR_DIRECTORY"
#' latesturl<-RRgeo:::get_latest_version("12734585")
#' curl::curl_download(url = paste0(latesturl,"/files/dat.Rda?download=1"),
#'                     destfile = file.path(newwd,"dat.Rda"), quiet = FALSE)
#' load(file.path(newwd,"dat.Rda"))
#' read.tree(system.file("exdata/Eucopdata_tree.txt", package="RRgeo"))->tree
#' tree$tip.label<-gsub("_"," ",tree$tip.label)
#' curl::curl_download(paste0(latesturl,"/files/X35kya.tif?download=1"),
#'                     destfile = file.path(newwd,"X35kya.tif"), quiet = FALSE)
#' rast(file.path(newwd,"X35kya.tif"))->map35
#' project(map35,st_crs(dat[[1]])$proj4string,res = 50000)->map
#'
#' ENphylo_modeling(input_data=dat[c(1,11)],
#'                  tree=tree,
#'                  input_mask=map[[1]],
#'                  obs_col="OBS",
#'                  time_col="age",
#'                  min_occ_enfa=15,
#'                  boot_test_perc=20,
#'                  boot_reps=10,
#'                  swap.args=list(nsim=5,si=0.2,si2=0.2),
#'                  eval.args=list(eval_metric_for_imputation="AUC",
#'                                 eval_threshold=0.7,
#'                                 output_options="best"),
#'                  clust=NULL,
#'                  output.dir=newwd)
#'
#'}


ENphylo_modeling<-function (input_data,
                            tree,
                            input_mask,
                            obs_col,
                            time_col = NULL,
                            min_occ_enfa = 30,
                            boot_test_perc = 20,
                            boot_reps = 10,
                            swap.args=list(nsim=10,si=0.2,si2=0.2),
                            eval.args=list(eval_metric_for_imputation="AUC",eval_threshold=0.7,output_options="best"),
                            clust = 0.5,
                            output.dir){

  if (any(is.null(names(input_data))))
    stop("all the elements in input_data list must be provided with a species name")
  if (any(!names(input_data) %in% tree$tip.label))
    stop("all species in input_data must be on the phylogenetic tree")
  if(is.null(output.dir))  stop('argument "output.dir" is missing, with no default')

  if(!is.null(swap.args)){
    swargs<-c(nsim=10,si=0.2,si2=0.2)
    if(any(!names(swargs)%in%names(swap.args)))
      swap.args<-c(swap.args,swargs[which(!names(swargs)%in%names(swap.args))])
  }
  if(!is.null(eval.args)){
    evargs<-c(eval_metric_for_imputation="AUC",eval_threshold=0.7,output_options="best")
    if(any(!names(evargs)%in%names(eval.args)))
      eval.args<-c(eval.args,evargs[which(!names(evargs)%in%names(eval.args))])
  }
  if (boot_reps > 0)
    evaluate_imputed = TRUE else evaluate_imputed = FALSE
    if (evaluate_imputed & length(eval.args$eval_metric_for_imputation) >1)
      stop("Please, set just one evaluation metric for imputation")
    if (evaluate_imputed & length(eval.args$output_options) >1)
      stop("Please, set just one output option")
    if (any(tree$edge.length == 0))
      tree$edge.length[which(tree$edge.length < max(diag(vcv(tree)))/1000)] <- tree$edge.length[which(tree$edge.length <
                                                                                                        max(diag(vcv(tree)))/1000)] + max(diag(vcv(tree))) *
        0.001
    if (!is.null(clust))
      clust <- detectCores() * clust
    start_data <- input_data[sapply(input_data, function(xx) class(xx)[1] ==
                                      "sf")]
    add_data <- input_data[sapply(input_data, function(xx) class(xx)[1] !=
                                    "sf")]
    all_models <- mapply(function(data, nam) {
      message(paste("\n", "Modelling", nam, "with ENFA", "\n"))
      if (is.null(time_col)) {
        prova <- subset(data, as.data.frame(data)[, obs_col] ==
                          0)
      } else {
        prova <- subset(data, as.data.frame(data)[, obs_col] ==
                          0 & as.data.frame(data)[, time_col] == unique(as.data.frame(data)[,
                                                                                            time_col])[1])
      }
      qq <- input_mask
      vv <- prova[, obs_col]
      vv <- vect(vv)
      ex <- extract(qq, vv)
      qq1 <- rasterize(vv, qq)
      qq_RTP <- apply(crds(qq1), 2, range)
      qq_RTP <- ext(qq_RTP[, 1], qq_RTP[, 2])
      qq_RTP <- extend(qq_RTP, res(qq)[1]*2)
      maskk <- crop(qq, qq_RTP)
      mydata <- DATA_PREPARATION(species_input_data = data,
                                 obs_col = obs_col, input_mask = maskk, time_col = time_col)
      if (nrow(mydata$ones_coords) >= min_occ_enfa) {
        mymodel <- ENFA_CALIBRATION(formatted_data = mydata,
                                    boot_test_perc = boot_test_perc, boot_reps = boot_reps,
                                    sig_axes_selection = "brStick", clust = clust)
        message(paste("\n", "Modelling", nam, "with ENFA: done.",
                  "\n"))
      } else {
        mymodel = NULL
        message(paste("\n", "Modelling", nam, "with ENFA: skipped.",
                  "\n"))
      }
      return(list(call = "calibrated_enfa", formatted_data = mydata,
                  calibrated_model = mymodel))
    }, data = start_data, nam = names(start_data), SIMPLIFY = FALSE)
    names(all_models) <- names(start_data)
    if (length(add_data) > 0) {
      add_mm <- unlist(add_data, recursive = FALSE, use.names = FALSE)
      names(add_mm) <- names(add_data)
      all_models <- c(add_mm, all_models)
    } else add_mm <- NULL

    all_models <- all_models[match(names(input_data), names(all_models))]
    enf_mod <- all_models
    if (evaluate_imputed) {
      all_models <- lapply(all_models, function(k) {
        if (!is.null(k$calibrated_model)) {
          if (mean(k$calibrated_model$evaluation[, eval.args$eval_metric_for_imputation]) <
              eval.args$eval_threshold) {
            k <- list(call = k$call, formatted_data = k$formatted_data,
                      calibrated_model = NULL)
          }
        }
        k
      })
    }
    if (all(sapply(all_models, function(k) !is.null(k$calibrated_model)) ==
            TRUE)) {
      message("All species are modelled with ENFA")

      lapply(1:length(all_models), function(bb) {
        model_outputs <- all_models[bb]
        dir.create(file.path(output.dir, "ENphylo_enfa_models",
                             names(model_outputs)), recursive = TRUE)
        writeRaster(all_models[[bb]]$formatted_data$study_area,
                    filename = paste0(output.dir, "/ENphylo_enfa_models/",
                                      names(model_outputs), "/study_area.tif"),
                    overwrite = TRUE)
        model_outputs[[1]]$formatted_data$study_area <- NULL
        save(model_outputs, file = paste0(output.dir,
                                          "/ENphylo_enfa_models/", names(model_outputs),
                                          "/model_outputs.RData"))
      })
    } else {
      if (any(tree$tip.label %in% names(all_models) == FALSE)) {
        sp <- tree$tip.label[!tree$tip.label %in% names(all_models)]
        tree <- drop.tip(tree, sp)
        warning("Some species are not present in input_data. They will be dropped from the tree")
      }

      all_good <- all_models[!sapply(all_models, function(j) is.null(j$calibrated_model))]
      all_out <- all_models[sapply(all_models, function(j) is.null(j$calibrated_model))]

      if (Ntip(tree)<=10 && length(all_out) >= length(all_good) * 0.3) {

        lapply(1:length(enf_mod), function(bb) {
          model_outputs <- enf_mod[bb]
          dir.create(file.path(output.dir, "ENphylo_enfa_models",
                               names(model_outputs)), recursive = TRUE)
          writeRaster(enf_mod[[bb]]$formatted_data$study_area,
                      filename = paste0(output.dir, "/ENphylo_enfa_models/",
                                        names(model_outputs), "/study_area.tif"), overwrite = TRUE)
          enf_mod[[1]]$formatted_data$study_area <- NULL
          save(model_outputs, file = paste0(output.dir, "/ENphylo_enfa_models/",
                                            names(model_outputs), "/model_outputs.RData"))
        })
        message("Phylogenetic imputation is unfeasible, too few species. All species are modelled with ENFA")
      }else {

        if (length(all_out) > length(all_good) * 0.3) {

          ENtree(tree=tree,
                 impa=tree$tip.label[tree$tip.label%in%names(all_out)],
                 imp.max=0.3)$trees->treeX

          message(paste0("Since more than 30% of the total species have to be modelled with phylogenetic imputation, the phylogenetic tree was split in ",
                       length(treeX), " different phylogenies"))
        }else treeX <- list(tree)


        myimputed <- list()
        for (jj in 1:length(treeX)) {
          if (swap.args$nsim == 0)
            stop("Please, set nsim as an integer > 0")
          enf <- all_models[names(all_models) %in% treeX[[jj]]$tip.label]
          myimputed[[jj]] <- IMPUTED_CALIBRATION(ENFA_output = enf,
                                                 tree = treeX[[jj]], nsim = swap.args$nsim, si = swap.args$si,
                                                 si2 = swap.args$si2, clust = clust, evaluate = evaluate_imputed,
                                                 boot_test_perc = boot_test_perc, boot_reps = boot_reps,
                                                 output_options = eval.args$output_options, eval.metric = eval.args$eval_metric_for_imputation,
                                                 eval_threshold = eval.args$eval_threshold)
        }
        gc()
        myimputed <- unlist(myimputed, recursive = FALSE)
        sel <- lapply(names(myimputed), function(x) {
          if (is.null(enf_mod[x][[1]]$calibrated_model)) {
            TRUE
          } else {
            a <- mean(enf_mod[x][[1]]$calibrated_model$evaluation[,
                                                                  eval.args$eval_metric_for_imputation])
            b <- mean(myimputed[x][[1]]$evaluation[, eval.args$eval_metric_for_imputation])
            b > a
          }
        })
        myimputed <- myimputed[which(sel == TRUE)]
        myimputed <- mapply(function(x, y) {
          list(call = "calibrated_imputed", formatted_data = y$formatted_data,
               calibrated_model = x)
        }, x = myimputed, y = all_models[names(myimputed)], SIMPLIFY = FALSE)
        w <- match(names(myimputed), names(enf_mod))
        rm(all_good)
        rm(all_out)
        enf_mod[w] <- myimputed
        if (length(add_data) < 1) {
          enf_mod2 <- enf_mod
        } else {
          enf_mod2 <- enf_mod[-which(names(enf_mod) %in% names(add_data))]
        }
        lapply(1:length(enf_mod2), function(bb) {
          if (enf_mod2[[bb]]$call == "calibrated_enfa") {
            model_outputs <- enf_mod2[bb]
            dir.create(file.path(output.dir, "ENphylo_enfa_models",
                                 names(model_outputs)), recursive = TRUE)
            writeRaster(enf_mod2[[bb]]$formatted_data$study_area,
                        filename = paste0(output.dir, "/ENphylo_enfa_models/",
                                          names(model_outputs), "/study_area.tif"),
                        overwrite = TRUE)
            model_outputs[[1]]$formatted_data$study_area <- NULL
            save(model_outputs, file = paste0(output.dir,
                                              "/ENphylo_enfa_models/", names(model_outputs),
                                              "/model_outputs.RData"))
          }
          if (enf_mod2[[bb]]$call == "calibrated_imputed") {
            model_outputs <- enf_mod2[bb]
            dir.create(file.path(output.dir, "ENphylo_imputed_models",
                                 names(model_outputs)), recursive = TRUE)
            writeRaster(enf_mod2[[bb]]$formatted_data$study_area,
                        filename = paste0(output.dir, "/ENphylo_imputed_models/",
                                          names(model_outputs), "/study_area.tif"),
                        overwrite = TRUE)
            model_outputs[[1]]$formatted_data$study_area <- NULL
            save(model_outputs, file = paste0(output.dir,
                                              "/ENphylo_imputed_models/", names(model_outputs),
                                              "/model_outputs.RData"))
          }
        })
        lapply(1:length(enf_mod), function(jj) enf_mod[[jj]]$formatted_data$study_area <<- NULL)
        all_models <- enf_mod
      }
    }
}