R/utils.R


# imports used in most of the functions
#' @import data.table
#' @import ggplot2
#' @importFrom foreach foreach %do% %dopar% %:% getDoParWorkers
#' @importFrom stats predict na.omit
#' @importFrom utils globalVariables setTxtProgressBar txtProgressBar
NULL


#' Combine objects
#'
#' You can use \code{function(model.list)} or \code{c(...)}
#' to combine multiple objects generated by functions from this package.
#'
#' The output of \code{function(list(model1, model2))} is the same as \code{c(function(model1), function(model2))}.
#' Note, however, that \code{c(...)} can only concatenate objects obtained from models with different methods (e.g. 'rf', 'gbm', ...).
#' \cr\cr
#' In contrast, \code{function(list(model1, model2))} will append '.1', '.2', etc.
#' if a method is repeated across models.
#' Alternatively, you can supply a named list, and the names will override the method labels.
#' @param x,model A list of models created by \code{\link[caret]{train}}.
#' @param ... For \code{function(model.list, ...)}, further arguments passed to the function.
#' For \code{c(...)}, multiple objects returned by the same function.
#' @name combine
#' @rdname combine
NULL
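
# Illustrative sketch of the two combine styles documented above. 'evaluate()'
# is a hypothetical stand-in for whichever function from this package produced
# the objects, and the fitted models are placeholders (not run):
if (FALSE) {
    rf_fit  <- caret::train(Species ~ ., data = iris, method = "rf")
    gbm_fit <- caret::train(Species ~ ., data = iris, method = "gbm")

    # c() concatenates outputs from models with *different* methods
    both <- c(evaluate(rf_fit), evaluate(gbm_fit))

    # a list makes repeated labels unique ('.1', '.2', ...); names, if
    # given, override the method labels
    both2 <- evaluate(list(forest = rf_fit, boosting = gbm_fit))
}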

globalVariables(c(".", ".SD", "method", "data", "metric", "variable", "value",
                  "error", "Resample", "thr", "coef"))


# Map abbreviated threshold codes to the full MaxEnt threshold rule names
maxentThr <- function(x) {
    return(switch(x,
                  "Min_Presence" = "Minimum training presence",
                  "10%_Presence" = "10 percentile training presence",
                  "Sens=Spec" = "Equal training sensitivity and specificity",
                  "MaxSens+Spec" = "Maximum training sensitivity plus specificity",
                  "Balance" = "Balance training omission, predicted area and threshold value",
                  "Entropy" = "Equate entropy of thresholded and original distributions"))
}
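
# Quick sketch of the lookup above, guarded so it never runs at package load:
if (FALSE) {
    maxentThr("Sens=Spec")     # "Equal training sensitivity and specificity"
    maxentThr("Min_Presence")  # "Minimum training presence"
}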



# Retrieve the observed response for newdata: use 'newy' if supplied, otherwise
# extract it via the model formula (requires the model to be trained with one)
get.response <- function(model, newdata, newy) {
    pred.name <- "testy"
    if (is.null(newy)) {
        if (inherits(model, "train.formula")) {
            pred.name <- all.vars(model$terms)[attr(model$terms, "response")]
        } else {
            stop("Either provide 'testy' or train using a formula")
        }
        newy <- newdata[[pred.name]]
    }
    if (!all(levels(newy) %in% model$levels))
        stop(pred.name, " must have levels ", paste(model$levels, collapse = " and "))

    return(newy)
}



# Build resampling indices, mirroring the 'method' options of caret::trainControl
createIndex <- function(y, method, number, repeats, p) {
    return(switch(tolower(method),
                  cv = caret::createFolds(y, number, returnTrain = TRUE),
                  repeatedcv = , adaptive_cv = caret::createMultiFolds(y, number, repeats),
                  boot = , adaptive_boot = caret::createResample(y, number),
                  lgocv = , adaptive_lgocv = caret::createDataPartition(y, number, p),
                  none = list(all = seq_along(y)),
                  stop("Value for 'method' must be one of cv, repeatedcv, adaptive_cv, boot, adaptive_boot, lgocv, adaptive_lgocv or none. Perhaps you forgot to set 'testindex' to evaluate?")
    ))
}
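
# Sketch of the dispatch above; switch() evaluates lazily, so arguments not
# used by the chosen method (here 'repeats' and 'p') can safely be NA (not run):
if (FALSE) {
    y <- factor(rep(c("presence", "absence"), each = 25))
    idx <- createIndex(y, "cv", number = 5, repeats = NA, p = NA)
    length(idx)  # 5 sets of training-row indices, as from caret::createFolds
}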

#' Functions to calculate confidence intervals
#'
#' \code{ci_95} calculates the half-width of the 95\% confidence interval and \code{se} calculates the standard error.
#' @param x Numeric vector for which to calculate the confidence interval.
#' @param alpha Probability used for the Student t distribution
#' (\code{alpha = 0.05} is equivalent to a 95\% CI).
#' @param na.rm A logical value indicating whether \code{NA} values should be stripped.
#' @rdname CI
#' @export
ci_95 <- function(x, alpha = 0.05, na.rm = FALSE) {
    if (na.rm) x <- x[!is.na(x)]
    n <- length(x)
    out <- if (n > 1) stats::sd(x) / sqrt(n) * stats::qt(alpha / 2, n - 1, lower.tail = FALSE) else 0
    return(out)
}

#' @rdname CI
#' @export
se <- function(x, na.rm = FALSE) {
    if (na.rm) x <- x[!is.na(x)]
    return(stats::sd(x) / sqrt(length(x)))
}
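
# Illustrative check of the two statistics above: for n > 1, ci_95() is the
# half-width of the Student-t interval, i.e. se(x) * qt(1 - alpha/2, n - 1):
if (FALSE) {
    x <- c(2.1, 2.5, 1.9, 2.4, 2.2)
    se(x)     # standard error of the mean
    ci_95(x)  # 95% CI half-width
    all.equal(ci_95(x), se(x) * stats::qt(0.975, length(x) - 1))  # TRUE
}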


# Check all models in the list and throw an error if any check fails
check_list <- function(model.list) {

    # check if all objects are a train
    if (!all(sapply(model.list, inherits, "train")))
        stop("All elements in the list must be objects returned by caret::train.")

    # check if all models are of the same problem type
    modelType <- sapply(model.list, `[[`, "modelType")
    if (!all(modelType == modelType[1]))
        stop("All models should be of the same type (classification or regression).")

    # check that the first model contains all coefficients used in the other models
    coefs <- lapply(model.list, `[[`, "coefnames")
    coefs_check <- all(sapply(coefs, function(x, y) all(x %in% y), y = coefs[[1]]))
    if (!coefs_check) stop("The first model must contain all coefficients used in the following models.")

    return(invisible(NULL))
}



# return models with unique names
check_names <- function(model.list) {
    if (is.null(names(model.list))) {
        m.names <- sapply(model.list, function(x) x$modelInfo$label)
    } else {
        m.names <- names(model.list)
    }

    m.names <- make.unique(m.names)
    return(mapply(setname, model = model.list, name = m.names, SIMPLIFY = FALSE))

}
setname <- function(model, name) {
    model$modelInfo$label <- name
    return(model)
}
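
# check_names() leans on make.unique() to disambiguate repeated labels; a tiny
# standalone illustration of that behaviour:
if (FALSE) {
    make.unique(c("Random Forest", "Random Forest", "eXtreme Gradient Boosting"))
    # "Random Forest" "Random Forest.1" "eXtreme Gradient Boosting"
}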


# check models before concatenating
check_c <- function(model.list, className) {
    if (!all(sapply(model.list, inherits, what = className)))
        stop("All objects must be an output of '", className, "'.")

    methods <- unlist(lapply(model.list, function(x) levels(x[[1]]$method)))
    if (any(duplicated(methods)))
        stop("Models should have different methods.")

    return(invisible(NULL))
}


check_train <- function(model) {
    if (!model$control$returnData)
        stop("returnData should be TRUE in the trControl.")
    return(invisible(NULL))
}

check_list_ensemble <- function(model.list) {
    if (!all(sapply(model.list, function(x) x$control$returnData)))
        stop("returnData should be TRUE in the trControl for all models.")

    if (model.list[[1]]$modelType == "Classification") { # assumes check_list() has already passed
        check_lvl <- sapply(model.list, function(x) length(x$levels))
        if (any(check_lvl > 2))
            warning("Multiclass problem detected; the confidence map will be based on the probabilities of the first class only.")
    }

    return(invisible(NULL))
}

# Predictor names used by a train object, i.e. trainingData columns minus special ones
getcoefs <- function(model, strip = c(".outcome", ".weights")) {
    cnames <- colnames(model$trainingData)
    return(cnames[!(cnames %in% strip)])
}
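
# getcoefs() only inspects column names, so a minimal mock of a train object is
# enough to illustrate it (assumption: caret stores the outcome as '.outcome'):
if (FALSE) {
    fake <- list(trainingData = data.frame(.outcome = 1, bio1 = 10, bio12 = 200))
    getcoefs(fake)  # "bio1" "bio12"
}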


# Rebuild the argument list needed to refit a caret model with resampling disabled
create_args <- function(model) {

    control <- model$control
    control$method <- "none"
    control$index <- NULL
    control$indexOut <- NULL
    control$savePredictions <- FALSE # to use a little less memory

    # make a list of arguments
    args <- c(list(.outcome ~ .,
                   preProcess = model$call$preProcess,
                   metric = model$metric,
                   trControl = control,
                   tuneGrid = model$bestTune),
              model$dots)

    args$method <- if (model$method == "custom") model$modelInfo else model$method

    return(args)
}
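
# Sketch of how the argument list above could be consumed: refit the model on
# its full training data with resampling turned off (the actual caller in this
# package may differ; 'model' is an existing caret::train fit):
if (FALSE) {
    args <- create_args(model)
    args$data <- model$trainingData
    refit <- do.call(caret::train, args)
}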


# Convert raster layers to a data.frame of predictors matching the model's training data
raster2data <- function(raster, model) {
    r <- raster::as.data.frame(raster, na.rm = FALSE)
    r <- stats::na.omit(r)
    colnames(r) <- sub("_VALUE$", "", colnames(r))

    # convert categorical predictors back to factors, if any
    varfactor <- names(model$xlevels)
    if (length(varfactor) > 0) {
        for (f in varfactor) {
            r[, f] <- as.factor(r[, f])
        }
    }

    return(r)
}
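
# Usage sketch, assuming the 'raster' package and a model whose predictors
# match the raster layer names ('predictors.tif' is a hypothetical file):
if (FALSE) {
    preds <- raster::stack("predictors.tif")
    newdata <- raster2data(preds, model)  # NA cells dropped, factor columns recreated
    p <- predict(model, newdata = newdata)
}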