Nothing
#' Calculate Dissimilarity Index of training data
#' @description
#' This function estimates the Dissimilarity Index (DI)
#' within the training data set used for a prediction model.
#' Optionally, the local point density can also be calculated.
#' Predictors can be weighted based on the internal
#' variable importance of the machine learning algorithm used for model training.
#' @note
#' This function is called within \code{\link{aoa}} to estimate the DI and AOA of new data.
#' However, it may also be used on its own if only the DI of training data is of interest,
#' or to facilitate a parallelization of \code{\link{aoa}} by avoiding a repeated calculation of the DI within the training data.
#'
#' @param model A train object created with caret used to extract weights from (based on variable importance) as well as cross-validation folds
#' @param train A data.frame containing the data used for model training. Only required when no model is given
#' @param weight A data.frame containing weights for each variable. Only required if no model is given.
#' @param variables character vector of predictor variables. if "all" then all variables
#' of the model are used or if no model is given then of the train dataset.
#' @param CVtest list or vector. Either a list where each element contains the data points used for testing during the cross validation iteration (i.e. held back data).
#' Or a vector that contains the ID of the fold for each training point.
#' Only required if no model is given.
#' @param CVtrain list. Each element contains the data points used for training during the cross validation iteration (i.e. held back data).
#' Only required if no model is given and only required if CVtrain is not the opposite of CVtest (i.e. if a data point is not used for testing, it is used for training).
#' Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.
#' @param method Character. Method used for distance calculation. Currently euclidean distance (L2) and Mahalanobis distance (MD) are implemented but only L2 is tested. Note that MD takes considerably longer.
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param LPD Logical. Indicates whether the local point density should be calculated or not.
#' @param verbose Logical. Print progress or not?
#'
#' @seealso \code{\link{aoa}}
#' @importFrom graphics boxplot
#' @import ggplot2
#'
#' @return A list of class \code{trainDI} containing:
#' \item{train}{A data frame containing the training data}
#' \item{weight}{A data frame with weights based on the variable importance.}
#' \item{variables}{Names of the used variables}
#' \item{catvars}{Which variables are categorial}
#' \item{scaleparam}{Scaling parameters. Output from \code{scale}}
#' \item{trainDist_avrg}{A data frame with the average distance of each training point to every other point}
#' \item{trainDist_avrgmean}{The mean of trainDist_avrg. Used for normalizing the DI}
#' \item{trainDI}{Dissimilarity Index of the training data}
#' \item{threshold}{The DI threshold used for inside/outside AOA}
#' \item{trainLPD}{LPD of the training data}
#' \item{avrgLPD}{Average LPD of the training data}
#'
#'
#'
#' @export trainDI
#'
#' @author
#' Hanna Meyer, Marvin Ludwig, Fabian Schumacher
#'
#' @references Meyer, H., Pebesma, E. (2021): Predicting into unknown space?
#' Estimating the area of applicability of spatial prediction models.
#' \doi{10.1111/2041-210X.13650}
#'
#'
#' @examples
#' \dontrun{
#' library(sf)
#' library(terra)
#' library(caret)
#' library(CAST)
#'
#' # prepare sample data:
#' data("splotdata")
#' splotdata = st_drop_geometry(splotdata)
#'
#' # train a model:
#' set.seed(100)
#' model <- caret::train(splotdata[,6:16],
#' splotdata$Species_richness,
#' importance=TRUE, tuneLength=1, ntree = 15, method = "rf",
#' trControl = trainControl(method="cv", number=5, savePredictions=T))
#' # variable importance is used for scaling predictors
#' plot(varImp(model,scale=FALSE))
#'
#' # calculate the DI of the trained model:
#' DI = trainDI(model=model)
#' plot(DI)
#'
#' #...or calculate the DI and LPD of the trained model:
#' # DI = trainDI(model=model, LPD = TRUE)
#'
#' # the DI can now be used to compute the AOA (here with LPD):
#' studyArea = rast(system.file("extdata/predictors_chile.tif", package = "CAST"))
#' AOA = aoa(studyArea, model = model, trainDI = DI, LPD = TRUE, maxLPD = 1)
#' print(AOA)
#' plot(AOA)
#' plot(AOA$AOA)
#' plot(AOA$LPD)
#' }
#'
trainDI <- function(model = NA,
train = NULL,
variables = "all",
weight = NA,
CVtest = NULL,
CVtrain = NULL,
method="L2",
useWeight = TRUE,
LPD = FALSE,
verbose = TRUE){
# get parameters if they are not provided in function call-----
if(is.null(train)){train = aoa_get_train(model)}
if(length(variables) == 1){
if(variables == "all"){
variables = aoa_get_variables(variables, model, train)
}
}
if(is.na(weight)[1]){
if(useWeight){
weight = aoa_get_weights(model, variables = variables)
}else{
message("variable are not weighted. see ?aoa")
weight <- t(data.frame(rep(1,length(variables))))
names(weight) <- variables
}
}else{
weight <- user_weights(weight, variables)
}
# get CV folds from model or from parameters
folds <- aoa_get_folds(model,CVtrain,CVtest)
CVtest <- folds[[2]]
CVtrain <- folds[[1]]
# check for input errors -----
if(nrow(train)<=1){stop("at least two training points need to be specified")}
# reduce train to specified variables
train <- train[,na.omit(match(variables, names(train)))]
train_backup <- train
# convert categorial variables
catupdate <- aoa_categorial_train(train, variables, weight)
train <- catupdate$train
weight <- catupdate$weight
# scale train
train <- scale(train)
# make sure all variables have variance
if (any(apply(train, 2, FUN=function(x){all(is.na(x))}))){
stop("some variables in train seem to have no variance")
}
# save scale param for later
scaleparam <- attributes(train)
# multiply train data with variable weights (from variable importance)
if(!inherits(weight, "error")&!is.null(unlist(weight))){
train <- sapply(1:ncol(train),function(x){train[,x]*unlist(weight[x])})
}
# calculate average mean distance between training data
trainDist_avrg <- c()
trainDist_min <- c()
if(method=="MD"){
if(dim(train)[2] == 1){
S <- matrix(stats::var(train), 1, 1)
} else {
S <- stats::cov(train)
}
S_inv <- MASS::ginv(S)
}
if (verbose) {
message("Computing DI of training data...")
pb <- txtProgressBar(min = 0,
max = nrow(train),
style = 3)
}
for(i in seq(nrow(train))){
# distance to all other training data (for average)
## redundant distance calculation (removed 13.03.24)
#trainDistAll <- .alldistfun(t(train[i,]), train, method, S_inv=S_inv)[-1]
#trainDist_avrg <- append(trainDist_avrg, mean(trainDistAll, na.rm = TRUE))
# calculate distance to other training data:
trainDist <- matrix(.alldistfun(t(matrix(train[i,])), train, method, sorted = FALSE, S_inv))
trainDist[i] <- NA
trainDist_avrg <- append(trainDist_avrg, mean(trainDist, na.rm = TRUE))
# mask of any data that are not used for training for the respective data point (using CV)
whichfold <- NA
if(!is.null(CVtrain)&!is.null(CVtest)){
whichfold <- as.numeric(which(lapply(CVtest,function(x){any(x==i)})==TRUE)) # index of the fold where i is held back
if(length(whichfold)>1){stop("a datapoint is used for testing in more than one fold. currently this option is not implemented")}
if(length(whichfold)!=0){ # in case that a data point is never used for testing
trainDist[!seq(nrow(train))%in%CVtrain[[whichfold]]] <- NA # everything that is not in the training data for i is ignored
}
if(length(whichfold)==0){#in case that a data point is never used for testing, the distances for that point are ignored
trainDist <- NA
}
}
#######################################
if (length(whichfold)==0){
trainDist_min <- append(trainDist_min, NA)
}else{
trainDist_min <- append(trainDist_min, min(trainDist, na.rm = TRUE))
}
if (verbose) {
setTxtProgressBar(pb, i)
}
}
if (verbose) {
close(pb)
}
trainDist_avrgmean <- mean(trainDist_avrg,na.rm=TRUE)
# Dissimilarity Index of training data -----
TrainDI <- trainDist_min/trainDist_avrgmean
# AOA Threshold ----
threshold_quantile <- stats::quantile(TrainDI, 0.75,na.rm=TRUE)
threshold_iqr <- (1.5 * stats::IQR(TrainDI,na.rm=T))
thres <- threshold_quantile + threshold_iqr
# account for case that threshold_quantile + threshold_iqr is larger than maximum DI.
if (thres>max(TrainDI,na.rm=T)){
thres <- max(TrainDI,na.rm=T)
}
# note: previous versions of CAST derived the threshold this way:
# thres <- grDevices::boxplot.stats(TrainDI)$stats[5]
# calculate trainLPD and avrgLPD according to the CV folds
if (LPD == TRUE) {
if (verbose) {
message("Computing LPD of training data...")
pb <- txtProgressBar(min = 0,
max = nrow(train),
style = 3)
}
trainLPD <- c()
for (j in seq(nrow(train))) {
# calculate distance to other training data:
trainDist <- .alldistfun(t(matrix(train[j,])), train, method, sorted = FALSE, S_inv)
DItrainDist <- trainDist/trainDist_avrgmean
DItrainDist[j] <- NA
# mask of any data that are not used for training for the respective data point (using CV)
whichfold <- NA
if(!is.null(CVtrain)&!is.null(CVtest)){
whichfold <- as.numeric(which(lapply(CVtest,function(x){any(x==j)})==TRUE)) # index of the fold where i is held back
if(length(whichfold)>1){stop("a datapoint is used for testing in more than one fold. currently this option is not implemented")}
if(length(whichfold)!=0){ # in case that a data point is never used for testing
DItrainDist[!seq(nrow(train))%in%CVtrain[[whichfold]]] <- NA # everything that is not in the training data for i is ignored
}
if(length(whichfold)==0){#in case that a data point is never used for testing, the distances for that point are ignored
DItrainDist <- NA
}
}
#######################################
if (length(whichfold)==0){
trainLPD <- append(trainLPD, NA)
} else {
trainLPD <- append(trainLPD, sum(DItrainDist[,1] < thres, na.rm = TRUE))
}
if (verbose) {
setTxtProgressBar(pb, j)
}
}
if (verbose) {
close(pb)
}
# Average LPD in trainData
avrgLPD <- round(mean(trainLPD))
}
# Return: trainDI Object -------
aoa_results = list(
train = train_backup,
weight = weight,
variables = variables,
catvars = catupdate$catvars,
scaleparam = scaleparam,
trainDist_avrg = trainDist_avrg,
trainDist_avrgmean = trainDist_avrgmean,
trainDI = TrainDI,
threshold = thres,
method = method
)
if (LPD == TRUE) {
aoa_results$trainLPD <- trainLPD
aoa_results$avrgLPD <- avrgLPD
}
class(aoa_results) = "trainDI"
return(aoa_results)
}
################################################################################
# Helper functions
################################################################################
# Encode categorial variables
aoa_categorial_train <- function(train, variables, weight){
# get all categorial variables
catvars <- tryCatch(names(train)[which(sapply(train[,variables], class)%in%c("factor","character"))],
error=function(e) e)
if (!inherits(catvars,"error")&length(catvars)>0){
message("warning: predictors contain categorical variables. The integration is currently still under development. Please check results carefully!")
for (catvar in catvars){
# mask all unknown levels in newdata as NA (even technically no predictions can be made)
train[,catvar]<-droplevels(train[,catvar])
# then create dummy variables for the remaining levels in train:
dvi_train <- predict(caret::dummyVars(paste0("~",catvar), data = train),
train)
train <- data.frame(train,dvi_train)
if(!inherits(weight, "error")){
addweights <- data.frame(t(rep(weight[,which(names(weight)==catvar)],
ncol(dvi_train))))
names(addweights)<- colnames(dvi_train)
weight <- data.frame(weight,addweights)
}
}
if(!inherits(weight, "error")){
weight <- weight[,-which(names(weight)%in%catvars)]
}
train <- train[,-which(names(train)%in%catvars)]
}
return(list(train = train, weight = weight, catvars = catvars))
}
# Get weights from train object
aoa_get_weights = function(model, variables){
weight <- tryCatch(if(model$modelType=="Classification"){
as.data.frame(t(apply(caret::varImp(model,scale=F)$importance,1,mean)))
}else{
as.data.frame(t(caret::varImp(model,scale=F)$importance[,"Overall"]))
}, error=function(e) e)
if(!inherits(weight, "error") & length(variables)>1){
names(weight)<- rownames(caret::varImp(model,scale=F)$importance)
}else{
# set all weights to 1
weight <- as.data.frame(t(rep(1, length(variables))))
names(weight) = variables
message("note: variables were not weighted either because no weights or model were given,
no variable importance could be retrieved from the given model, or the model has a single feature.
Check caret::varImp(model)")
}
#set negative weights to 0
if(!inherits(weight, "error")){
weight <- weight[,na.omit(match(variables, names(weight)))]
if (any(weight<0)){
weight[weight<0]<-0
message("negative weights were set to 0")
}
}
return(weight)
}
# check user weight input
# make sure this function outputs a data.frame with
# one row and columns named after the variables
user_weights = function(weight, variables){
# list input support
if(inherits(weight, "list")){
# check if all list entries are in variables
weight = as.data.frame(weight)
}
#check if manually given weights are correct. otherwise ignore (set to 1):
if(nrow(weight)!=1 || !all(variables %in% names(weight))){
message("variable weights are not correctly specified and will be ignored. See ?aoa")
weight <- t(data.frame(rep(1,length(variables))))
names(weight) <- variables
}
weight <- weight[,na.omit(match(variables, names(weight)))]
if (any(weight<0)){
weight[weight<0]<-0
message("negative weights were set to 0")
}
return(weight)
}
# Get trainingdata from train object
aoa_get_train <- function(model){
train <- as.data.frame(model$trainingData)
return(train)
}
# Get folds from train object
aoa_get_folds <- function(model, CVtrain, CVtest){
### if folds are to be extracted from the model:
if (!is.na(model)[1]){
if(tolower(model$control$method)!="cv"){
message("note: Either no model was given or no CV was used for model training. The DI threshold is therefore based on all training data")
}else{
CVtest <- model$control$indexOut
CVtrain <- model$control$index
}
}
### if folds are specified manually:
if(is.na(model)[1]){
if(!is.null(CVtest)&!is.list(CVtest)){ # restructure input if CVtest only contains the fold ID
tmp <- list()
for (i in unique(CVtest)){
tmp[[i]] <- which(CVtest==i)
}
CVtest <- tmp
}
if(is.null(CVtest)&is.null(CVtrain)){
message("note: No model and no CV folds were given. The DI threshold is therefore based on all training data")
}else{
if(is.null(CVtest)){ # if CVtest is not given, then use the opposite of CVtrain
CVtest <- lapply(CVtrain,function(x){which(!sort(unique(unlist(CVtrain)))%in%x)})
}else{
if(is.null(CVtrain)){ # if CVtrain is not given, then use the opposite of CVtest
CVtrain <- lapply(CVtest,function(x){which(!sort(unique(unlist(CVtest)))%in%x)})
}
}
}
}
return(list(CVtrain,CVtest))
}
# Get variables from train object
aoa_get_variables <- function(variables, model, train){
if(length(variables) == 1){
if(variables == "all"){
if(!is.na(model)[1]){
variables <- names(model$trainingData)[-which(names(model$trainingData)==".outcome")]
}else{
variables <- names(train)
}
}
}
return(variables)
}
.mindistfun <- function(point, reference, method, S_inv=NULL){
if (method == "L2"){ # Euclidean Distance
return(c(FNN::knnx.dist(reference, point, k = 1)))
} else if (method == "MD"){ # Mahalanobis Distance
return(sapply(1:dim(point)[1],
function(y) min(sapply(1:dim(reference)[1],
function(x) sqrt( t(point[y,] - reference[x,]) %*% S_inv %*% (point[y,] - reference[x,]) )))))
}
}
.alldistfun <- function(point, reference, method, sorted = TRUE,S_inv=NULL){
if (method == "L2"){ # Euclidean Distance
if(sorted){
return(FNN::knnx.dist(reference, point, k = dim(reference)[1]))
} else {
return(FNN::knnx.dist(point,reference,k=1))
}
} else if (method == "MD"){ # Mahalanobis Distance
if(sorted){
return(t(sapply(1:dim(point)[1],
function(y) sort(sapply(1:dim(reference)[1],
function(x) sqrt( t(point[y,] - reference[x,]) %*% S_inv %*% (point[y,] - reference[x,]) ))))))
} else {
return(t(sapply(1:dim(point)[1],
function(y) sapply(1:dim(reference)[1],
function(x) sqrt( t(point[y,] - reference[x,]) %*% S_inv %*% (point[y,] - reference[x,]) )))))
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.