R/evalMultiAUC.r
In enmSdmX: Species Distribution Modeling and Ecological Niche Modeling

Documented in evalMultiAUC

#' Calculate multivariate weighted AUC
#'
#' This function calculates a multivariate version of the area under the receiver-operator characteristic curve (AUC). The multivariate version is simply the mean AUC across all possible pairwise AUCs for all cases (Hand & Till 2001). For example, if we have predictions that can be classified into three groups of expectation, say A, B, and C, where we expect predictions assigned to group A are > those in B and C, and predictions in group B are expected to be > those in group C, the multivariate AUC for this situation is \code{mean(wAB * auc_mean(A, B), wAC * auc_mean(A, C), wBC * auc_mean(B, C))}, where \code{auc_mean(X, Y)}, is the AUC calculated between cases \code{X} and \code{Y}, and \code{wXY} is a weight for that case-comparison.
#'
#' @param ... A set of two or more numeric vectors \emph{or} two or more 2-column matrices or data frames. The objects must be listed in order of \emph{expected} probability. For example, you might have a set of predictions for objects you expect to have a low predicted probability (e.g., long-term absences of an animal), a set that you expect to have middle levels of probability (e.g., sites that were recently vacated), and a set for which you expect a high level of predicted probability (e.g., sites that are currently occupied). In this case you should list the cases in order: low, middle, high. If a 2-column matrix or data frame is supplied, then the first column is assumed to represent predictions and the second assumed to represent site-level weights (see \code{\link{evalAUC}}). Note that site-level weighting is different from case-level weighting.
#' @param weightBySize Logical, if \code{FALSE} (default) then the multivariate measure of AUC will treat all comparisons as equal (e.g., low versus middle will weigh as much as middle versus high), and so will simply be the mean AUC across all possible comparisons. If \code{TRUE} then multivariate AUC is the weighted mean across all possible comparisons where weights are the number of comparisons between each of the two cases. For example, if a set of "low" predictions ("low") has 10 data points, "middle" has 10, and "high" has 20, then the multivariate AUC will be (10 * low + 10 * middle + 20 * high) / (10 + 10 + 20).
#' @param na.rm Logical. If \code{TRUE} then remove any cases in \code{...} that are \code{NA}.
#'
#' @return Named numeric vector. The names will appear as \code{case2_over_case1} (which in this example means the AUC of item #1 in the \code{...} when compared to the second item in \code{...}), plus \code{multivariate} (which is the multivariate AUC).
#' 
#' @seealso \code{\link[predicts]{pa_evaluate}}, \code{\link{evalAUC}}, \code{\link{evalContBoyce}}, \code{\link{evalThreshold}}, \code{\link{evalThresholdStats}}, \code{\link{evalTjursR2}}, \code{\link{evalTSS}}
#'
#' @references
#' Hand, DJ and Till, RJ. 2001. A simple generalisation of the area under the ROC curve for multiple class classification problems. \emph{Machine Learning} 45:171-186 \doi{10.1023/A:1010920819831}.
#' 
#' @examples
#' set.seed(123)
#' 
#' # no weights
#' low <- runif(10)^2
#' middle <- runif(10)
#' high <- sqrt(runif(20))
#' 
#' evalMultiAUC(low, middle, high)
#' 
#' # equal weights
#' low <- matrix(c(low, rep(1, length(low))), ncol=2)
#' middle <- matrix(c(middle, rep(1, length(middle))), ncol=2)
#' high <- matrix(c(high, rep(1, length(high))), ncol=2)
#' evalMultiAUC(low, middle, high)
#' 
#' # equal weights with weighting by number of comparisons
#' evalMultiAUC(low, middle, high, weightBySize=TRUE)
#' 
#' # unequal weights
#' middle[ , 2] <- ifelse(middle[ , 1] > 0.5, 0.1, 1)
#' evalMultiAUC(low, middle, high)
#' 
#' # unequal weights with weighting by number of comparisons
#' evalMultiAUC(low, middle, high, weightBySize=TRUE)
#' @export

evalMultiAUC <- function(
	...,
	weightBySize = FALSE,
	na.rm = FALSE
) {

	cases <- list(...)
	ncases <- length(cases)
	names(cases) <- paste0('case', seq_along(cases))

	# if input is a data frame or matrix with just one column, add another
	# to represent weights (all weights = 1), if input is a vector then
	# convert to two-column matrix with second column being weights of 1
	for (i in seq_along(cases)) {
		this <- cases[[i]]
		if (!any(c('data.frame', 'matrix') %in% class(this))) {
			cases[[i]] <- cbind(this, w=rep(1, length(this)))
		} else {
			if (ncol(this) == 1) {
				cases[[i]] <- cbind(this, w=rep(1, length(this)))
			} else if (ncol(this) > 2) {
				stop('Category #', i, ' has >2 columns. Values must be a vector or a 1- or 2-column matrix or data frame.')
			}
		}
	}
	
	# remove NAs
	if (na.rm) {
		for (i in seq_along(cases)) {
			cases[[i]] <- cases[[i]][stats::complete.cases(cases[[i]]), ]
		}
	}

	# AUC and number of cases
	aucs <- numCases <- numeric()
	
	# calculate AUC
	for (one in 1:(length(cases) - 1)) {
		for (two in (one + 1):length(cases)) {

			# neatify
			contrast <- cases[[one]][ , 1]
			pres <- cases[[two]][ , 1]
			
			contrastWeight <- cases[[one]][ , 2]
			presWeight <- cases[[two]][ , 2]
			
			# pairwise AUC
			thisAuc <- evalAUC(
				pres=pres,
				contrast=contrast,
				presWeight=presWeight,
				contrastWeight=contrastWeight
			)
			
			# remember and assign name
			aucs <- c(aucs, thisAuc)
			name1 <- names(cases)[one]
			name2 <- names(cases)[two]
			name <- paste0(name2, '_over_', name1)
			names(aucs)[length(aucs)] <- name
			
			# remember number of cases
			thisNumCases <- length(pres) * length(contrast)
			numCases <- c(numCases, thisNumCases)
			names(numCases)[length(numCases)] <- name
			
		}
	}
	
	# multivariate AUC
	evalMultiAUC <- if (weightBySize) {
		sum(aucs * numCases) / sum(numCases)
	} else {
		mean(aucs)
	}
		
	aucs <- c(aucs, evalMultiAUC)
	names(aucs)[length(aucs)] <- 'multivariate'
	
	aucs

}