Nothing
#' Collation and Visualization of Resampling Results
#'
#' These functions provide methods for collection, analyzing and visualizing a
#' set of resampling results from a common data set.
#'
#' The ideas and methods here are based on Hothorn et al. (2005) and Eugster et
#' al. (2008).
#'
#' The results from \code{\link{train}} can have more than one performance
#' metric per resample. Each metric in the input object is saved.
#'
#' \code{resamples} checks that the resampling results match; that is, the
#' indices in the object \code{trainObject$control$index} are the same. Also,
#' the argument \code{\link{trainControl}} \code{returnResamp} should have a
#' value of \code{"final"} for each model.
#'
#' The summary function computes summary statistics across each model/metric
#' combination.
#'
#' @aliases resamples.default resamples summary.resamples sort.resamples
#' as.matrix.resamples as.data.frame.resamples modelCor
#' @param x a list of two or more objects of class \code{\link{train}},
#' \code{\link{sbf}} or \code{\link{rfe}} with a common set of resampling
#' indices in the \code{control} object. For \code{sort.resamples}, it is an
#' object generated by \code{resamples}.
#' @param modelNames an optional set of names to give to the resampling results
#' @param object an object generated by \code{resamples}
#' @param metric a character string for the performance measure used to sort or
#' computing the between-model correlations
#' @param decreasing logical. Should the sort be increasing or decreasing?
#' @param FUN a function whose first argument is a vector and returns a scalar,
#' to be applied to each model's performance measure.
#' @param row.names,optional not currently used but included for consistency
#' with \code{as.data.frame}
#' @param \dots only used for \code{sort} and \code{modelCor} and captures
#' arguments to pass to \code{sort} or \code{FUN}.
#' @return For \code{resamples}: an object with class \code{"resamples"} with
#' elements \item{call }{the call} \item{values }{a data frame of results where
#' rows correspond to resampled data sets and columns indicate the model and
#' metric} \item{models }{a character string of model labels} \item{metrics }{a
#' character string of performance metrics} \item{methods }{a character string
#' of the \code{\link{train}} \code{method} argument values for each model }
#' For \code{sort.resamples} a character string in the sorted order is
#' generated. \code{modelCor} returns a correlation matrix.
#' @author Max Kuhn
#' @seealso \code{\link{train}}, \code{\link{trainControl}},
#' \code{\link{diff.resamples}}, \code{\link{xyplot.resamples}},
#' \code{\link{densityplot.resamples}}, \code{\link{bwplot.resamples}},
#' \code{\link{splom.resamples}}
#' @references Hothorn et al. The design and analysis of benchmark experiments.
#' Journal of Computational and Graphical Statistics (2005) vol. 14 (3) pp.
#' 675-699
#'
#' Eugster et al. Exploratory and inferential analysis of benchmark
#' experiments. Ludwigs-Maximilians-Universitat Munchen, Department of
#' Statistics, Tech. Rep (2008) vol. 30
#' @keywords models
#' @examples
#'
#'
#' data(BloodBrain)
#' set.seed(1)
#'
#' ## tmp <- createDataPartition(logBBB,
#' ## p = .8,
#' ## times = 100)
#'
#' ## rpartFit <- train(bbbDescr, logBBB,
#' ## "rpart",
#' ## tuneLength = 16,
#' ## trControl = trainControl(
#' ## method = "LGOCV", index = tmp))
#'
#' ## ctreeFit <- train(bbbDescr, logBBB,
#' ## "ctree",
#' ## trControl = trainControl(
#' ## method = "LGOCV", index = tmp))
#'
#' ## earthFit <- train(bbbDescr, logBBB,
#' ## "earth",
#' ## tuneLength = 20,
#' ## trControl = trainControl(
#' ## method = "LGOCV", index = tmp))
#'
#' ## or load pre-calculated results using:
#' ## load(url("http://caret.r-forge.r-project.org/exampleModels.RData"))
#'
#' ## resamps <- resamples(list(CART = rpartFit,
#' ## CondInfTree = ctreeFit,
#' ## MARS = earthFit))
#'
#' ## resamps
#' ## summary(resamps)
#'
#' @export resamples
"resamples" <- function(x, ...) UseMethod("resamples")
#' @rdname resamples
#' @method resamples default
#' @export
resamples.default <- function(x, modelNames = names(x), ...) {
## Do a lot of checkin of the object types and make sure
## that they actually used the samples samples in the resamples
if(length(x) < 2) stop("at least two train objects are needed")
classes <- unlist(lapply(x, function(x) class(x)[1]))
# if(!all(classes %in% c("sbf", "rfe", "train"))) stop("all objects in x must of class train, sbf or rfe")
if(is.null(modelNames)){
modelNames <- well_numbered("Model", length(x))
} else {
if(any(modelNames == "")) {
no_name <- which(modelNames == "")
modelNames[no_name] <- well_numbered("Model", length(x))[no_name]
}
}
rs_method <- vapply(x, function(x) x$control$method, character(1))
if (any(rs_method == "LOOCV")) {
stop("LOOCV is not compatible with `resamples()` since only one resampling ",
"estimate is available.", call. = FALSE)
}
numResamp <- unlist(lapply(x, function(x) length(x$control$index)))
if(length(unique(numResamp)) > 1) stop("There are different numbers of resamples in each model")
for(i in 1:numResamp[1]){
indices <- lapply(x, function(x, i) sort(x$control$index[[1]]), i = i)
uniqueIndex <- length(table(table(unlist(indices))))
if(length(uniqueIndex) > 1) stop("The samples indices are not equal across resamples")
}
getTimes <- function(x){
out <- rep(NA, 3)
if(all(names(x) != "times")) return(out)
if(any(names(x$times) == "everything")) out[1] <- x$times$everything[3]
if(any(names(x$times) == "final")) out[2] <- x$times$final[3]
if(any(names(x$times) == "prediction")) out[3] <- x$times$prediction[3]
out
}
rs_values <- vector(mode = "list", length = length(x))
for(i in seq(along = x)) {
if(class(x[[i]])[1] == "rfe" && x[[i]]$control$returnResamp == "all"){
warning(paste0("'", modelNames[i], "' did not have 'returnResamp=\"final\"; the optimal subset is used"))
}
if(class(x[[i]])[1] == "train" && x[[i]]$control$returnResamp == "all"){
warning(paste0("'", modelNames[i], "' did not have 'returnResamp=\"final\"; the optimal tuning parameters are used"))
}
if(class(x[[i]])[1] == "sbf" && x[[i]]$control$returnResamp == "all"){
warning(paste0("'", modelNames[i], "' did not have 'returnResamp=\"final\"; the optimal subset is used"))
}
rs_values[[i]] <- get_resample_perf(x[[i]])
}
all_names <- lapply(rs_values,
function(x) names(x)[names(x) != "Resample"])
all_names <- table(unlist(all_names))
if(length(all_names) == 0 || any(all_names == 0)) {
warning("Could not find performance measures")
}
if(any(all_names < length(x))) {
warning(paste("Some performance measures were not computed for each model:",
paste(names(all_names)[all_names < length(x)], collapse = ", ")))
}
pNames <- names(all_names)[all_names == length(x)]
rs_values <- lapply(rs_values,
function(x, n) x[,n,drop = FALSE],
n = c(pNames, "Resample"))
for(mod in seq(along = modelNames)) {
names(rs_values[[mod]])[names(rs_values[[mod]]) %in% pNames] <-
paste(modelNames[mod], names(rs_values[[mod]])[names(rs_values[[mod]]) %in% pNames], sep = "~")
out <- if(mod == 1) rs_values[[mod]] else merge(out, rs_values[[mod]])
}
timings <- do.call("rbind", lapply(x, getTimes))
colnames(timings) <- c("Everything", "FinalModel", "Prediction")
out <- structure(
list(
call = match.call(),
values = out,
models = modelNames,
metrics = pNames,
timings = as.data.frame(timings, stringsAsFactors = TRUE),
methods = unlist(lapply(x, function(x) x$method))),
class = "resamples")
out
}
#' @rdname resamples
#' @method sort resamples
#' @export
sort.resamples <- function(x, decreasing = FALSE, metric = x$metric[1], FUN = mean, ...)
{
dat <- x$values[, grep(paste("~", metric[1], sep = ""), names(x$values))]
colnames(dat) <- gsub(paste("~", metric[1], sep = ""), "", colnames(dat))
stats <- apply(dat, 2, FUN, ...)
names(sort(stats, decreasing = decreasing))
}
#' @rdname resamples
#' @method summary resamples
#' @export
summary.resamples <- function(object, metric = object$metrics, ...){
vals <- object$values[, names(object$values) != "Resample", drop = FALSE]
out <- vector(mode = "list", length = length(metric))
for(i in seq(along = metric)) {
tmpData <- vals[, grep(paste("~", metric[i], sep = ""), names(vals), fixed = TRUE), drop = FALSE]
out[[i]] <- do.call("rbind", lapply(tmpData, function(x) summary(x)[1:6]))
naSum <- matrix(unlist(lapply(tmpData, function(x) sum(is.na(x)))), ncol = 1)
colnames(naSum) <- "NA's"
out[[i]] <- cbind(out[[i]], naSum)
rownames(out[[i]]) <- gsub(paste("~", metric[i], sep = ""),
"",
rownames(out[[i]]),
fixed = TRUE)
}
names(out) <- metric
out <- structure(
list(values = vals,
call = match.call(),
statistics = out,
models = object$models,
metrics = metric,
methods = object$methods),
class = "summary.resamples")
out
}
#' @rdname resamples
#' @method as.matrix resamples
#' @export
as.matrix.resamples <- function(x, metric = x$metric[1], ...) {
get_cols <- grepl(paste0("~", metric[1]), colnames(x$values))
if(!(any(get_cols))) stop("no columns fit that metric")
out <- x$values[,get_cols,drop=FALSE]
rownames(out) <- as.character(x$values$Resample)
colnames(out) <- gsub(paste0("~", metric[1]), "", colnames(out))
as.matrix(out)
}
#' @rdname resamples
#' @method as.data.frame resamples
#' @export
as.data.frame.resamples <- function(x, row.names = NULL, optional = FALSE,
metric = x$metric[1], ...) {
get_cols <- grepl(paste0("~", metric[1]), colnames(x$values))
if(!(any(get_cols))) stop("no columns fit that metric")
out <- x$values[,get_cols,drop=FALSE]
out$Resample <- x$values$Resample
colnames(out) <- gsub(paste0("~", metric[1]), "", colnames(out))
out
}
#' @rdname resamples
#' @importFrom stats cor
#' @export
modelCor <- function(x, metric = x$metric[1], ...)
{
dat <- x$values[, grep(paste("~", metric[1], sep = ""), names(x$values))]
colnames(dat) <- gsub(paste("~", metric[1], sep = ""), "", colnames(dat))
cor(dat, ...)
}
#' Principal Components Analysis of Resampling Results
#'
#' Performs a principal components analysis on an object of class
#' \code{\link{resamples}} and returns the results as an object with classes
#' \code{prcomp.resamples} and \code{prcomp}.
#'
#' The principal components analysis treats the models as variables and the
#' resamples are realizations of the variables. In this way, we can use PCA to
#' "cluster" the assays and look for similarities. Most of the methods for
#' \code{\link[stats]{prcomp}} can be used, although custom \code{print} and
#' \code{plot} methods are used.
#'
#' The plot method uses lattice graphics. When \code{what = "scree"} or
#' \code{what = "cumulative"}, \code{\link[lattice:xyplot]{barchart}} is used.
#' When \code{what = "loadings"} or \code{what = "components"}, either
#' \code{\link[lattice:xyplot]{xyplot}} or \code{\link[lattice:splom]{splom}}
#' are used (the latter when \code{dims} > 2). Options can be passed to these
#' methods using \code{...}.
#'
#' When \code{what = "loadings"} or \code{what = "components"}, the plots are
#' put on a common scale so that later components are less likely to be
#' over-interpreted. See Geladi et al. (2003) for examples of why this can be
#' important.
#'
#' For clustering, \code{\link[stats]{hclust}} is used to determine clusters of
#' models based on the resampled performance values.
#'
#' @aliases prcomp.resamples cluster.resamples cluster plot.prcomp.resamples
#' @param x For \code{prcomp}, an object of class \code{\link{resamples}} and
#' for \code{plot.prcomp.resamples}, an object of class
#' \code{plot.prcomp.resamples}
#' @param metric a performance metric that was estimated for every resample
#' @param what the type of plot: \code{"scree"} produces a bar chart of
#' standard deviations, \code{"cumulative"} produces a bar chart of the
#' cumulative percent of variance, \code{"loadings"} produces a scatterplot
#' matrix of the loading values and \code{"components"} produces a scatterplot
#' matrix of the PCA components
#' @param dims The number of dimensions to plot when \code{what = "loadings"}
#' or \code{what = "components"}
#' @param \dots For \code{prcomp.resamples}, options to pass to
#' \code{\link[stats]{prcomp}}, for \code{plot.prcomp.resamples}, options to
#' pass to Lattice objects (see Details below) and, for
#' \code{cluster.resamples}, options to pass to \code{hclust}.
#' @return For \code{prcomp.resamples}, an object with classes
#' \code{prcomp.resamples} and \code{prcomp}. This object is the same as the
#' object produced by \code{prcomp}, but with additional elements: \item{metric
#' }{the value for the \code{metric} argument} \item{call }{the call}
#'
#' For \code{plot.prcomp.resamples}, a Lattice object (see Details above)
#' @author Max Kuhn
#' @seealso \code{\link{resamples}}, \code{\link[lattice:xyplot]{barchart}},
#' \code{\link[lattice:xyplot]{xyplot}}, \code{\link[lattice:splom]{splom}},
#' \code{\link[stats]{hclust}}
#' @references Geladi, P.; Manley, M.; and Lestander, T. (2003), "Scatter
#' plotting in multivariate data analysis," J. Chemometrics, 17: 503-511
#' @keywords hplot
#' @examples
#'
#' \dontrun{
#' #load(url("http://topepo.github.io/caret/exampleModels.RData"))
#'
#' resamps <- resamples(list(CART = rpartFit,
#' CondInfTree = ctreeFit,
#' MARS = earthFit))
#' resampPCA <- prcomp(resamps)
#'
#' resampPCA
#'
#' plot(resampPCA, what = "scree")
#'
#' plot(resampPCA, what = "components")
#'
#' plot(resampPCA, what = "components", dims = 2, auto.key = list(columns = 3))
#'
#' clustered <- cluster(resamps)
#' plot(clustered)
#'
#' }
#' @export
prcomp.resamples <- function(x, metric = x$metrics[1], ...)
{
if(length(metric) != 1) stop("exactly one metric must be given")
tmpData <- x$values[, grep(paste("~", metric, sep = ""),
names(x$value),
fixed = TRUE),
drop = FALSE]
names(tmpData) <- gsub(paste("~", metric, sep = ""),
"",
names(tmpData),
fixed = TRUE)
tmpData <- as.data.frame(t(tmpData), stringsAsFactors = TRUE)
colnames(tmpData) <- paste("Resample",
gsub(" ", "0", format(1:ncol(tmpData))),
sep = "")
out <- prcomp(~., data = tmpData, ...)
out$metric <- metric
out$call <- match.call()
class(out) <- c("prcomp.resamples", "prcomp")
out
}
#' @export
"cluster" <- function(x, ...) UseMethod("cluster")
#' @export
cluster.default <- function(x, ...) stop("only implemented for resamples objects")
#' @importFrom stats dist hclust
#' @method cluster resamples
#' @export
cluster.resamples <- function(x, metric = x$metrics[1], ...)
{
if(length(metric) != 1) stop("exactly one metric must be given")
tmpData <- x$values[, grep(paste("~", metric, sep = ""),
names(x$value),
fixed = TRUE),
drop = FALSE]
names(tmpData) <- gsub(paste("~", metric, sep = ""),
"",
names(tmpData),
fixed = TRUE)
tmpData <- t(tmpData)
dt <- dist(tmpData)
out <- hclust(dt, ...)
out$metric <- metric
out$call <- match.call()
class(out) <- c("cluster.resamples", "hclust")
out
}
#' @rdname prcomp.resamples
#' @method plot prcomp.resamples
#' @importFrom grDevices extendrange
#' @export
plot.prcomp.resamples <- function(x, what = "scree", dims = max(2, ncol(x$rotation)), ...)
{
if(length(what) > 1) stop("one plot at a time please")
switch(what,
scree =
{
barchart(x$sdev ~ paste("PC",
gsub(" ", "0", format(seq(along = x$sdev))),
sep = ""),
ylab = "Standard Deviation", ...)
},
cumulative =
{
barchart(cumsum(x$sdev^2)/sum(x$sdev^2) ~ paste("PC",
gsub(" ", "0", format(seq(along = x$sdev))),
sep = ""),
ylab = "Culmulative Percent of Variance", ...)
},
loadings =
{
panelRange <- extendrange(x$rotation[, 1:dims,drop = FALSE])
if(dims > 2)
{
splom(~x$rotation[, 1:dims,drop = FALSE],
main = useMathSymbols(x$metric),
prepanel.limits = function(x) panelRange,
type = c("p", "g"),
...)
} else {
xyplot(PC2~PC1, data = as.data.frame(x$rotation),
main = useMathSymbols(x$metric),
xlim = panelRange,
ylim = panelRange,
type = c("p", "g"),
...)
}
},
components =
{
panelRange <- extendrange(x$x[, 1:dims,drop = FALSE])
if(dims > 2)
{
splom(~x$x[, 1:dims,drop = FALSE],
main = useMathSymbols(x$metric),
prepanel.limits = function(x) panelRange,
groups = rownames(x$x),
type = c("p", "g"),
...)
} else {
xyplot(PC2~PC1, data = as.data.frame(x$x),
main = useMathSymbols(x$metric),
xlim = panelRange,
ylim = panelRange,
groups = rownames(x$x),
type = c("p", "g"),
...)
}
})
}
#' @method print prcomp.resamples
#' @export
print.prcomp.resamples <- function (x, digits = max(3, getOption("digits") - 3), print.x = FALSE, ...)
{
printCall(x$call)
cat("Metric:", x$metric, "\n")
sds <- rbind(x$sdev, cumsum(x$sdev^2)/sum(x$sdev^2))
rownames(sds) <- c("Std. Dev. ", "Cum. Percent Var. ")
colnames(sds) <- rep("", ncol(sds))
print(sds, digits = digits, ...)
cat("\nRotation:\n")
print(x$rotation, digits = digits, ...)
if (print.x && length(x$x)) {
cat("\nRotated variables:\n")
print(x$x, digits = digits, ...)
}
invisible(x)
}
#' @rdname resamples
#' @method print resamples
#' @export
print.resamples <- function(x, ...)
{
printCall(x$call)
cat("Models:", paste(x$models, collapse = ", "), "\n")
cat("Number of resamples:", nrow(x$values), "\n")
cat("Performance metrics:", paste(x$metrics, collapse = ", "), "\n")
hasTimes <- apply(x$timing, 2, function(a) !all(is.na(a)))
if(any(hasTimes))
{
names(hasTimes) <- gsub("FinalModel", "final model fit", names(hasTimes))
cat("Time estimates for:", paste(tolower(names(hasTimes))[hasTimes], collapse = ", "), "\n")
}
invisible(x)
}
#' @method print summary.resamples
#' @export
print.summary.resamples <- function(x, ...)
{
printCall(x$call)
cat("Models:", paste(x$models, collapse = ", "), "\n")
cat("Number of resamples:", nrow(x$values), "\n")
cat("\n")
for(i in seq(along = x$statistics))
{
cat(names(x$statistics)[i], "\n")
print(x$statistics[[i]])
cat("\n")
}
invisible(x)
}
#' Lattice Functions for Visualizing Resampling Results
#'
#' Lattice and ggplot functions for visualizing resampling results across models
#'
#' The ideas and methods here are based on Hothorn et al. (2005) and Eugster et
#' al. (2008).
#'
#' \code{dotplot} and \code{ggplot} plots the average performance value (with two-sided
#' confidence limits) for each model and metric.
#'
#' \code{densityplot} and \code{bwplot} display univariate visualizations of
#' the resampling distributions while \code{splom} shows the pair-wise
#' relationships.
#'
#' @aliases xyplot.resamples densityplot.resamples bwplot.resamples
#' splom.resamples parallelplot.resamples dotplot.resamples ggplot.resamples
#' @param x an object generated by \code{resamples}
#' @param data Only used for the \code{ggplot} method; an object generated by
#' \code{resamples}
#' @param models a character string for which models to plot. Note:
#' \code{xyplot} requires one or two models whereas the other methods can plot
#' more than two.
#' @param metric a character string for which metrics to use as conditioning
#' variables in the plot. \code{splom} requires exactly one metric when
#' \code{variables = "models"} and at least two when \code{variables =
#' "metrics"}.
#' @param variables either "models" or "metrics"; which variable should be
#' treated as the scatter plot variables?
#' @param panelRange a common range for the panels. If \code{NULL}, the panel
#' ranges are derived from the values across all the models
#' @param what for \code{xyplot}, the type of plot. Valid options are:
#' "scatter" (for a plot of the resampled results between two models),
#' "BlandAltman" (a Bland-Altman, aka MA plot between two models), "tTime" (for
#' the total time to run \code{train} versus the metric), "mTime" (for the time
#' to build the final model) or "pTime" (the time to predict samples - see the
#' \code{timingSamps} options in \code{\link{trainControl}},
#' \code{\link{rfeControl}}, or \code{\link{sbfControl}})
#' @param units either "sec", "min" or "hour"; which \code{what} is either
#' "tTime", "mTime" or "pTime", how should the timings be scaled?
#' @param conf.level the confidence level for intervals about the mean
#' (obtained using \code{\link[stats]{t.test}})
#' @param \dots further arguments to pass to either
#' \code{\link[lattice:histogram]{histogram}},
#' \code{\link[lattice:histogram]{densityplot}},
#' \code{\link[lattice:xyplot]{xyplot}}, \code{\link[lattice:xyplot]{dotplot}}
#' or \code{\link[lattice:splom]{splom}}
#' @return a lattice object
#' @author Max Kuhn
#' @seealso \code{\link{resamples}}, \code{\link[lattice:xyplot]{dotplot}},
#' \code{\link[lattice:bwplot]{bwplot}},
#' \code{\link[lattice:histogram]{densityplot}},
#' \code{\link[lattice:xyplot]{xyplot}}, \code{\link[lattice:splom]{splom}}
#' @references Hothorn et al. The design and analysis of benchmark experiments.
#' Journal of Computational and Graphical Statistics (2005) vol. 14 (3) pp.
#' 675-699
#'
#' Eugster et al. Exploratory and inferential analysis of benchmark
#' experiments. Ludwigs-Maximilians-Universitat Munchen, Department of
#' Statistics, Tech. Rep (2008) vol. 30
#' @keywords hplot
#' @examples
#'
#' \dontrun{
#' #load(url("http://topepo.github.io/caret/exampleModels.RData"))
#'
#' resamps <- resamples(list(CART = rpartFit,
#' CondInfTree = ctreeFit,
#' MARS = earthFit))
#'
#' dotplot(resamps,
#' scales =list(x = list(relation = "free")),
#' between = list(x = 2))
#'
#' bwplot(resamps,
#' metric = "RMSE")
#'
#' densityplot(resamps,
#' auto.key = list(columns = 3),
#' pch = "|")
#'
#' xyplot(resamps,
#' models = c("CART", "MARS"),
#' metric = "RMSE")
#'
#' splom(resamps, metric = "RMSE")
#' splom(resamps, variables = "metrics")
#'
#' parallelplot(resamps, metric = "RMSE")
#'
#'
#' }
#'
#' @method xyplot resamples
#' @export
xyplot.resamples <- function (x, data = NULL, what = "scatter", models = NULL, metric = x$metric[1], units = "min", ...)
{
if(!(units %in% c("min", "sec", "hour"))) stop("units should be 'sec', 'min' or 'hour'")
if(!(what %in% c("scatter", "BlandAltman", "tTime", "mTime", "pTime"))) stop("the what arg should be 'scatter', 'BlandAltman', 'tTime', 'mTime' or 'pTime'")
if(is.null(models)) models <- if(what %in% c("tTime", "mTime", "pTime")) x$models else x$models[1:2]
if(length(metric) != 1) stop("exactly one metric must be given")
if(what == "BlandAltman" & length(models) != 2) stop("exactly two model names must be given")
if(what == "BlandAltman")
{
tmpData <- x$values[, paste(models, metric, sep ="~")]
tmpData$Difference <- tmpData[,1] - tmpData[,2]
tmpData$Average <- (tmpData[,1] + tmpData[,2])/2
ylm <- extendrange(c(tmpData$Difference, 0))
out <- xyplot(Difference ~ Average,
data = tmpData,
ylab = paste(models, collapse = " - "),
ylim = ylm,
main = useMathSymbols(metric),
panel = function(x, y, ...)
{
panel.abline(h = 0, col = "darkgrey", lty = 2)
panel.xyplot(x, y, ...)
},
...)
}
if(what == "scatter")
{
tmpData <- x$values[, paste(models, metric, sep ="~")]
colnames(tmpData) <- gsub(paste("~", metric, sep = ""), "", colnames(tmpData))
ylm <- extendrange(c(tmpData[,1], tmpData[,2]))
out <- xyplot(tmpData[,1] ~ tmpData[,2],
ylab = colnames(tmpData)[1],
xlab = colnames(tmpData)[2],
xlim = ylm, ylim = ylm,
main = useMathSymbols(metric),
panel = function(x, y, ...)
{
panel.abline(0, 1, col = "darkgrey", lty = 2)
panel.xyplot(x, y, ...)
},
...)
}
if(what %in% c("tTime", "mTime", "pTime"))
{
## the following is based on
## file.show(system.file("demo/intervals.R", package = "lattice")
prepanel.ci <- function(x, y, lx, ux, subscripts, ...)
{
x <- as.numeric(x)
lx <- as.numeric(lx[subscripts])
ux <- as.numeric(ux[subscripts])
list(xlim = extendrange(c(x, ux, lx)),
ylim = extendrange(y))
}
panel.ci <- function(x, y, lx, ux, groups, subscripts, pch = 16, ...)
{
theme <- trellis.par.get()
x <- as.numeric(x)
y <- as.numeric(y)
lx <- as.numeric(lx[subscripts])
ux <- as.numeric(ux[subscripts])
gps <- unique(groups)
for(i in seq(along = gps))
{
panel.arrows(lx[groups == gps[i]],
y[groups == gps[i]],
ux[groups == gps[i]],
y[groups == gps[i]],
col = theme$superpose.line$col[i],
length = .01, unit = "npc",
angle = 90, code = 3)
panel.xyplot(x[groups == gps[i]],
y[groups == gps[i]],
type = "p",
col = theme$superpose.symbol$col[i],
cex = theme$superpose.symbol$cex[i],
pch = theme$superpose.symbol$pch[i],
, ...)
}
}
tmpData <- apply(x$values[,-1], 2,
function(x)
{
tmp <- t.test(x)
c(tmp$conf.int[1], tmp$estimate, tmp$conf.int[2])
})
rownames(tmpData) <- c("lower", "point", "upper")
tmpData <- tmpData[, paste(models, metric, sep ="~")]
colnames(tmpData) <- gsub(paste("~", metric, sep = ""), "", colnames(tmpData))
tmpData <- data.frame(t(tmpData))
tmpData$Model <- rownames(tmpData)
tm <- switch(what,
tTime = x$timings[,"Everything"],
mTime = x$timings[,"FinalModel"],
pTime = x$timings[,"Prediction"])
lab <- switch(what,
tTime = "Total Time (",
mTime = "Final Model Time (",
pTime = "Prediction Time (")
lab <- paste(lab, units, ")", sep = "")
times <- data.frame(Time = tm,
Model = rownames(x$timings))
plotData <- merge(times, tmpData)
if(units == "min") plotData$Time <- plotData$Time/60
if(units == "hour") plotData$Time <- plotData$Time/60/60
out <- with(plotData,
xyplot(Time ~ point,
lx = lower, ux = upper,
xlab = useMathSymbols(metric),
ylab = lab,
prepanel = prepanel.ci,
panel = panel.ci, groups = Model,
...))
}
out
}
#' @rdname xyplot.resamples
#' @importFrom stats median
#' @method parallelplot resamples
#' @export
parallelplot.resamples <- function (x, data = NULL, models = x$models, metric = x$metric[1], ...)
{
if(length(metric) != 1) stop("exactly one metric must be given")
tmpData <- x$values[, grep(paste("~", metric, sep = ""),
names(x$value),
fixed = TRUE),
drop = FALSE]
names(tmpData) <- gsub(paste("~", metric, sep = ""),
"",
names(tmpData),
fixed = TRUE)
rng <- range(unlist(lapply(lapply(tmpData, as.numeric), range, na.rm = TRUE)))
prng <- pretty(rng)
reord <- order(apply(tmpData, 2, median, na.rm = TRUE))
tmpData <- tmpData[, reord]
parallelplot(~tmpData,
common.scale = TRUE,
scales = list(x = list(at = (prng-min(rng))/diff(rng), labels = prng)),
xlab = useMathSymbols(metric),
...)
}
#' @rdname xyplot.resamples
#' @importFrom grDevices extendrange
#' @method splom resamples
#' @export
splom.resamples <- function (x, data = NULL, variables = "models",
models = x$models,
metric = NULL,
panelRange = NULL,
...) {
if(variables == "models") {
if(is.null(metric)) metric <- x$metric[1]
if(length(metric) != 1) stop("exactly one metric must be given")
tmpData <- x$values[, grep(paste("~", metric, sep = ""),
names(x$value),
fixed = TRUE),
drop = FALSE]
names(tmpData) <- gsub(paste("~", metric, sep = ""),
"",
names(tmpData),
fixed = TRUE)
tmpData <- tmpData[, models,drop = FALSE]
if(is.null(panelRange)) panelRange <- extendrange(tmpData)
splom(~tmpData,
panel = function(x, y) {
panel.splom(x, y, ...)
panel.abline(0, 1, lty = 2, col = "darkgrey")
},
main = useMathSymbols(metric),
prepanel.limits = function(x) panelRange,
...)
} else{
if(variables == "metrics") {
if(is.null(metric)) metric <- x$metric
if(length(metric) < 2) stop("There should be at least two metrics")
plotData <- melt(x$values, id.vars = "Resample")
tmp <- strsplit(as.character(plotData$variable), "~", fixed = TRUE)
plotData$Model <- unlist(lapply(tmp, function(x) x[1]))
plotData$Metric <- unlist(lapply(tmp, function(x) x[2]))
plotData <- subset(plotData, Model %in% models & Metric %in% metric)
means <- dcast(plotData, Model ~ Metric, mean, na.rm = TRUE)
splom(~means[, metric], groups = means$Model, ...)
} else stop ("'variables' should be either 'models' or 'metrics'")
}
}
#' @rdname xyplot.resamples
#' @importFrom stats as.formula
#' @method densityplot resamples
#' @export
densityplot.resamples <- function (x, data = NULL, models = x$models, metric = x$metric, ...)
{
plotData <- melt(x$values, id.vars = "Resample")
tmp <- strsplit(as.character(plotData$variable), "~", fixed = TRUE)
plotData$Model <- unlist(lapply(tmp, function(x) x[1]))
plotData$Metric <- unlist(lapply(tmp, function(x) x[2]))
plotData <- subset(plotData, Model %in% models & Metric %in% metric)
metricVals <- unique(plotData$Metric)
plotForm <- if(length(metricVals) > 1) as.formula(~value|Metric) else as.formula(~value)
densityplot(plotForm, data = plotData, groups = Model,
xlab = if(length(unique(plotData$Metric)) > 1) "" else metricVals, ...)
}
#' @rdname xyplot.resamples
#' @importFrom stats as.formula
#' @method bwplot resamples
#' @export
bwplot.resamples <- function (x, data = NULL, models = x$models, metric = x$metric, ...)
{
plotData <- melt(x$values, id.vars = "Resample")
tmp <- strsplit(as.character(plotData$variable), "~", fixed = TRUE)
plotData$Model <- unlist(lapply(tmp, function(x) x[1]))
plotData$Metric <- unlist(lapply(tmp, function(x) x[2]))
plotData <- subset(plotData, Model %in% models & Metric %in% metric)
avPerf <- ddply(subset(plotData, Metric == metric[1]),
.(Model),
function(x) c(Median = median(x$value, na.rm = TRUE)))
avPerf <- avPerf[order(avPerf$Median),]
plotData$Model <- factor(as.character(plotData$Model),
levels = avPerf$Model)
metricVals <- unique(plotData$Metric)
plotForm <- if(length(metricVals) > 1) as.formula(Model~value|Metric) else as.formula(Model~value)
bwplot(plotForm, data = plotData,
xlab = if(length(unique(plotData$Metric)) > 1) "" else metricVals, ...)
}
#' @rdname xyplot.resamples
#' @importFrom stats aggregate as.formula median t.test
#' @method dotplot resamples
#' @export
dotplot.resamples <- function (x, data = NULL, models = x$models, metric = x$metric, conf.level = 0.95, ...)
{
plotData <- melt(x$values, id.vars = "Resample")
tmp <- strsplit(as.character(plotData$variable), "~", fixed = TRUE)
plotData$Model <- unlist(lapply(tmp, function(x) x[1]))
plotData$Metric <- unlist(lapply(tmp, function(x) x[2]))
plotData <- subset(plotData, Model %in% models & Metric %in% metric)
plotData$variable <- factor(as.character(plotData$variable))
plotData <- split(plotData, plotData$variable)
results <- lapply(plotData,
function(x, cl)
{
ttest <- try(t.test(x$value, conf.level = cl, ...),
silent = TRUE)
if(class(ttest)[1] == "htest")
{
out <- c(ttest$conf.int, ttest$estimate)
names(out) <- c("LowerLimit", "UpperLimit", "Estimate")
} else {
out <- c(LowerLimit = NA_real_, UpperLimit = NA_real_, Estimate = mean(x$value, na.rm = TRUE))
}
out
},
cl = conf.level)
results <- do.call("rbind", results)
results <- melt(results)
results <- results[!is.nan(results$value),]
tmp <- strsplit(as.character(results$Var1), "~", fixed = TRUE)
results$Model <- unlist(lapply(tmp, function(x) x[1]))
results$Metric <- unlist(lapply(tmp, function(x) x[2]))
## to avoid "no visible binding for global variable 'Var2'"
Var2 <- NULL
avPerf <- ddply(subset(results, Metric == metric[1] & Var2 == "Estimate"),
.(Model),
function(x) c(Median = median(x$value, na.rm = TRUE)))
avPerf <- avPerf[order(avPerf$Median),]
results$Model <- factor(as.character(results$Model),
levels = avPerf$Model)
metricVals <- unique(results$Metric)
plotForm <- if(length(metricVals) > 1) as.formula(Model~value|Metric) else as.formula(Model~value)
dotplot(plotForm,
data = results,
xlab = if(length(unique(plotData$Metric)) > 1) "" else metricVals,
panel = function(x, y)
{
plotTheme <- trellis.par.get()
y <- as.numeric(y)
vals <- aggregate(x, list(group = y), function(x) c(min = min(x), mid = median(x), max = max(x)))
panel.dotplot(vals$x[,"mid"], vals$group,
pch = plotTheme$plot.symbol$pch[1],
col = plotTheme$plot.symbol$col[1],
cex = plotTheme$plot.symbol$cex[1])
panel.segments(vals$x[,"min"], vals$group,
vals$x[,"max"], vals$group,
lty = plotTheme$plot.line$lty[1],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1])
len <- .03
panel.segments(vals$x[,"min"], vals$group+len,
vals$x[,"min"], vals$group-len,
lty = plotTheme$plot.line$lty[1],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1])
panel.segments(vals$x[,"max"], vals$group+len,
vals$x[,"max"], vals$group-len,
lty = plotTheme$plot.line$lty[1],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1])
},
sub = paste("Confidence Level:", conf.level),
...)
}
#' @rdname xyplot.resamples
#' @method ggplot resamples
#' @importFrom stats reorder
#' @param mapping,environment Not used.
#' @export
ggplot.resamples <-
function (data = NULL,
mapping = NULL,
environment = NULL,
models = data$models,
metric = data$metric[1],
conf.level = 0.95,
...) {
plotData <- melt(data$values, id.vars = "Resample")
tmp <- strsplit(as.character(plotData$variable), "~", fixed = TRUE)
plotData$Model <- unlist(lapply(tmp, function(x) x[1]))
plotData$Metric <- unlist(lapply(tmp, function(x) x[2]))
plotData <- subset(plotData, Model %in% models & Metric %in% metric)
plotData$variable <- factor(as.character(plotData$variable))
plotData <- split(plotData, plotData$variable)
results <- lapply(
plotData,
function(x, cl) {
ttest <- try(t.test(x$value, conf.level = cl, ...), silent = TRUE)
if (class(ttest)[1] == "htest") {
out <- c(ttest$conf.int, ttest$estimate)
names(out) <-
c("LowerLimit", "UpperLimit", "Estimate")
} else
out <-
c(LowerLimit = NA_real_, UpperLimit = NA_real_, Estimate = mean(x$value, na.rm = TRUE))
out
},
cl = conf.level)
results <- do.call("rbind", results)
results <- as.data.frame(results, stringsAsFactors = TRUE)
tmp <- strsplit(rownames(results), "~", fixed = TRUE)
results$Model <- unlist(lapply(tmp, function(x) x[1]))
results$Model <- reorder(results$Model, results$Estimate, median)
results$Metric <- unlist(lapply(tmp, function(x) x[2]))
metricVals <- unique(results$Metric)
p <- ggplot(results, aes(x = Model)) +
geom_point(aes(y = Estimate)) +
geom_errorbar(aes(ymin = LowerLimit, ymax = UpperLimit), width = .1) +
coord_flip() +
xlab("") + ylab("")
if (length(metricVals) > 1)
p <- p + facet_wrap(~Metric, scales = "free_y")
p
}
#' Inferential Assessments About Model Performance
#'
#' Methods for making inferences about differences between models
#'
#' The ideas and methods here are based on Hothorn et al. (2005) and Eugster et
#' al. (2008).
#'
#' For each metric, all pair-wise differences are computed and tested to assess
#' if the difference is equal to zero.
#'
#' When a Bonferroni correction is used, the confidence level is changed from
#' \code{confLevel} to \code{1-((1-confLevel)/p)} here \code{p} is the number
#' of pair-wise comparisons are being made. For other correction methods, no
#' such change is used.
#'
#' \code{compare_models} is a shorthand function to compare two models using a
#' single metric. It returns the results of \code{\link[stats]{t.test}} on the
#' differences.
#'
#' @aliases diff.resamples summary.diff.resamples compare_models
#' @param x an object generated by \code{resamples}
#' @param models a character string for which models to compare
#' @param metric a character string for which metrics to compare
#' @param test a function to compute differences. The output of this function
#' should have scalar outputs called \code{estimate} and \code{p.value}
#' @param object a object generated by \code{diff.resamples}
#' @param adjustment any p-value adjustment method to pass to
#' \code{\link[stats]{p.adjust}}.
#' @param confLevel confidence level to use for
#' \code{\link{dotplot.diff.resamples}}. See Details below.
#' @param digits the number of significant differences to display when printing
#' @param a,b two objects of class \code{\link{train}}, \code{\link{sbf}} or
#' \code{\link{rfe}} with a common set of resampling indices in the
#' \code{control} object.
#' @param \dots further arguments to pass to \code{test}
#' @return An object of class \code{"diff.resamples"} with elements: \item{call
#' }{the call} \item{difs }{a list for each metric being compared. Each list
#' contains a matrix with differences in columns and resamples in rows }
#' \item{statistics }{a list of results generated by \code{test}}
#' \item{adjustment}{the p-value adjustment used} \item{models}{a character
#' string for which models were compared.} \item{metrics }{a character string
#' of performance metrics that were used}
#'
#' or...
#'
#' An object of class \code{"summary.diff.resamples"} with elements: \item{call
#' }{the call} \item{table }{a list of tables that show the differences and
#' p-values }
#'
#' ...or (for \code{compare_models}) an object of class \code{htest} resulting
#' from \code{\link[stats]{t.test}}.
#' @author Max Kuhn
#' @seealso \code{\link{resamples}}, \code{\link{dotplot.diff.resamples}},
#' \code{\link{densityplot.diff.resamples}},
#' \code{\link{bwplot.diff.resamples}}, \code{\link{levelplot.diff.resamples}}
#' @references Hothorn et al. The design and analysis of benchmark experiments.
#' Journal of Computational and Graphical Statistics (2005) vol. 14 (3) pp.
#' 675-699
#'
#' Eugster et al. Exploratory and inferential analysis of benchmark
#' experiments. Ludwigs-Maximilians-Universitat Munchen, Department of
#' Statistics, Tech. Rep (2008) vol. 30
#' @keywords models
#' @examples
#'
#' \dontrun{
#' #load(url("http://topepo.github.io/caret/exampleModels.RData"))
#'
#' resamps <- resamples(list(CART = rpartFit,
#' CondInfTree = ctreeFit,
#' MARS = earthFit))
#'
#' difs <- diff(resamps)
#'
#' difs
#'
#' summary(difs)
#'
#' compare_models(rpartFit, ctreeFit)
#' }
#'
#' @method diff resamples
#' @export
diff.resamples <- function(x,
models = x$models,
metric = x$metrics,
test = t.test,
confLevel = 0.95,
adjustment = "bonferroni",
...)
{
allDif <- vector(mode = "list", length = length(metric))
names(allDif) <- metric
x$models <- x$models[x$models %in% models]
p <- length(x$models)
ncomp <- choose(p, 2)
if(adjustment == "bonferroni") confLevel <- 1 - ((1 - confLevel)/ncomp)
allStats <- allDif
for(h in seq(along = metric))
{
index <- 0
dif <- matrix(NA,
nrow = nrow(x$values),
ncol = choose(length(models), 2))
stat <- vector(mode = "list", length = choose(length(models), 2))
colnames(dif) <- paste("tmp", 1:ncol(dif), sep = "")
for(i in seq(along = models))
{
for(j in seq(along = models))
{
if(i < j)
{
index <- index + 1
left <- paste(models[i], metric[h], sep = "~")
right <- paste(models[j], metric[h], sep = "~")
dif[,index] <- x$values[,left] - x$values[,right]
colnames(dif)[index] <- paste(models[i], models[j], sep = ".diff.")
}
}
}
stats <- apply(dif, 2, function(x, tst, ...) tst(x, ...), tst = test, conf.level = confLevel, ...)
allDif[[h]] <- dif
allStats[[h]] <- stats
}
out <- structure(
list(
call = match.call(),
difs = allDif,
confLevel = confLevel,
adjustment = adjustment,
statistics = allStats,
models = models,
metric = metric),
class = "diff.resamples")
out
}
#' @importFrom stats as.formula
#' @importFrom utils stack
#' @method densityplot diff.resamples
#' @export
densityplot.diff.resamples <- function(x, data, metric = x$metric, ...)
{
plotData <- lapply(x$difs,
function(x) stack(as.data.frame(x, stringsAsFactors = TRUE)))
plotData <- do.call("rbind", plotData)
plotData$Metric <- rep(x$metric, each = length(x$difs[[1]]))
plotData$ind <- gsub(".diff.", " - ", plotData$ind, fixed = TRUE)
plotData <- subset(plotData, Metric %in% metric)
metricVals <- unique(plotData$Metric)
plotForm <- if(length(metricVals) > 1) as.formula(~values|Metric) else as.formula(~values)
densityplot(plotForm, data = plotData, groups = ind,
xlab = if(length(unique(plotData$Metric)) > 1) "" else metricVals, ...)
}
#' @importFrom stats as.formula
#' @importFrom utils stack
#' @method bwplot diff.resamples
#' @export
bwplot.diff.resamples <- function(x, data, metric = x$metric, ...)
{
plotData <- lapply(x$difs,
function(x) stack(as.data.frame(x, stringsAsFactors = TRUE)))
plotData <- do.call("rbind", plotData)
plotData$Metric <- rep(x$metric, each = length(x$difs[[1]]))
plotData$ind <- gsub(".diff.", " - ", plotData$ind, fixed = TRUE)
plotData <- subset(plotData, Metric %in% metric)
metricVals <- unique(plotData$Metric)
plotForm <- if(length(metricVals) > 1) as.formula(ind ~ values|Metric) else as.formula(ind ~ values)
bwplot(plotForm,
data = plotData,
xlab = if(length(unique(plotData$Metric)) > 1) "" else metricVals,
...)
}
#' @method print diff.resamples
#' @export
print.diff.resamples <- function(x, ...)
{
printCall(x$call)
cat("Models:", paste(x$models, collapse = ", "), "\n")
cat("Metrics:", paste(x$metric, collapse = ", "), "\n")
cat("Number of differences:", ncol(x$difs[[1]]), "\n")
cat("p-value adjustment:", x$adjustment, "\n")
invisible(x)
}
#' @importFrom stats p.adjust
#' @rdname diff.resamples
#' @method summary diff.resamples
#' @export
summary.diff.resamples <- function(object, digits = max(3, getOption("digits") - 3), ...)
{
all <- vector(mode = "list", length = length(object$metric))
names(all) <- object$metric
for(h in seq(along = object$metric))
{
pvals <- matrix(NA, nrow = length(object$models), ncol = length(object$models))
meanDiff <- pvals
index <- 0
for(i in seq(along = object$models)) {
for(j in seq(along = object$models)) {
if(i < j) {
index <- index + 1
meanDiff[i, j] <- object$statistics[[h]][index][[1]]$estimate
}
}
}
index <- 0
for(i in seq(along = object$models)) {
for(j in seq(along = object$models)) {
if(i < j) {
index <- index + 1
pvals[j, i] <- object$statistics[[h]][index][[1]]$p.value
}
}
}
pvals[lower.tri(pvals)] <- p.adjust(pvals[lower.tri(pvals)], method = object$adjustment)
asText <- matrix("", nrow = length(object$models), ncol = length( object$models))
meanDiff2 <- format(meanDiff, digits = digits)
pvals2 <- matrix(format.pval(pvals, digits = digits), nrow = length( object$models))
asText[upper.tri(asText)] <- meanDiff2[upper.tri(meanDiff2)]
asText[lower.tri(asText)] <- pvals2[lower.tri(pvals2)]
diag(asText) <- ""
colnames(asText) <- object$models
rownames(asText) <- object$models
all[[h]] <- asText
}
out <- structure(
list(
call = match.call(),
adjustment = object$adjustment,
table = all),
class = "summary.diff.resamples")
out
}
#' @importFrom stats complete.cases
#' @method levelplot diff.resamples
#' @export
levelplot.diff.resamples <- function(x, data = NULL, metric = x$metric[1], what = "pvalues", ...)
{
comps <- ncol(x$difs[[1]])
if(length(metric) != 1) stop("exactly one metric must be given")
all <- vector(mode = "list", length = length(x$metric))
names(all) <- x$metric
for(h in seq(along = x$metric))
{
temp <- matrix(NA, nrow = length(x$models), ncol = length( x$models))
index <- 0
for(i in seq(along = x$models))
{
for(j in seq(along = x$models))
{
if(i < j)
{
index <- index + 1
temp[i, j] <- if(what == "pvalues")
{
x$statistics[[h]][index][[1]]$p.value
} else x$statistics[[h]][index][[1]]$estimate
temp[j, i] <- temp[i, j]
}
}
}
colnames(temp) <- x$models
temp <- as.data.frame(temp, stringsAsFactors = TRUE)
temp$A <- x$models
temp$Metric <- x$metric[h]
all[[h]] <- temp
}
all <- do.call("rbind", all)
all <- melt(all, measure.vars = x$models)
names(all)[names(all) == "variable"] <- "B"
all$A <- factor(all$A, levels = x$models)
all$B <- factor(as.character(all$B), levels = x$models)
all <- all[complete.cases(all),]
levelplot(value ~ A + B|Metric,
data = all,
subset = Metric %in% metric,
xlab = "",
ylab = "",
sub = ifelse(what == "pvalues", "p-values", "difference = (x axis - y axis)"),
...)
}
#' @method print summary.diff.resamples
#' @export
print.summary.diff.resamples <- function(x, ...)
{
printCall(x$call)
if(x$adjustment != "none")
cat("p-value adjustment:", x$adjustment, "\n")
cat("Upper diagonal: estimates of the difference\n",
"Lower diagonal: p-value for H0: difference = 0\n\n",
sep = "")
for(i in seq(along = x$table))
{
cat(names(x$table)[i], "\n")
print(x$table[[i]], quote = FALSE)
cat("\n")
}
invisible(x)
}
#' Lattice Functions for Visualizing Resampling Differences
#'
#' Lattice functions for visualizing resampling result differences between
#' models
#'
#' \code{densityplot} and \code{bwplot} display univariate visualizations of
#' the resampling distributions. \code{levelplot} displays the matrix of
#' pair-wide comparisons. \code{dotplot} shows the differences along with their
#' associated confidence intervals.
#'
#' @aliases levelplot.diff.resamples densityplot.diff.resamples
#' bwplot.diff.resamples dotplot.diff.resamples
#' @param x an object generated by \code{\link{diff.resamples}}
#' @param data Not used
#' @param metric a character string for which metrics to plot. Note:
#' \code{dotplot} and \code{levelplot} require exactly two models whereas the
#' other methods can plot more than two.
#' @param \dots further arguments to pass to either
#' \code{\link[lattice:histogram]{densityplot}},
#' \code{\link[lattice:dotplot]{dotplot}} or
#' \code{\link[lattice:levelplot]{levelplot}}
#' @return a lattice object
#' @author Max Kuhn
#' @seealso \code{\link{resamples}}, \code{\link{diff.resamples}},
#' \code{\link[lattice:bwplot]{bwplot}},
#' \code{\link[lattice:histogram]{densityplot}},
#' \code{\link[lattice:xyplot]{xyplot}}, \code{\link[lattice:splom]{splom}}
#' @keywords hplot
#' @examples
#'
#' \dontrun{
#' #load(url("http://topepo.github.io/caret/exampleModels.RData"))
#'
#' resamps <- resamples(list(CART = rpartFit,
#' CondInfTree = ctreeFit,
#' MARS = earthFit))
#' difs <- diff(resamps)
#'
#' dotplot(difs)
#'
#' densityplot(difs,
#' metric = "RMSE",
#' auto.key = TRUE,
#' pch = "|")
#'
#' bwplot(difs,
#' metric = "RMSE")
#'
#' levelplot(difs, what = "differences")
#'
#' }
#'
#' @method dotplot diff.resamples
#' @export
dotplot.diff.resamples <- function(x, data = NULL, metric = x$metric[1], ...)
{
if(length(metric) > 1)
{
metric <- metric[1]
warning("Sorry Dave, only one value of metric is allowed right now. I'll use the first value")
}
h <- which(x$metric == metric)
plotData <- as.data.frame(matrix(NA, ncol = 3, nrow = ncol(x$difs[[metric]])), stringsAsFactors = TRUE)
## Get point and interval estimates on the differences
index <- 0
for(i in seq(along = x$models))
{
for(j in seq(along = x$models))
{
if(i < j)
{
index <- index + 1
plotData[index, 1] <- x$statistics[[h]][index][[1]]$estimate
plotData[index, 2:3] <- x$statistics[[h]][index][[1]]$conf.int
}
}
}
names(plotData)[1:3] <- c("Estimate", "LowerLimit", "UpperLimit")
plotData$Difference <- gsub(".diff.", " - ", colnames(x$difs[[metric]]), fixed = TRUE)
plotData <- melt(plotData, id.vars = "Difference")
xText <- paste("Difference in",
useMathSymbols(metric),
"\nConfidence Level",
round(x$confLevel, 3),
ifelse(x$adjustment == "bonferroni",
" (multiplicity adjusted)",
" (no multiplicity adjustment)"))
dotplot(Difference ~ value,
data = plotData,
xlab = xText,
panel = function(x, y)
{
plotTheme <- trellis.par.get()
middle <- aggregate(x, list(mod = y), median)
upper <- aggregate(x, list(mod = as.numeric(y)), max)
lower <- aggregate(x, list(mod = as.numeric(y)), min)
panel.dotplot(middle$x, middle$mod,
col = plotTheme$plot.symbol$col[1],
pch = plotTheme$plot.symbol$pch[1],
cex = plotTheme$plot.symbol$cex[1])
panel.abline(v = 0,
col = plotTheme$reference.line$col[1],
lty = plotTheme$reference.line$lty[1],
lwd = plotTheme$reference.line$lwd[1])
for(i in seq(along = upper$mod))
{
panel.segments(upper$x[i], upper$mod[i], lower$x[i], lower$mod[i],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1],
lty = plotTheme$plot.line$lty[1])
len <- .03
panel.segments(lower$x[i], upper$mod[i]+len,
lower$x[i], lower$mod[i]-len,
lty = plotTheme$plot.line$lty[1],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1])
panel.segments(upper$x[i],upper$mod[i]+len,
upper$x[i], lower$mod[i]-len,
lty = plotTheme$plot.line$lty[1],
col = plotTheme$plot.line$col[1],
lwd = plotTheme$plot.line$lwd[1])
}
},
...)
}
#' @rdname diff.resamples
#' @export
compare_models <- function(a, b, metric = a$metric[1]) {
mods <- list(a, b)
rs <- resamples(mods)
diffs <- diff(rs, metric = metric[1])
diffs$statistics[[1]][[1]]
}
#' @importFrom utils globalVariables
utils::globalVariables(c("LowerLimit", "UpperLimit"))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.