#' Add predictive models to Summary list.
#'
#' \code{add_model} conducts model training and adds each model fit to a summary list.
#' The function ensures that the model and data support classification.
#' If a model supports class probabilities, the best model is chosen based on
#' AUC. If a model does not support class probabilities, the best model is
#' chosen based on accuracy. 10-fold cross validation is used by default.
#'
#' @param addTo Summary list that will contain all model fit results.
#' @param x A dataframe of input variables.
#' @param y A vector or a dataframe of output variable.
#' @param model A vector of model names to train.
#' @param tuneLength The maximum number of tuning parameter combinations
#' that will be generated by the random search.
#' @param modelTag A character value of a tag to be appended to the model name.
#' @param tf A single character value for transformation options: tf1, tf2, or tf3.
#' For details, see the suggest_transformation function.
#' @param sampling A single character value passed to caret::trainControl.
#' Values are "none", "down", "up", "smote", or "rose". The latter two values require
#' the DMwR and ROSE packages, respectively.
#' @import knitr
#' @import mlbench
#' @import e1071
#' @importFrom magrittr %>%
#' @importFrom purrr is_list
#' @importFrom purrr is_vector
#' @importFrom purrr is_character
#' @importFrom purrr as_vector
#' @importFrom purrr map_df
#' @importFrom purrr map_lgl
#' @importFrom purrr map
#' @importFrom stats predict
#' @importFrom stats reorder
#' @importFrom stats median
#' @importFrom stats loess
#' @importFrom caret train
#' @importFrom caret trainControl
#' @importFrom caret postResample
#' @importFrom caret sensitivity
#' @importFrom caret specificity
#' @importFrom caret getModelInfo
#' @importFrom dplyr bind_cols
#' @importFrom dplyr distinct
#' @importFrom dplyr bind_rows
#' @importFrom dplyr setdiff
#' @importFrom ModelMetrics auc
#' @importFrom utils install.packages
#' @importFrom utils installed.packages
#' @importFrom stringr str_c
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_boxplot
#' @importFrom ggplot2 geom_bar
#' @importFrom ggplot2 geom_density
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 geom_label
#' @importFrom ggplot2 geom_rect
#' @importFrom ggplot2 geom_tile
#' @importFrom ggplot2 geom_line
#' @importFrom ggplot2 geom_smooth
#' @importFrom ggplot2 geom_text
#' @importFrom ggplot2 margin
#' @importFrom ggplot2 facet_wrap
#' @importFrom ggplot2 xlab
#' @importFrom ggplot2 ylab
#' @importFrom ggplot2 xlim
#' @importFrom ggplot2 ylim
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 coord_flip
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 guides
#' @importFrom ggplot2 element_blank
#' @importFrom ggplot2 scale_fill_gradient
#' @importFrom ggplot2 element_text
#' @return If the input model and data support classification, the results of
#' each model fit are added to the summary list.
#' @export
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#'
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
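#'
#' # optional sketch: preprocess with Yeo-Johnson ("tf2") and down-sample the
#' # majority class during resampling; "rf" and the "down" tag are illustrative choices
#' sSummary <- add_model(sSummary, x, y, model = "rf", tf = "tf2", modelTag = "down", sampling = "down")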
#'
#' # vignette("modeval") #check a vignette for further details
#' }
add_model <- function(addTo, x, y, model = NULL, tuneLength = 5L, modelTag = NULL, tf = NULL, sampling = NULL) {
`.` <- NULL # for CMD check pass
if (is.null(model)) {
model = c("glm", "lda", "bayesglm", "knn", "nnet", "qda", "svmLinear", "svmRadial", "rf", "rpart", "treebag")
}
# input argument control
stopifnot(purrr::is_list(addTo), is.null(model) || (purrr::is_vector(model) && purrr::is_character(model) && NCOL(model) == 1),
purrr::is_null(tuneLength) || (is.numeric(tuneLength) && tuneLength %% 1 == 0),
purrr::is_character(modelTag) || purrr::is_null(modelTag),
is.data.frame(x) && !is.null(n <- nrow(x)),
!is.matrix(y) && purrr::is_vector(y) && NCOL(y) == 1 && NROW(y) == NROW(x),
length(levels(factor((purrr::as_vector(y))))) == 2 # two class factor only
)
x <- as.data.frame(x)
y <- as.data.frame(y)
colnames(y) <- "Class"
trainingData <- dplyr::bind_cols(x, y)
# check required packages and install if required
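# (each caret model declares its dependencies in getModelInfo()[[model]]$library;
# anything missing from installed.packages() is installed from CRAN)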
model %>%
purrr::map(~caret::getModelInfo()[[.]]$library) %>%
purrr::map_lgl(~length(.) > 0) %>% model[.] %>%
purrr::map(~caret::getModelInfo()[[.]]$library) %>%
purrr::map(tibble::tibble) %>% purrr::map_df(dplyr::bind_rows) %>%
dplyr::distinct() %>% purrr::as_vector() %>%
dplyr::setdiff(installed.packages()[, 1]) -> packageToInstall
if (length(packageToInstall) > 0) install.packages(packageToInstall, repos = "http://cran.us.r-project.org")
# classification only
model %>%
purrr::map(~caret::getModelInfo()[[.]]$type) %>%
purrr::map(~ . %in% "Classification") %>%
purrr::map_lgl(any) -> classificationAvailable
modelToCompute <- model[classificationAvailable]
modelNotToCompute <- model[!classificationAvailable]
# check availability of class probability
modelToCompute %>%
purrr::map(~caret::getModelInfo()[[.]]$prob) %>%
purrr::map_lgl(is.null) -> classProbNotAvailable
modelWithoutClassProb <- modelToCompute[classProbNotAvailable]
modelWithClassProb <- modelToCompute[!classProbNotAvailable]
# data transformation
if(is.null(tf)) {
tfArg <- NULL
} else {
tfArg <- switch(tf,
"tf1" = c("BoxCox"),
"tf2" = c("center", "scale", "YeoJohnson"),
"tf3" = c("center", "scale", "pca")
)
}
modelTag <- stringr::str_c(tf, modelTag, sep = "_")
# combine caret::defaultSummary and caret::twoClassSummary into one
togetherSummary <- function(data, lev = NULL, model = NULL) {
# Accuracy and Kappa - adapted with minor modification from caret::defaultSummary
data1 <- data
lev1 <- lev
model1 <- model
if (is.character(data1$obs))
data1$obs <- factor(data1$obs, levels = lev1)
out1 <- caret::postResample(data1[, "pred"], data1[, "obs"])
# ROC/AUC, sensitivity, specificity - adapted with minor modification from caret::twoClassSummary
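# note: the first factor level of obs is treated as the event when computing AUC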
lvls <- levels(data$obs)
if (!all(levels(data[, "pred"]) == lvls))
stop("levels of observed and predicted data do not match")
data$y = as.numeric(data$obs == lvls[2])
rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0, 1), data[, lvls[1]])
out2 <- c(rocAUC,
caret::sensitivity(data[, "pred"], data[, "obs"], lev[1]),
caret::specificity(data[, "pred"], data[, "obs"], lev[2])
)
names(out2) <- c("ROC", "Sens", "Spec")
out <- c(out1, out2)
out
}
# trainControl set-up
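# 10-fold CV with random hyperparameter search; classProbs = TRUE enables the
# ROC/Sens/Spec metrics, so ROC can be used for model selection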
trainControlWithClassProb <- caret::trainControl(verboseIter = TRUE,
method="cv",
number=10,
search = "random",
returnResamp = "final",
savePredictions = "final",
classProbs = TRUE,
summaryFunction = togetherSummary,
timingSamps = 1,
sampling = sampling
)
trainControlWithoutClassProb <- caret::trainControl(verboseIter = TRUE,
method="cv",
number=10,
search = "random",
returnResamp = "final",
savePredictions = "final",
summaryFunction = caret::defaultSummary,
timingSamps = 1,
sampling = sampling
)
modelWithCP <- modelWithoutCP <- NULL
# train models w/ ROC(primary) and Accuracy & Kappa(reference)
if (length(modelWithClassProb) > 0) {
modelWithCP <- modelWithClassProb %>%
purrr::map(~caret::train(method = .,
Class ~ .,
data = trainingData,
metric = "ROC", # if prob available, choose best model with ROC
tuneLength = tuneLength,
trControl = trainControlWithClassProb,
preProcess = tfArg)
)
names(modelWithCP) <- stringr::str_c(modelWithClassProb, modelTag, sep = "_")
}
# train models (w/ Accuracy & Kappa metrics)
if (length(modelWithoutClassProb) > 0) {
modelWithoutCP <- modelWithoutClassProb %>%
purrr::map(~caret::train(method = .,
Class ~ .,
data = trainingData,
metric = "Accuracy",
tuneLength = tuneLength,
trControl = trainControlWithoutClassProb,
preProcess = tfArg)
)
names(modelWithoutCP) <- stringr::str_c(modelWithoutClassProb, modelTag, sep = "_")
}
# overwrite if same name of object is added
for(i in seq_along(modelWithCP)) {
addTo[[names(modelWithCP)[i]]] <- modelWithCP[[i]]
}
for(i in seq_along(modelWithoutCP)) {
addTo[[names(modelWithoutCP)[i]]] <- modelWithoutCP[[i]]
}
if (length(modelWithoutClassProb) > 0) {
cat("\n Model: ", stringr::str_c(names(modelWithoutCP), collapse = ", "),
"\n >> Accuracy and Kappa metrics are available.",
"\n >> Best model is selected based on accuracy. \n\n",
collapse = "")
}
if (length(modelWithClassProb) > 0) {
cat("\n Model: ", stringr::str_c(names(modelWithCP), collapse = ", "),
"\n >> ROC, Sens, Spec, Accuracy and Kappa metrics are available.",
"\n >> Best model is selected based on AUC. \n\n",
collapse = "")
}
if (length(modelNotToCompute) > 0) {
cat("\n Model: ", stringr::str_c(modelNotToCompute, collapse = ", "),
"\n >> Model(s) not support classification problem.\n\n",
collapse = "")
}
return(addTo)
}
#' Add Class Probability
#'
#' \code{add_prob} adds class probability results for a given dataset (x, y)
#' to the summary list for all model fits it contains. Like
#' \code{\link{add_model}}, it ensures that prediction is conducted only for
#' models and data that support classification.
#'
#' @param addTo Summary list that will contain all model fit results.
#' @param x A dataframe of input variables.
#' @param y A vector or a dataframe of output variable.
#' @param outChar A character value of the output class name.
#' @param predTag A character value of a tag to be appended to the prediction result.
#' @export
#' @return Results of each model fitting are added to the summary list.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#'
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#' sSummary <- add_prob(sSummary, x_test, y_test, outChar = "pos")
#' sSummary$glm$pred_test
#'
#' # vignette("modeval") #check a vignette for further details
#' }
add_prob <- function(addTo, x, y, outChar, predTag = NULL) {
# input argument control
stopifnot(purrr::is_list(addTo), is.data.frame(x) && !is.null(n <- nrow(x)),
!is.matrix(y) && purrr::is_vector(y) && NCOL(y) == 1 && NROW(y) == NROW(x),
purrr::is_character(predTag) || purrr::is_null(predTag),
length(levels(factor((purrr::as_vector(y))))) == 2 # two class factor only
)
x <- as.data.frame(x)
y <- as.data.frame(y)
colnames(y) <- "Class"
testData <- dplyr::bind_cols(x, y)
addTo %>%
purrr::map_chr(~.$method) -> model
# the following section repeats logic from add_model because add_prob must work
# for any trained object created by the add_model function
# classification only
model %>%
purrr::map(~caret::getModelInfo()[[.]]$type) %>%
purrr::map(~ . %in% "Classification") %>%
purrr::map_lgl(any) -> classificationAvailable
modelToCompute <- model[classificationAvailable]
# check availability of class probability
modelToCompute %>%
purrr::map(~caret::getModelInfo()[[.]]$prob) %>%
purrr::map_lgl(is.null) -> classProbNotAvailable
modelWithClassProb <- names(modelToCompute[!classProbNotAvailable])
modelWithoutClassProb <- names(modelToCompute[classProbNotAvailable])
# add pred result to addTo object
if (length(modelWithClassProb)) {
for(i in seq_along(modelWithClassProb)) {
pred_test <- data.frame(testData["Class"])
cat(stringr::str_c("Calculating prediction: ", modelWithClassProb[i],"\n", collapse = TRUE))
pred_test[outChar] <- predict(addTo[[modelWithClassProb[i]]], testData, type = "prob")[, outChar]
if (is.null(predTag)) {
addTo[[modelWithClassProb[i]]]$pred_test <- pred_test
} else {
addTo[[modelWithClassProb[i]]][[predTag]] <- pred_test
}
}
}
return(addTo)
}
#' Plot Skewness and Kurtosis and suggest transformations
#'
#' \code{suggest_transformation} evaluates the normality of the data by
#' calculating skew and kurtosis. The user is given guidance on whether
#' data transformation is advised, and on how well each of three
#' transformation options (Box-Cox, Yeo-Johnson, or PCA) reduces
#' non-normality compared to the others and to the original, untransformed
#' data set.
#'
#' @param x A dataframe of input variables.
#' @export
#' @return Four plots depicting skew and kurtosis of each variable. The four
#' plots are (1) the original, untransformed data, (2) transformed data
#' applying Box-Cox (tagged "tf1"), (3) transformed data applying Yeo-Johnson
#' (tagged "tf2"), and (4) transformed data using principal components
#' (tagged "tf3"). Additionally, the three transformed data sets are
#' generated and ready to be called from other functions using the tags.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' suggest_transformation(x)
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_transformation <- function(x) {
kurtosis <- skew <- varNames <- NULL # for CMD check pass
x <- x[, purrr::map_lgl(x, is.numeric)]
### Check for Skewness, Kurtosis and plot
cat("\n Consider transforming data if skew or kurtosis of any variable is > 2 or < -2 \n")
cat("\n BoxCox : The distribution of an attribute can be shifted to reduce the skew and make it more Gaussian. \n")
cat("\n Yeo-Johnson : Like the Box-Cox transform, but it supports raw values that are equal to zero and negative. \n")
cat("\n PCA : Transform the data to the principal components. \n")
varaibleTest <- function(x, title) {
checkSK <- psych::describe(x) # Put results of 'describe' into a data frame
checkSK <- data.frame(checkSK)
checkSK$varNames <- rownames(checkSK)
ggplot(checkSK, aes(y = kurtosis, x = skew)) +
xlab("Skewness") +
ylab("Kurtosis") +
geom_label(aes(label = varNames), nudge_x = .01, nudge_y = 0, size = 3) +
geom_point(data = checkSK[checkSK$skew > 2 | checkSK$skew < -2,],
aes(x=skew, y=kurtosis), size = 4, color = "red", alpha = 0.2) +
geom_point(data = checkSK[checkSK$kurtosis > 2 | checkSK$kurtosis < -2,],
aes(x=skew, y=kurtosis), size = 4, color = "blue", alpha = 0.2) +
ggtitle(title)+
theme(plot.title = element_text(hjust = 0.5, lineheight=.8, face="bold")) +
geom_rect(aes(xmin = -2, ymin = -2, xmax = 2, ymax = 2), color = "gray", alpha = 0.002)
}
tf1 <- caret::preProcess(x, method = c("BoxCox"))
tf1 <- predict(tf1, newdata = x)
tf2 <- caret::preProcess(x, method = c("center", "scale", "YeoJohnson"))
tf2 <- predict(tf2, newdata = x)
tf3 <- caret::preProcess(x, method = c("center", "scale", "pca"))
tf3<- predict(tf3, newdata = x)
summary <- list(varaibleTest(x, "No transformation"),
varaibleTest(tf1, "tf1: BoxCox"),
varaibleTest(tf2, "tf2: Yeo-Johnson" ),
varaibleTest(tf3, "tf3: Principal Component")
)
return(gridExtra::grid.arrange(grobs = summary))
}
#' Compare performance based on accuracy metrics.
#'
#' \code{suggest_accuracy} compares performance for all model fits in the
#' Summary list based on Accuracy and Kappa metrics, together with training
#' time for a single tuning of each model fit.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @param time If TRUE, shows the average training time per tuning parameter
#' combination for each model.
#' @export
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#'
#' suggest_accuracy(sSummary)
#' suggest_accuracy(sSummary, time = TRUE)
#' suggest_accuracy(sSummary, time = TRUE, modelTag = "glm|svm")
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_accuracy <- function(addTo, modelTag = NULL, time = FALSE) {
`.` <- Model <- Accuracy <- Kappa <- Time <- TuneLength <- TimeSingle <- TimeTotal <- NULL #for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToAcc <- addTo %>% purrr::map_lgl(~(all(c("Accuracy", "Kappa") %in% .$perfNames))) %>% addTo[.]
# performance comparison based on Accuracy
if (length(addToAcc)) {
gridAcc <- list(Model = names(addToAcc),
Accuracy = addToAcc %>% purrr::map(~.$resample$Accuracy),
Kappa = addToAcc %>% purrr::map(~.$resample$Kappa),
Time = addToAcc %>% purrr::map(~.$times$everything["elapsed"]),
TuneLength = addToAcc %>% purrr::map(~.$results) %>% purrr::map(nrow)
)
tableAcc <- purrr::pmap(gridAcc, tibble::tibble) %>% dplyr::bind_rows()
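# elapsed training time (seconds) converted to minutes, in total and per tuning combination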
tableAcc <- tableAcc %>%
dplyr::mutate(TimeSingle = Time / 60 / TuneLength) %>%
dplyr::mutate(TimeTotal = Time / 60)
perf_Acc <- ggplot(data = tableAcc, mapping = aes(x = reorder(Model, Accuracy, FUN = stats::median), y = Accuracy)) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Accuracy") +
coord_flip()
perf_Kap <- ggplot(data = tableAcc, mapping = aes(x = reorder(Model, Accuracy, FUN = stats::median), y = Kappa)) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Kappa") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip()
time_sig <- ggplot(data = tableAcc, mapping = aes(x = reorder(Model, Accuracy, FUN = stats::median), y = TimeSingle)) +
geom_bar(stat = "identity", alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Time/Tuning (min)") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip()
if (time) {
layout <- rbind(c(1, 1, 2))
withTime <- gridExtra::grid.arrange(grobs = list(perf_Acc, time_sig), layout_matrix = layout)
} else {
layout <- rbind(c(1, 1, 1, 1, 2, 2, 2))
withKappa <- gridExtra::grid.arrange(grobs = list(perf_Acc, perf_Kap), layout_matrix = layout)
}
}
}
#' Compare performance based on Area Under the Curve, Sensitivity, and
#' Specificity.
#'
#' \code{suggest_auc} compares performance for all model fits in the Summary list
#' based on AUC, Sensitivity, and Specificity, together with training time
#' for a single tuning of each model fit.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @param time If TRUE, shows the average training time per tuning parameter
#' combination for each model.
#' @export
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#'
#' suggest_auc(sSummary)
#' suggest_auc(sSummary, time = TRUE)
#' suggest_auc(sSummary, time = TRUE, modelTag = "glm|svm")
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_auc <- function(addTo, modelTag = NULL, time = FALSE) {
`.` <- Time <- TuneLength <- Model <- ROC <- TimeSingle <- TimeTotal <- Sens <- Spec <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToROC <- addTo %>% purrr::map_lgl(~(all(c("ROC", "Sens", "Spec") %in% .$perfNames))) %>% addTo[.]
# performance comparison based on AUC
if (length(addToROC)) {
gridAcc <- list(Model = names(addToROC),
ROC = addToROC %>% purrr::map(~.$resample$ROC),
Sens = addToROC %>% purrr::map(~.$resample$Sens),
Spec = addToROC %>% purrr::map(~.$resample$Spec),
Time = addToROC %>% purrr::map(~.$times$everything["elapsed"]),
TuneLength = addToROC %>% purrr::map(~.$results) %>% purrr::map(nrow)
)
tableROC <- purrr::pmap(gridAcc, tibble::tibble) %>% dplyr::bind_rows()
tableROC <- tableROC %>%
dplyr::mutate(TimeSingle = Time / 60 / TuneLength) %>%
dplyr::mutate(TimeTotal = Time / 60)
perf_ROC <- ggplot(data = tableROC, mapping = aes(x = reorder(Model, ROC, FUN = stats::median), y = ROC)) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("AUC") +
coord_flip()
perf_Sens <- ggplot(data = tableROC, mapping = aes(x = reorder(Model, ROC, FUN = stats::median), y = Sens)) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Sensitivity") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip()
perf_Spec <- ggplot(data = tableROC, mapping = aes(x = reorder(Model, ROC, FUN = stats::median), y = Spec)) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Specificity") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip()
time_sig <- ggplot(data = tableROC, mapping = aes(x = reorder(Model, ROC, FUN = stats::median), y = TimeSingle)) +
geom_bar(stat = "identity", alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Time/Tuning (min)") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip()
if (time) {
layout <- rbind(c(1, 1, 2))
gridExtra::grid.arrange(grobs = list(perf_ROC, time_sig), layout_matrix = layout)
} else {
layout <- rbind(c(1, 1, 1, 2, 2, 3, 3))
gridExtra::grid.arrange(grobs = list(perf_ROC, perf_Sens, perf_Spec), layout_matrix = layout)
}
}
}
#' Compare model performance by model category from \pkg{caret} package
#'
#' \code{suggest_category} collects AUC and accuracy results by model category
#' and presents average performance using the model category tags as defined
#' by \pkg{caret}. This helps the user identify the most promising category
#' of models for further exploration.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param modelTag Select model fits containing \code{modelTag} in their name.
#' @export
#' @return Plots of AUC and accuracy, organized by category tags.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#'
#' suggest_category(sSummary)
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_category <- function(addTo, modelTag = NULL) {
`.` <- median <- Method <- Tag <- Accuracy <- ROC <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToAcc <- addTo %>% purrr::map_lgl(~(all(c("Accuracy", "Kappa") %in% .$perfNames))) %>%addTo[.]
addToROC <- addTo %>% purrr::map_lgl(~(all(c("ROC", "Sens", "Spec") %in% .$perfNames))) %>% addTo[.]
gridAcc <- list(Method = addToAcc %>% purrr::map(~.$method),
Accuracy = addToAcc %>% purrr::map(~.$resample$Accuracy) %>% purrr::map(stats::median),
Tag = addToAcc %>% purrr::map(~.$modelInfo$tags)
)
tableAcc <- purrr::pmap(gridAcc, tibble::tibble) %>% dplyr::bind_rows() %>%
dplyr::group_by(Method, Tag) %>% dplyr::summarise(Accuracy = mean(Accuracy, na.rm = TRUE)) %>% dplyr::ungroup()
gridROC <- list(Method = addToROC %>% purrr::map(~.$method),
ROC = addToROC %>% purrr::map(~.$resample$ROC) %>% purrr::map(stats::median),
Tag = addToROC %>% purrr::map(~.$modelInfo$tags)
)
tableROC <- purrr::pmap(gridROC, tibble::tibble) %>% dplyr::bind_rows() %>%
dplyr::group_by(Method, Tag) %>% dplyr::summarise(ROC = mean(ROC, na.rm = TRUE)) %>% dplyr::ungroup()
tableAcc$Tag <- tableAcc$Tag %>%
stringr::str_replace_all("L1 Regularization Models", "L1 Regularization") %>%
stringr::str_replace_all("L2 Regularization Models", "L2 Regularization") %>%
stringr::str_replace_all("Generalized Linear Models", "Generalized Linear Model") %>%
stringr::str_replace_all("Discriminant Analysis Models", "Discriminant Analysis") %>%
stringr::str_replace_all("Feature Extraction Models", "Feature Extraction") %>%
stringr::str_replace_all("Linear Classifier Models", "Linear Classifier") %>%
stringr::str_replace_all("Linear Regression Models", "Linear Regression")
tableROC$Tag <- tableROC$Tag %>%
stringr::str_replace_all("L1 Regularization Models", "L1 Regularization") %>%
stringr::str_replace_all("L2 Regularization Models", "L2 Regularization") %>%
stringr::str_replace_all("Generalized Linear Models", "Generalized Linear Model") %>%
stringr::str_replace_all("Discriminant Analysis Models", "Discriminant Analysis") %>%
stringr::str_replace_all("Feature Extraction Models", "Feature Extraction") %>%
stringr::str_replace_all("Linear Classifier Models", "Linear Classifier") %>%
stringr::str_replace_all("Linear Regression Models", "Linear Regression")
tableAcc$Accuracy <- round(tableAcc$Accuracy * 100, digits = 0)
tableROC$ROC <- round(tableROC$ROC * 100, digits = 0)
ctg_acc <- ggplot(data = tableAcc, mapping = aes(x = reorder(Tag, Accuracy, FUN = stats::median),
y = reorder(Method, dplyr::desc(Accuracy), FUN = stats::median),
fill = Accuracy)
) +
geom_tile(colour="gray", size=1, stat="identity") +
scale_fill_gradient(low = "white", high = "gray", space = "Lab") +
geom_text(data = tableAcc, mapping = aes(x = reorder(Tag, Accuracy, FUN = stats::median),
y = reorder(Method, dplyr::desc(Accuracy), FUN = stats::median),
label = Accuracy)
) +
xlab("") + ylab("") +
ggtitle("Tag-Model Grid with Accuracy(%)") + guides(fill = "none") +
coord_flip()
ctg_roc <- ggplot(data = tableROC, mapping = aes(x = reorder(Tag, ROC, FUN = stats::median),
y = reorder(Method, dplyr::desc(ROC), FUN = stats::median),
fill = ROC)
) +
geom_tile(colour="gray", size=1, stat="identity") +
scale_fill_gradient(low = "white", high = "gray", space = "Lab") +
geom_text(data = tableROC, mapping = aes(x = reorder(Tag, ROC, FUN = stats::median),
y = reorder(Method, dplyr::desc(ROC), FUN = stats::median),
label = ROC)
) +
xlab("") + ylab("") +
ggtitle("Tag-Model Grid with AUC(%)") + guides(fill = "none") +
coord_flip()
return(gridExtra::grid.arrange(grobs = list(ctg_roc, ctg_acc)))
}
#' Compare variables by importance.
#'
#' \code{suggest_variable} collects and compares variable importance metrics
#' where available.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @param sep If TRUE, shows variable importance results for each model fit separately.
#' @export
#' @return Plots of variable importance results from all available models.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#'
#' suggest_variable(sSummary)
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_variable <- function(addTo, modelTag = NULL, sep = FALSE) {
`.` <- varImp <- Variables <- variableImportance <- median <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
noVarImp <- addTo %>% purrr::map(~.$modelInfo$varImp) %>% purrr::map_lgl(is.null)
addToVar <- addTo[!noVarImp]
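# collect caret::varImp scores (first column of $importance) for every model that provides them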
vi <- list(variableImportance = addToVar %>% purrr::map(caret::varImp) %>% purrr::map(~.$importance[,1]),
Model = addToVar %>% purrr::map(caret::varImp) %>% names(),
Variables = addToVar %>% purrr::map(caret::varImp) %>% purrr::map(~.$importance) %>% purrr::map(rownames)
)
viSummary <- purrr::pmap(vi, tibble::tibble) %>% dplyr::bind_rows()
viConsolidation <- ggplot(data = viSummary,
mapping = aes(x = reorder(Variables,
variableImportance,
FUN = stats::median),
y = variableImportance)
) +
geom_boxplot(alpha = 0.8) +
xlab("") + ylab("") +
ggtitle("Variable Importance\nfrom all models") +
coord_flip()
vicomparison <- ggplot(data = viSummary,
mapping = aes(x = reorder(Variables,
variableImportance,
FUN = stats::median),
y = variableImportance)
) +
geom_bar(stat = "identity", alpha = 0.7) +
xlab("") + ylab("") +
ggtitle("Variable Importance by model") +
coord_flip() +
facet_wrap(~ Model)
viCompare = list(viConsolidation, vicomparison)
viSeparate = viSummary %>% split(.$Model)
for (i in seq_along(viSeparate)) {
j = match(names(viSeparate), names(addToVar))[i]
viSeparate[[i]] = ggplot(data = viSeparate[[i]],
mapping = aes(x = reorder(Variables,
variableImportance,
FUN = stats::median),
y = variableImportance)
) +
geom_bar(stat = "identity", alpha = 0.7) +
xlab("") + ylab("") +
ggtitle(stringr::str_c(names(viSeparate)[[i]], " | ", addToVar[[j]]$modelInfo$label)) +
coord_flip()
}
if (sep) {
gridExtra::grid.arrange(grobs = viSeparate)
} else {
lay <- rbind(c(1,1,2,2,2))
gridExtra::grid.arrange(grobs = viCompare, layout_matrix = lay)
}
}
#' Compare class probability distributions based on the population.
#'
#' \code{suggest_probPop} compares class probability distributions based on
#' the population of observations, sorted by probability value.
#' x-axis: percentile of class probability.
#' y-axis: observation density.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param outChar A character value of the output class name.
#' @param predTag Select prediction results that contain predTag in their name.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @export
#' @return Plot indicating density distribution by observed population.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#' sSummary <- add_prob(sSummary, x_test, y_test, outChar = "pos")
#'
#' suggest_probPop(sSummary, "pos")
#' suggest_probPop(sSummary, "pos", modelTag = "glm|svm")
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_probPop <- function(addTo, outChar, predTag = "pred_test", modelTag = NULL) {
`.` <- obs <- population <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag),
purrr::is_character(predTag) || purrr::is_null(predTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToROC <- addTo %>% purrr::map_lgl(~(all(c("ROC", "Sens", "Spec") %in% .$perfNames))) %>% addTo[.]
addToROC <- addToROC %>% purrr::map_lgl(~(predTag %in% names(.))) %>% addToROC[.]
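# one tibble per model: observed class, predicted probability of outChar,
# and its percentile rank across the test observations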
densityGrid <- list(obs = addToROC %>% purrr::map(~.[[predTag]]$Class),
model = addToROC %>% names(),
pred = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
nTotal = addToROC %>% purrr::map(~.[[predTag]]) %>% purrr::map(nrow) %>% purrr::map(as.character),
probCut = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
population = addToROC %>% purrr::map(~.[[predTag]][,outChar]) %>% purrr::map(dplyr::percent_rank)
)
densitySummary <- purrr::pmap(densityGrid, tibble::tibble)
density <- densitySummary %>% dplyr::bind_rows()
ggplot(data = density, aes(x = population, fill = obs)) +
xlab("Probability percent rankon") + ylab("Density") +
ggtitle("Density distribution by obs population") +
geom_density(alpha = 0.2) + ylim(0, 5) +
facet_wrap(~model)
}
#' Compare Class Probability Distribution based on Probability Cut-off
#'
#' \code{suggest_probCut} compares class probability distributions
#' based on probability cut-off.
#' x-axis: probability value.
#' y-axis: observation density.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param outChar A character value of the output class name.
#' @param predTag Select prediction results that contain predTag in their name.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @export
#' @return Plot indicating density distribution by probability cut-off.
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#' sSummary <- add_prob(sSummary, x_test, y_test, outChar = "pos")
#'
#' suggest_probCut(sSummary, "pos")
#' suggest_probCut(sSummary, "pos", modelTag = "glm|svm")
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_probCut <- function(addTo, outChar, predTag = "pred_test", modelTag = NULL) {
`.` <- probCut <- obs <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag),
purrr::is_character(predTag) || purrr::is_null(predTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToROC <- addTo %>% purrr::map_lgl(~(all(c("ROC", "Sens", "Spec") %in% .$perfNames))) %>% addTo[.]
addToROC <- addToROC %>% purrr::map_lgl(~(predTag %in% names(.))) %>% addToROC[.]
densityGrid <- list(obs = addToROC %>% purrr::map(~.[[predTag]]$Class),
model = addToROC %>% names(),
pred = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
nTotal = addToROC %>% purrr::map(~.[[predTag]]) %>% purrr::map(nrow) %>% purrr::map(as.character),
probCut = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
population = addToROC %>% purrr::map(~.[[predTag]][,outChar]) %>% purrr::map(dplyr::percent_rank)
)
densitySummary <- purrr::pmap(densityGrid, tibble::tibble)
density <- densitySummary %>% dplyr::bind_rows()
ggplot(data = density, aes(x = probCut, fill = obs)) +
xlab("Probability cut off") + ylab("Density") +
ggtitle("Density distribution by probability cut off") +
geom_density(alpha = 0.2) + ylim(0, 5) +
facet_wrap(~model)
}
#' Plot Gain Chart and Lift Chart
#'
#' \code{suggest_gain} plots gain and lift charts, which are widely used in marketing
#' and related contexts. They indicate the effectiveness of a predictive model
#' compared to the results obtained without it.
#'
#' @param addTo Summary list that contains model fits to compare.
#' @param outChar A character value of the output class name.
#' @param predTag Select prediction results that contain predTag in their name.
#' @param modelTag Select model fits that contain modelTag in their name.
#' @param cuts Integer indicating the number of splits of probability buckets.
#' @param type Selects the type of chart to plot: "Gain" for the gain chart,
#' "Lift" for the lift chart, "PctAcc" for accumulated event percent, and
#' "Pct" for event percent.
#' @export
#' @examples
#' \dontrun{
#' library(mlbench)
#' data(PimaIndiansDiabetes)
#' index <- sample(seq_len(nrow(PimaIndiansDiabetes)), 500)
#' trainingSet <- PimaIndiansDiabetes[index, ]
#' testSet <- PimaIndiansDiabetes[-index, ]
#' x <- trainingSet[, -9]
#' y <- trainingSet[, 9]
#' x_test <- testSet[, -9]
#' y_test <- testSet[, 9]
#' sSummary <- list()
#' sSummary <- add_model(sSummary, x, y)
#' sSummary <- add_model(sSummary, x, y, model = c("C5.0Cost", "glmnet"), modelTag = "others")
#' sSummary <- add_prob(sSummary, x_test, y_test, outChar = "pos")
#'
#' suggest_gain(sSummary, outChar = "pos")
#' suggest_gain(sSummary, outChar = "pos", modelTag = "glm|svm", type = "Lift")
#' suggest_gain(sSummary, outChar = "pos", modelTag = "glm|svm", type = "PctAcc")
#' suggest_gain(sSummary, outChar = "pos", modelTag = "glm|svm", type = "Pct")
#' suggest_gain(sSummary, outChar = "pos", modelTag = "glm|svm", type = "Gain")
#' suggest_gain(sSummary, outChar = "pos", modelTag = "glm|svm", type = "Gain") + xlim(0, 0.5)
#'
#' # vignette("modeval") #check a vignette for further details
#' }
suggest_gain <- function(addTo, outChar, predTag = "pred_test", modelTag = NULL, cuts = 51, type = NULL) {
`.` <- obs <- bucket <- model <- nTotal <- n <- event <- eventCum <- nCum <- pctCum <- lift <- pct <- NULL # for CMD check pass
stopifnot(purrr::is_list(addTo),
purrr::is_character(modelTag) || purrr::is_null(modelTag),
purrr::is_character(predTag) || purrr::is_null(predTag)
)
if (!is.null(modelTag)) {
grepIndex <- grep(modelTag, names(addTo))
addTo <- addTo[grepIndex]
}
addToROC <- addTo %>% purrr::map_lgl(~(all(c("ROC", "Sens", "Spec") %in% .$perfNames))) %>% addTo[.]
addToROC <- addToROC %>% purrr::map_lgl(~(predTag %in% names(.))) %>% addToROC[.]
liftGrid <- list(obs = addToROC %>% purrr::map(~.[[predTag]]$Class),
model = addToROC %>% names(),
pred = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
nTotal = addToROC %>% purrr::map(~.[[predTag]]) %>% purrr::map(nrow) %>% purrr::map(as.character),
probCut = addToROC %>% purrr::map(~.[[predTag]][,outChar]),
population = addToROC %>% purrr::map(~.[[predTag]][,outChar]) %>% purrr::map(dplyr::percent_rank)
)
liftSummary <- purrr::pmap(liftGrid, tibble::tibble)
distribution <- liftSummary %>% dplyr::bind_rows()
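# for each model: split observations into probability buckets (highest predicted
# probability first), accumulate events to get gain, cumulative event rate (pctCum),
# and lift = cumulative event rate / overall event rate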
for (i in seq_along(liftSummary)) {
liftSummary[[i]]$bucket = dplyr::ntile(liftSummary[[i]]$pred, cuts)
liftSummary[[i]] = liftSummary[[i]] %>%
dplyr::mutate(obs = ifelse(obs==outChar, 1, 0)) %>%
dplyr::group_by(bucket, model, nTotal) %>%
dplyr::summarize(event = sum(obs), n = n()) %>%
dplyr::arrange(dplyr::desc(bucket)) %>% as.data.frame() %>%
dplyr::mutate(eventCum = cumsum(event), nCum = cumsum(n)) %>%
dplyr::mutate(pctCum = eventCum / nCum, pct = event / n,
xAxis = nCum / as.integer(nTotal),
lift = pctCum /(max(eventCum)/max(as.integer(nTotal)))
)
}
eventMax <- max(liftSummary[[1]]$eventCum)
nMax <- max(liftSummary[[1]]$nCum)
baseline <- eventMax / nMax
liftObjectDataToPlot <- liftSummary %>% dplyr::bind_rows() %>% as.data.frame() %>%
dplyr::filter(nCum <= nMax) %>% dplyr::filter(nCum >= nMax * 0.001)
lift_gain <- ggplot(data = liftObjectDataToPlot) +
xlim(0.0001, 1) + ggtitle("Gain Chart") +
xlab("Population of observation") + ylab("Proportion of event") +
geom_line(mapping = aes(x = nCum / nMax, y = eventCum / eventMax, color = model)) +
geom_line(mapping = aes(x = nCum / nMax, y = nCum / nMax))
lift_lift <- ggplot(data = liftObjectDataToPlot) +
xlim(0.0001, 1) + ggtitle("Lift Chart") +
xlab("Population of observation") + ylab("Lift Value") +
geom_line(mapping = aes(x = nCum / nMax, y = lift, color = model)) +
geom_line(mapping = aes(x = nCum / nMax, y = 1))
lift_PctAcc <- ggplot(data = liftObjectDataToPlot) +
xlim(0.0001, 1) + ggtitle("Accumulated percent of event(%)") +
xlab("Population of observation") + ylab("Percent of event") +
geom_line(mapping = aes(x = nCum / nMax, y = pctCum, color = model)) +
geom_line(mapping = aes(x = nCum / nMax, y = baseline))
lift_Pct <- ggplot(data = liftObjectDataToPlot) +
ggtitle("Percent of event(%)") +
xlab("Population of observation") + ylab("Percent of event") +
geom_line(mapping = aes(x = nCum / nMax, y = pct, color = model), alpha = 0.1) +
geom_smooth(mapping = aes(x = nCum / nMax, y = pct, color = model), alpha = 0.1, method = "loess") +
geom_line(mapping = aes(x = nCum / nMax, y = baseline))
if (is.null(type)) {
lay <- rbind(c(1,2), c(3,4))
gridExtra::grid.arrange(grobs = list(lift_gain, lift_lift, lift_PctAcc, lift_Pct), layout_matrix = lay)
} else {
switch (type,
"Gain" = lift_gain,
"Lift" = lift_lift,
"PctAcc" = lift_PctAcc,
"Pct" = lift_Pct,
stop("Type doesn't match")
)
}
}