Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/vignette_cv_custom_fn-",
dpi = 92,
fig.retina = 2,
eval = requireNamespace("e1071") # Only evaluate chunks when e1071 is installed!
)
options(rmarkdown.html_vignette.check_title = FALSE)
## ----warning=FALSE, message=FALSE---------------------------------------------
library(cvms)
library(groupdata2) # fold()
library(dplyr)
library(knitr) # kable() : formats the output as a table
library(e1071) # svm()
set.seed(1)
## -----------------------------------------------------------------------------
# Enable parallelization by uncommenting
# library(doParallel)
# registerDoParallel(4) # 4 cores
## -----------------------------------------------------------------------------
# Prepare dataset
data <- participant.scores
data$diagnosis <- factor(data$diagnosis)
# Create 5 fold columns with 3 folds each
data <- fold(
data,
k = 3,
cat_col = "diagnosis",
id_col = "participant",
num_fold_cols = 5,
parallel = FALSE # set to TRUE to run in parallel
)
# Order by participant
data <- data %>%
dplyr::arrange(participant)
# Look at the first 12 rows
# Note: kable() just formats the table
data %>% head(12) %>% kable()
## -----------------------------------------------------------------------------
# Check the distribution of 'diagnosis' in the first fold column
# Note: this would be more even for a larger dataset
data %>%
dplyr::count(.folds_1, diagnosis) %>%
kable()
# Check the distribution of 'participant' in the first fold column
# Note that all rows for a participant are in the same fold
data %>%
dplyr::count(.folds_1, participant) %>%
kable()
## -----------------------------------------------------------------------------
# Split into train and test sets
test_set <- data %>%
dplyr::filter(.folds_1 == 3)
train_set <- data %>%
dplyr::filter(.folds_1 != 3)
# Fit SVM model
svm_model <- e1071::svm(
formula = score ~ diagnosis + age + session,
data = train_set,
kernel = "linear",
cost = 10,
type = "eps-regression"
)
# Predict scores in the test set
predicted_scores <- predict(
svm_model,
newdata = test_set,
allow.new.levels = TRUE)
predicted_scores
# Add predictions to test set
test_set[["predicted score"]] <- predicted_scores
# Evaluate the predictions
evaluate(
data = test_set,
target_col = "score",
prediction_cols = "predicted score",
type = "gaussian"
)
## -----------------------------------------------------------------------------
svm_model_fn <- function(train_data, formula, hyperparameters) {
e1071::svm(
formula = formula,
data = train_data,
kernel = "linear",
cost = 10,
type = "eps-regression"
)
}
## -----------------------------------------------------------------------------
# Try the model function
# Due to "lazy evaluation" in R, we don't have to pass
# the arguments that are not used inside the function
m0 <- svm_model_fn(train_data = train_set,
formula = score ~ diagnosis + age + session)
m0
## -----------------------------------------------------------------------------
svm_predict_fn <- function(test_data, model, formula, hyperparameters, train_data) {
predict(object = model,
newdata = test_data,
allow.new.levels = TRUE)
}
# Try the predict function
svm_predict_fn(test_data = test_set, model = m0)
## -----------------------------------------------------------------------------
cv_1 <- cross_validate_fn(
data = data,
formulas = c("score ~ diagnosis + age + session",
"score ~ diagnosis + age",
"score ~ diagnosis"),
type = "gaussian",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
fold_cols = paste0(".folds_", 1:5),
parallel = FALSE # set to TRUE to run in parallel
)
cv_1
## -----------------------------------------------------------------------------
svm_model_fn <- function(train_data, formula, hyperparameters) {
# Required hyperparameters:
# - kernel
# - cost
if (!"kernel" %in% names(hyperparameters))
stop("'hyperparameters' must include 'kernel'")
if (!"cost" %in% names(hyperparameters))
stop("'hyperparameters' must include 'cost'")
e1071::svm(
formula = formula,
data = train_data,
kernel = hyperparameters[["kernel"]],
cost = hyperparameters[["cost"]],
scale = FALSE,
type = "eps-regression"
)
}
# Try the model function
svm_model_fn(
train_data = train_set,
formula = score ~ diagnosis + age + session,
hyperparameters = list(
"kernel" = "linear",
"cost" = 5
)
)
## -----------------------------------------------------------------------------
svm_model_fn <- function(train_data, formula, hyperparameters) {
# Required hyperparameters:
# - cost
# Optional hyperparameters:
# - kernel
# 1) If 'cost' is not present in hyperparameters, throw error
# 2) If 'kernel' is not present in hyperparameters, set to "radial"
hyperparameters <- update_hyperparameters(
kernel = "radial",
hyperparameters = hyperparameters,
required = "cost"
)
e1071::svm(
formula = formula,
data = train_data,
kernel = hyperparameters[["kernel"]],
cost = hyperparameters[["cost"]],
type = "eps-regression"
)
}
## -----------------------------------------------------------------------------
hparams <- list(
"kernel" = c("linear", "radial"),
"cost" = c(1, 5, 10)
)
## -----------------------------------------------------------------------------
hparams <- list(
".n" = 4,
"kernel" = c("linear", "radial"),
"cost" = c(1, 5, 10)
)
## -----------------------------------------------------------------------------
df_hparams <- data.frame(
"kernel" = c("linear", "radial", "radial"),
"cost" = c(10, 1, 10)
)
df_hparams
## -----------------------------------------------------------------------------
# Set seed for the sampling of the hyperparameter combinations
set.seed(1)
cv_2 <- cross_validate_fn(
data = data,
formulas = c("score ~ diagnosis + age + session",
"score ~ diagnosis"),
type = "gaussian",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
hyperparameters = hparams, # Pass the list of values to test
fold_cols = paste0(".folds_", 1:5)
)
cv_2
## -----------------------------------------------------------------------------
cv_2 %>%
# Create Model ID with values 1:8
dplyr::mutate(`Model ID` = 1:nrow(cv_2)) %>%
# Order by RMSE
dplyr::arrange(RMSE) %>%
# Extract formulas and hyperparameters
select_definitions(additional_includes = c("RMSE", "Model ID")) %>%
# Pretty table
kable()
## -----------------------------------------------------------------------------
# Extract fold results for the best model
cv_2$Results[[1]] %>% kable()
# Extract 10 predictions from the best model
cv_2$Predictions[[1]] %>% head(10) %>% kable()
## ----eval=FALSE---------------------------------------------------------------
# # NOTE: Don't run this
# preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {
#
# # Do preprocessing
# # Create data frame with applied preprocessing parameters
#
# # Return list with these names
# list("train" = train_data,
# "test" = test_data,
# "parameters" = preprocess_parameters)
# }
#
## -----------------------------------------------------------------------------
preprocess_fn <- function(train_data, test_data, formula, hyperparameters) {
# Standardize the age column
# Get the mean and standard deviation from the train_data
mean_age <- mean(train_data[["age"]])
sd_age <- sd(train_data[["age"]])
# Standardize both train_data and test_data
train_data[["age"]] <- (train_data[["age"]] - mean_age) / sd_age
test_data[["age"]] <- (test_data[["age"]] - mean_age) / sd_age
# Create data frame with applied preprocessing parameters
preprocess_parameters <- data.frame(
"Measure" = c("Mean", "SD"),
"age" = c(mean_age, sd_age)
)
# Return list with these names
list("train" = train_data,
"test" = test_data,
"parameters" = preprocess_parameters)
}
# Try the preprocess function
prepped <- preprocess_fn(train_data = train_set, test_data = test_set)
# Inspect preprocessed training set
# Note that the age column has changed
prepped$train %>% head(5) %>% kable()
# Inspect preprocessing parameters
prepped$parameters %>% kable()
## -----------------------------------------------------------------------------
cv_3 <- cross_validate_fn(
data = data,
formulas = c("score ~ diagnosis + age + session",
"score ~ diagnosis"),
type = "gaussian",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
preprocess_fn = preprocess_fn,
hyperparameters = list(
"kernel" = "linear",
"cost" = 1
),
fold_cols = paste0(".folds_", 1:5)
)
cv_3
## -----------------------------------------------------------------------------
# Extract first 10 rows of the preprocess parameters
# for the first and best model
cv_3$Preprocess[[1]] %>% head(10) %>% kable()
## -----------------------------------------------------------------------------
# Get built-in preprocess function
preprocess_functions("standardize")
## -----------------------------------------------------------------------------
# SVM model function for classification
clf_svm_model_fn <- function(train_data, formula, hyperparameters) {
# Optional hyperparameters:
# - kernel
# - cost
# Update missing hyperparameters with default values
hyperparameters <- update_hyperparameters(
kernel = "radial",
cost = 1,
hyperparameters = hyperparameters
)
e1071::svm(
formula = formula,
data = train_data,
kernel = hyperparameters[["kernel"]],
cost = hyperparameters[["cost"]],
type = "C-classification",
probability = TRUE # Must enable probability here
)
}
# Try the model function
m1 <- clf_svm_model_fn(train_data = data, formula = diagnosis ~ score,
hyperparameters = list("kernel" = "linear"))
m1
## -----------------------------------------------------------------------------
# Predict function for binomial SVM
bnml_svm_predict_fn <- function(test_data, model, formula, hyperparameters, train_data) {
# Predict test set
predictions <- predict(
object = model,
newdata = test_data,
allow.new.levels = TRUE,
probability = TRUE
)
# Extract probabilities
probabilities <- dplyr::as_tibble(attr(predictions, "probabilities"))
# Return second column
probabilities[[2]]
}
p1 <- bnml_svm_predict_fn(test_data = data, model = m1)
p1 # Vector with probabilities that diagnosis is 1
## ----warning=FALSE------------------------------------------------------------
cv_4 <- cross_validate_fn(
data = data,
formulas = c("diagnosis ~ score",
"diagnosis ~ age"),
type = "binomial",
model_fn = clf_svm_model_fn,
predict_fn = bnml_svm_predict_fn,
hyperparameters = list(
"kernel" = c("linear", "radial"),
"cost" = c(1, 5, 10)
),
fold_cols = paste0(".folds_", 1:5)
)
cv_4
## -----------------------------------------------------------------------------
cv_4 %>%
dplyr::mutate(`Model ID` = 1:nrow(cv_4)) %>%
dplyr::arrange(dplyr::desc(`Balanced Accuracy`)) %>%
select_definitions(additional_includes = c("Balanced Accuracy", "F1", "MCC", "Model ID")) %>%
kable()
## -----------------------------------------------------------------------------
# Set seed for reproducibility
set.seed(1)
# Prepare dataset
data_mc <- musicians
data_mc[["ID"]] <- as.factor(data_mc[["ID"]])
# Create 5 fold columns with 5 folds each
data_mc <- fold(
data = data_mc,
k = 5,
cat_col = "Class",
num_col = "Age",
num_fold_cols = 5
)
data_mc %>% head(10) %>% kable()
# You can use skimr to get a better overview of the dataset
# Uncomment:
# library(skimr)
# skimr::skim(data_mc)
## -----------------------------------------------------------------------------
# Predict function for multinomial SVM
mc_svm_predict_fn <- function(test_data, model, formula, hyperparameters, train_data) {
predictions <- predict(
object = model,
newdata = test_data,
allow.new.levels = TRUE,
probability = TRUE
)
# Extract probabilities
probabilities <- dplyr::as_tibble(attr(predictions, "probabilities"))
# Return all columns
probabilities
}
## -----------------------------------------------------------------------------
cv_5 <- cross_validate_fn(
data = data_mc,
formulas = c("Class ~ Age + Height",
"Class ~ Age + Height + Bass + Guitar + Keys + Vocals"),
type = "multinomial",
model_fn = clf_svm_model_fn,
predict_fn = mc_svm_predict_fn,
hyperparameters = list(
"kernel" = c("linear", "radial"),
"cost" = c(1, 5, 10)
),
fold_cols = paste0(".folds_", 1:5)
)
cv_5
## -----------------------------------------------------------------------------
cv_5 %>%
dplyr::mutate(`Model ID` = 1:nrow(cv_5)) %>%
dplyr::arrange(dplyr::desc(`Balanced Accuracy`)) %>%
select_definitions(additional_includes = c(
"Balanced Accuracy", "F1", "Model ID")) %>%
kable()
## -----------------------------------------------------------------------------
# Extract Class Level Results for the best model
cv_5$`Class Level Results`[[11]]
## -----------------------------------------------------------------------------
# Extract fold results for the best model
cv_5$Results[[11]]
## -----------------------------------------------------------------------------
# Extract multiclass confusion matrices for the best model
# One per fold column
cv_5$`Confusion Matrix`[[11]]
## ----fig.width=5.5, fig.height=5.5, fig.align='center'------------------------
# Sum the fold column confusion matrices
# to one overall confusion matrix
overall_confusion_matrix <- cv_5$`Confusion Matrix`[[11]] %>%
dplyr::group_by(Prediction, Target) %>%
dplyr::summarise(N = sum(N))
overall_confusion_matrix %>% kable()
# Plot the overall confusion matrix
plot_confusion_matrix(overall_confusion_matrix, add_sums = TRUE)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.