#' Automatic tuning
#'
#' Automated tuning process for the penalty parameter lambda, with built-in
#' feature selection. Lambda directly influences the granularity of the
#' segmentation, with low/high values resulting in a fine/coarse segmentation.
#'
#' @param mfit Fitted model object (e.g., a "gbm" or "randomForest" object).
#' @param data Data frame containing the original training data.
#' @param vars Character vector specifying the features in \code{data} to use.
#' @param target String specifying the target (or response) variable to model.
#' @param max_ngrps Integer specifying the maximum number of groups that each
#' feature's values/levels are allowed to be grouped into.
#' @param hcut Numeric in the range \[0,1\] specifying the cut-off value for the
#' normalized cumulative H-statistic over all two-way interactions, ordered
#' from most to least important, between the features in \code{vars}. Note
#' that \code{hcut = 0} will consider the single most important interaction,
#' while \code{hcut = 1} will consider all possible two-way interactions.
#' Setting \code{hcut = -1} will only consider main effects in the tuning.
#' @param ignr_intr Optional character string specifying features to ignore when
#' searching for meaningful interactions to incorporate in the GLM.
#' @param pred_fun Optional prediction function to calculate feature effects for
#' the model in \code{mfit}. Requires two arguments: \code{object} and
#' \code{newdata}. See \code{\link[pdp:partial]{pdp::partial}} and this
#' \href{https://bgreenwell.github.io/pdp/articles/pdp-extending.html}{article}
#' for the details. See also the function \code{gbm_fun} in the example.
#' @param lambdas Numeric vector with the possible lambda values to explore. The
#' search grid is created automatically via \code{\link{lambda_grid}} such
#' that it contains only those values of lambda that result in a unique
#' grouping of the full set of features. A seperate grid is generated for main
#' and interaction effects, due to the scale difference in both types.
#' @param nfolds Integer for the number of folds in K-fold cross-validation.
#' @param strat_vars Character (vector) specifying the feature(s) to use for
#' stratified sampling. The default NULL implies no stratification is applied.
#' @param glm_par Named list, constructed via \code{\link{alist}}, containing
#' arguments to be passed on to \code{\link[stats]{glm}}. Examples are:
#' \code{family}, \code{weights} or \code{offset}. Note that \code{formula}
#' will be ignored as the GLM formula is determined by the specified
#' \code{target} and the automatic feature selection in the tuning process.
#' @param err_fun Error function to calculate the prediction errors on the
#' validation folds. This must be an R function which outputs a single number
#' and takes two vectors \code{y_pred} and \code{y_true} as input for the
#' predicted and true target values respectively. An additional input vector
#' \code{w_case} is allowed to use case weights in the error function. The
#' weights are determined automatically based on the \code{weights} field
#' supplied to \code{glm_par}. Examples already included in the package are:
#' \describe{ \item{mse}{mean squared error loss function (default).}
#' \item{wgt_mse}{weighted mean squared error loss function.}
#' \item{poi_dev}{Poisson deviance loss function.} } See \code{\link{err_fun}}
#' for details on these predefined functions.
#' @param ncores Integer specifying the number of cores to use. The default
#' \code{ncores = -1} uses all the available physical cores (not threads), as
#' determined by \code{parallel::detectCores(logical = 'FALSE')}.
#' @param out_pds Boolean to indicate whether to add the calculated PD effects
#' for the selected features to the output list.
#' @return List with the following elements: \describe{ \item{slct_feat}{named
#' vector containing the selected features (names) and the optimal number of
#' groups for each feature (values).} \item{best_surr}{the optimal GLM
#' surrogate, which is fit to all observations in \code{data}. The segmented
#' data can be obtained via the \code{$data} attribute of the GLM fit.}
#' \item{tune_main}{ the cross-validation results for the main effects as a
#' tidy data frame. The column \code{cv_err} contains the cross-validated
#' error, while the columns \code{1:nfolds} contain the error on the
#' validation folds.} \item{tune_intr}{cross-validation results for the
#' interaction effects.} \item{pd_fx}{List with the PD effects for the
#' selected features (only present if \code{out_pds = TRUE}).}}
#' @examples
#' \dontrun{
#' data('mtpl_be')
#' features <- setdiff(names(mtpl_be), c('id', 'nclaims', 'expo', 'long', 'lat'))
#' set.seed(12345)
#' gbm_fit <- gbm::gbm(as.formula(paste('nclaims ~',
#' paste(features, collapse = ' + '))),
#' distribution = 'poisson',
#' data = mtpl_be,
#' n.trees = 50,
#' interaction.depth = 3,
#' shrinkage = 0.1)
#' gbm_fun <- function(object, newdata) mean(predict(object, newdata, n.trees = object$n.trees, type = 'response'))
#' gbm_fit %>% autotune(data = mtpl_be,
#' vars = c('ageph', 'bm', 'coverage', 'fuel', 'sex', 'fleet', 'use'),
#' target = 'nclaims',
#' hcut = 0.75,
#' pred_fun = gbm_fun,
#' lambdas = as.vector(outer(seq(1, 10, 1), 10^(-6:-2))),
#' nfolds = 5,
#' strat_vars = c('nclaims', 'expo'),
#' glm_par = alist(family = poisson(link = 'log'),
#' offset = log(expo)),
#' err_fun = poi_dev,
#' ncores = -1)
#' }
#' @export
autotune <- function(mfit, data, vars, target, max_ngrps = 15, hcut = 0.75, ignr_intr = NULL, pred_fun = NULL, lambdas = as.vector(outer(seq(1, 10, 0.1), 10^(-7:3))), nfolds = 5, strat_vars = NULL, glm_par = alist(), err_fun = mse, ncores = -1, out_pds = FALSE) {
if (sum(grepl('_', vars)) > 0) stop('No underscores allowed in the variable names, these are interpreted as interactions in maidrr.')
if (! all(vars %in% names(data))) stop('All the variables needs to be present in the data.')
if (! target %in% names(data)) stop('The target variable needs to be present in the data.')
if (! ((hcut == -1) | (0 <= hcut & hcut <= 1))) stop('Invalid value specified for hcut, must equal -1 or lie within the range [0, 1].')
if (nfolds <= 1) stop('At least two folds are needed to perform K-fold cross-validation.')
if ((! is.null(strat_vars)) & (! all(strat_vars %in% names(data)))) stop('The stratification variables in strat_vars need to be present in the data.')
if ('formula' %in% names(glm_par)) {
warning('The field "formula" is removed from glm_par as this is determined by the supplied target and automatic feature selection.')
glm_par[['formula']] <- NULL
}
if(! all(paste0('y_', c('true', 'pred')) %in% names(formals(err_fun)))) stop('The error function must contain arguments y_true and y_pred.')
if (length(setdiff(names(formals(err_fun)), c('y_true', 'y_pred', 'w_case'))) > 0) stop('The error function can only contain arguments y_true, y_pred and w_case.')
if (('w_case' %in% names(formals(err_fun))) & (! 'weights' %in% names(glm_par))) stop('If w_case is an argument in err_fun, weights must be an argument in glm_par.')
if (ncores < -1 | ncores == 0) stop('The number of cores must be strictly positive, or equal to -1 for all available cores.')
if (ncores > parallel::detectCores()) warning('The asked number of cores is larger than parallel::detectCores, so there might be trouble ahead.')
# Function to calculate validation errors
Kfold_cross_valid <- function(data, target, nfolds, glm_par, err_fun, fixed_terms = NULL){
val_err <- rep(NA, nfolds)
for (f in seq_len(nfolds)) {
# Get training fold
trn_data <- dplyr::filter(data, fold != f)
# Feature selection (only keep those with >1 group) and determine the GLM formula
slct_feat <- names(which(unlist(lapply(trn_data[, grepl('_$', names(trn_data))], function(x) length(unique(x)) > 1))))
glm_par[['formula']] <- as.formula(paste(target, '~', paste(c(1, fixed_terms, slct_feat), collapse = ' + ')))
# Fit surrogate GLM on the training fold
glm_fit <- maidrr::surrogate(trn_data, glm_par)
# Get validation fold and calculate the prediction error
val_data <- maidrr::rm_lvls(glm_fit, dplyr::filter(data, fold == f))
y_pred <- suppressWarnings(predict(glm_fit, newdata = val_data, type = 'response'))
y_true <- dplyr::pull(val_data, target)
w_case <- val_data[[deparse(glm_fit$call$weights)]]
val_err[f] <- ifelse(length(glm_fit$coefficients) > glm_fit$rank, Inf,
do.call(err_fun, setNames(lapply(names(formals(err_fun)), function(x) eval(as.name(x))), names(formals(err_fun)))))
}
return(val_err)
}
# Create the fold indicator via (stratified) sampling
data <- data %>% dplyr::sample_n(size = nrow(.), replace = FALSE) %>%
dplyr::arrange(!!! rlang::syms(strat_vars)) %>%
dplyr::mutate(fold = rep_len(seq_len(nfolds), nrow(.)))
# Generate the feature effects and the lambda grid for main effects
fx_main <- maidrr::insights(mfit = mfit,
vars = vars,
data = data,
interactions = 'user',
pred_fun = pred_fun,
ncores = ncores)
lmbd_main <- maidrr::lambda_grid(fx_vars = fx_main, lambda_range = lambdas, max_ngrps = max_ngrps)
# Set up a cluster and iterate over the lambda grid
switch(as.character(ncores),
'-1' = doParallel::registerDoParallel(parallel::detectCores(logical = 'FALSE')),
'1' = foreach::registerDoSEQ(),
doParallel::registerDoParallel(ncores))
out_main <- foreach::foreach (l = seq_len(nrow(lmbd_main)), .combine = rbind) %dopar% {
# Segmentation for current lambda values
data_segm <- maidrr::segmentation(fx_vars = fx_main, data = data, type = 'ngroups',
values = unlist(dplyr::select(dplyr::slice(lmbd_main, l), vars), use.names = TRUE))
# Calculate the validation errors
Kfold_cross_valid(data = data_segm, target = target, nfolds = nfolds, glm_par = glm_par, err_fun = err_fun)
}
doParallel::stopImplicitCluster()
# Get the cross-validation error
if (! ('matrix' %in% class(out_main) & nrow(lmbd_main) > 1)) out_main <- out_main %>% as.matrix() %>% t()
out_main <- dplyr::bind_cols(lmbd_main, out_main %>% as.data.frame %>% setNames(seq_len(nfolds))) %>%
dplyr::mutate(cv_err = rowMeans(dplyr::select(., as.character(seq_len(nfolds))), na.rm = TRUE)) %>% dplyr::arrange(cv_err)
# Get the selected features
slct_main <- out_main %>% dplyr::slice(1) %>% dplyr::select(!!!rlang::syms(vars)) %>%
dplyr::select_if(~. > 1) %>% unlist(., use.names = TRUE)
if (length(slct_main) == 0) stop('Not a single feature was selected, please try again with lower values for lambda.')
# Segment the data based on the selected features and optimal number of groups
data_main <- maidrr::segmentation(fx_vars = fx_main[names(slct_main)], data = data, type = 'ngroups', values = slct_main)
# Return these results if no interactions are asked for
if (hcut == -1) {
# Fit the optimal surrogate GLM
glm_par[['formula']] <- as.formula(paste(target, '~', paste(c(1, paste0(names(slct_main), '_')), collapse = ' + ')))
opt_surro <- maidrr::surrogate(data = data_main, par_list = glm_par)
# Combine results in a list and return
output <- list('slct_feat' = slct_main,
'best_surr' = opt_surro,
'tune_main' = out_main,
'tune_intr' = NULL)
# Add PD effects if asked for
if (out_pds) output$pd_fx <- fx_main[names(slct_main)]
# Output
return(output)
}
# Generate the feature effects and the lambda grid for interactions
fx_intr <- maidrr::insights(mfit = mfit,
vars = setdiff(names(slct_main), ignr_intr),
data = data,
interactions = 'auto',
hcut = hcut,
pred_fun = pred_fun,
fx_in = fx_main,
ncores = ncores)
fx_intr <- fx_intr[grepl('_', names(fx_intr))]
vars_intr <- names(fx_intr)
lmbd_intr <- maidrr::lambda_grid(fx_vars = fx_intr, lambda_range = lambdas, max_ngrps = max_ngrps)
# Set up a cluster and iterate over the lambda grid
switch(as.character(ncores),
'-1' = doParallel::registerDoParallel(parallel::detectCores(logical = 'FALSE')),
'1' = foreach::registerDoSEQ(),
doParallel::registerDoParallel(ncores))
out_intr <- foreach::foreach (l = seq_len(nrow(lmbd_intr)), .combine = rbind) %dopar% {
# Segmentation for current lambda values
data_segm <- maidrr::segmentation(fx_vars = fx_intr, data = data_main, type = 'ngroups',
values = unlist(dplyr::select(dplyr::slice(lmbd_intr, l), vars_intr), use.names = TRUE))
# Calculate the validation errors
Kfold_cross_valid(data = data_segm, target = target, nfolds = nfolds, glm_par = glm_par, err_fun = err_fun, fixed_terms = paste0(names(slct_main), '_'))
}
doParallel::stopImplicitCluster()
# Get the cross-validation error
if (! ('matrix' %in% class(out_intr) & nrow(lmbd_intr) > 1)) out_intr <- out_intr %>% as.matrix() %>% t()
out_intr <- dplyr::bind_cols(lmbd_intr, out_intr %>% as.data.frame %>% setNames(seq_len(nfolds))) %>%
dplyr::mutate(cv_err = rowMeans(dplyr::select(., as.character(seq_len(nfolds))), na.rm = TRUE)) %>% dplyr::arrange(cv_err)
if (all(out_intr$cv_err == Inf)) warning('All interaction combinations lead to rank-deficient GLM fits, try adding larger values for lambda to the grid.')
# Get the selected features
slct_intr <- out_intr %>% dplyr::slice(1) %>% dplyr::select(!!!rlang::syms(vars_intr)) %>%
dplyr::select_if(~. > 1) %>% unlist(., use.names = TRUE)
# Check if interactions improve the fit with only main effects
if (min(out_intr$cv_err) < min(out_main$cv_err)) slct_feat <- c(slct_main, slct_intr) else slct_feat <- slct_main
# Segment the data based on the selected features and optimal number of groups
data_segm <- maidrr::segmentation(fx_vars = c(fx_main, fx_intr)[names(slct_feat)], data = data, type = 'ngroups', values = slct_feat)
# Fit the optimal surrogate GLM
glm_par[['formula']] <- as.formula(paste(target, '~', paste(c(1, paste0(names(slct_feat), '_')), collapse = ' + ')))
opt_surro <- maidrr::surrogate(data = data_segm, par_list = glm_par)
# Combine results in a list and return
output <- list('slct_feat' = slct_feat,
'best_surr' = opt_surro,
'tune_main' = out_main,
'tune_intr' = out_intr)
# Add PD effects if asked for
if (out_pds) output$pd_fx <- c(fx_main, fx_intr)[names(slct_feat)]
# Output
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.