#' Specifies cross validation folds for lightgbm models
#'
#' Randomly assigns each training row to one of `cv_folds` folds and
#' returns, for each fold, the row indices that belong to it.
#'
#' @param cv_folds number of cross validation folds
#' @param nrows training data row count
#' @return list of integer vectors; each element holds the row indices of one fold
generate_lgb_folds <- function(cv_folds, nrows){
  # draw a fold label for every row; folds may be slightly unequal in size
  folds_idx <- sample(seq_len(cv_folds), nrows, replace = TRUE)
  folds <- list()
  for(i in unique(folds_idx)){
    # which() returns the row positions assigned to fold i
    folds[[i]] <- which(folds_idx == i)
  }
  folds
}
#' Automatically generates lightgbm hyperparameter sets for optimization
#' tuning guide: https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a
#'
#' @param tune_rounds number of parameter sets to generate
#' @return a dataframe with hyperparameters for tuning
generate_lgb_params <- function(tune_rounds){
  set.seed(1234)
  out <- data.frame(boosting_type = "goss",
                    # integer-valued hyperparameters: uniform draw, then round
                    max_depth = round(runif(tune_rounds, -100, 100)),
                    num_leaves = round(runif(tune_rounds, 30, 150)),
                    min_data_in_leaf = round(runif(tune_rounds, 10, 30)),
                    # continuous hyperparameters; learning rate is log-uniform
                    learning_rate = KScorrect::rlunif(tune_rounds, 0.005, 0.2),
                    reg_alpha = runif(tune_rounds, 0.0, 1.0),
                    reg_lambda = runif(tune_rounds, 0.0, 1.0))
  # collapse negative depths to 0 (lightgbm treats depth <= 0 as "no limit")
  out$max_depth[out$max_depth < 0] <- 0
  # placeholders filled in during model evaluation
  out$nrounds <- NA
  out$eval <- NA
  out
}
#' Automatically models regression objectives with random search hyperparameter optimization
#' and may be converted to bayesian hyperparameter optimization in the future.
#'
#' @param mdl_data a list of frames from the treat_numeric_data function.
#' @param target_var target variable for prediction (currently unused by this
#'   function body; retained for interface compatibility)
#' @param cv_folds number of cross validation folds
#' @param tune_rounds number of hyperoptimization tuning rounds
#' @param verbose print model iterations (TRUE/FALSE)
#' @param max_rounds int. Activates early stopping. Requires at least one validation data
#' and one metric. If there's more than one, will check all of them except the training data.
#' Returns the model with (best_iter + early_stopping_rounds). If early stopping occurs, the
#' model will have 'best_iter' field.
#' @param early_stopping_rounds stop a CV run when the validation metric fails
#'   to improve for this many rounds
#' @param obj lightgbm objective to optimize
#' @return model, data, and model results
model_lgb_reg <- function(mdl_data,
                          target_var,
                          cv_folds = 5,
                          tune_rounds = 50,
                          verbose = TRUE,
                          max_rounds = 10^5,
                          early_stopping_rounds = 10,
                          obj = 'regression'){
  #generate parameters
  params <- generate_lgb_params(tune_rounds)
  params$eval <- NA
  #specify fold indexes
  folds <- generate_lgb_folds(cv_folds, nrow(mdl_data$train))
  # find best model params via random search over the generated sets
  for(i in seq_len(tune_rounds)){
    if(isTRUE(verbose)){
      print(glue::glue("Testing param set {i} of {tune_rounds}"))
    }
    lgb_mdl <- lightgbm::lgb.cv(params = list(
      boosting_type = params$boosting_type[i],
      max_depth = params$max_depth[i],
      num_leaves = params$num_leaves[i],
      min_data_in_leaf = params$min_data_in_leaf[i],
      learning_rate = params$learning_rate[i],
      # fixed typo: was params$reg_alph, which silently relied on
      # data.frame $ partial matching
      reg_alpha = params$reg_alpha[i],
      reg_lambda = params$reg_lambda[i]
    ),
    obj = obj,
    verbose = -1,
    data = mdl_data$train_lgb_data_set,
    nrounds = max_rounds,
    early_stopping_rounds = early_stopping_rounds,
    seed = 1234,
    folds = folds
    )
    #record early stopping and best result
    params$eval[i] <- lgb_mdl$best_score
    params$nrounds[i] <- lgb_mdl$best_iter
  }
  # index of the single best parameter set; which.min breaks ties, whereas
  # `eval == min(eval)` could return multiple rows and pass vectors to lgb.train
  best <- which.min(params$eval)
  #fit best param set with the caller-supplied objective (was hardcoded 'regression')
  final_mdl <- lightgbm::lgb.train(data = mdl_data$train_lgb_data_set,
                                   obj = obj,
                                   verbose = -1,
                                   nrounds = params$nrounds[best],
                                   params = list(
                                     boosting_type = params$boosting_type[best],
                                     max_depth = params$max_depth[best],
                                     num_leaves = params$num_leaves[best],
                                     min_data_in_leaf = params$min_data_in_leaf[best],
                                     learning_rate = params$learning_rate[best],
                                     reg_alpha = params$reg_alpha[best],
                                     reg_lambda = params$reg_lambda[best]
                                   ))
  #record params tested
  mdl_data$params_tested <- params
  #record best model
  mdl_data$final_mdl <- final_mdl
  mdl_data$importance <- lightgbm::lgb.importance(final_mdl)
  #generate metrics on the held-out test set
  mdl_data$test_yhat <- predict(final_mdl, mdl_data$test_lgb)
  mdl_data$rmse <- yardstick::rmse_vec(mdl_data$test_y, mdl_data$test_yhat)
  mdl_data$r2 <- yardstick::rsq_vec(mdl_data$test_y, mdl_data$test_yhat)
  mdl_data$spearman_cor <- cor(mdl_data$test_y, mdl_data$test_yhat, method = 'spearman')
  mdl_data$mae <- yardstick::mae_vec(mdl_data$test_y, mdl_data$test_yhat)
  return(mdl_data)
}
#' Takes a mdl_data list from mdl_lgb_reg and returns interpretations for train and test sets
#'
#' @param mdl_data mdl_data list from mdl_lgb_reg
#' @return mdl_data list with interpretations from train and test sets
interpret_results <- function(mdl_data){
  # interpret every row of each set; lgb.interprete is namespaced for
  # consistency with the rest of the file, and seq_len() avoids the
  # 1:0 == c(1, 0) pitfall on empty data
  print("Interpreting test data")
  mdl_data$test_interpreted <- lightgbm::lgb.interprete(mdl_data$final_mdl,
                                                        mdl_data$test_lgb,
                                                        seq_len(nrow(mdl_data$test_lgb)))
  print("Interpreting train data")
  mdl_data$train_interpreted <- lightgbm::lgb.interprete(mdl_data$final_mdl,
                                                         mdl_data$train_lgb,
                                                         seq_len(nrow(mdl_data$train_lgb)))
  return(mdl_data)
}
#' randomly samples columns and creates bootstrap replicates to find feature importances
#'
#' @param mdl_data mdl_data list
#' @param target_var name of the target column in mdl_data$train
#' @param col_perc proportion of columns to select for each filtering iteration
#' @param n number of feature filter iterations
#' @param objective lightgbm objective to evaluate
#' @return a dataframe with per-feature selection counts, importance counts,
#'   and the share of replicates in which the feature was important
#'
lgb_feature_filter <- function(mdl_data,
                               target_var,
                               col_perc = 0.2,
                               n = 300,
                               objective = 'regression'){
  set.seed(1234)
  cnames <- colnames(mdl_data$train)
  cnames <- cnames[!(cnames %in% target_var)]
  train <- mdl_data$train
  target <- train[, colnames(train) %in% target_var]
  # number of candidate columns drawn per iteration
  col_sample <- floor(col_perc * length(cnames))
  # preallocate instead of growing the list inside the loop
  out <- vector("list", n)
  for(i in seq_len(n)){
    flush.console()
    print(paste0("Iteration ", i, " of ", n))
    # bootstrap rows (with replacement) and subsample columns (without)
    boot <- sample(seq_len(nrow(train)), nrow(train), replace = TRUE)
    train_boot <- train[boot, sample(cnames, col_sample, replace = FALSE)]
    target_boot <- target[boot]
    train_boot <- lightgbm::lgb.Dataset(data = as.matrix(train_boot),
                                        label = as.matrix(target_boot))
    #build model
    mdl <- lightgbm::lgb.train(data = train_boot,
                               objective = objective,
                               nrounds = 50,
                               verbose = -1)
    # a feature counts as "important" in this replicate when its gain
    # exceeds the replicate's mean gain
    imp <- lightgbm::lgb.importance(mdl)
    out[[i]] <- data.frame(feature = imp$Feature,
                           important = ifelse(imp$Gain > mean(imp$Gain), 1, 0))
  }
  out <- do.call(rbind, out)
  out <- out %>%
    dplyr::group_by(feature) %>%
    dplyr::summarise(
      selected_count = dplyr::n(),
      importance_count = sum(important),
      importance_perc = sum(important) / dplyr::n()
    ) %>%
    dplyr::arrange(-importance_perc)
  return(out)
}