#' Generates a set of cross validation folds used in the xgb_reg, xgb_cat, and xgb_pois functions.
#'
#' @param tune_rounds Specifies the number of random hyperparamter combos to return
#' @return List with cross validation fold indices
generate_xgb_folds <- function(cv_folds, nrows){
folds_idx <- sample(c(1:cv_folds), nrows, replace = T)
folds <- list()
for(i in unique(folds_idx)){
folds[[i]] <- c(1:nrows)[folds_idx == i]
}
return(folds)
}
#' Generates a random set of hyperparamters used in the xgb_reg, xgb_cat, and xgb_pois functions.
#'
#' @param tune_rounds specifies the number of random hyperparamter combos to return.
#' @return data.frame with parameters with the number of rows of tune_rounds.
generate_xgb_params <- function(tune_rounds){
# randomly samples xgboost hyperparameters based on the number of tune rounds
set.seed(1234)
out <- data.frame(eta = runif(tune_rounds, 0.025, 0.5),
gamma = runif(tune_rounds, 0.1, 1),
max_depth = 0,
subsample = runif(tune_rounds, 0.5, 1),
colsample_bytree = runif(tune_rounds, 0.5, 1.0),
min_child_weight = sample(1:6, tune_rounds, replace = T),
max_leaves = sample(c(5, 10, 25, 50, 100, 250, 500, 1000), tune_rounds, replace = T))
out$eval <- NA
return(out)
}
#' Generates bootstrap replicates of y and y_hat and calculates error metrics on the replicates.
#' Metrics are focused on common regression tasks.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @param n Indicates the number of boostrap replicates.
#' @return data.frame with error metrics.
generate_reg_metrics <- function(y, y_hat, n = 10000){
out <- list()
for(i in 1:n){
samp <- sample(1:length(y), length(y), replace = T)
rmse <- yardstick::rmse_vec(y[samp], y_hat[samp])
mae <- yardstick::mae_vec(y[samp], y_hat[samp])
rsq <- yardstick::rsq_vec(y[samp], y_hat[samp])
spearman_cor <- cor(y[samp], y_hat[samp], method = "spearman")
out[[i]] <- data.frame(rmse, mae, rsq, spearman_cor)
}
out <- do.call(rbind, out)
return(out)
}
#' Calculates residual deviance for poisson regression. 0 y values are replaced with
#' 10^-10 and this seems to align with the residual deviance calc in predict.glm.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @return Residual deviance.
calc_deviance <- function(y, y_hat){
y <- ifelse(y <= 0, 10^-10, y)
r <- (y * log(y / y_hat) - (y - y_hat))
dev <- 2 * sum(r)
return(dev)
}
#' Generates bootstrap replicates of y and y_hat and calculates residual deviance.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @param n Indicates the number of bootstrap replicates.
#' @return Vector of n length of residual deviance calculations.
generate_deviance_metrics <- function(y, y_hat, n = 1000){
out <- c()
for(i in 1:n){
samp <- sample(1:length(y), length(y), replace = T)
out <- c(out, calc_deviance(y[samp], y_hat[samp]))
}
return(out)
}
#' splits train and test frames
#'
#' @param df data.frame to split into train and test.
#' @param perc_train Percent of data to place in train data set.
#' @return a list with a test data.frame, train data.frame and train idices.
split_data <- function(df, perc_train = 0.8){
set.seed(1234)
train_idx <- sample(1:nrow(df), floor(nrow(df) * perc_train), replace = F)
df_train <- df[train_idx, ]
df_test <- df[-train_idx, ]
#outputs
out <- list()
out$df_train <- df_train
out$df_test <- df_test
out$train_idx <- train_idx
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.